rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,227 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
class NERFeatures < SimpleDSL
  # Splits text into the token stream used to build CRF++ features:
  # numbers (possibly embedded in words), hyphenated words, plain words,
  # and single punctuation characters.
  def self.tokens(text)
    text.scan(/
      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
      \w-\w*|
      \w+-[A-Z](?!\w)|
      \w+|
      [.,()\/\[\]{}'"+-]
    /x)
  end

  # Returns +text+ with its tokens in reverse order, joined by spaces.
  def self.reverse(text)
    tokens(text).reverse.join(" ")
  end

  # DSL hook (dispatched by SimpleDSL): registers a feature +name+ backed
  # by a Proc or a Regexp. With neither an argument nor a block the
  # feature defaults to a case-insensitive match of its own name.
  def define(name, *args, &block)
    # NOTE(review): the original used `action = *args[0] || ...`; that
    # splat wraps the value in an Array under Ruby >= 1.9, so the check
    # below always raised "Wrong format". Plain assignment keeps the
    # intended Ruby 1.8 semantics.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_s] = action
    @order.push name.to_s

    name.to_s
  end

  attr_accessor :reverse
  def initialize(file = nil, reverse = false, &block)
    @types = {}
    @order = []
    @context = []
    @reverse = reverse

    # Fall back to the bundled default configuration when neither a file
    # nor a block is given.
    file ||= File.join(Rbbt.datadir, 'ner/config/default.rb') if !file && !block

    super(:define, file, &block)
  end

  # Raw text of the parsed :define section (captured by SimpleDSL).
  def config
    @config[:define]
  end

  # Sets the relative positions used for context features in #template.
  def window(positions)
    @window = positions
  end

  # Marks feature(s) as context features; accepts a name or an array of
  # names.
  def context(name, &block)
    if name.is_a? Array
      @context += name
    else
      @context.push name

      # The block might be wrongly assigned to this function
      # instead of the actual definition, fix that.
      @types[name] = block if block
    end
  end

  def direction(dir)
    @reverse = true if dir.to_sym == :reverse
  end

  # Feature vector for a single token: the word itself followed by one
  # value per defined feature (Proc result, first captured group, or
  # true/false for plain Regexp matches).
  def features(word)
    values = [word]

    @order.each{|feature|
      action = @types[feature]
      if action.is_a?(Proc)
        values.push(action.call(word))
      else
        m = action.match(word)
        if m
          if m[1]
            values.push(m[1])
          else
            values.push(m != nil)
          end
        else
          values.push(false)
        end
      end
    }
    values
  end

  # Renders the CRF++ template: one unigram entry per feature, extra
  # entries at the +window+ offsets for context features, and a final
  # bigram ("B") entry.
  def template(window = nil)
    window ||= @window || [1,-1]
    template = ""

    i = 1
    @order.each{|feat|
      template += "U#{ feat }: %x[0,#{ i }]\n"

      if @context.include?(feat)
        window.each{|p|
          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
        }
      end
      i += 1
    }

    template += "B\n"

    template
  end

  # Feature vectors for every token of +text+. When +positive+ is not
  # nil a label column is appended: 1 for the first token of a positive
  # chunk, 2 for the rest, and 0 for tokens of negative chunks.
  def text_features(text, positive = nil)
    text = self.class.reverse(text) if @reverse
    initial = true
    self.class.tokens(text).collect{|token|
      features = features(token)
      if !positive.nil?
        features << (positive ? (initial ? 1 : 2) : 0)
        initial = false
      end
      features
    }
  end

  # Labeled features for training: chunks of +text+ matching one of the
  # +mentions+ are tagged positive, the rest negative.
  def tagged_features(text, mentions)
    mentions ||= []
    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
    re = mentions.collect{|mention|
      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
    }.join("|")

    positive = false
    features = []
    chunks = text.split(/(#{re})/)
    chunks.each{|t|
      chunk_features = text_features(t, positive)
      positive = !positive
      if @reverse
        features = chunk_features + features
      else
        features = features + chunk_features
      end
    }
    features
  end

  # Trains a CRF++ model from a +features+ file, writing the model to
  # +model+ and the DSL configuration next to it ("<model>.config").
  def train(features, model)
    tmp_template = TmpFile.tmp_file("template-")
    Open.write(tmp_template, template)

    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
    system cmd
    Open.write(model + '.config', config)
    FileUtils.rm tmp_template
  end

end
162
+
163
class NER

  # Loads the CRFPP ruby bindings (system-wide copy first, falling back
  # to the one bundled under Rbbt.datadir) and a trained model.
  # +model+ defaults to the BioCreative 2 model shipped with rbbt.
  def initialize(model = nil)
    begin
      require 'CRFPP'
    rescue ScriptError, StandardError
      # Fall back to the bundled bindings. (The original rescued
      # Exception, which would also swallow signals and SystemExit.)
      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
    end

    # NOTE(review): the original read
    #   File.join(Rbbt.datadir, + 'ner/model/BC2')
    # the stray unary '+' raises NoMethodError on Ruby < 2.3.
    model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

    @parser = NERFeatures.new(model + '.config')
    @reverse = @parser.reverse
    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
  end

  # Runs the tagger over +text+ and returns the recognized mentions as
  # strings (token lists are un-reversed for reverse-direction models).
  def extract(text)
    features = @parser.text_features(text)

    @tagger.clear
    features.each{|feats|
      @tagger.add(feats.join(" "))
    }

    @tagger.parse

    found = []
    mention = []

    @tagger.size.times{|i|
      label = @tagger.y(i)
      word = @tagger.x(i,0)

      # Keep a closing parenthesis only when the mention opened one.
      if word == ')'
        mention.push(')') if mention.join =~ /\(/
        next
      end

      case label
      when 1 # first token of a mention
        # Flush the previous mention if it contains "special" tokens
        # (String#is_special? comes from rbbt/util/misc).
        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
          found.push(mention)
          mention = []
        end
        mention.push(word)
      when 2 # continuation of the current mention
        mention.push(word)
      when 0 # outside any mention
        found.push(mention) if mention.any?
        mention = []
      end
    }

    found << mention if mention.any?

    found.collect{|list|
      list = list.reverse if @reverse
      list.join(" ")
    }
  end

end
225
+
226
+
227
+
@@ -0,0 +1,80 @@
1
+ require 'rbbt/util/misc'
2
+ require 'rbbt/util/simpleDSL'
3
+
4
class CueIndex < SimpleDSL

  class LexiconMissingError < StandardError; end


  # DSL hook: every rule declared in the configuration is stored as a
  # [name, block] pair, in declaration order.
  def define(name, *args, &block)
    @rules << [name, block]
    nil
  end

  def initialize(file = nil, &block)
    @rules = []

    # Default to the bundled configuration when neither a file nor a
    # block is supplied.
    file = File.join(Rbbt.datadir, 'norm/config/cue_default.rb') if file.nil? && block.nil?

    super(:define, file, &block)
  end

  # Raw text of the parsed :define section (captured by SimpleDSL).
  def config
    @config[:define]
  end


  # Applies every rule to +word+ and returns one cue list per rule
  # (scalar results are wrapped in a one-element array).
  def cues(word)
    @rules.map do |_name, rule_block|
      result = rule_block.call(word)
      result.is_a?(Array) ? result : [result]
    end
  end

  # Drops over-populated cues: any cue pointing at more than +max+
  # candidate codes is removed from its index.
  def clean(max)
    @indexes.each do |index|
      oversized = index.select{|_cue, codes| codes.length > max}.collect{|cue, _codes| cue}
      oversized.each{|cue| index.delete(cue)}
    end
  end

  # Builds one inverted index (cue => codes) per rule from a lexicon
  # file of code/synonym lines, then prunes ambiguous cues.
  def load(file, max_candidates = 50)
    @indexes = Array.new(@rules.size){ Hash.new }
    data = Open.to_hash(file, :sep => "\t|\\|")
    data.each do |code, values_lists|
      values_lists.flatten.compact.uniq.each do |value|
        cues(value).each_with_index do |cue_list, i|
          cue_list.each do |cue|
            bucket = (@indexes[i][cue] ||= [])
            bucket << code unless bucket.include?(code)
          end
        end
      end
    end
    clean(max_candidates) if max_candidates
    nil
  end

  # Returns the candidate codes for +name+ from the first (most
  # specific) rule index that produces any hit; [] when nothing matches.
  def match(name)
    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes

    name_cues = cues(name)
    @indexes.each_with_index do |index, i|
      hits = name_cues[i].collect{|cue| index[cue]}.compact
      return hits.flatten if hits.any?
    end

    []
  end

end
@@ -0,0 +1,213 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/simpleDSL'
3
+ require 'rbbt/util/misc'
4
+ require 'set'
5
+
6
+
7
class Tokenizer < SimpleDSL
  #{{{ Classes for Comparisons

  @@ignore_case = true

  # Class-wide default for case handling: acts as a reader with no
  # argument and as a writer otherwise.
  def self.ignore_case(ignore = nil)
    if ignore.nil?
      return @@ignore_case
    else
      @@ignore_case = ignore
    end
  end


  # A weighted built-in comparison (same/diff/common/distinct/miss/extr)
  # between the token lists of a mention and a candidate name.
  class Operation

    def initialize(comparison)
      @comparison = comparison
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # DSL sugar: `same.token_type(weight)` records the token type this
    # operation applies to and its weight; returns self for chaining.
    def method_missing(name, *args, &bloc)
      @token = name.to_sym
      # NOTE(review): the original used `@value = *args.first`; that
      # splat wraps the weight in an Array under Ruby >= 1.9, breaking
      # the arithmetic in #eval.
      @value = args.first
      self
    end

    # Scores the comparison between two [token, type] lists: returns the
    # configured weight (scaled by the number of missing/extra tokens
    # for miss/extr) or 0 when the condition does not hold.
    def eval(list1, list2)
      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      value = 0
      # NOTE(review): the original wrote `when 'same':` etc.; the
      # trailing colon is Ruby 1.8-only syntax and no longer parses.
      case @comparison.to_s
      when 'same'
        value = @value if toks1 == toks2 && toks1.any?
      when 'diff'
        value = @value if toks1 != toks2
      when 'common'
        value = @value if toks1.to_set.intersection(toks2.to_set).length > 0
      when 'distinct'
        value = @value if toks1.to_set.intersection(toks2.to_set).length == 0
      when 'miss'
        missing = (toks1 - toks2)
        value = @value * missing.length if missing.length > 0
      when 'extr'
        extr = (toks2 - toks1)
        value = @value * extr.length if extr.length > 0
      end

      return value
    end
  end

  # A comparison whose score is computed by a user-supplied block.
  class Custom
    def initialize
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
      self # return self for chaining, consistent with Operation
    end

    def eval(list1, list2)
      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      @block.call(toks1, toks2)
    end
  end

  # Rewrites tokens of a given type through a user-supplied block before
  # comparison.
  class Transform
    def initialize
    end
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
      self
    end

    # Applies the block to a matching [token, type] pair; other tokens
    # pass through unchanged.
    # NOTE(review): the block receives only token[0] (the string) and
    # its return value replaces the whole pair, so it is presumably
    # expected to return a [token, type] pair itself — confirm against
    # the config files (norm/config/tokens_default.rb).
    def transform(token)
      if token[1] == @token
        token = @block.call(token[0])
      else
        token
      end
    end
  end


  #{{{ Metaprogramming hooks

  # DSL hook for the :tokens section: registers a token type backed by a
  # Proc or Regexp (defaulting to a /names?/i match on the type name).
  def define_tokens(name, *args, &block)
    # Plain assignment: the original's `*args[0]` splat would wrap the
    # action in an Array under Ruby >= 1.9 and always raise below.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_sym] = action
    @order.push name.to_sym

    name.to_sym
  end

  # DSL hook for the :comparisons section: builds the Operation, Custom
  # or Transform object that the chained DSL call will configure.
  def define_comparisons(name, *args, &block)
    o = nil
    case name.to_sym
    when :compare
      o = Custom.new
      @operations << o
    when :transform
      o = Transform.new
      @transforms << o
    else
      o = Operation.new(name)
      @operations << o
    end
    o
  end

  # SimpleDSL entry point: dispatches each top-level section (tokens,
  # comparisons) to the matching define_* hook above.
  def main(name, *args, &block)
    parse("define_" + name.to_s, block)
  end

  #{{{ Initialize
  def initialize(file = nil, &block)
    @types = {}
    @order = []
    @operations = []
    @transforms = []

    # Default to the bundled configuration when neither is supplied.
    file ||= File.join(Rbbt.datadir, 'norm/config/tokens_default.rb') if !file && !block
    super(:main, file, &block)
  end


  #{{{ Token Types
  # Alternation of the multi-letter greek letter names, lowercased;
  # the $greek table is populated by rbbt/util/misc.
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"

  # Splits a word into sub-tokens at number boundaries, case changes,
  # greek-letter prefixes/suffixes and separator characters.
  def tokenize(word)
    return word.
      gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
      gsub(/([a-z])([A-Z])/,'\1-\2').
      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
      gsub(/^(#{GREEK_RE})/,'\1-').
      gsub(/(#{GREEK_RE})$/,'-\1').
      split( /[^\w.]+/). # Split by separator char
      select{|t| !t.empty? }
  end


  # First declared token type whose action matches +token+, :unknown if
  # none does.
  def type(token)
    @order.each{|type|
      action = @types[type]
      if action.is_a? Proc
        return type if action.call(token)
      else
        return type if action.match(token)
      end
    }
    return :unknown
  end

  # [token, type] pairs for every sub-token of +word+.
  def token_types(word)
    tokenize(word).collect{|token|
      [token, type(token)]
    }
  end

  #{{{ Comparisons

  # Sum of every comparison's score over the two [token, type] lists.
  def evaluate_tokens(list1, list2)
    @operations.inject(0){|acc, o|
      acc + o.eval(list1, list2)
    }
  end

  # Tokenizes, transforms and scores a mention against a candidate name.
  def evaluate(mention, name)
    mention_tokens, name_tokens = [mention, name].collect{|n|
      token_types(n).collect{|t|
        @transforms.inject(t){|t, o|
          t = o.transform(t)
        }
      }
    }
    evaluate_tokens(mention_tokens, name_tokens)
  end
end
@@ -0,0 +1,142 @@
1
+ require 'rbbt'
2
+ require 'rbbt/ner/rnorm/cue_index'
3
+ require 'rbbt/ner/rnorm/tokens'
4
+ require 'rbbt/util/index'
5
+ require 'rbbt/util/open'
6
+ require 'rbbt/sources/entrez'
7
+
8
class Normalizer


  # Given a list of pairs of candidates along with their scores as
  # parameter +values+, and a minimum value for the scores. It returns
  # a list of pairs of the candidates that score the highest and that
  # score above the minimum. Otherwise it returns an empty list.
  def self.get_best(values, min)
    return [] if values.empty?
    best = values.collect{|p| p[1]}.max
    return [] if best < min
    values.select{|p| p[1] == best}
  end

  # Compares the tokens and gives each candidate a score based on the
  # commonalities and differences amongst the tokens. Candidates with no
  # synonyms, or whose synonyms contain no word character, are skipped
  # so that no nil score reaches get_best.
  def token_score(candidates, mention)
    candidates.collect{|code|
      next if @synonyms[code].nil?
      value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
        case
        when mention == name
          100
        when mention.downcase == name.downcase
          90
        when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
          80
        else
          @tokens.evaluate(mention, name)
        end
      }.max
      # NOTE(review): without this guard a candidate with only
      # non-word synonyms yielded [code, nil], crashing get_best.
      next if value.nil?
      [code, value]
    }.compact
  end

  # Order candidates with the number of words in common between the text
  # in their Entrez Gene entry and the text passed as parameter. Because
  # candidate genes might be in some other format than Entrez Gene Ids,
  # the +to_entrez+ variable can hold the way to translate between them,
  # been a Proc or a Hash.
  def entrez_score(candidates, text, to_entrez = nil)
    code2entrez = {}
    candidates.each{|code|
      if to_entrez.is_a? Proc
        entrez = to_entrez.call(code)
      elsif to_entrez.is_a? Hash
        # NOTE(review): the original read @to_entrez here, silently
        # ignoring the Hash passed as argument.
        entrez = to_entrez[code]
      else
        entrez = code
      end
      code2entrez[code] = entrez unless entrez.nil?
    }

    # Get all at once, better performance
    genes = Entrez.get_gene(code2entrez.values)
    code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}

    code2entrez_genes.collect{|p|
      [p[0], Entrez.gene_text_similarity(p[1], text)]
    }
  end

  # Takes a list of candidate codes and selects the ones that have the
  # mention explicitly in their list of synonyms, and in the earliest
  # positions. This is based on the idea that synonym lists order their
  # synonyms by importance.
  def appearence_order(candidates, mention)
    positions = candidates.collect{|code|
      next unless @synonyms[code]
      pos = nil
      @synonyms[code].each_with_index{|list, i|
        next if pos
        pos = i if list.include? mention
      }
      pos
    }
    return nil if positions.compact.empty?
    # Only candidates with a known position can compete; the original
    # crashed sorting a mix of nil and Integer positions.
    scored = candidates.zip(positions).reject{|p| p[1].nil?}
    best = scored.min{|a, b| a[1] <=> b[1]}[1]
    scored.select{|p| p[1] == best}.collect{|p| p[0]}
  end



  def initialize(lexicon, options = {})
    @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)

    @index = CueIndex.new
    @index.load(lexicon, options[:max_candidates])

    @to_entrez = options[:to_entrez]
    @tokens = Tokenizer.new(options[:file])
  end

  def match(mention)
    @index.match(mention)
  end

  # Narrows +candidates+ down for a +mention+: token scores first, then
  # (if still ambiguous and +text+ is given) Entrez text similarity,
  # otherwise the synonym appearance order.
  def select(candidates, mention, text = nil, options = {})
    threshold = options[:threshold] || 0
    max_candidates = options[:max_candidates] || 200
    max_entrez = options[:max_entrez] || 10

    # Abort if too ambiguous
    return [] if candidates.empty?
    return [] if candidates.length > max_candidates

    scores = token_score(candidates, mention)
    best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}

    # Abort if too ambiguous
    return [] if best_codes.length > max_entrez

    if best_codes.length > 1 and text
      scores = entrez_score(best_codes, text, @to_entrez)

      Normalizer::get_best(scores, 0).collect{|p| p[0]}
    else
      orders = appearence_order(best_codes, mention)
      if orders
        orders
      else
        best_codes
      end
    end

  end

  def resolve(mention, text = nil, options = {})
    candidates = match(mention)
    select(candidates, mention, text, options)
  end

end
142
+