rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
data/lib/rbbt/ner/rner.rb
@@ -0,0 +1,227 @@
+ require 'rbbt'
+ require 'rbbt/util/open'
+ require 'rbbt/util/misc'
+ require 'rbbt/util/simpleDSL'
+
+ class NERFeatures < SimpleDSL
+   def self.tokens(text)
+     text.scan(/
+       \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+       \w-\w*|
+       \w+-[A-Z](?!\w)|
+       \w+|
+       [.,()\/\[\]{}'"+-]
+     /x)
+   end
+
+   def self.reverse(text)
+     tokens(text).reverse.join(" ")
+   end
+
+   def define(name, *args, &block)
+     action = *args[0] || block || /#{name.to_s}s?/i
+     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+     @types[name.to_s] = action
+     @order.push name.to_s
+
+     name.to_s
+   end
+
+   attr_accessor :reverse
+   def initialize(file = nil, reverse = false, &block)
+     @types = {}
+     @order = []
+     @context = []
+     @reverse = reverse
+
+     file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
+
+     super(:define, file, &block)
+   end
+
+   def config
+     @config[:define]
+   end
+
+   def window(positions)
+     @window = positions
+   end
+
+   def context(name, &block)
+     if name.is_a? Array
+       @context += name
+     else
+       @context.push name
+
+       # The block might be wrongly assigned to this function
+       # instead of the actual definition, fix that.
+       if block
+         @types[name] = block
+       end
+     end
+   end
+
+   def direction(dir)
+     if dir.to_sym == :reverse
+       @reverse = true
+     end
+   end
+
+   def features(word)
+     values = [word]
+
+     @order.each{|features|
+       action = @types[features]
+       if action.is_a?(Proc)
+         values.push(action.call(word))
+       else
+         m = action.match(word)
+         if m
+           if m[1]
+             values.push(m[1])
+           else
+             values.push(m != nil)
+           end
+         else
+           values.push(false)
+         end
+       end
+     }
+     values
+   end
+
+   def template(window = nil)
+     window ||= @window || [1,-1]
+     template = ""
+
+     i = 1
+     @order.each{|feat|
+       template += "U#{ feat }: %x[0,#{ i }]\n"
+
+       if @context.include?(feat)
+         window.each{|p|
+           template += "U#{ feat }##{ p }: %x[#{ p },#{ i }]\n"
+         }
+       end
+       i += 1
+     }
+
+     template += "B\n"
+
+     template
+   end
+
+
+   def text_features(text, positive = nil)
+     text = self.class.reverse(text) if @reverse
+     initial = true
+     self.class.tokens(text).collect{|token|
+       features = features(token)
+       if !positive.nil?
+         features << (positive ? (initial ? 1 : 2) : 0)
+         initial = false
+       end
+       features
+     }
+   end
+
+   def tagged_features(text, mentions)
+     mentions ||= []
+     mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+     re = mentions.collect{|mention|
+       Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+     }.join("|")
+
+     positive = false
+     features = []
+     chunks = text.split(/(#{re})/)
+     chunks.each{|t|
+       chunk_features = text_features(t, positive)
+       positive = !positive
+       if @reverse
+         features = chunk_features + features
+       else
+         features = features + chunk_features
+       end
+     }
+     features
+   end
+
+   def train(features, model)
+     tmp_template = TmpFile.tmp_file("template-")
+     Open.write(tmp_template, template)
+
+     cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
+     system cmd
+     Open.write(model + '.config', config)
+     FileUtils.rm tmp_template
+   end
+
+ end
+
+ class NER
+
+   def initialize(model = nil)
+     begin
+       require 'CRFPP'
+     rescue Exception
+       require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+     end
+
+     model ||= File.join(Rbbt.datadir, 'ner/model/BC2')
+
+     @parser = NERFeatures.new(model + '.config')
+     @reverse = @parser.reverse
+     @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+   end
+
+   def extract(text)
+     features = @parser.text_features(text)
+
+     @tagger.clear
+     features.each{|feats|
+       @tagger.add(feats.join(" "))
+     }
+
+     @tagger.parse
+
+     found = []
+     mention = []
+
+     @tagger.size.times{|i|
+       label = @tagger.y(i)
+       word = @tagger.x(i,0)
+
+       if word == ')'
+         mention.push(')') if mention.join =~ /\(/
+         next
+       end
+
+       case label
+       when 1
+         if mention.any? && (mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+           found.push(mention)
+           mention = []
+         end
+         mention.push(word)
+       when 2
+         mention.push(word)
+       when 0
+         found.push(mention) if mention.any?
+         mention = []
+       end
+     }
+
+     found << mention if mention.any?
+
+     found.collect{|list|
+       list = list.reverse if @reverse
+       list.join(" ")
+     }
+   end
+
+ end
+
+
+
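For orientation, here is a minimal usage sketch of the two classes above (not part of the released files). It assumes an installed rbbt data directory with CRF++ under third_party/ and a trained model at the default ner/model/BC2 path; the input sentence is illustrative:

    require 'rbbt/ner/rner'

    # Load the default model and its feature configuration
    ner = NER.new
    mentions = ner.extract("The expression of p53 and BRCA1 was measured.")
    # => an array of mention strings, e.g. ["p53", "BRCA1"], depending on the model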
data/lib/rbbt/ner/rnorm/cue_index.rb
@@ -0,0 +1,80 @@
+ require 'rbbt/util/misc'
+ require 'rbbt/util/simpleDSL'
+
+ class CueIndex < SimpleDSL
+
+   class LexiconMissingError < StandardError; end
+
+
+   def define(name, *args, &block)
+     @rules << [name, block]
+     nil
+   end
+
+   def initialize(file = nil, &block)
+     @rules = []
+
+     file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block
+
+     super(:define, file, &block)
+   end
+
+   def config
+     @config[:define]
+   end
+
+
+   def cues(word)
+     @rules.collect{|rule|
+       c = rule[1].call(word)
+       c = [c] unless c.is_a? Array
+       c
+     }
+   end
+
+   def clean(max)
+     @indexes.each{|index|
+       remove = []
+       index.each{|key,values|
+         remove << key if values.length > max
+       }
+       remove.each{|key|
+         index.delete(key)
+       }
+     }
+   end
+
+   def load(file, max_candidates = 50)
+     @indexes = Array.new(@rules.size){Hash.new}
+     data = Open.to_hash(file, :sep => "\t|\\|")
+     data.each{|code, values_lists|
+       values = values_lists.flatten.compact.uniq
+       values.each{|value|
+         cues(value).each_with_index{|cue_list,i|
+           cue_list.each{|cue|
+             @indexes[i][cue] ||= []
+             @indexes[i][cue] << code unless @indexes[i][cue].include? code
+           }
+         }
+       }
+     }
+     clean(max_candidates) if max_candidates
+     nil
+   end
+
+   def match(name)
+     raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
+
+     cues = cues(name)
+     @indexes.each_with_index{|index,i|
+       best = []
+       cues[i].each{|cue|
+         best << index[cue] if index[cue]
+       }
+       return best.flatten if best.any?
+     }
+
+     return []
+   end
+
+ end
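A short sketch of how CueIndex is driven (not part of the released files; the lexicon path is hypothetical). Per the :sep pattern passed to Open.to_hash, load expects a tab-separated lexicon whose value fields may additionally be '|'-separated:

    require 'rbbt/ner/rnorm/cue_index'

    index = CueIndex.new               # cue rules from norm/config/cue_default.rb
    index.load('/path/to/lexicon')     # builds one index per cue rule
    index.match('p53')                 # => codes from the first rule with hits, else []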
data/lib/rbbt/ner/rnorm/tokens.rb
@@ -0,0 +1,213 @@
+ require 'rbbt'
+ require 'rbbt/util/simpleDSL'
+ require 'rbbt/util/misc'
+ require 'set'
+
+
+ class Tokenizer < SimpleDSL
+   #{{{ Classes for Comparisons
+
+   @@ignore_case = true
+
+   def self.ignore_case(ignore = nil)
+     if ignore.nil?
+       return @@ignore_case
+     else
+       @@ignore_case = ignore
+     end
+   end
+
+
+   class Operation
+
+     def initialize(comparison)
+       @comparison = comparison
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &bloc)
+       @token = name.to_sym
+       @value = *args.first
+       self
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       value = 0
+       case @comparison.to_s
+       when 'same'
+         if toks1 == toks2 && toks1.any?
+           value = @value
+         end
+       when 'diff'
+         if toks1 != toks2
+           value = @value
+         end
+       when 'common'
+         if toks1.to_set.intersection(toks2.to_set).length > 0
+           value = @value
+         end
+       when 'distinct'
+         if toks1.to_set.intersection(toks2.to_set).length == 0
+           value = @value
+         end
+       when 'miss'
+         missing = (toks1 - toks2)
+         if missing.length > 0
+           value = @value * missing.length
+         end
+       when 'extr'
+         extr = (toks2 - toks1)
+         if extr.length > 0
+           value = @value * extr.length
+         end
+       end
+
+       return value
+     end
+   end
+
+   class Custom
+     def initialize
+       @ignore_case = Tokenizer::ignore_case
+     end
+
+     def ignore_case(ignore = true)
+       @ignore_case = ignore
+       self
+     end
+
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @block = block
+     end
+
+     def eval(list1, list2)
+       toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+       toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
+
+       @block.call(toks1, toks2)
+     end
+   end
+
+   class Transform
+     def initialize
+     end
+     def method_missing(name, *args, &block)
+       @token = name.to_sym
+       @block = block
+       self
+     end
+
+     def transform(token)
+       if token[1] == @token
+         token = @block.call(token[0])
+       else
+         token
+       end
+     end
+   end
+
+
+   #{{{ Metaprogramming hooks
+   def define_tokens(name, *args, &block)
+     action = *args[0] || block || /#{name.to_s}s?/i
+     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+
+     @types[name.to_sym] = action
+     @order.push name.to_sym
+
+     name.to_sym
+   end
+
+   def define_comparisons(name, *args, &block)
+     o = nil
+     case name.to_sym
+     when :compare
+       o = Custom.new
+       @operations << o
+     when :transform
+       o = Transform.new
+       @transforms << o
+     else
+       o = Operation.new(name)
+       @operations << o
+     end
+     o
+   end
+
+   def main(name, *args, &block)
+     parse("define_" + name.to_s, block)
+   end
+
+   #{{{ Initialize
+   def initialize(file = nil, &block)
+     @types = {}
+     @order = []
+     @operations = []
+     @transforms = []
+
+     file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
+     super(:main, file, &block)
+   end
+
+
+   #{{{ Token Types
+   GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+   def tokenize(word)
+     return word.
+       gsub(/([^IVX])I$/,'\1|I|').         # Separate last roman number
+       gsub(/(\d+[,.]?\d+|\d+)/,'|\1|').   # Separate number
+       gsub(/([a-z])([A-Z])/,'\1-\2').
+       gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
+       gsub(/^(#{GREEK_RE})/,'\1-').
+       gsub(/(#{GREEK_RE})$/,'-\1').
+       split(/[^\w.]+/).                   # Split by separator char
+       select{|t| !t.empty? }
+   end
+
+
+   def type(token)
+     @order.each{|type|
+       action = @types[type]
+       if action.is_a? Proc
+         return type if action.call(token)
+       else
+         return type if action.match(token)
+       end
+     }
+     return :unknown
+   end
+
+   def token_types(word)
+     tokenize(word).collect{|token|
+       [token, type(token)]
+     }
+   end
+
+   #{{{ Comparisons
+
+   def evaluate_tokens(list1, list2)
+     @operations.inject(0){|acc, o|
+       acc + o.eval(list1, list2)
+     }
+   end
+
+   def evaluate(mention, name)
+     mention_tokens, name_tokens = [mention, name].collect{|n|
+       token_types(n).collect{|t|
+         @transforms.inject(t){|t,o|
+           t = o.transform(t)
+         }
+       }
+     }
+     evaluate_tokens(mention_tokens, name_tokens)
+   end
+ end
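A brief sketch of the Tokenizer API above (not part of the released files; the example strings are illustrative, and the resulting types and scores depend on the rules in norm/config/tokens_default.rb):

    require 'rbbt/ner/rnorm/tokens'

    tok = Tokenizer.new
    tok.token_types("IL-2R alpha")            # => [[token, type], ...] pairs
    tok.evaluate("IL-2R alpha", "IL2R alpha") # => summed score of the configured comparisons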
data/lib/rbbt/ner/rnorm.rb
@@ -0,0 +1,142 @@
+ require 'rbbt'
+ require 'rbbt/ner/rnorm/cue_index'
+ require 'rbbt/ner/rnorm/tokens'
+ require 'rbbt/util/index'
+ require 'rbbt/util/open'
+ require 'rbbt/sources/entrez'
+
+ class Normalizer
+
+
+   # Given a list of [candidate, score] pairs in +values+ and a minimum
+   # score +min+, returns the pairs that share the highest score, provided
+   # that score is above the minimum; otherwise returns an empty list.
+   def self.get_best(values, min)
+     return [] if values.empty?
+     best = values.collect{|p| p[1]}.max
+     return [] if best < min
+     values.select{|p| p[1] == best}
+   end
+
+   # Compares the tokens and gives each candidate a score based on the
+   # commonalities and differences among the tokens.
+   def token_score(candidates, mention)
+     candidates.collect{|code|
+       next if @synonyms[code].nil?
+       value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
+         case
+         when mention == name
+           100
+         when mention.downcase == name.downcase
+           90
+         when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
+           80
+         else
+           @tokens.evaluate(mention, name)
+         end
+       }.max
+       [code, value]
+     }.compact
+   end
+
+   # Orders candidates by the number of words shared between the text of
+   # their Entrez Gene entry and the text passed as a parameter. Because
+   # candidate genes might use a format other than Entrez Gene ids, the
+   # +to_entrez+ parameter can hold the translation between them, either
+   # as a Proc or as a Hash.
+   def entrez_score(candidates, text, to_entrez = nil)
+     code2entrez = {}
+     candidates.each{|code|
+       if to_entrez.is_a? Proc
+         entrez = to_entrez.call(code)
+       elsif to_entrez.is_a? Hash
+         entrez = to_entrez[code]
+       else
+         entrez = code
+       end
+       code2entrez[code] = entrez unless entrez.nil?
+     }
+
+     # Get all at once, better performance
+     genes = Entrez.get_gene(code2entrez.values)
+     code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
+
+     code2entrez_genes.collect{|p|
+       [p[0], Entrez.gene_text_similarity(p[1], text)]
+     }
+   end
+
+   # Takes a list of candidate codes and selects the ones that contain the
+   # mention explicitly in their list of synonyms, at the earliest
+   # positions. This is based on the idea that synonym lists order their
+   # entries by importance.
+   def appearence_order(candidates, mention)
+     positions = candidates.collect{|code|
+       next unless @synonyms[code]
+       pos = nil
+       @synonyms[code].each_with_index{|list,i|
+         next if pos
+         pos = i if list.include? mention
+       }
+       pos
+     }
+     return nil if positions.compact.empty?
+     best = candidates.zip(positions).reject{|p| p[1].nil?}.sort{|a,b| a[1] <=> b[1]}.first[1] # nil positions cannot be sorted
+     candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
+   end
+
+
+
+   def initialize(lexicon, options = {})
+     @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
+
+     @index = CueIndex.new
+     @index.load(lexicon, options[:max_candidates])
+
+     @to_entrez = options[:to_entrez]
+     @tokens = Tokenizer.new(options[:file])
+   end
+
+   def match(mention)
+     @index.match(mention)
+   end
+
+   def select(candidates, mention, text = nil, options = {})
+     threshold = options[:threshold] || 0
+     max_candidates = options[:max_candidates] || 200
+     max_entrez = options[:max_entrez] || 10
+
+     # Abort if too ambiguous
+     return [] if candidates.empty?
+     return [] if candidates.length > max_candidates
+
+     scores = token_score(candidates, mention)
+     best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
+
+     # Abort if too ambiguous
+     return [] if best_codes.length > max_entrez
+
+     if best_codes.length > 1 and text
+       scores = entrez_score(best_codes, text, @to_entrez)
+
+       Normalizer::get_best(scores, 0).collect{|p| p[0]}
+     else
+       orders = appearence_order(best_codes, mention)
+       if orders
+         orders
+       else
+         best_codes
+       end
+     end
+
+   end
+
+   def resolve(mention, text = nil, options = {})
+     candidates = match(mention)
+     select(candidates, mention, text, options)
+   end
+
+ end
+
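Finally, a minimal end-to-end sketch tying the pieces together (not part of the released files; the lexicon path is hypothetical, and some_abstract_text stands for any String of context, used only to break ties through Entrez):

    require 'rbbt/ner/rnorm'

    norm = Normalizer.new('/path/to/lexicon')
    norm.resolve('p53')                     # match + select with default options
    norm.resolve('p53', some_abstract_text) # ties broken by entrez_score
    # => candidate codes, or [] when the mention is unmatched or too ambiguous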