rbbt 1.1.7 → 2.0.0

Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
data/lib/rbbt/ner/rnorm.rb
@@ -1,143 +0,0 @@
- require 'rbbt'
- require 'rbbt/ner/rnorm/cue_index'
- require 'rbbt/ner/rnorm/tokens'
- require 'rbbt/util/index'
- require 'rbbt/util/open'
- require 'rbbt/sources/entrez'
- require 'rbbt/bow/bow.rb'
-
- class Normalizer
-
-   # Given a list of [candidate, score] pairs in +values+ and a minimum
-   # score +min+, returns the pairs that share the highest score, provided
-   # it is above the minimum; otherwise returns an empty list.
-   def self.get_best(values, min)
-     return [] if values.empty?
-     best = values.collect{|p| p[1]}.max
-     return [] if best < min
-     values.select{|p| p[1] == best}
-   end
-
-   # Compares the tokens and gives each candidate a score based on the
-   # commonalities and differences among the tokens.
-   def token_score(candidates, mention)
-     candidates.collect{|code|
-       next if @synonyms[code].nil?
-       value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
-         case
-         when mention == name
-           100
-         when mention.downcase == name.downcase
-           90
-         when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
-           80
-         else
-           @tokens.evaluate(mention, name)
-         end
-       }.max
-       [code, value]
-     }.compact
-   end
-
-   # Orders candidates by the number of words in common between the text
-   # of their Entrez Gene entry and the text passed as parameter. Because
-   # candidate genes might be in some format other than Entrez Gene ids,
-   # the +to_entrez+ parameter can hold the way to translate between them,
-   # either a Proc or a Hash.
-   def entrez_score(candidates, text, to_entrez = nil)
-     code2entrez = {}
-     candidates.each{|code|
-       if to_entrez.is_a? Proc
-         entrez = to_entrez.call(code)
-       elsif to_entrez.is_a? Hash
-         entrez = to_entrez[code]
-       else
-         entrez = code
-       end
-       code2entrez[code] = entrez unless entrez.nil?
-     }
-
-     # Fetch all genes at once, for better performance
-     genes = Entrez.get_gene(code2entrez.values)
-     code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
-
-     code2entrez_genes.collect{|p|
-       [p[0], Entrez.gene_text_similarity(p[1], text)]
-     }
-   end
-
-   # Takes a list of candidate codes and selects the ones that have the
-   # mention explicitly in their list of synonyms, in the earliest
-   # positions. This is based on the idea that synonym lists order their
-   # synonyms by importance.
-   def appearence_order(candidates, mention)
-     positions = candidates.collect{|code|
-       next unless @synonyms[code]
-       pos = nil
-       @synonyms[code].each_with_index{|list,i|
-         next if pos
-         pos = i if list.include? mention
-       }
-       pos
-     }
-     return nil if positions.compact.empty?
-     pairs = candidates.zip(positions).reject{|p| p[1].nil?}
-     best = pairs.sort{|a,b| a[1] <=> b[1]}.first[1]
-     pairs.select{|p| p[1] == best}.collect{|p| p[0]}
-   end
-
-   def initialize(lexicon, options = {})
-     @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
-
-     @index = CueIndex.new
-     @index.load(lexicon, options[:max_candidates])
-
-     @to_entrez = options[:to_entrez]
-     @tokens = Tokenizer.new(options[:file])
-   end
-
-   def match(mention)
-     @index.match(mention)
-   end
-
-   def select(candidates, mention, text = nil, options = {})
-     threshold = options[:threshold] || 0
-     max_candidates = options[:max_candidates] || 200
-     max_entrez = options[:max_entrez] || 10
-
-     # Abort if too ambiguous
-     return [] if candidates.empty?
-     return [] if candidates.length > max_candidates
-
-     scores = token_score(candidates, mention)
-     best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
-
-     # Abort if too ambiguous
-     return [] if best_codes.length > max_entrez
-
-     if best_codes.length > 1 and text
-       scores = entrez_score(best_codes, text, @to_entrez)
-
-       Normalizer::get_best(scores, 0).collect{|p| p[0]}
-     else
-       orders = appearence_order(best_codes, mention)
-       if orders
-         orders
-       else
-         best_codes
-       end
-     end
-   end
-
-   def resolve(mention, text = nil, options = {})
-     candidates = match(mention)
-     select(candidates, mention, text, options)
-   end
-
- end
-
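For reference, the removed Normalizer was driven through match/select, or the resolve shortcut that chains them. A minimal usage sketch, assuming a tab-separated lexicon file of the form code&lt;TAB&gt;synonym|synonym (the file path and mention are illustrative):

    require 'rbbt/ner/rnorm'

    # Build a normalizer over a lexicon; options mirror the removed code.
    normalizer = Normalizer.new('lexicon.tsv', :max_candidates => 20)

    # Resolve a mention; the optional document text lets entrez_score
    # break ties when several candidates share the best token score.
    codes = normalizer.resolve('p53', 'The tumor suppressor p53 regulates...')
    puts codes.inspect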
data/lib/rbbt/ner/rnorm/cue_index.rb
@@ -1,80 +0,0 @@
- require 'rbbt/util/misc'
- require 'rbbt/util/simpleDSL'
-
- class CueIndex < SimpleDSL
-
-   class LexiconMissingError < StandardError; end
-
-   def define(name, *args, &block)
-     @rules << [name, block]
-     nil
-   end
-
-   def initialize(file = nil, &block)
-     @rules = []
-
-     file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block
-
-     super(:define, file, &block)
-   end
-
-   def config
-     @config[:define]
-   end
-
-   def cues(word)
-     @rules.collect{|rule|
-       c = rule[1].call(word)
-       c = [c] unless c.is_a? Array
-       c
-     }
-   end
-
-   def clean(max)
-     @indexes.each{|index|
-       remove = []
-       index.each{|key,values|
-         remove << key if values.length > max
-       }
-       remove.each{|key|
-         index.delete(key)
-       }
-     }
-   end
-
-   def load(file, max_candidates = 50)
-     @indexes = Array.new(@rules.size){Hash.new}
-     data = Open.to_hash(file, :sep => "\t|\\|")
-     data.each{|code, values_lists|
-       values = values_lists.flatten.compact.uniq
-       values.each{|value|
-         cues(value).each_with_index{|cue_list,i|
-           cue_list.each{|cue|
-             @indexes[i][cue] ||= []
-             @indexes[i][cue] << code unless @indexes[i][cue].include? code
-           }
-         }
-       }
-     }
-     clean(max_candidates) if max_candidates
-     nil
-   end
-
-   def match(name)
-     raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
-
-     cues = cues(name)
-     @indexes.each_with_index{|index,i|
-       best = []
-       cues[i].each{|cue|
-         best << index[cue] if index[cue]
-       }
-       return best.flatten if best.any?
-     }
-
-     return []
-   end
-
- end
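CueIndex is what backs Normalizer#match above: each rule turns a synonym into progressively looser cues, one index per rule, and the first index level that produces any hit wins. A sketch of standalone use, assuming the same illustrative lexicon file as before:

    require 'rbbt/ner/rnorm/cue_index'

    index = CueIndex.new              # rules from norm/config/cue_default.rb
    index.load('lexicon.tsv', 50)     # drop cues shared by more than 50 codes
    candidates = index.match('IL-2')  # codes from the first level that matches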
data/lib/rbbt/ner/rnorm/tokens.rb
@@ -1,213 +0,0 @@
- require 'rbbt'
- require 'rbbt/util/simpleDSL'
- require 'rbbt/util/misc'
- require 'set'
-
- class Tokenizer < SimpleDSL
-   #{{{ Classes for Comparisons
-
-   @@ignore_case = true
-
-   def self.ignore_case(ignore = nil)
-     if ignore.nil?
-       return @@ignore_case
-     else
-       @@ignore_case = ignore
-     end
-   end
-
-   class Operation
-
-     def initialize(comparison)
-       @comparison = comparison
-       @ignore_case = Tokenizer::ignore_case
-     end
-
-     def ignore_case(ignore = true)
-       @ignore_case = ignore
-       self
-     end
-
-     def method_missing(name, *args, &block)
-       @token = name.to_sym
-       @value = args.first
-       self
-     end
-
-     def eval(list1, list2)
-       toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
-       toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
-
-       value = 0
-       case @comparison.to_s
-       when 'same'
-         if toks1 == toks2 && toks1.any?
-           value = @value
-         end
-       when 'diff'
-         if toks1 != toks2
-           value = @value
-         end
-       when 'common'
-         if toks1.to_set.intersection(toks2.to_set).length > 0
-           value = @value
-         end
-       when 'distinct'
-         if toks1.to_set.intersection(toks2.to_set).length == 0
-           value = @value
-         end
-       when 'miss'
-         missing = (toks1 - toks2)
-         if missing.length > 0
-           value = @value * missing.length
-         end
-       when 'extr'
-         extr = (toks2 - toks1)
-         if extr.length > 0
-           value = @value * extr.length
-         end
-       end
-
-       return value
-     end
-   end
-
-   class Custom
-     def initialize
-       @ignore_case = Tokenizer::ignore_case
-     end
-
-     def ignore_case(ignore = true)
-       @ignore_case = ignore
-       self
-     end
-
-     def method_missing(name, *args, &block)
-       @token = name.to_sym
-       @block = block
-     end
-
-     def eval(list1, list2)
-       toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
-       toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
-
-       @block.call(toks1, toks2)
-     end
-   end
-
-   class Transform
-     def initialize
-     end
-
-     def method_missing(name, *args, &block)
-       @token = name.to_sym
-       @block = block
-       self
-     end
-
-     def transform(token)
-       if token[1] == @token
-         token = @block.call(token[0])
-       else
-         token
-       end
-     end
-   end
-
-   #{{{ Metaprogramming hooks
-   def define_tokens(name, *args, &block)
-     action = args[0] || block || /#{name.to_s}s?/i
-     raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
-
-     @types[name.to_sym] = action
-     @order.push name.to_sym
-
-     name.to_sym
-   end
-
-   def define_comparisons(name, *args, &block)
-     o = nil
-     case name.to_sym
-     when :compare
-       o = Custom.new
-       @operations << o
-     when :transform
-       o = Transform.new
-       @transforms << o
-     else
-       o = Operation.new(name)
-       @operations << o
-     end
-     o
-   end
-
-   def main(name, *args, &block)
-     parse("define_" + name.to_s, block)
-   end
-
-   #{{{ Initialize
-   def initialize(file = nil, &block)
-     @types = {}
-     @order = []
-     @operations = []
-     @transforms = []
-
-     file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
-     super(:main, file, &block)
-   end
-
-   #{{{ Token Types
-   GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
-
-   def tokenize(word)
-     return word.
-       gsub(/([^IVX])I$/,'\1|I|').        # Separate trailing roman numeral
-       gsub(/(\d+[,.]?\d+|\d+)/,'|\1|').  # Separate numbers
-       gsub(/([a-z])([A-Z])/,'\1-\2').
-       gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
-       gsub(/^(#{GREEK_RE})/,'\1-').
-       gsub(/(#{GREEK_RE})$/,'-\1').
-       split(/[^\w.]+/).                  # Split by separator characters
-       select{|t| !t.empty? }
-   end
-
-   def type(token)
-     @order.each{|type|
-       action = @types[type]
-       if action.is_a? Proc
-         return type if action.call(token)
-       else
-         return type if action.match(token)
-       end
-     }
-     return :unknown
-   end
-
-   def token_types(word)
-     tokenize(word).collect{|token|
-       [token, type(token)]
-     }
-   end
-
-   #{{{ Comparisons
-
-   def evaluate_tokens(list1, list2)
-     @operations.inject(0){|acc, o|
-       acc + o.eval(list1, list2)
-     }
-   end
-
-   def evaluate(mention, name)
-     mention_tokens, name_tokens = [mention, name].collect{|n|
-       token_types(n).collect{|t|
-         @transforms.inject(t){|t,o|
-           t = o.transform(t)
-         }
-       }
-     }
-     evaluate_tokens(mention_tokens, name_tokens)
-   end
- end
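The Tokenizer folds the declared token types and comparison operations into a single similarity score, which token_score in rnorm.rb consumes as its fallback case. A minimal sketch (the names being compared are illustrative):

    require 'rbbt/ner/rnorm/tokens'

    tokens = Tokenizer.new   # config from norm/config/tokens_default.rb
    # Positive operations reward shared tokens; 'miss'/'extr' penalties
    # subtract per missing or extra token.
    score = tokens.evaluate('IL-2', 'interleukin 2')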
data/lib/rbbt/sources/biocreative.rb
@@ -1,75 +0,0 @@
- require 'rbbt'
- require 'rbbt/util/open'
-
- # Offers methods to help deal with the files distributed for the BioCreative
- # competition tasks on Gene Mention and Normalization.
- module Biocreative
-
-   # Read the files for the dataset and return a hash with the entry codes
-   # as keys, and as values a hash with the :text and the :mentions for
-   # that entry.
-   def self.BC2GM(dataset)
-     data = {}
-
-     Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line{|l|
-       code, text = l.chomp.match(/(.*?) (.*)/).values_at(1,2)
-       data[code] = { :text => text }
-     }
-
-     Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line{|l|
-       code, pos, mention = l.chomp.split(/\|/)
-       data[code] ||= {}
-       data[code][:mentions] ||= []
-       data[code][:mentions].push(mention)
-     }
-
-     data
-   end
-
-   # Given a string of text and a string with a mention, return positions
-   # for that mention in the format used in the evaluation.
-   def self.position(text, mention)
-     re = mention.gsub(/\W+/,' ')
-     re = Regexp.quote(re)
-     re = re.gsub(/\\ /,'\W*')
-     re = '\(?' + re if mention =~ /\)/
-     re = re + '\)?' if mention =~ /\(/
-     re = "'?" + re + "'?" if mention =~ /'/
-
-     positions = []
-
-     offset = 0
-     while text.match(/(.*?)(#{re})(.*)/s)
-       pre, mention, post = text.match(/(.*?)(#{re})(.*)/s).values_at(1,2,3)
-
-       start = offset + pre.gsub(/\s/,'').length
-       last  = offset + pre.gsub(/\s/,'').length + mention.gsub(/\s/,'').length - 1
-
-       positions << [start, last]
-
-       offset = last + 1
-       text = post
-     end
-
-     return positions
-   end
-
-   # Run the evaluation perl script
-   def self.BC2GM_eval(results, dataset, outfile)
-     cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
-     -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
-     -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
-     #{results} > #{outfile}"
-     system cmd
-   end
-
- end
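Note that position counts offsets over non-whitespace characters only, matching the BioCreative evaluation convention. A short sketch (the sentence is illustrative; BC2GM expects the dataset files under Rbbt.datadir):

    require 'rbbt/sources/biocreative'

    # Offsets skip whitespace: in 'The p53 gene...' the mention 'p53'
    # covers non-space characters 3 through 5.
    Biocreative.position('The p53 gene is mutated.', 'p53')  # => [[3, 5]]

    # Load a split: { code => { :text => ..., :mentions => [...] } }
    data = Biocreative.BC2GM('test')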