rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
@@ -1,143 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/ner/rnorm/cue_index'
3
- require 'rbbt/ner/rnorm/tokens'
4
- require 'rbbt/util/index'
5
- require 'rbbt/util/open'
6
- require 'rbbt/sources/entrez'
7
- require 'rbbt/bow/bow.rb'
8
-
9
- class Normalizer
10
-
11
-
12
- # Given a list of pairs of candidates along with their scores as
13
- # parameter +values+, and a minimum value for the scores. It returns
14
- # a list of pairs of the candidates that score the highest and that
15
- # score above the minimum. Otherwise it return an empty list.
16
- def self.get_best(values, min)
17
- return [] if values.empty?
18
- best = values.collect{|p| p[1]}.max
19
- return [] if best < min
20
- values.select{|p| p[1] == best}
21
- end
22
-
23
- # Compares the tokens and gives each candidate a score based on the
24
- # commonalities and differences amongst the tokens.
25
- def token_score(candidates, mention)
26
- candidates.collect{|code|
27
- next if @synonyms[code].nil?
28
- value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
29
- case
30
- when mention == name
31
- 100
32
- when mention.downcase == name.downcase
33
- 90
34
- when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
35
- 80
36
- else
37
- @tokens.evaluate(mention, name)
38
- end
39
- }.max
40
- [code, value]
41
- }.compact
42
- end
43
-
44
- # Order candidates with the number of words in common between the text
45
- # in their Entrez Gene entry and the text passed as parameter. Because
46
- # candidate genes might be in some other format than Entrez Gene Ids,
47
- # the +to_entrez+ variable can hold the way to translate between them,
48
- # been a Proc or a Hash.
49
- def entrez_score(candidates, text, to_entrez = nil)
50
- code2entrez = {}
51
- candidates.each{|code|
52
- if to_entrez.is_a? Proc
53
- entrez = to_entrez.call(code)
54
- elsif to_entrez.is_a? Hash
55
- entrez = @to_entrez[code]
56
- else
57
- entrez = code
58
- end
59
- code2entrez[code] = entrez unless entrez.nil?
60
- }
61
-
62
- # Get all at once, better performance
63
-
64
- genes = Entrez.get_gene(code2entrez.values)
65
- code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
66
-
67
- code2entrez_genes.collect{|p|
68
- [p[0], Entrez.gene_text_similarity(p[1], text)]
69
- }
70
- end
71
-
72
- # Takes a list of candidate codes and selects the ones that have the
73
- # mention explicitly in their list of synonyms, and in the earliest
74
- # positions. This is based on the idea that synonym list order their
75
- # synonyms by importance.
76
- def appearence_order(candidates, mention)
77
- positions = candidates.collect{|code|
78
- next unless @synonyms[code]
79
- pos = nil
80
- @synonyms[code].each_with_index{|list,i|
81
- next if pos
82
- pos = i if list.include? mention
83
- }
84
- pos
85
- }
86
- return nil if positions.compact.empty?
87
- best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
88
- candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
89
- end
90
-
91
-
92
-
93
- def initialize(lexicon, options = {})
94
- @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
95
-
96
- @index = CueIndex.new
97
- @index.load(lexicon, options[:max_candidates])
98
-
99
- @to_entrez = options[:to_entrez]
100
- @tokens = Tokenizer.new(options[:file])
101
- end
102
-
103
- def match(mention)
104
- @index.match(mention)
105
- end
106
-
107
- def select(candidates, mention, text = nil, options = {})
108
- threshold = options[:threshold] || 0
109
- max_candidates = options[:max_candidates] || 200
110
- max_entrez = options[:max_entrez] || 10
111
-
112
- # Abort if too ambigous
113
- return [] if candidates.empty?
114
- return [] if candidates.length > max_candidates
115
-
116
- scores = token_score(candidates, mention)
117
- best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
118
-
119
- # Abort if too ambigous
120
- return [] if best_codes.length > max_entrez
121
-
122
- if best_codes.length > 1 and text
123
- scores = entrez_score(best_codes, text, @to_entrez)
124
-
125
- Normalizer::get_best(scores, 0).collect{|p| p[0]}
126
- else
127
- orders = appearence_order(best_codes, mention)
128
- if orders
129
- orders
130
- else
131
- best_codes
132
- end
133
- end
134
-
135
- end
136
-
137
- def resolve(mention, text = nil, options = {})
138
- candidates = match(mention)
139
- select(candidates, mention, text, options)
140
- end
141
-
142
- end
143
-
@@ -1,80 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/util/simpleDSL'
3
-
4
# Builds per-rule inverted indexes ("cues") over a lexicon so that a
# mention can be matched to candidate codes. Rules are declared through
# the SimpleDSL +define+ hook; each rule turns a word into a list of
# cues, and earlier rules take precedence when matching.
class CueIndex < SimpleDSL

  class LexiconMissingError < StandardError; end

  # DSL hook: register a cue-extraction rule under +name+.
  def define(name, *args, &block)
    @rules << [name, block]
    nil
  end

  # Rules come from +file+, from +block+, or from the default config.
  def initialize(file = nil, &block)
    @rules = []

    file ||= File.join(Rbbt.datadir, 'norm/config/cue_default.rb') if !file && !block

    super(:define, file, &block)
  end

  # The parsed DSL source for the +define+ section.
  def config
    @config[:define]
  end

  # Apply every rule to +word+; each rule yields a list of cues
  # (scalar results are wrapped in a one-element list).
  def cues(word)
    @rules.collect do |rule|
      produced = rule[1].call(word)
      produced.is_a?(Array) ? produced : [produced]
    end
  end

  # Drop cues that map to more than +max+ codes: they are too
  # unspecific to narrow down candidates.
  def clean(max)
    @indexes.each do |index|
      overloaded = []
      index.each do |cue, codes|
        overloaded << cue if codes.length > max
      end
      overloaded.each do |cue|
        index.delete(cue)
      end
    end
  end

  # Index +file+ (a lexicon: code followed by synonyms, tab or '|'
  # separated). For every synonym, each rule's cues map back to the
  # code; afterwards cues above +max_candidates+ codes are discarded.
  def load(file, max_candidates = 50)
    @indexes = Array.new(@rules.size){Hash.new}
    data = Open.to_hash(file, :sep => "\t|\\|")
    data.each do |code, values_lists|
      values_lists.flatten.compact.uniq.each do |value|
        cues(value).each_with_index do |cue_list, i|
          cue_list.each do |cue|
            @indexes[i][cue] ||= []
            @indexes[i][cue] << code unless @indexes[i][cue].include?(code)
          end
        end
      end
    end
    clean(max_candidates) if max_candidates
    nil
  end

  # Candidate codes for +name+: the first rule (in declaration order)
  # whose cues hit the index wins; [] when nothing matches at all.
  def match(name)
    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes

    name_cues = cues(name)
    @indexes.each_with_index do |index, i|
      hits = name_cues[i].collect{|cue| index[cue]}.compact
      return hits.flatten if hits.any?
    end

    return []
  end

end
@@ -1,213 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/simpleDSL'
3
- require 'rbbt/util/misc'
4
- require 'set'
5
-
6
-
7
# Tokenizes entity names into typed tokens and scores the similarity of
# two names by a DSL-configured set of token comparisons/transforms.
class Tokenizer < SimpleDSL
  #{{{ Classes for Comparisons

  @@ignore_case = true

  # Class-wide default for case-insensitive token comparison. With no
  # argument returns the current setting; otherwise sets it.
  def self.ignore_case(ignore = nil)
    if ignore.nil?
      return @@ignore_case
    else
      @@ignore_case = ignore
    end
  end

  # A pairwise token comparison declared in the DSL ('same', 'diff',
  # 'common', 'distinct', 'miss', 'extr'). The token type and the score
  # weight are filled in via method_missing, e.g. `same.words(10)`.
  class Operation

    def initialize(comparison)
      @comparison = comparison
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Captures the token type (method name) and the weight (first arg).
    def method_missing(name, *args, &block)
      @token = name.to_sym
      # FIX: was `@value = *args.first`; on Ruby 1.9+ the splat wraps
      # the value in an Array, breaking `@value * n` arithmetic below.
      @value = args.first
      self
    end

    # Score contribution of this operation for the tokens of type
    # @token present in each list.
    def eval(list1, list2)
      toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      value = 0
      # NOTE: rewritten from Ruby 1.8-only `when 'same':` colon syntax,
      # which is a syntax error on Ruby 1.9+.
      case @comparison.to_s
      when 'same'
        if toks1 == toks2 && toks1.any?
          value = @value
        end
      when 'diff'
        if toks1 != toks2
          value = @value
        end
      when 'common'
        if toks1.to_set.intersection(toks2.to_set).length > 0
          value = @value
        end
      when 'distinct'
        if toks1.to_set.intersection(toks2.to_set).length == 0
          value = @value
        end
      when 'miss'
        missing = (toks1 - toks2)
        if missing.length > 0
          value = @value * missing.length
        end
      when 'extr'
        extr = (toks2 - toks1)
        if extr.length > 0
          value = @value * extr.length
        end
      end

      return value
    end
  end

  # A user-supplied comparison block over one token type (DSL `compare`).
  class Custom
    def initialize
      @ignore_case = Tokenizer::ignore_case
    end

    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Captures the token type and the comparison block.
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
    end

    def eval(list1, list2)
      toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
      toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}

      @block.call(toks1, toks2)
    end
  end

  # Rewrites tokens of a given type before comparison (DSL `transform`).
  class Transform
    def initialize
    end

    # Captures the token type and the transformation block.
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
      self
    end

    # Apply the block when the token matches the configured type;
    # otherwise return the token unchanged.
    def transform(token)
      if token[1] == @token
        token = @block.call(token[0])
      else
        token
      end
    end
  end

  #{{{ Metaprogramming hooks

  # DSL hook for the `tokens` section: registers a token type with its
  # recognizer (a Regexp or a Proc; defaults to /#{name}s?/i).
  def define_tokens(name, *args, &block)
    # FIX: was `action = *args[0] || ...`; on Ruby 1.9+ the splat wraps
    # the value in an Array, making the Proc/Regexp check always fail.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_sym] = action
    @order.push name.to_sym

    name.to_sym
  end

  # DSL hook for the `comparisons` section: `compare` builds a Custom,
  # `transform` a Transform, anything else a weighted Operation.
  def define_comparisons(name, *args, &block)
    o = nil
    case name.to_sym
    when :compare
      o = Custom.new
      @operations << o
    when :transform
      o = Transform.new
      @transforms << o
    else
      o = Operation.new(name)
      @operations << o
    end
    o
  end

  # Top-level DSL dispatcher: routes each section to define_tokens /
  # define_comparisons.
  def main(name, *args, &block)
    parse("define_" + name.to_s, block)
  end

  #{{{ Initialize

  # Configuration comes from +file+, +block+, or the default config.
  def initialize(file = nil, &block)
    @types = {}
    @order = []
    @operations = []
    @transforms = []

    file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
    super(:main, file, &block)
  end

  #{{{ Token Types

  # Alternation of the long Greek letter names; $greek is expected to be
  # populated by rbbt/util/misc before this class is loaded — TODO
  # confirm against that file.
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"

  # Split a word into sub-tokens, separating trailing roman numerals,
  # numbers, case changes, and Greek letter names.
  def tokenize(word)
    return word.
      gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
      gsub(/([a-z])([A-Z])/,'\1-\2').
      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
      gsub(/^(#{GREEK_RE})/,'\1-').
      gsub(/(#{GREEK_RE})$/,'-\1').
      split( /[^\w.]+/). # Split by separator char
      select{|t| !t.empty? }
  end

  # First declared token type whose recognizer accepts +token+,
  # :unknown otherwise.
  def type(token)
    @order.each{|type|
      action = @types[type]
      if action.is_a? Proc
        return type if action.call(token)
      else
        return type if action.match(token)
      end
    }
    return :unknown
  end

  # [token, type] pairs for every sub-token of +word+.
  def token_types(word)
    tokenize(word).collect{|token|
      [token, type(token)]
    }
  end

  #{{{ Comparisons

  # Sum of all operation scores over two typed token lists.
  def evaluate_tokens(list1, list2)
    @operations.inject(0){|acc, o|
      acc + o.eval(list1, list2)
    }
  end

  # Tokenize both strings, apply the configured transforms, and return
  # the combined comparison score.
  def evaluate(mention, name)
    mention_tokens, name_tokens = [mention, name].collect{|n|
      token_types(n).collect{|t|
        @transforms.inject(t){|t,o|
          t = o.transform(t)
        }
      }
    }
    evaluate_tokens(mention_tokens, name_tokens)
  end
end
@@ -1,75 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
-
5
- # Offers methods to help deal with the files distributed for the BioCreative
6
- # competition related to Gene Mention and Normalization.
7
# Offers methods to help deal with the files distributed for the BioCreative
# competition related to Gene Mention and Normalization.
module Biocreative

  # Read the files regarding the dataset and return a hash with the entry
  # codes as keys and as values a hash with :text and the :mentions for
  # that entry.
  def self.BC2GM(dataset)
    data = {}

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line{|l|
      code, text = l.chomp.match(/(.*?) (.*)/).values_at(1,2)
      data[code] = { :text => text }
    }

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line{|l|
      code, pos, mention = l.chomp.split(/\|/)
      data[code] ||= {}
      data[code][:mentions] ||= []
      data[code][:mentions].push(mention)
    }

    data
  end

  # Given a string of text and a string with a mention, return positions
  # for every occurrence of that mention in the format used in the
  # evaluation: [start, last] pairs counted over non-space characters.
  def self.position(text, mention)
    # Build a whitespace/punctuation-tolerant pattern for the mention.
    re = mention.gsub(/\W+/, ' ')
    re = Regexp.quote(re)
    re = re.gsub(/\\ /, '\W*')
    re = '\(?' + re if mention =~ /\)/
    re = re + '\)?' if mention =~ /\(/
    re = "'?" + re + "'?" if mention =~ /'/

    # FIX: use /m (dot matches newline) — the original /s is Ruby's
    # Windows-31J *encoding* flag, not Perl-style dotall. Compile once
    # instead of rebuilding and matching the regexp twice per iteration.
    pattern = /(.*?)(#{re})(.*)/m

    positions = []
    offset = 0
    while (m = text.match(pattern))
      pre, hit, post = m.values_at(1, 2, 3)

      # Offsets are counted over non-whitespace characters only, as the
      # evaluation expects.
      start = offset + pre.gsub(/\s/, '').length
      last = start + hit.gsub(/\s/, '').length - 1

      positions << [start, last]

      offset = last + 1
      text = post
    end

    return positions
  end

  # Run the evaluation perl script over +results+ for +dataset+ and
  # write the report to +outfile+.
  def self.BC2GM_eval(results, dataset, outfile)
    cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
 -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
 -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
 #{results} > #{outfile}"
    system cmd
  end

end
74
-
75
-