rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,143 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/ner/rnorm/cue_index'
3
- require 'rbbt/ner/rnorm/tokens'
4
- require 'rbbt/util/index'
5
- require 'rbbt/util/open'
6
- require 'rbbt/sources/entrez'
7
- require 'rbbt/bow/bow.rb'
8
-
9
# Resolves a gene mention to codes of a lexicon. Candidate codes come from a
# CueIndex built over the lexicon; they are then ranked by token similarity
# against their synonyms and, when still ambiguous, by similarity with the
# Entrez Gene text or by the position of the mention among the synonyms.
class Normalizer

  # Given a list of pairs of candidates along with their scores as
  # parameter +values+, and a minimum value +min+ for the scores, returns
  # the pairs that hold the highest score, provided that score is not below
  # +min+. Otherwise it returns an empty list.
  def self.get_best(values, min)
    return [] if values.empty?
    best = values.map { |pair| pair[1] }.max
    return [] if best < min
    values.select { |pair| pair[1] == best }
  end

  # Compares the tokens and gives each candidate a score based on the
  # commonalities and differences amongst the tokens. Returns a list of
  # [code, score] pairs, where the score is the best one over the synonyms
  # of the code. Codes that are missing from the lexicon, or that have no
  # synonym containing a word character, are left out: they have nothing to
  # score, and a nil score would break the comparisons made in +get_best+.
  def token_score(candidates, mention)
    candidates.map { |code|
      names = @synonyms[code]
      next if names.nil?

      value = names.select { |name| name =~ /\w/ }.
                    map { |name| name_score(mention, name) }.
                    max
      next if value.nil?

      [code, value]
    }.compact
  end

  # Order candidates with the number of words in common between the text
  # in their Entrez Gene entry and the text passed as parameter. Because
  # candidate genes might be in some other format than Entrez Gene Ids,
  # the +to_entrez+ parameter can hold the way to translate between them,
  # being a Proc or a Hash. Candidates that translate to nil are dropped.
  # Returns a list of [code, similarity] pairs.
  def entrez_score(candidates, text, to_entrez = nil)
    code2entrez = {}
    candidates.each { |code|
      entrez = case to_entrez
               when Proc then to_entrez.call(code)
               # Use the translation that was passed in, not @to_entrez
               when Hash then to_entrez[code]
               else code
               end
      code2entrez[code] = entrez unless entrez.nil?
    }

    # Get all at once, better performance
    genes = Entrez.get_gene(code2entrez.values)

    code2entrez.map { |code, entrez|
      [code, Entrez.gene_text_similarity(genes[entrez], text)]
    }
  end

  # Takes a list of candidate codes and selects the ones that have the
  # mention explicitly in their list of synonyms, and in the earliest
  # positions. This is based on the idea that synonym lists order their
  # synonyms by importance. Returns nil when no candidate has the mention.
  def appearence_order(candidates, mention)
    positions = candidates.map { |code|
      synonyms = @synonyms[code]
      next unless synonyms
      synonyms.index { |list| list.include? mention }
    }

    # Candidates without the mention have a nil position; they must not take
    # part in the comparison, since nil and Integer cannot be ordered.
    best = positions.compact.min
    return nil if best.nil?

    candidates.zip(positions).select { |pair| pair[1] == best }.map { |pair| pair[0] }
  end

  # Loads the synonyms and the cue index from the +lexicon+ file. Options
  # read here: :max_candidates (passed to CueIndex#load), :to_entrez (Proc or
  # Hash used by +select+ to translate codes) and :file (passed to
  # Tokenizer.new).
  def initialize(lexicon, options = {})
    @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)

    @index = CueIndex.new
    @index.load(lexicon, options[:max_candidates])

    @to_entrez = options[:to_entrez]
    @tokens = Tokenizer.new(options[:file])
  end

  # Returns the candidate codes the cue index finds for +mention+.
  def match(mention)
    @index.match(mention)
  end

  # Narrows +candidates+ down to the best codes for +mention+. Options:
  # :threshold (minimum token score, default 0), :max_candidates (default
  # 200) and :max_entrez (default 10). When several codes tie and +text+ is
  # given, ties are broken with +entrez_score+; otherwise with
  # +appearence_order+. Returns a list of codes, empty when too ambiguous.
  def select(candidates, mention, text = nil, options = {})
    threshold      = options[:threshold] || 0
    max_candidates = options[:max_candidates] || 200
    max_entrez     = options[:max_entrez] || 10

    # Abort if too ambiguous
    return [] if candidates.empty?
    return [] if candidates.length > max_candidates

    scores = token_score(candidates, mention)
    best_codes = Normalizer.get_best(scores, threshold).map { |pair| pair[0] }

    # Abort if too ambiguous
    return [] if best_codes.length > max_entrez

    if best_codes.length > 1 && text
      scores = entrez_score(best_codes, text, @to_entrez)
      Normalizer.get_best(scores, 0).map { |pair| pair[0] }
    else
      appearence_order(best_codes, mention) || best_codes
    end
  end

  # Finds the candidates for +mention+ and selects the best ones. See
  # +select+ for the meaning of +text+ and +options+.
  def resolve(mention, text = nil, options = {})
    candidates = match(mention)
    select(candidates, mention, text, options)
  end

  private

  # Scores a single synonym against the mention: 100 for an exact match, 90
  # ignoring case, 80 ignoring case and white space, and otherwise whatever
  # the tokenizer evaluates.
  def name_score(mention, name)
    if mention == name
      100
    elsif mention.downcase == name.downcase
      90
    elsif mention.downcase.gsub(/\s/, '') == name.downcase.gsub(/\s/, '')
      80
    else
      @tokens.evaluate(mention, name)
    end
  end

end
143
-
@@ -1,80 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/util/simpleDSL'
3
-
4
# Index of lexicon codes keyed by "cues" derived from their names. Each rule
# registered through the DSL turns a name into one cue or a list of cues, and
# there is one index (a Hash from cue to codes) per rule, in rule order.
class CueIndex < SimpleDSL

  class LexiconMissingError < StandardError; end

  # DSL hook: registers the block of a rule under +name+. Extra arguments are
  # ignored. Always returns nil.
  def define(name, *args, &block)
    @rules << [name, block]
    nil
  end

  # Reads the rules from +file+ or from the block. When neither is given the
  # default rules under Rbbt.datadir are used.
  def initialize(file = nil, &block)
    @rules = []

    unless file || block
      file = File.join(Rbbt.datadir,'norm/config/cue_default.rb')
    end

    super(:define, file, &block)
  end

  # The :define entry of @config.
  def config
    @config[:define]
  end

  # Applies every rule to +word+ and returns one list of cues per rule; a
  # rule result that is not an Array is wrapped in one.
  def cues(word)
    @rules.collect do |rule|
      produced = rule[1].call(word)
      produced.is_a?(Array) ? produced : [produced]
    end
  end

  # Drops from every index the cues that point to more than +max+ codes.
  def clean(max)
    @indexes.each do |index|
      crowded = index.keys.select { |cue| index[cue].length > max }
      crowded.each { |cue| index.delete(cue) }
    end
  end

  # Builds the indexes from the lexicon in +file+: every value of every code
  # is turned into cues and the code is listed once under each of them. When
  # +max_candidates+ is not nil or false the indexes are cleaned with it.
  # Returns nil.
  def load(file, max_candidates = 50)
    @indexes = Array.new(@rules.size) { Hash.new }

    lexicon = Open.to_hash(file, :sep => "\t|\\|")
    lexicon.each do |code, values_lists|
      values_lists.flatten.compact.uniq.each do |value|
        cues(value).each_with_index do |cue_list, i|
          index = @indexes[i]
          cue_list.each do |cue|
            index[cue] ||= []
            index[cue] << code unless index[cue].include? code
          end
        end
      end
    end

    clean(max_candidates) if max_candidates
    nil
  end

  # Returns the codes found for +name+ in the first index where any of its
  # cues is present, or an empty list when no index has them. Raises
  # LexiconMissingError when no lexicon has been loaded.
  def match(name)
    raise LexiconMissingError, "Load Lexicon before matching" unless @indexes

    name_cues = cues(name)
    @indexes.each_with_index do |index, i|
      hits = name_cues[i].collect { |cue| index[cue] }.compact
      return hits.flatten unless hits.empty?
    end

    []
  end

end
@@ -1,217 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/simpleDSL'
3
- require 'rbbt/util/misc'
4
- require 'set'
5
-
6
-
7
# Splits names into typed tokens and scores how similar two names are by
# running the comparisons configured through the DSL over their tokens.
class Tokenizer < SimpleDSL
  #{{{ Classes for Comparisons

  @@ignore_case = true

  # With no argument returns the default case sensitivity given to new
  # comparisons; with an argument sets it.
  def self.ignore_case(ignore = nil)
    return @@ignore_case if ignore.nil?
    @@ignore_case = ignore
  end

  # A built-in comparison ('same', 'diff', 'common', 'distinct', 'miss' or
  # 'extr') over the tokens of one type. The token type and the value it
  # contributes are captured by +method_missing+.
  class Operation

    def initialize(comparison)
      @comparison = comparison
      @ignore_case = Tokenizer::ignore_case
    end

    # Sets the case sensitivity of this comparison. Returns self.
    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Any other message names the token type; its first argument is the
    # value of the comparison. The argument is stored as given: a splat
    # assignment (<tt>@value = *x</tt>) wraps scalars in an Array on Ruby
    # 1.9 and later, which would break the sums done in +evaluate_tokens+.
    def method_missing(name, *args, &bloc)
      @token = name.to_sym
      @value = args.first
      self
    end

    # Compares the tokens of the configured type found in +list1+ and
    # +list2+ (lists of [token, type] pairs). Returns the configured value,
    # multiplied by the number of tokens for 'miss' and 'extr', when the
    # comparison holds and 0 otherwise.
    def eval(list1, list2)
      toks1 = typed_tokens(list1)
      toks2 = typed_tokens(list2)

      value = 0
      case @comparison.to_s
      when 'same'
        value = @value if toks1 == toks2 && toks1.any?
      when 'diff'
        value = @value if toks1 != toks2
      when 'common'
        value = @value unless (toks1 & toks2).empty?
      when 'distinct'
        value = @value if (toks1 & toks2).empty?
      when 'miss'
        # Tokens of the first list that are absent from the second
        missing = toks1 - toks2
        value = @value * missing.length if missing.length > 0
      when 'extr'
        # Tokens of the second list that are absent from the first
        extr = toks2 - toks1
        value = @value * extr.length if extr.length > 0
      end

      value
    end

    private

    # Texts of the tokens in +list+ that have the configured type,
    # downcased when ignoring case.
    def typed_tokens(list)
      list.select { |pair| pair[1] == @token }.
           map { |pair| @ignore_case ? pair[0].to_s.downcase : pair[0].to_s }
    end
  end

  # A comparison whose score is computed by a user supplied block, which
  # receives the token texts of the configured type from both lists.
  class Custom
    def initialize
      @ignore_case = Tokenizer::ignore_case
    end

    # Sets the case sensitivity of this comparison. Returns self.
    def ignore_case(ignore = true)
      @ignore_case = ignore
      self
    end

    # Any other message names the token type; its block is the comparison.
    # Returns self, like the sibling classes do.
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block
      self
    end

    # Calls the block with the token texts of the configured type found in
    # +list1+ and +list2+ and returns its result.
    def eval(list1, list2)
      toks1 = typed_tokens(list1)
      toks2 = typed_tokens(list2)

      @block.call(toks1, toks2)
    end

    private

    # Texts of the tokens in +list+ that have the configured type,
    # downcased when ignoring case.
    def typed_tokens(list)
      list.select { |pair| pair[1] == @token }.
           map { |pair| @ignore_case ? pair[0].to_s.downcase : pair[0].to_s }
    end
  end

  # Rewrites the tokens of one type before they are compared.
  class Transform
    def initialize
    end

    # Any message names the token type; the transformation is the block or,
    # when there is no block, the first argument. Returns self.
    def method_missing(name, *args, &block)
      @token = name.to_sym
      @block = block_given? ? block : args.first
      self
    end

    # Returns the result of calling the transformation with the text of
    # +token+ (a [text, type] pair) when it has the configured type, and
    # +token+ itself otherwise.
    def transform(token)
      if token[1] == @token
        @block.call(token[0])
      else
        token
      end
    end
  end


  #{{{ Metaprogramming hooks

  # DSL hook: declares the token type +name+, recognised by the first
  # argument, by the block or, by default, by a case insensitive regular
  # expression made of the name with an optional final "s". Types are tried
  # in the order they are declared. Raises "Wrong format" unless the
  # recogniser is a Proc or a Regexp.
  def define_tokens(name, *args, &block)
    # No splat here: on Ruby 1.9 and later it would wrap the action in an
    # Array and the check below would always fail.
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))

    @types[name.to_sym] = action
    @order.push name.to_sym

    name.to_sym
  end

  # DSL hook: creates and registers the object for the comparison +name+:
  # a Custom for :compare, a Transform for :transform and an Operation for
  # anything else. Returns it so that the DSL can configure it.
  def define_comparisons(name, *args, &block)
    o = nil
    case name.to_sym
    when :compare
      o = Custom.new
      @operations << o
    when :transform
      o = Transform.new
      @transforms << o
    else
      o = Operation.new(name)
      @operations << o
    end
    o
  end

  # DSL entry point: parses the block of the section +name+ with the
  # matching define_ hook.
  def main(name, *args, &block)
    parse("define_" + name.to_s,block)
  end

  #{{{ Initialize

  # Reads the configuration from +file+ or from the block. When neither is
  # given the default configuration under Rbbt.datadir is used.
  def initialize(file=nil, &block)
    @types = {}
    @order = []
    @operations = []
    @transforms = []

    file = File.join(Rbbt.datadir,'norm/config/tokens_default.rb') unless file || block
    super(:main, file, &block)
  end


  #{{{ Token Types
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"

  # Splits +word+ into a list of non-empty token strings.
  def tokenize(word)
    return word.
      gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
      gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
      gsub(/([a-z])([A-Z])/,'\1-\2').
      gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
      gsub(/^(#{GREEK_RE})/,'\1-').
      gsub(/(#{GREEK_RE})$/,'-\1').
      split( /[^\w.]+/). # Split by separator char
      select{|t| !t.empty? }
  end


  # Returns the first declared type whose recogniser accepts +token+, or
  # :unknown when none does.
  def type(token)
    @order.each{|type|
      action = @types[type]
      if action.is_a? Proc
        return type if action.call(token)
      else
        return type if action.match(token)
      end
    }
    return :unknown
  end

  # Returns the tokens of +word+ as a list of [token, type] pairs.
  def token_types(word)
    tokenize(word).collect{|token|
      [token, type(token)]
    }
  end

  #{{{ Comparisons

  # Adds up the results of every registered comparison over two lists of
  # [token, type] pairs.
  def evaluate_tokens(list1, list2)
    @operations.inject(0){| acc, o|
      acc + o.eval(list1, list2)
    }
  end

  # Scores the similarity between +mention+ and +name+: both are tokenized
  # and typed, every token goes through all the transforms in order, and
  # the resulting lists are compared with +evaluate_tokens+.
  def evaluate(mention, name)
    mention_tokens, name_tokens = [mention, name].collect{|text|
      token_types(text).collect{|token|
        @transforms.inject(token){|current, transform|
          transform.transform(current)
        }
      }
    }
    evaluate_tokens(mention_tokens, name_tokens)
  end
end
@@ -1,75 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
-
5
# Offers methods to help deal with the files distributed for the BioCreative
# competition related to Gene Mention and Normalization.
module Biocreative

  # Read the files regarding the dataset and return a hash with the entry codes
  # as keys and as values a hash with :text and the :mentions for that entry.
  # The files are looked up under Rbbt.datadir. Blank lines, and lines of the
  # text file that do not have a "code text" shape, are skipped.
  def self.BC2GM(dataset)
    data = {}

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line{|l|
      entry = l.chomp.match(/(.*?) (.*)/)
      next unless entry
      code, text = entry.values_at(1,2)
      data[code] = { :text => text }
    }

    Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line{|l|
      next if l.strip.empty?
      # The limit of 3 keeps any '|' inside the mention text intact
      code, _pos, mention = l.chomp.split(/\|/, 3)
      data[code] ||= {}
      data[code][:mentions] ||= []
      data[code][:mentions].push(mention)
    }

    data
  end

  # Given a string of text and a string with a mention, return positions for
  # that mention in the format used in the evaluation: a list of
  # [start, last] pairs of offsets counted over the text with its white space
  # removed. Runs of non-word characters in the mention match any run of
  # non-word characters in the text. A mention that can only be matched as an
  # empty string (for example an empty mention) has no positions.
  def self.position(text, mention)

    re = mention.gsub(/\W+/,' ')
    re = Regexp.quote(re)
    re = re.gsub(/\\ /) { '\W*' }
    re = '\(?' + re if mention =~ /\)/
    re = re + '\)?' if mention =~ /\(/
    re = "'?" + re + "'?" if mention =~ /'/

    # /m makes '.' match newlines, so that the text before a mention is
    # counted in full when the text spans several lines
    pattern = /(.*?)(#{re})(.*)/m

    positions = []

    offset = 0
    while (found = pattern.match(text))
      pre, matched, post = found.values_at(1,2,3)

      # An empty match consumes nothing: the rest of the text would stay the
      # same and the loop would never end
      break if matched.empty?

      start = offset + pre.gsub(/\s/,'').length
      last  = start + matched.gsub(/\s/,'').length - 1

      positions << [start, last]

      offset = last + 1
      text   = post
    end

    return positions
  end

  # Run the evaluation perl script of the +dataset+ over the +results+ file,
  # writing its output to +outfile+. Returns what Kernel#system returns.
  #
  # NOTE(review): the command goes through the shell and none of the
  # arguments are escaped; paths with spaces or shell metacharacters will
  # not work as expected.
  def self.BC2GM_eval(results, dataset, outfile)
    base = File.join(Rbbt.datadir, 'biocreative/BC2GM')

    cmd = [
      '/usr/bin/perl', File.join(base, 'alt_eval.perl'),
      '-gene', File.join(base, "#{dataset}/GENE.eval"),
      '-altgene', File.join(base, "#{dataset}/ALTGENE.eval"),
      results, '>', outfile
    ].join(' ')

    system cmd
  end

end
74
-
75
-