rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88):
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,143 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/ner/rnorm/cue_index'
3
- require 'rbbt/ner/rnorm/tokens'
4
- require 'rbbt/util/index'
5
- require 'rbbt/util/open'
6
- require 'rbbt/sources/entrez'
7
- require 'rbbt/bow/bow.rb'
8
-
9
- class Normalizer
10
-
11
-
12
- # Given a list of pairs of candidates along with their scores as
13
- # parameter +values+, and a minimum value for the scores. It returns
14
- # a list of pairs of the candidates that score the highest and that
15
- # score above the minimum. Otherwise it return an empty list.
16
- def self.get_best(values, min)
17
- return [] if values.empty?
18
- best = values.collect{|p| p[1]}.max
19
- return [] if best < min
20
- values.select{|p| p[1] == best}
21
- end
22
-
23
- # Compares the tokens and gives each candidate a score based on the
24
- # commonalities and differences amongst the tokens.
25
- def token_score(candidates, mention)
26
- candidates.collect{|code|
27
- next if @synonyms[code].nil?
28
- value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
29
- case
30
- when mention == name
31
- 100
32
- when mention.downcase == name.downcase
33
- 90
34
- when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
35
- 80
36
- else
37
- @tokens.evaluate(mention, name)
38
- end
39
- }.max
40
- [code, value]
41
- }.compact
42
- end
43
-
44
- # Order candidates with the number of words in common between the text
45
- # in their Entrez Gene entry and the text passed as parameter. Because
46
- # candidate genes might be in some other format than Entrez Gene Ids,
47
- # the +to_entrez+ variable can hold the way to translate between them,
48
- # been a Proc or a Hash.
49
- def entrez_score(candidates, text, to_entrez = nil)
50
- code2entrez = {}
51
- candidates.each{|code|
52
- if to_entrez.is_a? Proc
53
- entrez = to_entrez.call(code)
54
- elsif to_entrez.is_a? Hash
55
- entrez = @to_entrez[code]
56
- else
57
- entrez = code
58
- end
59
- code2entrez[code] = entrez unless entrez.nil?
60
- }
61
-
62
- # Get all at once, better performance
63
- genes = Entrez.get_gene(code2entrez.values)
64
-
65
- code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
66
-
67
- code2entrez_genes.collect{|p|
68
- [p[0], Entrez.gene_text_similarity(p[1], text)]
69
- }
70
- end
71
-
72
- # Takes a list of candidate codes and selects the ones that have the
73
- # mention explicitly in their list of synonyms, and in the earliest
74
- # positions. This is based on the idea that synonym list order their
75
- # synonyms by importance.
76
- def appearence_order(candidates, mention)
77
- positions = candidates.collect{|code|
78
- next unless @synonyms[code]
79
- pos = nil
80
- @synonyms[code].each_with_index{|list,i|
81
- next if pos
82
- pos = i if list.include? mention
83
- }
84
- pos
85
- }
86
- return nil if positions.compact.empty?
87
- best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
88
- candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
89
- end
90
-
91
-
92
-
93
- def initialize(lexicon, options = {})
94
- @synonyms = Open.to_hash(lexicon, :sep => "\t|\\|", :flatten => true)
95
-
96
- @index = CueIndex.new
97
- @index.load(lexicon, options[:max_candidates])
98
-
99
- @to_entrez = options[:to_entrez]
100
- @tokens = Tokenizer.new(options[:file])
101
- end
102
-
103
- def match(mention)
104
- @index.match(mention)
105
- end
106
-
107
- def select(candidates, mention, text = nil, options = {})
108
- threshold = options[:threshold] || 0
109
- max_candidates = options[:max_candidates] || 200
110
- max_entrez = options[:max_entrez] || 10
111
-
112
- # Abort if too ambigous
113
- return [] if candidates.empty?
114
- return [] if candidates.length > max_candidates
115
-
116
- scores = token_score(candidates, mention)
117
- best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
118
-
119
- # Abort if too ambigous
120
- return [] if best_codes.length > max_entrez
121
-
122
- if best_codes.length > 1 and text
123
- scores = entrez_score(best_codes, text, @to_entrez)
124
-
125
- Normalizer::get_best(scores, 0).collect{|p| p[0]}
126
- else
127
- orders = appearence_order(best_codes, mention)
128
- if orders
129
- orders
130
- else
131
- best_codes
132
- end
133
- end
134
-
135
- end
136
-
137
- def resolve(mention, text = nil, options = {})
138
- candidates = match(mention)
139
- select(candidates, mention, text, options)
140
- end
141
-
142
- end
143
-
@@ -1,80 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/util/simpleDSL'
3
-
4
- class CueIndex < SimpleDSL
5
-
6
- class LexiconMissingError < StandardError; end
7
-
8
-
9
- def define(name, *args, &block)
10
- @rules << [name,block]
11
- nil
12
- end
13
-
14
- def initialize(file = nil, &block)
15
- @rules = []
16
-
17
- file ||= File.join(Rbbt.datadir,'norm/config/cue_default.rb') if !file && !block
18
-
19
- super(:define, file, &block)
20
- end
21
-
22
- def config
23
- @config[:define]
24
- end
25
-
26
-
27
- def cues(word)
28
- @rules.collect{|rule|
29
- c = rule[1].call(word)
30
- c = [c] unless c.is_a? Array
31
- c
32
- }
33
- end
34
-
35
- def clean(max)
36
- @indexes.each{|index|
37
- remove = []
38
- index.each{|key,values|
39
- remove << key if values.length > max
40
- }
41
- remove.each{|key|
42
- index.delete(key)
43
- }
44
- }
45
- end
46
-
47
- def load(file, max_candidates = 50)
48
- @indexes = Array.new(@rules.size){Hash.new}
49
- data = Open.to_hash(file, :sep => "\t|\\|")
50
- data.each{|code, values_lists|
51
- values = values_lists.flatten.compact.uniq
52
- values.each{|value|
53
- cues(value).each_with_index{|cue_list,i|
54
- cue_list.each{|cue|
55
- @indexes[i][cue] ||= []
56
- @indexes[i][cue] << code unless @indexes[i][cue].include? code
57
- }
58
- }
59
- }
60
- }
61
- clean(max_candidates) if max_candidates
62
- nil
63
- end
64
-
65
- def match(name)
66
- raise LexiconMissingError, "Load Lexicon before matching" unless @indexes
67
-
68
- cues = cues(name)
69
- @indexes.each_with_index{|index,i|
70
- best = []
71
- cues[i].each{|cue|
72
- best << index[cue] if index[cue]
73
- }
74
- return best.flatten if best.any?
75
- }
76
-
77
- return []
78
- end
79
-
80
- end
@@ -1,217 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/simpleDSL'
3
- require 'rbbt/util/misc'
4
- require 'set'
5
-
6
-
7
- class Tokenizer < SimpleDSL
8
- #{{{ Classes for Comparisons
9
-
10
- @@ignore_case = true
11
-
12
- def self.ignore_case(ignore = nil)
13
- if ignore.nil?
14
- return @@ignore_case
15
- else
16
- @@ignore_case = ignore
17
- end
18
- end
19
-
20
-
21
- class Operation
22
-
23
- def initialize(comparison)
24
- @comparison = comparison
25
- @ignore_case = Tokenizer::ignore_case
26
- end
27
-
28
- def ignore_case(ignore = true)
29
- @ignore_case = ignore
30
- self
31
- end
32
-
33
- def method_missing(name, *args, &bloc)
34
- @token = name.to_sym
35
- @value = *args.first
36
- self
37
- end
38
-
39
- def eval(list1, list2)
40
- toks1 = list1.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
41
- toks2 = list2.select{|p| p[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
42
-
43
- value = 0
44
- case @comparison.to_s
45
- when 'same':
46
- if toks1 == toks2 && toks1.any?
47
- value = @value
48
- end
49
- when 'diff':
50
- if toks1 != toks2
51
- value = @value
52
- end
53
- when 'common':
54
- if toks1.to_set.intersection(toks2.to_set).length > 0
55
- value = @value
56
- end
57
- when 'distinct':
58
- if toks1.to_set.intersection(toks2.to_set).length == 0
59
- value = @value
60
- end
61
- when 'miss':
62
- missing = (toks1 - toks2)
63
- if missing.length > 0
64
- value = @value * missing.length
65
- end
66
- when 'extr':
67
- extr = (toks2 - toks1)
68
- if extr.length > 0
69
- value = @value * extr.length
70
- end
71
- end
72
-
73
- return value
74
- end
75
- end
76
-
77
- class Custom
78
- def initialize
79
- @ignore_case = Tokenizer::ignore_case
80
- end
81
-
82
- def ignore_case(ignore = true)
83
- @ignore_case = ignore
84
- self
85
- end
86
-
87
- def method_missing(name, *args, &block)
88
- @token = name.to_sym
89
- @block = block
90
- end
91
-
92
- def eval(list1, list2)
93
- toks1 = list1.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
94
- toks2 = list2.select{|t| t[1] == @token}.collect{|t| @ignore_case ? t[0].to_s.downcase : t[0].to_s}
95
-
96
- @block.call(toks1, toks2)
97
- end
98
- end
99
-
100
- class Transform
101
- def initialize
102
- end
103
- def method_missing(name, *args, &block)
104
- @token = name.to_sym
105
- if block_given?
106
- @block = block
107
- else
108
- @block = args.first
109
- end
110
- self
111
- end
112
-
113
- def transform(token)
114
- if token[1] == @token
115
- token = @block.call(token[0])
116
- else
117
- token
118
- end
119
- end
120
- end
121
-
122
-
123
- #{{{ Metaprogramming hooks
124
- def define_tokens(name, *args, &block)
125
- action = *args[0] || block || /#{name.to_s}s?/i
126
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
127
-
128
- @types[name.to_sym] = action
129
- @order.push name.to_sym
130
-
131
- name.to_sym
132
- end
133
-
134
- def define_comparisons(name, *args, &block)
135
- o = nil
136
- case name.to_sym
137
- when :compare
138
- o = Custom.new
139
- @operations << o
140
- when :transform
141
- o = Transform.new
142
- @transforms << o
143
- else
144
- o = Operation.new(name)
145
- @operations << o
146
- end
147
- o
148
- end
149
-
150
- def main(name, *args, &block)
151
- parse("define_" + name.to_s,block)
152
- end
153
-
154
- #{{{ Initialize
155
- def initialize(file=nil, &block)
156
- @types = {}
157
- @order = []
158
- @operations = []
159
- @transforms = []
160
-
161
- file ||= File.join(Rbbt.datadir,'norm/config/tokens_default.rb') if !file && !block
162
- super(:main, file, &block)
163
- end
164
-
165
-
166
- #{{{ Token Types
167
- GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
168
- def tokenize(word)
169
- return word.
170
- gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
171
- gsub(/(\d+[,.]?\d+|\d+)/,'|\1|'). # Separate number
172
- gsub(/([a-z])([A-Z])/,'\1-\2').
173
- gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
174
- gsub(/^(#{GREEK_RE})/,'\1-').
175
- gsub(/(#{GREEK_RE})$/,'-\1').
176
- split( /[^\w.]+/). # Split by separator char
177
- select{|t| !t.empty? }
178
- end
179
-
180
-
181
- def type(token)
182
- @order.each{|type|
183
- action = @types[type]
184
- if action.is_a? Proc
185
- return type if action.call(token)
186
- else
187
- return type if action.match(token)
188
- end
189
- }
190
- return :unknown
191
- end
192
-
193
- def token_types(word)
194
- tokenize(word).collect{|token|
195
- [token, type(token)]
196
- }
197
- end
198
-
199
- #{{{ Comparisons
200
-
201
- def evaluate_tokens(list1, list2)
202
- @operations.inject(0){| acc, o|
203
- acc + o.eval(list1, list2)
204
- }
205
- end
206
-
207
- def evaluate(mention, name)
208
- mention_tokens, name_tokens = [mention, name].collect{|n|
209
- token_types(n).collect{|t|
210
- @transforms.inject(t){|t,o|
211
- t = o.transform(t)
212
- }
213
- }
214
- }
215
- evaluate_tokens(mention_tokens, name_tokens)
216
- end
217
- end
@@ -1,75 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
-
5
- # Offers methods to help deal with the files distributed for the BioCreative
6
- # competition related to Gene Mention and Normalization.
7
- module Biocreative
8
-
9
- # Read the files regarding the dataset and return a hash with the entry codes
10
- # as keys and as values a hash with :text and the :mentions for that entry
11
- def self.BC2GM(dataset)
12
-
13
- data = {}
14
-
15
- Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line{|l|
16
- code, text = l.chomp.match(/(.*?) (.*)/).values_at(1,2)
17
- data[code] ={ :text => text }
18
- }
19
-
20
- Open.read(File.join(Rbbt.datadir,"biocreative/BC2GM/#{dataset}/GENE.eval")).each_line{|l|
21
- code, pos, mention = l.chomp.split(/\|/)
22
- data[code] ||= {}
23
- data[code][:mentions] ||= []
24
- data[code][:mentions].push(mention)
25
- }
26
-
27
-
28
- data
29
-
30
- end
31
-
32
- # Given a string of text and a string with a mention, return positions for
33
- # that mention in the format used in the evaluation.
34
- def self.position(text, mention)
35
-
36
- re = mention.gsub(/\W+/,' ')
37
- re = Regexp.quote(re)
38
- re = re.gsub(/\\ /,'\W*')
39
- re = '\(?' + re if mention =~ /\)/
40
- re = re + '\)?' if mention =~ /\(/
41
- re = "'?" + re + "'?" if mention =~ /'/
42
-
43
- positions = []
44
-
45
- offset = 0
46
- while text.match(/(.*?)(#{re})(.*)/s)
47
- pre, mention, post = text.match(/(.*?)(#{re})(.*)/s).values_at(1,2,3)
48
-
49
- start = offset + pre.gsub(/\s/,'').length
50
- last = offset + pre.gsub(/\s/,'').length + mention.gsub(/\s/,'').length - 1
51
-
52
- positions << [start, last]
53
-
54
- offset = last + 1
55
- text = post
56
- end
57
-
58
- return positions
59
- end
60
-
61
- # Run the evaluation perl script
62
- def self.BC2GM_eval(results, dataset, outfile)
63
-
64
-
65
- cmd = "/usr/bin/perl #{File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl')}\
66
- -gene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")}\
67
- -altgene #{File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval")}\
68
- #{results} > #{outfile}"
69
- system cmd
70
-
71
- end
72
-
73
- end
74
-
75
-