rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,34 +0,0 @@
1
- require 'rbbt'
2
- require 'rjb'
3
-
4
- # Offers a Ruby interface to the Abner Named Entity Recognition Package
5
- # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
6
- class Abner
7
-
8
- @@JFile = Rjb::import('java.io.File')
9
- @@Tagger = Rjb::import('abner.Tagger')
10
- @@Trainer = Rjb::import('abner.Trainer')
11
-
12
- # If modelfile is present a custom trained model can be used,
13
- # otherwise, the default BioCreative model is used.
14
- def initialize(modelfile=nil)
15
- if modelfile == nil
16
- @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
17
- else
18
- @tagger = @@Tagger.new(@@JFile.new(modelfile))
19
- end
20
- end
21
-
22
- # Given a chunk of text, it finds all the mentions appearing in it. It
23
- # returns all the mentions found, regardless of type, to be coherent
24
- # with the rest of NER packages in Rbbt.
25
- def extract(text)
26
-
27
- res = @tagger.getEntities(text)
28
- types = res[1]
29
- strings = res[0]
30
-
31
- return strings.collect{|s| s.to_s}
32
- end
33
-
34
- end
@@ -1,73 +0,0 @@
1
- require 'rbbt'
2
- require 'rjb'
3
-
4
- # Offers a Ruby interface to the Banner Named Entity Recognition Package
5
- # in Java. Banner[http://banner.sourceforge.net/].
6
- class Banner
7
-
8
-
9
- @@JFile = Rjb::import('java.io.File')
10
- @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
11
- @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
12
- @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
13
- @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
14
- @@Sentence = Rjb::import('banner.Sentence')
15
- @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
16
-
17
-
18
-
19
- # The parameters are set to default values, the only one that one
20
- # might want to change is the modelfile to point to a custom trained
21
- # one.
22
- def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
23
- lemmadir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
24
- taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger')
25
- )
26
-
27
- @tokenizer = @@SimpleTokenizer.new
28
-
29
- model = @@JFile.new(modelfile)
30
- lemma = @@EngLemmatiser.new(lemmadir,false,true)
31
- helper = @@HeppleTagger.new(taggerdir)
32
-
33
- # The next lines are needed to avoid colisions with
34
- # metraprograming that could define load (activesupport in
35
- # particular :@ ). RJB seems to call java on method missing
36
- class << @@CRFTagger
37
- if method_defined? :load
38
- undef_method :load
39
- end
40
- end
41
-
42
- @tagger = @@CRFTagger.load( model, lemma, helper)
43
- @parenPP = @@ParenthesisPostProcessor.new()
44
- end
45
-
46
-
47
- # Returns an array with the mention found in the provided piece of
48
- # text.
49
- def extract(text)
50
- text.gsub!(/\n/,' ')
51
- text.gsub!(/\|/,'/') # Character | gives an error
52
- sentence = @@Sentence.new(text)
53
- @tokenizer.tokenize(sentence)
54
- @tagger.tag(sentence)
55
- @parenPP.postProcess(sentence)
56
- tagged = sentence.getSGML
57
-
58
- res = tagged.scan(/<GENE>.*?<\/GENE>/).
59
- collect{|r|
60
- r.match(/<GENE>(.*?)<\/GENE>/)
61
- mention = $1
62
- mention.sub!(/^\s*/,'')
63
- mention.sub!(/\s*$/,'')
64
- mention
65
- }
66
- res
67
- end
68
-
69
-
70
- end
71
-
72
-
73
-
@@ -1,98 +0,0 @@
1
- # This class loads a dictionary of codes with associated names, it then can
2
- # find those names in a string of text. It works word-wise.
3
- class DictionaryNER
4
-
5
- A_INT = "a"[0]
6
- DOWNCASE_OFFSET = "A"[0].bytes.first - "a"[0].bytes.first
7
-
8
- require 'rbbt/bow/bow'
9
- # Divides a string of text into words. A slash separates words, only if the
10
- # second one begins with a letter.
11
- def self.chunk(text)
12
- text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
13
- end
14
-
15
- # Simplify the text to widen the matches. Currently only downcases the keys
16
- def self.simplify(text)
17
- if text.length > 2 && text[0] < A_INT && text[1] > A_INT
18
- text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
19
- else
20
- return text
21
- end
22
- end
23
-
24
- # Given a dictionary structure, find the matches in the text.
25
- def self.match(dict, text) #:nodoc:
26
-
27
- if Array === text
28
- words = text
29
- else
30
- words = chunk(text)
31
- end
32
-
33
- result = {}
34
- words.each_with_index{|word, pos|
35
- key = simplify(word)
36
- next if dict[key].nil?
37
- dict[key].each{|entrie|
38
- case
39
- when String === entrie
40
- result[word] ||= []
41
- result[word] << entrie unless result[word].include? entrie
42
- when Hash === entrie
43
- rec_words = words[(pos + 1)..-1]
44
- rec_result = match(entrie, rec_words)
45
- rec_result.each{|rec_key, rec_list|
46
- composite_key = word + ' ' + rec_key
47
- result[composite_key] ||= []
48
- result[composite_key] += rec_list
49
- result[composite_key].uniq!
50
- }
51
- end
52
- }
53
- }
54
- result
55
- end
56
-
57
- # Add a name to a structure
58
- def self.add_name(dict, name, code)
59
- if Array === name
60
- words = name
61
- else
62
- words = chunk(name)
63
- end
64
-
65
- key = simplify(words.shift)
66
- if words.empty?
67
- dict[key] ||= []
68
- dict[key] << code unless dict[key].include? code
69
- else
70
- rec_dict = {}
71
- add_name(rec_dict, words , code)
72
- dict[key] ||= []
73
- dict[key] << rec_dict
74
- end
75
- end
76
-
77
- def self.load(dictionary)
78
- dict = {}
79
-
80
- dictionary = File.open(dictionary).read if File.exists? dictionary
81
-
82
- dictionary.each_line{|l|
83
- names = l.chomp.split(/\t/)
84
- code = names.shift
85
- names.each{|name| add_name(dict, name, code) }
86
- }
87
- dict
88
- end
89
-
90
- def initialize(dictionary)
91
- @dict = DictionaryNER.load(dictionary)
92
- end
93
-
94
- def match(text)
95
- DictionaryNER.match(@dict, text)
96
- end
97
-
98
- end
@@ -1,70 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/misc'
3
-
4
- class RegExpNER
5
-
6
- def self.match_re(text, res)
7
- res = [res] unless Array === res
8
-
9
- res.collect{|re|
10
- text.scan(re)
11
- }.flatten
12
- end
13
-
14
- def self.build_re_old(names, ignorecase=true)
15
- names.compact.select{|n| n != ""}.
16
- sort{|a,b| b.length <=> a.length}.
17
- collect{|n|
18
- re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
19
- }
20
- end
21
-
22
- def self.build_re(names, ignorecase=true)
23
- res = names.compact.select{|n| n != ""}.
24
- sort{|a,b| b.length <=> a.length}.
25
- collect{|n|
26
- Regexp.quote(n)
27
- }
28
-
29
- /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
30
- end
31
-
32
-
33
- def initialize(lexicon, options = {})
34
- options = {:flatten => true, :ignorecase => true, :stopwords => nil}.merge options
35
-
36
- options[:stopwords] = $stopwords if $stopwords && (options[:stopwords].nil? || options[:stopwords] == true)
37
- options[:stopwords] ||= []
38
-
39
- data = Open.to_hash(lexicon, options)
40
-
41
- @index = {}
42
- data.collect{|code, names|
43
- next if code.nil? || code == ""
44
- if options[:stopwords].any?
45
- names = names.select{|n|
46
- ! options[:stopwords].include?(options[:ignorecase] ? n.downcase : n)
47
- }
48
- end
49
- @index[code] = RegExpNER.build_re(names, options[:ignorecase])
50
- }
51
- end
52
-
53
- def match_hash(text)
54
- return {} if text.nil? || text == ""
55
- matches = {}
56
- @index.each{|code, re|
57
- RegExpNER.match_re(text, re).each{|match|
58
- matches[code] ||= []
59
- matches[code] << match
60
- }
61
- }
62
- matches
63
- end
64
-
65
- def match(text)
66
- match_hash(text)
67
- end
68
-
69
- end
70
-
@@ -1,227 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/misc'
4
- require 'rbbt/util/simpleDSL'
5
-
6
- class NERFeatures < SimpleDSL
7
- def self.tokens(text)
8
- text.scan(/
9
- \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
10
- \w-\w*|
11
- \w+-[A-Z](?!\w)|
12
- \w+|
13
- [.,()\/\[\]{}'"+-]
14
- /x)
15
- end
16
-
17
- def self.reverse(text)
18
- tokens(text).reverse.join(" ")
19
- end
20
-
21
- def define(name, *args, &block)
22
- action = *args[0] || block || /#{name.to_s}s?/i
23
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
24
-
25
- @types[name.to_s] = action
26
- @order.push name.to_s
27
-
28
- name.to_s
29
- end
30
-
31
- attr_accessor :reverse
32
- def initialize(file = nil, reverse = false, &block)
33
- @types = {}
34
- @order = []
35
- @context = []
36
- @reverse = reverse
37
-
38
- file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
39
-
40
- super(:define,file, &block)
41
- end
42
-
43
- def config
44
- @config[:define]
45
- end
46
-
47
- def window(positions)
48
- @window = positions
49
- end
50
-
51
- def context(name, &block)
52
- if name.is_a? Array
53
- @context += name
54
- else
55
- @context.push name
56
-
57
- # The block might be wrongly assigned to this function
58
- # instead of the actual definition, fix that.
59
- if block
60
- @types[name] = block
61
- end
62
- end
63
- end
64
-
65
- def direction(dir)
66
- if dir.to_sym == :reverse
67
- @reverse = true
68
- end
69
- end
70
-
71
- def features(word)
72
- values = [word]
73
-
74
- @order.each{|features|
75
- action = @types[features]
76
- if action.is_a?(Proc)
77
- values.push(action.call(word))
78
- else
79
- m = action.match(word)
80
- if m
81
- if m[1]
82
- values.push(m[1])
83
- else
84
- values.push(m != nil)
85
- end
86
- else
87
- values.push(false)
88
- end
89
- end
90
- }
91
- values
92
- end
93
-
94
- def template(window=nil)
95
- window ||= @window || [1,-1]
96
- template = ""
97
-
98
- i = 1
99
- @order.each{|feat|
100
- template += "U#{ feat }: %x[0,#{ i }]\n"
101
-
102
- if @context.include?(feat)
103
- window.each{|p|
104
- template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
105
- }
106
- end
107
- i += 1
108
- }
109
-
110
- template += "B\n"
111
-
112
- template
113
- end
114
-
115
-
116
- def text_features(text, positive = nil)
117
- text = self.class.reverse(text) if @reverse
118
- initial = true
119
- self.class.tokens(text).collect{|token|
120
- features = features(token)
121
- if !positive.nil?
122
- features << (positive ? (initial ? 1 : 2) : 0)
123
- initial = false
124
- end
125
- features
126
- }
127
- end
128
-
129
- def tagged_features(text, mentions)
130
- mentions ||= []
131
- mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
132
- re = mentions.collect{|mention|
133
- Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
134
- }.join("|")
135
-
136
- positive = false
137
- features = []
138
- chunks = text.split(/(#{re})/)
139
- chunks.each{|t|
140
- chunk_features = text_features(t, positive)
141
- positive = !positive
142
- if @reverse
143
- features = chunk_features + features
144
- else
145
- features = features + chunk_features
146
- end
147
- }
148
- features
149
- end
150
-
151
- def train(features, model)
152
- tmp_template = TmpFile.tmp_file("template-")
153
- Open.write(tmp_template,template)
154
-
155
- cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
156
- system cmd
157
- Open.write(model + '.config',config)
158
- FileUtils.rm tmp_template
159
- end
160
-
161
- end
162
-
163
- class NER
164
-
165
- def initialize(model = nil)
166
- begin
167
- require 'CRFPP'
168
- rescue Exception
169
- require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
170
- end
171
-
172
- model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
173
-
174
- @parser = NERFeatures.new(model + '.config')
175
- @reverse = @parser.reverse
176
- @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
177
- end
178
-
179
- def extract(text)
180
- features = @parser.text_features(text)
181
-
182
- @tagger.clear
183
- features.each{|feats|
184
- @tagger.add(feats.join(" "))
185
- }
186
-
187
- @tagger.parse
188
-
189
- found = []
190
- mention = []
191
-
192
- @tagger.size.times{|i|
193
- label = @tagger.y(i)
194
- word = @tagger.x(i,0)
195
-
196
- if word == ')'
197
- mention.push(')') if mention.join =~ /\(/
198
- next
199
- end
200
-
201
- case label
202
- when 1
203
- if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
204
- found.push(mention)
205
- mention = []
206
- end
207
- mention.push(word)
208
- when 2
209
- mention.push(word)
210
- when 0
211
- found.push(mention) if mention.any?
212
- mention = []
213
- end
214
- }
215
-
216
- found << mention if mention.any?
217
-
218
- found.collect{|list|
219
- list = list.reverse if @reverse
220
- list.join(" ")
221
- }
222
- end
223
-
224
- end
225
-
226
-
227
-