rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,34 +0,0 @@
1
- require 'rbbt'
2
- require 'rjb'
3
-
4
- # Offers a Ruby interface to the Abner Named Entity Recognition Package
5
- # in Java. Abner[http://www.cs.wisc.edu/~bsettles/abner/].
6
- class Abner
7
-
8
- @@JFile = Rjb::import('java.io.File')
9
- @@Tagger = Rjb::import('abner.Tagger')
10
- @@Trainer = Rjb::import('abner.Trainer')
11
-
12
- # If modelfile is present a custom trained model can be used,
13
- # otherwise, the default BioCreative model is used.
14
- def initialize(modelfile=nil)
15
- if modelfile == nil
16
- @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
17
- else
18
- @tagger = @@Tagger.new(@@JFile.new(modelfile))
19
- end
20
- end
21
-
22
- # Given a chunk of text, it finds all the mentions appearing in it. It
23
- # returns all the mentions found, regardless of type, to be coherent
24
- # with the rest of NER packages in Rbbt.
25
- def extract(text)
26
-
27
- res = @tagger.getEntities(text)
28
- types = res[1]
29
- strings = res[0]
30
-
31
- return strings.collect{|s| s.to_s}
32
- end
33
-
34
- end
@@ -1,73 +0,0 @@
1
- require 'rbbt'
2
- require 'rjb'
3
-
4
- # Offers a Ruby interface to the Banner Named Entity Recognition Package
5
- # in Java. Banner[http://banner.sourceforge.net/].
6
- class Banner
7
-
8
-
9
- @@JFile = Rjb::import('java.io.File')
10
- @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
11
- @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
12
- @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
13
- @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
14
- @@Sentence = Rjb::import('banner.Sentence')
15
- @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
16
-
17
-
18
-
19
- # The parameters are set to default values, the only one that one
20
- # might want to change is the modelfile to point to a custom trained
21
- # one.
22
- def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
23
- lemmadir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
24
- taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger')
25
- )
26
-
27
- @tokenizer = @@SimpleTokenizer.new
28
-
29
- model = @@JFile.new(modelfile)
30
- lemma = @@EngLemmatiser.new(lemmadir,false,true)
31
- helper = @@HeppleTagger.new(taggerdir)
32
-
33
- # The next lines are needed to avoid collisions with
34
- # metaprogramming that could define load (activesupport in
35
- # particular :@ ). RJB seems to call Java on method missing.
36
- class << @@CRFTagger
37
- if method_defined? :load
38
- undef_method :load
39
- end
40
- end
41
-
42
- @tagger = @@CRFTagger.load( model, lemma, helper)
43
- @parenPP = @@ParenthesisPostProcessor.new()
44
- end
45
-
46
-
47
- # Returns an array with the mentions found in the provided piece of
48
- # text.
49
- def extract(text)
50
- text.gsub!(/\n/,' ')
51
- text.gsub!(/\|/,'/') # Character | gives an error
52
- sentence = @@Sentence.new(text)
53
- @tokenizer.tokenize(sentence)
54
- @tagger.tag(sentence)
55
- @parenPP.postProcess(sentence)
56
- tagged = sentence.getSGML
57
-
58
- res = tagged.scan(/<GENE>.*?<\/GENE>/).
59
- collect{|r|
60
- r.match(/<GENE>(.*?)<\/GENE>/)
61
- mention = $1
62
- mention.sub!(/^\s*/,'')
63
- mention.sub!(/\s*$/,'')
64
- mention
65
- }
66
- res
67
- end
68
-
69
-
70
- end
71
-
72
-
73
-
@@ -1,98 +0,0 @@
1
- # This class loads a dictionary of codes with associated names; it can then
2
- # find those names in a string of text. It works word-wise.
3
- class DictionaryNER
4
-
5
- A_INT = "a"[0]
6
- DOWNCASE_OFFSET = "A"[0].bytes.first - "a"[0].bytes.first
7
-
8
- require 'rbbt/bow/bow'
9
- # Divides a string of text into words. A hyphen separates words, only if the
10
- # second one begins with a letter.
11
- def self.chunk(text)
12
- text.split(/(?:[\s.,]|-(?=[a-zA-Z]))+/)
13
- end
14
-
15
- # Simplify the text to widen the matches. Currently only downcases the first letter of the keys
16
- def self.simplify(text)
17
- if text.length > 2 && text[0] < A_INT && text[1] > A_INT
18
- text = (text[0] - DOWNCASE_OFFSET).chr + text[1..-1]
19
- else
20
- return text
21
- end
22
- end
23
-
24
- # Given a dictionary structure, find the matches in the text.
25
- def self.match(dict, text) #:nodoc:
26
-
27
- if Array === text
28
- words = text
29
- else
30
- words = chunk(text)
31
- end
32
-
33
- result = {}
34
- words.each_with_index{|word, pos|
35
- key = simplify(word)
36
- next if dict[key].nil?
37
- dict[key].each{|entrie|
38
- case
39
- when String === entrie
40
- result[word] ||= []
41
- result[word] << entrie unless result[word].include? entrie
42
- when Hash === entrie
43
- rec_words = words[(pos + 1)..-1]
44
- rec_result = match(entrie, rec_words)
45
- rec_result.each{|rec_key, rec_list|
46
- composite_key = word + ' ' + rec_key
47
- result[composite_key] ||= []
48
- result[composite_key] += rec_list
49
- result[composite_key].uniq!
50
- }
51
- end
52
- }
53
- }
54
- result
55
- end
56
-
57
- # Add a name to a structure
58
- def self.add_name(dict, name, code)
59
- if Array === name
60
- words = name
61
- else
62
- words = chunk(name)
63
- end
64
-
65
- key = simplify(words.shift)
66
- if words.empty?
67
- dict[key] ||= []
68
- dict[key] << code unless dict[key].include? code
69
- else
70
- rec_dict = {}
71
- add_name(rec_dict, words , code)
72
- dict[key] ||= []
73
- dict[key] << rec_dict
74
- end
75
- end
76
-
77
- def self.load(dictionary)
78
- dict = {}
79
-
80
- dictionary = File.open(dictionary).read if File.exists? dictionary
81
-
82
- dictionary.each_line{|l|
83
- names = l.chomp.split(/\t/)
84
- code = names.shift
85
- names.each{|name| add_name(dict, name, code) }
86
- }
87
- dict
88
- end
89
-
90
- def initialize(dictionary)
91
- @dict = DictionaryNER.load(dictionary)
92
- end
93
-
94
- def match(text)
95
- DictionaryNER.match(@dict, text)
96
- end
97
-
98
- end
@@ -1,70 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/misc'
3
-
4
- class RegExpNER
5
-
6
- def self.match_re(text, res)
7
- res = [res] unless Array === res
8
-
9
- res.collect{|re|
10
- text.scan(re)
11
- }.flatten
12
- end
13
-
14
- def self.build_re_old(names, ignorecase=true)
15
- names.compact.select{|n| n != ""}.
16
- sort{|a,b| b.length <=> a.length}.
17
- collect{|n|
18
- re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
19
- }
20
- end
21
-
22
- def self.build_re(names, ignorecase=true)
23
- res = names.compact.select{|n| n != ""}.
24
- sort{|a,b| b.length <=> a.length}.
25
- collect{|n|
26
- Regexp.quote(n)
27
- }
28
-
29
- /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/
30
- end
31
-
32
-
33
- def initialize(lexicon, options = {})
34
- options = {:flatten => true, :ignorecase => true, :stopwords => nil}.merge options
35
-
36
- options[:stopwords] = $stopwords if $stopwords && (options[:stopwords].nil? || options[:stopwords] == true)
37
- options[:stopwords] ||= []
38
-
39
- data = Open.to_hash(lexicon, options)
40
-
41
- @index = {}
42
- data.collect{|code, names|
43
- next if code.nil? || code == ""
44
- if options[:stopwords].any?
45
- names = names.select{|n|
46
- ! options[:stopwords].include?(options[:ignorecase] ? n.downcase : n)
47
- }
48
- end
49
- @index[code] = RegExpNER.build_re(names, options[:ignorecase])
50
- }
51
- end
52
-
53
- def match_hash(text)
54
- return {} if text.nil? || text == ""
55
- matches = {}
56
- @index.each{|code, re|
57
- RegExpNER.match_re(text, re).each{|match|
58
- matches[code] ||= []
59
- matches[code] << match
60
- }
61
- }
62
- matches
63
- end
64
-
65
- def match(text)
66
- match_hash(text)
67
- end
68
-
69
- end
70
-
@@ -1,227 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/misc'
4
- require 'rbbt/util/simpleDSL'
5
-
6
- class NERFeatures < SimpleDSL
7
- def self.tokens(text)
8
- text.scan(/
9
- \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
10
- \w-\w*|
11
- \w+-[A-Z](?!\w)|
12
- \w+|
13
- [.,()\/\[\]{}'"+-]
14
- /x)
15
- end
16
-
17
- def self.reverse(text)
18
- tokens(text).reverse.join(" ")
19
- end
20
-
21
- def define(name, *args, &block)
22
- action = *args[0] || block || /#{name.to_s}s?/i
23
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
24
-
25
- @types[name.to_s] = action
26
- @order.push name.to_s
27
-
28
- name.to_s
29
- end
30
-
31
- attr_accessor :reverse
32
- def initialize(file = nil, reverse = false, &block)
33
- @types = {}
34
- @order = []
35
- @context = []
36
- @reverse = reverse
37
-
38
- file ||= File.join(Rbbt.datadir,'ner/config/default.rb') if !file && !block
39
-
40
- super(:define,file, &block)
41
- end
42
-
43
- def config
44
- @config[:define]
45
- end
46
-
47
- def window(positions)
48
- @window = positions
49
- end
50
-
51
- def context(name, &block)
52
- if name.is_a? Array
53
- @context += name
54
- else
55
- @context.push name
56
-
57
- # The block might be wrongly assigned to this function
58
- # instead of the actual definition, fix that.
59
- if block
60
- @types[name] = block
61
- end
62
- end
63
- end
64
-
65
- def direction(dir)
66
- if dir.to_sym == :reverse
67
- @reverse = true
68
- end
69
- end
70
-
71
- def features(word)
72
- values = [word]
73
-
74
- @order.each{|features|
75
- action = @types[features]
76
- if action.is_a?(Proc)
77
- values.push(action.call(word))
78
- else
79
- m = action.match(word)
80
- if m
81
- if m[1]
82
- values.push(m[1])
83
- else
84
- values.push(m != nil)
85
- end
86
- else
87
- values.push(false)
88
- end
89
- end
90
- }
91
- values
92
- end
93
-
94
- def template(window=nil)
95
- window ||= @window || [1,-1]
96
- template = ""
97
-
98
- i = 1
99
- @order.each{|feat|
100
- template += "U#{ feat }: %x[0,#{ i }]\n"
101
-
102
- if @context.include?(feat)
103
- window.each{|p|
104
- template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
105
- }
106
- end
107
- i += 1
108
- }
109
-
110
- template += "B\n"
111
-
112
- template
113
- end
114
-
115
-
116
- def text_features(text, positive = nil)
117
- text = self.class.reverse(text) if @reverse
118
- initial = true
119
- self.class.tokens(text).collect{|token|
120
- features = features(token)
121
- if !positive.nil?
122
- features << (positive ? (initial ? 1 : 2) : 0)
123
- initial = false
124
- end
125
- features
126
- }
127
- end
128
-
129
- def tagged_features(text, mentions)
130
- mentions ||= []
131
- mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
132
- re = mentions.collect{|mention|
133
- Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
134
- }.join("|")
135
-
136
- positive = false
137
- features = []
138
- chunks = text.split(/(#{re})/)
139
- chunks.each{|t|
140
- chunk_features = text_features(t, positive)
141
- positive = !positive
142
- if @reverse
143
- features = chunk_features + features
144
- else
145
- features = features + chunk_features
146
- end
147
- }
148
- features
149
- end
150
-
151
- def train(features, model)
152
- tmp_template = TmpFile.tmp_file("template-")
153
- Open.write(tmp_template,template)
154
-
155
- cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
156
- system cmd
157
- Open.write(model + '.config',config)
158
- FileUtils.rm tmp_template
159
- end
160
-
161
- end
162
-
163
- class NER
164
-
165
- def initialize(model = nil)
166
- begin
167
- require 'CRFPP'
168
- rescue Exception
169
- require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
170
- end
171
-
172
- model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
173
-
174
- @parser = NERFeatures.new(model + '.config')
175
- @reverse = @parser.reverse
176
- @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
177
- end
178
-
179
- def extract(text)
180
- features = @parser.text_features(text)
181
-
182
- @tagger.clear
183
- features.each{|feats|
184
- @tagger.add(feats.join(" "))
185
- }
186
-
187
- @tagger.parse
188
-
189
- found = []
190
- mention = []
191
-
192
- @tagger.size.times{|i|
193
- label = @tagger.y(i)
194
- word = @tagger.x(i,0)
195
-
196
- if word == ')'
197
- mention.push(')') if mention.join =~ /\(/
198
- next
199
- end
200
-
201
- case label
202
- when 1
203
- if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
204
- found.push(mention)
205
- mention = []
206
- end
207
- mention.push(word)
208
- when 2
209
- mention.push(word)
210
- when 0
211
- found.push(mention) if mention.any?
212
- mention = []
213
- end
214
- }
215
-
216
- found << mention if mention.any?
217
-
218
- found.collect{|list|
219
- list = list.reverse if @reverse
220
- list.join(" ")
221
- }
222
- end
223
-
224
- end
225
-
226
-
227
-