rbbt 1.0.0

Files changed (59)
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
data/install_scripts/stopwords
@@ -0,0 +1 @@
+ a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where
data/lib/rbbt/bow/bow.rb
@@ -0,0 +1,87 @@
+ require 'rbbt'
+ require 'stemmer'
+ require 'rbbt/util/misc'
+
+ # This module provides methods to extract a bag-of-words (or bag-of-bigrams)
+ # representation for strings of text, and to produce a vector representation
+ # of that bag of words for a given list of terms. The BOW representation of
+ # a text is usually first used to build a Dictionary, and then, with the
+ # best selection of terms as determined by the Dictionary::TF_IDF.best or
+ # Dictionary::KL.best methods, to determine the vector representation for
+ # that text.
+ module BagOfWords
+
+   # Divide the input string into an array of words (sequences of \w
+   # characters). Words are stemmed and filtered to remove stopwords and
+   # words shorter than three characters. The list of stopwords is a global
+   # variable defined in 'rbbt/util/misc'.
+   def self.words(text)
+     return [] if text.nil?
+     text.scan(/\w+/).
+       collect{|word| word.downcase.stem}.
+       select{|word|
+         ! $stopwords.include?(word) &&
+         word.length > 2 &&
+         word =~ /[a-z]/
+       }
+   end
+
+   # Take the array of words for the text and form all the bigrams
+   def self.bigrams(text)
+     words = words(text)
+     bigrams = []
+     lastword = nil
+
+     words.each{|word|
+       if lastword
+         bigrams << "#{lastword} #{word}"
+       end
+       lastword = word
+     }
+
+     words + bigrams
+   end
+
+   # Given an array of terms, return a hash with the number of appearances
+   # of each term
+   def self.count(terms)
+     count = Hash.new(0)
+     terms.each{|word| count[word] += 1}
+     count
+   end
+
+   # Given a string of text, find all the words (or bigrams) and return a
+   # hash with their counts
+   def self.terms(text, bigrams = true)
+     if bigrams
+       count(bigrams(text))
+     else
+       count(words(text))
+     end
+   end
+
+   # Given a string of text and a list of terms, which may or may not contain
+   # bigrams, return an array with one entry per term, holding the number of
+   # occurrences of that term in the text.
+   def self.features(text, terms, bigrams = nil)
+     bigrams ||= terms.select{|term| term =~ / /}.any?
+     count = bigrams ? count(bigrams(text)) : count(words(text))
+     count.values_at(*terms)
+   end
+ end
+
+ class String
+   # Shortcut for BagOfWords.words(self)
+   def words
+     BagOfWords.words(self)
+   end
+
+   # Shortcut for BagOfWords.bigrams(self)
+   def bigrams
+     BagOfWords.bigrams(self)
+   end
+ end
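A minimal usage sketch for the BagOfWords module above, assuming the stemmer gem is installed and $stopwords has been defined by 'rbbt/util/misc'; the sample text is made up:

  require 'rbbt/bow/bow'

  text = "Protein kinases phosphorylate other proteins in signaling pathways"

  text.words                    # stemmed, stopword-filtered words
  BagOfWords.terms(text)        # counts of words plus bigrams
  BagOfWords.terms(text, false) # counts of single words only

  # Counts over a fixed term list, e.g. one selected by Dictionary::TF_IDF.best;
  # the terms must be stemmed forms, since counts are keyed by stems
  BagOfWords.features(text, text.words.uniq)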
data/lib/rbbt/bow/classifier.rb
@@ -0,0 +1,118 @@
+ require 'rbbt/bow/bow'
+ require 'rsruby'
+
+ # This class uses R to build and use classification models. It needs the
+ # 'e1071' R package.
+ class Classifier
+
+   # Given the path to a features file, which lists a number of instances
+   # along with their classes and features in tab-separated format, uses R
+   # to build an svm model, which is saved to the file given as modelfile.
+   def self.create_model(featuresfile, modelfile, dictfile = nil)
+     r = RSRuby.instance
+     r.source(File.join(Rbbt.datadir, 'classifier/R/classify.R'))
+     r.BOW_classification_model(featuresfile, modelfile)
+
+     nil
+   end
+
+   attr_reader :terms
+
+   # Starts an R interpreter and loads the svm model from modelfile.
+   def initialize(modelfile)
+     @r = RSRuby.instance
+     @r.library('e1071')
+     @r.source(File.join(Rbbt.datadir, 'classifier/R/classify.R'))
+
+     @r.load(modelfile)
+
+     @model = @r.svm_model
+     @terms = @r.eval_R("terms = unlist(attr(attr(svm.model$terms,'factors'),'dimnames')[2])")
+   end
+
+   def classify_feature_array(input) #:nodoc:
+     @r.assign('input', input)
+
+     @r.eval_R('input = t(as.data.frame(input))')
+     @r.eval_R('rownames(input) <- NULL')
+     @r.eval_R('colnames(input) <- terms')
+
+     results = @r.eval_R('BOW.classification.classify(svm.model, input, svm.weights)')
+     results.sort.collect{|p| p[1]}
+   end
+
+   def classify_feature_hash(input) #:nodoc:
+     names = []
+     features = []
+     input.each{|name, feats|
+       names << name.to_s
+       features << feats
+     }
+
+     @r.assign('input', features)
+     @r.assign('input.names', names)
+
+     @r.eval_R('input = t(as.data.frame(input))')
+     @r.eval_R('rownames(input) <- input.names')
+     @r.eval_R('colnames(input) <- terms')
+
+     @r.eval_R('BOW.classification.classify(svm.model, input, svm.weights)')
+   end
+
+   def classify_text_array(input) #:nodoc:
+     features = input.collect{|text|
+       BagOfWords.features(text, @terms)
+     }
+
+     classify_feature_array(features)
+   end
+
+   def classify_text_hash(input) #:nodoc:
+     features = {}
+     input.each{|key,text|
+       features[key] = BagOfWords.features(text, @terms)
+     }
+
+     classify_feature_hash(features)
+   end
+
+   # This is a polymorphic method. The input variable may be a single input,
+   # in which case the result will be just the class; a hash of inputs, in
+   # which case the result will be a hash with the results for each input; or
+   # an array, in which case the result is an array of the results in the
+   # same order. Each input may also be in the form of a string, in which
+   # case it will be transformed into a feature vector, or an array, in which
+   # case it will be considered a feature vector itself.
+   def classify(input)
+     if input.is_a? String
+       return classify_text_array([input]).first
+     end
+
+     if input.is_a? Hash
+       return {} if input.empty?
+       if input.values.first.is_a? String
+         return classify_text_hash(input)
+       elsif input.values.first.is_a? Array
+         return classify_feature_hash(input)
+       end
+     end
+
+     if input.is_a? Array
+       return [] if input.empty?
+       if input.first.is_a? String
+         return classify_text_array(input)
+       elsif input.first.is_a? Array
+         return classify_feature_array(input)
+       end
+     end
+   end
+
+ end
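A sketch of the round trip through Classifier above, assuming R with the e1071 package and rsruby are installed; the file paths are hypothetical:

  require 'rbbt/bow/classifier'

  # Train: tab-separated features file -> svm model file
  Classifier.create_model('features.tsv', 'svm.model')

  # Classify raw text, hashes of texts, or precomputed feature vectors
  classifier = Classifier.new('svm.model')
  classifier.classify("some text to label")               # single class
  classifier.classify(:a => "text one", :b => "text two") # hash of classes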
data/lib/rbbt/bow/dictionary.rb
@@ -0,0 +1,218 @@
+ class Dictionary
+   attr_reader :terms
+   def initialize
+     @terms = Hash.new(0)
+   end
+
+   def add(terms, &block)
+     terms.each{|term, count|
+       @terms[term] += count
+     }
+   end
+ end
+
+ class Dictionary::TF_IDF
+   attr_reader :terms, :docs, :total_terms, :num_docs
+
+   def initialize(options = {})
+     @term_limit = {
+       :limit => 500_000,
+     }.merge(options)[:limit]
+
+     @terms = Hash.new(0)
+     @docs = Hash.new(0)
+     @num_docs = 0
+     @total_terms = 0
+   end
+
+   def add(terms)
+     if @term_limit && @terms.length > @term_limit
+       terms = terms.delete_if{|term, count| !@terms.include? term }
+     end
+
+     terms.each{|term, count|
+       @terms[term] += count
+       @total_terms += count
+       @docs[term] += 1
+     }
+     @num_docs += 1
+   end
+
+   def df
+     df = Hash.new(0)
+     @docs.each{|term, count|
+       df[term] = count.to_f / @num_docs
+     }
+     df
+   end
+
+   def tf
+     tf = Hash.new(0)
+     @terms.each{|term, count|
+       tf[term] = count.to_f / @total_terms
+     }
+     tf
+   end
+
+   def idf
+     idf = Hash.new(0)
+     num_docs = @num_docs.to_f
+     @docs.each{|term, count|
+       idf[term] = Math::log(num_docs / count)
+     }
+     idf
+   end
+
+   def tf_idf
+     tf_idf = Hash.new(0)
+     num_docs = @num_docs.to_f
+     @docs.each{|term, count|
+       tf_idf[term] = @terms[term].to_f / @total_terms * Math::log(num_docs / count)
+     }
+     tf_idf
+   end
+
+   def best(options = {})
+     hi, low, limit = {
+       :low => 0,
+       :hi => 1,
+     }.merge(options).
+       values_at(:hi, :low, :limit)
+
+     num_docs = @num_docs.to_f
+     best = df.select{|term, value|
+       value >= low && value <= hi
+     }.collect{|p|
+       term = p.first
+       df_value = p.last
+       [term,
+        @terms[term].to_f / num_docs * Math::log(1.0/df_value)
+       ]
+     }
+     if limit
+       Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+     else
+       Hash[*best.flatten]
+     end
+   end
+
+   def weights(options = {})
+     best_terms = best(options).keys
+     weights = {}
+
+     num_docs = @num_docs.to_f
+     best_terms.each{|term|
+       weights[term] = Math::log(num_docs / @docs[term])
+     }
+     weights
+   end
+ end
+
+ class Dictionary::KL
+   attr_reader :pos_dict, :neg_dict
+
+   def initialize(options = {})
+     @pos_dict = Dictionary::TF_IDF.new(options)
+     @neg_dict = Dictionary::TF_IDF.new(options)
+   end
+
+   def terms
+     (pos_dict.terms.keys + neg_dict.terms.keys).uniq
+   end
+
+   def add(terms, c)
+     dict = (c == :+ || c == '+' ? @pos_dict : @neg_dict)
+     dict.add(terms)
+   end
+
+   def kl
+     kl = {}
+     pos_df = @pos_dict.df
+     neg_df = @neg_dict.df
+
+     terms.each{|term|
+       pos = pos_df[term]
+       neg = neg_df[term]
+
+       pos = 0.000001 if pos == 0
+       pos = 0.999999 if pos == 1
+       neg = 0.000001 if neg == 0
+       neg = 0.999999 if neg == 1
+
+       kl[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
+     }
+     kl
+   end
+
+   def best(options = {})
+     hi, low, limit = {
+       :low => 0,
+       :hi => 1,
+     }.merge(options).
+       values_at(:hi, :low, :limit)
+
+     pos_df = @pos_dict.df
+     neg_df = @neg_dict.df
+
+     best = {}
+     terms.select{|term|
+       pos_df[term] >= low && pos_df[term] <= hi ||
+       neg_df[term] >= low && neg_df[term] <= hi
+     }.each{|term|
+       pos = pos_df[term]
+       neg = neg_df[term]
+
+       pos = 0.000001 if pos == 0
+       pos = 0.999999 if pos == 1
+       neg = 0.000001 if neg == 0
+       neg = 0.999999 if neg == 1
+
+       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
+     }
+     if limit
+       Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+     else
+       Hash[*best.flatten]
+     end
+   end
+
+   def weights(options = {})
+     best(options)
+   end
+ end
+
+ if __FILE__ == $0
+
+   require 'benchmark'
+   require 'rbbt/sources/pubmed'
+   require 'rbbt/bow/bow'
+   require 'progress-meter'
+
+   max = 10000
+
+   pmids = PubMed.query("Homo Sapiens", max)
+   Progress.monitor "Get pmids"
+   docs = PubMed.get_article(pmids).values.collect{|article| BagOfWords.terms(article.text)}
+
+   dict = Dictionary::TF_IDF.new()
+
+   puts "Starting Benchmark"
+   puts Benchmark.measure{
+     docs.each{|doc|
+       dict.add doc
+     }
+   }
+   puts Benchmark.measure{
+     dict.weights
+   }
+
+   puts dict.terms.length
+ end
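A sketch of term selection with the dictionaries above; the documents are made up:

  require 'rbbt/bow/dictionary'
  require 'rbbt/bow/bow'

  dict = Dictionary::TF_IDF.new
  dict.add BagOfWords.terms("kinase phosphorylates its substrate")
  dict.add BagOfWords.terms("expression of the gene is induced")

  # Terms with document frequency in [low, hi], scored by TF-IDF
  dict.best(:low => 0.1, :hi => 0.9, :limit => 100)

  # Supervised selection: symmetric KL divergence between the :+ and :- classes
  kl = Dictionary::KL.new
  kl.add BagOfWords.terms("a relevant abstract"), :+
  kl.add BagOfWords.terms("an unrelated abstract"), :-
  kl.best(:limit => 100)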
data/lib/rbbt/ner/abner.rb
@@ -0,0 +1,34 @@
+ require 'rbbt'
+ require 'rjb'
+
+ # Offers a Ruby interface to the Abner Named Entity Recognition package
+ # in Java. Abner[http://www.cs.wisc.edu/~bsettles/abner/].
+ class Abner
+
+   @@JFile = Rjb::import('java.io.File')
+   @@Tagger = Rjb::import('abner.Tagger')
+   @@Trainer = Rjb::import('abner.Trainer')
+
+   # If modelfile is present, a custom trained model is used;
+   # otherwise, the default BioCreative model is used.
+   def initialize(modelfile=nil)
+     if modelfile == nil
+       @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
+     else
+       @tagger = @@Tagger.new(@@JFile.new(modelfile))
+     end
+   end
+
+   # Given a chunk of text, finds all the mentions appearing in it. It
+   # returns all the mentions found, regardless of type, to be consistent
+   # with the rest of the NER packages in Rbbt.
+   def extract(text)
+     res = @tagger.getEntities(text)
+     types = res[1]
+     strings = res[0]
+
+     return strings.collect{|s| s.to_s}
+   end
+
+ end
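A sketch of the Abner interface above, assuming a JVM and the ABNER jar are reachable through Rjb; the sentence is made up:

  require 'rbbt/ner/abner'

  ner = Abner.new  # default BioCreative model
  ner.extract("p53 binds to the MDM2 promoter")  # => array of mention strings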
data/lib/rbbt/ner/banner.rb
@@ -0,0 +1,73 @@
+ require 'rbbt'
+ require 'rjb'
+
+ # Offers a Ruby interface to the Banner Named Entity Recognition package
+ # in Java. Banner[http://banner.sourceforge.net/].
+ class Banner
+
+   @@JFile = Rjb::import('java.io.File')
+   @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
+   @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
+   @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
+   @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
+   @@Sentence = Rjb::import('banner.Sentence')
+   @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+
+   # The parameters are set to default values; the only one that one might
+   # want to change is modelfile, to point to a custom trained model.
+   def initialize(modelfile = File.join(Rbbt.datadir, 'third_party/banner/gene_model.bin'),
+                  lemmadir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/lemmatiser'),
+                  taggerdir = File.join(Rbbt.datadir, 'third_party/banner/nlpdata/tagger')
+                 )
+
+     @tokenizer = @@SimpleTokenizer.new
+
+     model = @@JFile.new(modelfile)
+     lemma = @@EngLemmatiser.new(lemmadir,false,true)
+     helper = @@HeppleTagger.new(taggerdir)
+
+     # The next lines are needed to avoid collisions with metaprogramming
+     # that could define load (activesupport in particular). RJB seems to
+     # call Java on method_missing.
+     class << @@CRFTagger
+       if method_defined? :load
+         undef_method :load
+       end
+     end
+
+     @tagger = @@CRFTagger.load( model, lemma, helper)
+     @parenPP = @@ParenthesisPostProcessor.new()
+   end
+
+   # Returns an array with the mentions found in the provided piece of
+   # text.
+   def extract(text)
+     text.gsub!(/\n/,' ')
+     text.gsub!(/\|/,'/') # The | character gives an error
+     sentence = @@Sentence.new(text)
+     @tokenizer.tokenize(sentence)
+     @tagger.tag(sentence)
+     @parenPP.postProcess(sentence)
+     tagged = sentence.getSGML
+
+     res = tagged.scan(/<GENE>.*?<\/GENE>/).
+       collect{|r|
+         r.match(/<GENE>(.*?)<\/GENE>/)
+         mention = $1
+         mention.sub!(/^\s*/,'')
+         mention.sub!(/\s*$/,'')
+         mention
+       }
+     res
+   end
+
+ end
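The Banner class above exposes the same extract interface; a sketch, assuming the BANNER jars and data files are installed under Rbbt.datadir:

  require 'rbbt/ner/banner'

  ner = Banner.new  # default gene model
  ner.extract("BRCA1 interacts with RAD51 during repair")  # => gene mentions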
data/lib/rbbt/ner/regexpNER.rb
@@ -0,0 +1,62 @@
+ require 'rbbt/util/open'
+ require 'rbbt/util/misc'
+
+ class RegExpNER
+
+   def self.match_re(text, res)
+     res = [res] unless Array === res
+
+     res.collect{|re|
+       if text.match(re)
+         $1
+       else
+         nil
+       end
+     }.compact
+   end
+
+   def self.build_re(names, ignorecase=true)
+     names.compact.select{|n| n != ""}.
+       sort{|a,b| b.length <=> a.length}.
+       collect{|n|
+         re = Regexp.quote(n).gsub(/\\?\s/,'\s+')
+         /(?:^|[^\w])(#{ re })(?:$|[^\w])/i
+       }
+   end
+
+   def initialize(lexicon, options = {})
+     options[:flatten] = true
+     options[:ignorecase] = true if options[:ignorecase].nil?
+     options[:stopwords] = true if options[:stopwords].nil?
+
+     data = Open.to_hash(lexicon, options)
+
+     @index = {}
+     data.collect{|code, names|
+       next if code.nil? || code == ""
+       if options[:stopwords]
+         names = names.select{|n|
+           ! $stopwords.include?(options[:ignorecase] ? n.downcase : n)
+         }
+       end
+       @index[code] = RegExpNER.build_re(names, options[:ignorecase])
+     }
+   end
+
+   def match_hash(text)
+     matches = {}
+     @index.each{|code, re|
+       RegExpNER.match_re(text, re).each{|match|
+         matches[code] ||= []
+         matches[code] << match
+       }
+     }
+     matches
+   end
+
+   def match(text)
+     match_hash(text).values.flatten
+   end
+
+ end
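A sketch of RegExpNER above; the lexicon path is hypothetical and is assumed to be in the tab-separated code-to-names format that Open.to_hash parses:

  require 'rbbt/ner/regexpNER'

  ner = RegExpNER.new('lexicon.tsv')
  ner.match_hash("some text")  # => {code => [matched names]}
  ner.match("some text")       # => flat array of matched names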