rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
data/LICENSE DELETED
@@ -1,20 +0,0 @@
1
- Copyright (c) 2009 Miguel Vazquez
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rbbt_config DELETED
@@ -1,245 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
4
-
5
- require 'rubygems'
6
- require 'rake'
7
-
8
-
9
- require 'simpleconsole'
10
-
11
- begin
12
- require 'rbbt'
13
- rescue Rbbt::NoConfig
14
- $noconfig = true
15
- end
16
-
17
- TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
18
-
19
- $USAGE =<<EOT
20
- #{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
21
-
22
- actions:
23
-
24
- * configure: Set paths for data, cache, and tmp directories
25
-
26
- * prepare:
27
-
28
- Basic subactions:
29
-
30
- * organisms: Install processing scripts to process organisms
31
- * ner: Install processing scripts for Named Entity Recognition
32
- * norm: Install processing scripts for Gene Mention Normalization
33
- * classifier: Install processing scripts for Classification
34
-
35
- * biocreative: Download and train and test data from BioCreative
36
- * entrez: Download and install data from Entrez
37
- * go: Download and install data from The Gene Ontology
38
- * wordlists: Install word lists
39
- * polysearch: Download and install Polysearch dictionaries
40
-
41
- * abner: Download and install Abner NER system: http://pages.cs.wisc.edu/~bsettles/abner/
42
- * banner: Download and install Banner NER system: http://sourceforge.net/projects/banner/
43
- * crf++: Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
44
-
45
- Subactions grouped by task:
46
-
47
- * identifiers: entrez, organisms
48
- * rner: entrez, organisms, biocreative, ner, crf++
49
- * java_ner: entrez, organisms, abner, banner
50
- * norm: entrez organisms, biocreative, crf++, norm, polysearch
51
- * bow: organisms, wordlists
52
- * classifier: organisms, wordlists, classifier, go
53
- * all: #{TASKS.join(", ")}
54
-
55
- * install:
56
- * organisms: Gather organisms data
57
- * ner: Build Named Entity Recognition Models. Mention Normalization needs no training.
58
- * classification: Build Function/Process Classifiers
59
-
60
- --update: Rebuild models or reprocess organism data even if present. You may want to purge the cache
61
- to be up to date with the data in the internet.
62
-
63
- --organism: Gather data only for that particular organism. The organism must be specified by the
64
- keyword. Use '#{__FILE__} organisms' to see find the keywords.
65
-
66
- * purge_cache: Clean the non-persistent cache, which holds general things
67
- downloaded using Open.read, like organism identifiers downloaded from
68
- BioMart. The persistent cache, which hold pubmed articles or entrez gene
69
- descriptions, is not cleaned, as these are not likely to change
70
-
71
- * organisms: Show a list of all organisms along with their identifier in the system
72
- EOT
73
-
74
- class Controller < SimpleConsole::Controller
75
-
76
- params :bool => {:u => :update},
77
- :string => {:o => :organism}
78
-
79
- def organisms
80
- end
81
-
82
-
83
- def default
84
- render :action => :usage
85
- end
86
-
87
- def help
88
- render :action => :usage
89
- end
90
-
91
- def install
92
- raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
93
-
94
- case params[:id]
95
- when "organisms"
96
- @location = File.join(Rbbt.datadir,'organisms')
97
- when "ner"
98
- @location = File.join(Rbbt.datadir,'ner')
99
- when "classifier"
100
- @location = File.join(Rbbt.datadir,'classifier')
101
- else
102
- redirect_to :action => :help, :id => :update
103
- end
104
-
105
- $force = true if params[:update]
106
- $org = params[:organism] if params[:organism]
107
-
108
- end
109
-
110
- def prepare
111
- raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
112
- case params[:id]
113
- when "identifiers"
114
- require 'rbbt/sources/organism'
115
- require 'rbbt/sources/entrez'
116
- @tasks = %w(entrez organisms)
117
- when "rner"
118
- require 'rbbt/ner/rner'
119
- require 'rbbt/sources/entrez'
120
- @tasks = %w(entrez organisms biocreative ner crf++)
121
- when "java_ner"
122
- require 'rjb'
123
- @tasks = %w(entrez organisms abner banner)
124
- when "norm"
125
- require 'rbbt/ner/rner'
126
- require 'rbbt/ner/rnorm'
127
- require 'rbbt/ner/regexpNER'
128
- require 'rbbt/sources/entrez'
129
- @tasks = %w(entrez organisms biocreative crf++ norm polysearch)
130
- when "bow"
131
- require 'rbbt/bow/bow'
132
- require 'rbbt/bow/dictionary'
133
- @tasks = %w(organisms wordlists)
134
- when "classifier"
135
- require 'rbbt/bow/bow'
136
- require 'rbbt/bow/dictionary'
137
- require 'rbbt/bow/classifier'
138
- @tasks = %w(organisms wordlists classifier go)
139
- when "all"
140
- @tasks = TASKS
141
- when nil
142
- redirect_to :action => :help, :id => :install
143
- else
144
- redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
145
- @tasks = [params[:id]]
146
- end
147
-
148
- $force = true if params[:update]
149
- $org = params[:organism] if params[:organism]
150
-
151
- end
152
-
153
- def configure
154
- end
155
-
156
- def purge_cache
157
- end
158
-
159
- end
160
-
161
- class View < SimpleConsole::View
162
- def usage
163
- puts $USAGE
164
- end
165
-
166
- def organisms
167
- require 'rbbt/sources/organism'
168
- all = Organism.all(false)
169
- installed = Organism.all
170
-
171
- all.each{|org|
172
- puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
173
- }
174
- end
175
-
176
-
177
- def prepare
178
- load File.join(Rbbt.rootdir, 'tasks/install.rake')
179
-
180
- @tasks.each{|t|
181
- puts "Invoking #{ t }"
182
- Rake::Task[t].invoke
183
- }
184
- end
185
-
186
- def install
187
-
188
- puts "Changing directory to #{@location}"
189
- chdir @location
190
-
191
- load "./Rakefile"
192
-
193
- Rake::Task['default'].invoke
194
- end
195
-
196
-
197
- def configure
198
-
199
- defaultdir = File.join(ENV['HOME'],'rbbt')
200
-
201
- cachedir = File.join(defaultdir, 'cache')
202
- tmpdir = File.join(defaultdir, 'tmp')
203
- datadir = File.join(defaultdir, 'data')
204
-
205
- puts "Please indicate where you wish to place the data directories"
206
- puts
207
-
208
- puts
209
- puts "* Cache Directory: This directory will hold downloads, from PubMed,
210
- Entrez and other, for local store. It might grow considerably."
211
- print "[#{ cachedir }]? "
212
- input = STDIN.gets
213
- cachedir = input if input =~ /\w/
214
-
215
- puts
216
- puts "* Tmp Directory: Temporary files."
217
- print "[#{ tmpdir }]? "
218
- input = STDIN.gets
219
- tmpdir = input if input =~ /\w/
220
-
221
- puts
222
- puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
223
- print "[#{ datadir }]? "
224
- input = STDIN.gets
225
- datadir = input if input =~ /\w/
226
-
227
-
228
-
229
- fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
230
- fout.puts "cachedir: #{cachedir}"
231
- fout.puts "tmpdir: #{tmpdir}"
232
- fout.puts "datadir: #{datadir}"
233
- fout.close
234
-
235
- end
236
-
237
- def purge_cache
238
- FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
239
- end
240
-
241
- end
242
-
243
- SimpleConsole::Application.run(ARGV, Controller, View)
244
-
245
-
data/install_scripts/classifier/R/classify.R DELETED
@@ -1,36 +0,0 @@
1
- library('e1071')
2
-
3
- BOW.norm <- function(x, weights = NULL){
4
- x = 1 + log(x);
5
- x[x==-Inf] = 0;
6
- x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
7
- x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
8
- x.norm = x * x.sum;
9
- rm(x.sum);
10
- x.norm[is.na(x.norm)] = 0
11
-
12
- if (!is.null(weights)){
13
- x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
14
- }
15
-
16
- x.norm;
17
- }
18
-
19
-
20
- BOW.classification.model <- function(features, modelfile, dictfile = NULL){
21
- feats = read.table(features, sep="\t", header=T, row.names=1);
22
-
23
- if (!is.null(dictfile)){
24
- svm.weights = read.table(file=dictfile, sep="\t")[2];
25
- }else {
26
- svm.weights = NULL;
27
- }
28
- feats[-1] = BOW.norm(feats[-1], svm.weights);
29
- svm.model = svm(Class ~ ., data=feats, svm.weights);
30
- save(svm.model,svm.weights, file=modelfile);
31
- }
32
-
33
- BOW.classification.classify <- function(modelfile, x, weights = NULL){
34
- x = BOW.norm(x, weights);
35
- predict(modelfile, x);
36
- }
data/install_scripts/classifier/Rakefile DELETED
@@ -1,140 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/sources/pubmed'
4
- require 'rbbt/bow/bow'
5
- require 'rbbt/bow/dictionary'
6
- require 'rbbt/bow/classifier'
7
- require 'rbbt/util/misc'
8
-
9
- require 'progress-monitor'
10
- require 'rand'
11
-
12
- $hi ||= ENV['hi'] || 0.8
13
- $low ||= ENV['low'] || 0.01
14
- $max ||= ENV['max'] || 3000
15
- $bigrams ||= ENV['bigrams'] == 'true'
16
-
17
- $ndocs ||= ENV['ndocs'] || 5000
18
-
19
- desc "Bilds Dictionary and Features for an organism"
20
- rule(/data\/(.*)/) do |t|
21
- org = File.basename(t.name)
22
-
23
- go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
24
- all = Organism.literature(org).flatten.uniq - go
25
-
26
- ndocs = [go.length, all.length, $ndocs.to_i].min
27
- puts "Using #{ ndocs } from each class\n\n"
28
-
29
- go = go.shuffle[0..ndocs - 1]
30
- all = all.shuffle[0..ndocs - 1]
31
-
32
- dict = Dictionary::KL.new
33
-
34
-
35
-
36
- chunks = all.chunk(50)
37
- Progress.monitor("Building Dictionary for #{ org }: -")
38
- chunks.each{|chunk|
39
- PubMed.get_article(chunk).each{|pmid, article|
40
- words = BagOfWords.terms(article.text,$bigrams)
41
- dict.add(words, :-)
42
- }
43
- }
44
-
45
- chunks = go.chunk(50)
46
- Progress.monitor("Building Dictionary for #{ org }: +")
47
- chunks.each{|chunk|
48
- PubMed.get_article(chunk).each{|pmid, article|
49
- words = BagOfWords.terms(article.text,$bigrams)
50
- dict.add(words, :+)
51
- }
52
- }
53
-
54
- term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
55
- Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
56
-
57
- terms = term_weigths.keys.sort
58
-
59
- fout = File.open(t.name, 'w')
60
- fout.puts((['Name','Class'] + terms).join("\t"))
61
-
62
- Progress.monitor("Building Features for #{ org }")
63
- all.each{|pmid|
64
- text = PubMed.get_article(pmid).text
65
- fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
66
- }
67
- go.each{|pmid|
68
- text = PubMed.get_article(pmid).text
69
- fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
70
- }
71
-
72
-
73
- fout.close
74
- end
75
-
76
- rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
77
- features = t.name.sub(/model/,'data')
78
- Classifier.create_model(features, t.name, features + '.dict')
79
- end
80
-
81
- rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
82
- model = t.name.sub(/results/,'model')
83
- features = t.name.sub(/results/,'data')
84
- org = File.basename(t.name)
85
-
86
- ndocs = 1000
87
-
88
- used = Open.read(features).read.split(/\n/).collect{|l| l.chomp.split(/\t/).first}[1..-1]
89
-
90
- classifier = Classifier.new(model)
91
- go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
92
- all = Organism.literature(org).flatten.uniq - go - used
93
-
94
- go = go.shuffle[0..ndocs - 1]
95
- all = all.shuffle[0..ndocs - 1]
96
-
97
- ndocs = go.length + all.length
98
-
99
- raise "Not enogh unused articles to evaluate" if go.empty? || all.empty?
100
-
101
- features_go = PubMed.get_article(go).collect{|pmid, article|
102
- article.text
103
- }
104
- pos = classifier.classify(features_go).select{|v| v == '+'}.length
105
-
106
- features_all = PubMed.get_article(all).collect{|pmid, article|
107
- article.text
108
- }
109
- neg = classifier.classify(features_all).select{|v| v == '-'}.length
110
-
111
- puts "#{ pos } #{ neg }"
112
-
113
- precision = (pos + neg) / (ndocs).to_f
114
- recall = pos / go.length.to_f
115
- f1 = ( 2 * precision * recall) / (precision + recall ).to_f
116
-
117
- puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
118
- end
119
-
120
- task 'clean' do
121
- FileUtils.rm Dir.glob("data/*")
122
- FileUtils.rm Dir.glob("model/*")
123
- FileUtils.rm Dir.glob("results/*")
124
-
125
- end
126
- task 'all' do
127
- Organism.all.each{|org|
128
- Rake::Task["model/#{ org }"].invoke
129
- }
130
- end
131
- task 'update' do
132
- if $org
133
- FileUtils.rm Dir.glob("**/#{$org}.*") if $force
134
- Rake::Task["model/#{$org}"].invoke
135
- else
136
- Rake::Task['clean'].invoke if $force
137
- Rake::Task['all'].invoke
138
- end
139
- end
140
-
data/install_scripts/get_abner.sh DELETED
@@ -1,2 +0,0 @@
1
- #!/bin/bash
2
- wget http://pages.cs.wisc.edu/~bsettles/abner/abner.jar
data/install_scripts/get_banner.sh DELETED
@@ -1,25 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
4
- wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
5
- mv BANNER_v02.zip BANNER.zip
6
- mv gene_model_v02.bin gene_model.bin
7
- unzip BANNER.zip
8
- cd BANNER
9
- libs=`find libs/ -name "*.jar"`
10
- mkdir classes
11
- javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
12
- cd classes
13
- for f in ../libs/*.jar; do jar xf "$f";done
14
- jar cf banner.jar *
15
- mv banner.jar ../..
16
- cd ..
17
- cp -R nlpdata/ ../
18
- cd ..
19
- rm BANNER.zip
20
- rm -Rf BANNER
21
-
22
-
23
-
24
-
25
-