rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
data/LICENSE DELETED
@@ -1,20 +0,0 @@
1
- Copyright (c) 2009 Miguel Vazquez
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,245 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
4
-
5
- require 'rubygems'
6
- require 'rake'
7
-
8
-
9
- require 'simpleconsole'
10
-
11
- begin
12
- require 'rbbt'
13
- rescue Rbbt::NoConfig
14
- $noconfig = true
15
- end
16
-
17
- TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
18
-
19
- $USAGE =<<EOT
20
- #{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
21
-
22
- actions:
23
-
24
- * configure: Set paths for data, cache, and tmp directories
25
-
26
- * prepare:
27
-
28
- Basic subactions:
29
-
30
- * organisms: Install processing scripts to process organisms
31
- * ner: Install processing scripts for Named Entity Recognition
32
- * norm: Install processing scripts for Gene Mention Normalization
33
- * classifier: Install processing scripts for Classification
34
-
35
- * biocreative: Download and train and test data from BioCreative
36
- * entrez: Download and install data from Entrez
37
- * go: Download and install data from The Gene Ontology
38
- * wordlists: Install word lists
39
- * polysearch: Download and install Polysearch dictionaries
40
-
41
- * abner: Download and install Abner NER system: http://pages.cs.wisc.edu/~bsettles/abner/
42
- * banner: Download and install Banner NER system: http://sourceforge.net/projects/banner/
43
- * crf++: Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
44
-
45
- Subactions grouped by task:
46
-
47
- * identifiers: entrez, organisms
48
- * rner: entrez, organisms, biocreative, ner, crf++
49
- * java_ner: entrez, organisms, abner, banner
50
- * norm: entrez organisms, biocreative, crf++, norm, polysearch
51
- * bow: organisms, wordlists
52
- * classifier: organisms, wordlists, classifier, go
53
- * all: #{TASKS.join(", ")}
54
-
55
- * install:
56
- * organisms: Gather organisms data
57
- * ner: Build Named Entity Recognition Models. Mention Normalization needs no training.
58
- * classification: Build Function/Process Classifiers
59
-
60
- --update: Rebuild models or reprocess organism data even if present. You may want to purge the cache
61
- to be up to date with the data in the internet.
62
-
63
- --organism: Gather data only for that particular organism. The organism must be specified by the
64
- keyword. Use '#{__FILE__} organisms' to see find the keywords.
65
-
66
- * purge_cache: Clean the non-persistent cache, which holds general things
67
- downloaded using Open.read, like organism identifiers downloaded from
68
- BioMart. The persistent cache, which hold pubmed articles or entrez gene
69
- descriptions, is not cleaned, as these are not likely to change
70
-
71
- * organisms: Show a list of all organisms along with their identifier in the system
72
- EOT
73
-
74
- class Controller < SimpleConsole::Controller
75
-
76
- params :bool => {:u => :update},
77
- :string => {:o => :organism}
78
-
79
- def organisms
80
- end
81
-
82
-
83
- def default
84
- render :action => :usage
85
- end
86
-
87
- def help
88
- render :action => :usage
89
- end
90
-
91
- def install
92
- raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
93
-
94
- case params[:id]
95
- when "organisms"
96
- @location = File.join(Rbbt.datadir,'organisms')
97
- when "ner"
98
- @location = File.join(Rbbt.datadir,'ner')
99
- when "classifier"
100
- @location = File.join(Rbbt.datadir,'classifier')
101
- else
102
- redirect_to :action => :help, :id => :update
103
- end
104
-
105
- $force = true if params[:update]
106
- $org = params[:organism] if params[:organism]
107
-
108
- end
109
-
110
- def prepare
111
- raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
112
- case params[:id]
113
- when "identifiers"
114
- require 'rbbt/sources/organism'
115
- require 'rbbt/sources/entrez'
116
- @tasks = %w(entrez organisms)
117
- when "rner"
118
- require 'rbbt/ner/rner'
119
- require 'rbbt/sources/entrez'
120
- @tasks = %w(entrez organisms biocreative ner crf++)
121
- when "java_ner"
122
- require 'rjb'
123
- @tasks = %w(entrez organisms abner banner)
124
- when "norm"
125
- require 'rbbt/ner/rner'
126
- require 'rbbt/ner/rnorm'
127
- require 'rbbt/ner/regexpNER'
128
- require 'rbbt/sources/entrez'
129
- @tasks = %w(entrez organisms biocreative crf++ norm polysearch)
130
- when "bow"
131
- require 'rbbt/bow/bow'
132
- require 'rbbt/bow/dictionary'
133
- @tasks = %w(organisms wordlists)
134
- when "classifier"
135
- require 'rbbt/bow/bow'
136
- require 'rbbt/bow/dictionary'
137
- require 'rbbt/bow/classifier'
138
- @tasks = %w(organisms wordlists classifier go)
139
- when "all"
140
- @tasks = TASKS
141
- when nil
142
- redirect_to :action => :help, :id => :install
143
- else
144
- redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
145
- @tasks = [params[:id]]
146
- end
147
-
148
- $force = true if params[:update]
149
- $org = params[:organism] if params[:organism]
150
-
151
- end
152
-
153
- def configure
154
- end
155
-
156
- def purge_cache
157
- end
158
-
159
- end
160
-
161
- class View < SimpleConsole::View
162
- def usage
163
- puts $USAGE
164
- end
165
-
166
- def organisms
167
- require 'rbbt/sources/organism'
168
- all = Organism.all(false)
169
- installed = Organism.all
170
-
171
- all.each{|org|
172
- puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
173
- }
174
- end
175
-
176
-
177
- def prepare
178
- load File.join(Rbbt.rootdir, 'tasks/install.rake')
179
-
180
- @tasks.each{|t|
181
- puts "Invoking #{ t }"
182
- Rake::Task[t].invoke
183
- }
184
- end
185
-
186
- def install
187
-
188
- puts "Changing directory to #{@location}"
189
- chdir @location
190
-
191
- load "./Rakefile"
192
-
193
- Rake::Task['default'].invoke
194
- end
195
-
196
-
197
- def configure
198
-
199
- defaultdir = File.join(ENV['HOME'],'rbbt')
200
-
201
- cachedir = File.join(defaultdir, 'cache')
202
- tmpdir = File.join(defaultdir, 'tmp')
203
- datadir = File.join(defaultdir, 'data')
204
-
205
- puts "Please indicate where you wish to place the data directories"
206
- puts
207
-
208
- puts
209
- puts "* Cache Directory: This directory will hold downloads, from PubMed,
210
- Entrez and other, for local store. It might grow considerably."
211
- print "[#{ cachedir }]? "
212
- input = STDIN.gets
213
- cachedir = input if input =~ /\w/
214
-
215
- puts
216
- puts "* Tmp Directory: Temporary files."
217
- print "[#{ tmpdir }]? "
218
- input = STDIN.gets
219
- tmpdir = input if input =~ /\w/
220
-
221
- puts
222
- puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
223
- print "[#{ datadir }]? "
224
- input = STDIN.gets
225
- datadir = input if input =~ /\w/
226
-
227
-
228
-
229
- fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
230
- fout.puts "cachedir: #{cachedir}"
231
- fout.puts "tmpdir: #{tmpdir}"
232
- fout.puts "datadir: #{datadir}"
233
- fout.close
234
-
235
- end
236
-
237
- def purge_cache
238
- FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
239
- end
240
-
241
- end
242
-
243
- SimpleConsole::Application.run(ARGV, Controller, View)
244
-
245
-
@@ -1,36 +0,0 @@
1
- library('e1071')
2
-
3
- BOW.norm <- function(x, weights = NULL){
4
- x = 1 + log(x);
5
- x[x==-Inf] = 0;
6
- x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
7
- x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
8
- x.norm = x * x.sum;
9
- rm(x.sum);
10
- x.norm[is.na(x.norm)] = 0
11
-
12
- if (!is.null(weights)){
13
- x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
14
- }
15
-
16
- x.norm;
17
- }
18
-
19
-
20
- BOW.classification.model <- function(features, modelfile, dictfile = NULL){
21
- feats = read.table(features, sep="\t", header=T, row.names=1);
22
-
23
- if (!is.null(dictfile)){
24
- svm.weights = read.table(file=dictfile, sep="\t")[2];
25
- }else {
26
- svm.weights = NULL;
27
- }
28
- feats[-1] = BOW.norm(feats[-1], svm.weights);
29
- svm.model = svm(Class ~ ., data=feats, svm.weights);
30
- save(svm.model,svm.weights, file=modelfile);
31
- }
32
-
33
- BOW.classification.classify <- function(modelfile, x, weights = NULL){
34
- x = BOW.norm(x, weights);
35
- predict(modelfile, x);
36
- }
@@ -1,140 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/sources/pubmed'
4
- require 'rbbt/bow/bow'
5
- require 'rbbt/bow/dictionary'
6
- require 'rbbt/bow/classifier'
7
- require 'rbbt/util/misc'
8
-
9
- require 'progress-monitor'
10
- require 'rand'
11
-
12
- $hi ||= ENV['hi'] || 0.8
13
- $low ||= ENV['low'] || 0.01
14
- $max ||= ENV['max'] || 3000
15
- $bigrams ||= ENV['bigrams'] == 'true'
16
-
17
- $ndocs ||= ENV['ndocs'] || 5000
18
-
19
- desc "Bilds Dictionary and Features for an organism"
20
- rule(/data\/(.*)/) do |t|
21
- org = File.basename(t.name)
22
-
23
- go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
24
- all = Organism.literature(org).flatten.uniq - go
25
-
26
- ndocs = [go.length, all.length, $ndocs.to_i].min
27
- puts "Using #{ ndocs } from each class\n\n"
28
-
29
- go = go.shuffle[0..ndocs - 1]
30
- all = all.shuffle[0..ndocs - 1]
31
-
32
- dict = Dictionary::KL.new
33
-
34
-
35
-
36
- chunks = all.chunk(50)
37
- Progress.monitor("Building Dictionary for #{ org }: -")
38
- chunks.each{|chunk|
39
- PubMed.get_article(chunk).each{|pmid, article|
40
- words = BagOfWords.terms(article.text,$bigrams)
41
- dict.add(words, :-)
42
- }
43
- }
44
-
45
- chunks = go.chunk(50)
46
- Progress.monitor("Building Dictionary for #{ org }: +")
47
- chunks.each{|chunk|
48
- PubMed.get_article(chunk).each{|pmid, article|
49
- words = BagOfWords.terms(article.text,$bigrams)
50
- dict.add(words, :+)
51
- }
52
- }
53
-
54
- term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
55
- Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
56
-
57
- terms = term_weigths.keys.sort
58
-
59
- fout = File.open(t.name, 'w')
60
- fout.puts((['Name','Class'] + terms).join("\t"))
61
-
62
- Progress.monitor("Building Features for #{ org }")
63
- all.each{|pmid|
64
- text = PubMed.get_article(pmid).text
65
- fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
66
- }
67
- go.each{|pmid|
68
- text = PubMed.get_article(pmid).text
69
- fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
70
- }
71
-
72
-
73
- fout.close
74
- end
75
-
76
- rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
77
- features = t.name.sub(/model/,'data')
78
- Classifier.create_model(features, t.name, features + '.dict')
79
- end
80
-
81
- rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
82
- model = t.name.sub(/results/,'model')
83
- features = t.name.sub(/results/,'data')
84
- org = File.basename(t.name)
85
-
86
- ndocs = 1000
87
-
88
- used = Open.read(features).read.split(/\n/).collect{|l| l.chomp.split(/\t/).first}[1..-1]
89
-
90
- classifier = Classifier.new(model)
91
- go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
92
- all = Organism.literature(org).flatten.uniq - go - used
93
-
94
- go = go.shuffle[0..ndocs - 1]
95
- all = all.shuffle[0..ndocs - 1]
96
-
97
- ndocs = go.length + all.length
98
-
99
- raise "Not enogh unused articles to evaluate" if go.empty? || all.empty?
100
-
101
- features_go = PubMed.get_article(go).collect{|pmid, article|
102
- article.text
103
- }
104
- pos = classifier.classify(features_go).select{|v| v == '+'}.length
105
-
106
- features_all = PubMed.get_article(all).collect{|pmid, article|
107
- article.text
108
- }
109
- neg = classifier.classify(features_all).select{|v| v == '-'}.length
110
-
111
- puts "#{ pos } #{ neg }"
112
-
113
- precision = (pos + neg) / (ndocs).to_f
114
- recall = pos / go.length.to_f
115
- f1 = ( 2 * precision * recall) / (precision + recall ).to_f
116
-
117
- puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
118
- end
119
-
120
- task 'clean' do
121
- FileUtils.rm Dir.glob("data/*")
122
- FileUtils.rm Dir.glob("model/*")
123
- FileUtils.rm Dir.glob("results/*")
124
-
125
- end
126
- task 'all' do
127
- Organism.all.each{|org|
128
- Rake::Task["model/#{ org }"].invoke
129
- }
130
- end
131
- task 'update' do
132
- if $org
133
- FileUtils.rm Dir.glob("**/#{$org}.*") if $force
134
- Rake::Task["model/#{$org}"].invoke
135
- else
136
- Rake::Task['clean'].invoke if $force
137
- Rake::Task['all'].invoke
138
- end
139
- end
140
-
@@ -1,2 +0,0 @@
1
- #!/bin/bash
2
- wget http://pages.cs.wisc.edu/~bsettles/abner/abner.jar
@@ -1,25 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
4
- wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
5
- mv BANNER_v02.zip BANNER.zip
6
- mv gene_model_v02.bin gene_model.bin
7
- unzip BANNER.zip
8
- cd BANNER
9
- libs=`find libs/ -name "*.jar"`
10
- mkdir classes
11
- javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
12
- cd classes
13
- for f in ../libs/*.jar; do jar xf "$f";done
14
- jar cf banner.jar *
15
- mv banner.jar ../..
16
- cd ..
17
- cp -R nlpdata/ ../
18
- cd ..
19
- rm BANNER.zip
20
- rm -Rf BANNER
21
-
22
-
23
-
24
-
25
-