rbbt 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
-Copyright (c) 2009 Miguel Vazquez
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rbbt_config
DELETED
@@ -1,245 +0,0 @@
-#!/usr/bin/ruby
-
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-
-require 'rubygems'
-require 'rake'
-
-
-require 'simpleconsole'
-
-begin
-  require 'rbbt'
-rescue Rbbt::NoConfig
-  $noconfig = true
-end
-
-TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
-
-$USAGE =<<EOT
-#{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
-
-actions:
-
- * configure: Set paths for data, cache, and tmp directories
-
- * prepare:
-
-    Basic subactions:
-
-      * organisms: Install processing scripts to process organisms
-      * ner: Install processing scripts for Named Entity Recognition
-      * norm: Install processing scripts for Gene Mention Normalization
-      * classifier: Install processing scripts for Classification
-
-      * biocreative: Download and train and test data from BioCreative
-      * entrez: Download and install data from Entrez
-      * go: Download and install data from The Gene Ontology
-      * wordlists: Install word lists
-      * polysearch: Download and install Polysearch dictionaries
-
-      * abner: Download and install Abner NER system: http://pages.cs.wisc.edu/~bsettles/abner/
-      * banner: Download and install Banner NER system: http://sourceforge.net/projects/banner/
-      * crf++: Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
-
-    Subactions grouped by task:
-
-      * identifiers: entrez, organisms
-      * rner: entrez, organisms, biocreative, ner, crf++
-      * java_ner: entrez, organisms, abner, banner
-      * norm: entrez organisms, biocreative, crf++, norm, polysearch
-      * bow: organisms, wordlists
-      * classifier: organisms, wordlists, classifier, go
-      * all: #{TASKS.join(", ")}
-
- * install:
-    * organisms: Gather organisms data
-    * ner: Build Named Entity Recognition Models. Mention Normalization needs no training.
-    * classification: Build Function/Process Classifiers
-
---update: Rebuild models or reprocess organism data even if present. You may want to purge the cache
-          to be up to date with the data in the internet.
-
---organism: Gather data only for that particular organism. The organism must be specified by the
-            keyword. Use '#{__FILE__} organisms' to see find the keywords.
-
- * purge_cache: Clean the non-persistent cache, which holds general things
-   downloaded using Open.read, like organism identifiers downloaded from
-   BioMart. The persistent cache, which hold pubmed articles or entrez gene
-   descriptions, is not cleaned, as these are not likely to change
-
- * organisms: Show a list of all organisms along with their identifier in the system
-EOT
-
-class Controller < SimpleConsole::Controller
-
-  params :bool => {:u => :update},
-         :string => {:o => :organism}
-
-  def organisms
-  end
-
-
-  def default
-    render :action => :usage
-  end
-
-  def help
-    render :action => :usage
-  end
-
-  def install
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-
-    case params[:id]
-    when "organisms"
-      @location = File.join(Rbbt.datadir,'organisms')
-    when "ner"
-      @location = File.join(Rbbt.datadir,'ner')
-    when "classifier"
-      @location = File.join(Rbbt.datadir,'classifier')
-    else
-      redirect_to :action => :help, :id => :update
-    end
-
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-
-  end
-
-  def prepare
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-    case params[:id]
-    when "identifiers"
-      require 'rbbt/sources/organism'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms)
-    when "rner"
-      require 'rbbt/ner/rner'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative ner crf++)
-    when "java_ner"
-      require 'rjb'
-      @tasks = %w(entrez organisms abner banner)
-    when "norm"
-      require 'rbbt/ner/rner'
-      require 'rbbt/ner/rnorm'
-      require 'rbbt/ner/regexpNER'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative crf++ norm polysearch)
-    when "bow"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      @tasks = %w(organisms wordlists)
-    when "classifier"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      require 'rbbt/bow/classifier'
-      @tasks = %w(organisms wordlists classifier go)
-    when "all"
-      @tasks = TASKS
-    when nil
-      redirect_to :action => :help, :id => :install
-    else
-      redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
-      @tasks = [params[:id]]
-    end
-
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-
-  end
-
-  def configure
-  end
-
-  def purge_cache
-  end
-
-end
-
-class View < SimpleConsole::View
-  def usage
-    puts $USAGE
-  end
-
-  def organisms
-    require 'rbbt/sources/organism'
-    all = Organism.all(false)
-    installed = Organism.all
-
-    all.each{|org|
-      puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
-    }
-  end
-
-
-  def prepare
-    load File.join(Rbbt.rootdir, 'tasks/install.rake')
-
-    @tasks.each{|t|
-      puts "Invoking #{ t }"
-      Rake::Task[t].invoke
-    }
-  end
-
-  def install
-
-    puts "Changing directory to #{@location}"
-    chdir @location
-
-    load "./Rakefile"
-
-    Rake::Task['default'].invoke
-  end
-
-
-  def configure
-
-    defaultdir = File.join(ENV['HOME'],'rbbt')
-
-    cachedir = File.join(defaultdir, 'cache')
-    tmpdir = File.join(defaultdir, 'tmp')
-    datadir = File.join(defaultdir, 'data')
-
-    puts "Please indicate where you wish to place the data directories"
-    puts
-
-    puts
-    puts "* Cache Directory: This directory will hold downloads, from PubMed,
-    Entrez and other, for local store. It might grow considerably."
-    print "[#{ cachedir }]? "
-    input = STDIN.gets
-    cachedir = input if input =~ /\w/
-
-    puts
-    puts "* Tmp Directory: Temporary files."
-    print "[#{ tmpdir }]? "
-    input = STDIN.gets
-    tmpdir = input if input =~ /\w/
-
-    puts
-    puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
-    print "[#{ datadir }]? "
-    input = STDIN.gets
-    datadir = input if input =~ /\w/
-
-
-
-    fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
-    fout.puts "cachedir: #{cachedir}"
-    fout.puts "tmpdir: #{tmpdir}"
-    fout.puts "datadir: #{datadir}"
-    fout.close
-
-  end
-
-  def purge_cache
-    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
-  end
-
-end
-
-SimpleConsole::Application.run(ARGV, Controller, View)
-
-
data/install_scripts/classifier/R/classify.R
DELETED
@@ -1,36 +0,0 @@
-library('e1071')
-
-BOW.norm <- function(x, weights = NULL){
-  x = 1 + log(x);
-  x[x==-Inf] = 0;
-  x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
-  x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
-  x.norm = x * x.sum;
-  rm(x.sum);
-  x.norm[is.na(x.norm)] = 0
-
-  if (!is.null(weights)){
-    x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
-  }
-
-  x.norm;
-}
-
-
-BOW.classification.model <- function(features, modelfile, dictfile = NULL){
-  feats = read.table(features, sep="\t", header=T, row.names=1);
-
-  if (!is.null(dictfile)){
-    svm.weights = read.table(file=dictfile, sep="\t")[2];
-  }else {
-    svm.weights = NULL;
-  }
-  feats[-1] = BOW.norm(feats[-1], svm.weights);
-  svm.model = svm(Class ~ ., data=feats, svm.weights);
-  save(svm.model,svm.weights, file=modelfile);
-}
-
-BOW.classification.classify <- function(modelfile, x, weights = NULL){
-  x = BOW.norm(x, weights);
-  predict(modelfile, x);
-}
data/install_scripts/classifier/Rakefile
DELETED
@@ -1,140 +0,0 @@
-require 'rbbt'
-require 'rbbt/sources/organism'
-require 'rbbt/sources/pubmed'
-require 'rbbt/bow/bow'
-require 'rbbt/bow/dictionary'
-require 'rbbt/bow/classifier'
-require 'rbbt/util/misc'
-
-require 'progress-monitor'
-require 'rand'
-
-$hi ||= ENV['hi'] || 0.8
-$low ||= ENV['low'] || 0.01
-$max ||= ENV['max'] || 3000
-$bigrams ||= ENV['bigrams'] == 'true'
-
-$ndocs ||= ENV['ndocs'] || 5000
-
-desc "Bilds Dictionary and Features for an organism"
-rule(/data\/(.*)/) do |t|
-  org = File.basename(t.name)
-
-  go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
-  all = Organism.literature(org).flatten.uniq - go
-
-  ndocs = [go.length, all.length, $ndocs.to_i].min
-  puts "Using #{ ndocs } from each class\n\n"
-
-  go = go.shuffle[0..ndocs - 1]
-  all = all.shuffle[0..ndocs - 1]
-
-  dict = Dictionary::KL.new
-
-
-
-  chunks = all.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: -")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :-)
-    }
-  }
-
-  chunks = go.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: +")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :+)
-    }
-  }
-
-  term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
-  Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
-
-  terms = term_weigths.keys.sort
-
-  fout = File.open(t.name, 'w')
-  fout.puts((['Name','Class'] + terms).join("\t"))
-
-  Progress.monitor("Building Features for #{ org }")
-  all.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
-  }
-  go.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
-  }
-
-
-  fout.close
-end
-
-rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
-  features = t.name.sub(/model/,'data')
-  Classifier.create_model(features, t.name, features + '.dict')
-end
-
-rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
-  model = t.name.sub(/results/,'model')
-  features = t.name.sub(/results/,'data')
-  org = File.basename(t.name)
-
-  ndocs = 1000
-
-  used = Open.read(features).read.split(/\n/).collect{|l| l.chomp.split(/\t/).first}[1..-1]
-
-  classifier = Classifier.new(model)
-  go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
-  all = Organism.literature(org).flatten.uniq - go - used
-
-  go = go.shuffle[0..ndocs - 1]
-  all = all.shuffle[0..ndocs - 1]
-
-  ndocs = go.length + all.length
-
-  raise "Not enogh unused articles to evaluate" if go.empty? || all.empty?
-
-  features_go = PubMed.get_article(go).collect{|pmid, article|
-    article.text
-  }
-  pos = classifier.classify(features_go).select{|v| v == '+'}.length
-
-  features_all = PubMed.get_article(all).collect{|pmid, article|
-    article.text
-  }
-  neg = classifier.classify(features_all).select{|v| v == '-'}.length
-
-  puts "#{ pos } #{ neg }"
-
-  precision = (pos + neg) / (ndocs).to_f
-  recall = pos / go.length.to_f
-  f1 = ( 2 * precision * recall) / (precision + recall ).to_f
-
-  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
-end
-
-task 'clean' do
-  FileUtils.rm Dir.glob("data/*")
-  FileUtils.rm Dir.glob("model/*")
-  FileUtils.rm Dir.glob("results/*")
-
-end
-task 'all' do
-  Organism.all.each{|org|
-    Rake::Task["model/#{ org }"].invoke
-  }
-end
-task 'update' do
-  if $org
-    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
-    Rake::Task["model/#{$org}"].invoke
-  else
-    Rake::Task['clean'].invoke if $force
-    Rake::Task['all'].invoke
-  end
-end
-
data/install_scripts/get_banner.sh
DELETED
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
-wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
-mv BANNER_v02.zip BANNER.zip
-mv gene_model_v02.bin gene_model.bin
-unzip BANNER.zip
-cd BANNER
-libs=`find libs/ -name "*.jar"`
-mkdir classes
-javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
-cd classes
-for f in ../libs/*.jar; do jar xf "$f";done
-jar cf banner.jar *
-mv banner.jar ../..
-cd ..
-cp -R nlpdata/ ../
-cd ..
-rm BANNER.zip
-rm -Rf BANNER
-
-
-
-
-