rbbt 1.2.5 → 2.0.0
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
-Copyright (c) 2009 Miguel Vazquez
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rbbt_config
DELETED
@@ -1,245 +0,0 @@
-#!/usr/bin/ruby
-
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-
-require 'rubygems'
-require 'rake'
-
-
-require 'simpleconsole'
-
-begin
-  require 'rbbt'
-rescue Rbbt::NoConfig
-  $noconfig = true
-end
-
-TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
-
-$USAGE =<<EOT
-#{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
-
-actions:
-
- * configure: Set paths for data, cache, and tmp directories
-
- * prepare:
-
-    Basic subactions:
-
-    * organisms: Install processing scripts to process organisms
-    * ner: Install processing scripts for Named Entity Recognition
-    * norm: Install processing scripts for Gene Mention Normalization
-    * classifier: Install processing scripts for Classification
-
-    * biocreative: Download and train and test data from BioCreative
-    * entrez: Download and install data from Entrez
-    * go: Download and install data from The Gene Ontology
-    * wordlists: Install word lists
-    * polysearch: Download and install Polysearch dictionaries
-
-    * abner: Download and install Abner NER system: http://pages.cs.wisc.edu/~bsettles/abner/
-    * banner: Download and install Banner NER system: http://sourceforge.net/projects/banner/
-    * crf++: Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
-
-    Subactions grouped by task:
-
-    * identifiers: entrez, organisms
-    * rner: entrez, organisms, biocreative, ner, crf++
-    * java_ner: entrez, organisms, abner, banner
-    * norm: entrez organisms, biocreative, crf++, norm, polysearch
-    * bow: organisms, wordlists
-    * classifier: organisms, wordlists, classifier, go
-    * all: #{TASKS.join(", ")}
-
- * install:
-    * organisms: Gather organisms data
-    * ner: Build Named Entity Recognition Models. Mention Normalization needs no training.
-    * classification: Build Function/Process Classifiers
-
- --update: Rebuild models or reprocess organism data even if present. You may want to purge the cache
-   to be up to date with the data in the internet.
-
- --organism: Gather data only for that particular organism. The organism must be specified by the
-   keyword. Use '#{__FILE__} organisms' to see find the keywords.
-
- * purge_cache: Clean the non-persistent cache, which holds general things
-   downloaded using Open.read, like organism identifiers downloaded from
-   BioMart. The persistent cache, which hold pubmed articles or entrez gene
-   descriptions, is not cleaned, as these are not likely to change
-
- * organisms: Show a list of all organisms along with their identifier in the system
-EOT
-
-class Controller < SimpleConsole::Controller
-
-  params :bool => {:u => :update},
-         :string => {:o => :organism}
-
-  def organisms
-  end
-
-
-  def default
-    render :action => :usage
-  end
-
-  def help
-    render :action => :usage
-  end
-
-  def install
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-
-    case params[:id]
-    when "organisms"
-      @location = File.join(Rbbt.datadir,'organisms')
-    when "ner"
-      @location = File.join(Rbbt.datadir,'ner')
-    when "classifier"
-      @location = File.join(Rbbt.datadir,'classifier')
-    else
-      redirect_to :action => :help, :id => :update
-    end
-
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-
-  end
-
-  def prepare
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-    case params[:id]
-    when "identifiers"
-      require 'rbbt/sources/organism'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms)
-    when "rner"
-      require 'rbbt/ner/rner'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative ner crf++)
-    when "java_ner"
-      require 'rjb'
-      @tasks = %w(entrez organisms abner banner)
-    when "norm"
-      require 'rbbt/ner/rner'
-      require 'rbbt/ner/rnorm'
-      require 'rbbt/ner/regexpNER'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative crf++ norm polysearch)
-    when "bow"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      @tasks = %w(organisms wordlists)
-    when "classifier"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      require 'rbbt/bow/classifier'
-      @tasks = %w(organisms wordlists classifier go)
-    when "all"
-      @tasks = TASKS
-    when nil
-      redirect_to :action => :help, :id => :install
-    else
-      redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
-      @tasks = [params[:id]]
-    end
-
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-
-  end
-
-  def configure
-  end
-
-  def purge_cache
-  end
-
-end
-
-class View < SimpleConsole::View
-  def usage
-    puts $USAGE
-  end
-
-  def organisms
-    require 'rbbt/sources/organism'
-    all = Organism.all(false)
-    installed = Organism.all
-
-    all.each{|org|
-      puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
-    }
-  end
-
-
-  def prepare
-    load File.join(Rbbt.rootdir, 'tasks/install.rake')
-
-    @tasks.each{|t|
-      puts "Invoking #{ t }"
-      Rake::Task[t].invoke
-    }
-  end
-
-  def install
-
-    puts "Changing directory to #{@location}"
-    chdir @location
-
-    load "./Rakefile"
-
-    Rake::Task['default'].invoke
-  end
-
-
-  def configure
-
-    defaultdir = File.join(ENV['HOME'],'rbbt')
-
-    cachedir = File.join(defaultdir, 'cache')
-    tmpdir = File.join(defaultdir, 'tmp')
-    datadir = File.join(defaultdir, 'data')
-
-    puts "Please indicate where you wish to place the data directories"
-    puts
-
-    puts
-    puts "* Cache Directory: This directory will hold downloads, from PubMed,
-      Entrez and other, for local store. It might grow considerably."
-    print "[#{ cachedir }]? "
-    input = STDIN.gets
-    cachedir = input if input =~ /\w/
-
-    puts
-    puts "* Tmp Directory: Temporary files."
-    print "[#{ tmpdir }]? "
-    input = STDIN.gets
-    tmpdir = input if input =~ /\w/
-
-    puts
-    puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
-    print "[#{ datadir }]? "
-    input = STDIN.gets
-    datadir = input if input =~ /\w/
-
-
-
-    fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
-    fout.puts "cachedir: #{cachedir}"
-    fout.puts "tmpdir: #{tmpdir}"
-    fout.puts "datadir: #{datadir}"
-    fout.close
-
-  end
-
-  def purge_cache
-    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
-  end
-
-end
-
-SimpleConsole::Application.run(ARGV, Controller, View)
-
-
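The $USAGE text above documents the command-line interface the removed rbbt_config script exposed in 1.2.5. For orientation only, the following invocations are hypothetical examples built from that usage text (Hsa is one of the organism keywords shipped in data/install_scripts/organisms), not commands quoted from the gem's documentation:

  rbbt_config configure
  rbbt_config prepare rner --organism Hsa --update
  rbbt_config install ner
  rbbt_config purge_cache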
data/install_scripts/classifier/R/classify.R
DELETED
@@ -1,36 +0,0 @@
-library('e1071')
-
-BOW.norm <- function(x, weights = NULL){
-  x = 1 + log(x);
-  x[x==-Inf] = 0;
-  x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
-  x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
-  x.norm = x * x.sum;
-  rm(x.sum);
-  x.norm[is.na(x.norm)] = 0
-
-  if (!is.null(weights)){
-    x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
-  }
-
-  x.norm;
-}
-
-
-BOW.classification.model <- function(features, modelfile, dictfile = NULL){
-  feats = read.table(features, sep="\t", header=T, row.names=1);
-
-  if (!is.null(dictfile)){
-    svm.weights = read.table(file=dictfile, sep="\t")[2];
-  }else {
-    svm.weights = NULL;
-  }
-  feats[-1] = BOW.norm(feats[-1], svm.weights);
-  svm.model = svm(Class ~ ., data=feats, svm.weights);
-  save(svm.model,svm.weights, file=modelfile);
-}
-
-BOW.classification.classify <- function(modelfile, x, weights = NULL){
-  x = BOW.norm(x, weights);
-  predict(modelfile, x);
-}
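Note on the removed R helpers: BOW.norm applies a 1 + log(count) transform, resets the -Inf values produced by zero counts to 0, rescales each row so its terms sum to 100, and optionally multiplies each column by the absolute value of its dictionary weight. As a made-up illustration, a row of raw counts (1, 2, 0) becomes (1, 1 + log 2, 0) ≈ (1, 1.69, 0) and is then rescaled to roughly (37.1, 62.9, 0). BOW.classification.model fits an e1071 SVM on features normalized this way and saves it; BOW.classification.classify applies the same normalization before predicting.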
data/install_scripts/classifier/Rakefile
DELETED
@@ -1,140 +0,0 @@
-require 'rbbt'
-require 'rbbt/sources/organism'
-require 'rbbt/sources/pubmed'
-require 'rbbt/bow/bow'
-require 'rbbt/bow/dictionary'
-require 'rbbt/bow/classifier'
-require 'rbbt/util/misc'
-
-require 'progress-monitor'
-require 'rand'
-
-$hi ||= ENV['hi'] || 0.8
-$low ||= ENV['low'] || 0.01
-$max ||= ENV['max'] || 3000
-$bigrams ||= ENV['bigrams'] == 'true'
-
-$ndocs ||= ENV['ndocs'] || 5000
-
-desc "Bilds Dictionary and Features for an organism"
-rule(/data\/(.*)/) do |t|
-  org = File.basename(t.name)
-
-  go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
-  all = Organism.literature(org).flatten.uniq - go
-
-  ndocs = [go.length, all.length, $ndocs.to_i].min
-  puts "Using #{ ndocs } from each class\n\n"
-
-  go = go.shuffle[0..ndocs - 1]
-  all = all.shuffle[0..ndocs - 1]
-
-  dict = Dictionary::KL.new
-
-
-
-  chunks = all.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: -")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :-)
-    }
-  }
-
-  chunks = go.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: +")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :+)
-    }
-  }
-
-  term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
-  Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
-
-  terms = term_weigths.keys.sort
-
-  fout = File.open(t.name, 'w')
-  fout.puts((['Name','Class'] + terms).join("\t"))
-
-  Progress.monitor("Building Features for #{ org }")
-  all.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
-  }
-  go.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
-  }
-
-
-  fout.close
-end
-
-rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
-  features = t.name.sub(/model/,'data')
-  Classifier.create_model(features, t.name, features + '.dict')
-end
-
-rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
-  model = t.name.sub(/results/,'model')
-  features = t.name.sub(/results/,'data')
-  org = File.basename(t.name)
-
-  ndocs = 1000
-
-  used = Open.read(features).read.split(/\n/).collect{|l| l.chomp.split(/\t/).first}[1..-1]
-
-  classifier = Classifier.new(model)
-  go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
-  all = Organism.literature(org).flatten.uniq - go - used
-
-  go = go.shuffle[0..ndocs - 1]
-  all = all.shuffle[0..ndocs - 1]
-
-  ndocs = go.length + all.length
-
-  raise "Not enogh unused articles to evaluate" if go.empty? || all.empty?
-
-  features_go = PubMed.get_article(go).collect{|pmid, article|
-    article.text
-  }
-  pos = classifier.classify(features_go).select{|v| v == '+'}.length
-
-  features_all = PubMed.get_article(all).collect{|pmid, article|
-    article.text
-  }
-  neg = classifier.classify(features_all).select{|v| v == '-'}.length
-
-  puts "#{ pos } #{ neg }"
-
-  precision = (pos + neg) / (ndocs).to_f
-  recall = pos / go.length.to_f
-  f1 = ( 2 * precision * recall) / (precision + recall ).to_f
-
-  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
-end
-
-task 'clean' do
-  FileUtils.rm Dir.glob("data/*")
-  FileUtils.rm Dir.glob("model/*")
-  FileUtils.rm Dir.glob("results/*")
-
-end
-task 'all' do
-  Organism.all.each{|org|
-    Rake::Task["model/#{ org }"].invoke
-  }
-end
-task 'update' do
-  if $org
-    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
-    Rake::Task["model/#{$org}"].invoke
-  else
-    Rake::Task['clean'].invoke if $force
-    Rake::Task['all'].invoke
-  end
-end
-
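The results/ rule above scores a trained model on held-out abstracts. A minimal, self-contained Ruby sketch of the arithmetic it performs is shown below; the counts are invented for illustration and do not come from any actual run:

  # Hypothetical counts: 1000 GO abstracts and 1000 background abstracts,
  # of which 800 and 900 respectively are labelled correctly.
  pos, neg = 800, 900
  go_size  = 1000
  ndocs    = 2000

  precision = (pos + neg) / ndocs.to_f    # fraction of all held-out abstracts labelled correctly
  recall    = pos / go_size.to_f          # fraction of GO abstracts recovered as '+'
  f1        = (2 * precision * recall) / (precision + recall)

  puts "Precision: #{precision}, Recall: #{recall}, F1: #{f1}"
  # Prints precision 0.85, recall 0.8 and an F1 of roughly 0.824.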
data/install_scripts/get_banner.sh
DELETED
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
-wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
-mv BANNER_v02.zip BANNER.zip
-mv gene_model_v02.bin gene_model.bin
-unzip BANNER.zip
-cd BANNER
-libs=`find libs/ -name "*.jar"`
-mkdir classes
-javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
-cd classes
-for f in ../libs/*.jar; do jar xf "$f";done
-jar cf banner.jar *
-mv banner.jar ../..
-cd ..
-cp -R nlpdata/ ../
-cd ..
-rm BANNER.zip
-rm -Rf BANNER
-
-
-
-
-