rbbt 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Miguel Vazquez
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
= rbbt
|
2
|
+
|
3
|
+
Description goes here.
|
4
|
+
|
5
|
+
== Note on Patches/Pull Requests
|
6
|
+
|
7
|
+
* Fork the project.
|
8
|
+
* Make your feature addition or bug fix.
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
10
|
+
future version unintentionally.
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
12
|
+
(If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
|
13
|
+
* Send me a pull request. Bonus points for topic branches.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.
|
data/bin/rbbt_config
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
|
6
|
+
|
7
|
+
require 'simpleconsole'
|
8
|
+
|
9
|
+
begin
|
10
|
+
require 'rbbt'
|
11
|
+
rescue Rbbt::NoConfig
|
12
|
+
$noconfig = true
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
# Help text printed by the `usage` view.
#
# The sub-actions listed under `update` must match the strings tested in
# Controller#update, which recognises "organisms", "ner" and "classifier";
# the text previously advertised "classification", which that method does not
# accept.
$USAGE =<<EOT
#{__FILE__} <action> [<subaction>] [--force] [--organism <org>]
actions:
* configure: Set paths for data, cache, and tmp directories

* install:
* basic: Third party software
* databases: Entrez and Biocreative
* models: Gene Mention and Classification
* organisms: Rules to gather data for organisms
* all: 3party wordlists entrez biocreative go ner norm classifier organisms polysearch

* update:
* organisms: Gather data for organisms
* ner: Build Named Entity Recognition Models for Gene Mention
* classifier:
Build Function/Process Classifiers

* purge_cache: Clean the non-persistent cache, which holds general things
downloaded using Open.read, like organism identifiers downloaded from
BioMart. The persistent cache, which hold pubmed articles or entrez gene
descriptions, is not cleaned, as these are not likely to change


EOT
|
41
|
+
|
42
|
+
# Command-line controller. Each public method is an action named on the
# command line; it prepares instance variables and globals that the matching
# View method and the loaded Rakefiles read.
class Controller < SimpleConsole::Controller

  params :bool   => {:f => :force},
         :string => {:o => :organism}

  # Sub-actions accepted by `update`; each is a directory name under
  # Rbbt.datadir.
  UPDATE_TARGETS = %w(organisms ner classifier)

  # Named bundles accepted by `install`, mapped to the tasks they expand to.
  INSTALL_BUNDLES = {
    "basic"     => %w(3party wordlists polysearch),
    "databases" => %w(entrez biocreative go),
    "models"    => %w(ner norm classifier),
    "organisms" => %w(organisms),
    "all"       => %w(3party wordlists entrez biocreative go ner norm classifier organisms polysearch)
  }

  # With no action given, show the usage text.
  def default
    render :action => :usage
  end

  # Show the usage text.
  def help
    render :action => :usage
  end

  # Pick the directory whose Rakefile should be run, or redirect to help when
  # the sub-action is not one of UPDATE_TARGETS. Also copies --force and
  # --organism into $force and $org.
  def update
    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig

    target = params[:id]
    if UPDATE_TARGETS.include?(target)
      @location = File.join(Rbbt.datadir, target)
    else
      redirect_to :action => :help, :id => :update
    end

    $force = true if params[:force]
    $org   = params[:organism] if params[:organism]
  end

  # Resolve the sub-action into the list of tasks to invoke. A known bundle
  # name expands to its task list, a missing sub-action redirects to help, and
  # any other value is treated as the name of a single task. Also copies
  # --force and --organism into $force and $org.
  def install
    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig

    bundle = params[:id]
    if bundle.nil?
      redirect_to :action => :help, :id => :install
    else
      @tasks = INSTALL_BUNDLES.fetch(bundle) { [bundle] }
    end

    $force = true if params[:force]
    $org   = params[:organism] if params[:organism]
  end

  # Nothing to prepare; all the work happens in View#configure.
  def configure
  end

  # Nothing to prepare; all the work happens in View#purge_cache.
  def purge_cache
  end

end
|
106
|
+
|
107
|
+
# Views for the command-line tool. Each method performs the visible work of
# the controller action with the same name.
class View < SimpleConsole::View

  # Print the usage text.
  def usage
    puts $USAGE
  end

  # Load the install tasks shipped under Rbbt.rootdir and invoke every task
  # the controller stored in @tasks, in order.
  def install
    load File.join(Rbbt.rootdir, 'tasks/install.rake')

    @tasks.each{|t|
      puts "Invoking #{ t }"
      Rake::Task[t].invoke
    }
  end

  # Change into the directory chosen by the controller, load its Rakefile and
  # invoke its 'default' task.
  # NOTE(review): the classifier Rakefile visible alongside this script
  # defines 'clean', 'all' and 'update' but no 'default' task -- confirm which
  # task is meant to run for that directory.
  def update
    puts "Changing directory to #{@location}"
    chdir @location

    load "./Rakefile"

    Rake::Task['default'].invoke
  end

  # Interactively ask for the cache, tmp and data directories (defaulting to
  # sub-directories of ~/rbbt) and write the answers to ~/.rbbt.
  def configure
    defaultdir = File.join(ENV['HOME'],'rbbt')

    # Describe one directory, show its default and read the reply. A reply
    # with no word character keeps the default; otherwise the reply is used
    # with surrounding whitespace (including the trailing newline) removed.
    ask = lambda {|description, default|
      puts description
      print "[#{ default }]? "
      input = STDIN.gets
      (input =~ /\w/) ? input.strip : default
    }

    puts "Please indicate where you wish to place the data directories"
    puts

    puts
    cachedir = ask.call("* Cache Directory: This directory will hold downloads, from PubMed,
Entrez and other, for local store. It might grow considerably.",
                        File.join(defaultdir, 'cache'))

    puts
    tmpdir = ask.call("* Tmp Directory: Temporary files.",
                      File.join(defaultdir, 'tmp'))

    puts
    datadir = ask.call("* Data Directory: Holds data from organisms, databases, third party software, etc.",
                       File.join(defaultdir, 'data'))

    # Block form closes the file even if one of the writes raises.
    File.open(File.join(ENV['HOME'], '.rbbt'),'w') do |fout|
      fout.puts "cachedir: #{cachedir}"
      fout.puts "tmpdir: #{tmpdir}"
      fout.puts "datadir: #{datadir}"
    end
  end

  # Delete everything directly inside the 'open-remote' directory of the
  # cache.
  def purge_cache
    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
  end

end
|
177
|
+
|
178
|
+
SimpleConsole::Application.run(ARGV, Controller, View)
|
179
|
+
|
180
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
library('e1071')
|
2
|
+
|
3
|
+
# Normalise a table of term counts row by row.
#
# Each count is replaced by 1 + log(count), with zero counts mapped to 0.
# Every row is then rescaled so that its entries add up to 100. When
# `weights` is given, column j is additionally multiplied by abs(weights[j]).
# Any NA produced along the way (e.g. a row that sums to zero) becomes 0.
BOW.norm <- function(x, weights = NULL){
    # log(0) is -Inf and so is 1 + log(0); reset those cells to 0.
    damped = 1 + log(x);
    damped[damped == -Inf] = 0;

    # Row totals as a one-column matrix, then 100 / total repeated across
    # every column so it can be multiplied element-wise.
    row.total = as.matrix(damped) %*% matrix(1, nrow=dim(damped)[2], ncol=1);
    row.scale = matrix(100/row.total, nrow=length(row.total), ncol=dim(damped)[2]);

    scaled = damped * row.scale;
    scaled[is.na(scaled)] = 0

    if (is.null(weights)){
        return(scaled);
    }

    # One row of absolute weights, repeated for every row of the table.
    weight.grid = matrix(abs(weights), ncol=length(weights), nrow=dim(scaled)[1], byrow=T);
    scaled * weight.grid;
}
|
18
|
+
|
19
|
+
|
20
|
+
# Train an SVM on a tab-separated feature table and save it.
#
# features:  path of the feature table; the first column holds row names and
#            a `Class` column holds the label.
# modelfile: path the objects `svm.model` and `svm.weights` are saved to.
# dictfile:  optional tab-separated file whose second column is used as
#            per-term weights.
BOW.classification.model <- function(features, modelfile, dictfile = NULL){
    feats = read.table(features, sep="\t", header=T, row.names=1);

    svm.weights = if (is.null(dictfile)) NULL else read.table(file=dictfile, sep="\t")[2];

    # Normalise every column except the first (the class label).
    feats[-1] = BOW.norm(feats[-1], svm.weights);

    # NOTE(review): svm.weights is passed to svm() positionally, and `[2]`
    # above yields a one-column data frame rather than a vector -- confirm
    # both are intended.
    svm.model = svm(Class ~ ., data=feats, svm.weights);

    # The variable names are what load() will restore, so they are kept as is.
    save(svm.model, svm.weights, file=modelfile);
}
|
32
|
+
|
33
|
+
# Normalise `x` with BOW.norm (using `weights` when given) and return the
# result of predict() on it. Despite its name, `modelfile` is handed to
# predict() as is.
BOW.classification.classify <- function(modelfile, x, weights = NULL){
    normalised = BOW.norm(x, weights);
    predict(modelfile, normalised);
}
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'rbbt/sources/pubmed'
|
4
|
+
require 'rbbt/bow/bow'
|
5
|
+
require 'rbbt/bow/dictionary'
|
6
|
+
require 'rbbt/bow/classifier'
|
7
|
+
require 'rbbt/util/misc'
|
8
|
+
|
9
|
+
require 'progress-monitor'
|
10
|
+
require 'rand'
|
11
|
+
|
12
|
+
# Tunables, each overridable through an environment variable of the same
# name. Values taken from the environment are strings; the rules below convert
# them with to_f / to_i where they are used.
$hi      = ENV.fetch('hi', 0.8)
$low     = ENV.fetch('low', 0.01)
$max     = ENV.fetch('max', 3000)
$bigrams = (ENV['bigrams'] == 'true')

$ndocs   = ENV.fetch('ndocs', 5000)
|
18
|
+
|
19
|
+
desc "Builds Dictionary and Features for an organism"
# Builds data/<org> (feature table) and data/<org>.dict (term weights).
#
# Articles linked to GO-annotated genes form the '+' class and the remaining
# literature of the organism the '-' class. The same number of articles is
# drawn at random from each class, a KL dictionary is built from their text,
# and one feature row per article is written.
rule(/data\/(.*)/) do |t|
  org = File.basename(t.name)

  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
  all = Organism.literature(org).flatten.uniq - go

  # Never negative, so first(ndocs) below is always valid.
  ndocs = [[go.length, all.length, $ndocs.to_i].min, 0].max
  puts "Using #{ ndocs } from each class\n\n"

  # first(ndocs) yields [] when ndocs is 0; the former [0..ndocs - 1] turned
  # into [0..-1] in that case and selected every article.
  go  = go.shuffle.first(ndocs)
  all = all.shuffle.first(ndocs)

  dict = Dictionary::KL.new

  # Negative class first, then positive, as labelled in the progress message.
  [[all, :-], [go, :+]].each{|pmids, label|
    chunks = pmids.chunk(50)
    Progress.monitor("Building Dictionary for #{ org }: #{ label }",1000)
    chunks.each{|chunk|
      PubMed.get_article(chunk).each{|pmid, article|
        words = BagOfWords.terms(article.text,$bigrams)
        dict.add(words, label)
      }
    }
  }

  term_weights = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
  Open.write(t.name + '.dict', term_weights.sort.collect{|p| p.join("\t")}.join("\n"))

  terms = term_weights.keys.sort

  # Block form closes the feature file even if fetching an article raises.
  File.open(t.name, 'w') do |fout|
    fout.puts((['Name','Class'] + terms).join("\t"))

    Progress.monitor("Building Features for #{ org }", 1000)
    all.each{|pmid|
      text = PubMed.get_article(pmid).text
      fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
    }
    go.each{|pmid|
      text = PubMed.get_article(pmid).text
      fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
    }
  end
end
|
75
|
+
|
76
|
+
# Builds model/<org> from data/<org> and data/<org>.dict. The prerequisite is
# the task name with 'model' replaced by 'data'.
rule(/model\/(.*)/ => lambda{|model_path| model_path.sub(/model/,'data')}) do |t|
  feature_file    = t.name.sub(/model/,'data')
  dictionary_file = feature_file + '.dict'
  Classifier.create_model(feature_file, t.name, dictionary_file)
end
|
80
|
+
|
81
|
+
# Evaluates model/<org> on articles that were not used to build data/<org>.
#
# Up to 100 unused articles are drawn from each class, classified, and the
# resulting accuracy, precision, recall and F1 are printed. The figure
# formerly printed as "Precision" was (pos + neg) / ndocs, which is accuracy;
# it is now reported under its own name and precision is computed as
# true positives / predicted positives.
rule(/results\/(.*)/ => lambda{|n| n.sub(/results/,'model')}) do |t|
  model    = t.name.sub(/results/,'model')
  features = t.name.sub(/results/,'data')
  org      = File.basename(t.name)

  sample_size = 100

  # First column of the feature table, minus the header row: the articles
  # already used for training.
  used = Open.read(features).collect{|l| l.chomp.split(/\t/).first}[1..-1]

  classifier = Classifier.new(model)
  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
  all = Organism.literature(org).flatten.uniq - go - used

  go  = go.shuffle[0..sample_size - 1]
  all = all.shuffle[0..sample_size - 1]

  raise "Not enough unused articles to evaluate" if go.empty? || all.empty?

  ndocs = go.length + all.length

  texts_go = PubMed.get_article(go).collect{|pmid, article| article.text}
  pos = classifier.classify(texts_go).select{|v| v == '+'}.length

  texts_all = PubMed.get_article(all).collect{|pmid, article| article.text}
  predictions_all = classifier.classify(texts_all)
  neg       = predictions_all.select{|v| v == '-'}.length
  false_pos = predictions_all.select{|v| v == '+'}.length

  puts "#{ pos } #{ neg }"

  accuracy  = (pos + neg) / ndocs.to_f
  predicted = pos + false_pos
  # Guard the divisions so a classifier that never answers '+' reports 0.0
  # rather than NaN.
  precision = predicted.zero? ? 0.0 : pos / predicted.to_f
  recall    = pos / go.length.to_f
  f1        = (precision + recall).zero? ? 0.0 : (2 * precision * recall) / (precision + recall)

  puts "Accuracy: #{ accuracy }"
  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
end
|
119
|
+
|
120
|
+
# Remove every file directly inside data/, model/ and results/.
task 'clean' do
  %w(data model results).each{|dir|
    FileUtils.rm Dir.glob("#{dir}/*")
  }
end
|
126
|
+
# Build the model of every organism returned by Organism.all.
task 'all' do
  for org in Organism.all
    Rake::Task["model/#{ org }"].invoke
  end
end
|
131
|
+
# Rebuild models, honouring the $org and $force globals.
#
# With $org set only that organism's model is built; otherwise all of them.
# With $force set the existing files are removed first so that they are
# rebuilt.
task 'update' do
  if $org
    if $force
      # The rules write data/<org> and model/<org> without an extension, and
      # data/<org>.dict with one; the former "**/<org>.*" pattern matched
      # only the latter, so nothing was actually rebuilt.
      stale = Dir.glob("**/#{$org}") + Dir.glob("**/#{$org}.*")
      FileUtils.rm stale.select{|f| File.file?(f)}
    end
    Rake::Task["model/#{$org}"].invoke
  else
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
  end
end
|
140
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/bin/bash
# Download BANNER and its gene model, compile the sources together with the
# bundled libraries into a single banner.jar, keep nlpdata/, and remove the
# unpacked tree.

# -O fixes the local file name; without it wget may keep the "?modtime=..."
# query string in the name and the archive would not be found afterwards.
wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0" -O BANNER.zip
wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0" -O gene_model.bin
unzip BANNER.zip

# Stop if the archive did not unpack: the relative cd/rm commands below would
# otherwise act on the wrong directories.
cd BANNER || exit 1

# Colon-separated classpath made of every bundled jar.
classpath=$(find libs/ -name "*.jar" | tr '\n' ':')
classpath=${classpath%:}

mkdir classes
javac -classpath "$classpath" -d classes $(find src/ -name "*.java")

cd classes || exit 1
# Unpack the libraries next to the compiled classes so that the resulting jar
# is self-contained.
for f in ../libs/*.jar; do jar xf "$f"; done
jar cf banner.jar *
mv banner.jar ../..
cd ..

cp -R nlpdata/ ../
cd ..

rm BANNER.zip
rm -Rf BANNER
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/bin/bash
# Download the BioCreative I and II corpora and rearrange them into the
# BC2GM, BC2GN and BC1GN directories with uniform names (train / test /
# devtest, genelist, synonyms.list).

mkdir src
# Stop if src is unavailable so the archives are not unpacked in place and the
# final "rm -Rf src" stays meaningful.
cd src || exit 1
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"

for f in *.gz; do tar xfz "$f"; done
unzip bc2GNtest.zip

cd ..

# BioCreative II, gene mention.
mkdir BC2GM
cp -R src/bc2geneMention/train/ BC2GM/
cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
mv BC2GM/train/alt_eval.perl BC2GM/

# BioCreative II, gene normalization.
mkdir BC2GN
cp -R src/biocreative2normalization/* BC2GN/
mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
mv BC2GN/trainingData/ BC2GN/Train
cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
mv BC2GN/Train/training.genelist BC2GN/Train/genelist
cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist

# BioCreative I, task 1b: the same renaming for each organism. Each entry is
# <directory>:<prefix used in the distributed directory names>.
mkdir BC1GN
cp -R src/biocreative1/bc1task1b/* BC1GN/
for pair in fly:Fly mouse:Mouse yeast:Yeast; do
    org=${pair%%:*}
    prefix=${pair##*:}

    mv "BC1GN/$org/${prefix}DevTest/" "BC1GN/$org/devtest"
    mv "BC1GN/$org/${prefix}Evaluation/" "BC1GN/$org/test"
    mv "BC1GN/$org/${prefix}NoisyTraining/" "BC1GN/$org/train"
    mv BC1GN/$org/*.list "BC1GN/$org/synonyms.list"
    mv BC1GN/$org/test/*gene_list "BC1GN/$org/test/genelist"
    # Concatenate the per-document gene lists into a single file.
    for f in BC1GN/$org/train/gene_list/*; do cat "$f" >> "BC1GN/$org/train/genelist"; done
    for f in BC1GN/$org/devtest/gene_lists/*; do cat "$f" >> "BC1GN/$org/devtest/genelist"; done
done

# Fix a bug in the perl script! :-|
grep -v 'else {EVALFILE = STDIN;}' BC1GN/task1Bscorer.pl >foo; mv foo BC1GN/task1Bscorer.pl

rm -Rf src
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/bin/bash
# Download CRF++ 0.51, install it under the directory this script is run from,
# build its Ruby binding into ruby/CRFPP.so and remove the source tree.

wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
tar xvfz crf++.tar.gz
rm crf++.tar.gz

# Stop if the sources did not unpack: the script later climbs two levels and
# runs "rm -Rf CRF* include", which must not happen from the wrong directory.
cd CRF* || exit 1
PREFIX=$(dirname "$PWD")

# "=" is the portable string comparison for [ ]; "==" is rejected by some
# shells.
if [ "$(uname -m)" = 'x86_64' ]; then
    WITH_PIC='--with-pic'
else
    WITH_PIC=''
fi

# $WITH_PIC is left unquoted on purpose so that it vanishes when empty.
./configure --prefix="$PREFIX" --exec-prefix="$PREFIX" $WITH_PIC
make install
cd ruby || exit 1

ruby extconf.rb --with-opt-lib="$PREFIX/lib/" --with-opt-include="$PREFIX/include/"
make
# Relink the binding against the static library that was just installed.
cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++

# -p: do not fail when ruby/ is left over from a previous run.
mkdir -p ../../ruby/
cp CRFPP.so ../../ruby/
cd ../../
rm -Rf CRF* include
|
25
|
+
|
26
|
+
|
@@ -0,0 +1,8 @@
|
|
1
|
+
#!/bin/bash
# Download the PolySearch term lists, saving each under a short local name.

# fetch <remote file> <local file>
fetch() {
    wget http://wishart.biology.ualberta.ca/polysearch/include/"$1" -O "$2"
}

fetch disease_IDlist.txt disease.txt
fetch organ_ID.txt organ.txt
fetch tissue_ID.txt tissue.txt
fetch subcellular_localization_ID.txt subcellular.txt
fetch drugnames.txt drug.txt
fetch HMDBnames.txt metabolite.txt
|