rbbt 1.1.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
@@ -1,36 +0,0 @@
|
|
1
|
-
library('e1071')
|
2
|
-
|
3
|
-
BOW.norm <- function(x, weights = NULL){
|
4
|
-
x = 1 + log(x);
|
5
|
-
x[x==-Inf] = 0;
|
6
|
-
x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
|
7
|
-
x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
|
8
|
-
x.norm = x * x.sum;
|
9
|
-
rm(x.sum);
|
10
|
-
x.norm[is.na(x.norm)] = 0
|
11
|
-
|
12
|
-
if (!is.null(weights)){
|
13
|
-
x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
|
14
|
-
}
|
15
|
-
|
16
|
-
x.norm;
|
17
|
-
}
|
18
|
-
|
19
|
-
|
20
|
-
BOW.classification.model <- function(features, modelfile, dictfile = NULL){
|
21
|
-
feats = read.table(features, sep="\t", header=T, row.names=1);
|
22
|
-
|
23
|
-
if (!is.null(dictfile)){
|
24
|
-
svm.weights = read.table(file=dictfile, sep="\t")[2];
|
25
|
-
}else {
|
26
|
-
svm.weights = NULL;
|
27
|
-
}
|
28
|
-
feats[-1] = BOW.norm(feats[-1], svm.weights);
|
29
|
-
svm.model = svm(Class ~ ., data=feats, svm.weights);
|
30
|
-
save(svm.model,svm.weights, file=modelfile);
|
31
|
-
}
|
32
|
-
|
33
|
-
BOW.classification.classify <- function(modelfile, x, weights = NULL){
|
34
|
-
x = BOW.norm(x, weights);
|
35
|
-
predict(modelfile, x);
|
36
|
-
}
|
@@ -1,145 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/sources/organism'
|
3
|
-
require 'rbbt/sources/pubmed'
|
4
|
-
require 'rbbt/bow/bow'
|
5
|
-
require 'rbbt/bow/dictionary'
|
6
|
-
require 'rbbt/bow/classifier'
|
7
|
-
require 'rbbt/util/misc'
|
8
|
-
|
9
|
-
require 'progress-monitor'
|
10
|
-
require 'rand'
|
11
|
-
|
12
|
-
$hi = ENV['hi'] || 0.8
|
13
|
-
$low = ENV['low'] || 0.01
|
14
|
-
$max = ENV['max'] || 3000
|
15
|
-
$bigrams = ENV['bigrams'] == 'true' || false
|
16
|
-
|
17
|
-
$ndocs = ENV['ndocs'] || 5000
|
18
|
-
|
19
|
-
desc "Bilds Dictionary and Features for an organism"
|
20
|
-
rule(/data\/(.*)/) do |t|
|
21
|
-
org = File.basename(t.name)
|
22
|
-
|
23
|
-
go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
|
24
|
-
all = Organism.literature(org).flatten.uniq - go
|
25
|
-
|
26
|
-
ndocs = [go.length, all.length, $ndocs.to_i].min
|
27
|
-
puts "Using #{ ndocs } from each class\n\n"
|
28
|
-
|
29
|
-
go = go.shuffle[0..ndocs - 1]
|
30
|
-
all = all.shuffle[0..ndocs - 1]
|
31
|
-
|
32
|
-
dict = Dictionary::KL.new
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
chunks = all.chunk(50)
|
37
|
-
Progress.monitor("Building Dictionary for #{ org }: -",1000)
|
38
|
-
chunks.each{|chunk|
|
39
|
-
PubMed.get_article(chunk).each{|pmid, article|
|
40
|
-
words = BagOfWords.terms(article.text,$bigrams)
|
41
|
-
dict.add(words, :-)
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
chunks = go.chunk(50)
|
46
|
-
Progress.monitor("Building Dictionary for #{ org }: +",1000)
|
47
|
-
chunks.each{|chunk|
|
48
|
-
PubMed.get_article(chunk).each{|pmid, article|
|
49
|
-
words = BagOfWords.terms(article.text,$bigrams)
|
50
|
-
dict.add(words, :+)
|
51
|
-
}
|
52
|
-
}
|
53
|
-
|
54
|
-
term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
|
55
|
-
Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
|
56
|
-
|
57
|
-
terms = term_weigths.keys.sort
|
58
|
-
|
59
|
-
fout = File.open(t.name, 'w')
|
60
|
-
fout.puts((['Name','Class'] + terms).join("\t"))
|
61
|
-
|
62
|
-
Progress.monitor("Building Features for #{ org }", 1000)
|
63
|
-
all.each{|pmid|
|
64
|
-
text = PubMed.get_article(pmid).text
|
65
|
-
fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
|
66
|
-
}
|
67
|
-
go.each{|pmid|
|
68
|
-
text = PubMed.get_article(pmid).text
|
69
|
-
fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
|
70
|
-
}
|
71
|
-
|
72
|
-
|
73
|
-
fout.close
|
74
|
-
end
|
75
|
-
|
76
|
-
rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
|
77
|
-
features = t.name.sub(/model/,'data')
|
78
|
-
Classifier.create_model(features, t.name, features + '.dict')
|
79
|
-
end
|
80
|
-
|
81
|
-
rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
|
82
|
-
model = t.name.sub(/results/,'model')
|
83
|
-
features = t.name.sub(/results/,'data')
|
84
|
-
org = File.basename(t.name)
|
85
|
-
|
86
|
-
ndocs = 100
|
87
|
-
|
88
|
-
used = []
|
89
|
-
if "".respond_to? :collect
|
90
|
-
used = Open.read(features).collect{|l| l.chomp.split(/\t/).first}[1..-1]
|
91
|
-
else
|
92
|
-
used = Open.read(features).lines.collect{|l| l.chomp.split(/\t/).first}[1..-1]
|
93
|
-
end
|
94
|
-
|
95
|
-
classifier = Classifier.new(model)
|
96
|
-
go = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
|
97
|
-
all = Organism.literature(org).flatten.uniq - go - used
|
98
|
-
|
99
|
-
go = go.shuffle[0..ndocs - 1]
|
100
|
-
all = all.shuffle[0..ndocs - 1]
|
101
|
-
|
102
|
-
ndocs = go.length + all.length
|
103
|
-
|
104
|
-
raise "Not enogh unused articles to evaluate" if go.empty? || all.empty?
|
105
|
-
|
106
|
-
features_go = PubMed.get_article(go).collect{|pmid, article|
|
107
|
-
article = article.text
|
108
|
-
}
|
109
|
-
pos = classifier.classify(features_go).select{|v| v == '+'}.length
|
110
|
-
|
111
|
-
features_all = PubMed.get_article(all).collect{|pmid, article|
|
112
|
-
article = article.text
|
113
|
-
}
|
114
|
-
neg = classifier.classify(features_all).select{|v| v == '-'}.length
|
115
|
-
|
116
|
-
puts "#{ pos } #{ neg }"
|
117
|
-
|
118
|
-
precision = (pos + neg) / (ndocs).to_f
|
119
|
-
recall = pos / go.length.to_f
|
120
|
-
f1 = ( 2 * precision * recall) / (precision + recall ).to_f
|
121
|
-
|
122
|
-
puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
|
123
|
-
end
|
124
|
-
|
125
|
-
task 'clean' do
|
126
|
-
FileUtils.rm Dir.glob("data/*")
|
127
|
-
FileUtils.rm Dir.glob("model/*")
|
128
|
-
FileUtils.rm Dir.glob("results/*")
|
129
|
-
|
130
|
-
end
|
131
|
-
task 'all' do
|
132
|
-
Organism.all.each{|org|
|
133
|
-
Rake::Task["model/#{ org }"].invoke
|
134
|
-
}
|
135
|
-
end
|
136
|
-
task 'update' do
|
137
|
-
if $org
|
138
|
-
FileUtils.rm Dir.glob("**/#{$org}.*") if $force
|
139
|
-
Rake::Task["model/#{$org}"].invoke
|
140
|
-
else
|
141
|
-
Rake::Task['clean'].invoke if $force
|
142
|
-
Rake::Task['all'].invoke
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
@@ -1,25 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
|
3
|
-
wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
|
4
|
-
wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
|
5
|
-
mv BANNER_v02.zip BANNER.zip
|
6
|
-
mv gene_model_v02.bin gene_model.bin
|
7
|
-
unzip BANNER.zip
|
8
|
-
cd BANNER
|
9
|
-
libs=`find libs/ -name "*.jar"`
|
10
|
-
mkdir classes
|
11
|
-
javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
|
12
|
-
cd classes
|
13
|
-
for f in ../libs/*.jar; do jar xf "$f";done
|
14
|
-
jar cf banner.jar *
|
15
|
-
mv banner.jar ../..
|
16
|
-
cd ..
|
17
|
-
cp -R nlpdata/ ../
|
18
|
-
cd ..
|
19
|
-
rm BANNER.zip
|
20
|
-
rm -Rf BANNER
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
@@ -1,72 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
|
3
|
-
mkdir src
|
4
|
-
cd src
|
5
|
-
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
|
6
|
-
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
|
7
|
-
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
|
8
|
-
wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
|
9
|
-
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
|
10
|
-
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
|
11
|
-
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"
|
12
|
-
|
13
|
-
for f in *.gz; do tar xfz $f; done
|
14
|
-
unzip bc2GNtest.zip
|
15
|
-
|
16
|
-
cd ..
|
17
|
-
|
18
|
-
mkdir BC2GM
|
19
|
-
cp -R src/bc2geneMention/train/ BC2GM/
|
20
|
-
cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
|
21
|
-
mv BC2GM/train/alt_eval.perl BC2GM/
|
22
|
-
|
23
|
-
mkdir BC2GN
|
24
|
-
cp -R src/biocreative2normalization/* BC2GN/
|
25
|
-
mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
|
26
|
-
mv BC2GN/trainingData/ BC2GN/Train
|
27
|
-
cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
|
28
|
-
mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
|
29
|
-
mv BC2GN/Train/training.genelist BC2GN/Train/genelist
|
30
|
-
cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist
|
31
|
-
|
32
|
-
mkdir BC1GN
|
33
|
-
cp -R src/biocreative1/bc1task1b/* BC1GN/
|
34
|
-
mv BC1GN/fly/FlyDevTest/ BC1GN/fly/devtest
|
35
|
-
mv BC1GN/fly/FlyEvaluation/ BC1GN/fly/test
|
36
|
-
mv BC1GN/fly/FlyNoisyTraining/ BC1GN/fly/train
|
37
|
-
mv BC1GN/fly/*.list BC1GN/fly/synonyms.list
|
38
|
-
mv BC1GN/fly/test/*gene_list BC1GN/fly/test/genelist
|
39
|
-
for f in BC1GN/fly/train/gene_list/*; do cat "$f" >> BC1GN/fly/train/genelist;done
|
40
|
-
for f in BC1GN/fly/devtest/gene_lists/*; do cat "$f" >> BC1GN/fly/devtest/genelist;done
|
41
|
-
mv BC1GN/mouse/MouseDevTest/ BC1GN/mouse/devtest
|
42
|
-
mv BC1GN/mouse/MouseEvaluation/ BC1GN/mouse/test
|
43
|
-
mv BC1GN/mouse/MouseNoisyTraining/ BC1GN/mouse/train
|
44
|
-
mv BC1GN/mouse/*.list BC1GN/mouse/synonyms.list
|
45
|
-
mv BC1GN/mouse/test/*gene_list BC1GN/mouse/test/genelist
|
46
|
-
for f in BC1GN/mouse/train/gene_list/*; do cat "$f" >> BC1GN/mouse/train/genelist;done
|
47
|
-
for f in BC1GN/mouse/devtest/gene_lists/*; do cat "$f" >> BC1GN/mouse/devtest/genelist;done
|
48
|
-
mv BC1GN/yeast/YeastDevTest/ BC1GN/yeast/devtest
|
49
|
-
mv BC1GN/yeast/YeastEvaluation/ BC1GN/yeast/test
|
50
|
-
mv BC1GN/yeast/YeastNoisyTraining/ BC1GN/yeast/train
|
51
|
-
mv BC1GN/yeast/*.list BC1GN/yeast/synonyms.list
|
52
|
-
mv BC1GN/yeast/test/*gene_list BC1GN/yeast/test/genelist
|
53
|
-
for f in BC1GN/yeast/train/gene_list/*; do cat "$f" >> BC1GN/yeast/train/genelist;done
|
54
|
-
for f in BC1GN/yeast/devtest/gene_lists/*; do cat "$f" >> BC1GN/yeast/devtest/genelist;done
|
55
|
-
# Fix a bug in the perl script! :-|
|
56
|
-
cat BC1GN/task1Bscorer.pl |grep -v 'else {EVALFILE = STDIN;}' >foo; mv foo BC1GN/task1Bscorer.pl
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
rm -Rf src
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
@@ -1,26 +0,0 @@
|
|
1
|
-
wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
|
2
|
-
tar xvfz crf++.tar.gz
|
3
|
-
rm crf++.tar.gz
|
4
|
-
cd CRF*
|
5
|
-
PREFIX=$(dirname $PWD)
|
6
|
-
|
7
|
-
if [ `uname -m` == 'x86_64' ]; then
|
8
|
-
WITH_PIC='--with-pic';
|
9
|
-
else
|
10
|
-
WITH_PIC=''
|
11
|
-
fi
|
12
|
-
|
13
|
-
./configure --prefix=$PREFIX --exec-prefix=$PREFIX $WITH_PIC;
|
14
|
-
make install
|
15
|
-
cd ruby
|
16
|
-
|
17
|
-
ruby extconf.rb --with-opt-lib=$PREFIX/lib/ --with-opt-include=$PREFIX/include/
|
18
|
-
make
|
19
|
-
cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++
|
20
|
-
|
21
|
-
mkdir ../../ruby/
|
22
|
-
cp CRFPP.so ../../ruby/
|
23
|
-
cd ../../
|
24
|
-
rm -Rf CRF* include
|
25
|
-
|
26
|
-
|
data/install_scripts/get_go.sh
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
|
3
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt -O disease.txt
|
4
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt -O organ.txt
|
5
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt -O tissue.txt
|
6
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt -O subcellular.txt
|
7
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt -O drug.txt
|
8
|
-
wget http://wishart.biology.ualberta.ca/polysearch/include/HMDBnames.txt -O metabolite.txt
|
@@ -1,206 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/organism'
|
2
|
-
require 'rbbt/sources/biocreative'
|
3
|
-
require 'rbbt/ner/rner'
|
4
|
-
|
5
|
-
require 'progress-monitor'
|
6
|
-
|
7
|
-
|
8
|
-
$type = ENV['type'] || 'rner'
|
9
|
-
|
10
|
-
#{{{ FEATURES
|
11
|
-
|
12
|
-
def BC2GM_features(dataset, outfile)
|
13
|
-
data = Biocreative.BC2GM(dataset)
|
14
|
-
|
15
|
-
fout = File.open(outfile,'w')
|
16
|
-
parser = NERFeatures.new
|
17
|
-
|
18
|
-
Progress.monitor("CRFPP Features BC2GM #{ dataset }")
|
19
|
-
data.each{|code, info|
|
20
|
-
text = info[:text]
|
21
|
-
mentions = info[:mentions]
|
22
|
-
|
23
|
-
features = parser.tagged_features(text,mentions)
|
24
|
-
|
25
|
-
features.each{|feat|
|
26
|
-
fout.puts feat.join(" ")
|
27
|
-
}
|
28
|
-
fout.puts
|
29
|
-
}
|
30
|
-
fout.close
|
31
|
-
end
|
32
|
-
|
33
|
-
def BC2GN_features(dataset, outfile)
|
34
|
-
data = {}
|
35
|
-
Dir.glob(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'*.txt')).each{|f|
|
36
|
-
code = File.basename(f).sub(/.txt/,'')
|
37
|
-
data[code] = {}
|
38
|
-
data[code][:text] = Open.read(f)
|
39
|
-
}
|
40
|
-
Open.read(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'genelist')).each_line{|l|
|
41
|
-
code, gene, mention = l.chomp.split(/\t/)
|
42
|
-
data[code][:mentions] ||= []
|
43
|
-
data[code][:mentions] << mention
|
44
|
-
}
|
45
|
-
|
46
|
-
fout = File.open(outfile,'w')
|
47
|
-
parser = NERFeatures.new
|
48
|
-
|
49
|
-
Progress.monitor("CRFPP Features BC2GN #{ dataset }")
|
50
|
-
data.each{|code, info|
|
51
|
-
text = info[:text]
|
52
|
-
mentions = info[:mentions]
|
53
|
-
next if mentions.nil?
|
54
|
-
|
55
|
-
features = parser.tagged_features(text,mentions)
|
56
|
-
|
57
|
-
features.each{|feat|
|
58
|
-
fout.puts feat.join(" ")
|
59
|
-
}
|
60
|
-
fout.puts
|
61
|
-
}
|
62
|
-
fout.close
|
63
|
-
end
|
64
|
-
|
65
|
-
def org_features(org, outfile)
|
66
|
-
names = Organism.lexicon(org).collect{|code, names|
|
67
|
-
names
|
68
|
-
}.flatten
|
69
|
-
|
70
|
-
fout = File.open(outfile,'w')
|
71
|
-
parser = NERFeatures.new
|
72
|
-
|
73
|
-
Progress.monitor("CRFPP Features #{ org }")
|
74
|
-
names.each{|name|
|
75
|
-
features = parser.text_features(name, true)
|
76
|
-
features.each{|feat|
|
77
|
-
fout.puts feat.join(" ")
|
78
|
-
}
|
79
|
-
fout.puts
|
80
|
-
}
|
81
|
-
fout.close
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
file "data/BC2GM_train.features" do |t|
|
87
|
-
BC2GM_features(:train, 'data/BC2GM_train.features')
|
88
|
-
end
|
89
|
-
|
90
|
-
file "data/BC2GM_test.features" do |t|
|
91
|
-
BC2GM_features(:test, 'data/BC2GM_test.features')
|
92
|
-
end
|
93
|
-
file "data/BC2GN_Train.features" do |t|
|
94
|
-
BC2GN_features('Train', 'data/BC2GN_Train.features')
|
95
|
-
end
|
96
|
-
|
97
|
-
file "data/BC2GN_Test.features" do |t|
|
98
|
-
BC2GN_features('Test', 'data/BC2GN_Test.features')
|
99
|
-
end
|
100
|
-
|
101
|
-
|
102
|
-
file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
|
103
|
-
Open.write('data/BC2GM.features',Open.read('data/BC2GM_train.features'))
|
104
|
-
Open.append('data/BC2GM.features',Open.read('data/BC2GM_test.features'))
|
105
|
-
end
|
106
|
-
|
107
|
-
file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
|
108
|
-
Open.write('data/BC2GN.features',Open.read('data/BC2GN_Train.features'))
|
109
|
-
Open.append('data/BC2GN.features',Open.read('data/BC2GN_Test.features'))
|
110
|
-
end
|
111
|
-
|
112
|
-
|
113
|
-
file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
|
114
|
-
Open.write('data/BC2.features',Open.read('data/BC2GM.features'))
|
115
|
-
Open.append('data/BC2.features',Open.read('data/BC2GN.features'))
|
116
|
-
end
|
117
|
-
|
118
|
-
file "data/train.features" => [
|
119
|
-
#'data/BC2GN.features',
|
120
|
-
'data/BC2GM_train.features'
|
121
|
-
] do |t|
|
122
|
-
t.prerequisites.each_with_index{|f,i|
|
123
|
-
if i == 0
|
124
|
-
Open.write('data/train.features',Open.read(f))
|
125
|
-
else
|
126
|
-
Open.append('data/train.features',Open.read(f))
|
127
|
-
end
|
128
|
-
}
|
129
|
-
end
|
130
|
-
|
131
|
-
rule (/data\/(.*).features/) => ['data/BC2.features'] do |t|
|
132
|
-
org = File.basename(t.name).sub(/.features$/,'')
|
133
|
-
org_features(org, t.name)
|
134
|
-
Open.append(t.name, Open.read('data/BC2.features'))
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
#{{{ MODEL
|
140
|
-
rule (/model\/(.*)/) => lambda {|t| t.sub(/model/,'data') + '.features'} do |t|
|
141
|
-
parser = NERFeatures.new
|
142
|
-
parser.train( t.name.sub(/model/,'data') + '.features', t.name)
|
143
|
-
end
|
144
|
-
|
145
|
-
task 'clean' do
|
146
|
-
FileUtils.rm Dir.glob("data/*")
|
147
|
-
FileUtils.rm Dir.glob("model/*")
|
148
|
-
FileUtils.rm Dir.glob("results/*")
|
149
|
-
|
150
|
-
end
|
151
|
-
|
152
|
-
task 'all' do
|
153
|
-
Organism.all.each{|org|
|
154
|
-
Rake::Task["model/#{ org }"].invoke
|
155
|
-
}
|
156
|
-
end
|
157
|
-
|
158
|
-
task 'default' do
|
159
|
-
if $org
|
160
|
-
FileUtils.rm Dir.glob("**/#{$org}.*") if $force
|
161
|
-
Rake::Task["model/#{$org}"].invoke
|
162
|
-
else
|
163
|
-
Rake::Task['clean'].invoke if $force
|
164
|
-
Rake::Task['all'].invoke
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
#{{{ EVALUATE
|
169
|
-
|
170
|
-
|
171
|
-
def find(model, type, outfile)
|
172
|
-
ner = Organism.ner(:human,type,:model => model)
|
173
|
-
|
174
|
-
data = Biocreative.BC2GM(:test)
|
175
|
-
|
176
|
-
fout = File.open(outfile,'w')
|
177
|
-
|
178
|
-
Progress.monitor("Test")
|
179
|
-
data.each{|code,info|
|
180
|
-
text = info[:text]
|
181
|
-
mentions = ner.extract(text)
|
182
|
-
|
183
|
-
mentions.each{|mention|
|
184
|
-
positions = Biocreative.position(text,mention)
|
185
|
-
positions.each{|pos|
|
186
|
-
fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
|
187
|
-
}
|
188
|
-
}
|
189
|
-
}
|
190
|
-
|
191
|
-
end
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
rule (/results\/test$/) do |t|
|
196
|
-
org = File.basename(t.name)
|
197
|
-
|
198
|
-
if $type == 'rner'
|
199
|
-
Rake::Task['model/train'].invoke
|
200
|
-
end
|
201
|
-
find('model/train',$type,t.name)
|
202
|
-
end
|
203
|
-
|
204
|
-
rule (/results\/test.eval$/) => ['results/test'] do |t|
|
205
|
-
Biocreative.BC2GM_eval('results/test',:test, 'results/test.eval')
|
206
|
-
end
|