rbbt 1.2.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
@@ -1,72 +0,0 @@
|
|
1
|
-
#!/bin/bash
# Downloads the BioCreative I and II archives from SourceForge mirrors,
# unpacks them in a scratch directory (src) and rearranges the pieces into
# the BC2GM, BC2GN and BC1GN directories. The scratch directory is removed
# at the end.

# Abort if the scratch directory cannot be entered; otherwise the archives
# would be unpacked in the current directory and the later `cd ..` would
# leave the install location.
mkdir src
cd src || exit 1
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"

for f in *.gz; do tar xfz "$f"; done
unzip bc2GNtest.zip

cd ..

# BioCreative II, gene mention task
mkdir BC2GM
cp -R src/bc2geneMention/train/ BC2GM/
cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
mv BC2GM/train/alt_eval.perl BC2GM/

# BioCreative II, gene normalization task
mkdir BC2GN
cp -R src/biocreative2normalization/* BC2GN/
mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
mv BC2GN/trainingData/ BC2GN/Train
cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
mv BC2GN/Train/training.genelist BC2GN/Train/genelist
cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist

# BioCreative I, task 1B: the same renames are applied to every organism.
# Each entry is "<directory>:<capitalised prefix used by the archive>".
mkdir BC1GN
cp -R src/biocreative1/bc1task1b/* BC1GN/
for entry in fly:Fly mouse:Mouse yeast:Yeast; do
  org=${entry%%:*}
  prefix=${entry##*:}

  mv "BC1GN/$org/${prefix}DevTest/" "BC1GN/$org/devtest"
  mv "BC1GN/$org/${prefix}Evaluation/" "BC1GN/$org/test"
  mv "BC1GN/$org/${prefix}NoisyTraining/" "BC1GN/$org/train"
  mv BC1GN/"$org"/*.list "BC1GN/$org/synonyms.list"
  mv BC1GN/"$org"/test/*gene_list "BC1GN/$org/test/genelist"
  # Note the differing directory names: train uses gene_list, devtest gene_lists.
  for f in BC1GN/"$org"/train/gene_list/*; do cat "$f" >> "BC1GN/$org/train/genelist"; done
  for f in BC1GN/"$org"/devtest/gene_lists/*; do cat "$f" >> "BC1GN/$org/devtest/genelist"; done
done

# Fix a bug in the perl script! :-|
# Only replace the scorer when the filtered copy was actually produced.
grep -v 'else {EVALFILE = STDIN;}' BC1GN/task1Bscorer.pl > foo && mv foo BC1GN/task1Bscorer.pl

rm -Rf src
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
@@ -1,26 +0,0 @@
|
|
1
|
-
# Downloads CRF++ 0.51, installs it under the directory this script is run
# from, builds the Ruby binding and leaves CRFPP.so in ./ruby/.
wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
tar xvfz crf++.tar.gz
rm crf++.tar.gz

# Stop if the unpacked tree is missing: everything below (including the final
# `rm -Rf`) assumes we are inside it.
cd CRF* || exit 1
PREFIX=$(dirname "$PWD")

# `=` instead of `==`: this script has no shebang, and `==` inside `[ ]` is
# not accepted by every sh.
if [ "$(uname -m)" = 'x86_64' ]; then
  WITH_PIC='--with-pic'
else
  WITH_PIC=''
fi

# $WITH_PIC is deliberately unquoted so that an empty value adds no argument.
./configure --prefix="$PREFIX" --exec-prefix="$PREFIX" $WITH_PIC
make install
cd ruby || exit 1

ruby extconf.rb --with-opt-lib="$PREFIX/lib/" --with-opt-include="$PREFIX/include/"
make
cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++

mkdir ../../ruby/
cp CRFPP.so ../../ruby/
cd ../../
rm -Rf CRF* include
|
25
|
-
|
26
|
-
|
data/install_scripts/get_go.sh
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
#!/bin/bash

# Fetches the PolySearch term lists and stores each one under a short local
# file name.
base=http://wishart.biology.ualberta.ca/polysearch/include

# fetch <remote file> <local file>
fetch() {
  wget "$base/$1" -O "$2"
}

fetch disease_IDlist.txt disease.txt
fetch organ_ID.txt organ.txt
fetch tissue_ID.txt tissue.txt
fetch subcellular_localization_ID.txt subcellular.txt
fetch drugnames.txt drug.txt
fetch HMDBnames.txt metabolite.txt
|
@@ -1,206 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/organism'
|
2
|
-
require 'rbbt/sources/biocreative'
|
3
|
-
require 'rbbt/ner/rner'
|
4
|
-
|
5
|
-
require 'progress-monitor'
|
6
|
-
|
7
|
-
|
8
|
-
# NER implementation to evaluate; taken from the `type` environment variable, defaulting to 'rner'.
$type = ENV.fetch('type', 'rner')
|
9
|
-
|
10
|
-
#{{{ FEATURES
|
11
|
-
|
12
|
-
# Writes the feature file for one BC2GM dataset.
#
# dataset - dataset identifier handed to Biocreative.BC2GM (the file tasks
#           in this Rakefile pass :train or :test)
# outfile - path of the feature file to (over)write
#
# For every document the rows returned by NERFeatures#tagged_features are
# written one per line, joined by single spaces, followed by an empty line.
# Returns nil.
def BC2GM_features(dataset, outfile)
  data = Biocreative.BC2GM(dataset)
  parser = NERFeatures.new

  # The block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("CRFPP Features BC2GM #{ dataset }")
    data.each do |_code, info|
      features = parser.tagged_features(info[:text], info[:mentions])
      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
32
|
-
|
33
|
-
# Writes the feature file for one BC2GN dataset.
#
# dataset - name of the dataset directory below
#           <Rbbt.datadir>/biocreative/BC2GN (the file tasks pass 'Train'
#           or 'Test')
# outfile - path of the feature file to (over)write
#
# Document texts come from the *.txt files of the dataset directory and the
# mentions from the third tab-separated column of its `genelist` file.
# Documents without any mention are skipped. Returns nil.
def BC2GN_features(dataset, outfile)
  base = File.join(Rbbt.datadir, 'biocreative', 'BC2GN', dataset)

  data = {}
  Dir.glob(File.join(base, '*.txt')).each do |f|
    # Strip only a trailing ".txt"; the former sub(/.txt/, '') removed the
    # first "<any char>txt" it found anywhere in the name.
    data[File.basename(f, '.txt')] = { :text => Open.read(f) }
  end

  Open.read(File.join(base, 'genelist')).each_line do |l|
    code, _gene, mention = l.chomp.split(/\t/)
    # Ignore entries for documents we have no text for (previously a
    # NoMethodError on nil) and lines without a mention column.
    next unless data.key?(code) && mention
    (data[code][:mentions] ||= []) << mention
  end

  parser = NERFeatures.new

  # The block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("CRFPP Features BC2GN #{ dataset }")
    data.each do |_code, info|
      mentions = info[:mentions]
      next if mentions.nil?

      features = parser.tagged_features(info[:text], mentions)
      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
64
|
-
|
65
|
-
# Writes a feature file built from every name in an organism's lexicon.
#
# org     - organism identifier handed to Organism.lexicon
# outfile - path of the feature file to (over)write
#
# Each name is passed to NERFeatures#text_features with `true` as second
# argument; the resulting rows are written one per line, joined by single
# spaces, with an empty line after each name. Returns nil.
def org_features(org, outfile)
  # The block parameter no longer reuses the name of the outer variable.
  names = Organism.lexicon(org).collect { |_code, synonyms| synonyms }.flatten
  parser = NERFeatures.new

  # The block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
85
|
-
|
86
|
-
# Feature files for the individual BioCreative II datasets. Every task
# writes the file it is named after.
file "data/BC2GM_train.features" do |t|
  BC2GM_features(:train, t.name)
end

file "data/BC2GM_test.features" do |t|
  BC2GM_features(:test, t.name)
end

file "data/BC2GN_Train.features" do |t|
  BC2GN_features('Train', t.name)
end

file "data/BC2GN_Test.features" do |t|
  BC2GN_features('Test', t.name)
end
|
100
|
-
|
101
|
-
|
102
|
-
# Combined feature files: each target is its first source followed by the
# second one.

# BC2GM = train + test
file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
  Open.write(t.name, Open.read('data/BC2GM_train.features'))
  Open.append(t.name, Open.read('data/BC2GM_test.features'))
end

# BC2GN = Train + Test
file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
  Open.write(t.name, Open.read('data/BC2GN_Train.features'))
  Open.append(t.name, Open.read('data/BC2GN_Test.features'))
end

# BC2 = BC2GM + BC2GN (written in that order, although the prerequisites
# are listed the other way round)
file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
  Open.write(t.name, Open.read('data/BC2GM.features'))
  Open.append(t.name, Open.read('data/BC2GN.features'))
end
|
117
|
-
|
118
|
-
# Training features: the concatenation of all prerequisites, in order.
file "data/train.features" => [
  #'data/BC2GN.features',
  'data/BC2GM_train.features'
] do |t|
  t.prerequisites.each_with_index do |source, index|
    # The first source creates/overwrites the target, later ones are appended.
    writer = index.zero? ? :write : :append
    Open.send(writer, 'data/train.features', Open.read(source))
  end
end
|
130
|
-
|
131
|
-
# data/<org>.features: the features of the organism's lexicon followed by
# the combined BioCreative II features.
#
# The pattern now escapes the dot and is anchored at the end, so it only
# matches names that really end in ".features" (the old pattern accepted any
# character before "features", anywhere in the name).
rule(/data\/(.*)\.features$/ => ['data/BC2.features']) do |t|
  org = File.basename(t.name, '.features')
  org_features(org, t.name)
  Open.append(t.name, Open.read('data/BC2.features'))
end
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
#{{{ MODEL
|
140
|
-
# Maps a model path to the feature file it is trained from
# (first "model" replaced by "data", plus ".features").
features_for = lambda { |path| path.sub(/model/,'data') + '.features' }

# model/<name>: train a model from the matching feature file.
rule(/model\/(.*)/ => features_for) do |t|
  NERFeatures.new.train(features_for.call(t.name), t.name)
end
|
144
|
-
|
145
|
-
# Deletes everything directly inside data/, model/ and results/.
task 'clean' do
  %w(data model results).each do |dir|
    FileUtils.rm Dir.glob("#{dir}/*")
  end
end
|
151
|
-
|
152
|
-
# Builds the model of every organism returned by Organism.all.
task 'all' do
  Organism.all.each do |organism|
    Rake::Task["model/#{ organism }"].invoke
  end
end
|
157
|
-
|
158
|
-
# Default task, driven by the globals $org and $force (neither is assigned
# in this Rakefile):
# - without $org, build all models, cleaning first when $force is set;
# - with $org, build only that model, first deleting every file named
#   "<$org>.*" below the current directory when $force is set.
task 'default' do
  if !$org
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
  else
    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
    Rake::Task["model/#{$org}"].invoke
  end
end
|
167
|
-
|
168
|
-
#{{{ EVALUATE
|
169
|
-
|
170
|
-
|
171
|
-
# Runs a NER system over the BC2GM test set and writes the mentions found.
#
# model   - model path, passed to Organism.ner as the :model option
# type    - NER type, passed to Organism.ner
# outfile - path of the results file to (over)write
#
# One line is written per position reported by Biocreative.position, in
# the form "<code>|<pos[0]> <pos[1]>|<mention>".
def find(model, type, outfile)
  ner = Organism.ner(:human,type,:model => model)

  data = Biocreative.BC2GM(:test)

  # The file used to be left open, so its contents could still be sitting in
  # the write buffer when the evaluation rule read it. The block form
  # flushes and closes it before returning.
  File.open(outfile, 'w') do |fout|
    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("Test")
    data.each do |code, info|
      text = info[:text]

      ner.extract(text).each do |mention|
        Biocreative.position(text, mention).each do |pos|
          fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
        end
      end
    end
  end
end
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
# results/test: NER output for the BC2GM test set. For the 'rner' type the
# model/train task is invoked first.
# (The unused local `org` was removed.)
rule(/results\/test$/) do |t|
  Rake::Task['model/train'].invoke if $type == 'rner'
  find('model/train',$type,t.name)
end
|
203
|
-
|
204
|
-
# results/test.eval: evaluation of results/test against the BC2GM test set.
# The dot is now escaped so only a literal ".eval" suffix matches.
rule(/results\/test\.eval$/ => ['results/test']) do |t|
  Biocreative.BC2GM_eval('results/test',:test, 'results/test.eval')
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
# Feature definitions, written in the feature DSL. Each line declares a
# feature by name, backed either by a regular expression or by a block
# that receives the token.

# Whole-token character classes
isLetters     /^[A-Z]+$/i
isUpper       /^[A-Z]+$/
isLower       /^[a-z]+$/
isDigits      /^[0-9]+$/i
isRoman       /^[IVX]+$/
isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim       /^[\/()\[\]{}\-]$/
isNonWord     /^[^\w]+$/
# The alternation is now grouped. Previously `^` applied only to "and" and
# `$` only to ",", so any token merely containing "or" or "&" matched.
isConjunction /^(?:and|or|&|,)$/

# Partial matches
hasLetters     /[A-Z]/i
hasUpper       /.[A-Z]/
hasLower       /[a-z]/
hasDigits      /[0-9]/i
hasGreek       /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim       /[\/()\[\]{}\-]/
hasNonWord     /[^\w]/
caspMix        /[a-z].[A-Z]/
keywords       /(?:protein|gene|domain|ase)s?$/
hasSuffix      /[a-z][A-Z0-9]$/

# Counts
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits  do |w| w.scan(/[0-9]/).length end
#
# Leading and trailing character groups
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# NOTE(review): token1 and token2 use `sub`, which replaces only the first
# match of each pattern. If a shape for the whole token was intended this
# should be `gsub` -- confirm before changing, since it alters the features
# existing models were trained with.
token1 do |w|
  w.sub(/[A-Z]/,'A').
    sub(/[a-z]/,'a').
    sub(/[0-9]/,'0').
    sub(/[^0-9a-z]/i,'x')
end
token2 do |w|
  w.sub(/[A-Z]+/,'A').
    sub(/[a-z]+/,'a').
    sub(/[0-9]+/,'0').
    sub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
|
51
|
-
|
52
|
-
|
@@ -1,219 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/sources/organism'
|
3
|
-
require 'rbbt/util/open'
|
4
|
-
require 'rbbt/ner/rner'
|
5
|
-
require 'rbbt/ner/rnorm'
|
6
|
-
|
7
|
-
|
8
|
-
require 'progress-monitor'
|
9
|
-
|
10
|
-
# Run-time switches, all read from the environment:
#   ner     - NER implementation to use (defaults to :rner)
#   debug   - when set (to anything), print per-document diagnostics
#   perfect - when set (to anything), use the gold-standard mentions
#             instead of running the NER
#   docs    - comma-separated document ids to process instead of all files
$type    = ENV.fetch('ner', :rner)
$debug   = ENV.key?('debug')
$perfect = ENV.key?('perfect')
$docs    = ENV['docs']
|
14
|
-
|
15
|
-
|
16
|
-
# Corpus key => Rbbt organism code, used to locate the organism's identifier
# file and to select its NER.
# NOTE(review): 'fly' is mapped to 'Sce', the same code as 'yeast' -- this
# looks like a placeholder or copy/paste slip; confirm the intended organism.
$org2rbbt = Hash[
  'yeast', 'Sce',
  'mouse', 'Mmu',
  'fly',   'Sce',
  'bc2gn', 'Hsa'
]
|
22
|
-
|
23
|
-
# Runs gene normalization over a set of documents and writes one
# "<doc id>\t<code>\t<mention>" line per code selected for a mention.
#
# org          - corpus key: 'bc2gn', or the name of a directory below
#                biocreative/BC1GN
# filedir      - directory with the <doc id>.txt files; '*.txt' is appended
#                to it directly when globbing, so it must end in a separator
# goldstandard - gold-standard genelist, read with Open.to_hash
# outfile      - path of the results file to (over)write
#
# Reads the globals $type, $org2rbbt, $docs, $perfect and $debug. With
# $debug set, per-document diagnostics (text, gold codes, mentions, false
# negatives and false positives) are printed to standard output.
# Returns nil.
def match(org, filedir, goldstandard,outfile)
  lexicon_file = if org == 'bc2gn'
                   File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list")
                 else
                   File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list")
                 end

  t = Time.now
  custom_file = File.join('config', org + '.config')
  to_entrez = if org == 'bc2gn'
                false
              else
                Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
                             :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
                             :fix => proc{|l| l.sub(/S000/,'S0')})
              end
  norm = Normalizer.new(lexicon_file,
                        :to_entrez => to_entrez,
                        :file => (File.exist?(custom_file) ? custom_file : nil),
                        :max_candidates => 200)
  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"

  ner = if $type.to_s == 'rner'
          NER.new('models/' + org)
        else
          Organism.ner($org2rbbt[org], $type)
        end

  # The block form closes the results file even if a document fails.
  File.open(outfile, 'w') do |fout|
    gs          = Open.to_hash(goldstandard,:native => 0,:extra => 1)
    gs_mentions = Open.to_hash(goldstandard,:native => 0,:extra => 2)
    lex         = Open.to_hash(lexicon_file, :sep => "\t|\\|")

    files = if $docs
              $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
            else
              Dir.glob(filedir + '*.txt').sort
            end

    # Prints one FN/FP diagnostic line, with the lexicon names when known.
    report = lambda do |label, code|
      if lex[code]
        puts "#{label}: #{ code } => #{lex[code].flatten.join(", ")}"
      else
        puts "#{label}: #{ code }"
      end
    end

    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("Processing Files")
    files.each do |f|
      # Strip only a trailing ".txt" (was sub(/.txt/, ''), which removed the
      # first "<any char>txt" anywhere in the name).
      fid = File.basename(f, '.txt')

      text = Open.read(f)
      mentions = if $perfect
                   (gs_mentions[fid] || []).flatten
                 else
                   ner.extract(text).uniq
                 end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention),mention,text)

        found += codes
        codes.each do |code|
          #code = code.sub(/S000/,'S0')
          fout.puts "#{ fid }\t#{ code}\t#{mention}"
        end

        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      next unless $debug

      found.uniq!
      gold = (gs[fid] || []).flatten.uniq

      (gold - found).each { |code| report.call("FN", code) }
      # False positives without a lexicon entry used to be printed as "FN:".
      (found - gold).each { |code| report.call("FP", code) }
    end
  end

  nil
end
|
131
|
-
|
132
|
-
# models/<org>.features: features for every name in the corpus lexicon
# (all tab-separated fields after the first one on each line), followed by
# the BioCreative II features produced by the ner Rakefile.
#
# Changes from the previous version:
# - the lexicon is read with File.readlines, so no handle is left open;
# - lines are chomped, so the last name of a line no longer carries the
#   trailing newline;
# - the pattern escapes the dot and is anchored at the end.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name, '.features')

  lexicon = if org == 'bc2gn'
              File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
            else
              File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
            end

  names = File.readlines(lexicon).collect do |line|
    fields = line.chomp.split(/\t/)
    fields.shift # the first field is not a name
    fields.reject { |name| name.empty? }
  end.flatten

  parser = NERFeatures.new

  # The block form closes the file even when feature extraction raises.
  File.open(t.name, 'w') do |fout|
    # Kept directly in front of the loop, as in the original layout.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
|
167
|
-
|
168
|
-
# models/<org>: train a model from models/<org>.features.
# (The unused local `org` was removed.)
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda { |name| name + '.features' }) do |t|
  parser = NERFeatures.new
  parser.train( t.name + '.features', t.name)
end
|
174
|
-
|
175
|
-
|
176
|
-
# results/<org>_<dataset>: normalization results for one BC1GN dataset.
# For the rner type the organism's model task is invoked first.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  dataset_dir  = "biocreative/BC1GN/#{ org }/#{ dataset }"
  filedir      = File.join(Rbbt.datadir, "#{ dataset_dir }/text/")
  goldstandard = File.join(Rbbt.datadir, "#{ dataset_dir }/genelist")

  match(org,filedir, goldstandard,t.name)
end
|
188
|
-
|
189
|
-
# results/<org>_<dataset>.eval: score a results file with the BC1GN perl
# scorer, redirecting its output into the .eval file.
#
# All ".eval" patterns now escape the dot and are anchored at the end, so
# only the literal suffix is matched and stripped.
rule(/results\/(.+)_(.+)\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  results = t.name.sub(/\.eval$/, '')
  org, dataset = File.basename(results).split(/_/)

  scorer = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  gold   = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{gold} #{results} > #{t.name}"
  puts cmd
  system cmd
end
|
196
|
-
|
197
|
-
# results/bc2gn: normalization results for the BC2GN test documents.
# For the rner type the bc2gn model task is invoked first.
rule(/results\/bc2gn$/) do |t|
  corpus = 'bc2gn'

  Rake::Task['models/' + corpus].invoke if $type.to_sym == :rner

  test_dir     = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(corpus, test_dir, goldstandard, t.name)
end
|
209
|
-
|
210
|
-
# results/bc2gn.eval: score results/bc2gn with the BC2GN python scorer,
# redirecting its output into the .eval file.
#
# The ".eval" patterns now escape the dot and are anchored at the end.
rule(/results\/bc2gn\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
  system cmd
end
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|