rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,72 +0,0 @@
1
- #!/bin/bash
2
-
3
- mkdir src
4
- cd src
5
- wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
6
- wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
7
- wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
8
- wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
9
- wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
10
- wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
11
- wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"
12
-
13
- for f in *.gz; do tar xfz $f; done
14
- unzip bc2GNtest.zip
15
-
16
- cd ..
17
-
18
- mkdir BC2GM
19
- cp -R src/bc2geneMention/train/ BC2GM/
20
- cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
21
- mv BC2GM/train/alt_eval.perl BC2GM/
22
-
23
- mkdir BC2GN
24
- cp -R src/biocreative2normalization/* BC2GN/
25
- mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
26
- mv BC2GN/trainingData/ BC2GN/Train
27
- cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
28
- mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
29
- mv BC2GN/Train/training.genelist BC2GN/Train/genelist
30
- cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist
31
-
32
- mkdir BC1GN
33
- cp -R src/biocreative1/bc1task1b/* BC1GN/
34
- mv BC1GN/fly/FlyDevTest/ BC1GN/fly/devtest
35
- mv BC1GN/fly/FlyEvaluation/ BC1GN/fly/test
36
- mv BC1GN/fly/FlyNoisyTraining/ BC1GN/fly/train
37
- mv BC1GN/fly/*.list BC1GN/fly/synonyms.list
38
- mv BC1GN/fly/test/*gene_list BC1GN/fly/test/genelist
39
- for f in BC1GN/fly/train/gene_list/*; do cat "$f" >> BC1GN/fly/train/genelist;done
40
- for f in BC1GN/fly/devtest/gene_lists/*; do cat "$f" >> BC1GN/fly/devtest/genelist;done
41
- mv BC1GN/mouse/MouseDevTest/ BC1GN/mouse/devtest
42
- mv BC1GN/mouse/MouseEvaluation/ BC1GN/mouse/test
43
- mv BC1GN/mouse/MouseNoisyTraining/ BC1GN/mouse/train
44
- mv BC1GN/mouse/*.list BC1GN/mouse/synonyms.list
45
- mv BC1GN/mouse/test/*gene_list BC1GN/mouse/test/genelist
46
- for f in BC1GN/mouse/train/gene_list/*; do cat "$f" >> BC1GN/mouse/train/genelist;done
47
- for f in BC1GN/mouse/devtest/gene_lists/*; do cat "$f" >> BC1GN/mouse/devtest/genelist;done
48
- mv BC1GN/yeast/YeastDevTest/ BC1GN/yeast/devtest
49
- mv BC1GN/yeast/YeastEvaluation/ BC1GN/yeast/test
50
- mv BC1GN/yeast/YeastNoisyTraining/ BC1GN/yeast/train
51
- mv BC1GN/yeast/*.list BC1GN/yeast/synonyms.list
52
- mv BC1GN/yeast/test/*gene_list BC1GN/yeast/test/genelist
53
- for f in BC1GN/yeast/train/gene_list/*; do cat "$f" >> BC1GN/yeast/train/genelist;done
54
- for f in BC1GN/yeast/devtest/gene_lists/*; do cat "$f" >> BC1GN/yeast/devtest/genelist;done
55
- # Fix a bug in the perl script! :-|
56
- cat BC1GN/task1Bscorer.pl |grep -v 'else {EVALFILE = STDIN;}' >foo; mv foo BC1GN/task1Bscorer.pl
57
-
58
-
59
-
60
- rm -Rf src
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
@@ -1,26 +0,0 @@
1
- wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
2
- tar xvfz crf++.tar.gz
3
- rm crf++.tar.gz
4
- cd CRF*
5
- PREFIX=$(dirname $PWD)
6
-
7
- if [ `uname -m` == 'x86_64' ]; then
8
- WITH_PIC='--with-pic';
9
- else
10
- WITH_PIC=''
11
- fi
12
-
13
- ./configure --prefix=$PREFIX --exec-prefix=$PREFIX $WITH_PIC;
14
- make install
15
- cd ruby
16
-
17
- ruby extconf.rb --with-opt-lib=$PREFIX/lib/ --with-opt-include=$PREFIX/include/
18
- make
19
- cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++
20
-
21
- mkdir ../../ruby/
22
- cp CRFPP.so ../../ruby/
23
- cd ../../
24
- rm -Rf CRF* include
25
-
26
-
@@ -1,4 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz; gunzip gene_info.gz
4
- wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz; gunzip gene2pubmed.gz
@@ -1,4 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo
4
- wget http://www.geneontology.org/GO_slims/goslim_generic.obo
@@ -1,8 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt -O disease.txt
4
- wget http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt -O organ.txt
5
- wget http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt -O tissue.txt
6
- wget http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt -O subcellular.txt
7
- wget http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt -O drug.txt
8
- wget http://wishart.biology.ualberta.ca/polysearch/include/HMDBnames.txt -O metabolite.txt
@@ -1,206 +0,0 @@
1
- require 'rbbt/sources/organism'
2
- require 'rbbt/sources/biocreative'
3
- require 'rbbt/ner/rner'
4
-
5
- require 'progress-monitor'
6
-
7
-
8
- $type = ENV['type'] || 'rner'
9
-
10
- #{{{ FEATURES
11
-
12
- def BC2GM_features(dataset, outfile)
13
- data = Biocreative.BC2GM(dataset)
14
-
15
- fout = File.open(outfile,'w')
16
- parser = NERFeatures.new
17
-
18
- Progress.monitor("CRFPP Features BC2GM #{ dataset }")
19
- data.each{|code, info|
20
- text = info[:text]
21
- mentions = info[:mentions]
22
-
23
- features = parser.tagged_features(text,mentions)
24
-
25
- features.each{|feat|
26
- fout.puts feat.join(" ")
27
- }
28
- fout.puts
29
- }
30
- fout.close
31
- end
32
-
33
- def BC2GN_features(dataset, outfile)
34
- data = {}
35
- Dir.glob(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'*.txt')).each{|f|
36
- code = File.basename(f).sub(/.txt/,'')
37
- data[code] = {}
38
- data[code][:text] = Open.read(f)
39
- }
40
- Open.read(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'genelist')).each_line{|l|
41
- code, gene, mention = l.chomp.split(/\t/)
42
- data[code][:mentions] ||= []
43
- data[code][:mentions] << mention
44
- }
45
-
46
- fout = File.open(outfile,'w')
47
- parser = NERFeatures.new
48
-
49
- Progress.monitor("CRFPP Features BC2GN #{ dataset }")
50
- data.each{|code, info|
51
- text = info[:text]
52
- mentions = info[:mentions]
53
- next if mentions.nil?
54
-
55
- features = parser.tagged_features(text,mentions)
56
-
57
- features.each{|feat|
58
- fout.puts feat.join(" ")
59
- }
60
- fout.puts
61
- }
62
- fout.close
63
- end
64
-
65
- def org_features(org, outfile)
66
- names = Organism.lexicon(org).collect{|code, names|
67
- names
68
- }.flatten
69
-
70
- fout = File.open(outfile,'w')
71
- parser = NERFeatures.new
72
-
73
- Progress.monitor("CRFPP Features #{ org }")
74
- names.each{|name|
75
- features = parser.text_features(name, true)
76
- features.each{|feat|
77
- fout.puts feat.join(" ")
78
- }
79
- fout.puts
80
- }
81
- fout.close
82
-
83
-
84
- end
85
-
86
- file "data/BC2GM_train.features" do |t|
87
- BC2GM_features(:train, 'data/BC2GM_train.features')
88
- end
89
-
90
- file "data/BC2GM_test.features" do |t|
91
- BC2GM_features(:test, 'data/BC2GM_test.features')
92
- end
93
- file "data/BC2GN_Train.features" do |t|
94
- BC2GN_features('Train', 'data/BC2GN_Train.features')
95
- end
96
-
97
- file "data/BC2GN_Test.features" do |t|
98
- BC2GN_features('Test', 'data/BC2GN_Test.features')
99
- end
100
-
101
-
102
- file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
103
- Open.write('data/BC2GM.features',Open.read('data/BC2GM_train.features'))
104
- Open.append('data/BC2GM.features',Open.read('data/BC2GM_test.features'))
105
- end
106
-
107
- file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
108
- Open.write('data/BC2GN.features',Open.read('data/BC2GN_Train.features'))
109
- Open.append('data/BC2GN.features',Open.read('data/BC2GN_Test.features'))
110
- end
111
-
112
-
113
- file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
114
- Open.write('data/BC2.features',Open.read('data/BC2GM.features'))
115
- Open.append('data/BC2.features',Open.read('data/BC2GN.features'))
116
- end
117
-
118
- file "data/train.features" => [
119
- #'data/BC2GN.features',
120
- 'data/BC2GM_train.features'
121
- ] do |t|
122
- t.prerequisites.each_with_index{|f,i|
123
- if i == 0
124
- Open.write('data/train.features',Open.read(f))
125
- else
126
- Open.append('data/train.features',Open.read(f))
127
- end
128
- }
129
- end
130
-
131
- rule (/data\/(.*).features/) => ['data/BC2.features'] do |t|
132
- org = File.basename(t.name).sub(/.features$/,'')
133
- org_features(org, t.name)
134
- Open.append(t.name, Open.read('data/BC2.features'))
135
- end
136
-
137
-
138
-
139
- #{{{ MODEL
140
- rule (/model\/(.*)/) => lambda {|t| t.sub(/model/,'data') + '.features'} do |t|
141
- parser = NERFeatures.new
142
- parser.train( t.name.sub(/model/,'data') + '.features', t.name)
143
- end
144
-
145
- task 'clean' do
146
- FileUtils.rm Dir.glob("data/*")
147
- FileUtils.rm Dir.glob("model/*")
148
- FileUtils.rm Dir.glob("results/*")
149
-
150
- end
151
-
152
- task 'all' do
153
- Organism.all.each{|org|
154
- Rake::Task["model/#{ org }"].invoke
155
- }
156
- end
157
-
158
- task 'default' do
159
- if $org
160
- FileUtils.rm Dir.glob("**/#{$org}.*") if $force
161
- Rake::Task["model/#{$org}"].invoke
162
- else
163
- Rake::Task['clean'].invoke if $force
164
- Rake::Task['all'].invoke
165
- end
166
- end
167
-
168
- #{{{ EVALUATE
169
-
170
-
171
- def find(model, type, outfile)
172
- ner = Organism.ner(:human,type,:model => model)
173
-
174
- data = Biocreative.BC2GM(:test)
175
-
176
- fout = File.open(outfile,'w')
177
-
178
- Progress.monitor("Test")
179
- data.each{|code,info|
180
- text = info[:text]
181
- mentions = ner.extract(text)
182
-
183
- mentions.each{|mention|
184
- positions = Biocreative.position(text,mention)
185
- positions.each{|pos|
186
- fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
187
- }
188
- }
189
- }
190
-
191
- end
192
-
193
-
194
-
195
- rule (/results\/test$/) do |t|
196
- org = File.basename(t.name)
197
-
198
- if $type == 'rner'
199
- Rake::Task['model/train'].invoke
200
- end
201
- find('model/train',$type,t.name)
202
- end
203
-
204
- rule (/results\/test.eval$/) => ['results/test'] do |t|
205
- Biocreative.BC2GM_eval('results/test',:test, 'results/test.eval')
206
- end
@@ -1,52 +0,0 @@
1
- isLetters /^[A-Z]+$/i
2
- isUpper /^[A-Z]+$/
3
- isLower /^[a-z]+$/
4
- isDigits /^[0-9]+$/i
5
- isRoman /^[IVX]+$/
6
- isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
7
- isPunctuation /^[,.;]$/
8
- isDelim /^[\/()\[\]{}\-]$/
9
- isNonWord /^[^\w]+$/
10
- isConjunction /^and|or|&|,$/
11
-
12
- hasLetters /[A-Z]/i
13
- hasUpper /.[A-Z]/
14
- hasLower /[a-z]/
15
- hasDigits /[0-9]/i
16
- hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
17
- hasPunctuation /[,.;]/
18
- hasDelim /[\/()\[\]{}\-]/
19
- hasNonWord /[^\w]/
20
- caspMix /[a-z].[A-Z]/
21
- keywords /(?:protein|gene|domain|ase)s?$/
22
- hasSuffix /[a-z][A-Z0-9]$/
23
-
24
- numLetters do |w| w.scan(/[A-Z]/i).length end
25
- numDigits do |w| w.scan(/[0-9]/).length end
26
- #
27
- prefix_3 /^(...)/
28
- prefix_4 /^(....)/
29
- suffix_3 /(...)$/
30
- suffix_4 /(....)$/
31
-
32
-
33
- token1 do |w|
34
- w.sub(/[A-Z]/,'A').
35
- sub(/[a-z]/,'a').
36
- sub(/[0-9]/,'0').
37
- sub(/[^0-9a-z]/i,'x')
38
- end
39
- token2 do |w|
40
- w.sub(/[A-Z]+/,'A').
41
- sub(/[a-z]+/,'a').
42
- sub(/[0-9]+/,'0').
43
- sub(/[^0-9a-z]+/i,'x')
44
- end
45
- token3 do |w| w.downcase end
46
- special do |w| w.is_special? end
47
-
48
- context %w(special token2 isPunctuation isDelim)
49
- window %w(1 2 3 -1 -2 -3)
50
- #direction :reverse
51
-
52
-
@@ -1,219 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/util/open'
4
- require 'rbbt/ner/rner'
5
- require 'rbbt/ner/rnorm'
6
-
7
-
8
- require 'progress-monitor'
9
-
10
- $type = ENV['ner'] || :rner
11
- $debug = !ENV['debug'].nil?
12
- $perfect = !ENV['perfect'].nil?
13
- $docs = ENV['docs']
14
-
15
-
16
- $org2rbbt = {
17
- 'yeast' => 'Sce',
18
- 'mouse' => 'Mmu',
19
- 'fly' => 'Sce',
20
- 'bc2gn' => 'Hsa',
21
- }
22
-
23
- def match(org, filedir, goldstandard,outfile)
24
-
25
- t = Time.now
26
- if org == 'bc2gn'
27
- custom_file = File.join('config', org + '.config')
28
- norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"),
29
- :to_entrez => false,
30
- :file => (File.exist?(custom_file) ? custom_file : nil),
31
- :max_candidates => 200)
32
- else
33
- custom_file = File.join('config', org + '.config')
34
- norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"),
35
- :to_entrez => Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
36
- :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
37
- :fix => proc{|l| l.sub(/S000/,'S0')}),
38
- :file => (File.exist?(custom_file) ? custom_file : nil),
39
- :max_candidates => 200)
40
- end
41
- STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"
42
-
43
-
44
- if $type.to_s == 'rner'
45
- ner = NER.new('models/' + org)
46
- else
47
- ner = Organism.ner($org2rbbt[org], $type)
48
- end
49
-
50
-
51
- fout=File.open(outfile,'w')
52
-
53
- gs = Open.to_hash(goldstandard,:native => 0,:extra => 1)
54
- gs_mentions = Open.to_hash(goldstandard,:native => 0,:extra => 2)
55
-
56
- if org == 'bc2gn'
57
- lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"), :sep => "\t|\\|")
58
- else
59
- lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"), :sep => "\t|\\|")
60
- end
61
-
62
- if $docs
63
- files = $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
64
- else
65
- files = Dir.glob(filedir + '*.txt').sort
66
- end
67
-
68
- Progress.monitor("Processing Files")
69
- files.each{|f|
70
- fid = File.basename(f).sub(/.txt/,'')
71
-
72
- text = Open.read(f)
73
- if $perfect
74
- mentions = (gs_mentions[fid] || []).flatten
75
-
76
- else
77
- mentions = ner.extract(text).uniq
78
- end
79
-
80
- if $debug
81
- puts "------------------------------------"
82
- puts "FILE #{fid}"
83
- puts
84
- puts text
85
- puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
86
- puts "MENTIONS: #{mentions.join(", ")}"
87
- end
88
-
89
-
90
- found = []
91
- mentions.each{|mention|
92
-
93
- codes = norm.select(norm.match(mention),mention,text)
94
-
95
- found += codes
96
- codes.each{|code|
97
- #code = code.sub(/S000/,'S0')
98
- fout.puts "#{ fid }\t#{ code}\t#{mention}"
99
- }
100
-
101
- puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
102
- }
103
-
104
- if $debug
105
- found.uniq!
106
- fn = (gs[fid] || []).flatten.uniq - found
107
- fp = found - (gs[fid] || []).flatten.uniq
108
-
109
- fn.each{|code|
110
- if lex[code]
111
- puts "FN: #{ code } => #{lex[code].flatten.join(", ")}"
112
- else
113
- puts "FN: #{ code }"
114
- end
115
- }
116
- fp.each{|code|
117
- if lex[code]
118
- puts "FP: #{ code } => #{lex[code].flatten.join(", ")}"
119
- else
120
- puts "FN: #{ code }"
121
- end
122
- }
123
-
124
-
125
- end
126
-
127
- }
128
- fout.close
129
-
130
- end
131
-
132
- rule (/models\/(yeast|mouse|fly|bc2gn).features/) do |t|
133
- org = File.basename(t.name).sub(/\.features/,'')
134
-
135
- if org == 'bc2gn'
136
- lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
137
- else
138
- lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
139
- end
140
-
141
- names = File.open(lexicon).collect{|l|
142
- names = l.split(/\t/)
143
- names.shift
144
- names.compact.select{|n| !n.empty?}
145
- }.flatten
146
-
147
- fout = File.open(t.name,'w')
148
- parser = NERFeatures.new
149
-
150
- Progress.monitor("CRFPP Features #{ org }")
151
- names.each{|name|
152
- features = parser.text_features(name, true)
153
- features.each{|feat|
154
- fout.puts feat.join(" ")
155
- }
156
- fout.puts
157
- }
158
- fout.close
159
- if org != 'bc2gn'
160
- Open.append(t.name, Open.read('../ner/data/BC2.features'))
161
- else
162
- Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
163
- Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
164
- end
165
-
166
- end
167
-
168
- rule (/models\/(yeast|mouse|fly|bc2gn)$/) => lambda{|t| t + '.features' } do |t|
169
- org = File.basename(t.name)
170
-
171
- parser = NERFeatures.new
172
- parser.train( t.name + '.features', t.name)
173
- end
174
-
175
-
176
- rule (/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
177
- org, dataset = File.basename(t.name).split(/_/)
178
-
179
- if $type.to_sym == :rner
180
- Rake::Task['models/' + org].invoke
181
- end
182
-
183
- filedir = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
184
- goldstandard = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")
185
-
186
- match(org,filedir, goldstandard,t.name)
187
- end
188
-
189
- rule (/results\/(.+)_(.+).eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
190
- org, dataset = File.basename(t.name.sub(/.eval/,'')).split(/_/)
191
-
192
- cmd = "perl #{File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")} #{File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")} #{t.name.sub(/.eval/,'')} > #{t.name}"
193
- puts cmd
194
- system cmd
195
- end
196
-
197
- rule (/results\/bc2gn$/) do |t|
198
- org = 'bc2gn'
199
-
200
- if $type.to_sym == :rner
201
- Rake::Task['models/' + org].invoke
202
- end
203
-
204
- filedir = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
205
- goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")
206
-
207
- match(org,filedir, goldstandard,t.name)
208
- end
209
-
210
- rule (/results\/bc2gn.eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
211
-
212
- cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
213
- system cmd
214
-
215
- end
216
-
217
-
218
-
219
-