rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,72 +0,0 @@
1
- #!/bin/bash
2
-
3
- mkdir src
4
- cd src
5
- wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
6
- wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
7
- wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
8
- wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
9
- wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
10
- wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
11
- wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"
12
-
13
- for f in *.gz; do tar xfz $f; done
14
- unzip bc2GNtest.zip
15
-
16
- cd ..
17
-
18
- mkdir BC2GM
19
- cp -R src/bc2geneMention/train/ BC2GM/
20
- cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
21
- mv BC2GM/train/alt_eval.perl BC2GM/
22
-
23
- mkdir BC2GN
24
- cp -R src/biocreative2normalization/* BC2GN/
25
- mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
26
- mv BC2GN/trainingData/ BC2GN/Train
27
- cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
28
- mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
29
- mv BC2GN/Train/training.genelist BC2GN/Train/genelist
30
- cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist
31
-
32
- mkdir BC1GN
33
- cp -R src/biocreative1/bc1task1b/* BC1GN/
34
- mv BC1GN/fly/FlyDevTest/ BC1GN/fly/devtest
35
- mv BC1GN/fly/FlyEvaluation/ BC1GN/fly/test
36
- mv BC1GN/fly/FlyNoisyTraining/ BC1GN/fly/train
37
- mv BC1GN/fly/*.list BC1GN/fly/synonyms.list
38
- mv BC1GN/fly/test/*gene_list BC1GN/fly/test/genelist
39
- for f in BC1GN/fly/train/gene_list/*; do cat "$f" >> BC1GN/fly/train/genelist;done
40
- for f in BC1GN/fly/devtest/gene_lists/*; do cat "$f" >> BC1GN/fly/devtest/genelist;done
41
- mv BC1GN/mouse/MouseDevTest/ BC1GN/mouse/devtest
42
- mv BC1GN/mouse/MouseEvaluation/ BC1GN/mouse/test
43
- mv BC1GN/mouse/MouseNoisyTraining/ BC1GN/mouse/train
44
- mv BC1GN/mouse/*.list BC1GN/mouse/synonyms.list
45
- mv BC1GN/mouse/test/*gene_list BC1GN/mouse/test/genelist
46
- for f in BC1GN/mouse/train/gene_list/*; do cat "$f" >> BC1GN/mouse/train/genelist;done
47
- for f in BC1GN/mouse/devtest/gene_lists/*; do cat "$f" >> BC1GN/mouse/devtest/genelist;done
48
- mv BC1GN/yeast/YeastDevTest/ BC1GN/yeast/devtest
49
- mv BC1GN/yeast/YeastEvaluation/ BC1GN/yeast/test
50
- mv BC1GN/yeast/YeastNoisyTraining/ BC1GN/yeast/train
51
- mv BC1GN/yeast/*.list BC1GN/yeast/synonyms.list
52
- mv BC1GN/yeast/test/*gene_list BC1GN/yeast/test/genelist
53
- for f in BC1GN/yeast/train/gene_list/*; do cat "$f" >> BC1GN/yeast/train/genelist;done
54
- for f in BC1GN/yeast/devtest/gene_lists/*; do cat "$f" >> BC1GN/yeast/devtest/genelist;done
55
- # Fix a bug in the perl script! :-|
56
- cat BC1GN/task1Bscorer.pl |grep -v 'else {EVALFILE = STDIN;}' >foo; mv foo BC1GN/task1Bscorer.pl
57
-
58
-
59
-
60
- rm -Rf src
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
@@ -1,26 +0,0 @@
1
- wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
2
- tar xvfz crf++.tar.gz
3
- rm crf++.tar.gz
4
- cd CRF*
5
- PREFIX=$(dirname $PWD)
6
-
7
- if [ `uname -m` == 'x86_64' ]; then
8
- WITH_PIC='--with-pic';
9
- else
10
- WITH_PIC=''
11
- fi
12
-
13
- ./configure --prefix=$PREFIX --exec-prefix=$PREFIX $WITH_PIC;
14
- make install
15
- cd ruby
16
-
17
- ruby extconf.rb --with-opt-lib=$PREFIX/lib/ --with-opt-include=$PREFIX/include/
18
- make
19
- cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++
20
-
21
- mkdir ../../ruby/
22
- cp CRFPP.so ../../ruby/
23
- cd ../../
24
- rm -Rf CRF* include
25
-
26
-
@@ -1,4 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz; gunzip gene_info.gz
4
- wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz; gunzip gene2pubmed.gz
@@ -1,4 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo
4
- wget http://www.geneontology.org/GO_slims/goslim_generic.obo
@@ -1,8 +0,0 @@
1
- #!/bin/bash
2
-
3
- wget http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt -O disease.txt
4
- wget http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt -O organ.txt
5
- wget http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt -O tissue.txt
6
- wget http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt -O subcellular.txt
7
- wget http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt -O drug.txt
8
- wget http://wishart.biology.ualberta.ca/polysearch/include/HMDBnames.txt -O metabolite.txt
@@ -1,206 +0,0 @@
1
- require 'rbbt/sources/organism'
2
- require 'rbbt/sources/biocreative'
3
- require 'rbbt/ner/rner'
4
-
5
- require 'progress-monitor'
6
-
7
-
8
- $type = ENV['type'] || 'rner'
9
-
10
- #{{{ FEATURES
11
-
12
- def BC2GM_features(dataset, outfile)
13
- data = Biocreative.BC2GM(dataset)
14
-
15
- fout = File.open(outfile,'w')
16
- parser = NERFeatures.new
17
-
18
- Progress.monitor("CRFPP Features BC2GM #{ dataset }")
19
- data.each{|code, info|
20
- text = info[:text]
21
- mentions = info[:mentions]
22
-
23
- features = parser.tagged_features(text,mentions)
24
-
25
- features.each{|feat|
26
- fout.puts feat.join(" ")
27
- }
28
- fout.puts
29
- }
30
- fout.close
31
- end
32
-
33
- def BC2GN_features(dataset, outfile)
34
- data = {}
35
- Dir.glob(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'*.txt')).each{|f|
36
- code = File.basename(f).sub(/.txt/,'')
37
- data[code] = {}
38
- data[code][:text] = Open.read(f)
39
- }
40
- Open.read(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'genelist')).each_line{|l|
41
- code, gene, mention = l.chomp.split(/\t/)
42
- data[code][:mentions] ||= []
43
- data[code][:mentions] << mention
44
- }
45
-
46
- fout = File.open(outfile,'w')
47
- parser = NERFeatures.new
48
-
49
- Progress.monitor("CRFPP Features BC2GN #{ dataset }")
50
- data.each{|code, info|
51
- text = info[:text]
52
- mentions = info[:mentions]
53
- next if mentions.nil?
54
-
55
- features = parser.tagged_features(text,mentions)
56
-
57
- features.each{|feat|
58
- fout.puts feat.join(" ")
59
- }
60
- fout.puts
61
- }
62
- fout.close
63
- end
64
-
65
- def org_features(org, outfile)
66
- names = Organism.lexicon(org).collect{|code, names|
67
- names
68
- }.flatten
69
-
70
- fout = File.open(outfile,'w')
71
- parser = NERFeatures.new
72
-
73
- Progress.monitor("CRFPP Features #{ org }")
74
- names.each{|name|
75
- features = parser.text_features(name, true)
76
- features.each{|feat|
77
- fout.puts feat.join(" ")
78
- }
79
- fout.puts
80
- }
81
- fout.close
82
-
83
-
84
- end
85
-
86
- file "data/BC2GM_train.features" do |t|
87
- BC2GM_features(:train, 'data/BC2GM_train.features')
88
- end
89
-
90
- file "data/BC2GM_test.features" do |t|
91
- BC2GM_features(:test, 'data/BC2GM_test.features')
92
- end
93
- file "data/BC2GN_Train.features" do |t|
94
- BC2GN_features('Train', 'data/BC2GN_Train.features')
95
- end
96
-
97
- file "data/BC2GN_Test.features" do |t|
98
- BC2GN_features('Test', 'data/BC2GN_Test.features')
99
- end
100
-
101
-
102
- file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
103
- Open.write('data/BC2GM.features',Open.read('data/BC2GM_train.features'))
104
- Open.append('data/BC2GM.features',Open.read('data/BC2GM_test.features'))
105
- end
106
-
107
- file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
108
- Open.write('data/BC2GN.features',Open.read('data/BC2GN_Train.features'))
109
- Open.append('data/BC2GN.features',Open.read('data/BC2GN_Test.features'))
110
- end
111
-
112
-
113
- file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
114
- Open.write('data/BC2.features',Open.read('data/BC2GM.features'))
115
- Open.append('data/BC2.features',Open.read('data/BC2GN.features'))
116
- end
117
-
118
- file "data/train.features" => [
119
- #'data/BC2GN.features',
120
- 'data/BC2GM_train.features'
121
- ] do |t|
122
- t.prerequisites.each_with_index{|f,i|
123
- if i == 0
124
- Open.write('data/train.features',Open.read(f))
125
- else
126
- Open.append('data/train.features',Open.read(f))
127
- end
128
- }
129
- end
130
-
131
- rule (/data\/(.*).features/) => ['data/BC2.features'] do |t|
132
- org = File.basename(t.name).sub(/.features$/,'')
133
- org_features(org, t.name)
134
- Open.append(t.name, Open.read('data/BC2.features'))
135
- end
136
-
137
-
138
-
139
- #{{{ MODEL
140
- rule (/model\/(.*)/) => lambda {|t| t.sub(/model/,'data') + '.features'} do |t|
141
- parser = NERFeatures.new
142
- parser.train( t.name.sub(/model/,'data') + '.features', t.name)
143
- end
144
-
145
- task 'clean' do
146
- FileUtils.rm Dir.glob("data/*")
147
- FileUtils.rm Dir.glob("model/*")
148
- FileUtils.rm Dir.glob("results/*")
149
-
150
- end
151
-
152
- task 'all' do
153
- Organism.all.each{|org|
154
- Rake::Task["model/#{ org }"].invoke
155
- }
156
- end
157
-
158
- task 'default' do
159
- if $org
160
- FileUtils.rm Dir.glob("**/#{$org}.*") if $force
161
- Rake::Task["model/#{$org}"].invoke
162
- else
163
- Rake::Task['clean'].invoke if $force
164
- Rake::Task['all'].invoke
165
- end
166
- end
167
-
168
- #{{{ EVALUATE
169
-
170
-
171
- def find(model, type, outfile)
172
- ner = Organism.ner(:human,type,:model => model)
173
-
174
- data = Biocreative.BC2GM(:test)
175
-
176
- fout = File.open(outfile,'w')
177
-
178
- Progress.monitor("Test")
179
- data.each{|code,info|
180
- text = info[:text]
181
- mentions = ner.extract(text)
182
-
183
- mentions.each{|mention|
184
- positions = Biocreative.position(text,mention)
185
- positions.each{|pos|
186
- fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
187
- }
188
- }
189
- }
190
-
191
- end
192
-
193
-
194
-
195
- rule (/results\/test$/) do |t|
196
- org = File.basename(t.name)
197
-
198
- if $type == 'rner'
199
- Rake::Task['model/train'].invoke
200
- end
201
- find('model/train',$type,t.name)
202
- end
203
-
204
- rule (/results\/test.eval$/) => ['results/test'] do |t|
205
- Biocreative.BC2GM_eval('results/test',:test, 'results/test.eval')
206
- end
@@ -1,52 +0,0 @@
1
- isLetters /^[A-Z]+$/i
2
- isUpper /^[A-Z]+$/
3
- isLower /^[a-z]+$/
4
- isDigits /^[0-9]+$/i
5
- isRoman /^[IVX]+$/
6
- isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
7
- isPunctuation /^[,.;]$/
8
- isDelim /^[\/()\[\]{}\-]$/
9
- isNonWord /^[^\w]+$/
10
- isConjunction /^and|or|&|,$/
11
-
12
- hasLetters /[A-Z]/i
13
- hasUpper /.[A-Z]/
14
- hasLower /[a-z]/
15
- hasDigits /[0-9]/i
16
- hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
17
- hasPunctuation /[,.;]/
18
- hasDelim /[\/()\[\]{}\-]/
19
- hasNonWord /[^\w]/
20
- caspMix /[a-z].[A-Z]/
21
- keywords /(?:protein|gene|domain|ase)s?$/
22
- hasSuffix /[a-z][A-Z0-9]$/
23
-
24
- numLetters do |w| w.scan(/[A-Z]/i).length end
25
- numDigits do |w| w.scan(/[0-9]/).length end
26
- #
27
- prefix_3 /^(...)/
28
- prefix_4 /^(....)/
29
- suffix_3 /(...)$/
30
- suffix_4 /(....)$/
31
-
32
-
33
- token1 do |w|
34
- w.sub(/[A-Z]/,'A').
35
- sub(/[a-z]/,'a').
36
- sub(/[0-9]/,'0').
37
- sub(/[^0-9a-z]/i,'x')
38
- end
39
- token2 do |w|
40
- w.sub(/[A-Z]+/,'A').
41
- sub(/[a-z]+/,'a').
42
- sub(/[0-9]+/,'0').
43
- sub(/[^0-9a-z]+/i,'x')
44
- end
45
- token3 do |w| w.downcase end
46
- special do |w| w.is_special? end
47
-
48
- context %w(special token2 isPunctuation isDelim)
49
- window %w(1 2 3 -1 -2 -3)
50
- #direction :reverse
51
-
52
-
@@ -1,219 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/util/open'
4
- require 'rbbt/ner/rner'
5
- require 'rbbt/ner/rnorm'
6
-
7
-
8
- require 'progress-monitor'
9
-
10
- $type = ENV['ner'] || :rner
11
- $debug = !ENV['debug'].nil?
12
- $perfect = !ENV['perfect'].nil?
13
- $docs = ENV['docs']
14
-
15
-
16
- $org2rbbt = {
17
- 'yeast' => 'Sce',
18
- 'mouse' => 'Mmu',
19
- 'fly' => 'Sce',
20
- 'bc2gn' => 'Hsa',
21
- }
22
-
23
- def match(org, filedir, goldstandard,outfile)
24
-
25
- t = Time.now
26
- if org == 'bc2gn'
27
- custom_file = File.join('config', org + '.config')
28
- norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"),
29
- :to_entrez => false,
30
- :file => (File.exist?(custom_file) ? custom_file : nil),
31
- :max_candidates => 200)
32
- else
33
- custom_file = File.join('config', org + '.config')
34
- norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"),
35
- :to_entrez => Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
36
- :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
37
- :fix => proc{|l| l.sub(/S000/,'S0')}),
38
- :file => (File.exist?(custom_file) ? custom_file : nil),
39
- :max_candidates => 200)
40
- end
41
- STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"
42
-
43
-
44
- if $type.to_s == 'rner'
45
- ner = NER.new('models/' + org)
46
- else
47
- ner = Organism.ner($org2rbbt[org], $type)
48
- end
49
-
50
-
51
- fout=File.open(outfile,'w')
52
-
53
- gs = Open.to_hash(goldstandard,:native => 0,:extra => 1)
54
- gs_mentions = Open.to_hash(goldstandard,:native => 0,:extra => 2)
55
-
56
- if org == 'bc2gn'
57
- lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"), :sep => "\t|\\|")
58
- else
59
- lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"), :sep => "\t|\\|")
60
- end
61
-
62
- if $docs
63
- files = $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
64
- else
65
- files = Dir.glob(filedir + '*.txt').sort
66
- end
67
-
68
- Progress.monitor("Processing Files")
69
- files.each{|f|
70
- fid = File.basename(f).sub(/.txt/,'')
71
-
72
- text = Open.read(f)
73
- if $perfect
74
- mentions = (gs_mentions[fid] || []).flatten
75
-
76
- else
77
- mentions = ner.extract(text).uniq
78
- end
79
-
80
- if $debug
81
- puts "------------------------------------"
82
- puts "FILE #{fid}"
83
- puts
84
- puts text
85
- puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
86
- puts "MENTIONS: #{mentions.join(", ")}"
87
- end
88
-
89
-
90
- found = []
91
- mentions.each{|mention|
92
-
93
- codes = norm.select(norm.match(mention),mention,text)
94
-
95
- found += codes
96
- codes.each{|code|
97
- #code = code.sub(/S000/,'S0')
98
- fout.puts "#{ fid }\t#{ code}\t#{mention}"
99
- }
100
-
101
- puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
102
- }
103
-
104
- if $debug
105
- found.uniq!
106
- fn = (gs[fid] || []).flatten.uniq - found
107
- fp = found - (gs[fid] || []).flatten.uniq
108
-
109
- fn.each{|code|
110
- if lex[code]
111
- puts "FN: #{ code } => #{lex[code].flatten.join(", ")}"
112
- else
113
- puts "FN: #{ code }"
114
- end
115
- }
116
- fp.each{|code|
117
- if lex[code]
118
- puts "FP: #{ code } => #{lex[code].flatten.join(", ")}"
119
- else
120
- puts "FN: #{ code }"
121
- end
122
- }
123
-
124
-
125
- end
126
-
127
- }
128
- fout.close
129
-
130
- end
131
-
132
- rule (/models\/(yeast|mouse|fly|bc2gn).features/) do |t|
133
- org = File.basename(t.name).sub(/\.features/,'')
134
-
135
- if org == 'bc2gn'
136
- lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
137
- else
138
- lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
139
- end
140
-
141
- names = File.open(lexicon).collect{|l|
142
- names = l.split(/\t/)
143
- names.shift
144
- names.compact.select{|n| !n.empty?}
145
- }.flatten
146
-
147
- fout = File.open(t.name,'w')
148
- parser = NERFeatures.new
149
-
150
- Progress.monitor("CRFPP Features #{ org }")
151
- names.each{|name|
152
- features = parser.text_features(name, true)
153
- features.each{|feat|
154
- fout.puts feat.join(" ")
155
- }
156
- fout.puts
157
- }
158
- fout.close
159
- if org != 'bc2gn'
160
- Open.append(t.name, Open.read('../ner/data/BC2.features'))
161
- else
162
- Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
163
- Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
164
- end
165
-
166
- end
167
-
168
- rule (/models\/(yeast|mouse|fly|bc2gn)$/) => lambda{|t| t + '.features' } do |t|
169
- org = File.basename(t.name)
170
-
171
- parser = NERFeatures.new
172
- parser.train( t.name + '.features', t.name)
173
- end
174
-
175
-
176
- rule (/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
177
- org, dataset = File.basename(t.name).split(/_/)
178
-
179
- if $type.to_sym == :rner
180
- Rake::Task['models/' + org].invoke
181
- end
182
-
183
- filedir = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
184
- goldstandard = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")
185
-
186
- match(org,filedir, goldstandard,t.name)
187
- end
188
-
189
- rule (/results\/(.+)_(.+).eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
190
- org, dataset = File.basename(t.name.sub(/.eval/,'')).split(/_/)
191
-
192
- cmd = "perl #{File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")} #{File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")} #{t.name.sub(/.eval/,'')} > #{t.name}"
193
- puts cmd
194
- system cmd
195
- end
196
-
197
- rule (/results\/bc2gn$/) do |t|
198
- org = 'bc2gn'
199
-
200
- if $type.to_sym == :rner
201
- Rake::Task['models/' + org].invoke
202
- end
203
-
204
- filedir = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
205
- goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")
206
-
207
- match(org,filedir, goldstandard,t.name)
208
- end
209
-
210
- rule (/results\/bc2gn.eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
211
-
212
- cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
213
- system cmd
214
-
215
- end
216
-
217
-
218
-
219
-