rbbt 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'rbbt/sources/organism'
|
2
|
+
require 'rbbt/sources/biocreative'
|
3
|
+
require 'rbbt/ner/rner'
|
4
|
+
|
5
|
+
require 'progress-monitor'
|
6
|
+
|
7
|
+
|
8
|
+
# NER backend used by the evaluation tasks: the `type` environment variable,
# or 'rner' when it is not set.
$type = ENV.fetch('type', 'rner')
|
9
|
+
|
10
|
+
#{{{ FEATURES
|
11
|
+
|
12
|
+
# Writes the feature file for one BC2GM dataset.
#
# dataset:: dataset identifier handed to Biocreative.BC2GM (the tasks in this
#           Rakefile pass :train and :test).
# outfile:: path of the file to create; an existing file is overwritten.
#
# Every feature row returned by NERFeatures#tagged_features becomes one
# space-separated line, and each document is followed by an empty line.
# Returns nil.
def BC2GM_features(dataset, outfile)
  data = Biocreative.BC2GM(dataset)

  # Block form: the handle is flushed and closed even if feature extraction
  # raises half way through the dataset.
  File.open(outfile, 'w') do |fout|
    parser = NERFeatures.new

    Progress.monitor("CRFPP Features BC2GM #{ dataset }")
    data.each do |code, info|
      features = parser.tagged_features(info[:text], info[:mentions])

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
32
|
+
|
33
|
+
# Writes the feature file for one BC2GN dataset directory.
#
# dataset:: name of the sub-directory under <datadir>/biocreative/BC2GN
#           (the tasks in this Rakefile pass 'Train' and 'Test').
# outfile:: path of the file to create; an existing file is overwritten.
#
# The directory is expected to hold one <code>.txt file per document and a
# tab-separated `genelist` file whose first column is the document code and
# whose third column is a mention. Documents without any mention are skipped.
# Returns nil.
def BC2GN_features(dataset, outfile)
  base = File.join(Rbbt.datadir, 'biocreative', 'BC2GN', dataset)

  data = {}
  Dir.glob(File.join(base, '*.txt')).each do |f|
    # basename with a suffix argument strips only a trailing ".txt".
    code = File.basename(f, '.txt')
    data[code] = { :text => Open.read(f) }
  end

  # each_line instead of String#each, which no longer exists since Ruby 1.9.
  Open.read(File.join(base, 'genelist')).each_line do |l|
    code, _gene, mention = l.chomp.split(/\t/)
    entry = data[code]
    # Ignore blank lines and rows that refer to a document with no text file.
    next if entry.nil?
    (entry[:mentions] ||= []) << mention
  end

  # Block form closes the handle even if feature extraction raises.
  File.open(outfile, 'w') do |fout|
    parser = NERFeatures.new

    Progress.monitor("CRFPP Features BC2GN #{ dataset }")
    data.each do |code, info|
      mentions = info[:mentions]
      next if mentions.nil?

      features = parser.tagged_features(info[:text], mentions)

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
64
|
+
|
65
|
+
# Writes a feature file built from every name in an organism lexicon.
#
# org::     organism identifier handed to Organism.lexicon.
# outfile:: path of the file to create; an existing file is overwritten.
#
# Each name is passed to NERFeatures#text_features(name, true); the rows it
# returns are written space-separated, one per line, with an empty line after
# every name. Returns nil.
def org_features(org, outfile)
  names = Organism.lexicon(org).collect { |code, synonyms| synonyms }.flatten

  # Block form closes the handle even if feature extraction raises.
  File.open(outfile, 'w') do |fout|
    parser = NERFeatures.new

    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      features = parser.text_features(name, true)

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  nil
end
|
85
|
+
|
86
|
+
# One file task per BioCreative dataset. Each task writes its own target
# (t.name) through the matching *_features helper.
file "data/BC2GM_train.features" do |t|
  BC2GM_features(:train, t.name)
end

file "data/BC2GM_test.features" do |t|
  BC2GM_features(:test, t.name)
end

file "data/BC2GN_Train.features" do |t|
  BC2GN_features('Train', t.name)
end

file "data/BC2GN_Test.features" do |t|
  BC2GN_features('Test', t.name)
end
|
100
|
+
|
101
|
+
|
102
|
+
# Combined feature files: each target is the first part written out, then the
# second part appended to it.
file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
  train_part, test_part = t.prerequisites
  Open.write(t.name, Open.read(train_part))
  Open.append(t.name, Open.read(test_part))
end

file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
  train_part, test_part = t.prerequisites
  Open.write(t.name, Open.read(train_part))
  Open.append(t.name, Open.read(test_part))
end

# Note the order: the GM features are written first even though the GN file
# is listed first among the prerequisites.
file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
  gn_part, gm_part = t.prerequisites
  Open.write(t.name, Open.read(gm_part))
  Open.append(t.name, Open.read(gn_part))
end
|
117
|
+
|
118
|
+
# Training features: the concatenation of every prerequisite, in order. The
# first one creates (or truncates) the target, the rest are appended.
file "data/train.features" => [
  #'data/BC2GN.features',
  'data/BC2GM_train.features'
] do |t|
  t.prerequisites.each_with_index do |source, position|
    content = Open.read(source)
    if position.zero?
      Open.write('data/train.features', content)
    else
      Open.append('data/train.features', content)
    end
  end
end
|
130
|
+
|
131
|
+
# Generic rule for data/<org>.features: the organism's lexicon features
# followed by a copy of data/BC2.features.
#
# The dot is escaped and the pattern anchored so that only names really ending
# in ".features" match, and the organism name is obtained by stripping exactly
# that suffix.
rule (/data\/(.*)\.features$/) => ['data/BC2.features'] do |t|
  org = File.basename(t.name, '.features')
  org_features(org, t.name)
  Open.append(t.name, Open.read('data/BC2.features'))
end
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
#{{{ MODEL
|
140
|
+
# model/<name> is trained from data/<name>.features (the same path the
# prerequisite lambda computes).
rule (/model\/(.*)/) => lambda {|t| t.sub(/model/,'data') + '.features'} do |t|
  features_file = t.name.sub(/model/,'data') + '.features'
  NERFeatures.new.train(features_file, t.name)
end
|
144
|
+
|
145
|
+
# Removes every entry directly inside the data, model and results directories.
task 'clean' do
  %w(data model results).each do |dir|
    FileUtils.rm Dir.glob("#{dir}/*")
  end
end
|
151
|
+
|
152
|
+
# Invokes the model task of every organism returned by Organism.all.
task 'all' do
  Organism.all.each do |org|
    model_task = "model/#{ org }"
    Rake::Task[model_task].invoke
  end
end
|
157
|
+
|
158
|
+
# Builds the model for $org when it is set, otherwise for all organisms.
# With $force, previous output is removed first: files named "<org>.*" at any
# depth in the single-organism case, the whole 'clean' task otherwise.
task 'default' do
  unless $org
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
    next
  end

  FileUtils.rm Dir.glob("**/#{$org}.*") if $force
  Rake::Task["model/#{$org}"].invoke
end
|
167
|
+
|
168
|
+
#{{{ EVALUATE
|
169
|
+
|
170
|
+
|
171
|
+
# Runs a NER over the BC2GM test set and writes the mentions it finds.
#
# model::   model path, passed to Organism.ner as the :model option.
# type::    NER type, passed to Organism.ner.
# outfile:: path of the result file to create; an existing file is overwritten.
#
# For every position Biocreative.position reports for a mention, one line
# "<code>|<start> <end>|<mention>" is written.
def find(model, type, outfile)
  ner = Organism.ner(:human,type,:model => model)

  data = Biocreative.BC2GM(:test)

  # The original never closed this handle, so buffered lines could still be
  # missing when the evaluation task read the file in the same process. The
  # block form flushes and closes it, also on error.
  File.open(outfile, 'w') do |fout|
    Progress.monitor("Test")
    data.each do |code, info|
      text = info[:text]

      ner.extract(text).each do |mention|
        Biocreative.position(text, mention).each do |pos|
          fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
        end
      end
    end
  end
end
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
# results/test: predictions of the 'model/train' model on the test set. The
# model task is only invoked for the 'rner' type.
rule (/results\/test$/) do |t|
  Rake::Task['model/train'].invoke if $type == 'rner'
  find('model/train', $type, t.name)
end
|
203
|
+
|
204
|
+
# results/test.eval: evaluation of results/test with Biocreative.BC2GM_eval.
rule (/results\/test.eval$/) => ['results/test'] do |t|
  predictions = 'results/test'
  report = 'results/test.eval'
  Biocreative.BC2GM_eval(predictions, :test, report)
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Feature definitions for the NER. Each line names one feature and gives
# either a regular expression or a block that receives the word.
# NOTE(review): how the DSL turns a regexp into a feature value is defined
# outside this file - confirm before relying on capture groups.

# Whole-word character classes
isLetters /^[A-Z]+$/i
isUpper /^[A-Z]+$/
isLower /^[a-z]+$/
isDigits /^[0-9]+$/i
isRoman /^[IVX]+$/
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim /^[\/()\[\]{}\-]$/
isNonWord /^[^\w]+$/
# The alternatives are grouped so both anchors apply to each of them. Without
# the group, /^and|or|&|,$/ matched any word containing "or" or "&".
isConjunction /^(?:and|or|&|,)$/

# Partial matches
hasLetters /[A-Z]/i
# Requires a character before the capital, so an initial capital alone does
# not match. NOTE(review): presumably intentional - confirm.
hasUpper /.[A-Z]/
hasLower /[a-z]/
hasDigits /[0-9]/i
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim /[\/()\[\]{}\-]/
hasNonWord /[^\w]/
caspMix /[a-z].[A-Z]/
keywords /(?:protein|gene|domain|ase)s?$/
hasSuffix /[a-z][A-Z0-9]$/

numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits do |w| w.scan(/[0-9]/).length end
#
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# Word shape, one symbol per character: A upper, a lower, 0 digit, x other.
# gsub is required here; sub only replaced the first character of each class
# and left the rest of the word untouched.
token1 do |w|
  w.gsub(/[A-Z]/,'A').
    gsub(/[a-z]/,'a').
    gsub(/[0-9]/,'0').
    gsub(/[^0-9a-z]/i,'x')
end
# Collapsed word shape: every run of one class becomes a single symbol.
token2 do |w|
  w.gsub(/[A-Z]+/,'A').
    gsub(/[a-z]+/,'a').
    gsub(/[0-9]+/,'0').
    gsub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

# NOTE(review): presumably the features copied from neighbouring tokens and
# the relative positions they are taken from - confirm against the DSL.
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
|
51
|
+
|
52
|
+
|
@@ -0,0 +1,218 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/sources/organism'
|
3
|
+
require 'rbbt/util/open'
|
4
|
+
require 'rbbt/ner/rner'
|
5
|
+
|
6
|
+
|
7
|
+
require 'progress-meter'
|
8
|
+
|
9
|
+
# Run-time switches, all read from environment variables.
$type    = ENV.fetch('ner', :rner)  # NER backend; :rner when unset
$debug   = ENV.key?('debug')        # true whenever the variable is set
$perfect = ENV.key?('perfect')      # true whenever the variable is set
$docs    = ENV['docs']              # comma-separated document ids, or nil

# BioCreative dataset name => rbbt organism name.
# NOTE(review): 'fly' maps to 'sgd', the same value as 'yeast' - confirm this
# is intended.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',
  'bc2gn' => 'human',
}
|
21
|
+
|
22
|
+
# Normalizes the gene mentions of a set of documents and writes the result.
#
# org::          dataset name: 'bc2gn', or a BC1GN organism that is a key of
#                $org2rbbt.
# filedir::      directory holding one <id>.txt file per document.
# goldstandard:: gold standard file; column 0 is the document id, column 1
#                the gene code and column 2 the mention.
# outfile::      result file to create, one "<id>\t<code>\t<mention>" line per
#                selected code.
#
# Reads the globals $type (NER backend), $perfect (use the gold standard
# mentions instead of running the NER), $docs (restrict to these
# comma-separated ids) and $debug (print a per-document report to stdout).
# Returns nil.
def match(org, filedir, goldstandard,outfile)

  t = Time.now

  # The per-organism configuration file is optional.
  custom_file = File.join('config', org + '.config')
  options = {
    :file => (File.exist?(custom_file) ? custom_file : nil),
    :max_candidates => 200,
  }

  if org == 'bc2gn'
    lexicon_file = File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list")
    options[:to_entrez] = false
  else
    lexicon_file = File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list")
    options[:to_entrez] = Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
                                       :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
                                       :fix => proc{|l| l.sub(/S000/,'S0')})
  end
  norm = Normalizer.new(lexicon_file, options)
  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"

  if $type.to_s == 'rner'
    ner = NER.new('models/' + org)
  else
    ner = Organism.ner($org2rbbt[org], $type)
  end

  # Block form closes the output file even if a document fails.
  File.open(outfile, 'w') do |fout|

    gs          = Open.to_hash(goldstandard,:native => 0,:extra => 1)
    gs_mentions = Open.to_hash(goldstandard,:native => 0,:extra => 2)

    # Only used to show the known names of a code in the debug report.
    lex = Open.to_hash(lexicon_file, :sep => "\t|\\|")

    if $docs
      files = $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
    else
      # File.join works whether or not filedir ends in a slash.
      files = Dir.glob(File.join(filedir, '*.txt')).sort
    end

    Progress.monitor("Processing Files")
    files.each do |f|
      # Strip only a trailing ".txt" (the old /.txt/ matched anywhere).
      fid  = File.basename(f, '.txt')
      text = Open.read(f)
      gold = (gs[fid] || []).flatten

      if $perfect
        mentions = (gs_mentions[fid] || []).flatten
      else
        mentions = ner.extract(text).uniq
      end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{gold.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention),mention,text)

        found += codes
        codes.each do |code|
          #code = code.sub(/S000/,'S0')
          fout.puts "#{ fid }\t#{ code}\t#{mention}"
        end

        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      next unless $debug

      found.uniq!
      fn = gold.uniq - found
      fp = found - gold.uniq

      fn.each do |code|
        if lex[code]
          puts "FN: #{ code } => #{lex[code].flatten.join(", ")}"
        else
          puts "FN: #{ code }"
        end
      end
      fp.each do |code|
        if lex[code]
          puts "FP: #{ code } => #{lex[code].flatten.join(", ")}"
        else
          # Was printed as "FN:", which mislabelled false positives.
          puts "FP: #{ code }"
        end
      end
    end
  end

  nil
end
|
130
|
+
|
131
|
+
# models/<org>.features: features for every synonym in the organism's lexicon,
# followed by the BioCreative feature files produced by ../ner.
rule (/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name, '.features')

  if org == 'bc2gn'
    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
  else
    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
  end

  # The first column is the identifier, the rest are names. File.foreach
  # closes the lexicon when done, and chomp keeps the line terminator out of
  # the last name so that an empty last column is really filtered out.
  names = []
  File.foreach(lexicon) do |l|
    synonyms = l.chomp.split(/\t/)
    synonyms.shift
    names.concat(synonyms.reject { |n| n.empty? })
  end

  # Block form closes the output before the appends below reopen it.
  File.open(t.name, 'w') do |fout|
    parser = NERFeatures.new

    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      features = parser.text_features(name, true)
      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
|
166
|
+
|
167
|
+
# models/<org> is trained from models/<org>.features.
rule (/models\/(yeast|mouse|fly|bc2gn)$/) => lambda{|t| t + '.features' } do |t|
  features_file = t.name + '.features'
  NERFeatures.new.train(features_file, t.name)
end
|
173
|
+
|
174
|
+
|
175
|
+
# results/<org>_<dataset>: normalization results for one BC1GN dataset. The
# organism model is only built when the rner backend is selected.
rule (/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  texts = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
  gold  = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  match(org, texts, gold, t.name)
end
|
187
|
+
|
188
|
+
# results/<org>_<dataset>.eval: runs the BC1GN scorer script on the result
# file and redirects its output into the target.
rule (/results\/(.+)_(.+).eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
  results_file = t.name.sub(/.eval/,'')
  org, dataset = File.basename(results_file).split(/_/)

  scorer = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  gold   = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{gold} #{results_file} > #{t.name}"
  puts cmd
  system cmd
end
|
195
|
+
|
196
|
+
# results/bc2gn: normalization results for the BC2GN test set. The model is
# only built when the rner backend is selected.
rule (/results\/bc2gn$/) do |t|
  org = 'bc2gn'

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  texts = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  gold  = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(org, texts, gold, t.name)
end
|
208
|
+
|
209
|
+
# results/bc2gn.eval: runs the BC2GN scoring script on results/bc2gn and
# redirects its output into the target.
rule (/results\/bc2gn.eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
  scorer = Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'
  gold   = Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'

  cmd = "python #{scorer} #{gold} results/bc2gn > #{t.name}"
  system cmd
end
|
215
|
+
|
216
|
+
|
217
|
+
|
218
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Cue definitions: each block receives a name and returns a list of strings.

# The name itself.
equal do |w| [w] end

# Lower-cased words, sorted and glued together without separator.
standard do |w|
  [w.downcase.split(/\s+/).sort.join("")]
end

# Lower-cased name without anything after the first comma and without the
# first parenthesised part.
cleaned do |w|
  [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')]
end

# The words for which is_special? holds, lower-cased and without a final "p".
special do |w|
  w.split.select{|word| word.is_special?}.collect{|word| word.downcase.sub(/p$/,'')}
end

# Runs of two or more letters, longest first, lower-cased. A name ending in
# "I" also contributes its part before that "I".
words do |w|
  pieces = w.sub(/(.*)I$/,'\1I \1').scan(/[a-z][a-z]+/i)
  pieces.sort{|a,b| b.length <=> a.length}.collect{|n| n.downcase}
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
# Token types. Each entry names a type and gives a regexp or a block that
# receives the word; bare names carry no explicit matcher.
# NOTE(review): presumably a bare name matches the word of the same name and
# the first matching entry wins - confirm against the tokens DSL.
tokens do

  # Some (possible) single letters first
  receptor /^(?:receptor|r)s?$/i
  protein /^(?:protein|p)s?$/i
  roman /^[IV]+$/
  greek_letter do |w|
    !$inverse_greek[w.downcase].nil?
  end


  # Some words for removal
  stopword do |w|
    $stopwords.include?(w.downcase_first)
  end
  gene /genes?/i
  dna
  cdna
  rna
  mrna
  trna
  cdna # listed a second time in the original; kept to preserve behaviour
  component
  exon
  intron
  domain
  family


  # Important words
  number /^(?:\d+[.,]?\d+|\d)$/
  greek do |w|
    !$greek[w.downcase].nil?
  end
  special do |w|
    w.is_special?
  end
  promoter
  similar /^(homolog.*|like|related|associated)$/
  ase /ase$/
  in_end /in$/
end
|
36
|
+
|
37
|
+
# Scores and transformations used when two token lists are compared.
# NOTE(review): the meaning of same/miss/extr/diff is defined by the
# comparisons DSL, which is not part of this file.
comparisons do

  # Score for the :number tokens of both names.
  compare.number do |l1,l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transform returns [new token, new type].
  transform.roman { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase { |t| [t, :special] }
  transform.in_end { |t| [t, :special] }
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown { |t| [t, (t.length < 4 ? :special : :unknown)] }
end
|
79
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
# Re-runs the normalization evaluation for one organism/dataset pair.
#   $1  organism  (first part of the results/<organism>_<dataset> target)
#   $2  dataset
#   $3  NER backend, passed to rake as ner=<value>
#   $4+ extra arguments, forwarded to rake unchanged
# The rake output goes to <organism>_<dataset>.log_<ner>; the tail of the
# evaluation file is printed at the end.
function norm(){
  local o=$1
  shift
  local s=$1
  shift
  local n=$1
  shift

  # $* (not $@) so that the command is echoed as a single string.
  echo "rm results/${o}_$s; rake results/${o}_$s.eval ner=$n $* > ${o}_$s.log_$n; tail results/${o}_$s.eval"
  # -f: a missing previous result is not an error. Quoted expansions keep
  # arguments containing spaces intact.
  rm -f "results/${o}_$s"
  rake "results/${o}_$s.eval" "ner=$n" "$@" > "${o}_$s.log_$n"
  tail "results/${o}_$s.eval"
}
|
13
|
+
|
14
|
+
|
15
|
+
# Re-runs the BC2GN normalization evaluation.
#   $1  NER backend, passed to rake as ner=<value>
#   $2+ extra arguments, forwarded to rake unchanged
# The rake output goes to bc2gn.log_<ner>; the tail of results/bc2gn.eval is
# printed at the end.
function norm_2(){
  local n=$1
  shift

  # $* (not $@) so that the command is echoed as a single string.
  echo "rm results/bc2gn; rake results/bc2gn.eval ner=$n $* > bc2gn.log_$n; tail results/bc2gn.eval"
  # -f: a missing previous result is not an error. Quoted expansions keep
  # arguments containing spaces intact.
  rm -f results/bc2gn
  rake results/bc2gn.eval "ner=$n" "$@" > "bc2gn.log_$n"
  tail results/bc2gn.eval
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Organism to update: a value already present in $org is kept, otherwise the
# `organism` environment variable is used (nil when that is unset too).
$org = ENV['organism'] if $org.nil?
|
2
|
+
|
3
|
+
# Runs the 'update' task of each organism Rakefile, one forked child process
# per organism, waiting for each child before starting the next.
# Without $org, every sub-directory that contains a Rakefile is processed.
task 'default' do
  orgs = if $org
           [$org]
         else
           Dir.glob('*').select do |entry|
             File.directory?(entry) && File.exist?(entry + '/Rakefile')
           end
         end

  orgs.each do |org|
    puts "Updating #{ org }"
    child = Process.fork do
      # chdir and load happen in the child only, so the parent keeps its
      # working directory and task table.
      Dir.chdir(org)
      load 'Rakefile'
      Rake::Task['update'].invoke
    end
    Process.waitpid child
  end
end
|
25
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
# Settings for Candida albicans, read by the shared rake-include tasks.
$name = "Candida albicans"

$native_id = "Systematic Name"

# Both the lexicon and the identifiers are read from this feature table.
cgd_features_url = 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab'

$entrez2native = {
  :tax    => 237561,
  # Entrez codes beginning with "CaO" are rewritten to begin with "orf".
  :fix    => proc { |code| code.sub(/^CaO/,'orf') },
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

$lexicon = {
  :file => {
    :url     => cgd_features_url,
    :native  => 0,
    :extra   => [8,1,2],
    # NOTE(review): a line starting with "!" can never start with "orf", so
    # the second condition is always true here - confirm the intent.
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => cgd_features_url,
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |l| l.match(/^!/) },
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  # Column 10 may hold several "|"-separated values; only the first is kept.
  :fix  => proc do |l|
    v = l.split(/\t/)
    v[10] = (v[10] || "").split('|').first
    v.join("\t")
  end
}

$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
43
|
+
|
44
|
+
####
|
45
|
+
|
46
|
+
#Rake::Task['identifiers'].clear
|
47
|
+
#file 'identifiers' => ['lexicon'] do |t|
|
48
|
+
# identifiers = {}
|
49
|
+
# if $identifiers[:file]
|
50
|
+
# identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
|
51
|
+
# end
|
52
|
+
#
|
53
|
+
# orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
|
54
|
+
#
|
55
|
+
# translations = {}
|
56
|
+
#
|
57
|
+
# Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
|
58
|
+
# orfs.each{|orf|
|
59
|
+
# translations[orf] ||= []
|
60
|
+
# translations[orf] << entrez
|
61
|
+
# }
|
62
|
+
# }
|
63
|
+
#
|
64
|
+
# orf2native.each{|orf, native|
|
65
|
+
# next unless identifiers[native]
|
66
|
+
# identifiers[native] << [orf]
|
67
|
+
# if translations[orf]
|
68
|
+
# identifiers[native] << translations[orf]
|
69
|
+
# else
|
70
|
+
# identifiers[native] << []
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
# }
|
74
|
+
#
|
75
|
+
# header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
|
76
|
+
# Open.write('identifiers',
|
77
|
+
# header +
|
78
|
+
# identifiers.collect{|code, name_lists|
|
79
|
+
# "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
|
80
|
+
# }.join("\n")
|
81
|
+
# )
|
82
|
+
#end
|
83
|
+
#
|
84
|
+
#
|