rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,206 @@
1
+ require 'rbbt/sources/organism'
2
+ require 'rbbt/sources/biocreative'
3
+ require 'rbbt/ner/rner'
4
+
5
+ require 'progress-monitor'
6
+
7
+
8
# Tagger implementation used by the evaluation rules; taken from the `type`
# environment variable and falling back to 'rner' when it is not set.
$type = ENV.fetch('type', 'rner')
9
+
10
+ #{{{ FEATURES
11
+
12
# Write tagged token features for one BioCreative II Gene Mention dataset.
#
# dataset:: dataset identifier handed to Biocreative.BC2GM (the file tasks
#           in this Rakefile pass :train or :test)
# outfile:: path of the feature file to (over)write
#
# Each entry yielded by Biocreative.BC2GM provides :text and :mentions; the
# features of every entry are written one per line, space-joined, followed by
# a blank line. The file is opened in block form so it is flushed and closed
# even when feature extraction raises.
def BC2GM_features(dataset, outfile)
  data   = Biocreative.BC2GM(dataset)
  parser = NERFeatures.new

  File.open(outfile, 'w') do |fout|
    # Kept directly in front of the loop, as in the original code.
    Progress.monitor("CRFPP Features BC2GM #{ dataset }")
    data.each do |code, info|
      features = parser.tagged_features(info[:text], info[:mentions])

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
32
+
33
# Write tagged token features for one BioCreative II Gene Normalization
# dataset directory (<datadir>/biocreative/BC2GN/<dataset>).
#
# dataset:: directory name of the dataset (the file tasks pass 'Train'/'Test')
# outfile:: path of the feature file to (over)write
#
# Every '*.txt' file in the directory is a document whose code is its file
# name without the extension; the 'genelist' file holds tab separated rows of
# code, gene and mention. Documents without any mention are skipped.
def BC2GN_features(dataset, outfile)
  basedir = File.join(Rbbt.datadir, 'biocreative', 'BC2GN', dataset)

  data = {}
  Dir.glob(File.join(basedir, '*.txt')).each do |f|
    # File.basename strips only a real '.txt' suffix; the former
    # sub(/.txt/, '') removed the first "<any char>txt" it found.
    data[File.basename(f, '.txt')] = { :text => Open.read(f) }
  end

  # each_line instead of String#each, which no longer exists on Ruby >= 1.9.
  Open.read(File.join(basedir, 'genelist')).each_line do |l|
    code, gene, mention = l.chomp.split(/\t/)
    next unless data[code] # genelist row for a document with no text file
    (data[code][:mentions] ||= []) << mention
  end

  parser = NERFeatures.new

  File.open(outfile, 'w') do |fout|
    Progress.monitor("CRFPP Features BC2GN #{ dataset }")
    data.each do |code, info|
      mentions = info[:mentions]
      next if mentions.nil?

      features = parser.tagged_features(info[:text], mentions)

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
64
+
65
# Write features for every name found in the lexicon of an organism.
#
# org::     organism identifier handed to Organism.lexicon
# outfile:: path of the feature file to (over)write
#
# Each name is passed to NERFeatures#text_features with its second argument
# set to true; the resulting feature rows are written space-joined, one per
# line, with a blank line after every name. The block form of File.open
# guarantees the file is closed even if feature extraction raises.
def org_features(org, outfile)
  # Distinct block parameter name: the original shadowed the outer `names`.
  names = Organism.lexicon(org).collect { |code, synonyms| synonyms }.flatten

  parser = NERFeatures.new

  File.open(outfile, 'w') do |fout|
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
85
+
86
# One feature file per BioCreative II dataset split. Each task writes to its
# own target, so the task name doubles as the output path.
file "data/BC2GM_train.features" do |t|
  BC2GM_features(:train, t.name)
end

file "data/BC2GM_test.features" do |t|
  BC2GM_features(:test, t.name)
end

file "data/BC2GN_Train.features" do |t|
  BC2GN_features('Train', t.name)
end

file "data/BC2GN_Test.features" do |t|
  BC2GN_features('Test', t.name)
end
100
+
101
+
102
# Concatenated feature files: the first source overwrites the target, the
# second one is appended to it.
file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
  train, test = t.prerequisites
  Open.write(t.name, Open.read(train))
  Open.append(t.name, Open.read(test))
end

file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
  train, test = t.prerequisites
  Open.write(t.name, Open.read(train))
  Open.append(t.name, Open.read(test))
end

# Note the order: the gene mention features go first even though the gene
# normalization file is listed first among the prerequisites.
file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
  normalization, mention = t.prerequisites
  Open.write(t.name, Open.read(mention))
  Open.append(t.name, Open.read(normalization))
end
117
+
118
# Training features: every prerequisite concatenated in order. The first
# one overwrites the target, the remaining ones are appended.
file "data/train.features" => [
  #'data/BC2GN.features',
  'data/BC2GM_train.features'
] do |t|
  t.prerequisites.each_with_index do |source, index|
    writer = index.zero? ? :write : :append
    Open.send(writer, 'data/train.features', Open.read(source))
  end
end
130
+
131
# Features for an arbitrary organism: the features of its lexicon names
# followed by the BioCreative II features (data/BC2.features).
#
# The pattern now escapes the dot and is anchored, so it only matches targets
# that really end in ".features"; the organism name is obtained by stripping
# exactly that suffix.
rule(/data\/(.*)\.features$/ => ['data/BC2.features']) do |t|
  org = File.basename(t.name, '.features')
  org_features(org, t.name)
  Open.append(t.name, Open.read('data/BC2.features'))
end
136
+
137
+
138
+
139
+ #{{{ MODEL
140
# A model is trained from the feature file with the same name under data/:
# model/<name> depends on, and is trained from, data/<name>.features.
rule(/model\/(.*)/ => lambda { |name| name.sub(/model/,'data') + '.features' }) do |t|
  features = t.name.sub(/model/,'data') + '.features'
  NERFeatures.new.train(features, t.name)
end
144
+
145
# Remove everything generated under data/, model/ and results/.
task 'clean' do
  %w(data model results).each do |dir|
    FileUtils.rm Dir.glob("#{dir}/*")
  end
end

# Train one model per organism returned by Organism.all.
task 'all' do
  Organism.all.each do |org|
    Rake::Task["model/#{ org }"].invoke
  end
end

# With $org set only the model of that organism is built, otherwise all of
# them are. $force removes the previously generated files first.
task 'default' do
  if $org
    if $force
      FileUtils.rm Dir.glob("**/#{$org}.*")
    end
    Rake::Task["model/#{$org}"].invoke
  else
    if $force
      Rake::Task['clean'].invoke
    end
    Rake::Task['all'].invoke
  end
end
167
+
168
+ #{{{ EVALUATE
169
+
170
+
171
# Tag the BC2GM test set and write the mentions that were found.
#
# model::   passed as the :model option to Organism.ner
# type::    tagger type passed to Organism.ner
# outfile:: path of the result file to (over)write
#
# For every position that Biocreative.position reports for a mention, one
# line "<code>|<pos[0]> <pos[1]>|<mention>" is written.
#
# The output file used to be opened and never closed, so its content could
# still be buffered when a later task read it in the same process. The block
# form of File.open flushes and closes it before this method returns.
def find(model, type, outfile)
  ner  = Organism.ner(:human, type, :model => model)
  data = Biocreative.BC2GM(:test)

  File.open(outfile, 'w') do |fout|
    Progress.monitor("Test")
    data.each do |code, info|
      text = info[:text]

      ner.extract(text).each do |mention|
        Biocreative.position(text, mention).each do |pos|
          fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
        end
      end
    end
  end
end
192
+
193
+
194
+
195
# Tag the BC2GM test set into results/test. When the 'rner' tagger is
# selected its model (model/train) is built first. The unused `org` local of
# the original has been dropped.
rule(/results\/test$/) do |t|
  Rake::Task['model/train'].invoke if $type == 'rner'
  find('model/train', $type, t.name)
end
203
+
204
# Evaluate results/test against the BC2GM test set into results/test.eval.
# The dot of the extension is escaped so that only the literal ".eval"
# target matches.
rule(/results\/test\.eval$/ => ['results/test']) do |t|
  Biocreative.BC2GM_eval('results/test', :test, 'results/test.eval')
end
@@ -0,0 +1,52 @@
1
# Whole-token predicates: the pattern has to match the complete token.
isLetters(/^[A-Z]+$/i)
isUpper(/^[A-Z]+$/)
isLower(/^[a-z]+$/)
isDigits(/^[0-9]+$/i)
isRoman(/^[IVX]+$/)
isGreek(/^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i)
isPunctuation(/^[,.;]$/)
isDelim(/^[\/()\[\]{}\-]$/)
isNonWord(/^[^\w]+$/)
# The alternatives are grouped so that both anchors apply to each of them.
# Without the group the pattern read "starts with 'and'" OR "contains 'or'"
# OR "contains '&'" OR "ends with ','", which also matched tokens such as
# "receptor" or "android".
isConjunction(/^(?:and|or|&|,)$/)
11
+
12
# Partial predicates: the pattern may match anywhere in the token unless it
# carries its own anchor.
hasLetters(/[A-Z]/i)
# NOTE(review): the leading `.` means an uppercase letter in first position
# does not count -- confirm this is intended.
hasUpper(/.[A-Z]/)
hasLower(/[a-z]/)
hasDigits(/[0-9]/i)
hasGreek(/(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i)
hasPunctuation(/[,.;]/)
hasDelim(/[\/()\[\]{}\-]/)
hasNonWord(/[^\w]/)
# NOTE(review): requires exactly one arbitrary character between the
# lowercase and the uppercase letter -- confirm this is intended.
caspMix(/[a-z].[A-Z]/)
# Token ends in one of the keywords, optionally pluralized.
keywords(/(?:protein|gene|domain|ase)s?$/)
# Token ends in a lowercase letter followed by an uppercase letter or digit.
hasSuffix(/[a-z][A-Z0-9]$/)
23
+
24
# Number of letters and of digits in the token.
numLetters { |w| w.scan(/[A-Z]/i).length }
numDigits { |w| w.scan(/[0-9]/).length }
#
# First and last three or four characters of the token.
prefix_3(/^(...)/)
prefix_4(/^(....)/)
suffix_3(/(...)$/)
suffix_4(/(....)$/)
31
+
32
+
33
# Word shape, one symbol per character: uppercase => 'A', lowercase => 'a',
# digit => '0', anything else => 'x' (e.g. "IL-2" => "AAx0").
#
# gsub replaces String#sub, which only rewrote the first match of each class
# and left the rest of the token untouched ("protein" became "arotein").
# Feature values change, so models trained with the old definition have to be
# rebuilt.
token1 do |w|
  w.gsub(/[A-Z]/,'A').
    gsub(/[a-z]/,'a').
    gsub(/[0-9]/,'0').
    gsub(/[^0-9a-z]/i,'x')
end

# Compressed word shape: as token1, but every run of characters of the same
# class collapses into a single symbol (e.g. "IL-2" => "Ax0").
token2 do |w|
  w.gsub(/[A-Z]+/,'A').
    gsub(/[a-z]+/,'a').
    gsub(/[0-9]+/,'0').
    gsub(/[^0-9a-z]+/i,'x')
end

# Lowercased token.
token3 do |w| w.downcase end

# Result of the is_special? predicate called on the token.
special do |w| w.is_special? end
47
+
48
# Feature names handed to `context` and relative offsets handed to `window`.
context(['special', 'token2', 'isPunctuation', 'isDelim'])
window(['1', '2', '3', '-1', '-2', '-3'])
#direction :reverse
51
+
52
+
@@ -0,0 +1,218 @@
1
+ require 'rbbt'
2
+ require 'rbbt/sources/organism'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/ner/rner'
5
+
6
+
7
+ require 'progress-meter'
8
+
9
# Run-time switches taken from the environment:
#   ner     - tagger to use; a String when given, the Symbol :rner otherwise
#             (the code below normalizes it with to_s / to_sym)
#   debug   - any value turns on verbose output
#   perfect - any value makes `match` use the gold standard mentions
#   docs    - comma separated document ids to restrict the run to
$type    = ENV['ner'] || :rner
$debug   = ENV.key?('debug')
$perfect = ENV.key?('perfect')
$docs    = ENV['docs']

# BioCreative dataset name => organism directory under <datadir>/organisms.
# NOTE(review): 'fly' points to 'sgd', the same value as 'yeast' -- confirm
# this is intended.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',
  'bc2gn' => 'human',
}
21
+
22
# Normalize the gene mentions of a set of documents and write the result.
#
# org::          dataset name: 'bc2gn', or a BC1GN organism that is a key of
#                $org2rbbt
# filedir::      directory holding the '<id>.txt' documents
# goldstandard:: gold standard file; column 1 is read as codes and column 2
#                as mentions, both keyed by column 0
# outfile::      result file; one "<id>\t<code>\t<mention>" line per code
#
# Globals read: $type (tagger), $perfect (use the gold standard mentions
# instead of the tagger), $docs (comma separated ids restricting the run),
# $debug (print per-document details plus false negatives and positives).
#
# Changes with respect to the original:
# * false positives missing from the lexicon were printed with an "FN:"
#   label; they are now labelled "FP:";
# * the output file is opened in block form, so it is always closed, and only
#   after all inputs were loaded;
# * document ids come from File.basename(f, '.txt') instead of
#   sub(/.txt/, ''), and the glob no longer needs a trailing '/' on filedir.
def match(org, filedir, goldstandard, outfile)
  started = Time.now
  bc2gn   = (org == 'bc2gn')

  lexicon_file = File.join(Rbbt.datadir,
                           bc2gn ? "biocreative/BC2GN/entrezGeneLexicon.list" :
                                   "biocreative/BC1GN/#{org}/synonyms.list")

  to_entrez = false
  unless bc2gn
    to_entrez = Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
                             :native => 0, :extra => 1, :single => true, :sep => "\t|\\|",
                             :fix => proc{|l| l.sub(/S000/,'S0')})
  end

  custom_file = File.join('config', org + '.config')
  norm = Normalizer.new(lexicon_file,
                        :to_entrez => to_entrez,
                        :file => (File.exist?(custom_file) ? custom_file : nil),
                        :max_candidates => 200)
  STDERR.puts "Loaded Normalizer #{Time.now - started}\n\n"

  ner = if $type.to_s == 'rner'
          NER.new('models/' + org)
        else
          Organism.ner($org2rbbt[org], $type)
        end

  gs          = Open.to_hash(goldstandard, :native => 0, :extra => 1)
  gs_mentions = Open.to_hash(goldstandard, :native => 0, :extra => 2)
  lex         = Open.to_hash(lexicon_file, :sep => "\t|\\|")

  files = if $docs
            $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
          else
            Dir.glob(File.join(filedir, '*.txt')).sort
          end

  # Debug line for a missed (FN) or spurious (FP) code, with the lexicon
  # names of the code when it has any.
  report = proc do |label, code|
    if lex[code]
      puts "#{label}: #{ code } => #{lex[code].flatten.join(", ")}"
    else
      puts "#{label}: #{ code }"
    end
  end

  File.open(outfile, 'w') do |fout|
    Progress.monitor("Processing Files")
    files.each do |f|
      fid  = File.basename(f, '.txt')
      text = Open.read(f)

      mentions = if $perfect
                   (gs_mentions[fid] || []).flatten
                 else
                   ner.extract(text).uniq
                 end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention), mention, text)

        found += codes
        codes.each do |code|
          #code = code.sub(/S000/,'S0')
          fout.puts "#{ fid }\t#{ code}\t#{mention}"
        end

        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      if $debug
        found.uniq!
        expected = (gs[fid] || []).flatten.uniq

        (expected - found).each { |code| report.call('FN', code) }
        (found - expected).each { |code| report.call('FP', code) }
      end
    end
  end
end
130
+
131
# Feature file for the model of a BioCreative dataset: features of every
# lexicon name, followed by the feature files produced by the ner Rakefile
# (read from ../ner/data).
#
# Changes with respect to the original:
# * lexicon lines are chomped, so the last name of a line no longer carries
#   the trailing newline and a line without names yields nothing;
# * the lexicon is read with File.readlines and the output is written in
#   block form, so no file handle is left open;
# * the dot in the pattern is escaped and the pattern is anchored.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name, '.features')

  lexicon = if org == 'bc2gn'
              File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
            else
              File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
            end

  names = File.readlines(lexicon).collect{|line|
    synonyms = line.chomp.split(/\t/)
    synonyms.shift # the first column is the code, not a name
    synonyms.reject{|n| n.empty?}
  }.flatten

  parser = NERFeatures.new

  File.open(t.name, 'w') do |fout|
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each{|feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
166
+
167
# Train the model of a dataset from its feature file
# (models/<org> <= models/<org>.features). The unused `org` local of the
# original has been dropped.
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda{|name| name + '.features' }) do |t|
  NERFeatures.new.train(t.name + '.features', t.name)
end
173
+
174
+
175
# Normalization results for one BC1GN organism and dataset
# (results/<org>_<dataset>). The rner model is built first when that tagger
# is selected.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  basedir = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }")

  # The trailing slash on the text directory is part of the original path.
  match(org, File.join(basedir, 'text/'), File.join(basedir, 'genelist'), t.name)
end
187
+
188
# Score a BC1GN result file with the task1Bscorer.pl script; the scorer
# output is redirected into results/<org>_<dataset>.eval.
#
# The ".eval" suffix is matched with an escaped dot and an end anchor, both in
# the rule pattern and when deriving the name of the result file. The old
# /.eval/ removed the first "<any char>eval" found anywhere in the name.
rule(/results\/(.+)_(.+)\.eval$/ => lambda{|name| name.sub(/\.eval$/,'')}) do |t|
  results = t.name.sub(/\.eval$/,'')
  org, dataset = File.basename(results).split(/_/)

  scorer = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  gold   = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{gold} #{results} > #{t.name}"
  puts cmd
  system cmd
end
195
+
196
# Normalization results for the BC2GN test set. The rner model is built
# first when that tagger is selected.
rule(/results\/bc2gn$/) do |t|
  Rake::Task['models/' + 'bc2gn'].invoke if $type.to_sym == :rner

  testdir = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")

  match('bc2gn', testdir, File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist"), t.name)
end
208
+
209
# Score results/bc2gn with the bc2scoring.py script; the scorer output is
# redirected into results/bc2gn.eval. The dot of the extension is escaped and
# the suffix is anchored in both the pattern and the dependency name.
rule(/results\/bc2gn\.eval$/ => lambda{|name| name.sub(/\.eval$/,'')}) do |t|
  scorer = Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'
  gold   = Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'

  cmd = "python #{scorer} #{gold} results/bc2gn > #{t.name}"
  system cmd
end
215
+
216
+
217
+
218
+
@@ -0,0 +1,10 @@
1
# Each entry maps a name to a list of strings derived from it.

# The name itself.
equal do |w|
  [w]
end

# Lowercased words, sorted and joined without a separator.
standard do |w|
  [w.downcase.split(/\s+/).sort.join("")]
end

# Lowercased name without anything after the first comma and without the
# text between the first '(' and the last ')'.
cleaned do |w|
  [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')]
end

# Words for which is_special? holds, lowercased and without a final 'p'.
special do |w|
  w.split.
    select { |word| word.is_special? }.
    collect { |word| word.downcase.sub(/p$/,'') }
end

# Runs of two or more letters, longest first, lowercased. A name ending in
# 'I' is first expanded to "<name> <name without the I>".
words do |w|
  expanded = w.sub(/(.*)I$/,'\1I \1')
  pieces = expanded.scan(/[a-z][a-z]+/i)
  pieces.sort { |a, b| b.length <=> a.length }.collect { |n| n.downcase }
end
@@ -0,0 +1,79 @@
1
+ require 'rbbt/util/misc'
2
# Token types. An entry is given as a pattern, as a block receiving the
# token, or as a bare name.
tokens do

  # Some (possible) single letters first
  receptor(/^(?:receptor|r)s?$/i)
  protein(/^(?:protein|p)s?$/i)
  roman(/^[IV]+$/)
  greek_letter { |w| !$inverse_greek[w.downcase].nil? }


  # Some words for removal
  stopword { |w| $stopwords.include?(w.downcase_first) }
  # NOTE(review): unlike the patterns above this one is not anchored, so it
  # also matches inside longer words -- confirm this is intended.
  gene(/genes?/i)
  dna
  cdna
  rna
  mrna
  trna
  # NOTE(review): `cdna` is declared a second time here; kept as found.
  cdna
  component
  exon
  intron
  domain
  family


  # Important words
  number(/^(?:\d+[.,]?\d+|\d)$/)
  greek { |w| !$greek[w.downcase].nil? }
  special { |w| w.is_special? }
  promoter
  similar(/^(homolog.*|like|related|associated)$/)
  ase(/ase$/)
  in_end(/in$/)
end
36
+
37
# Scores used when the tokens of two names are compared, plus the
# transformations applied to tokens of some types.
comparisons do

  # Score for the lists of number tokens of the two names.
  compare.number do |l1,l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  # NOTE(review): no `capital` token type is declared in the tokens block of
  # this file -- confirm where it comes from.
  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transformation returns the new token value and its new type.
  transform.roman { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase { |t| [t, :special] }
  transform.in_end { |t| [t, :special] }
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown do |t|
    type = t.length < 4 ? :special : :unknown
    [t, type]
  end
end
79
+
@@ -0,0 +1,21 @@
1
+ #!/bin/bash
2
# norm ORG DATASET NER [rake arguments...]
#
# Re-runs the evaluation of one organism/dataset pair: removes the previous
# result file, runs `rake results/ORG_DATASET.eval ner=NER`, sends the rake
# output to ORG_DATASET.log_NER and shows the tail of the evaluation file.
#
# Expansions are quoted, the variables are local so they do not leak into
# the calling shell, and `rm -f` keeps a missing result file from printing
# an error.
function norm(){
    local o=$1
    shift
    local s=$1
    shift
    local n=$1
    shift

    local result="results/${o}_${s}"
    local log="${o}_${s}.log_${n}"

    echo "rm $result; rake $result.eval ner=$n $* > $log; tail $result.eval"
    rm -f "$result"
    rake "$result.eval" "ner=$n" "$@" > "$log"
    tail "$result.eval"
}
13
+
14
+
15
# norm_2 NER [rake arguments...]
#
# Re-runs the BC2GN evaluation: removes the previous result file, runs
# `rake results/bc2gn.eval ner=NER`, sends the rake output to bc2gn.log_NER
# and shows the tail of the evaluation file.
#
# Expansions are quoted, the variable is local so it does not leak into the
# calling shell, and `rm -f` keeps a missing result file from printing an
# error.
function norm_2(){
    local n=$1
    shift

    local log="bc2gn.log_${n}"

    echo "rm results/bc2gn; rake results/bc2gn.eval ner=$n $* > $log; tail results/bc2gn.eval"
    rm -f results/bc2gn
    rake results/bc2gn.eval "ner=$n" "$@" > "$log"
    tail results/bc2gn.eval
}
@@ -0,0 +1,25 @@
1
# Organism to update: a previously set $org wins over the `organism`
# environment variable; nil when neither is given.
$org = [$org, ENV['organism']].compact.first

# Run the 'update' task of the selected organism, or of every sub-directory
# that contains a Rakefile when no organism was selected.
#
# Each update runs in a forked child process, which changes into the
# organism directory and loads its Rakefile there. The exit status of the
# child used to be ignored; a failed update is now reported on STDERR and the
# remaining organisms are still processed.
task 'default' do
  orgs = if $org
           [$org]
         else
           Dir.glob('*').select{|dir|
             File.directory?(dir) && File.exist?(dir + '/Rakefile')
           }
         end

  orgs.each do |org|
    puts "Updating #{ org }"
    pid = Process.fork{
      Dir.chdir(org)
      load 'Rakefile'
      Rake::Task['update'].invoke
    }
    Process.waitpid pid
    STDERR.puts "Update of #{ org } failed (#{ $?.inspect })" unless $?.success?
  end
end
25
+
@@ -0,0 +1,84 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Settings for Candida albicans.
$name = "Candida albicans"

# Name of the native identifier of this organism.
$native_id = "Systematic Name"

# Options for translating Entrez Gene ids to native ids: taxonomy id, column
# holding the native id, a fix that rewrites a leading "CaO" into "orf", and
# a check that accepts codes starting with "orf".
$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |code| code.sub(/^CaO/,'orf') },
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

# Feature table used for both the lexicon and the identifiers.
features_url = 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab'

$lexicon = {
  :file => {
    :url     => features_url,
    :native  => 0,
    :extra   => [8,1,2],
    # NOTE(review): a line starting with "!" can never start with "orf", so
    # the second condition is always true -- confirm the intended filter.
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => features_url,
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |l| l.match(/^!/) },
    # NOTE(review): "GCD ID" may be a typo for "CGD ID"; kept as found.
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

# GO association file. The fix keeps only the part of column 10 that
# precedes the first '|'.
$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  :fix  => proc { |line|
    fields = line.split(/\t/)
    fields[10] = (fields[10] || "").split('|').first
    fields.join("\t")
  }
}

# PubMed query string for this organism.
$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
43
+
44
+ ####
45
+
46
+ #Rake::Task['identifiers'].clear
47
+ #file 'identifiers' => ['lexicon'] do |t|
48
+ # identifiers = {}
49
+ # if $identifiers[:file]
50
+ # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
+ # end
52
+ #
53
+ # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
+ #
55
+ # translations = {}
56
+ #
57
+ # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
+ # orfs.each{|orf|
59
+ # translations[orf] ||= []
60
+ # translations[orf] << entrez
61
+ # }
62
+ # }
63
+ #
64
+ # orf2native.each{|orf, native|
65
+ # next unless identifiers[native]
66
+ # identifiers[native] << [orf]
67
+ # if translations[orf]
68
+ # identifiers[native] << translations[orf]
69
+ # else
70
+ # identifiers[native] << []
71
+ # end
72
+ #
73
+ # }
74
+ #
75
+ # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
+ # Open.write('identifiers',
77
+ # header +
78
+ # identifiers.collect{|code, name_lists|
79
+ # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
+ # }.join("\n")
81
+ # )
82
+ #end
83
+ #
84
+ #