rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59)
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,206 @@
1
+ require 'rbbt/sources/organism'
2
+ require 'rbbt/sources/biocreative'
3
+ require 'rbbt/ner/rner'
4
+
5
+ require 'progress-monitor'
6
+
7
+
8
+ $type = ENV['type'] || 'rner'
9
+
10
+ #{{{ FEATURES
11
+
12
# Writes CRF++ feature rows for one BioCreative II Gene Mention dataset.
#
# dataset - selector handed to Biocreative.BC2GM (the tasks in this file
#           pass :train or :test).
# outfile - path of the feature file to create; truncated if it exists.
#
# For every entry, one space-joined line is written per feature row,
# followed by an empty separator line.
def BC2GM_features(dataset, outfile)
  data   = Biocreative.BC2GM(dataset)
  parser = NERFeatures.new

  # Block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("CRFPP Features BC2GM #{ dataset }")
    data.each do |code, info|
      features = parser.tagged_features(info[:text], info[:mentions])

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
32
+
33
# Writes CRF++ feature rows for one BioCreative II Gene Normalization dataset.
#
# dataset - name of the dataset directory under
#           <Rbbt.datadir>/biocreative/BC2GN (the tasks in this file pass
#           'Train' or 'Test').
# outfile - path of the feature file to create; truncated if it exists.
#
# Texts come from the '*.txt' files of the dataset directory and mentions
# from the third tab-separated column of its 'genelist' file. Documents
# without any listed mention are skipped.
def BC2GN_features(dataset, outfile)
  basedir = File.join(Rbbt.datadir, 'biocreative', 'BC2GN', dataset)

  data = {}
  Dir.glob(File.join(basedir, '*.txt')).each do |f|
    # Strip the literal extension only (the old /.txt/ pattern matched any
    # character followed by "txt", anywhere in the name).
    code = File.basename(f, '.txt')
    data[code] = { :text => Open.read(f) }
  end

  # each_line works on every Ruby version; String#each is gone since 1.9.
  Open.read(File.join(basedir, 'genelist')).each_line do |l|
    code, _gene, mention = l.chomp.split(/\t/)
    # Ignore genelist rows that refer to a document with no text file.
    next unless data.key?(code)
    (data[code][:mentions] ||= []) << mention
  end

  parser = NERFeatures.new

  # Block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("CRFPP Features BC2GN #{ dataset }")
    data.each do |code, info|
      mentions = info[:mentions]
      next if mentions.nil?

      features = parser.tagged_features(info[:text], mentions)

      features.each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
64
+
65
# Writes CRF++ feature rows for every name in an organism lexicon.
#
# org     - organism identifier handed to Organism.lexicon.
# outfile - path of the feature file to create; truncated if it exists.
#
# Each name is passed to NERFeatures#text_features with `true` as second
# argument; its rows are written space-joined, followed by an empty line.
def org_features(org, outfile)
  # Block parameter renamed so it no longer shadows the outer `names`.
  names = Organism.lexicon(org).collect { |_code, synonyms| synonyms }.flatten
  parser = NERFeatures.new

  # Block form closes the file even when feature extraction raises.
  File.open(outfile, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end
end
85
+
86
# One file task per BioCreative dataset: each builds its feature file with
# the matching generator defined above.
[[:train, 'data/BC2GM_train.features'],
 [:test,  'data/BC2GM_test.features']].each do |dataset, target|
  file(target) do |t|
    BC2GM_features(dataset, target)
  end
end

[['Train', 'data/BC2GN_Train.features'],
 ['Test',  'data/BC2GN_Test.features']].each do |dataset, target|
  file(target) do |t|
    BC2GN_features(dataset, target)
  end
end
100
+
101
+
102
# Concatenated feature files. Each task overwrites its target with the
# first source and then appends the second one.
file("data/BC2GM.features" => ['data/BC2GM_train.features', 'data/BC2GM_test.features']) do |t|
  train_rows = Open.read('data/BC2GM_train.features')
  Open.write('data/BC2GM.features', train_rows)
  test_rows = Open.read('data/BC2GM_test.features')
  Open.append('data/BC2GM.features', test_rows)
end

file("data/BC2GN.features" => ['data/BC2GN_Train.features', 'data/BC2GN_Test.features']) do |t|
  train_rows = Open.read('data/BC2GN_Train.features')
  Open.write('data/BC2GN.features', train_rows)
  test_rows = Open.read('data/BC2GN_Test.features')
  Open.append('data/BC2GN.features', test_rows)
end

# Note the write order: gene-mention rows first, gene-normalization rows second.
file("data/BC2.features" => ['data/BC2GN.features', 'data/BC2GM.features']) do |t|
  mention_rows = Open.read('data/BC2GM.features')
  Open.write('data/BC2.features', mention_rows)
  normalization_rows = Open.read('data/BC2GN.features')
  Open.append('data/BC2.features', normalization_rows)
end
117
+
118
# Training feature file: the concatenation of all prerequisites, in order.
# The first prerequisite overwrites the target, the rest are appended.
file("data/train.features" => [
  #'data/BC2GN.features',
  'data/BC2GM_train.features'
]) do |t|
  t.prerequisites.each_with_index do |source, position|
    writer = position == 0 ? :write : :append
    Open.send(writer, 'data/train.features', Open.read(source))
  end
end
130
+
131
# Builds data/<org>.features for any organism: the organism's lexicon
# features followed by the combined BioCreative features.
#
# The dot is now escaped and the pattern anchored at the end, so only names
# that really end in ".features" are handled by this rule. The start is left
# unanchored on purpose so prefixed paths keep matching as before.
rule(/data\/(.*)\.features$/ => ['data/BC2.features']) do |t|
  org = File.basename(t.name, '.features')
  org_features(org, t.name)
  Open.append(t.name, Open.read('data/BC2.features'))
end
136
+
137
+
138
+
139
+ #{{{ MODEL
140
# Trains model/<name> from data/<name>.features; the feature file is the
# rule's prerequisite, derived by replacing "model" with "data".
rule(/model\/(.*)/ => lambda { |name| name.sub(/model/, 'data') + '.features' }) do |t|
  features_file = t.name.sub(/model/, 'data') + '.features'
  NERFeatures.new.train(features_file, t.name)
end
144
+
145
# Removes everything directly inside data/, model/ and results/.
task 'clean' do
  ["data/*", "model/*", "results/*"].each do |pattern|
    FileUtils.rm Dir.glob(pattern)
  end
end

# Builds the model of every organism reported by Organism.all.
task 'all' do
  Organism.all.each do |org|
    Rake::Task["model/#{ org }"].invoke
  end
end

# Builds the model for $org when it is set, otherwise all models.
# When $force is set, existing files are removed first.
task 'default' do
  unless $org
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
    next
  end

  FileUtils.rm Dir.glob("**/#{$org}.*") if $force
  Rake::Task["model/#{$org}"].invoke
end
167
+
168
+ #{{{ EVALUATE
169
+
170
+
171
# Runs a NER system over the BC2GM test set and writes the mentions found.
#
# model   - value passed as the :model option to Organism.ner.
# type    - NER type passed to Organism.ner.
# outfile - path of the result file to create; truncated if it exists.
#
# One line is written per position returned by Biocreative.position, in the
# form "<code>|<pos[0]> <pos[1]>|<mention>".
def find(model, type, outfile)
  ner  = Organism.ner(:human, type, :model => model)
  data = Biocreative.BC2GM(:test)

  # The original never closed this file, so buffered lines could still be
  # missing when the evaluation task read it. The block form closes it.
  File.open(outfile, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("Test")
    data.each do |code, info|
      text = info[:text]

      ner.extract(text).each do |mention|
        Biocreative.position(text, mention).each do |pos|
          fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
        end
      end
    end
  end
end
192
+
193
+
194
+
195
# Produces results/test by running the selected NER over the BC2GM test set.
# The 'rner' type needs a trained model, so model/train is built first.
# (The unused `org` local of the original was dropped.)
rule(/results\/test$/) do |t|
  Rake::Task['model/train'].invoke if $type == 'rner'
  find('model/train', $type, t.name)
end

# Evaluates results/test. The dot of ".eval" is escaped so that only the
# literal "results/test.eval" name matches.
rule(/results\/test\.eval$/ => ['results/test']) do |t|
  Biocreative.BC2GM_eval('results/test', :test, 'results/test.eval')
end
@@ -0,0 +1,52 @@
1
# Default feature definitions for the CRF-based NER.
# Each entry names a feature and gives either a regular expression or a
# block that receives the word.

# Whole-word tests
isLetters /^[A-Z]+$/i
isUpper /^[A-Z]+$/
isLower /^[a-z]+$/
isDigits /^[0-9]+$/i
isRoman /^[IVX]+$/
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim /^[\/()\[\]{}\-]$/
isNonWord /^[^\w]+$/
# The alternatives are grouped so both anchors apply to each of them.
# The old /^and|or|&|,$/ matched any word merely containing "or"
# (e.g. "receptor"). Models trained with the old pattern need retraining.
isConjunction /^(?:and|or|&|,)$/

# Substring tests
hasLetters /[A-Z]/i
# NOTE(review): needs a character before the capital, so a leading capital
# alone does not match -- presumably intentional, confirm.
hasUpper /.[A-Z]/
hasLower /[a-z]/
hasDigits /[0-9]/i
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim /[\/()\[\]{}\-]/
hasNonWord /[^\w]/
caspMix /[a-z].[A-Z]/
keywords /(?:protein|gene|domain|ase)s?$/
hasSuffix /[a-z][A-Z0-9]$/

# Counts
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits do |w| w.scan(/[0-9]/).length end
#
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# Word-shape features.
# NOTE(review): `sub` replaces only the first match of each class; if a
# full shape was intended this would need `gsub` -- confirm before changing.
token1 do |w|
  w.sub(/[A-Z]/,'A').
    sub(/[a-z]/,'a').
    sub(/[0-9]/,'0').
    sub(/[^0-9a-z]/i,'x')
end
token2 do |w|
  w.sub(/[A-Z]+/,'A').
    sub(/[a-z]+/,'a').
    sub(/[0-9]+/,'0').
    sub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

# Features listed in `context`, and the offsets listed in `window`.
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
51
+
52
+
@@ -0,0 +1,218 @@
1
+ require 'rbbt'
2
+ require 'rbbt/sources/organism'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/ner/rner'
5
+
6
+
7
+ require 'progress-meter'
8
+
9
# Run-time switches, all read from the environment:
#   ner     - NER type to use (defaults to :rner)
#   debug   - when present, per-file diagnostics are printed
#   perfect - when present, gold-standard mentions replace the NER output
#   docs    - optional comma-separated list of document ids to process
$type    = ENV.fetch('ner', :rner)
$debug   = ENV.key?('debug')
$perfect = ENV.key?('perfect')
$docs    = ENV['docs']

# BioCreative organism name => rbbt organism directory.
# NOTE(review): 'fly' points at 'sgd', the same value as 'yeast' -- confirm
# this is intended.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',
  'bc2gn' => 'human'
}
21
+
22
# Normalizes the gene mentions of a set of documents and writes the codes.
#
# org          - BioCreative organism name ('bc2gn' selects the BC2GN
#                lexicon; anything else is looked up under BC1GN and in
#                $org2rbbt).
# filedir      - directory holding the '<id>.txt' documents.
# goldstandard - gold-standard file; column 1 is read as codes and column 2
#                as mentions, both keyed by column 0.
# outfile      - result file to create; one "<id>\t<code>\t<mention>" line
#                per normalized code.
#
# Reads the globals $type, $perfect, $debug, $docs and $org2rbbt. With
# $debug set, per-file diagnostics go to standard output, including the
# codes missed (FN) and the codes wrongly reported (FP).
def match(org, filedir, goldstandard, outfile)
  t = Time.now

  # An optional per-organism configuration overrides the Normalizer defaults.
  custom_file = File.join('config', org + '.config')
  custom_file = nil unless File.exist?(custom_file)

  if org == 'bc2gn'
    lexicon_file = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
    to_entrez    = false
  else
    lexicon_file = File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/synonyms.list")
    to_entrez    = Open.to_hash(File.join(Rbbt.datadir, "organisms/#{$org2rbbt[org]}/identifiers"),
                                :native => 0, :extra => 1, :single => true, :sep => "\t|\\|",
                                :fix => proc { |l| l.sub(/S000/, 'S0') })
  end

  norm = Normalizer.new(lexicon_file,
                        :to_entrez => to_entrez,
                        :file => custom_file,
                        :max_candidates => 200)
  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"

  if $type.to_s == 'rner'
    ner = NER.new('models/' + org)
  else
    ner = Organism.ner($org2rbbt[org], $type)
  end

  gs          = Open.to_hash(goldstandard, :native => 0, :extra => 1)
  gs_mentions = Open.to_hash(goldstandard, :native => 0, :extra => 2)
  lex         = Open.to_hash(lexicon_file, :sep => "\t|\\|")

  if $docs
    files = $docs.split(',').collect { |doc| File.join(filedir, doc + '.txt') }
  else
    # File.join works whether or not filedir ends with a separator.
    files = Dir.glob(File.join(filedir, '*.txt')).sort
  end

  # Block form closes the result file even when a document fails.
  File.open(outfile, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("Processing Files")
    files.each do |f|
      fid      = File.basename(f, '.txt')
      text     = Open.read(f)
      expected = (gs[fid] || []).flatten

      if $perfect
        mentions = (gs_mentions[fid] || []).flatten
      else
        mentions = ner.extract(text).uniq
      end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{expected.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention), mention, text)

        found += codes
        codes.each do |code|
          fout.puts "#{ fid }\t#{ code}\t#{mention}"
        end

        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      next unless $debug

      found.uniq!
      expected_codes = expected.uniq
      # False positives are now labelled "FP" in both branches; the original
      # printed "FN" for false positives that had no lexicon entry.
      [["FN", expected_codes - found], ["FP", found - expected_codes]].each do |label, codes|
        codes.each do |code|
          if lex[code]
            puts "#{label}: #{ code } => #{lex[code].flatten.join(", ")}"
          else
            puts "#{label}: #{ code }"
          end
        end
      end
    end
  end
end
130
+
131
# Builds models/<org>.features: feature rows for every synonym in the
# organism's BioCreative lexicon, followed by pre-computed NER features
# taken from ../ner/data.
#
# The dot of ".features" is escaped and the pattern anchored at the end.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name, '.features')

  if org == 'bc2gn'
    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
  else
    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
  end

  # File.readlines closes the lexicon; the original left the handle open.
  # Lines are chomped so the last name no longer carries the line terminator
  # and a trailing empty column is dropped. The first column is discarded.
  names = File.readlines(lexicon).collect { |l|
    fields = l.chomp.split(/\t/)
    fields.shift
    fields.compact.reject { |n| n.empty? }
  }.flatten

  parser = NERFeatures.new

  # Block form closes the feature file even when extraction raises.
  File.open(t.name, 'w') do |fout|
    # Keep the monitor call immediately before the loop it reports on.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each { |feat| fout.puts feat.join(" ") }
      fout.puts
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
166
+
167
# Trains models/<org> from models/<org>.features, which is the rule's
# prerequisite.
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda { |name| name + '.features' }) do |t|
  features_file = t.name + '.features'
  NERFeatures.new.train(features_file, t.name)
end
173
+
174
+
175
# Produces results/<org>_<dataset> for the BioCreative I organisms.
# With the :rner type the organism model is built first.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  dataset_dir  = "biocreative/BC1GN/#{ org }/#{ dataset }"
  filedir      = File.join(Rbbt.datadir, dataset_dir + "/text/")
  goldstandard = File.join(Rbbt.datadir, dataset_dir + "/genelist")

  match(org, filedir, goldstandard, t.name)
end
187
+
188
# Scores results/<org>_<dataset> with the BioCreative I scorer script and
# stores the scorer output in results/<org>_<dataset>.eval.
#
# ".eval" is matched literally and only at the end of the name; the old
# /.eval/ pattern accepted any character before "eval", anywhere in the path.
rule(/results\/(.+)_(.+)\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  results = t.name.sub(/\.eval$/, '')
  org, dataset = File.basename(results).split(/_/)

  scorer       = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  goldstandard = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{goldstandard} #{results} > #{t.name}"
  puts cmd
  system cmd
end
195
+
196
# Produces results/bc2gn from the BioCreative II GN test documents.
# With the :rner type the bc2gn model is built first.
rule(/results\/bc2gn$/) do |t|
  org = 'bc2gn'

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  filedir      = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(org, filedir, goldstandard, t.name)
end
208
+
209
# Scores results/bc2gn with the BioCreative II scoring script and stores
# the script output in the target file.
#
# ".eval" is matched literally and only at the end of the name; the old
# /.eval/ pattern accepted any character before "eval", anywhere in the path.
rule(/results\/bc2gn\.eval$/ => lambda { |name| name.sub(/\.eval$/, '') }) do |t|
  cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
  system cmd
end
215
+
216
+
217
+
218
+
@@ -0,0 +1,10 @@
1
# Cue definitions: each block receives a name and returns an array of cues.

# The name itself.
equal do |name|
  [name]
end

# Lower-cased words, sorted and joined without separator.
standard do |name|
  sorted_words = name.downcase.split(/\s+/).sort
  [sorted_words.join("")]
end

# Lower-cased name without the text after the first comma and without a
# parenthesised part.
cleaned do |name|
  lowered = name.downcase
  [lowered.sub(/,.*/,'').sub(/\(.*\)/,'')]
end

# Lower-cased "special" words, with one trailing "p" removed.
special do |name|
  special_words = name.split.select { |word| word.is_special? }
  special_words.collect { |word| word.downcase.sub(/p$/,'') }
end

# Lower-cased runs of two or more letters, longest first.
words do |name|
  expanded = name.sub(/(.*)I$/,'\1I \1')
  runs     = expanded.scan(/[a-z][a-z]+/i)
  ordered  = runs.sort { |a, b| b.length <=> a.length }
  ordered.collect { |run| run.downcase }
end
@@ -0,0 +1,79 @@
1
+ require 'rbbt/util/misc'
2
# Token type definitions. An entry is a name with a regular expression,
# a name with a block that receives the word, or a bare name.
tokens do

  # Some (possible) single letters first
  receptor(/^(?:receptor|r)s?$/i)
  protein(/^(?:protein|p)s?$/i)
  roman(/^[IV]+$/)
  greek_letter { |word| !$inverse_greek[word.downcase].nil? }


  # Some words for removal
  stopword { |word| $stopwords.include?(word.downcase_first) }
  gene(/genes?/i)
  dna
  cdna
  rna
  mrna
  trna
  # NOTE(review): second `cdna` entry, kept as in the original -- looks
  # redundant, confirm before removing.
  cdna
  component
  exon
  intron
  domain
  family


  # Important words
  number(/^(?:\d+[.,]?\d+|\d)$/)
  greek { |word| !$greek[word.downcase].nil? }
  special { |word| word.is_special? }
  promoter
  similar(/^(homolog.*|like|related|associated)$/)
  ase(/ase$/)
  in_end(/in$/)
end
36
+
37
# Scores applied when comparing the tokens of two names.
comparisons do

  # Score for the :number tokens of both names (l1 and l2 are lists).
  compare.number do |l1, l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transform returns a [token, type] pair.
  transform.roman        { |token| [token.arabic, :number] }
  transform.greek_letter { |token| [$inverse_greek[token.downcase], :greek] }
  transform.ase          { |token| [token, :special] }
  transform.in_end       { |token| [token, :special] }
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown do |token|
    type = token.length < 4 ? :special : :unknown
    [token, type]
  end
end
79
+
@@ -0,0 +1,21 @@
1
+ #!/bin/bash
2
# Re-runs one BioCreative I normalization evaluation and shows the tail of
# its score file.
#
# Usage: norm ORGANISM DATASET NER [extra rake arguments...]
#
# The previous result file is removed first so rake rebuilds it; rake's
# standard output goes to ORGANISM_DATASET.log_NER in the current directory.
function norm(){
    if [ $# -lt 3 ]; then
        echo "usage: norm ORGANISM DATASET NER [rake args...]" >&2
        return 1
    fi

    # Locals keep o/s/n from leaking into the calling shell.
    local o=$1
    local s=$2
    local n=$3
    shift 3

    # $* (not $@) so the echoed command stays a single string.
    echo "rm -f results/${o}_$s; rake results/${o}_$s.eval ner=$n $* > ${o}_$s.log_$n; tail results/${o}_$s.eval"
    # -f: a missing result file is not an error on the first run.
    rm -f "results/${o}_$s"
    rake "results/${o}_$s.eval" "ner=$n" "$@" > "${o}_$s.log_$n"
    tail "results/${o}_$s.eval"
}
13
+
14
+
15
# Re-runs the BioCreative II gene normalization evaluation and shows the
# tail of its score file.
#
# Usage: norm_2 NER [extra rake arguments...]
#
# The previous result file is removed first so rake rebuilds it; rake's
# standard output goes to bc2gn.log_NER in the current directory.
function norm_2(){
    if [ $# -lt 1 ]; then
        echo "usage: norm_2 NER [rake args...]" >&2
        return 1
    fi

    # Local keeps n from leaking into the calling shell.
    local n=$1
    shift

    # $* (not $@) so the echoed command stays a single string.
    echo "rm -f results/bc2gn; rake results/bc2gn.eval ner=$n $* > bc2gn.log_$n; tail results/bc2gn.eval"
    # -f: a missing result file is not an error on the first run.
    rm -f results/bc2gn
    rake results/bc2gn.eval "ner=$n" "$@" > "bc2gn.log_$n"
    tail results/bc2gn.eval
}
@@ -0,0 +1,25 @@
1
# Organism to update: an already-set $org wins, otherwise the `organism`
# environment variable is used (nil when neither is given).
$org = ENV['organism'] if $org.nil?

# Runs the 'update' task of one organism, or of every sub-directory that
# contains a Rakefile when no organism was selected.
task 'default' do
  if $org
    orgs = [$org]
  else
    orgs = Dir.glob('*').select { |dir|
      File.directory?(dir) && File.exist?(dir + '/Rakefile')
    }
  end

  orgs.each do |org|
    puts "Updating #{ org }"
    # Each organism runs in its own process, so the chdir and the tasks
    # loaded from its Rakefile do not affect this process.
    pid = Process.fork {
      Dir.chdir(org)
      load 'Rakefile'
      Rake::Task['update'].invoke
    }
    Process.waitpid pid
    # The child's status used to be ignored; report failures but still
    # continue with the remaining organisms.
    warn "Update of #{ org } failed (#{ $?.inspect })" unless $?.success?
  end
end
25
+
@@ -0,0 +1,84 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Configuration globals for the Candida albicans organism.
$name = "Candida albicans"

$native_id = "Systematic Name"

# Settings for mapping Entrez Gene entries to native identifiers.
$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |code| code.sub(/^CaO/,'orf') },
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

# The lexicon and the identifiers are read from the same feature table.
feature_table_url = 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab'

$lexicon = {
  :file => {
    :url     => feature_table_url,
    :native  => 0,
    :extra   => [8,1,2],
    # NOTE(review): a line starting with "!" can never start with "orf", so
    # the second test is always true here; possibly `||` was meant -- confirm.
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => feature_table_url,
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |l| l.match(/^!/) },
    # NOTE(review): "GCD ID" may be a typo for "CGD ID"; kept as is because
    # the string is data -- confirm before changing.
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

# Gene Ontology association file and the columns read from it.
$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  # Keeps only the first '|'-separated value of column 10.
  :fix  => proc { |l|
    v = l.split(/\t/)
    v[10] = (v[10] || "").split('|').first
    v.join("\t")
  }
}

$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
43
+
44
+ ####
45
+
46
+ #Rake::Task['identifiers'].clear
47
+ #file 'identifiers' => ['lexicon'] do |t|
48
+ # identifiers = {}
49
+ # if $identifiers[:file]
50
+ # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
+ # end
52
+ #
53
+ # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
+ #
55
+ # translations = {}
56
+ #
57
+ # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
+ # orfs.each{|orf|
59
+ # translations[orf] ||= []
60
+ # translations[orf] << entrez
61
+ # }
62
+ # }
63
+ #
64
+ # orf2native.each{|orf, native|
65
+ # next unless identifiers[native]
66
+ # identifiers[native] << [orf]
67
+ # if translations[orf]
68
+ # identifiers[native] << translations[orf]
69
+ # else
70
+ # identifiers[native] << []
71
+ # end
72
+ #
73
+ # }
74
+ #
75
+ # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
+ # Open.write('identifiers',
77
+ # header +
78
+ # identifiers.collect{|code, name_lists|
79
+ # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
+ # }.join("\n")
81
+ # )
82
+ #end
83
+ #
84
+ #