rbbt 1.1.7 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
@@ -1,52 +0,0 @@
1
# Feature definitions for the NER feature extractor, written in the project's
# feature DSL: each line is `<feature name> <regexp>` or
# `<feature name> do |w| ... end`, where +w+ is the word being described.
# The DSL methods themselves are defined elsewhere in the project.

# Whole-token shape tests (anchored at both ends).
isLetters     /^[A-Z]+$/i
isUpper       /^[A-Z]+$/
isLower       /^[a-z]+$/
isDigits      /^[0-9]+$/i
isRoman       /^[IVX]+$/
isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim       /^[\/()\[\]{}\-]$/
isNonWord     /^[^\w]+$/
# The alternation is grouped so that both anchors apply to every alternative.
# Ungrouped (/^and|or|&|,$/) the pattern matched any token merely containing
# "or" or "&", any token starting with "and", and any token ending in ",".
isConjunction /^(?:and|or|&|,)$/

# Substring tests (unanchored).
hasLetters     /[A-Z]/i
# Requires one character before the capital, so a capital in first position
# alone does not match.
hasUpper       /.[A-Z]/
hasLower       /[a-z]/
hasDigits      /[0-9]/i
hasGreek       /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim       /[\/()\[\]{}\-]/
hasNonWord     /[^\w]/
# Lowercase letter, any single character, then an uppercase letter.
caspMix        /[a-z].[A-Z]/
keywords       /(?:protein|gene|domain|ase)s?$/
hasSuffix      /[a-z][A-Z0-9]$/

# Character counts.
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits  do |w| w.scan(/[0-9]/).length end
#
# Leading / trailing 3- and 4-character groups.
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# token1 / token2 rewrite the word with class markers (A, a, 0, x).
# Both use String#sub, so only the FIRST match of each pattern is replaced;
# token2 replaces a whole run, token1 a single character.
# NOTE(review): gsub may have been intended; left as-is because changing it
# would alter the features existing models were trained on.
token1 do |w|
  w.sub(/[A-Z]/, 'A').
    sub(/[a-z]/, 'a').
    sub(/[0-9]/, '0').
    sub(/[^0-9a-z]/i, 'x')
end
token2 do |w|
  w.sub(/[A-Z]+/, 'A').
    sub(/[a-z]+/, 'a').
    sub(/[0-9]+/, '0').
    sub(/[^0-9a-z]+/i, 'x')
end
token3  do |w| w.downcase end
special do |w| w.is_special? end

# Presumably: the features copied from neighbouring tokens and the relative
# token offsets they are taken from - TODO confirm against the DSL consumer.
context %w(special token2 isPunctuation isDelim)
window  %w(1 2 3 -1 -2 -3)
#direction :reverse
51
-
52
-
@@ -1,219 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/util/open'
4
- require 'rbbt/ner/rner'
5
- require 'rbbt/ner/rnorm'
6
-
7
-
8
- require 'progress-monitor'
9
-
10
# Run-time switches, taken from environment variables (e.g. `rake ... ner=abner debug=1`).
$type    = ENV['ner'] || :rner   # NER backend name; a String when given, the Symbol :rner otherwise
$debug   = ENV.key?('debug')     # true whenever the variable is set, whatever its value
$perfect = ENV.key?('perfect')   # true whenever the variable is set, whatever its value
$docs    = ENV['docs']           # optional comma-separated list of document ids, or nil


# Maps a benchmark name to the organism directory name used under
# <datadir>/organisms/.
# NOTE(review): 'fly' maps to 'sgd', the same value as 'yeast' - confirm this
# is intended.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',
  'bc2gn' => 'human',
}
22
-
23
# Extracts gene mentions from every document in +filedir+, normalizes them to
# gene codes and writes one "<file id>\t<code>\t<mention>" line per code to
# +outfile+.
#
# org          - benchmark name. 'bc2gn' uses the BC2GN lexicon; any other
#                value uses biocreative/BC1GN/<org>/synonyms.list and the
#                identifiers file of $org2rbbt[org].
# filedir      - directory containing the <id>.txt documents.
# goldstandard - gold-standard file; its extra column 1 is used as the gold
#                codes and extra column 2 as the gold mentions.
# outfile      - path of the result file (overwritten).
#
# Global switches read: $type (NER backend), $perfect (use the gold mentions
# instead of running the NER), $docs (restrict to the listed document ids),
# $debug (print per-document diagnostics, including false negatives and
# false positives, to stdout).
def match(org, filedir, goldstandard, outfile)
  t = Time.now

  lexicon = if org == 'bc2gn'
              File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
            else
              File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/synonyms.list")
            end

  # An optional per-benchmark configuration overrides the normalizer defaults.
  custom_file = File.join('config', org + '.config')
  custom_file = nil unless File.exist?(custom_file)

  to_entrez = if org == 'bc2gn'
                false
              else
                Open.to_hash(File.join(Rbbt.datadir, "organisms/#{$org2rbbt[org]}/identifiers"),
                             :native => 0, :extra => 1, :single => true, :sep => "\t|\\|",
                             :fix => proc { |l| l.sub(/S000/, 'S0') })
              end

  norm = Normalizer.new(lexicon,
                        :to_entrez => to_entrez,
                        :file => custom_file,
                        :max_candidates => 200)
  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"

  ner = if $type.to_s == 'rner'
          NER.new('models/' + org)
        else
          Organism.ner($org2rbbt[org], $type)
        end

  gs          = Open.to_hash(goldstandard, :native => 0, :extra => 1)
  gs_mentions = Open.to_hash(goldstandard, :native => 0, :extra => 2)

  # The lexicon is only consulted by the debug report, so skip loading it otherwise.
  lex = $debug ? Open.to_hash(lexicon, :sep => "\t|\\|") : {}

  files = if $docs
            $docs.split(',').collect { |doc| File.join(filedir, doc + '.txt') }
          else
            Dir.glob(File.join(filedir, '*.txt')).sort
          end

  # Prints one diagnostic line for a missed (FN) or spurious (FP) code,
  # followed by the code's lexicon names when it has any.
  report = lambda do |label, code|
    if lex[code]
      puts "#{label}: #{code} => #{lex[code].flatten.join(", ")}"
    else
      puts "#{label}: #{code}"
    end
  end

  # Block form guarantees the result file is closed even if a document fails.
  File.open(outfile, 'w') do |fout|
    # Kept immediately before the loop it announces, as in the original.
    Progress.monitor("Processing Files")
    files.each do |f|
      fid  = File.basename(f, '.txt')
      text = Open.read(f)

      mentions = if $perfect
                   (gs_mentions[fid] || []).flatten
                 else
                   ner.extract(text).uniq
                 end

      gold = (gs[fid] || []).flatten

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{gold.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention), mention, text)

        found += codes
        codes.each do |code|
          #code = code.sub(/S000/,'S0')
          fout.puts "#{fid}\t#{code}\t#{mention}"
        end

        puts "Mention: #{mention} => #{codes.join(", ")}" if $debug
      end

      next unless $debug

      found.uniq!
      gold = gold.uniq
      (gold - found).each { |code| report.call('FN', code) }
      # Previously a false positive without a lexicon entry was labelled "FN".
      (found - gold).each { |code| report.call('FP', code) }
    end
  end
end
131
-
132
# Builds models/<org>.features: the feature lines of every name in the
# benchmark lexicon (one blank line after each name), followed by the
# pre-computed feature files under ../ner/data/.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features$/) do |t|
  org = File.basename(t.name, '.features')

  lexicon = if org == 'bc2gn'
              File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
            else
              File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/synonyms.list")
            end

  # Each lexicon line is "<code>\t<name>\t<name>..."; drop the code and any
  # empty fields. Lines are not chomped (as before), so the last name of a
  # line keeps its trailing newline.
  names = File.open(lexicon) do |io|
    io.collect { |line|
      fields = line.split(/\t/)
      fields.shift
      fields.reject { |n| n.empty? }
    }.flatten
  end

  parser = NERFeatures.new

  # Block form closes the file before the appends below, even on error.
  File.open(t.name, 'w') do |fout|
    Progress.monitor("CRFPP Features #{org}")
    names.each do |name|
      parser.text_features(name, true).each do |feat|
        fout.puts feat.join(" ")
      end
      fout.puts
    end
  end

  if org == 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  end
end
167
-
168
# Trains the model models/<org> from models/<org>.features, which is declared
# as the prerequisite so rake builds it first.
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda { |t| t + '.features' }) do |t|
  parser = NERFeatures.new
  parser.train(t.name + '.features', t.name)
end
174
-
175
-
176
# Produces results/<org>_<dataset> by running `match` over the matching
# BC1GN text directory; with the default rner backend the model task for
# <org> is invoked first.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  text_dir  = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
  gene_list = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  match(org, text_dir, gene_list, t.name)
end
188
-
189
# Scores results/<org>_<dataset> with the BC1GN Perl scorer and stores the
# scorer's output in results/<org>_<dataset>.eval. The un-suffixed results
# file is the prerequisite.
#
# The ".eval" suffix is matched with an escaped dot and an end anchor; the
# previous /.eval/ matched any character followed by "eval" anywhere in the name.
rule(/results\/(.+)_(.+)\.eval$/ => lambda { |t| t.sub(/\.eval$/, '') }) do |t|
  results = t.name.sub(/\.eval$/, '')
  org, dataset = File.basename(results).split(/_/)

  scorer   = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  genelist = File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/#{dataset}/genelist")

  cmd = "perl #{scorer} #{genelist} #{results} > #{t.name}"
  puts cmd
  system cmd
end
196
-
197
# Produces results/bc2gn by running `match` over the BC2GN test set; with the
# default rner backend the bc2gn model task is invoked first.
rule(/results\/bc2gn$/) do |t|
  org = 'bc2gn'

  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  text_dir  = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  gene_list = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(org, text_dir, gene_list, t.name)
end
209
-
210
# Scores results/bc2gn with the BC2GN Python scorer and stores the scorer's
# output in results/bc2gn.eval. The un-suffixed results file is the
# prerequisite.
#
# The ".eval" suffix is matched with an escaped dot and an end anchor; the
# previous /.eval/ matched any character followed by "eval" anywhere in the name.
rule(/results\/bc2gn\.eval$/ => lambda { |t| t.sub(/\.eval$/, '') }) do |t|
  scorer   = File.join(Rbbt.datadir, 'biocreative/BC2GN/bc2scoring.py')
  genelist = File.join(Rbbt.datadir, 'biocreative/BC2GN/Test/genelist')

  cmd = "python #{scorer} #{genelist} results/bc2gn > #{t.name}"
  system cmd
end
216
-
217
-
218
-
219
-
@@ -1,10 +0,0 @@
1
# Cue definitions: each entry maps a name +w+ to the list of cue strings
# derived from it.

# The name itself.
equal    do |w| [w] end
# Lower-cased words, sorted and concatenated without a separator.
standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
# Lower-cased name without anything after the first comma and without the
# first (greedy) parenthesised span.
cleaned  do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')] end
# Lower-cased "special" words with one trailing "p" removed. The inner block
# parameter has its own name so it no longer shadows (or, on Ruby 1.8,
# overwrites) +w+, and the unused local assignment is gone.
special do |w|
  w.split.
    select { |word| word.is_special? }.
    collect { |word| word.downcase.sub(/p$/, '') }
end
# Alphabetic runs of two or more letters, longest first, lower-cased. A name
# ending in "I" is first expanded to "<name> <name without the I>".
words do |w|
  w.sub(/(.*)I$/,'\1I \1').
    scan(/[a-z][a-z]+/i).
    sort { |a, b| b.length <=> a.length }.
    collect { |n| n.downcase }
end
@@ -1,79 +0,0 @@
1
- require 'rbbt/util/misc'
2
# Token classes for the normalizer's token DSL. An entry is a bare name, a
# name with a regexp, or a name with a predicate block over the word +w+.
# How bare names are interpreted is decided by the DSL, which is defined
# elsewhere.
tokens do

  # Some (possible) single letters first
  receptor     /^(?:receptor|r)s?$/i
  protein      /^(?:protein|p)s?$/i
  roman        /^[IV]+$/
  greek_letter { |w| !$inverse_greek[w.downcase].nil? }


  # Some words for removal
  stopword { |w| $stopwords.include?(w.downcase_first) }
  gene /genes?/i
  dna
  cdna
  rna
  mrna
  trna
  # NOTE(review): second `cdna` entry, duplicating the one above; kept so
  # behaviour is unchanged - confirm whether it can be dropped.
  cdna
  component
  exon
  intron
  domain
  family


  # Important words
  number  /^(?:\d+[.,]?\d+|\d)$/
  greek   { |w| !$greek[w.downcase].nil? }
  special { |w| w.is_special? }
  promoter
  similar /^(homolog.*|like|related|associated)$/
  ase     /ase$/
  in_end  /in$/
end
36
-
37
# Scoring rules for the normalizer's comparison DSL. The meaning of the
# same/miss/extr/diff prefixes is defined by the DSL (not visible here);
# presumably they score token classes shared by, missing from, or extra in
# one of the two compared names - TODO confirm.
comparisons do

  # Score for the :number tokens of the two names (+l1+, +l2+ are lists).
  compare.number do |l1, l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transform returns [new token text, new token class].
  transform.roman        { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase          { |t| [t, :special] }
  transform.in_end       { |t| [t, :special] }
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown      { |t| [t, (t.length < 4 ? :special : :unknown)] }
end
79
-
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
# norm ORGANISM DATASET NER [extra rake args...]
#
# Removes results/ORGANISM_DATASET, rebuilds results/ORGANISM_DATASET.eval
# with rake (stdout goes to ORGANISM_DATASET.log_NER) and prints the tail of
# the evaluation file.
#
# The steps are run as real commands. Previously they were packed into one
# string and executed as $CMD; the shell only word-splits such an expansion,
# so ";" and ">" were passed to rm as literal arguments, rake never ran, and
# the .eval path itself was handed to rm.
function norm(){
  organism=$1
  shift
  dataset=$1
  shift
  ner=$1
  shift

  local target="results/${organism}_${dataset}"
  local log="${organism}_${dataset}.log_${ner}"

  echo "rm $target; rake $target.eval ner=$ner $* > $log; tail $target.eval"
  # -f: a results file that does not exist yet is not an error.
  rm -f "$target"
  rake "$target.eval" "ner=$ner" "$@" > "$log"
  tail "$target.eval"
}
14
-
15
-
16
# norm_2 NER [extra rake args...]
#
# Removes results/bc2gn, rebuilds results/bc2gn.eval with rake (stdout goes
# to bc2gn.log_NER) and prints the tail of the evaluation file.
#
# The steps are run as real commands. Previously they were packed into one
# string and executed as $CMD; the shell only word-splits such an expansion,
# so ";" and ">" were passed to rm as literal arguments, rake never ran, and
# the .eval path itself was handed to rm.
function norm_2(){
  ner=$1
  shift

  local log="bc2gn.log_${ner}"

  echo "rm results/bc2gn; rake results/bc2gn.eval ner=$ner $* > $log; tail results/bc2gn.eval"
  # -f: a results file that does not exist yet is not an error.
  rm -f results/bc2gn
  rake results/bc2gn.eval "ner=$ner" "$@" > "$log"
  tail results/bc2gn.eval
}
@@ -1,43 +0,0 @@
1
# Organism to process: an already-set $org wins, otherwise the `organism`
# environment variable (nil when neither is given).
$org = ENV['organism'] if $org.nil?

# Sub-directories of the current directory that contain their own Rakefile.
organism_dirs = lambda do
  Dir.glob('*').select { |dir|
    File.directory?(dir) && File.exist?(dir + '/Rakefile')
  }
end

# Runs +task_name+ from <org>/Rakefile in a forked child process, so the
# chdir and the tasks loaded from that Rakefile stay out of this process.
# A failing child is reported on STDERR instead of being silently ignored;
# the remaining organisms are still processed.
run_in_organism = lambda do |org, task_name|
  pid = Process.fork {
    Dir.chdir(org)
    load 'Rakefile'
    Rake::Task[task_name].invoke
  }
  Process.waitpid pid
  STDERR.puts "Task '#{task_name}' failed for #{org}" unless $?.success?
end

# Invokes the 'name' task of every organism directory.
task 'names' do
  organism_dirs.call.each { |org| run_in_organism.call(org, 'name') }
end

# Invokes the 'update' task of $org, or of every organism directory when no
# organism was selected.
task 'default' do
  orgs = $org ? [$org] : organism_dirs.call

  orgs.each { |org|
    puts "Updating #{ org }"
    run_in_organism.call(org, 'update')
  }
end
43
-
@@ -1,84 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
# Configuration globals for the Candida albicans (CGD) organism; they are
# consumed by the shared rake-include file required at the top of this
# Rakefile.
$name      = "Candida albicans"
$native_id = "Systematic Name"

# Settings for mapping Entrez Gene ids to native ids: codes beginning with
# "CaO" are rewritten to begin with "orf", and :check accepts only codes that
# start with "orf".
$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |code| code.sub(/^CaO/, 'orf') },
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

# The lexicon and the identifiers are both read from the same feature table.
feature_table_url = 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab'

# NOTE(review): a line starting with "!" can never start with "orf", so the
# second half of this :exclude test is always true - confirm whether `||`
# was intended.
$lexicon = {
  :file => {
    :url     => feature_table_url,
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

# NOTE(review): "GCD ID" may be a typo for "CGD ID"; left untouched because
# the field name is runtime data.
$identifiers = {
  :file => {
    :url     => feature_table_url,
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |l| l.match(/^!/) },
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

# GO association file. :fix keeps only the part of column 10 before the first
# "|" (an absent column 10 becomes empty).
$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  :fix  => proc { |l|
    columns = l.split(/\t/)
    columns[10] = (columns[10] || "").split('|').first
    columns.join("\t")
  }
}

# PubMed query selecting English abstracts about C. albicans genes or proteins.
$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
-
44
- ####
45
-
46
- #Rake::Task['identifiers'].clear
47
- #file 'identifiers' => ['lexicon'] do |t|
48
- # identifiers = {}
49
- # if $identifiers[:file]
50
- # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
- # end
52
- #
53
- # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
- #
55
- # translations = {}
56
- #
57
- # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
- # orfs.each{|orf|
59
- # translations[orf] ||= []
60
- # translations[orf] << entrez
61
- # }
62
- # }
63
- #
64
- # orf2native.each{|orf, native|
65
- # next unless identifiers[native]
66
- # identifiers[native] << [orf]
67
- # if translations[orf]
68
- # identifiers[native] << translations[orf]
69
- # else
70
- # identifiers[native] << []
71
- # end
72
- #
73
- # }
74
- #
75
- # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
- # Open.write('identifiers',
77
- # header +
78
- # identifiers.collect{|code, name_lists|
79
- # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
- # }.join("\n")
81
- # )
82
- #end
83
- #
84
- #