rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
@@ -1,52 +0,0 @@
1
# Token features for the NER model. Every entry names a feature and gives
# either a regular expression or a block that receives the token +w+.
# NOTE(review): presumably evaluated by the feature DSL behind
# rbbt/ner/rner (NERFeatures) -- confirm against that file.

# Whole-token tests: the pattern is anchored at both ends.
isLetters /^[A-Z]+$/i
isUpper /^[A-Z]+$/
isLower /^[a-z]+$/
isDigits /^[0-9]+$/i
isRoman /^[IVX]+$/
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim /^[\/()\[\]{}\-]$/
isNonWord /^[^\w]+$/
# The alternatives are grouped so that ^ and $ apply to each of them.
# Ungrouped, the pattern read as "^and" OR "or" OR "&" OR ",$" and fired on
# any token that merely contains "or" (e.g. "receptor").
isConjunction /^(?:and|or|&|,)$/

# Substring tests: the pattern may match anywhere in the token.
hasLetters /[A-Z]/i
hasUpper /.[A-Z]/           # an upper-case letter preceded by some character
hasLower /[a-z]/
hasDigits /[0-9]/i
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim /[\/()\[\]{}\-]/
hasNonWord /[^\w]/
caspMix /[a-z].[A-Z]/       # lower-case, any character, then upper-case
keywords /(?:protein|gene|domain|ase)s?$/
hasSuffix /[a-z][A-Z0-9]$/

# Character counts.
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits do |w| w.scan(/[0-9]/).length end
#
# First / last three and four characters of the token.
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# Shape of the token; +sub+ rewrites only the first match of each class.
token1 do |w|
  w.sub(/[A-Z]/,'A').
    sub(/[a-z]/,'a').
    sub(/[0-9]/,'0').
    sub(/[^0-9a-z]/i,'x')
end
# Same as token1, but a whole run of characters collapses into one symbol.
token2 do |w|
  w.sub(/[A-Z]+/,'A').
    sub(/[a-z]+/,'a').
    sub(/[0-9]+/,'0').
    sub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

# Features taken from neighbouring tokens and the relative positions used.
# NOTE(review): exact semantics of context/window live in the DSL -- confirm.
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
51
-
52
-
@@ -1,219 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/sources/organism'
3
- require 'rbbt/util/open'
4
- require 'rbbt/ner/rner'
5
- require 'rbbt/ner/rnorm'
6
-
7
-
8
- require 'progress-monitor'
9
-
10
# Run-time switches, all taken from the environment.
# ner     -- entity recogniser to use; falls back to :rner
# debug   -- any value turns on verbose per-document output
# perfect -- any value makes the run use the gold-standard mentions
# docs    -- comma-separated document ids restricting the run
$type    = ENV.fetch('ner', :rner)
$debug   = ENV.key?('debug')
$perfect = ENV.key?('perfect')
$docs    = ENV['docs']

# Benchmark organism name => rbbt organism directory.
$org2rbbt = {
  'yeast' => 'sgd',
  'mouse' => 'mgi',
  'fly'   => 'sgd',   # NOTE(review): same value as 'yeast' -- confirm this is intended
  'bc2gn' => 'human',
}
22
-
23
# Path of the synonym lexicon for +org+: the BC2GN Entrez lexicon for
# 'bc2gn', the per-organism BC1GN synonym list for anything else.
def match_lexicon_file(org)
  if org == 'bc2gn'
    File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
  else
    File.join(Rbbt.datadir, "biocreative/BC1GN/#{org}/synonyms.list")
  end
end

# Builds the Normalizer for +org+. config/<org>.config is passed as :file
# when it exists. For organisms other than 'bc2gn' the :to_entrez table is
# loaded from the organism's identifiers file.
def match_normalizer(org)
  custom_file = File.join('config', org + '.config')
  options = {
    :file => (File.exist?(custom_file) ? custom_file : nil),
    :max_candidates => 200,
  }

  if org == 'bc2gn'
    options[:to_entrez] = false
  else
    options[:to_entrez] = Open.to_hash(
      File.join(Rbbt.datadir, "organisms/#{$org2rbbt[org]}/identifiers"),
      :native => 0, :extra => 1, :single => true, :sep => "\t|\\|",
      :fix => proc { |l| l.sub(/S000/, 'S0') })
  end

  Normalizer.new(match_lexicon_file(org), options)
end

# Prints one debug line per code, prefixed with +label+ ("FN" or "FP"),
# followed by the code's lexicon synonyms when the lexicon knows the code.
def match_report_codes(label, codes, lex)
  codes.each do |code|
    if lex[code]
      puts "#{label}: #{ code } => #{lex[code].flatten.join(", ")}"
    else
      puts "#{label}: #{ code }"
    end
  end
end

# Normalizes the gene mentions of every document and writes one
# "<file id>\t<code>\t<mention>" line per assigned code to +outfile+.
#
# org          -- benchmark name ('yeast', 'mouse', 'fly' or 'bc2gn')
# filedir      -- directory holding the <id>.txt documents
# goldstandard -- gold standard file; column 1 gives the codes and column 2
#                 the mentions of each document id
# outfile      -- path of the result file to write
#
# Globals read: $type selects the recogniser, $perfect replaces recognition
# by the gold-standard mentions, $docs restricts the documents, $debug
# prints per-document details plus false negatives / false positives.
def match(org, filedir, goldstandard,outfile)
  t = Time.now
  norm = match_normalizer(org)
  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"

  ner = if $type.to_s == 'rner'
          NER.new('models/' + org)
        else
          Organism.ner($org2rbbt[org], $type)
        end

  gs = Open.to_hash(goldstandard,:native => 0,:extra => 1)
  gs_mentions = Open.to_hash(goldstandard,:native => 0,:extra => 2)
  lex = Open.to_hash(match_lexicon_file(org), :sep => "\t|\\|")

  files = if $docs
            $docs.split(',').collect { |doc| File.join(filedir, doc + '.txt') }
          else
            # File.join copes with +filedir+ given with or without a trailing slash.
            Dir.glob(File.join(filedir, '*.txt')).sort
          end

  # The block form closes the result file even when a document fails.
  File.open(outfile, 'w') do |fout|
    # NOTE(review): Progress.monitor presumably attaches to the next
    # iteration, so it is kept immediately before the loop -- confirm.
    Progress.monitor("Processing Files")
    files.each do |f|
      # Strip only the real extension; /.txt/ also matched e.g. "_txt" mid-name.
      fid = File.basename(f, '.txt')

      text = Open.read(f)
      mentions = if $perfect
                   (gs_mentions[fid] || []).flatten
                 else
                   ner.extract(text).uniq
                 end

      if $debug
        puts "------------------------------------"
        puts "FILE #{fid}"
        puts
        puts text
        puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
        puts "MENTIONS: #{mentions.join(", ")}"
      end

      found = []
      mentions.each do |mention|
        codes = norm.select(norm.match(mention),mention,text)

        found += codes
        codes.each do |code|
          #code = code.sub(/S000/,'S0')
          fout.puts "#{ fid }\t#{ code}\t#{mention}"
        end

        puts "Mention: #{ mention } => #{ codes.join(", ") }" if $debug
      end

      if $debug
        found.uniq!
        expected = (gs[fid] || []).flatten.uniq
        match_report_codes('FN', expected - found, lex)
        # False positives without a lexicon entry used to be printed as "FN".
        match_report_codes('FP', found - expected, lex)
      end
    end
  end

  nil
end
131
-
132
# Builds models/<org>.features: one feature block per lexicon synonym,
# followed by the pre-computed feature files from ../ner/data.
rule(/models\/(yeast|mouse|fly|bc2gn)\.features/) do |t|
  org = File.basename(t.name).sub(/\.features/,'')

  if org == 'bc2gn'
    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
  else
    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
  end

  # File.readlines closes the lexicon after reading it; the previous
  # File.open(lexicon).collect left the handle open.
  names = File.readlines(lexicon).collect { |line|
    synonyms = line.split(/\t/)
    synonyms.shift                      # first column is the code, not a name
    synonyms.reject { |n| n.empty? }
  }.flatten

  parser = NERFeatures.new

  # The block form closes the feature file even if feature extraction fails.
  File.open(t.name, 'w') do |fout|
    # NOTE(review): Progress.monitor presumably attaches to the next
    # iteration, so it stays directly before the loop -- confirm.
    Progress.monitor("CRFPP Features #{ org }")
    names.each do |name|
      parser.text_features(name, true).each do |feat|
        fout.puts feat.join(" ")
      end
      fout.puts                         # blank line ends the block for this name
    end
  end

  if org != 'bc2gn'
    Open.append(t.name, Open.read('../ner/data/BC2.features'))
  else
    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
  end
end
167
-
168
# Trains models/<org> from models/<org>.features, which is declared as the
# prerequisite so the feature rule runs first.
rule(/models\/(yeast|mouse|fly|bc2gn)$/ => lambda { |t| t + '.features' }) do |t|
  parser = NERFeatures.new
  parser.train(t.name + '.features', t.name)
end
174
-
175
-
176
# Produces results/<org>_<dataset> for the BC1GN organisms by running
# +match+ over that dataset's documents and gene list.
rule(/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
  org, dataset = File.basename(t.name).split(/_/)

  # The rner recogniser needs its trained model before matching.
  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  documents = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
  genelist  = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  match(org, documents, genelist, t.name)
end
188
-
189
# Scores results/<org>_<dataset> with the BC1GN perl scorer and stores the
# report in results/<org>_<dataset>.eval. The un-suffixed results file is
# the prerequisite.
#
# ".eval" is matched literally and only at the end of the name; the former
# /.eval/ accepted any character before "eval", anywhere in the name.
rule(/results\/(.+)_(.+)\.eval$/ => lambda { |t| t.sub(/\.eval$/, '') }) do |t|
  results = t.name.sub(/\.eval$/, '')
  org, dataset = File.basename(results).split(/_/)

  scorer   = File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")
  genelist = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")

  cmd = "perl #{scorer} #{genelist} #{results} > #{t.name}"
  puts cmd
  system cmd
end
196
-
197
# Produces results/bc2gn by running +match+ over the BC2GN test documents.
rule(/results\/bc2gn$/) do |t|
  org = 'bc2gn'

  # The rner recogniser needs its trained model before matching.
  Rake::Task['models/' + org].invoke if $type.to_sym == :rner

  documents = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
  genelist  = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")

  match(org, documents, genelist, t.name)
end
209
-
210
# Scores results/bc2gn with the BC2GN python scorer and stores the report in
# results/bc2gn.eval; results/bc2gn is the prerequisite.
#
# ".eval" is matched literally and only at the end of the name; the former
# /.eval/ accepted any character before "eval".
rule(/results\/bc2gn\.eval$/ => lambda { |t| t.sub(/\.eval$/, '') }) do |t|
  scorer   = Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'
  genelist = Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'

  cmd = "python #{scorer} #{genelist} results/bc2gn > #{t.name}"
  system cmd
end
216
-
217
-
218
-
219
-
@@ -1,10 +0,0 @@
1
# Cue definitions: each block maps a name +w+ to an array of lookup keys.
# NOTE(review): presumably consumed by rbbt/ner/rnorm/cue_index -- confirm.

# The name itself.
equal do |w|
  [w]
end

# Lower-cased words, sorted and glued together without separators.
standard do |w|
  [w.downcase.split(/\s+/).sort.join("")]
end

# Lower-cased name without what follows the first comma and without a
# parenthesised part.
cleaned do |w|
  [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')]
end

# The words flagged by is_special?, lower-cased and without a final "p".
special do |w|
  w.split.
    select { |part| part.is_special? }.
    collect { |part| part.downcase.sub(/p$/,'') }
end

# Alphabetic runs of two or more letters, longest first, lower-cased. A name
# ending in "I" is first expanded to "<name> <name without the I>".
words do |w|
  expanded = w.sub(/(.*)I$/,'\1I \1')
  expanded.
    scan(/[a-z][a-z]+/i).
    sort { |a, b| b.length <=> a.length }.
    collect { |n| n.downcase }
end
@@ -1,79 +0,0 @@
1
- require 'rbbt/util/misc'
2
# Token classes: each entry names a class and gives a pattern or a block
# deciding whether word +w+ belongs to it; bare names carry no pattern here.
# NOTE(review): how bare names are matched is defined by the DSL in
# rbbt/ner/rnorm/tokens -- confirm there.
tokens do

  # Some (possible) single letters first
  receptor /^(?:receptor|r)s?$/i
  protein /^(?:protein|p)s?$/i
  roman /^[IV]+$/
  greek_letter do |w|
    !$inverse_greek[w.downcase].nil?
  end


  # Some words for removal
  stopword do |w|
    $stopwords.include?(w.downcase_first)
  end
  gene /genes?/i
  dna
  cdna
  rna
  mrna
  trna
  cdna      # NOTE(review): second declaration of cdna, kept as in the original
  component
  exon
  intron
  domain
  family


  # Important words
  number /^(?:\d+[.,]?\d+|\d)$/
  greek do |w|
    !$greek[w.downcase].nil?
  end
  special do |w|
    w.is_special?
  end
  promoter
  similar /^(homolog.*|like|related|associated)$/
  ase /ase$/
  in_end /in$/
end

# Scores used when the token lists of two names are compared.
comparisons do

  # Score for the :number tokens of both names (+l1+ and +l2+ are lists);
  # the first condition that holds decides.
  compare.number do |l1,l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Each transform returns [new token, new class].
  transform.roman { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase { |t| [t, :special] }
  transform.in_end { |t| [t, :special] }
  # Unknown tokens shorter than four characters are treated as special.
  transform.unknown { |t| [t, (t.length < 4 ? :special : :unknown)] }
end
79
-
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
# Re-runs the normalization benchmark for one organism/dataset and shows the
# tail of the evaluation report.
#   $1  organism
#   $2  dataset
#   $3  ner (passed to rake as ner=<value>)
#   $@  any further arguments are forwarded to rake
function norm(){
  organism=$1
  shift
  dataset=$1
  shift
  ner=$1
  shift

  # CMD is only displayed. Executing it as a bare $CMD word-splits the text
  # without interpreting ';' or '>', so everything after "rm" was handed to
  # rm as file names and rake never ran.
  CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $* > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
  echo "$CMD"

  rm "results/${organism}_$dataset"
  rake "results/${organism}_$dataset.eval" "ner=$ner" "$@" > "${organism}_$dataset.log_$ner"
  tail "results/${organism}_$dataset.eval"
}
14
-
15
-
16
# Re-runs the bc2gn normalization benchmark and shows the tail of the
# evaluation report.
#   $1  ner (passed to rake as ner=<value>)
#   $@  any further arguments are forwarded to rake
function norm_2(){
  ner=$1
  shift

  # CMD is only displayed. Executing it as a bare $CMD word-splits the text
  # without interpreting ';' or '>', so everything after "rm" was handed to
  # rm as file names and rake never ran.
  CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $* > bc2gn.log_$ner; tail results/bc2gn.eval"
  echo "$CMD"

  rm results/bc2gn
  rake results/bc2gn.eval "ner=$ner" "$@" > "bc2gn.log_$ner"
  tail results/bc2gn.eval
}
@@ -1,43 +0,0 @@
1
# Organism to work on: a value already held by $org wins, otherwise the
# 'organism' environment variable is used. When neither is set the default
# task walks every organism directory.
$org = ENV['organism'] if $org.nil?

# Sub-directories of the current directory that carry their own Rakefile.
organism_dirs = lambda do
  Dir.glob('*').select do |dir|
    File.directory?(dir) && File.exist?(dir + '/Rakefile')
  end
end

# Runs +task_name+ from the Rakefile inside +dir+ in a forked child process
# and waits for the child to finish. The chdir and the loaded Rakefile
# therefore only affect the child.
run_in_organism = lambda do |dir, task_name|
  child = Process.fork do
    Dir.chdir(dir)
    load 'Rakefile'
    Rake::Task[task_name].invoke
  end
  Process.waitpid child
end

# Invokes the 'name' task of every organism directory.
task 'names' do
  organism_dirs.call.each do |org|
    run_in_organism.call(org, 'name')
  end
end

# Invokes the 'update' task of $org, or of every organism directory when
# $org is not set.
task 'default' do
  orgs = $org ? [$org] : organism_dirs.call

  orgs.each do |org|
    puts "Updating #{ org }"
    run_in_organism.call(org, 'update')
  end
end
43
-
@@ -1,84 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
# Settings for the Candida albicans organism Rakefile.
# NOTE(review): presumably read by the shared ../rake-include tasks -- confirm.

$name = "Candida albicans"

# Label of the native identifier.
$native_id = "Systematic Name"

# Entrez translation settings: :fix rewrites a leading "CaO" to "orf" and
# :check accepts only codes that start with "orf".
$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |id| id.sub(/^CaO/,'orf') },
  :check  => proc { |id| id.match(/^orf/) },
  :native => 3
}

$lexicon = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8,1,2],
    # NOTE(review): a line starting with "!" cannot start with "orf", so the
    # second test never changes the outcome -- confirm what was intended.
    :exclude => proc { |line| line.match(/^!/) && !line.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8,1,2],
    :exclude => proc { |line| line.match(/^!/) },
    # NOTE(review): "GCD ID" may be a typo for "CGD ID"; left untouched
    # because the label is data.
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

# GO annotation source. :fix keeps only the first "|"-separated value of
# column 10, the column that :code points at.
$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  :fix  => proc { |line|
    columns = line.split(/\t/)
    columns[10] = (columns[10] || "").split('|').first
    columns.join("\t")
  }
}

# PubMed query for the organism.
$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
43
-
44
- ####
45
-
46
- #Rake::Task['identifiers'].clear
47
- #file 'identifiers' => ['lexicon'] do |t|
48
- # identifiers = {}
49
- # if $identifiers[:file]
50
- # identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
51
- # end
52
- #
53
- # orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
54
- #
55
- # translations = {}
56
- #
57
- # Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
58
- # orfs.each{|orf|
59
- # translations[orf] ||= []
60
- # translations[orf] << entrez
61
- # }
62
- # }
63
- #
64
- # orf2native.each{|orf, native|
65
- # next unless identifiers[native]
66
- # identifiers[native] << [orf]
67
- # if translations[orf]
68
- # identifiers[native] << translations[orf]
69
- # else
70
- # identifiers[native] << []
71
- # end
72
- #
73
- # }
74
- #
75
- # header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
76
- # Open.write('identifiers',
77
- # header +
78
- # identifiers.collect{|code, name_lists|
79
- # "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
80
- # }.join("\n")
81
- # )
82
- #end
83
- #
84
- #