rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,75 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+
4
+
5
+ # Offers methods to help deal with the files distributed for the BioCreative
6
+ # competition related to Gene Mention and Normalization.
7
module Biocreative

  # Read the files of a BC2GM dataset and return a hash keyed by entry code.
  # Each value is a hash with the entry :text (from "<dataset>.in") and the
  # list of gold-standard :mentions (from "GENE.eval"). Entries that appear in
  # only one of the two files carry only the corresponding key.
  #
  # dataset:: name of the dataset directory under biocreative/BC2GM
  def self.BC2GM(dataset)
    data = {}
    base = File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}")

    # Each line is "<code> <text>"; the code ends at the first space.
    Open.read(File.join(base, "#{dataset}.in")).each_line do |l|
      parsed = l.chomp.match(/(.*?) (.*)/)
      next unless parsed # blank or malformed line

      code, text = parsed.values_at(1, 2)
      data[code] = { :text => text }
    end

    # Each line is "<code>|<positions>|<mention>". The split is limited to
    # three fields so a mention that itself contains '|' is kept whole.
    Open.read(File.join(base, "GENE.eval")).each_line do |l|
      code, _pos, mention = l.chomp.split(/\|/, 3)
      next if code.nil? || mention.nil?

      data[code] ||= {}
      data[code][:mentions] ||= []
      data[code][:mentions].push(mention)
    end

    data
  end

  # Given a string of text and a string with a mention, return the positions
  # of every occurrence of that mention in the format used in the evaluation:
  # an array of [start, last] pairs, where offsets count the characters of the
  # text with all white space removed and both ends are inclusive.
  #
  # The mention is matched loosely: any run of non-word characters in the
  # mention matches any (possibly empty) run of non-word characters in the
  # text. Returns an empty array when the mention cannot be located.
  def self.position(text, mention)
    re = mention.gsub(/\W+/, ' ')
    re = Regexp.quote(re)
    re = re.gsub(/\\ /, '\W*')
    re = '\(?' + re if mention =~ /\)/
    re = re + '\)?' if mention =~ /\(/
    re = "'?" + re + "'?" if mention =~ /'/

    # /m makes '.' match newlines so the characters on previous lines are
    # counted in the offsets. (Ruby's /s flag selects an encoding; it is not
    # a "dot matches all" switch.)
    pattern = /(.*?)(#{re})(.*)/m

    positions = []
    offset = 0
    while (found = text.match(pattern))
      pre, hit, post = found.values_at(1, 2, 3)
      hit_length = hit.gsub(/\s/, '').length

      # A match without visible characters (empty mention, or a mention made
      # only of punctuation) consumes nothing; stop instead of looping forever.
      break if hit_length == 0

      start = offset + pre.gsub(/\s/, '').length
      last  = start + hit_length - 1
      positions << [start, last]

      offset = last + 1
      text = post
    end

    positions
  end

  # Run the evaluation perl script (alt_eval.perl) for the given dataset on
  # the +results+ file, redirecting its output to +outfile+. Returns whatever
  # Kernel#system returns for the command.
  def self.BC2GM_eval(results, dataset, outfile)
    base = File.join(Rbbt.datadir, 'biocreative/BC2GM')

    cmd = [
      '/usr/bin/perl', File.join(base, 'alt_eval.perl'),
      '-gene',    File.join(base, "#{dataset}/GENE.eval"),
      '-altgene', File.join(base, "#{dataset}/ALTGENE.eval"),
      "#{results} > #{outfile}"
    ].join(' ')

    system cmd
  end

end
74
+
75
+
@@ -0,0 +1,106 @@
1
+
2
+ require 'rbbt/util/open'
3
+ require 'rbbt'
4
+
5
+ # This module interacts with BioMart. It performs queries to BioMart and
6
+ # synthesises a hash with the results. Note that this module connects to the
7
+ # online BioMart WS using the Open in 'rbbt/util/open' module which offers
8
+ # caching by default. To obtain up to date results you may need to clear the
9
+ # cache from previous queries.
10
module BioMart

  # Raised when the BioMart service answers with a "Query ERROR:" message.
  class BioMart::QueryError < StandardError; end

  # Template of the XML query; the <!--...--> markers are replaced by
  # BioMart.get before the query is sent.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single request to BioMart and merges the rows into +data+.
  # Intended for internal use by BioMart.query, which splits long attribute
  # lists into several calls.
  #
  # database:: name of the BioMart dataset
  # main::     attribute used as key of the result hash
  # attrs::    additional attributes to retrieve (default none)
  # filters::  filter names; defaults to "with_<main>"
  # data::     hash where results are accumulated (default a new hash)
  #
  # Returns +data+. Raises BioMart::QueryError if the response contains
  # "Query ERROR:".
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.collect{|name| "<Filter name = \"#{ name }\" excluded = \"0\"/>"}.join("\n")
    attribute_xml = attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n")

    # Block form of sub! so that backslashes in the inserted values are not
    # interpreted as back-references.
    query = @@biomart_query_xml.clone
    query.sub!(/<!--DATABASE-->/)   { database }
    query.sub!(/<!--FILTERS-->/)    { filter_xml }
    query.sub!(/<!--MAIN-->/)       { "<Attribute name = \"#{main}\" />" }
    query.sub!(/<!--ATTRIBUTES-->/) { attribute_xml }

    rows = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
    raise BioMart::QueryError, rows if rows =~ /Query ERROR:/

    # Each row is TSV: the main attribute first, then one column per
    # attribute in the order requested.
    rows.each_line do |l|
      parts = l.chomp.split(/\t/)
      key = parts.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |name|
        value = parts.shift
        data[key][name] ||= []
        next if value.nil? # trailing empty columns are dropped by split
        data[key][name] << value
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The data parameter at the end is used internally to
  # incrementally build the result, so users normally should leave it
  # unspecified or nil. Attributes are requested two at a time (plus the main
  # one), due to a limitation of the BioMart WS that only allows 3 external
  # arguments.
  #
  # The result is a hash, where the keys are the different values for the main
  # attribute, and the value is a hash with every other attribute as key, and
  # as value an array with all the values found (for a given value of the main
  # attribute there may be more than one value for another attribute). If
  # filters is left as nil, a filter is added to the BioMart query to remove
  # results with the main attribute empty; this may cause an error if the
  # BioMart WS does not allow filtering with that attribute.
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data  ||= {}

    chunks = attrs.each_slice(2).to_a
    # With no additional attributes still perform one request, so that the
    # values of the main attribute are retrieved.
    chunks = [[]] if chunks.empty?

    chunks.each do |chunk|
      data = get(database, main, chunk, filters, data)
    end

    data
  end

end
106
+
@@ -0,0 +1,211 @@
1
+
2
+ require 'rbbt'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/util/tmpfile'
5
+ require 'rbbt/util/filecache'
6
+ require 'rbbt/bow/bow.rb'
7
+ require 'set'
8
+
9
+
10
+ # This module is used to parse and extract information from the
11
+ # gene_info file at Entrez Gene, as well as from the gene2pubmed file.
12
+ # Both need to be downloaded and accessible for Rbbt, which is done as
13
+ # part of a normal installation.
14
+ module Entrez
15
+
16
+ class NoFileError < StandardError; end
17
+
18
+ # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
19
+ # where each key is the entrez id of a gene, and the value is an array
20
+ # of possible synonyms in other databases. Is mostly used to translate
21
+ # entrez ids to the native database id of the organism. The parameter
22
+ # +native+ specifies the position of the key containing synonym, the
23
+ # fifth by default, +fix+ and +check+ are Procs used, if present, to
24
+ # pre-process lines and to check if they should be processed.
25
+ def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
26
+
27
+ raise NoFileError, "Install the Entrez gene_info file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
28
+
29
+ native ||= 5
30
+
31
+ taxs = [taxs] unless taxs.is_a?(Array)
32
+ taxs = taxs.collect{|t| t.to_s}
33
+
34
+ lexicon = {}
35
+ tmp = TmpFile.tmp_file("entrez-")
36
+ system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene_info')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
37
+ File.open(tmp).each{|l|
38
+ parts = l.chomp.split(/\t/)
39
+ next if parts[native] == '-'
40
+ entrez = parts[1]
41
+ parts[native].split(/\|/).each{|id|
42
+ id = fix.call(id) if fix
43
+ next if check && !check.call(id)
44
+
45
+ lexicon[entrez] ||= []
46
+ lexicon[entrez] << id
47
+ }
48
+ }
49
+ FileUtils.rm tmp
50
+
51
+ lexicon
52
+ end
53
+
54
+ # For a given taxonomy, or set of taxonomies, it returns a hash with
55
+ # genes as keys and arrays of related PubMed ids as values, as
56
+ # extracted from the gene2pubmed file from Entrez Gene.
57
+ def self.entrez2pubmed(taxs)
58
+ raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exists? File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
59
+
60
+ taxs = [taxs] unless taxs.is_a?(Array)
61
+ taxs = taxs.collect{|t| t.to_s}
62
+
63
+ data = {}
64
+ tmp = TmpFile.tmp_file("entrez-")
65
+ system "cat '#{File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"
66
+
67
+ data = Open.to_hash(tmp, :native => 1, :extra => 2).each{|code, value_lists| value_lists.flatten!}
68
+
69
+ FileUtils.rm tmp
70
+
71
+ data
72
+ end
73
+
74
+
75
+
76
+ # This class parses an xml containing the information for a particular
77
+ # gene as served by Entrez Gene, and hold some of its information.
78
+ class Gene
79
+ attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries
80
+
81
+ def initialize(xml)
82
+ return if xml.nil?
83
+
84
+ @organism = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/s)
85
+ @symbol = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/s)
86
+ @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/s)
87
+ @aka = xml.scan(/<Gene-ref_syn_E>(.*)<\Gene-ref_syn_E>/s)
88
+ @protnames = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/s)
89
+ @summary = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/s)
90
+ @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/s)
91
+
92
+
93
+ end
94
+
95
+ # Joins the text from symbol, description, aka, protnames, and
96
+ # summary
97
+ def text
98
+ #[@organism, @symbol, @description, @aka, @protnames, @summary,@comentaries.join(". ")].join(". ")
99
+ [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
100
+ end
101
+ end
102
+
103
+ private
104
+
105
+ @@last = Time.now
106
+ @@entrez_lag = 1
107
+ def self.get_online(geneids)
108
+
109
+ geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
110
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"
111
+
112
+ diff = Time.now - @@last
113
+ sleep @@entrez_lag - diff unless diff > @@entrez_lag
114
+
115
+ xml = Open.read(url, :quiet => true, :nocache => true)
116
+
117
+ @@last = Time.now
118
+
119
+ genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
120
+
121
+ if geneids.is_a? Array
122
+ list = {}
123
+ genes.each_with_index{|gene,i|
124
+ #geneid = gene.scan(/<Gene-track_geneid>(.*?)<\/Gene-track_geneid>/).flatten.first
125
+ geneid = geneids[i]
126
+ list[geneid ] = gene
127
+ }
128
+ return list
129
+ else
130
+ return genes.first
131
+ end
132
+
133
+ end
134
+
135
+ public
136
+
137
+ # Build a file name for a gene based on the id. Prefix the id by 'gene-',
138
+ # substitute the slashes with '_SLASH_', and add a '.xml' extension.
139
+ def self.gene_filename(id)
140
+ FileCache.clean_path('gene-' + id.to_s + '.xml')
141
+ end
142
+
143
+ # Returns a Gene object for the given Entrez Gene id. If an array of
144
+ # ids is given instead, a hash is returned. This method uses the
145
+ # caching facilities from Rbbt.
146
+ def self.get_gene(geneid)
147
+
148
+ return nil if geneid.nil?
149
+
150
+ if Array === geneid
151
+ missing = []
152
+ list = {}
153
+
154
+ geneid.each{|p|
155
+ next if p.nil?
156
+ filename = gene_filename p
157
+ if File.exists? FileCache.path(filename)
158
+ list[p] = Gene.new(Open.read(FileCache.path(filename)))
159
+ else
160
+ missing << p
161
+ end
162
+ }
163
+
164
+ return list unless missing.any?
165
+ genes = get_online(missing)
166
+
167
+ genes.each{|p, xml|
168
+ filename = gene_filename p
169
+ FileCache.add_file(filename,xml) unless File.exist? FileCache.path(filename)
170
+ list[p] = Gene.new(xml)
171
+ }
172
+
173
+ return list
174
+
175
+ else
176
+ filename = gene_filename geneid
177
+
178
+ if File.exists? FileCache.path(filename)
179
+ return Gene.new(Open.read(FileCache.path(filename)))
180
+ else
181
+ xml = get_online(geneid)
182
+ FileCache.add_file(filename,xml)
183
+
184
+ return Gene.new(xml)
185
+ end
186
+ end
187
+ end
188
+
189
+ # Counts the words in common between a chunk of text and the text
190
+ # found in Entrez Gene for that particular gene. The +gene+ may be a
191
+ # gene identifier or a Gene class instance.
192
+ def self.gene_text_similarity(gene, text)
193
+ case
194
+ when Entrez::Gene === gene
195
+ gene_text = gene.text
196
+ when String === gene || Fixnum === gene
197
+ gene_text = get_gene(gene).text
198
+ else
199
+ return 0
200
+ end
201
+
202
+
203
+ gene_words = gene_text.words.to_set
204
+ text_words = text.words.to_set
205
+
206
+ return 0 if gene_words.empty? || text_words.empty?
207
+
208
+ common = gene_words.intersection(text_words)
209
+ common.length / (gene_words.length + text_words.length).to_f
210
+ end
211
+ end
@@ -0,0 +1,40 @@
1
+ require 'rbbt'
2
+
3
+
4
+ # This module holds helper methods to deal with the Gene Ontology files. Right
5
+ # now all it does is provide a translation from id to the actual names.
6
module GO
  # Hash from term id to a hash with the fields of the term; nil until the
  # ontology file has been loaded.
  @@info = nil

  # This method needs to be called before any translations can be made, it is
  # called automatically the first time the id2name method is called. It loads
  # the gene_ontology.obo file and extracts all the fields of every [Term]
  # stanza, although right now, only the name field is used. When a field
  # appears several times in a term, the last value is kept.
  #
  # Other stanzas (the file header, [Typedef], ...) are ignored, so their
  # fields cannot overwrite those of the preceding term.
  def self.init
    info = {}
    term = nil # fields of the [Term] stanza being read; nil outside of one

    store = lambda{|fields| info[fields["id"]] = fields if fields && fields["id"] }

    File.foreach(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')) do |l|
      l = l.strip
      if (header = l.match(/\A\[(.+)\]\z/))
        # A new stanza starts: save the previous term, if any
        store.call(term)
        term = (header[1] == 'Term') ? {} : nil
      elsif term && (field = l.match(/(.*?):(.*)/))
        key, value = field.values_at(1, 2)
        term[key.strip] = value.strip
      end
    end
    store.call(term)

    # Assigned only once the whole file has been read, so a failed load is
    # retried on the next call instead of leaving an empty table behind.
    @@info = info
  end

  # Translates a GO term id into its name, or returns "Name not found" if the
  # id is unknown. If an array of ids is given, an array with the names is
  # returned instead, with nil in the place of the unknown ids.
  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect{|i| i['name'] if i}
    else
      return "Name not found" unless @@info[id]
      @@info[id]['name']
    end
  end

end
@@ -0,0 +1,197 @@
1
+
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rnorm'
4
+ require 'rbbt/util/open'
5
+
6
# Access to the per-organism data files (name, identifiers, lexicon, GO terms
# and literature) installed under the "organisms" directory of Rbbt.datadir,
# and to the NER and normalization tools configured for an organism.
module Organism

  class OrganismNotProcessedError < StandardError; end

  # Codes of all the organisms installed, that is, of every directory under
  # "organisms" that contains a "name" file.
  def self.all
    Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/name').collect{|f| File.basename(File.dirname(f))}
  end

  # Content of the "name" file of the organism, as read by Open.read.
  # Note that this replaces Module#name for this module.
  def self.name(org)
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
  end

  # Hash from organism name (stripped and in lower case) to organism code,
  # built when this file is loaded.
  NAME2ORG = {}
  Organism::all.each{|org|
    name = Organism.name(org).strip.downcase
    NAME2ORG[name] = org
  }

  # Code of the organism with the given name, ignoring case and surrounding
  # white space; nil if there is none.
  def self.name2org(name)
    NAME2ORG[name.strip.downcase]
  end

  # Reads the "identifiers" file of the organism and returns a hash from each
  # identifier (in lower case) to the list of formats (columns) in which it
  # appears. Columns are separated by tabs, and the alternative ids inside a
  # column by '|'.
  def self.id_formats(org)
    id_types = {}
    formats  = supported_ids(org)

    header = true
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |l|
      if header
        header = false
        # The first line, when commented, holds the format names, not ids
        next if l =~ /^\s*#/
      end

      # chomp, so the ids of the last column do not keep the end of line
      ids_per_type = l.chomp.split(/\t/)
      formats.zip(ids_per_type).each do |format, ids|
        next if ids.nil? # row with fewer columns than formats

        ids.split(/\|/).each do |id|
          next if id.nil? || id == ""
          key = id.downcase
          id_types[key] ||= []
          id_types[key] << format unless id_types[key].include? format
        end
      end
    end

    id_types
  end

  # Guesses the format of a list of identifiers. +formats+ is either a hash
  # such as the one returned by id_formats or an organism code, in which case
  # id_formats is called for it. +query+ is the list of identifiers.
  #
  # Returns a [format, count] pair for the format matched by most
  # identifiers, considering only the formats matched by more than
  # query.length / 10 (integer division) of them, or nil if there is none.
  def self.guessIdFormat(formats, query)
    query = query.compact.collect{|gene| gene.downcase}.uniq
    formats = id_formats(formats) if String === formats

    return nil if formats.values.empty?
    values = formats.values_at(*query)
    return nil if values.empty?

    format_count = {}
    values.compact.collect{|types| types.uniq}.flatten.each do |f|
      format_count[f] ||= 0
      format_count[f] += 1
    end

    return nil if format_count.values.empty?
    format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
  end

  # Returns a named entity recognizer of the given +type+: :abner, :banner or
  # :rner. For :rner the model is options[:model] if given, else the model
  # for the organism under "ner/model" if that file exists, else the BC2
  # model. The library for each type is only loaded when requested.
  #
  # Raises a RuntimeError for any other type.
  def self.ner(org, type=:abner, options = {})

    case type.to_sym
    when :abner
      require 'rbbt/ner/abner'
      return Abner.new
    when :banner
      require 'rbbt/ner/banner'
      return Banner.new
    when :rner
      require 'rbbt/ner/rner'
      org_model = File.join(Rbbt.datadir,"ner/model/#{ org }")
      model = options[:model]
      model ||= org_model if File.exist? org_model
      model ||= File.join(Rbbt.datadir,'ner/model/BC2')
      return NER.new(model)
    else
      raise "Ner type (#{ type }) unknown"
    end

  end

  # Returns a Normalizer for the lexicon of the organism. +to_entrez+ is
  # handed to the Normalizer as :to_entrez; by default it is the index built
  # by id_index with 'Entrez Gene ID' as native format and the first
  # supported format of the organism as the other. The token configuration
  # file "norm/config/<org>.config" is used only if it exists.
  def self.norm(org, to_entrez = nil)
    if to_entrez.nil?
      to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
    end

    token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
    token_file = nil unless File.exist? token_file

    Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
  end

  # Loads the lexicon file of the organism with Open.to_hash. The options are
  # passed along, with :sep set to split on tabs and '|' unless specified.
  # The hash given by the caller is not modified.
  def self.lexicon(org, options = {})
    options = options.dup
    options[:sep] = "\t|\\|" unless options[:sep]
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
  end

  # Reads the "gene.go" file of the organism, with a gene and a GO term
  # separated by a tab on each line, and returns a hash from gene to the
  # array of its GO terms. Lines without both fields are skipped.
  def self.goterms(org)
    goterms = {}
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line do |l|
      gene, go = l.chomp.split(/\t/)
      next if gene.nil? || go.nil?

      goterms[gene.strip] ||= []
      goterms[gene.strip] << go.strip
    end
    goterms
  end

  # Array with every run of digits found in the "all.pmid" file of the
  # organism, as strings.
  def self.literature(org)
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
  end

  # Content of the "gene.pmid" file of the organism, as loaded by
  # Open.to_hash with :flatten => true.
  def self.gene_literature(org)
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
  end

  # Content of the "gene_go.pmid" file of the organism, as loaded by
  # Open.to_hash with :flatten => true.
  def self.gene_literature_go(org)
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
  end

  # Returns the names of the identifier formats supported for the organism,
  # taken from the first line of its "identifiers" file, which must start
  # with '#' and hold the names separated by tabs. If there is no such header
  # the list is empty.
  #
  # With options[:examples] the result is instead an array of
  # [format, example] pairs, where the examples are the first id of each
  # column of the line with the most non-empty columns (the first line of the
  # file is never used for examples).
  def self.supported_ids(org, options = {})
    formats  = []
    examples = []

    header = true
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |l|
      if header
        header = false
        if l =~ /^\s*#/
          formats = l.chomp.sub(/^[\s#]+/,'').split(/\t/).collect{|n| n.strip}
        end
        # The rest of the file is only needed to look for examples
        return formats unless options[:examples]
        next
      end

      fields = l.chomp.split(/\t/)
      if fields.select{|name| name && name =~ /\w/}.length > examples.length
        examples = fields.collect{|name| name.split(/\|/).first}
      end
    end

    return formats unless options[:examples]
    formats.zip(examples)
  end

  # Position of the format named +id_name+ in the +supported_ids+ list. Names
  # are compared without surrounding white space, and ignoring case unless
  # options[:case_sensitive] is set. If several names match, the last one
  # wins; if none does the result is 0, the same as for the first format.
  def self.id_position(supported_ids, id_name, options = {})
    pos = 0
    wanted = id_name.strip
    supported_ids.each_with_index do |id, i|
      candidate = id.strip
      if candidate == wanted || !options[:case_sensitive] && candidate.downcase == wanted.downcase
        pos = i
      end
    end
    pos
  end

  # Builds an index over the "identifiers" file of the organism with
  # Index.index. option[:native] is the name of the format to use as the
  # target of the index (the first one by default) and option[:other] a list
  # with the names of the formats to index (all the rest by default); they
  # are translated into column positions, passed as :native and :extra. If
  # neither is given the options are passed to Index.index untouched, other
  # than :case_sensitive, which defaults to false in both cases. The hash
  # given by the caller is not modified.
  def self.id_index(org, option = {})
    option = option.dup
    native = option[:native]
    other  = option[:other]
    option[:case_sensitive] = false if option[:case_sensitive].nil?

    identifiers = File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")
    return Index.index(identifiers, option) if native.nil? && other.nil?

    supported = Organism.supported_ids(org)

    first = native ? id_position(supported, native, option) : 0

    rest = if other
             other.collect{|name| id_position(supported, name, option)}
           else
             (0..supported.length - 1).to_a - [first]
           end

    option[:native] = first
    option[:extra]  = rest
    Index.index(identifiers, option)
  end

end
197
+