rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,75 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+
4
+
5
# Helpers for the data files distributed for the BioCreative competition
# tasks on Gene Mention and Normalization.
module Biocreative

  # Reads the BC2GM files of +dataset+ found under
  # <tt>Rbbt.datadir/biocreative/BC2GM/DATASET/</tt>.
  #
  # The sentences come from <tt>DATASET.in</tt>, one per line, where the
  # entry code is everything up to the first space and the rest of the line
  # is the text. The mentions come from <tt>GENE.eval</tt>, whose lines have
  # the form <tt>code|positions|mention</tt>.
  #
  # Returns a hash with the entry codes as keys. Each value is a hash holding
  # the sentence under :text and, for entries that have any, an array with
  # the mention strings under :mentions. Lines that lack the expected
  # separators (blank lines, for example) are skipped.
  def self.BC2GM(dataset)
    dir = File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}")
    data = {}

    # each_line instead of String#each, which only exists in Ruby 1.8.
    Open.read(File.join(dir, "#{dataset}.in")).each_line do |line|
      code, separator, text = line.chomp.partition(' ')
      next if separator.empty?
      data[code] = { :text => text }
    end

    Open.read(File.join(dir, 'GENE.eval')).each_line do |line|
      code, _positions, mention = line.chomp.split(/\|/)
      next if code.nil? || mention.nil?
      entry = (data[code] ||= {})
      (entry[:mentions] ||= []) << mention
    end

    data
  end

  # Given a string of +text+ and a string with a +mention+, returns the
  # positions of every occurrence of the mention as an array of
  # <tt>[start, last]</tt> pairs. Offsets are counted over the text with all
  # whitespace removed and both ends are inclusive.
  #
  # The mention is matched loosely: every run of non-word characters in it
  # may correspond to any run (possibly empty) of non-word characters in the
  # text, and unbalanced parentheses or quotes around it are tolerated.
  #
  # Returns an empty array when there is no occurrence, and also when the
  # mention contains no word characters at all: such a pattern matches the
  # empty string and would otherwise never make progress through the text.
  def self.position(text, mention)
    pattern = Regexp.quote(mention.gsub(/\W+/, ' ')).gsub(/\\ /, '\W*')
    pattern = '\(?' + pattern if mention =~ /\)/
    pattern = pattern + '\)?' if mention =~ /\(/
    pattern = "'?" + pattern + "'?" if mention =~ /'/
    regexp = Regexp.new(pattern)

    positions = []
    offset = 0

    # pre_match/post_match give the text around the leftmost occurrence even
    # when it spans several lines.
    while (found = regexp.match(text))
      matched = found[0]
      break if matched.empty?

      start = offset + found.pre_match.gsub(/\s/, '').length
      last = start + matched.gsub(/\s/, '').length - 1
      positions << [start, last]

      offset = last + 1
      text = found.post_match
    end

    positions
  end

  # Runs the evaluation perl script (<tt>alt_eval.perl</tt>) of the BC2GM
  # task on the +results+ file, using the GENE.eval and ALTGENE.eval files of
  # +dataset+, and writes its standard output to +outfile+.
  #
  # The command is run without going through a shell, so paths containing
  # spaces or shell metacharacters are passed to perl untouched.
  #
  # Returns whatever Kernel#system returns.
  def self.BC2GM_eval(results, dataset, outfile)
    base = File.join(Rbbt.datadir, 'biocreative/BC2GM')

    system('/usr/bin/perl', File.join(base, 'alt_eval.perl'),
           '-gene', File.join(base, dataset.to_s, 'GENE.eval'),
           '-altgene', File.join(base, dataset.to_s, 'ALTGENE.eval'),
           results.to_s,
           :out => outfile.to_s)
  end

end
74
+
75
+
@@ -0,0 +1,106 @@
1
+
2
+ require 'rbbt/util/open'
3
+ require 'rbbt'
4
+
5
# This module interacts with BioMart. It performs queries to BioMart and
# synthesises a hash with the results. Note that this module connects to the
# online BioMart WS using the Open module in 'rbbt/util/open', which offers
# caching by default. To obtain up to date results you may need to clear the
# cache from previous queries.
module BioMart

  # Raised when the answer of the web service contains "Query ERROR:"; the
  # message of the exception is the complete answer.
  class QueryError < StandardError; end

  # Template of the XML query. The <!--...--> markers are replaced by
  # BioMart.get with the dataset, filters and attributes of each query.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single request for the +main+ attribute plus the attributes in
  # +attrs+ and merges the rows into +data+ (a new hash when nil), which is
  # also the return value. It is the building block of BioMart.query, which
  # is the method users are expected to call.
  #
  # When +filters+ is nil, the filter "with_MAIN" is used. Rows whose main
  # value is empty are ignored. For every other row each attribute in +attrs+
  # gets an array in the entry of the main value, to which the value found in
  # the row (if the row has one) is appended.
  #
  # Raises BioMart::QueryError if the service reports a query error.
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    filters ||= ["with_#{main}"]
    data ||= {}

    filter_xml = filters.collect { |name| "<Filter name = \"#{ name }\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.collect { |name| "<Attribute name = \"#{ name }\"/>" }.join("\n")

    # Block form of sub!: the replacement text is inserted literally, even if
    # it contains backslashes.
    query = @@biomart_query_xml.dup
    query.sub!(/<!--DATABASE-->/) { database }
    query.sub!(/<!--FILTERS-->/) { filter_xml }
    query.sub!(/<!--MAIN-->/) { "<Attribute name = \"#{main}\" />" }
    query.sub!(/<!--ATTRIBUTES-->/) { attribute_xml }

    rows = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/, ' '))
    raise BioMart::QueryError, rows if rows =~ /Query ERROR:/

    # each_line instead of String#each, which only exists in Ruby 1.8.
    rows.each_line do |line|
      parts = line.chomp.split(/\t/)
      key = parts.shift
      next if key.nil? || key.empty?

      entry = (data[key] ||= {})
      attrs.each do |name|
        value = parts.shift
        entry[name] ||= []
        entry[name] << value unless value.nil?
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key
  # in the result hash; optionally there may be a list of additional
  # attributes and filters. The +data+ parameter at the end is used
  # internally to build the result incrementally: the additional attributes
  # are requested two at a time (three attributes per request counting the
  # main one, a limit of the BioMart WS according to the original notes).
  # Users normally should leave it unspecified or nil.
  #
  # The result is a hash where the keys are the different values of the main
  # attribute and each value is a hash with every other attribute as key and
  # an array with all its values (for a given value of the main attribute
  # there may be more than one value of another attribute). When no
  # additional attributes are given a single request for the main attribute
  # is made, so the result still lists its values, each with an empty hash.
  #
  # If +filters+ is left as nil a filter is added to the BioMart query to
  # remove results with the main attribute empty; this may cause an error if
  # the BioMart WS does not allow filtering with that attribute.
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data ||= {}

    chunks = attrs.each_slice(2).to_a
    chunks << [] if chunks.empty?

    chunks.each do |chunk|
      data = get(database, main, chunk, filters, data)
    end

    data
  end

end
106
+
@@ -0,0 +1,211 @@
1
+
2
+ require 'rbbt'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/util/tmpfile'
5
+ require 'rbbt/util/filecache'
6
+ require 'rbbt/bow/bow.rb'
7
+ require 'set'
8
+
9
+
10
# This module is used to parse and extract information from the
# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
# Both need to be downloaded and accessible for Rbbt, which is done as
# part of a normal installation.
module Entrez

  # Raised when one of the Entrez files is not found under Rbbt.datadir.
  class NoFileError < StandardError; end

  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
  # where each key is the entrez id of a gene, and the value is an array
  # of possible synonyms in other databases. It is mostly used to translate
  # entrez ids to the native database id of the organism. The parameter
  # +native+ specifies the position of the column containing the synonyms
  # (5 by default); +fix+ and +check+ are Procs used, if present, to
  # pre-process each synonym and to decide if it should be kept.
  #
  # Lines whose +native+ column is missing or '-' are ignored.
  #
  # Raises NoFileError if the gene_info file is not installed.
  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
    gene_info = File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
    raise NoFileError, "Install the Entrez gene_info file" unless File.exist?(gene_info)

    native ||= 5
    lexicon = {}

    taxonomy_subset(gene_info, taxs) do |subset|
      File.foreach(subset) do |line|
        parts = line.chomp.split(/\t/)
        field = parts[native]
        next if field.nil? || field == '-'

        entrez = parts[1]
        field.split(/\|/).each do |id|
          id = fix.call(id) if fix
          next if check && !check.call(id)

          (lexicon[entrez] ||= []) << id
        end
      end
    end

    lexicon
  end

  # For a given taxonomy, or set of taxonomies, it returns a hash with
  # genes as keys and arrays of related PubMed ids as values, as
  # extracted from the gene2pubmed file from Entrez Gene.
  #
  # Raises NoFileError if the gene2pubmed file is not installed.
  def self.entrez2pubmed(taxs)
    gene2pubmed = File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exist?(gene2pubmed)

    taxonomy_subset(gene2pubmed, taxs) do |subset|
      Open.to_hash(subset, :native => 1, :extra => 2).each { |code, value_lists| value_lists.flatten! }
    end
  end

  # Copies to a temporary file the lines of +file+ whose first column is one
  # of the taxonomies in +taxs+ (a single id or an array of them), yields the
  # path of that file and returns the value of the block. The temporary file
  # is removed afterwards, also when the block raises.
  #
  # grep is run directly, not through a shell. As before, the taxonomy ids
  # end up inside a regular expression, so they are expected to be plain
  # identifiers.
  def self.taxonomy_subset(file, taxs)
    ids = (taxs.is_a?(Array) ? taxs : [taxs]).collect { |t| t.to_s }

    tmp = TmpFile.tmp_file("entrez-")
    system('grep', '-E', "^(#{ids.join('|')})[[:space:]]", file, :out => tmp)

    yield tmp
  ensure
    File.delete(tmp) if tmp && File.exist?(tmp)
  end
  private_class_method :taxonomy_subset

  # This class parses an xml containing the information for a particular
  # gene as served by Entrez Gene, and holds some of its information.
  #
  # Every attribute is the result of String#scan with one capture group, that
  # is, an array with one single-element array per occurrence of the XML
  # element. All attributes are nil when the object is built from nil.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = element_texts(xml, 'Org-ref_taxname')
      @symbol      = element_texts(xml, 'Gene-ref_locus')
      @description = element_texts(xml, 'Gene-ref_desc')
      @aka         = element_texts(xml, 'Gene-ref_syn_E')
      @protnames   = element_texts(xml, 'Prot-ref_name_E')
      @summary     = element_texts(xml, 'Entrezgene_summary')
      @comentaries = element_texts(xml, 'Gene-commentary_text')
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary.
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
    end

    private

    # Contents of every <tag>...</tag> element of +xml+. The match is lazy
    # and uses /m, which is the flag that lets '.' match newlines in Ruby, so
    # several elements on one line and elements spanning several lines are
    # both handled. (/s selects an encoding in Ruby; it is not "dot-all".)
    def element_texts(xml, tag)
      name = Regexp.quote(tag)
      xml.scan(/<#{name}>(.*?)<\/#{name}>/m)
    end
  end

  # Time of the last request to Entrez and minimum number of seconds between
  # two requests.
  @@last = Time.now
  @@entrez_lag = 1

  # Downloads from Entrez the XML of one gene id or of an array of gene ids.
  # Internal helper of get_gene: it does no caching and waits, if needed, so
  # that requests are at least @@entrez_lag seconds apart.
  #
  # For a single id it returns the XML of the first record of the answer (nil
  # if there is none). For an array it returns a hash from the requested ids
  # to the XML of their records. Records are assigned using the id found in
  # their Gene-track_geneid element, so a record missing from the answer does
  # not shift the remaining ones; a record whose id is absent or was not
  # requested is assigned by its position, as a fallback.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    wait = @@entrez_lag - (Time.now - @@last)
    sleep wait if wait > 0

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten

    return genes.first unless geneids.is_a?(Array)

    requested = {}
    geneids.each { |id| requested[id.to_s] = id }

    list = {}
    genes.each_with_index do |gene, i|
      found = gene[/<Gene-track_geneid>\s*(.*?)\s*<\/Gene-track_geneid>/m, 1]
      key = requested.fetch(found) { geneids[i] }
      list[key] = gene
    end
    list
  end

  # Build a file name for a gene based on the id: the id is prefixed with
  # 'gene-', the '.xml' extension is added and the result is passed through
  # FileCache.clean_path.
  def self.gene_filename(id)
    FileCache.clean_path('gene-' + id.to_s + '.xml')
  end

  # Returns a Gene object for the given Entrez Gene id. If an array of
  # ids is given instead, a hash from ids to Gene objects is returned (nil
  # ids are ignored). This method uses the caching facilities from Rbbt:
  # genes found in the FileCache are read from there, the rest are
  # downloaded and added to it.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      list = {}
      missing = []

      geneid.compact.each do |id|
        path = FileCache.path(gene_filename(id))
        if File.exist?(path)
          list[id] = Gene.new(Open.read(path))
        else
          missing << id
        end
      end

      return list if missing.empty?

      get_online(missing).each do |id, xml|
        filename = gene_filename(id)
        FileCache.add_file(filename, xml) unless File.exist?(FileCache.path(filename))
        list[id] = Gene.new(xml)
      end

      list
    else
      filename = gene_filename(geneid)
      path = FileCache.path(filename)

      return Gene.new(Open.read(path)) if File.exist?(path)

      xml = get_online(geneid)
      FileCache.add_file(filename, xml)
      Gene.new(xml)
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene, divided by the sum of the
  # number of different words in each of the two. The +gene+ may be a
  # gene identifier (String or Integer) or a Gene class instance; for
  # anything else the result is 0, as it is when either text has no words.
  #
  # Relies on String#words, which is not part of core Ruby (presumably it
  # comes from 'rbbt/bow/bow', required by this file; confirm).
  def self.gene_text_similarity(gene, text)
    gene_text =
      case gene
      when Entrez::Gene
        gene.text
      when String, Integer
        get_gene(gene).text
      else
        return 0
      end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
@@ -0,0 +1,40 @@
1
+ require 'rbbt'
2
+
3
+
4
# This module holds helper methods to deal with the Gene Ontology files. Right
# now all it does is provide a translation from ids to the actual names.
module GO
  # Hash from term id to a hash with the fields of the term; nil until
  # GO.init has been run.
  @@info = nil

  # This method needs to be called before any translations can be made; it is
  # called automatically the first time the id2name method is called. It loads
  # the gene_ontology.obo file found under Rbbt.datadir and extracts all the
  # fields of every [Term] stanza, although right now only the name field is
  # used. When a field appears several times in a stanza the last value wins.
  #
  # The header of the file and any other kind of stanza (such as [Typedef])
  # are left out. The information is only installed once the whole file has
  # been processed, so a failed load is attempted again on the next call.
  def self.init
    obo = File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo'))
    info = {}

    # The text before the first [Term] marker is the header of the file.
    obo.split(/\[Term\]/).drop(1).each do |chunk|
      # A term ends where the next stanza of any kind begins.
      stanza = chunk.split(/^\[[^\]\n]*\]/).first.to_s

      term_info = {}
      stanza.split(/\n/).each do |line|
        key, value = line.split(':', 2)
        next if value.nil?
        term_info[key.strip] = value.strip
      end

      info[term_info["id"]] = term_info if term_info["id"]
    end

    @@info = info
  end

  # Translates a term id into its name. For an array of ids it returns an
  # array with the names, with nil in the place of unknown ids; for a single
  # unknown id it returns the string "Name not found".
  def self.id2name(id)
    self.init unless @@info
    if id.kind_of? Array
      @@info.values_at(*id).collect { |term| term['name'] if term }
    else
      return "Name not found" unless @@info[id]
      @@info[id]['name']
    end
  end

end
@@ -0,0 +1,197 @@
1
+
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rnorm'
4
+ require 'rbbt/util/open'
5
+
6
+ module Organism
7
+
8
# Error class of this module. It is not raised by any of the code visible
# here; presumably meant for organisms whose data is not installed (confirm).
class OrganismNotProcessedError < StandardError; end

# Lists the organisms available: the names of the directories under
# <tt>Rbbt.datadir/organisms</tt> that contain a file called 'name'.
def self.all
  name_files = Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/name')

  name_files.map do |name_file|
    File.basename(File.dirname(name_file))
  end
end
13
+
14
+
15
# Returns the contents of the 'name' file of the organism +org+, exactly as
# Open.read gives them (no stripping is done here).
def self.name(org)
  name_file = File.join(Rbbt.datadir,"organisms/#{ org }/name")
  Open.read(name_file)
end
18
+
19
# Translation from the name of each available organism (stripped and in
# lower case) to its code. It is filled in when this file is loaded.
NAME2ORG = {}
Organism.all.each do |org|
  NAME2ORG[Organism.name(org).strip.downcase] = org
end
24
+
25
# Returns the code of the organism called +name+, or nil if there is none.
# The name is stripped and converted to lower case before the lookup in
# NAME2ORG.
def self.name2org(name)
  key = name.strip.downcase
  NAME2ORG[key]
end
28
+
29
# Builds, from the 'identifiers' file of the organism +org+, a hash from
# every identifier (in lower case) to the array of formats in which it
# appears. The formats are the ones given by supported_ids, and the columns
# of the file are matched to them by position; a column may hold several
# identifiers separated by '|'.
#
# Comment lines (such as the header with the format names) are skipped, and
# so are the columns missing from a line and the empty identifiers.
def self.id_formats(org)
  formats = supported_ids(org)
  id_types = {}

  # each_line instead of String#collect, which only exists in Ruby 1.8.
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |line|
    next if line =~ /^\s*#/

    # chomp first: otherwise the identifiers of the last column keep the
    # line terminator.
    formats.zip(line.chomp.split(/\t/)).each do |format, field|
      next if field.nil?

      field.split(/\|/).each do |id|
        next if id == ""
        types = (id_types[id.downcase] ||= [])
        types << format unless types.include?(format)
      end
    end
  end

  id_types
end
50
+
51
# Guesses the format of the identifiers in +query+ (an array of strings; nil
# entries are ignored and case does not matter). +formats+ is either a hash
# like the one returned by id_formats or, if it is a String, the organism
# whose id_formats should be used.
#
# Each different identifier of the query counts once for every format it is
# known in. Among the formats counted more than a tenth (integer division)
# of the number of different identifiers, the one with the highest count is
# returned as a <tt>[format, count]</tt> pair. Returns nil when +formats+ is
# empty, when the query is empty or when no format qualifies.
def self.guessIdFormat(formats, query)
  query = query.compact.map { |gene| gene.downcase }.uniq
  formats = id_formats(formats) if String === formats

  return nil if formats.values.empty?
  hits = formats.values_at(*query)
  return nil if hits.empty?

  format_count = {}
  hits.compact.each do |types|
    types.uniq.each do |format|
      format_count[format] = (format_count[format] || 0) + 1
    end
  end

  return nil if format_count.values.empty?

  threshold = query.length / 10
  frequent = format_count.select { |_format, count| count > threshold }
  frequent.sort { |a, b| b[1] <=> a[1] }.first
end
70
+
71
# Returns a named entity recognizer of the given +type+ (:abner, :banner or
# :rner, as a symbol or a string); the library of the chosen type is only
# required when it is requested.
#
# For :rner the model is options[:model] if given; otherwise the model of
# the organism +org+ under <tt>Rbbt.datadir/ner/model</tt> if it exists, and
# the 'BC2' model as a last resort. +org+ is not used by the other types.
#
# Raises a RuntimeError for any other type.
def self.ner(org, type=:abner, options = {})
  kind = type.to_sym

  if kind == :abner
    require 'rbbt/ner/abner'
    Abner.new
  elsif kind == :banner
    require 'rbbt/ner/banner'
    Banner.new
  elsif kind == :rner
    require 'rbbt/ner/rner'
    org_model = File.join(Rbbt.datadir,"ner/model/#{ org }")

    model = options[:model]
    model ||= org_model if File.exist? org_model
    model ||= File.join(Rbbt.datadir,'ner/model/BC2')
    NER.new(model)
  else
    raise "Ner type (#{ type }) unknown"
  end
end
91
+
92
# Returns a Normalizer for the organism +org+ built on its 'lexicon' file,
# limited to 20 candidates.
#
# +to_entrez+ is handed to the Normalizer as its :to_entrez option; when nil,
# an index from the first supported id format of the organism to
# 'Entrez Gene ID' is built with id_index. The token configuration file
# <tt>Rbbt.datadir/norm/config/ORG.config</tt> is passed as :file if it
# exists, and nil otherwise.
def self.norm(org, to_entrez = nil)
  if to_entrez.nil?
    to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
  end

  token_file = File.join(Rbbt.datadir, 'norm', 'config', org.to_s + '.config')
  # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
  token_file = nil unless File.exist?(token_file)

  Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),
                 :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
end
104
+
105
# Loads the 'lexicon' file of the organism +org+ with Open.to_hash and
# returns its result. +options+ are passed on to Open.to_hash; unless they
# include a :sep, the fields are separated by tabs or by '|'. The hash given
# by the caller is not modified.
def self.lexicon(org, options = {})
  options = options.merge(:sep => "\t|\\|") unless options[:sep]
  Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), options)
end
109
+
110
# Reads the 'gene.go' file of the organism +org+, which holds a gene and a
# GO term separated by a tab in each line, and returns a hash from each gene
# to the array of its GO terms, both stripped. Lines without the two fields
# (blank lines, for example) are skipped.
def self.goterms(org)
  goterms = {}

  # each_line instead of String#each, which only exists in Ruby 1.8.
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line do |line|
    gene, go = line.chomp.split(/\t/)
    next if gene.nil? || go.nil?

    (goterms[gene.strip] ||= []) << go.strip
  end

  goterms
end
119
+
120
# Returns an array with all the PubMed ids (as strings) in the 'all.pmid'
# file of the organism +org+: every run of digits found in the file.
def self.literature(org)
  # Scanning the whole text gives the same ids as scanning line by line,
  # since a run of digits never spans two lines, and it avoids
  # String#collect, which only exists in Ruby 1.8.
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
end
123
+
124
# Loads the 'gene.pmid' file of the organism +org+ with Open.to_hash, using
# the :flatten option, and returns its result.
def self.gene_literature(org)
  pmid_file = File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid")
  Open.to_hash(pmid_file, :flatten => true)
end
127
+
128
# Loads the 'gene_go.pmid' file of the organism +org+ with Open.to_hash,
# using the :flatten option, and returns its result.
def self.gene_literature_go(org)
  pmid_file = File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid")
  Open.to_hash(pmid_file, :flatten => true)
end
131
+
132
# Returns the identifier formats of the organism +org+: the tab separated
# names found in the first line of its 'identifiers' file, provided that
# line is a comment (starts with '#'). Without such a header the list of
# formats is empty.
#
# With the option :examples the result is instead an array of
# <tt>[format, example]</tt> pairs. The examples are taken from the line of
# the file with the most non-empty columns, using the first of the
# alternatives (separated by '|') of each column. The first line of the file
# is never used for examples.
def self.supported_ids(org, options = {})
  want_examples = options[:examples]
  formats = []
  examples = []
  first_line = true

  # each_line instead of String#each, which only exists in Ruby 1.8.
  Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |line|
    if first_line
      first_line = false
      if line =~ /^\s*#/
        formats = line.chomp.sub(/^[\s#]+/,'').split(/\t/).collect { |n| n.strip }
      end
      # Without examples only the header matters, whether it was found or not.
      return formats unless want_examples
      next
    end

    fields = line.chomp.split(/\t/)
    if fields.select { |name| name =~ /\w/ }.length > examples.length
      examples = fields.collect { |name| name.split(/\|/).first }
    end
  end

  formats.zip(examples)
end
153
+
154
# Returns the position of the format +id_name+ in the array +supported_ids+.
# Surrounding whitespace is ignored, and so is case unless the option
# :case_sensitive is set. If several formats match, the position of the last
# one is returned; if none does, the result is 0.
def self.id_position(supported_ids, id_name, options = {})
  wanted = id_name.strip
  position = 0

  supported_ids.each_with_index do |candidate, index|
    name = candidate.strip
    exact = name == wanted
    loose = !options[:case_sensitive] && name.downcase == wanted.downcase
    position = index if exact || loose
  end

  position
end
163
+
164
# Builds an index over the 'identifiers' file of the organism +org+ with
# Index.index and returns it.
#
# The options :native (name of the format the index translates to) and
# :other (array with the names of the formats it translates from) are turned
# into column positions with id_position and handed to Index.index as
# :native and :extra. When only one of the two is given, :native defaults to
# the first column and :other to every column but the native one. When
# neither is given the options are passed on as they are. :case_sensitive
# defaults to false; any other option is passed on untouched.
#
# The hash given by the caller is not modified.
def self.id_index(org, option = {})
  option = option.dup
  native = option[:native]
  other = option[:other]
  option[:case_sensitive] = false if option[:case_sensitive].nil?

  identifiers = File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")

  return Index.index(identifiers, option) if native.nil? && other.nil?

  supported = Organism.supported_ids(org)

  first = native ? id_position(supported, native, option) : 0

  rest =
    if other
      other.collect { |name| id_position(supported, name, option) }
    else
      (0..supported.length - 1).to_a - [first]
    end

  option[:native] = first
  option[:extra] = rest

  Index.index(identifiers, option)
end
195
+
196
+ end
197
+