rbbt 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
|
4
|
+
|
5
|
+
# Offers methods to help deal with the files distributed for the BioCreative
|
6
|
+
# competition related to Gene Mention and Normalization.
|
7
|
+
module Biocreative

  # Reads the files of a BC2GM dataset and returns a hash keyed by entry
  # code. Each value is a hash with the sentence under :text (taken from
  # "<dataset>.in") and the list of annotated mentions under :mentions
  # (taken from "GENE.eval").
  #
  # Lines of the ".in" file that do not look like "<code> <text>" and blank
  # lines of "GENE.eval" are skipped.
  def self.BC2GM(dataset)
    data = {}

    Open.read(File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/#{dataset}.in")).each_line do |line|
      match = line.chomp.match(/(.*?) (.*)/)
      next if match.nil? # no space in the line: nothing to split into code/text

      code, text = match.values_at(1, 2)
      data[code] = { :text => text }
    end

    Open.read(File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval")).each_line do |line|
      # Fields are "code|position|mention"; the position field is not used here
      code, _pos, mention = line.chomp.split(/\|/)
      next if code.nil?

      data[code] ||= {}
      data[code][:mentions] ||= []
      data[code][:mentions].push(mention)
    end

    data
  end

  # Given a string of text and a string with a mention, returns an array of
  # [start, last] pairs, one per occurrence of the mention, in the format used
  # in the evaluation: offsets count only non-whitespace characters.
  #
  # The mention is matched loosely: every run of non-word characters in it may
  # match any run (possibly empty) of non-word characters in the text.
  # Mentions without any word character cannot be located and yield [].
  def self.position(text, mention)
    # Without a word character the pattern below could match the empty string
    # forever, so there is nothing meaningful to look for.
    return [] unless mention =~ /\w/

    re = mention.gsub(/\W+/, ' ')
    re = Regexp.quote(re)
    # Regexp.quote escapes spaces as "\ "; turn each one into a flexible gap
    re = re.gsub(/\\ /) { '\W*' }
    re = '\(?' + re if mention =~ /\)/
    re = re + '\)?' if mention =~ /\(/
    re = "'?" + re + "'?" if mention =~ /'/

    # /m makes "." match newlines, so multi-line text keeps correct offsets
    pattern = /(.*?)(#{re})(.*)/m

    positions = []
    offset = 0
    rest = text
    while (match = pattern.match(rest))
      pre, found, post = match.values_at(1, 2, 3)

      start = offset + pre.gsub(/\s/, '').length
      last = start + found.gsub(/\s/, '').length - 1

      positions << [start, last]

      offset = last + 1
      rest = post
    end

    positions
  end

  # Runs the evaluation perl script (alt_eval.perl) for the given dataset on
  # the +results+ file, redirecting its output to +outfile+. Returns whatever
  # Kernel#system returns.
  #
  # NOTE(review): the arguments are interpolated into a shell command without
  # escaping; callers should only pass trusted paths.
  def self.BC2GM_eval(results, dataset, outfile)
    cmd = [
      '/usr/bin/perl',
      File.join(Rbbt.datadir, 'biocreative/BC2GM/alt_eval.perl'),
      '-gene',
      File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/GENE.eval"),
      '-altgene',
      File.join(Rbbt.datadir, "biocreative/BC2GM/#{dataset}/ALTGENE.eval"),
      "#{results} > #{outfile}"
    ].join(' ')

    system cmd
  end

end
|
74
|
+
|
75
|
+
|
@@ -0,0 +1,106 @@
|
|
1
|
+
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt'
|
4
|
+
|
5
|
+
# This module interacts with BioMart. It performs queries to BioMart and
|
6
|
+
# synthesises a hash with the results. Note that this module connects to the
|
7
|
+
# online BioMart WS using the Open in 'rbbt/util/open' module which offers
|
8
|
+
# caching by default. To obtain up to date results you may need to clear the
|
9
|
+
# cache from previous queries.
|
10
|
+
module BioMart

  # Raised when the BioMart web service answers with a "Query ERROR:" message.
  class BioMart::QueryError < StandardError; end

  # Template of the XML query sent to BioMart. The <!--...--> markers are
  # replaced by BioMart.get with the dataset, filters and attributes.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single BioMart request for +main+ plus the attributes in
  # +attrs+ and merges the rows into +data+ (a new hash when nil), which is
  # returned. Used by BioMart.query, which splits long attribute lists.
  #
  # When +filters+ is nil a "with_<main>" filter is added. Raises
  # BioMart::QueryError if the response contains "Query ERROR:".
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml = filters.collect { |name| "<Filter name = \"#{ name }\" excluded = \"0\"/>" }.join("\n")
    attr_xml   = attrs.collect { |name| "<Attribute name = \"#{ name }\"/>" }.join("\n")

    # Block form of sub! so the inserted text is never parsed for
    # backslash escapes
    query = @@biomart_query_xml.dup
    query.sub!(/<!--DATABASE-->/) { database }
    query.sub!(/<!--FILTERS-->/) { filter_xml }
    query.sub!(/<!--MAIN-->/) { "<Attribute name = \"#{main}\" />" }
    query.sub!(/<!--ATTRIBUTES-->/) { attr_xml }

    rows = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/, ' '))
    raise BioMart::QueryError, rows if rows =~ /Query ERROR:/

    rows.each_line do |l|
      parts = l.chomp.split(/\t/)
      key = parts.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |name|
        value = parts.shift
        # Always create the list so every requested attribute has an entry
        data[key][name] ||= []
        next if value.nil?
        data[key][name] << value
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The +data+ parameter at the end is used internally to
  # incrementally build the result, due to a limitation of the BioMart WS
  # that only allows 3 external arguments; users normally should leave it
  # unspecified or nil.
  #
  # The result is a hash where the keys are the different values of the main
  # attribute, and each value is a hash with every other attribute as key and
  # an array with all its values (for a given value of the main attribute
  # there may be more than one value for another attribute). If +filters+ is
  # left as nil a filter is added to the BioMart query to remove results with
  # the main attribute empty; this may cause an error if the BioMart WS does
  # not allow filtering with that attribute.
  #
  # Note that no request is made when +attrs+ is empty: +data+ is returned
  # as it was given (or as an empty hash).
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data  ||= {}

    # Two extra attributes per request, plus the main one
    attrs.each_slice(2) do |chunk|
      data = get(database, main, chunk, filters, data)
    end

    data
  end

end
|
106
|
+
|
@@ -0,0 +1,211 @@
|
|
1
|
+
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/util/open'
|
4
|
+
require 'rbbt/util/tmpfile'
|
5
|
+
require 'rbbt/util/filecache'
|
6
|
+
require 'rbbt/bow/bow.rb'
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
|
10
|
+
# This module is used to parse and extract information from the
|
11
|
+
# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
|
12
|
+
# Both need to be downloaded and accessible for Rbbt, which is done as
|
13
|
+
# part of a normal installation.
|
14
|
+
module Entrez

  # Raised when a required Entrez data file has not been installed.
  class NoFileError < StandardError; end

  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
  # where each key is the entrez id of a gene, and the value is an array
  # of possible synonyms in other databases. Is mostly used to translate
  # entrez ids to the native database id of the organism. The parameter
  # +native+ specifies the position of the field containing the synonyms,
  # the fifth by default; +fix+ and +check+ are Procs used, if present, to
  # pre-process each id and to check if it should be kept.
  #
  # Raises NoFileError if the gene_info file is missing.
  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
    gene_info = File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
    raise NoFileError, "Install the Entrez gene_info file" unless File.exist? gene_info

    native ||= 5

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect { |t| t.to_s }

    lexicon = {}
    tmp = TmpFile.tmp_file("entrez-")
    begin
      # Pre-filter the file with grep, keeping only the requested taxonomies
      system "cat '#{gene_info}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"

      File.open(tmp) do |file|
        file.each_line do |l|
          parts = l.chomp.split(/\t/)
          synonyms = parts[native]
          # '-' marks an empty field; nil means the line has too few columns
          next if synonyms.nil? || synonyms == '-'

          entrez = parts[1]
          synonyms.split(/\|/).each do |id|
            id = fix.call(id) if fix
            next if check && !check.call(id)

            lexicon[entrez] ||= []
            lexicon[entrez] << id
          end
        end
      end
    ensure
      # Remove the temporary file even if parsing failed
      FileUtils.rm tmp if File.exist? tmp
    end

    lexicon
  end

  # For a given taxonomy, or set of taxonomies, it returns a hash with
  # genes as keys and arrays of related PubMed ids as values, as
  # extracted from the gene2pubmed file from Entrez Gene.
  #
  # Raises NoFileError if the gene2pubmed file is missing.
  def self.entrez2pubmed(taxs)
    gene2pubmed = File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exist? gene2pubmed

    taxs = [taxs] unless taxs.is_a?(Array)
    taxs = taxs.collect { |t| t.to_s }

    tmp = TmpFile.tmp_file("entrez-")
    begin
      system "cat '#{gene2pubmed}' |grep '^\\(#{taxs.join('\\|')}\\)[[:space:]]' > #{tmp}"

      data = Open.to_hash(tmp, :native => 1, :extra => 2)
      data.each { |code, value_lists| value_lists.flatten! }
    ensure
      FileUtils.rm tmp if File.exist? tmp
    end

    data
  end

  # This class parses an xml containing the information for a particular
  # gene as served by Entrez Gene, and holds some of its information.
  #
  # Every attribute is the raw result of String#scan: an array with one
  # single-element array per matching tag. The patterns do not span lines,
  # so a tag is only picked up when it opens and closes on the same line.
  # All attributes are nil when the object is built from a nil xml.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = xml.scan(/<Org-ref_taxname>(.*)<\/Org-ref_taxname>/)
      @symbol      = xml.scan(/<Gene-ref_locus>(.*)<\/Gene-ref_locus>/)
      @description = xml.scan(/<Gene-ref_desc>(.*)<\/Gene-ref_desc>/)
      @aka         = xml.scan(/<Gene-ref_syn_E>(.*)<\/Gene-ref_syn_E>/)
      @protnames   = xml.scan(/<Prot-ref_name_E>(.*)<\/Prot-ref_name_E>/)
      @summary     = xml.scan(/<Entrezgene_summary>(.*)<\/Entrezgene_summary>/)
      @comentaries = xml.scan(/<Gene-commentary_text>(.*)<\/Gene-commentary_text>/)
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary, separated by ". ". Returns an empty string when nothing
    # was parsed.
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.compact.join(". ")
    end
  end

  # Time of the last online request and minimum number of seconds to wait
  # between two consecutive requests.
  @@last = Time.now
  @@entrez_lag = 1

  # Downloads the xml for one gene id or an array of them from the Entrez
  # efetch service, waiting if the previous request was made less than
  # @@entrez_lag seconds ago. Intended for internal use by get_gene.
  #
  # For a single id it returns the xml of the first gene in the response
  # (nil if there is none); for an array it returns a hash from id to xml.
  #
  # NOTE(review): in the array case genes are paired with the requested ids
  # by position, which presumably assumes the service returns one record per
  # id, in request order - confirm.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    diff = Time.now - @@last
    sleep @@entrez_lag - diff unless diff > @@entrez_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten

    return genes.first unless geneids.is_a? Array

    list = {}
    genes.each_with_index do |gene, i|
      list[geneids[i]] = gene
    end
    list
  end

  # Build a file name for a gene based on the id: the id prefixed by 'gene-'
  # and followed by a '.xml' extension, passed through FileCache.clean_path.
  def self.gene_filename(id)
    FileCache.clean_path('gene-' + id.to_s + '.xml')
  end

  # Returns a Gene object for the given Entrez Gene id. If an array of
  # ids is given instead, a hash from id to Gene is returned (nil ids are
  # ignored). This method uses the caching facilities from Rbbt: only genes
  # not found in the FileCache are downloaded, and they are cached afterwards.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each do |p|
        next if p.nil?
        filename = gene_filename p
        if File.exist? FileCache.path(filename)
          list[p] = Gene.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      end

      return list unless missing.any?
      genes = get_online(missing)

      genes.each do |p, xml|
        filename = gene_filename p
        FileCache.add_file(filename, xml) unless File.exist? FileCache.path(filename)
        list[p] = Gene.new(xml)
      end

      list
    else
      filename = gene_filename geneid

      if File.exist? FileCache.path(filename)
        Gene.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(geneid)
        FileCache.add_file(filename, xml)

        Gene.new(xml)
      end
    end
  end

  # Measures the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene. The +gene+ may be a
  # gene identifier (String or Integer) or a Gene class instance; any other
  # value gives 0.
  #
  # The result is the number of shared words divided by the sum of the
  # number of distinct words on each side, or 0 if either side has no words.
  def self.gene_text_similarity(gene, text)
    case gene
    when Entrez::Gene
      gene_text = gene.text
    when String, Integer
      gene_text = get_gene(gene).text
    else
      return 0
    end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
|
3
|
+
|
4
|
+
# This module holds helper methods to deal with the Gene Ontology files. Right
|
5
|
+
# now all it does is provide a translation from id to the actual names.
|
6
|
+
module GO
  # Hash from term id to a hash with the fields of that term; nil until
  # GO.init has been called.
  @@info = nil

  # This method needs to be called before any translations can be made; it is
  # called automatically the first time the id2name method is called. It loads
  # the gene_ontology.obo file and extracts all the fields of every [Term]
  # stanza, although right now only the name field is used.
  #
  # Stanzas of any other kind (for example [Typedef]) and the file header are
  # ignored. When a field appears several times in a term the last value wins.
  def self.init
    @@info = {}
    obo = File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo'))

    obo.split(/^\[Term\]/).each do |chunk|
      # A chunk runs up to the next [Term]; drop any other stanza that
      # follows so its fields do not overwrite those of the term
      stanza = chunk.split(/^\[\w+\]/).first || ''

      term_info = {}
      stanza.each_line do |l|
        next unless l =~ /:/
        key, value = l.chomp.split(':', 2)
        term_info[key.strip] = value.strip
      end

      # The file header (and empty chunks) have no id
      next if term_info['id'].nil?
      @@info[term_info['id']] = term_info
    end
  end

  # Translates a term id into its name. For a single id it returns the name,
  # or the string "Name not found" if the id is unknown. For an array of ids
  # it returns an array with the names, with nil for the unknown ones.
  def self.id2name(id)
    self.init unless @@info

    if id.kind_of? Array
      @@info.values_at(*id).collect { |i| i['name'] if i }
    else
      return "Name not found" unless @@info[id]
      @@info[id]['name']
    end
  end

end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/ner/rnorm'
|
4
|
+
require 'rbbt/util/open'
|
5
|
+
|
6
|
+
# Helper methods to access the per-organism data files that live under
# "organisms/<org>/" in the Rbbt data directory.
module Organism

  class OrganismNotProcessedError < StandardError; end

  # Returns the codes of all the organisms that have a "name" file in the
  # organisms directory.
  def self.all
    Dir.glob(File.join(Rbbt.datadir, 'organisms', '*', 'name')).collect { |f| File.basename(File.dirname(f)) }
  end

  # Returns the content of the "name" file of the organism.
  def self.name(org)
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
  end

  # Index from organism name (stripped and downcased) to organism code,
  # built when this file is loaded.
  NAME2ORG = {}
  Organism::all.each{|org|
    name = Organism.name(org).strip.downcase
    NAME2ORG[name] = org
  }

  # Returns the organism code for a name, ignoring case and surrounding
  # whitespace, or nil if the name is unknown.
  def self.name2org(name)
    NAME2ORG[name.strip.downcase]
  end

  # Returns a hash from every identifier found in the "identifiers" file
  # (downcased) to the list of formats (column names) in which it appears.
  def self.id_formats(org)
    id_types = {}
    formats = supported_ids(org)

    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line do |l|
      # chomp so that ids in the last column do not keep the line terminator
      ids_per_type = l.chomp.split(/\t/)
      formats.zip(ids_per_type).each do |format, ids|
        next if ids.nil? # the line has fewer columns than formats

        ids.split(/\|/).each do |id|
          next if id.nil? || id == ""
          key = id.downcase
          id_types[key] ||= []
          id_types[key] << format unless id_types[key].include? format
        end
      end
    end

    id_types
  end

  # Guesses the format of a list of identifiers. +formats+ is either the
  # hash returned by id_formats or an organism code, in which case that
  # hash is computed. Returns a [format, count] pair for the format that
  # matches most of the (distinct, downcased) query ids, considering only
  # formats that match more than a tenth of them, or nil if there is none.
  def self.guessIdFormat(formats, query)
    query = query.compact.collect { |gene| gene.downcase }.uniq
    formats = id_formats(formats) if String === formats

    return nil if formats.values.empty?
    values = formats.values_at(*query)
    return nil if values.empty?

    format_count = {}
    values.compact.collect { |types| types.uniq }.flatten.each do |f|
      format_count[f] ||= 0
      format_count[f] += 1
    end

    return nil if format_count.values.empty?
    format_count.select { |k, v| v > (query.length / 10) }.sort { |a, b| b[1] <=> a[1] }.first
  end

  # Returns a named entity recognizer of the given +type+ (:abner, :banner
  # or :rner). For :rner the model is options[:model], or the model for the
  # organism if that file exists, or the 'BC2' model otherwise. Raises a
  # RuntimeError for any other type. The libraries are required lazily.
  def self.ner(org, type=:abner, options = {})
    case type.to_sym
    when :abner
      require 'rbbt/ner/abner'
      return Abner.new
    when :banner
      require 'rbbt/ner/banner'
      return Banner.new
    when :rner
      require 'rbbt/ner/rner'
      org_model = File.join(Rbbt.datadir,"ner/model/#{ org }")
      model = options[:model]
      model ||= org_model if File.exist? org_model
      model ||= File.join(Rbbt.datadir,'ner/model/BC2')
      return NER.new(model)
    else
      raise "Ner type (#{ type }) unknown"
    end
  end

  # Returns a Normalizer for the organism built on its lexicon file. If
  # +to_entrez+ is nil an index from the first supported id format to
  # 'Entrez Gene ID' is computed. The organism token configuration file is
  # used only if it exists.
  def self.norm(org, to_entrez = nil)
    if to_entrez.nil?
      to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
    end

    token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
    token_file = nil unless File.exist? token_file

    Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
  end

  # Loads the lexicon file of the organism with Open.to_hash. Unless
  # options[:sep] is given, fields are split on tabs and '|'. The hash
  # passed by the caller is not modified.
  def self.lexicon(org, options = {})
    options = options.dup
    options[:sep] = "\t|\\|" unless options[:sep]
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
  end

  # Returns a hash from gene to the list of GO terms found for it in the
  # "gene.go" file (one tab separated gene/term pair per line). Lines
  # without both fields are skipped.
  def self.goterms(org)
    goterms = {}
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go")).each_line do |l|
      gene, go = l.chomp.split(/\t/)
      next if gene.nil? || go.nil?
      goterms[gene.strip] ||= []
      goterms[gene.strip] << go.strip
    end
    goterms
  end

  # Returns all the runs of digits found in the "all.pmid" file, as strings.
  def self.literature(org)
    Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).each_line.collect { |l| l.chomp.scan(/\d+/) }.flatten
  end

  # Loads the "gene.pmid" file of the organism with Open.to_hash.
  def self.gene_literature(org)
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
  end

  # Loads the "gene_go.pmid" file of the organism with Open.to_hash.
  def self.gene_literature_go(org)
    Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
  end

  # Returns the names of the id formats supported by the organism, read
  # from the first line of the "identifiers" file, which is only used if it
  # starts with '#'; otherwise the list is empty.
  #
  # With options[:examples] it returns [format, example] pairs instead. The
  # examples are the first id of each column of the line with the most
  # non-empty columns; the first line of the file is never used for examples.
  def self.supported_ids(org, options = {})
    lines = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line.to_a

    header = lines.first
    formats = []
    if header =~ /^\s*#/
      formats = header.chomp.sub(/^[\s#]+/,'').split(/\t/).collect { |n| n.strip }
    end

    return formats unless options[:examples]

    examples = []
    (lines[1..-1] || []).each do |l|
      fields = l.chomp.split(/\t/)
      if fields.select { |name| name && name =~ /\w/ }.length > examples.length
        examples = fields.collect { |name| name.split(/\|/).first }
      end
    end

    formats.zip(examples)
  end

  # Returns the position of +id_name+ in the +supported_ids+ list. The
  # comparison ignores surrounding whitespace and, unless
  # options[:case_sensitive] is set, case. If several entries match, the
  # last one wins. Note that 0 is returned when nothing matches.
  def self.id_position(supported_ids, id_name, options = {})
    pos = 0
    supported_ids.each_with_index do |id, i|
      if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
        pos = i
      end
    end
    pos
  end

  # Builds an index over the "identifiers" file with Index.index. The
  # :native option names the target format and :other lists the formats to
  # index from; both are translated into column positions. When :native is
  # missing the first column is used, and when :other is missing every other
  # column is used. If neither is given the options are passed on as they
  # are. :case_sensitive defaults to false. The hash passed by the caller
  # is not modified.
  def self.id_index(org, option = {})
    option = option.dup
    native = option[:native]
    other = option[:other]
    option[:case_sensitive] = false if option[:case_sensitive].nil?

    identifiers = File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")
    return Index.index(identifiers, option) if native.nil? && other.nil?

    supported = Organism.supported_ids(org)

    first = native ? id_position(supported, native, option) : 0

    rest = if other
             other.collect { |name| id_position(supported, name, option) }
           else
             (0..supported.length - 1).to_a - [first]
           end

    option[:native] = first
    option[:extra] = rest
    Index.index(identifiers, option)
  end

end
|
197
|
+
|