rbbt 1.1.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
data/lib/rbbt/sources/biomart.rb
DELETED
@@ -1,105 +0,0 @@
|
|
1
|
-
require 'rbbt/util/open'
|
2
|
-
require 'rbbt'
|
3
|
-
|
4
|
-
# This module interacts with BioMart. It performs queries to BioMart and
# synthesises a hash with the results. Note that this module connects to the
# online BioMart WS using the Open module from 'rbbt/util/open', which offers
# caching by default. To obtain up-to-date results you may need to clear the
# cache from previous queries.
module BioMart

  # Raised when the BioMart WS answers with a "Query ERROR:" message. The
  # exception message is the complete response.
  class QueryError < StandardError; end

  # Number of additional attributes sent in each request; together with the
  # main attribute this gives the 3 attributes per query that BioMart.query
  # works around.
  ATTRIBUTES_PER_QUERY = 2

  # Template of the XML query. The <!--...--> markers are filled in by
  # BioMart.get.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single request asking for the +main+ attribute plus the
  # attributes in +attrs+, and merges the answer into +data+ (a new hash if
  # nil). When +filters+ is nil a "with_<main>" filter is used. Returns
  # +data+. Raises BioMart::QueryError if the response contains
  # "Query ERROR:".
  #
  # Users normally call BioMart.query, which splits long attribute lists into
  # several calls to this method.
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.collect { |name| "<Filter name = \"#{ name }\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.collect { |name| "<Attribute name = \"#{ name }\"/>" }.join("\n")

    # The block form of sub inserts the values literally; with a replacement
    # string, backslashes in the values would be read as back-references.
    query = @@biomart_query_xml.clone
    query = query.sub(/<!--DATABASE-->/)   { database }
    query = query.sub(/<!--FILTERS-->/)    { filter_xml }
    query = query.sub(/<!--MAIN-->/)       { "<Attribute name = \"#{main}\" />" }
    query = query.sub(/<!--ATTRIBUTES-->/) { attribute_xml }

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    # Each line is: value of the main attribute, then one column per
    # requested attribute, in the order requested.
    response.each_line do |line|
      parts = line.chomp.split(/\t/)
      key   = parts.shift
      next if key.nil? || key.empty?

      data[key] ||= {}
      attrs.each do |name|
        value = parts.shift
        data[key][name] ||= []
        data[key][name] << value unless value.nil?
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The data parameter at the end is used internally to
  # incrementally build the result, due to a limitation of the BioMart WS
  # that only allows 3 external arguments; users normally should leave it
  # unspecified or nil. The result is a hash, where the keys are the different
  # values for the main attribute, and the value is a hash with every other
  # attribute as key, and as value an array with all possible values (note
  # that for a given value of the main attribute, there may be more than one
  # value for another attribute). If filters is left as nil it adds a filter to
  # the BioMart query to remove results with the main attribute empty; this may
  # cause an error if the BioMart WS does not allow filtering with that
  # attribute.
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data  ||= {}

    chunks = attrs.each_slice(ATTRIBUTES_PER_QUERY).to_a
    # Without additional attributes the main attribute still has to be
    # queried once, otherwise the result would always be empty.
    chunks << [] if chunks.empty?

    chunks.each do |chunk|
      data = get(database, main, chunk, filters, data)
    end

    data
  end

end
|
105
|
-
|
data/lib/rbbt/sources/entrez.rb
DELETED
@@ -1,211 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/util/tmpfile'
|
4
|
-
require 'rbbt/util/filecache'
|
5
|
-
require 'rbbt/bow/bow.rb'
|
6
|
-
require 'set'
|
7
|
-
|
8
|
-
|
9
|
-
# This module is used to parse and extract information from the
# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
# Both need to be downloaded and accessible for Rbbt, which is done as
# part of a normal installation.
module Entrez

  # Raised when one of the Entrez files is not found under Rbbt.datadir.
  class NoFileError < StandardError; end

  # Yields the lines of +file+ whose first column is one of the taxonomy
  # ids in +taxs+ (a single id or an array of ids). Filtering is done in
  # Ruby on a literal prefix, so no shell is involved and the ids are never
  # interpreted as patterns.
  def self.each_taxonomy_line(file, taxs)
    taxs = [taxs] unless taxs.is_a?(Array)
    prefixes = taxs.collect { |t| ["#{ t }\t", "#{ t } "] }.flatten

    File.open(file) do |io|
      io.each_line { |line| yield line if line.start_with?(*prefixes) }
    end
  end
  private_class_method :each_taxonomy_line

  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
  # where each key is the entrez id of a gene, and the value is an array
  # of possible synonyms in other databases. It is mostly used to translate
  # entrez ids to the native database id of the organism. The parameter
  # +native+ specifies the position of the column containing the synonyms
  # (5 by default); +fix+ and +check+ are Procs used, if present, to
  # pre-process each id and to decide whether it should be kept.
  #
  # Raises NoFileError if the gene_info file is not installed.
  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
    gene_info = File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
    raise NoFileError, "Install the Entrez gene_info file" unless File.exist? gene_info

    native ||= 5

    lexicon = {}
    each_taxonomy_line(gene_info, taxs) do |line|
      parts  = line.chomp.split(/\t/)
      entrez = parts[1]
      ids    = parts[native]
      # '-' marks an empty column; nil means the line is too short
      next if ids.nil? || ids == '-'

      ids.split(/\|/).each do |id|
        id = fix.call(id) if fix
        next if check && !check.call(id)

        lexicon[entrez] ||= []
        lexicon[entrez] << id
      end
    end

    lexicon
  end

  # For a given taxonomy, or set of taxonomies, it returns a hash with
  # genes as keys and arrays of related PubMed ids as values, as
  # extracted from the gene2pubmed file from Entrez Gene.
  #
  # Raises NoFileError if the gene2pubmed file is not installed.
  def self.entrez2pubmed(taxs)
    gene2pubmed = File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exist? gene2pubmed

    tmp = TmpFile.tmp_file("entrez-")
    begin
      # Open.to_hash works on files, so the selected lines go to a temporary one
      File.open(tmp, 'w') do |out|
        each_taxonomy_line(gene2pubmed, taxs) { |line| out << line }
      end

      Open.to_hash(tmp, :native => 1, :extra => 2).each { |code, value_lists| value_lists.flatten! }
    ensure
      File.delete(tmp) if File.exist? tmp
    end
  end

  # This class parses an xml containing the information for a particular
  # gene as served by Entrez Gene, and holds some of its information. Every
  # attribute is the result of String#scan: an array with one single-element
  # array per occurrence of the corresponding tag. All attributes are nil
  # when the object is built from a nil xml.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = tag_contents(xml, 'Org-ref_taxname')
      @symbol      = tag_contents(xml, 'Gene-ref_locus')
      @description = tag_contents(xml, 'Gene-ref_desc')
      @aka         = tag_contents(xml, 'Gene-ref_syn_E')
      @protnames   = tag_contents(xml, 'Prot-ref_name_E')
      @summary     = tag_contents(xml, 'Entrezgene_summary')
      @comentaries = tag_contents(xml, 'Gene-commentary_text')
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary. The organism and the commentaries are not included.
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.compact.join(". ")
    end

    private

    # Content of every <tag>...</tag> element in +xml+. The match is
    # non-greedy so that several elements on one line are kept apart, and
    # uses /m so that the content may span lines. (In Ruby the /s flag is an
    # encoding flag, not "dot matches newline", and must not be used here.)
    def tag_contents(xml, tag)
      name = Regexp.escape(tag)
      xml.scan(/<#{ name }>(.*?)<\/#{ name }>/m)
    end
  end

  # Time of the last online request and minimum number of seconds between
  # two requests.
  @@last = Time.now
  @@entrez_lag = 1

  # Downloads from Entrez the xml for +geneids+. For a single id it returns
  # the xml of the first gene in the response (nil if there is none); for an
  # array of ids it returns a hash from each requested id to the xml of its
  # gene. Requests are spaced at least @@entrez_lag seconds apart and are
  # never cached by Open.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    diff = Time.now - @@last
    sleep @@entrez_lag - diff unless diff > @@entrez_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten

    return genes.first unless geneids.is_a? Array

    # Requested ids by their string form, so that the hash keys are the very
    # objects the caller passed in (Integer or String)
    requested = {}
    geneids.each { |id| requested[id.to_s] = id }

    list = {}
    genes.each_with_index do |gene, i|
      # Use the id stated in the record itself: pairing by position alone
      # mislabels genes when a record is missing or arrives out of order.
      # Fall back to the position when the record id was not requested.
      found = gene[/<Gene-track_geneid>\s*(.*?)\s*<\/Gene-track_geneid>/m, 1]
      key   = requested.key?(found) ? requested[found] : geneids[i]
      list[key] = gene
    end

    list
  end

  # Build a file name for a gene based on the id. Prefix the id by 'gene-',
  # add a '.xml' extension, and clean the result with FileCache.clean_path.
  def self.gene_filename(id)
    FileCache.clean_path('gene-' + id.to_s + '.xml')
  end

  # Returns a Gene object for the given Entrez Gene id. If an array of
  # ids is given instead, a hash is returned. This method uses the
  # caching facilities from Rbbt.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    if Array === geneid
      missing = []
      list = {}

      geneid.each do |id|
        next if id.nil?
        path = FileCache.path(gene_filename(id))
        if File.exist? path
          list[id] = Gene.new(Open.read(path))
        else
          missing << id
        end
      end

      return list unless missing.any?

      get_online(missing).each do |id, xml|
        filename = gene_filename id
        FileCache.add_file(filename, xml) unless File.exist? FileCache.path(filename)
        list[id] = Gene.new(xml)
      end

      list
    else
      filename = gene_filename geneid
      path     = FileCache.path(filename)

      return Gene.new(Open.read(path)) if File.exist? path

      xml = get_online(geneid)
      FileCache.add_file(filename, xml)

      Gene.new(xml)
    end
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene, divided by the sum of the
  # number of distinct words on each side. The +gene+ may be a gene
  # identifier (String or Integer) or a Gene instance; anything else gives 0.
  def self.gene_text_similarity(gene, text)
    gene_text =
      case gene
      when Entrez::Gene
        gene.text
      when String, Integer
        get_gene(gene).text
      else
        return 0
      end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
|
data/lib/rbbt/sources/go.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
|
3
|
-
|
4
|
-
# This module holds helper methods to deal with the Gene Ontology files. Right
# now all it does is provide a translation from ids to the actual names.
module GO
  # Hash from term id to a hash with the fields of that term; nil until
  # GO.init has been run.
  @@info = nil

  # This method needs to be called before any translations can be made; it is
  # called automatically the first time the id2name method is called. It loads
  # the gene_ontology.obo file and extracts all the fields of every [Term]
  # stanza, although right now only the name field is used. When a field
  # appears several times in a term the last value is kept.
  def self.init
    obo = File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo'))

    info = {}
    term = nil # fields of the [Term] stanza being read, nil outside of one
    obo.each_line do |line|
      line = line.strip

      # A "[Something]" line opens a new stanza. Only [Term] stanzas are
      # kept; the fields of others, such as [Typedef], must not leak into
      # the preceding term.
      header = line.match(/\A\[(.*)\]\z/)
      if header
        term = (header[1] == 'Term') ? {} : nil
        next
      end

      next if term.nil?

      field = line.match(/(.*?):(.*)/)
      next if field.nil?

      key   = field[1].strip
      value = field[2].strip
      term[key] = value
      info[value] = term if key == 'id'
    end

    # Assigned only after a complete read, so that a failure does not leave
    # an empty cache behind
    @@info = info
  end

  # Translates a GO id into the name of the term. For an array of ids it
  # returns an array with the names, with nil for the ids that are not
  # known; for a single unknown id it returns "Name not found".
  def self.id2name(id)
    self.init unless @@info

    if id.kind_of? Array
      @@info.values_at(*id).collect { |term| term['name'] if term }
    else
      return "Name not found" unless @@info[id]
      @@info[id]['name']
    end
  end

end
|
data/lib/rbbt/sources/organism.rb
DELETED
@@ -1,245 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/util/index'
|
4
|
-
|
5
|
-
# This module contains some Organism centric functionalities. Each organism is
# identified by a keyword.
module Organism

  # Raised when trying to access information for an organism that has not been
  # prepared already.
  class OrganismNotProcessedError < StandardError; end

  # Return the list of all supported organisms. The prepared flag is used to
  # show only those that have been prepared, that is, those that have an
  # 'identifiers' file.
  def self.all(prepared = true)
    base = File.join(Rbbt.datadir, 'organisms')
    if prepared
      Dir.glob(File.join(base, '*', 'identifiers')).collect { |f| File.basename(File.dirname(f)) }
    else
      Dir.glob(File.join(base, '*')).select { |f| File.directory? f }.collect { |f| File.basename(f) }
    end
  end

  # Return the complete name of an organism. The org parameter is the organism
  # keyword. Raises OrganismNotProcessedError if the organism has no 'name'
  # file.
  #
  # Note that this shadows Module#name for this module.
  def self.name(org)
    file = File.join(Rbbt.datadir, "organisms/#{ org }/name")
    raise OrganismNotProcessedError, "Missing 'name' file" unless File.exist? file
    Open.read(file)
  end

  # Hash linking all the organism long names with their keywords in Rbbt. It is
  # the inverse of the name method. It is filled when this file is loaded.
  NAME2ORG = {}
  Organism::all.each do |org|
    name = Organism.name(org).strip.downcase
    NAME2ORG[name] = org
  end

  # Return the keyword associated with an organism name.
  def self.name2org(name)
    NAME2ORG[name.strip.downcase]
  end

  # FIXME: The NER related stuff is harder to install, that's why we hide the
  # requires next to where they are needed, next to options

  # Return a NER object which could be of RNER, Abner or Banner class; this is
  # selected using the type parameter. For :rner the model is options[:model],
  # or the model for the organism if there is one, or the BC2 model.
  def self.ner(org, type=:rner, options = {})
    case type.to_sym
    when :abner
      require 'rbbt/ner/abner'
      return Abner.new
    when :banner
      require 'rbbt/ner/banner'
      return Banner.new
    when :rner
      require 'rbbt/ner/rner'
      organism_model = File.join(Rbbt.datadir, "ner/model/#{ org }")
      model   = options[:model]
      model ||= organism_model if File.exist? organism_model
      model ||= File.join(Rbbt.datadir, 'ner/model/BC2')
      return NER.new(model)
    else
      raise "Ner type (#{ type }) unknown"
    end
  end

  # Return a normalization object. Unless an index is given in to_entrez, one
  # is built from 'Entrez Gene ID' to the first supported id format.
  def self.norm(org, to_entrez = nil)
    require 'rbbt/ner/rnorm'
    if to_entrez.nil?
      to_entrez = id_index(org, :native => 'Entrez Gene ID', :other => [supported_ids(org).first])
    end

    # The token configuration file is optional
    token_file = File.join(Rbbt.datadir, 'norm', 'config', org.to_s + '.config')
    token_file = nil unless File.exist? token_file

    Normalizer.new(File.join(Rbbt.datadir, "organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
  end

  # Returns a hash with the names associated with each gene id. The ids are
  # in Rbbt native format for that organism.
  def self.lexicon(org, options = {})
    options = {:sep => "\t|\\|", :flatten => true}.merge(options)
    Open.to_hash(File.join(Rbbt.datadir, "organisms/#{ org }/lexicon"), options)
  end

  # Returns a hash with the list of go terms for each gene id. Gene ids are in
  # Rbbt native format for that organism.
  def self.goterms(org)
    goterms = {}
    Open.read(File.join(Rbbt.datadir, "organisms/#{ org }/gene.go")).each_line do |l|
      gene, go = l.chomp.split(/\t/)
      next if gene.nil? || go.nil? # blank line or line without a go term

      goterms[gene.strip] ||= []
      goterms[gene.strip] << go.strip
    end
    goterms
  end

  # Return list of PubMed ids associated to the organism. Determined using a
  # PubMed query with the name of the organism.
  def self.literature(org)
    Open.read(File.join(Rbbt.datadir, "organisms/#{ org }/all.pmid")).scan(/\d+/)
  end

  # Return hash that associates genes to a list of PubMed ids.
  def self.gene_literature(org)
    Open.to_hash(File.join(Rbbt.datadir, "organisms/#{ org }/gene.pmid"), :flatten => true)
  end

  # Return hash that associates genes to a list of PubMed ids. Includes only
  # those found to support GO term associations.
  def self.gene_literature_go(org)
    Open.to_hash(File.join(Rbbt.datadir, "organisms/#{ org }/gene_go.pmid"), :flatten => true)
  end

  # Returns a list with the names of the id formats supported for an organism,
  # as read from the header (first line, starting with '#') of its
  # 'identifiers' file; the list is empty if the file has no header. If
  # examples are produced, the list is of [format, example] pairs.
  #
  # *Options:*
  #
  # *examples:* Include example ids for each format
  def self.supported_ids(org, options = {})
    formats  = []
    examples = options[:examples] ? [] : nil

    first_line = true
    Open.read(File.join(Rbbt.datadir, "organisms/#{ org }/identifiers")).each_line do |l|
      if first_line
        first_line = false
        formats = Open.fields(l.sub(/^[\s#]+/,'')).collect { |n| n.strip } if l =~ /^\s*#/
        # Only the header is needed when no examples are requested
        return formats unless examples
        next
      end

      fields = Open.fields(l)
      if fields.select { |name| name && name =~ /\w/ }.length > examples.length
        # Keep the first of the alternative ids in each field
        examples = fields.collect { |name| name.to_s.split(/\|/).first }
      end
    end

    return formats unless examples

    formats.zip(examples)
  end

  # Creates a hash where each possible id (in lower case) is associated with
  # the names of the formats (it is potentially possible for different formats
  # to have the same id). This is used in the guessIdFormat method.
  def self.id_formats(org)
    id_types = {}
    formats  = supported_ids(org)

    Open.read(File.join(Rbbt.datadir, "organisms/#{ org }/identifiers")).each_line do |l|
      # The header holds the format names, not identifiers
      next if l =~ /^\s*#/

      formats.zip(Open.fields(l)).each do |format, ids|
        (ids || "").split(/\|/).each do |id|
          next if id.nil? || id == ""
          key = id.downcase
          id_types[key] ||= []
          id_types[key] << format unless id_types[key].include? format
        end
      end
    end

    id_types
  end

  # Guesses the format of the ids in query. The formats parameter is either
  # the hash produced by id_formats or the keyword of an organism. Returns a
  # [format, count] pair for the format matched by most ids, considering only
  # formats matched by more than a tenth of them, or nil if there is none.
  def self.guessIdFormat(formats, query)
    query   = query.compact.collect { |gene| gene.downcase }.uniq
    formats = id_formats(formats) if String === formats

    return nil if formats.values.empty?
    values = formats.values_at(*query)
    return nil if values.empty?

    format_count = {}
    values.compact.collect { |types| types.uniq }.flatten.each do |f|
      format_count[f] ||= 0
      format_count[f] += 1
    end

    return nil if format_count.values.empty?
    format_count.select { |k, v| v > (query.length / 10) }.sort { |a, b| b[1] <=> a[1] }.first
  end

  # Position of the format id_name in the supported_ids list. The comparison
  # ignores case unless options[:case_sensitive] is set. If several entries
  # match, the last one is used; if none matches, the result is 0.
  def self.id_position(supported_ids, id_name, options = {})
    pos = 0
    supported_ids.each_with_index do |id, i|
      if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
        pos = i
      end
    end
    pos
  end

  # Builds an index over the 'identifiers' file of the organism using
  # Index.index. The :native option is the name of the format the ids are
  # translated to (the first one by default), and :other is the list of names
  # of the formats translated from (all the rest by default). The remaining
  # options are passed on to Index.index; :case_sensitive defaults to false.
  def self.id_index(org, option = {})
    # Work on a copy: the options are completed below and the caller's hash
    # must not change
    option = option.dup
    native = option[:native]
    other  = option[:other]
    option[:case_sensitive] = false if option[:case_sensitive].nil?

    identifiers = File.join(Rbbt.datadir, "organisms/#{ org }/identifiers")

    return Index.index(identifiers, option) if native.nil? and other.nil?

    supported = Organism.supported_ids(org)

    first = native ? id_position(supported, native, option) : 0

    rest =
      if other
        other.collect { |name| id_position(supported, name, option) }
      else
        (0..supported.length - 1).to_a - [first]
      end

    # Index.index takes column positions instead of format names
    option[:native] = first
    option[:extra]  = rest
    Index.index(identifiers, option)
  end

end
|
245
|
-
|