rbbt-sources 3.4.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/etc/allowed_biomart_archives +2 -4
- data/etc/biomart/missing_in_archive +2 -0
- data/etc/build_organism +4 -4
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/biomart.rb +48 -13
- data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
- data/lib/rbbt/sources/entrez.rb +13 -0
- data/lib/rbbt/sources/go.rb +2 -2
- data/lib/rbbt/sources/organism.rb +45 -24
- data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
- data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
- data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
- data/share/install/Organism/Sce.rake +38 -0
- data/share/install/Organism/organism_helpers.rb +123 -50
- data/test/rbbt/sources/test_biomart.rb +44 -6
- data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
- data/test/rbbt/sources/test_entrez.rb +5 -0
- data/test/rbbt/sources/test_organism.rb +15 -15
- metadata +9 -7
- data/share/install/Organism/Sce/Rakefile +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
|
4
|
+
data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
|
7
|
+
data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
|
data/etc/build_organism
CHANGED
data/etc/organisms
CHANGED
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
|
|
3
3
|
require 'rbbt/tsv/attach'
|
4
4
|
require 'rbbt/util/log'
|
5
5
|
require 'cgi'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
|
7
8
|
# This module interacts with BioMart. It performs queries to BioMart and
|
8
9
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -13,7 +14,7 @@ module BioMart
|
|
13
14
|
|
14
15
|
class BioMart::QueryError < StandardError; end
|
15
16
|
|
16
|
-
BIOMART_URL = '
|
17
|
+
BIOMART_URL = 'ensembl.org/biomart/martservice'
|
17
18
|
|
18
19
|
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
|
19
20
|
|
@@ -22,7 +23,7 @@ module BioMart
|
|
22
23
|
@@biomart_query_xml = <<-EOT
|
23
24
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
25
|
<!DOCTYPE Query>
|
25
|
-
<Query completionStamp="1" virtualSchemaName = "
|
26
|
+
<Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
|
26
27
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
28
|
<!--FILTERS-->
|
28
29
|
<!--MAIN-->
|
@@ -36,14 +37,10 @@ module BioMart
|
|
36
37
|
raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
|
37
38
|
end
|
38
39
|
Thread.current['archive'] = date
|
39
|
-
Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
|
40
|
-
Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
|
41
40
|
end
|
42
41
|
|
43
42
|
def self.unset_archive
|
44
|
-
Log.debug "Restoring current version URL #{BIOMART_URL}"
|
45
43
|
Thread.current['archive'] = nil
|
46
|
-
Thread.current['archive_url'] = nil
|
47
44
|
end
|
48
45
|
|
49
46
|
def self.with_archive(data)
|
@@ -55,6 +52,21 @@ module BioMart
|
|
55
52
|
end
|
56
53
|
end
|
57
54
|
|
55
|
+
def self.final_url(query, archive = nil, ensembl_domain = nil)
|
56
|
+
url_domain = if archive.nil?
|
57
|
+
if ensembl_domain.nil?
|
58
|
+
'www'
|
59
|
+
else
|
60
|
+
ensembl_domain
|
61
|
+
end
|
62
|
+
elsif ensembl_domain
|
63
|
+
[archive, ensembl_domain] * "-"
|
64
|
+
else
|
65
|
+
[archive, 'archive'] * "."
|
66
|
+
end
|
67
|
+
"http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
|
68
|
+
end
|
69
|
+
|
58
70
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
59
71
|
open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
|
60
72
|
repeats = true
|
@@ -75,11 +87,17 @@ module BioMart
|
|
75
87
|
|
76
88
|
query = @@biomart_query_xml.dup
|
77
89
|
query.sub!(/<!--DATABASE-->/,database)
|
90
|
+
if Thread.current["ensembl_domain"]
|
91
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
|
92
|
+
else
|
93
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
|
94
|
+
end
|
78
95
|
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
79
96
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
80
97
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
81
98
|
|
82
|
-
url = Thread.current[
|
99
|
+
url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
|
100
|
+
|
83
101
|
|
84
102
|
begin
|
85
103
|
response = Open.read(url, open_options.dup)
|
@@ -105,10 +123,17 @@ module BioMart
|
|
105
123
|
|
106
124
|
new_datafile = TmpFile.tmp_file
|
107
125
|
if data.nil?
|
108
|
-
|
126
|
+
Open.open(result_file) do |file|
|
127
|
+
Open.write(new_datafile, Open.collapse_stream(file))
|
128
|
+
end
|
109
129
|
data = new_datafile
|
110
130
|
else
|
111
|
-
|
131
|
+
Open.open(result_file) do |stream_result|
|
132
|
+
Open.open(data) do |stream_data|
|
133
|
+
Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
|
134
|
+
end
|
135
|
+
end
|
136
|
+
#TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
|
112
137
|
FileUtils.rm data
|
113
138
|
data = new_datafile
|
114
139
|
end
|
@@ -142,9 +167,9 @@ module BioMart
|
|
142
167
|
|
143
168
|
IndiferentHash.setup(open_options)
|
144
169
|
|
145
|
-
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{
|
170
|
+
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
|
146
171
|
|
147
|
-
max_items =
|
172
|
+
max_items = 1
|
148
173
|
chunks = []
|
149
174
|
chunk = []
|
150
175
|
attrs.each{|a|
|
@@ -178,7 +203,7 @@ module BioMart
|
|
178
203
|
results
|
179
204
|
else
|
180
205
|
Open.write(filename) do |f|
|
181
|
-
f.puts "#: " << Misc.hash2string(TSV
|
206
|
+
f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
|
182
207
|
if field_names.nil?
|
183
208
|
f.puts "#" << [main, attrs].flatten * "\t"
|
184
209
|
else
|
@@ -211,7 +236,17 @@ module BioMart
|
|
211
236
|
changes = {}
|
212
237
|
missing.select{|m| m.include? "~" }.each do |str|
|
213
238
|
orig,_sep, new = str.partition "~"
|
214
|
-
|
239
|
+
if orig.include?(":")
|
240
|
+
target_db, _sep, orig = orig.partition(":")
|
241
|
+
if target_db[0] == "-"
|
242
|
+
next if database == target_db[1..-1]
|
243
|
+
else
|
244
|
+
next unless database == target_db
|
245
|
+
end
|
246
|
+
changes[orig] = new
|
247
|
+
else
|
248
|
+
changes[orig] = new
|
249
|
+
end
|
215
250
|
end
|
216
251
|
changed = true
|
217
252
|
while changed
|
@@ -9,11 +9,29 @@ module Ensembl
|
|
9
9
|
module FTP
|
10
10
|
|
11
11
|
SERVER = "ftp.ensembl.org"
|
12
|
+
DOMAIN_SERVER = "ftp.ensemblgenomes.org"
|
12
13
|
|
13
|
-
def self.
|
14
|
+
def self.ftp_name_for_domain(domain, organism, subdir='mysql')
|
15
|
+
code, build = organism.split "/"
|
16
|
+
build ||= "current"
|
17
|
+
|
18
|
+
release = build == "current" ? 'current' : Ensembl.releases[build]
|
19
|
+
name = Organism.scientific_name(organism)
|
20
|
+
ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
|
21
|
+
ftp.passive = true
|
22
|
+
ftp.login
|
23
|
+
dir = File.join('pub', domain, 'current', subdir)
|
24
|
+
ftp.chdir(dir)
|
25
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
|
26
|
+
ftp.close
|
27
|
+
[release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
|
14
28
|
end
|
15
29
|
|
16
|
-
def self.ftp_name_for(organism)
|
30
|
+
def self.ftp_name_for(organism, subdir='mysql')
|
31
|
+
if domain = Thread.current["ensembl_domain"]
|
32
|
+
return ftp_name_for_domain(domain, organism,subdir)
|
33
|
+
end
|
34
|
+
|
17
35
|
code, build = organism.split "/"
|
18
36
|
build ||= "current"
|
19
37
|
|
@@ -23,8 +41,9 @@ module Ensembl
|
|
23
41
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
24
42
|
ftp.passive = true
|
25
43
|
ftp.login
|
26
|
-
|
27
|
-
|
44
|
+
dir = File.join('pub', "current_#{subdir}")
|
45
|
+
ftp.chdir(dir)
|
46
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
28
47
|
ftp.close
|
29
48
|
else
|
30
49
|
release = Ensembl.releases[build]
|
@@ -32,24 +51,21 @@ module Ensembl
|
|
32
51
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
33
52
|
ftp.passive = true
|
34
53
|
ftp.login
|
35
|
-
|
36
|
-
|
54
|
+
dir = File.join('pub', release, subdir)
|
55
|
+
ftp.chdir(dir)
|
56
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
37
57
|
ftp.close
|
38
58
|
end
|
39
|
-
[release, file]
|
59
|
+
[release, File.join(Ensembl::FTP::SERVER, dir, file)]
|
40
60
|
end
|
41
61
|
|
42
|
-
def self.
|
43
|
-
release,
|
44
|
-
|
45
|
-
File.join('/pub/', 'current_mysql', ftp_name)
|
46
|
-
else
|
47
|
-
File.join('/pub/', release, 'mysql', ftp_name)
|
48
|
-
end
|
62
|
+
def self.ftp_url_for(organism)
|
63
|
+
release, ftp_url = ftp_name_for(organism)
|
64
|
+
ftp_url
|
49
65
|
end
|
50
66
|
|
51
67
|
def self.base_url(organism)
|
52
|
-
File.join("ftp://"
|
68
|
+
File.join("ftp://", ftp_url_for(organism) )
|
53
69
|
end
|
54
70
|
|
55
71
|
def self.url_for(organism, table, extension)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -8,6 +8,19 @@ module Entrez
|
|
8
8
|
|
9
9
|
Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
10
10
|
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
11
|
+
Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
|
12
|
+
TmpFile.with_dir do |dir|
|
13
|
+
Misc.in_dir dir do
|
14
|
+
CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
|
15
|
+
CMD.cmd("tar xvfz taxdump.tar.gz")
|
16
|
+
CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
|
17
|
+
tsv = TSV.open('tmp.tsv', type: :single)
|
18
|
+
tsv.key_field = "Entrez Tax ID"
|
19
|
+
tsv.fields = ["Scientific Name"]
|
20
|
+
Open.write(filename, tsv.to_s)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
11
24
|
|
12
25
|
def self.entrez2native(taxs, options = {})
|
13
26
|
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -25,8 +25,8 @@ module GO
|
|
25
25
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
26
26
|
# only the name field is used.
|
27
27
|
def self.init
|
28
|
-
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
29
|
-
info.serializer = :marshal if info.respond_to? :serializer
|
28
|
+
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
|
29
|
+
#info.serializer = :marshal if info.respond_to? :serializer
|
30
30
|
Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
|
31
31
|
term_info = {}
|
32
32
|
|
@@ -9,6 +9,10 @@ module Organism
|
|
9
9
|
ARCHIVE_MONTH_INDEX = {}
|
10
10
|
%w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
|
11
11
|
|
12
|
+
def self.rake_organism_helper
|
13
|
+
Rbbt.share.install.Organism["organism_helpers.rb"].find
|
14
|
+
end
|
15
|
+
|
12
16
|
def self.compare_archives(a1, a2)
|
13
17
|
a1 = a1.partition("/").last if a1 and a1.include? "/"
|
14
18
|
a2 = a2.partition("/").last if a2 and a2.include? "/"
|
@@ -29,7 +33,8 @@ module Organism
|
|
29
33
|
end
|
30
34
|
|
31
35
|
def self.default_code(organism = "Hsa")
|
32
|
-
|
36
|
+
latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
|
37
|
+
organism.split("/").first << "/" << latest
|
33
38
|
end
|
34
39
|
|
35
40
|
def self.organism_codes(organism = nil)
|
@@ -43,7 +48,8 @@ module Organism
|
|
43
48
|
end
|
44
49
|
|
45
50
|
def self.installed_organisms
|
46
|
-
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
|
51
|
+
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
|
52
|
+
Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
|
47
53
|
end
|
48
54
|
|
49
55
|
def self.prepared_organisms
|
@@ -62,25 +68,6 @@ module Organism
|
|
62
68
|
nil
|
63
69
|
end
|
64
70
|
|
65
|
-
Organism.installable_organisms.each do |organism|
|
66
|
-
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
67
|
-
|
68
|
-
module_eval "#{ organism } = with_key '#{organism}'"
|
69
|
-
end
|
70
|
-
|
71
|
-
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
72
|
-
Open.mkdir File.dirname(file) unless File.directory?(file)
|
73
|
-
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
74
|
-
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
75
|
-
CMD.cmd("chmod 0755 '#{file}'")
|
76
|
-
Rbbt.set_software_env
|
77
|
-
nil
|
78
|
-
end
|
79
|
-
|
80
|
-
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
81
|
-
|
82
|
-
Rbbt.set_software_env
|
83
|
-
|
84
71
|
def self.hg_build(organism)
|
85
72
|
require 'rbbt/sources/ensembl_ftp'
|
86
73
|
organism = organism.strip
|
@@ -257,7 +244,16 @@ module Organism
|
|
257
244
|
end
|
258
245
|
|
259
246
|
def self.scientific_name(organism)
|
260
|
-
Organism[organism]
|
247
|
+
Organism[organism].scientific_name.read.strip
|
248
|
+
end
|
249
|
+
|
250
|
+
def self.make_organism(name, long = false)
|
251
|
+
first, _, second = name.partition(/[ _]/)
|
252
|
+
if long
|
253
|
+
first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
|
254
|
+
else
|
255
|
+
first[0].upcase + second[0..1].downcase
|
256
|
+
end
|
261
257
|
end
|
262
258
|
|
263
259
|
def self.organism(name)
|
@@ -295,7 +291,7 @@ module Organism
|
|
295
291
|
organism ||= "Hsa"
|
296
292
|
|
297
293
|
@@gene_start_end ||= {}
|
298
|
-
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :
|
294
|
+
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
|
299
295
|
|
300
296
|
ranges = genes.collect{|gene|
|
301
297
|
start, eend = gene_start_end[gene]
|
@@ -339,7 +335,8 @@ module Organism
|
|
339
335
|
def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
|
340
336
|
chromosome_sizes = {}
|
341
337
|
|
342
|
-
Organism
|
338
|
+
Organism.chromosomes(organism).produce.tsv.each do |chr|
|
339
|
+
file = Organism[organism]["chromosome_#{chr}"].produce.find
|
343
340
|
chromosome = file.split("_").last.split(".").first
|
344
341
|
size = if Open.gzip?(file) || Open.bgzip?(file)
|
345
342
|
CMD.cmd("zcat '#{ file }' | wc -c ").read
|
@@ -352,4 +349,28 @@ module Organism
|
|
352
349
|
chromosome_sizes
|
353
350
|
end
|
354
351
|
|
352
|
+
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
353
|
+
Open.mkdir File.dirname(file) unless File.directory?(file)
|
354
|
+
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
355
|
+
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
356
|
+
CMD.cmd("chmod 0755 '#{file}'")
|
357
|
+
Rbbt.set_software_env
|
358
|
+
nil
|
359
|
+
end
|
360
|
+
|
361
|
+
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
362
|
+
|
363
|
+
Rbbt.set_software_env
|
364
|
+
|
365
|
+
Organism.installable_organisms.each do |organism|
|
366
|
+
if Rbbt.share.install.Organism[organism].Rakefile.exists?
|
367
|
+
rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
|
368
|
+
else
|
369
|
+
rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
|
370
|
+
end
|
371
|
+
|
372
|
+
claim Organism[organism], :rake, rakefile
|
373
|
+
|
374
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
375
|
+
end
|
355
376
|
end
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [9606]
|
7
2
|
$scientific_name = "Homo sapiens"
|
8
3
|
$ortholog_key = "hsapiens_homolog_ensembl_gene"
|
@@ -95,17 +90,30 @@ $biomart_identifiers = [
|
|
95
90
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
96
91
|
]
|
97
92
|
|
98
|
-
$
|
99
|
-
|
100
|
-
|
93
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
94
|
+
Thread.current["namespace"] = $namespace
|
95
|
+
load Organism.rake_organism_helper
|
96
|
+
|
97
|
+
file 'regulators' do |t|
|
98
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
99
|
+
regulatory_fields = [
|
100
|
+
['Chromosome Name','chromosome_name'],
|
101
|
+
['Region Start', 'chromosome_start'],
|
102
|
+
['Region End', 'chromosome_end'],
|
103
|
+
['Feature type', 'feature_type_name'],
|
101
104
|
]
|
105
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
|
106
|
+
|
107
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
108
|
+
end
|
102
109
|
|
103
|
-
|
104
|
-
[
|
105
|
-
|
106
|
-
|
110
|
+
file 'regulator_activity' do |t|
|
111
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
112
|
+
regulatory_fields = [
|
113
|
+
['Epigenome name','epigenome_name'],
|
114
|
+
['Activity', 'activity'],
|
107
115
|
]
|
116
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
108
117
|
|
109
|
-
|
110
|
-
|
111
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
118
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
119
|
+
end
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10090]
|
7
2
|
$scientific_name = "Mus musculus"
|
8
3
|
$ortholog_key = "mmusculus_homolog_ensembl_gene"
|
@@ -43,18 +38,6 @@ $biomart_identifiers = [
|
|
43
38
|
[ 'EMBL (Genbank) ID' , "embl"] ,
|
44
39
|
]
|
45
40
|
|
46
|
-
$
|
47
|
-
|
48
|
-
|
49
|
-
]
|
50
|
-
|
51
|
-
$biomart_go_2009= [
|
52
|
-
["GO BP ID", 'go_biological_process_id'],
|
53
|
-
["GO MF ID", 'go_molecular_function_id'],
|
54
|
-
["GO CC ID", 'go_cellular_component_id'],
|
55
|
-
]
|
56
|
-
|
57
|
-
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
58
|
-
Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
|
59
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
60
|
-
|
41
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
42
|
+
Thread.current["namespace"] = $namespace
|
43
|
+
load Organism.rake_organism_helper
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10116]
|
7
2
|
$scientific_name = "Rattus norvegicus"
|
8
3
|
|
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
|
|
50
45
|
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
|
51
46
|
]
|
52
47
|
|
53
|
-
$namespace = File.basename(
|
54
|
-
Thread.current["namespace"] =
|
55
|
-
load
|
48
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
49
|
+
Thread.current["namespace"] = $namespace
|
50
|
+
load Organism.rake_organism_helper
|
@@ -0,0 +1,38 @@
|
|
1
|
+
$taxs = [559292,4932]
|
2
|
+
$scientific_name = "Saccharomyces cerevisiae"
|
3
|
+
$ensembl_domain = 'fungi'
|
4
|
+
#$ortholog_key = "yeast_ensembl_gene"
|
5
|
+
|
6
|
+
$biomart_db = 'scerevisiae_eg_gene'
|
7
|
+
|
8
|
+
$biomart_lexicon = [
|
9
|
+
[ 'Associated Gene Name' , "external_gene_name"],
|
10
|
+
]
|
11
|
+
|
12
|
+
$biomart_protein_identifiers = [
|
13
|
+
[ 'Protein ID', "protein_id" ],
|
14
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
15
|
+
[ 'Unigene ID', "unigene" ],
|
16
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
17
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
18
|
+
]
|
19
|
+
|
20
|
+
$biomart_probe_identifiers = [
|
21
|
+
]
|
22
|
+
|
23
|
+
$biomart_identifiers = [
|
24
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
25
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
26
|
+
[ 'Associated Gene Name', "external_gene_name" ],
|
27
|
+
[ 'Protein ID', "protein_id" ],
|
28
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
29
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
30
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
31
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
32
|
+
[ 'RefSeq DNA' , "refseq_dna"] ,
|
33
|
+
]
|
34
|
+
|
35
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
36
|
+
Thread.current["namespace"] = $namespace
|
37
|
+
Thread.current["ensembl_domain"] = $ensembl_domain
|
38
|
+
load Organism.rake_organism_helper
|
@@ -1,8 +1,11 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
|
2
|
+
|
1
3
|
require 'net/ftp'
|
4
|
+
require 'rbbt/sources/biomart'
|
5
|
+
require 'rbbt/sources/entrez'
|
6
|
+
require File.join(File.dirname(__FILE__), '../lib/helpers')
|
2
7
|
require 'rbbt/sources/ensembl_ftp'
|
3
8
|
|
4
|
-
#Thread.current['namespace'] = $namespace
|
5
|
-
|
6
9
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
7
10
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
8
11
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
@@ -77,6 +80,17 @@ $biomart_pfam= [
|
|
77
80
|
["Pfam Domain", 'pfam'],
|
78
81
|
]
|
79
82
|
|
83
|
+
$biomart_go= [
|
84
|
+
["GO ID", 'go_id'],
|
85
|
+
["GO Namespace", 'namespace_1003'],
|
86
|
+
]
|
87
|
+
|
88
|
+
$biomart_go_2009= [
|
89
|
+
["GO BP ID", 'go_biological_process_id'],
|
90
|
+
["GO MF ID", 'go_molecular_function_id'],
|
91
|
+
["GO CC ID", 'go_cellular_component_id'],
|
92
|
+
]
|
93
|
+
|
80
94
|
$biomart_gene_biotype= [
|
81
95
|
["Biotype", 'gene_biotype'],
|
82
96
|
]
|
@@ -91,7 +105,13 @@ $biomart_exons = [
|
|
91
105
|
#{{{ Rules
|
92
106
|
|
93
107
|
file 'entrez_taxids' do |t|
|
94
|
-
|
108
|
+
if $tax && $tax.any?
|
109
|
+
Misc.sensiblewrite(t.name, $taxs * "\n")
|
110
|
+
else
|
111
|
+
tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
|
112
|
+
taxs = tsv[$scientific_name] || []
|
113
|
+
Misc.sensiblewrite(t.name, taxs * "\n")
|
114
|
+
end
|
95
115
|
end
|
96
116
|
|
97
117
|
file 'scientific_name' do |t|
|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
|
|
104
124
|
Misc.sensiblewrite(t.name, $ortholog_key)
|
105
125
|
end
|
106
126
|
|
107
|
-
file 'identifiers' do |t|
|
127
|
+
file 'identifiers' => 'entrez_taxids' do |t|
|
128
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
108
129
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
|
109
130
|
identifiers.unnamed = true
|
110
131
|
|
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
|
|
116
137
|
end
|
117
138
|
|
118
139
|
name_pos = identifiers.identify_field "Associated Gene Name"
|
119
|
-
|
120
|
-
|
121
|
-
|
140
|
+
if tax_codes and tax_codes.any?
|
141
|
+
entrez2name = Entrez.entrez2name(tax_codes)
|
142
|
+
identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
|
143
|
+
names = values[name_pos] || []
|
122
144
|
|
123
|
-
|
124
|
-
|
125
|
-
|
145
|
+
matches = entrez.select do |e|
|
146
|
+
entrez2name.include?(e) && (names & entrez2name[e]).any?
|
147
|
+
end
|
126
148
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
149
|
+
if matches.any?
|
150
|
+
matches
|
151
|
+
else
|
152
|
+
entrez
|
153
|
+
end
|
131
154
|
end
|
132
155
|
end
|
133
156
|
|
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
|
|
147
170
|
identifiers = identifiers.reorder(:key, ordered_fields)
|
148
171
|
end
|
149
172
|
|
150
|
-
|
151
|
-
|
152
|
-
|
173
|
+
if tax_codes and tax_codes.any?
|
174
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
|
175
|
+
entrez_synonyms.key_field = "Entrez Gene ID"
|
176
|
+
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
153
177
|
|
154
|
-
|
178
|
+
identifiers.attach entrez_synonyms
|
179
|
+
end
|
155
180
|
|
156
181
|
identifiers.with_unnamed do
|
157
182
|
identifiers.each do |key, values|
|
158
183
|
values.each do |list|
|
184
|
+
list ||= []
|
159
185
|
list.reject!{|v| v.nil? or v.empty?}
|
160
186
|
list.uniq!
|
161
187
|
end
|
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
|
|
166
192
|
Misc.sensiblewrite(t.name, identifiers.to_s)
|
167
193
|
end
|
168
194
|
|
169
|
-
file 'lexicon' => 'identifiers' do |t|
|
195
|
+
file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
|
170
196
|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
197
|
+
tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
|
171
198
|
|
172
|
-
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep =>
|
199
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
|
173
200
|
entrez_description.key_field = "Entrez Gene ID"
|
174
201
|
entrez_description.fields = ["Entrez Gene Description"]
|
175
202
|
|
@@ -308,8 +335,9 @@ end
|
|
308
335
|
|
309
336
|
# {{{ Other info
|
310
337
|
|
311
|
-
file 'gene_pmids' do |t|
|
312
|
-
|
338
|
+
file 'gene_pmids' => 'entrez_taxids' do |t|
|
339
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
340
|
+
tsv = Entrez.entrez2pubmed(tax_codes)
|
313
341
|
text = "#: :namespace=#{Thread.current['namespace']}\n"
|
314
342
|
text += "#Entrez Gene ID\tPMID"
|
315
343
|
tsv.each do |gene, pmids|
|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
417
445
|
|
418
446
|
gene_go.monitor = true
|
419
447
|
gene_go.process "GO ID" do |key, go_id, values|
|
420
|
-
clean =
|
448
|
+
clean = NamedArray.zip_fields(values).select do |id, type|
|
421
449
|
type == "biological_process"
|
422
450
|
end
|
423
451
|
clean.collect{|id, type| id}
|
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
|
|
487
515
|
end
|
488
516
|
|
489
517
|
file 'chromosomes' do |t|
|
490
|
-
|
518
|
+
tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
491
519
|
|
492
|
-
Misc.sensiblewrite(t.name,
|
520
|
+
Misc.sensiblewrite(t.name, tsv.keys * "\n")
|
493
521
|
end
|
494
522
|
|
495
523
|
file 'blacklist_chromosomes' => 'chromosomes' do |t|
|
@@ -511,6 +539,15 @@ end
|
|
511
539
|
|
512
540
|
rule /^chromosome_.*/ do |t|
|
513
541
|
chr = t.name.match(/chromosome_(.*)/)[1]
|
542
|
+
path = File.expand_path(t.name)
|
543
|
+
dirname = File.dirname(path)
|
544
|
+
|
545
|
+
organism = File.basename(dirname)
|
546
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
547
|
+
archive = organism
|
548
|
+
organism = File.basename(File.dirname(dirname))
|
549
|
+
organism = File.join(organism, archive)
|
550
|
+
end
|
514
551
|
|
515
552
|
# HACK: Skip LRG chromosomes
|
516
553
|
raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
|
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
|
|
519
556
|
|
520
557
|
release = Ensembl.releases[archive]
|
521
558
|
|
522
|
-
|
559
|
+
fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
|
560
|
+
server, _, path = fasta_url.partition("/")
|
561
|
+
path = "/" + path
|
562
|
+
|
563
|
+
ftp = Net::FTP.new(server)
|
523
564
|
ftp.passive = true
|
524
565
|
ftp.login
|
525
|
-
|
526
|
-
ftp.chdir("pub/current_fasta/")
|
527
|
-
else
|
528
|
-
ftp.chdir("pub/#{ release }/fasta/")
|
529
|
-
end
|
530
|
-
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
566
|
+
ftp.chdir(path)
|
531
567
|
ftp.chdir('dna')
|
532
|
-
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
533
|
-
|
534
|
-
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
535
568
|
|
536
|
-
|
569
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
|
570
|
+
if file
|
571
|
+
Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
|
537
572
|
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
573
|
+
Misc.lock t.name + '.rake' do
|
574
|
+
TmpFile.with_file do |tmpfile|
|
575
|
+
ftp.getbinaryfile(file, tmpfile)
|
576
|
+
Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
577
|
+
ftp.close
|
578
|
+
end
|
543
579
|
end
|
580
|
+
else
|
581
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
|
582
|
+
Misc.lock t.name + '.rake' do
|
583
|
+
TmpFile.with_file do |tmpfile|
|
584
|
+
ftp.getbinaryfile(file, tmpfile)
|
585
|
+
txt = Open.read(tmpfile, :gzip => true)
|
586
|
+
|
587
|
+
chr_txt = []
|
588
|
+
|
589
|
+
in_chr = false
|
590
|
+
txt.split("\n").each do |line|
|
591
|
+
if line.start_with?(">#{chr}")
|
592
|
+
in_chr = true
|
593
|
+
elsif line.start_with?(">")
|
594
|
+
in_chr = false
|
595
|
+
else
|
596
|
+
chr_txt << line if in_chr
|
597
|
+
end
|
598
|
+
end
|
599
|
+
Misc.sensiblewrite(t.name, chr_txt * "" )
|
600
|
+
ftp.close
|
601
|
+
end
|
602
|
+
end
|
603
|
+
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
544
604
|
end
|
545
605
|
end
|
546
606
|
|
@@ -584,6 +644,16 @@ end
|
|
584
644
|
require 'bio'
|
585
645
|
|
586
646
|
file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
|
647
|
+
path = File.expand_path(t.name)
|
648
|
+
dirname = File.dirname(path)
|
649
|
+
|
650
|
+
organism = File.basename(dirname)
|
651
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
652
|
+
archive = organism
|
653
|
+
organism = File.basename(File.dirname(dirname))
|
654
|
+
organism = File.join(organism, archive)
|
655
|
+
end
|
656
|
+
|
587
657
|
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
|
588
658
|
|
589
659
|
chr_transcript_ranges ||= {}
|
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
|
|
616
686
|
chr_transcript_ranges.each do |chr, transcript_ranges|
|
617
687
|
begin
|
618
688
|
raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
|
619
|
-
|
620
|
-
|
621
|
-
p.
|
622
|
-
chr_str = p.
|
689
|
+
pkgdir = Thread.current["resource"]
|
690
|
+
p = pkgdir[organism]["chromosome_#{chr}"]
|
691
|
+
p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
|
692
|
+
chr_str = p.read
|
623
693
|
rescue Exception
|
624
694
|
Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
|
625
695
|
raise $! unless $!.message =~ /not supported/
|
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
656
726
|
organism = File.join(organism, archive)
|
657
727
|
end
|
658
728
|
|
659
|
-
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :
|
729
|
+
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
|
660
730
|
|
661
731
|
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
662
732
|
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
670
740
|
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
671
741
|
end
|
672
742
|
|
673
|
-
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :
|
674
|
-
transcript_exons = TSV.open("./transcript_exons", :
|
675
|
-
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :
|
743
|
+
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
744
|
+
transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
|
745
|
+
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
|
676
746
|
|
677
747
|
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
|
678
748
|
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
|
@@ -719,12 +789,13 @@ end
|
|
719
789
|
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
|
720
790
|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
721
791
|
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
722
|
-
transcript_phase
|
792
|
+
transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
|
723
793
|
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
724
794
|
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
725
795
|
|
726
796
|
|
727
797
|
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
|
798
|
+
transcript_sequence.monitor = true
|
728
799
|
transcript_sequence.through do |transcript, sequence|
|
729
800
|
protein = transcript_protein[transcript]
|
730
801
|
next if protein.nil? or protein.empty?
|
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
|
|
777
848
|
uni_seq = UniProt.get_uniprot_sequence(uni)
|
778
849
|
ensps = uni2ensps[uni]
|
779
850
|
next if ensps.nil? or ensps.empty?
|
851
|
+
|
780
852
|
best_ensp = ensps.sort_by do |ensp|
|
781
853
|
ensp_seq = ensp2seq[ensp]
|
782
854
|
if ensp_seq
|
@@ -829,3 +901,4 @@ file 'cdna_fasta' do |t|
|
|
829
901
|
Open.download(url, "#{t.name}.gz")
|
830
902
|
nil
|
831
903
|
end
|
904
|
+
|
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
|
3
3
|
require 'rbbt/sources/pubmed'
|
4
4
|
require 'test/unit'
|
5
5
|
require 'rbbt/sources/biomart'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
require 'rbbt/util/tmpfile'
|
7
8
|
require 'test/unit'
|
8
9
|
|
9
10
|
class TestBioMart < Test::Unit::TestCase
|
10
11
|
|
11
12
|
def setup
|
12
|
-
BioMart.set_archive
|
13
|
+
BioMart.set_archive "feb2014"
|
13
14
|
end
|
14
15
|
|
15
16
|
def teardown
|
16
17
|
BioMart.unset_archive
|
17
18
|
end
|
18
19
|
|
19
|
-
def
|
20
|
+
def test_get_Sce
|
20
21
|
assert_raise BioMart::QueryError do
|
21
22
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
22
23
|
end
|
23
24
|
|
24
|
-
|
25
|
+
BioMart.set_archive "feb2023-fungi"
|
26
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
25
27
|
tsv = TSV.open data, :double, :merge => true
|
26
|
-
assert(tsv['852236'][0].include? 'CAA84864')
|
28
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
27
29
|
|
28
|
-
data = BioMart.get('
|
30
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
|
29
31
|
tsv = TSV.open data, :double, :merge => true
|
30
32
|
assert(tsv['852236'][1].include? 'YBL044W')
|
31
33
|
end
|
32
34
|
|
33
|
-
def
|
35
|
+
def test_get_Hsa
|
36
|
+
Log.severity = 0
|
37
|
+
data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
38
|
+
tsv = TSV.open data, :double, :merge => true
|
39
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
def test_query
|
34
44
|
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
35
45
|
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
36
46
|
|
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
|
|
41
51
|
end
|
42
52
|
end
|
43
53
|
|
54
|
+
def __test_transcrip_exons
|
55
|
+
Log.with_severity 1 do
|
56
|
+
TmpFile.with_file do |f|
|
57
|
+
fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
|
58
|
+
main = fields[0]
|
59
|
+
attrs = fields.values_at(1, 2)
|
60
|
+
attrs_first = [attrs.first]
|
61
|
+
attrs_last = [attrs.last]
|
62
|
+
database = 'hsapiens_gene_ensembl'
|
63
|
+
|
64
|
+
filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
65
|
+
ppp Open.read(filename)
|
66
|
+
|
67
|
+
filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
68
|
+
ppp Open.read(filename)
|
69
|
+
|
70
|
+
filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
71
|
+
ppp Open.read(filename)
|
72
|
+
|
73
|
+
filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
|
74
|
+
ppp Open.read(filename)
|
75
|
+
|
76
|
+
data = TSV.open Open.open(filename)
|
77
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
44
82
|
def test_tsv
|
45
83
|
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
46
84
|
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestEnsemblFTP < Test::Unit::TestCase
|
5
|
+
def test_ftp_for
|
6
|
+
assert_nothing_raised do
|
7
|
+
Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
|
|
5
5
|
|
6
6
|
class TestOrganism < Test::Unit::TestCase
|
7
7
|
|
8
|
-
def
|
8
|
+
def _test_known_ids
|
9
9
|
assert Organism.known_ids("Hsa").include?("Associated Gene Name")
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def _test_location
|
13
13
|
assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
16
|
+
def _test_identifiers
|
17
17
|
assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
18
18
|
assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
19
19
|
assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
20
20
|
end
|
21
21
|
|
22
|
-
def
|
22
|
+
def _test_lexicon
|
23
23
|
assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_guess_id
|
27
27
|
ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
|
28
28
|
gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
|
29
29
|
assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
|
30
30
|
assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
|
31
31
|
end
|
32
32
|
|
33
|
-
def
|
33
|
+
def _test_organisms
|
34
34
|
assert Organism.organisms.include? "Hsa"
|
35
35
|
assert_equal "Hsa", Organism.organism("Homo sapiens")
|
36
36
|
end
|
37
37
|
|
38
|
-
def
|
38
|
+
def _test_attach_translations
|
39
39
|
tsv = TSV.setup({"1020" => []}, :type => :list)
|
40
40
|
tsv.key_field = "Entrez Gene ID"
|
41
41
|
tsv.fields = []
|
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
|
|
47
47
|
assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
50
|
+
def _test_entrez_taxids
|
51
51
|
assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
54
|
+
def _test_lift_over
|
55
55
|
mutation_19 = "19:21131664:T"
|
56
56
|
mutation_18 = "19:20923504:T"
|
57
|
-
source_build =
|
57
|
+
source_build = "Hsa/feb2014"
|
58
58
|
target_build = "Hsa/may2009"
|
59
59
|
|
60
60
|
assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
|
61
61
|
assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
|
62
62
|
end
|
63
63
|
|
64
|
-
def
|
64
|
+
def _test_orhtolog
|
65
65
|
require 'rbbt/entity/gene'
|
66
66
|
assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
|
67
67
|
end
|
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
|
|
70
70
|
assert Organism.chromosome_sizes["2"].to_i > 10_000_000
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
73
|
+
def _test_build_organism
|
74
74
|
assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
|
75
75
|
assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
|
76
76
|
assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
|
77
77
|
end
|
78
78
|
|
79
|
-
#def
|
79
|
+
#def _test_genes_at_chromosome
|
80
80
|
# pos = [12, 117799500]
|
81
81
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
82
82
|
#end
|
83
83
|
|
84
|
-
#def
|
84
|
+
#def _test_genes_at_chromosome_array
|
85
85
|
# pos = [12, [117799500, 106903900]]
|
86
86
|
# assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
87
87
|
#end
|
88
88
|
|
89
|
-
#def
|
89
|
+
#def _test_genes_at_genomic_positions
|
90
90
|
# pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
|
91
91
|
# assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
|
92
92
|
#end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.4.
|
4
|
+
version: 3.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -120,10 +120,10 @@ files:
|
|
120
120
|
- share/install/KEGG/Rakefile
|
121
121
|
- share/install/Matador/Rakefile
|
122
122
|
- share/install/NCI/Rakefile
|
123
|
-
- share/install/Organism/Hsa
|
124
|
-
- share/install/Organism/Mmu
|
125
|
-
- share/install/Organism/Rno
|
126
|
-
- share/install/Organism/Sce
|
123
|
+
- share/install/Organism/Hsa.rake
|
124
|
+
- share/install/Organism/Mmu.rake
|
125
|
+
- share/install/Organism/Rno.rake
|
126
|
+
- share/install/Organism/Sce.rake
|
127
127
|
- share/install/Organism/organism_helpers.rb
|
128
128
|
- share/install/PharmaGKB/Rakefile
|
129
129
|
- share/install/Pina/Rakefile
|
@@ -133,6 +133,7 @@ files:
|
|
133
133
|
- share/install/lib/rake_helper.rb
|
134
134
|
- test/rbbt/sources/test_HPRD.rb
|
135
135
|
- test/rbbt/sources/test_biomart.rb
|
136
|
+
- test/rbbt/sources/test_ensembl_ftp.rb
|
136
137
|
- test/rbbt/sources/test_entrez.rb
|
137
138
|
- test/rbbt/sources/test_go.rb
|
138
139
|
- test/rbbt/sources/test_gscholar.rb
|
@@ -166,13 +167,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
167
|
- !ruby/object:Gem::Version
|
167
168
|
version: '0'
|
168
169
|
requirements: []
|
169
|
-
rubygems_version: 3.5.
|
170
|
+
rubygems_version: 3.5.23
|
170
171
|
signing_key:
|
171
172
|
specification_version: 4
|
172
173
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
173
174
|
test_files:
|
174
175
|
- test/rbbt/sources/test_HPRD.rb
|
175
176
|
- test/rbbt/sources/test_biomart.rb
|
177
|
+
- test/rbbt/sources/test_ensembl_ftp.rb
|
176
178
|
- test/rbbt/sources/test_entrez.rb
|
177
179
|
- test/rbbt/sources/test_go.rb
|
178
180
|
- test/rbbt/sources/test_gscholar.rb
|
@@ -1,52 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
|
-
$taxs = [559292,4932]
|
7
|
-
$scientific_name = "Saccharomyces cerevisiae"
|
8
|
-
#$ortholog_key = "yeast_ensembl_gene"
|
9
|
-
|
10
|
-
$biomart_db = 'scerevisiae_gene_ensembl'
|
11
|
-
|
12
|
-
$biomart_lexicon = [
|
13
|
-
[ 'Associated Gene Name' , "external_gene_id"],
|
14
|
-
]
|
15
|
-
|
16
|
-
$biomart_protein_identifiers = [
|
17
|
-
[ 'Protein ID', "protein_id" ],
|
18
|
-
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
19
|
-
[ 'Unigene ID', "unigene" ],
|
20
|
-
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
21
|
-
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
22
|
-
]
|
23
|
-
|
24
|
-
$biomart_probe_identifiers = [
|
25
|
-
]
|
26
|
-
|
27
|
-
$biomart_identifiers = [
|
28
|
-
[ 'Entrez Gene ID', "entrezgene"],
|
29
|
-
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
30
|
-
[ 'Associated Gene Name', "external_gene_id" ],
|
31
|
-
[ 'Protein ID', "protein_id" ],
|
32
|
-
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
33
|
-
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
34
|
-
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
35
|
-
[ 'EMBL (Genbank) ID' , "embl"] ,
|
36
|
-
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
37
|
-
]
|
38
|
-
|
39
|
-
$biomart_go= [
|
40
|
-
["GO ID", 'go_id'],
|
41
|
-
["GO Namespace", 'namespace_1003'],
|
42
|
-
]
|
43
|
-
|
44
|
-
$biomart_go_2009= [
|
45
|
-
["GO BP ID", 'go_biological_process_id'],
|
46
|
-
["GO MF ID", 'go_molecular_function_id'],
|
47
|
-
["GO CC ID", 'go_cellular_component_id'],
|
48
|
-
]
|
49
|
-
|
50
|
-
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
51
|
-
Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
|
52
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|