rbbt-sources 3.4.0 → 3.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/etc/allowed_biomart_archives +2 -4
- data/etc/biomart/missing_in_archive +2 -0
- data/etc/build_organism +4 -4
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/biomart.rb +48 -13
- data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
- data/lib/rbbt/sources/entrez.rb +13 -0
- data/lib/rbbt/sources/go.rb +2 -2
- data/lib/rbbt/sources/organism.rb +45 -24
- data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
- data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
- data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
- data/share/install/Organism/Sce.rake +38 -0
- data/share/install/Organism/organism_helpers.rb +123 -50
- data/test/rbbt/sources/test_biomart.rb +44 -6
- data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
- data/test/rbbt/sources/test_entrez.rb +5 -0
- data/test/rbbt/sources/test_organism.rb +15 -15
- metadata +9 -7
- data/share/install/Organism/Sce/Rakefile +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
|
4
|
+
data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
|
7
|
+
data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
|
data/etc/build_organism
CHANGED
data/etc/organisms
CHANGED
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
|
|
3
3
|
require 'rbbt/tsv/attach'
|
4
4
|
require 'rbbt/util/log'
|
5
5
|
require 'cgi'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
|
7
8
|
# This module interacts with BioMart. It performs queries to BioMart and
|
8
9
|
# synthesises a hash with the results. Note that this module connects to the
|
@@ -13,7 +14,7 @@ module BioMart
|
|
13
14
|
|
14
15
|
class BioMart::QueryError < StandardError; end
|
15
16
|
|
16
|
-
BIOMART_URL = '
|
17
|
+
BIOMART_URL = 'ensembl.org/biomart/martservice'
|
17
18
|
|
18
19
|
MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
|
19
20
|
|
@@ -22,7 +23,7 @@ module BioMart
|
|
22
23
|
@@biomart_query_xml = <<-EOT
|
23
24
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
25
|
<!DOCTYPE Query>
|
25
|
-
<Query completionStamp="1" virtualSchemaName = "
|
26
|
+
<Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
|
26
27
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
28
|
<!--FILTERS-->
|
28
29
|
<!--MAIN-->
|
@@ -36,14 +37,10 @@ module BioMart
|
|
36
37
|
raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
|
37
38
|
end
|
38
39
|
Thread.current['archive'] = date
|
39
|
-
Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
|
40
|
-
Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
|
41
40
|
end
|
42
41
|
|
43
42
|
def self.unset_archive
|
44
|
-
Log.debug "Restoring current version URL #{BIOMART_URL}"
|
45
43
|
Thread.current['archive'] = nil
|
46
|
-
Thread.current['archive_url'] = nil
|
47
44
|
end
|
48
45
|
|
49
46
|
def self.with_archive(data)
|
@@ -55,6 +52,21 @@ module BioMart
|
|
55
52
|
end
|
56
53
|
end
|
57
54
|
|
55
|
+
def self.final_url(query, archive = nil, ensembl_domain = nil)
|
56
|
+
url_domain = if archive.nil?
|
57
|
+
if ensembl_domain.nil?
|
58
|
+
'www'
|
59
|
+
else
|
60
|
+
ensembl_domain
|
61
|
+
end
|
62
|
+
elsif ensembl_domain
|
63
|
+
[archive, ensembl_domain] * "-"
|
64
|
+
else
|
65
|
+
[archive, 'archive'] * "."
|
66
|
+
end
|
67
|
+
"http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
|
68
|
+
end
|
69
|
+
|
58
70
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
59
71
|
open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
|
60
72
|
repeats = true
|
@@ -75,11 +87,17 @@ module BioMart
|
|
75
87
|
|
76
88
|
query = @@biomart_query_xml.dup
|
77
89
|
query.sub!(/<!--DATABASE-->/,database)
|
90
|
+
if Thread.current["ensembl_domain"]
|
91
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
|
92
|
+
else
|
93
|
+
query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
|
94
|
+
end
|
78
95
|
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
79
96
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
80
97
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
81
98
|
|
82
|
-
url = Thread.current[
|
99
|
+
url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
|
100
|
+
|
83
101
|
|
84
102
|
begin
|
85
103
|
response = Open.read(url, open_options.dup)
|
@@ -105,10 +123,17 @@ module BioMart
|
|
105
123
|
|
106
124
|
new_datafile = TmpFile.tmp_file
|
107
125
|
if data.nil?
|
108
|
-
|
126
|
+
Open.open(result_file) do |file|
|
127
|
+
Open.write(new_datafile, Open.collapse_stream(file))
|
128
|
+
end
|
109
129
|
data = new_datafile
|
110
130
|
else
|
111
|
-
|
131
|
+
Open.open(result_file) do |stream_result|
|
132
|
+
Open.open(data) do |stream_data|
|
133
|
+
Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
|
134
|
+
end
|
135
|
+
end
|
136
|
+
#TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
|
112
137
|
FileUtils.rm data
|
113
138
|
data = new_datafile
|
114
139
|
end
|
@@ -142,9 +167,9 @@ module BioMart
|
|
142
167
|
|
143
168
|
IndiferentHash.setup(open_options)
|
144
169
|
|
145
|
-
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{
|
170
|
+
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
|
146
171
|
|
147
|
-
max_items =
|
172
|
+
max_items = 1
|
148
173
|
chunks = []
|
149
174
|
chunk = []
|
150
175
|
attrs.each{|a|
|
@@ -178,7 +203,7 @@ module BioMart
|
|
178
203
|
results
|
179
204
|
else
|
180
205
|
Open.write(filename) do |f|
|
181
|
-
f.puts "#: " << Misc.hash2string(TSV
|
206
|
+
f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
|
182
207
|
if field_names.nil?
|
183
208
|
f.puts "#" << [main, attrs].flatten * "\t"
|
184
209
|
else
|
@@ -211,7 +236,17 @@ module BioMart
|
|
211
236
|
changes = {}
|
212
237
|
missing.select{|m| m.include? "~" }.each do |str|
|
213
238
|
orig,_sep, new = str.partition "~"
|
214
|
-
|
239
|
+
if orig.include?(":")
|
240
|
+
target_db, _sep, orig = orig.partition(":")
|
241
|
+
if target_db[0] == "-"
|
242
|
+
next if database == target_db[1..-1]
|
243
|
+
else
|
244
|
+
next unless database == target_db
|
245
|
+
end
|
246
|
+
changes[orig] = new
|
247
|
+
else
|
248
|
+
changes[orig] = new
|
249
|
+
end
|
215
250
|
end
|
216
251
|
changed = true
|
217
252
|
while changed
|
@@ -9,11 +9,29 @@ module Ensembl
|
|
9
9
|
module FTP
|
10
10
|
|
11
11
|
SERVER = "ftp.ensembl.org"
|
12
|
+
DOMAIN_SERVER = "ftp.ensemblgenomes.org"
|
12
13
|
|
13
|
-
def self.
|
14
|
+
def self.ftp_name_for_domain(domain, organism, subdir='mysql')
|
15
|
+
code, build = organism.split "/"
|
16
|
+
build ||= "current"
|
17
|
+
|
18
|
+
release = build == "current" ? 'current' : Ensembl.releases[build]
|
19
|
+
name = Organism.scientific_name(organism)
|
20
|
+
ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
|
21
|
+
ftp.passive = true
|
22
|
+
ftp.login
|
23
|
+
dir = File.join('pub', domain, 'current', subdir)
|
24
|
+
ftp.chdir(dir)
|
25
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
|
26
|
+
ftp.close
|
27
|
+
[release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
|
14
28
|
end
|
15
29
|
|
16
|
-
def self.ftp_name_for(organism)
|
30
|
+
def self.ftp_name_for(organism, subdir='mysql')
|
31
|
+
if domain = Thread.current["ensembl_domain"]
|
32
|
+
return ftp_name_for_domain(domain, organism,subdir)
|
33
|
+
end
|
34
|
+
|
17
35
|
code, build = organism.split "/"
|
18
36
|
build ||= "current"
|
19
37
|
|
@@ -23,8 +41,9 @@ module Ensembl
|
|
23
41
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
24
42
|
ftp.passive = true
|
25
43
|
ftp.login
|
26
|
-
|
27
|
-
|
44
|
+
dir = File.join('pub', "current_#{subdir}")
|
45
|
+
ftp.chdir(dir)
|
46
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
28
47
|
ftp.close
|
29
48
|
else
|
30
49
|
release = Ensembl.releases[build]
|
@@ -32,24 +51,21 @@ module Ensembl
|
|
32
51
|
ftp = Net::FTP.new(Ensembl::FTP::SERVER)
|
33
52
|
ftp.passive = true
|
34
53
|
ftp.login
|
35
|
-
|
36
|
-
|
54
|
+
dir = File.join('pub', release, subdir)
|
55
|
+
ftp.chdir(dir)
|
56
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
|
37
57
|
ftp.close
|
38
58
|
end
|
39
|
-
[release, file]
|
59
|
+
[release, File.join(Ensembl::FTP::SERVER, dir, file)]
|
40
60
|
end
|
41
61
|
|
42
|
-
def self.
|
43
|
-
release,
|
44
|
-
|
45
|
-
File.join('/pub/', 'current_mysql', ftp_name)
|
46
|
-
else
|
47
|
-
File.join('/pub/', release, 'mysql', ftp_name)
|
48
|
-
end
|
62
|
+
def self.ftp_url_for(organism)
|
63
|
+
release, ftp_url = ftp_name_for(organism)
|
64
|
+
ftp_url
|
49
65
|
end
|
50
66
|
|
51
67
|
def self.base_url(organism)
|
52
|
-
File.join("ftp://"
|
68
|
+
File.join("ftp://", ftp_url_for(organism) )
|
53
69
|
end
|
54
70
|
|
55
71
|
def self.url_for(organism, table, extension)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -8,6 +8,19 @@ module Entrez
|
|
8
8
|
|
9
9
|
Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
|
10
10
|
Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
|
11
|
+
Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
|
12
|
+
TmpFile.with_dir do |dir|
|
13
|
+
Misc.in_dir dir do
|
14
|
+
CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
|
15
|
+
CMD.cmd("tar xvfz taxdump.tar.gz")
|
16
|
+
CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
|
17
|
+
tsv = TSV.open('tmp.tsv', type: :single)
|
18
|
+
tsv.key_field = "Entrez Tax ID"
|
19
|
+
tsv.fields = ["Scientific Name"]
|
20
|
+
Open.write(filename, tsv.to_s)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
11
24
|
|
12
25
|
def self.entrez2native(taxs, options = {})
|
13
26
|
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -25,8 +25,8 @@ module GO
|
|
25
25
|
# the gene_ontology.obo file and extracts all the fields, although right now,
|
26
26
|
# only the name field is used.
|
27
27
|
def self.init
|
28
|
-
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
29
|
-
info.serializer = :marshal if info.respond_to? :serializer
|
28
|
+
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
|
29
|
+
#info.serializer = :marshal if info.respond_to? :serializer
|
30
30
|
Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
|
31
31
|
term_info = {}
|
32
32
|
|
@@ -9,6 +9,10 @@ module Organism
|
|
9
9
|
ARCHIVE_MONTH_INDEX = {}
|
10
10
|
%w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
|
11
11
|
|
12
|
+
def self.rake_organism_helper
|
13
|
+
Rbbt.share.install.Organism["organism_helpers.rb"].find
|
14
|
+
end
|
15
|
+
|
12
16
|
def self.compare_archives(a1, a2)
|
13
17
|
a1 = a1.partition("/").last if a1 and a1.include? "/"
|
14
18
|
a2 = a2.partition("/").last if a2 and a2.include? "/"
|
@@ -29,7 +33,8 @@ module Organism
|
|
29
33
|
end
|
30
34
|
|
31
35
|
def self.default_code(organism = "Hsa")
|
32
|
-
|
36
|
+
latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
|
37
|
+
organism.split("/").first << "/" << latest
|
33
38
|
end
|
34
39
|
|
35
40
|
def self.organism_codes(organism = nil)
|
@@ -43,7 +48,8 @@ module Organism
|
|
43
48
|
end
|
44
49
|
|
45
50
|
def self.installed_organisms
|
46
|
-
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
|
51
|
+
Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
|
52
|
+
Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
|
47
53
|
end
|
48
54
|
|
49
55
|
def self.prepared_organisms
|
@@ -62,25 +68,6 @@ module Organism
|
|
62
68
|
nil
|
63
69
|
end
|
64
70
|
|
65
|
-
Organism.installable_organisms.each do |organism|
|
66
|
-
claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
|
67
|
-
|
68
|
-
module_eval "#{ organism } = with_key '#{organism}'"
|
69
|
-
end
|
70
|
-
|
71
|
-
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
72
|
-
Open.mkdir File.dirname(file) unless File.directory?(file)
|
73
|
-
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
74
|
-
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
75
|
-
CMD.cmd("chmod 0755 '#{file}'")
|
76
|
-
Rbbt.set_software_env
|
77
|
-
nil
|
78
|
-
end
|
79
|
-
|
80
|
-
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
81
|
-
|
82
|
-
Rbbt.set_software_env
|
83
|
-
|
84
71
|
def self.hg_build(organism)
|
85
72
|
require 'rbbt/sources/ensembl_ftp'
|
86
73
|
organism = organism.strip
|
@@ -257,7 +244,16 @@ module Organism
|
|
257
244
|
end
|
258
245
|
|
259
246
|
def self.scientific_name(organism)
|
260
|
-
Organism[organism]
|
247
|
+
Organism[organism].scientific_name.read.strip
|
248
|
+
end
|
249
|
+
|
250
|
+
def self.make_organism(name, long = false)
|
251
|
+
first, _, second = name.partition(/[ _]/)
|
252
|
+
if long
|
253
|
+
first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
|
254
|
+
else
|
255
|
+
first[0].upcase + second[0..1].downcase
|
256
|
+
end
|
261
257
|
end
|
262
258
|
|
263
259
|
def self.organism(name)
|
@@ -295,7 +291,7 @@ module Organism
|
|
295
291
|
organism ||= "Hsa"
|
296
292
|
|
297
293
|
@@gene_start_end ||= {}
|
298
|
-
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :
|
294
|
+
gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
|
299
295
|
|
300
296
|
ranges = genes.collect{|gene|
|
301
297
|
start, eend = gene_start_end[gene]
|
@@ -339,7 +335,8 @@ module Organism
|
|
339
335
|
def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
|
340
336
|
chromosome_sizes = {}
|
341
337
|
|
342
|
-
Organism
|
338
|
+
Organism.chromosomes(organism).produce.tsv.each do |chr|
|
339
|
+
file = Organism[organism]["chromosome_#{chr}"].produce.find
|
343
340
|
chromosome = file.split("_").last.split(".").first
|
344
341
|
size = if Open.gzip?(file) || Open.bgzip?(file)
|
345
342
|
CMD.cmd("zcat '#{ file }' | wc -c ").read
|
@@ -352,4 +349,28 @@ module Organism
|
|
352
349
|
chromosome_sizes
|
353
350
|
end
|
354
351
|
|
352
|
+
Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
|
353
|
+
Open.mkdir File.dirname(file) unless File.directory?(file)
|
354
|
+
url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
|
355
|
+
CMD.cmd_log("wget '#{url}' -O '#{file}'")
|
356
|
+
CMD.cmd("chmod 0755 '#{file}'")
|
357
|
+
Rbbt.set_software_env
|
358
|
+
nil
|
359
|
+
end
|
360
|
+
|
361
|
+
CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
|
362
|
+
|
363
|
+
Rbbt.set_software_env
|
364
|
+
|
365
|
+
Organism.installable_organisms.each do |organism|
|
366
|
+
if Rbbt.share.install.Organism[organism].Rakefile.exists?
|
367
|
+
rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
|
368
|
+
else
|
369
|
+
rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
|
370
|
+
end
|
371
|
+
|
372
|
+
claim Organism[organism], :rake, rakefile
|
373
|
+
|
374
|
+
module_eval "#{ organism } = with_key '#{organism}'"
|
375
|
+
end
|
355
376
|
end
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [9606]
|
7
2
|
$scientific_name = "Homo sapiens"
|
8
3
|
$ortholog_key = "hsapiens_homolog_ensembl_gene"
|
@@ -95,17 +90,30 @@ $biomart_identifiers = [
|
|
95
90
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
96
91
|
]
|
97
92
|
|
98
|
-
$
|
99
|
-
|
100
|
-
|
93
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
94
|
+
Thread.current["namespace"] = $namespace
|
95
|
+
load Organism.rake_organism_helper
|
96
|
+
|
97
|
+
file 'regulators' do |t|
|
98
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
99
|
+
regulatory_fields = [
|
100
|
+
['Chromosome Name','chromosome_name'],
|
101
|
+
['Region Start', 'chromosome_start'],
|
102
|
+
['Region End', 'chromosome_end'],
|
103
|
+
['Feature type', 'feature_type_name'],
|
101
104
|
]
|
105
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
|
106
|
+
|
107
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
108
|
+
end
|
102
109
|
|
103
|
-
|
104
|
-
[
|
105
|
-
|
106
|
-
|
110
|
+
file 'regulator_activity' do |t|
|
111
|
+
regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
|
112
|
+
regulatory_fields = [
|
113
|
+
['Epigenome name','epigenome_name'],
|
114
|
+
['Activity', 'activity'],
|
107
115
|
]
|
116
|
+
regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
108
117
|
|
109
|
-
|
110
|
-
|
111
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
118
|
+
Misc.sensiblewrite(t.name, regulators.to_s)
|
119
|
+
end
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10090]
|
7
2
|
$scientific_name = "Mus musculus"
|
8
3
|
$ortholog_key = "mmusculus_homolog_ensembl_gene"
|
@@ -43,18 +38,6 @@ $biomart_identifiers = [
|
|
43
38
|
[ 'EMBL (Genbank) ID' , "embl"] ,
|
44
39
|
]
|
45
40
|
|
46
|
-
$
|
47
|
-
|
48
|
-
|
49
|
-
]
|
50
|
-
|
51
|
-
$biomart_go_2009= [
|
52
|
-
["GO BP ID", 'go_biological_process_id'],
|
53
|
-
["GO MF ID", 'go_molecular_function_id'],
|
54
|
-
["GO CC ID", 'go_cellular_component_id'],
|
55
|
-
]
|
56
|
-
|
57
|
-
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
58
|
-
Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
|
59
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
60
|
-
|
41
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
42
|
+
Thread.current["namespace"] = $namespace
|
43
|
+
load Organism.rake_organism_helper
|
@@ -1,8 +1,3 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
1
|
$taxs = [10116]
|
7
2
|
$scientific_name = "Rattus norvegicus"
|
8
3
|
|
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
|
|
50
45
|
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
|
51
46
|
]
|
52
47
|
|
53
|
-
$namespace = File.basename(
|
54
|
-
Thread.current["namespace"] =
|
55
|
-
load
|
48
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
49
|
+
Thread.current["namespace"] = $namespace
|
50
|
+
load Organism.rake_organism_helper
|
@@ -0,0 +1,38 @@
|
|
1
|
+
$taxs = [559292,4932]
|
2
|
+
$scientific_name = "Saccharomyces cerevisiae"
|
3
|
+
$ensembl_domain = 'fungi'
|
4
|
+
#$ortholog_key = "yeast_ensembl_gene"
|
5
|
+
|
6
|
+
$biomart_db = 'scerevisiae_eg_gene'
|
7
|
+
|
8
|
+
$biomart_lexicon = [
|
9
|
+
[ 'Associated Gene Name' , "external_gene_name"],
|
10
|
+
]
|
11
|
+
|
12
|
+
$biomart_protein_identifiers = [
|
13
|
+
[ 'Protein ID', "protein_id" ],
|
14
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
15
|
+
[ 'Unigene ID', "unigene" ],
|
16
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
17
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
18
|
+
]
|
19
|
+
|
20
|
+
$biomart_probe_identifiers = [
|
21
|
+
]
|
22
|
+
|
23
|
+
$biomart_identifiers = [
|
24
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
25
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
26
|
+
[ 'Associated Gene Name', "external_gene_name" ],
|
27
|
+
[ 'Protein ID', "protein_id" ],
|
28
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
29
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
30
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
31
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
32
|
+
[ 'RefSeq DNA' , "refseq_dna"] ,
|
33
|
+
]
|
34
|
+
|
35
|
+
$namespace = File.basename(__FILE__).sub(/\.rake$/,'')
|
36
|
+
Thread.current["namespace"] = $namespace
|
37
|
+
Thread.current["ensembl_domain"] = $ensembl_domain
|
38
|
+
load Organism.rake_organism_helper
|
@@ -1,8 +1,11 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
|
2
|
+
|
1
3
|
require 'net/ftp'
|
4
|
+
require 'rbbt/sources/biomart'
|
5
|
+
require 'rbbt/sources/entrez'
|
6
|
+
require File.join(File.dirname(__FILE__), '../lib/helpers')
|
2
7
|
require 'rbbt/sources/ensembl_ftp'
|
3
8
|
|
4
|
-
#Thread.current['namespace'] = $namespace
|
5
|
-
|
6
9
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
7
10
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
8
11
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
@@ -77,6 +80,17 @@ $biomart_pfam= [
|
|
77
80
|
["Pfam Domain", 'pfam'],
|
78
81
|
]
|
79
82
|
|
83
|
+
$biomart_go= [
|
84
|
+
["GO ID", 'go_id'],
|
85
|
+
["GO Namespace", 'namespace_1003'],
|
86
|
+
]
|
87
|
+
|
88
|
+
$biomart_go_2009= [
|
89
|
+
["GO BP ID", 'go_biological_process_id'],
|
90
|
+
["GO MF ID", 'go_molecular_function_id'],
|
91
|
+
["GO CC ID", 'go_cellular_component_id'],
|
92
|
+
]
|
93
|
+
|
80
94
|
$biomart_gene_biotype= [
|
81
95
|
["Biotype", 'gene_biotype'],
|
82
96
|
]
|
@@ -91,7 +105,13 @@ $biomart_exons = [
|
|
91
105
|
#{{{ Rules
|
92
106
|
|
93
107
|
file 'entrez_taxids' do |t|
|
94
|
-
|
108
|
+
if $tax && $tax.any?
|
109
|
+
Misc.sensiblewrite(t.name, $taxs * "\n")
|
110
|
+
else
|
111
|
+
tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
|
112
|
+
taxs = tsv[$scientific_name] || []
|
113
|
+
Misc.sensiblewrite(t.name, taxs * "\n")
|
114
|
+
end
|
95
115
|
end
|
96
116
|
|
97
117
|
file 'scientific_name' do |t|
|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
|
|
104
124
|
Misc.sensiblewrite(t.name, $ortholog_key)
|
105
125
|
end
|
106
126
|
|
107
|
-
file 'identifiers' do |t|
|
127
|
+
file 'identifiers' => 'entrez_taxids' do |t|
|
128
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
108
129
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
|
109
130
|
identifiers.unnamed = true
|
110
131
|
|
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
|
|
116
137
|
end
|
117
138
|
|
118
139
|
name_pos = identifiers.identify_field "Associated Gene Name"
|
119
|
-
|
120
|
-
|
121
|
-
|
140
|
+
if tax_codes and tax_codes.any?
|
141
|
+
entrez2name = Entrez.entrez2name(tax_codes)
|
142
|
+
identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
|
143
|
+
names = values[name_pos] || []
|
122
144
|
|
123
|
-
|
124
|
-
|
125
|
-
|
145
|
+
matches = entrez.select do |e|
|
146
|
+
entrez2name.include?(e) && (names & entrez2name[e]).any?
|
147
|
+
end
|
126
148
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
149
|
+
if matches.any?
|
150
|
+
matches
|
151
|
+
else
|
152
|
+
entrez
|
153
|
+
end
|
131
154
|
end
|
132
155
|
end
|
133
156
|
|
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
|
|
147
170
|
identifiers = identifiers.reorder(:key, ordered_fields)
|
148
171
|
end
|
149
172
|
|
150
|
-
|
151
|
-
|
152
|
-
|
173
|
+
if tax_codes and tax_codes.any?
|
174
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
|
175
|
+
entrez_synonyms.key_field = "Entrez Gene ID"
|
176
|
+
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
153
177
|
|
154
|
-
|
178
|
+
identifiers.attach entrez_synonyms
|
179
|
+
end
|
155
180
|
|
156
181
|
identifiers.with_unnamed do
|
157
182
|
identifiers.each do |key, values|
|
158
183
|
values.each do |list|
|
184
|
+
list ||= []
|
159
185
|
list.reject!{|v| v.nil? or v.empty?}
|
160
186
|
list.uniq!
|
161
187
|
end
|
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
|
|
166
192
|
Misc.sensiblewrite(t.name, identifiers.to_s)
|
167
193
|
end
|
168
194
|
|
169
|
-
file 'lexicon' => 'identifiers' do |t|
|
195
|
+
file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
|
170
196
|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
197
|
+
tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
|
171
198
|
|
172
|
-
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep =>
|
199
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
|
173
200
|
entrez_description.key_field = "Entrez Gene ID"
|
174
201
|
entrez_description.fields = ["Entrez Gene Description"]
|
175
202
|
|
@@ -308,8 +335,9 @@ end
|
|
308
335
|
|
309
336
|
# {{{ Other info
|
310
337
|
|
311
|
-
file 'gene_pmids' do |t|
|
312
|
-
|
338
|
+
file 'gene_pmids' => 'entrez_taxids' do |t|
|
339
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
340
|
+
tsv = Entrez.entrez2pubmed(tax_codes)
|
313
341
|
text = "#: :namespace=#{Thread.current['namespace']}\n"
|
314
342
|
text += "#Entrez Gene ID\tPMID"
|
315
343
|
tsv.each do |gene, pmids|
|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
417
445
|
|
418
446
|
gene_go.monitor = true
|
419
447
|
gene_go.process "GO ID" do |key, go_id, values|
|
420
|
-
clean =
|
448
|
+
clean = NamedArray.zip_fields(values).select do |id, type|
|
421
449
|
type == "biological_process"
|
422
450
|
end
|
423
451
|
clean.collect{|id, type| id}
|
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
|
|
487
515
|
end
|
488
516
|
|
489
517
|
file 'chromosomes' do |t|
|
490
|
-
|
518
|
+
tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
491
519
|
|
492
|
-
Misc.sensiblewrite(t.name,
|
520
|
+
Misc.sensiblewrite(t.name, tsv.keys * "\n")
|
493
521
|
end
|
494
522
|
|
495
523
|
file 'blacklist_chromosomes' => 'chromosomes' do |t|
|
@@ -511,6 +539,15 @@ end
|
|
511
539
|
|
512
540
|
rule /^chromosome_.*/ do |t|
|
513
541
|
chr = t.name.match(/chromosome_(.*)/)[1]
|
542
|
+
path = File.expand_path(t.name)
|
543
|
+
dirname = File.dirname(path)
|
544
|
+
|
545
|
+
organism = File.basename(dirname)
|
546
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
547
|
+
archive = organism
|
548
|
+
organism = File.basename(File.dirname(dirname))
|
549
|
+
organism = File.join(organism, archive)
|
550
|
+
end
|
514
551
|
|
515
552
|
# HACK: Skip LRG chromosomes
|
516
553
|
raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
|
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
|
|
519
556
|
|
520
557
|
release = Ensembl.releases[archive]
|
521
558
|
|
522
|
-
|
559
|
+
fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
|
560
|
+
server, _, path = fasta_url.partition("/")
|
561
|
+
path = "/" + path
|
562
|
+
|
563
|
+
ftp = Net::FTP.new(server)
|
523
564
|
ftp.passive = true
|
524
565
|
ftp.login
|
525
|
-
|
526
|
-
ftp.chdir("pub/current_fasta/")
|
527
|
-
else
|
528
|
-
ftp.chdir("pub/#{ release }/fasta/")
|
529
|
-
end
|
530
|
-
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
566
|
+
ftp.chdir(path)
|
531
567
|
ftp.chdir('dna')
|
532
|
-
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
533
|
-
|
534
|
-
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
535
568
|
|
536
|
-
|
569
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
|
570
|
+
if file
|
571
|
+
Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
|
537
572
|
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
573
|
+
Misc.lock t.name + '.rake' do
|
574
|
+
TmpFile.with_file do |tmpfile|
|
575
|
+
ftp.getbinaryfile(file, tmpfile)
|
576
|
+
Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
577
|
+
ftp.close
|
578
|
+
end
|
543
579
|
end
|
580
|
+
else
|
581
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
|
582
|
+
Misc.lock t.name + '.rake' do
|
583
|
+
TmpFile.with_file do |tmpfile|
|
584
|
+
ftp.getbinaryfile(file, tmpfile)
|
585
|
+
txt = Open.read(tmpfile, :gzip => true)
|
586
|
+
|
587
|
+
chr_txt = []
|
588
|
+
|
589
|
+
in_chr = false
|
590
|
+
txt.split("\n").each do |line|
|
591
|
+
if line.start_with?(">#{chr}")
|
592
|
+
in_chr = true
|
593
|
+
elsif line.start_with?(">")
|
594
|
+
in_chr = false
|
595
|
+
else
|
596
|
+
chr_txt << line if in_chr
|
597
|
+
end
|
598
|
+
end
|
599
|
+
Misc.sensiblewrite(t.name, chr_txt * "" )
|
600
|
+
ftp.close
|
601
|
+
end
|
602
|
+
end
|
603
|
+
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
544
604
|
end
|
545
605
|
end
|
546
606
|
|
@@ -584,6 +644,16 @@ end
|
|
584
644
|
require 'bio'
|
585
645
|
|
586
646
|
file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
|
647
|
+
path = File.expand_path(t.name)
|
648
|
+
dirname = File.dirname(path)
|
649
|
+
|
650
|
+
organism = File.basename(dirname)
|
651
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
652
|
+
archive = organism
|
653
|
+
organism = File.basename(File.dirname(dirname))
|
654
|
+
organism = File.join(organism, archive)
|
655
|
+
end
|
656
|
+
|
587
657
|
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
|
588
658
|
|
589
659
|
chr_transcript_ranges ||= {}
|
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
|
|
616
686
|
chr_transcript_ranges.each do |chr, transcript_ranges|
|
617
687
|
begin
|
618
688
|
raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
|
619
|
-
|
620
|
-
|
621
|
-
p.
|
622
|
-
chr_str = p.
|
689
|
+
pkgdir = Thread.current["resource"]
|
690
|
+
p = pkgdir[organism]["chromosome_#{chr}"]
|
691
|
+
p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
|
692
|
+
chr_str = p.read
|
623
693
|
rescue Exception
|
624
694
|
Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
|
625
695
|
raise $! unless $!.message =~ /not supported/
|
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
656
726
|
organism = File.join(organism, archive)
|
657
727
|
end
|
658
728
|
|
659
|
-
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :
|
729
|
+
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
|
660
730
|
|
661
731
|
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
662
732
|
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
670
740
|
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
671
741
|
end
|
672
742
|
|
673
|
-
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :
|
674
|
-
transcript_exons = TSV.open("./transcript_exons", :
|
675
|
-
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :
|
743
|
+
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
744
|
+
transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
|
745
|
+
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
|
676
746
|
|
677
747
|
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
|
678
748
|
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
|
@@ -719,12 +789,13 @@ end
|
|
719
789
|
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
|
720
790
|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
721
791
|
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
722
|
-
transcript_phase
|
792
|
+
transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
|
723
793
|
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
724
794
|
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
725
795
|
|
726
796
|
|
727
797
|
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
|
798
|
+
transcript_sequence.monitor = true
|
728
799
|
transcript_sequence.through do |transcript, sequence|
|
729
800
|
protein = transcript_protein[transcript]
|
730
801
|
next if protein.nil? or protein.empty?
|
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
|
|
777
848
|
uni_seq = UniProt.get_uniprot_sequence(uni)
|
778
849
|
ensps = uni2ensps[uni]
|
779
850
|
next if ensps.nil? or ensps.empty?
|
851
|
+
|
780
852
|
best_ensp = ensps.sort_by do |ensp|
|
781
853
|
ensp_seq = ensp2seq[ensp]
|
782
854
|
if ensp_seq
|
@@ -829,3 +901,4 @@ file 'cdna_fasta' do |t|
|
|
829
901
|
Open.download(url, "#{t.name}.gz")
|
830
902
|
nil
|
831
903
|
end
|
904
|
+
|
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
|
3
3
|
require 'rbbt/sources/pubmed'
|
4
4
|
require 'test/unit'
|
5
5
|
require 'rbbt/sources/biomart'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
require 'rbbt/util/tmpfile'
|
7
8
|
require 'test/unit'
|
8
9
|
|
9
10
|
class TestBioMart < Test::Unit::TestCase
|
10
11
|
|
11
12
|
def setup
|
12
|
-
BioMart.set_archive
|
13
|
+
BioMart.set_archive "feb2014"
|
13
14
|
end
|
14
15
|
|
15
16
|
def teardown
|
16
17
|
BioMart.unset_archive
|
17
18
|
end
|
18
19
|
|
19
|
-
def
|
20
|
+
def test_get_Sce
|
20
21
|
assert_raise BioMart::QueryError do
|
21
22
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
22
23
|
end
|
23
24
|
|
24
|
-
|
25
|
+
BioMart.set_archive "feb2023-fungi"
|
26
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
25
27
|
tsv = TSV.open data, :double, :merge => true
|
26
|
-
assert(tsv['852236'][0].include? 'CAA84864')
|
28
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
27
29
|
|
28
|
-
data = BioMart.get('
|
30
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
|
29
31
|
tsv = TSV.open data, :double, :merge => true
|
30
32
|
assert(tsv['852236'][1].include? 'YBL044W')
|
31
33
|
end
|
32
34
|
|
33
|
-
def
|
35
|
+
def test_get_Hsa
|
36
|
+
Log.severity = 0
|
37
|
+
data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
38
|
+
tsv = TSV.open data, :double, :merge => true
|
39
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
def test_query
|
34
44
|
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
35
45
|
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
36
46
|
|
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
|
|
41
51
|
end
|
42
52
|
end
|
43
53
|
|
54
|
+
def __test_transcrip_exons
|
55
|
+
Log.with_severity 1 do
|
56
|
+
TmpFile.with_file do |f|
|
57
|
+
fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
|
58
|
+
main = fields[0]
|
59
|
+
attrs = fields.values_at(1, 2)
|
60
|
+
attrs_first = [attrs.first]
|
61
|
+
attrs_last = [attrs.last]
|
62
|
+
database = 'hsapiens_gene_ensembl'
|
63
|
+
|
64
|
+
filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
65
|
+
ppp Open.read(filename)
|
66
|
+
|
67
|
+
filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
68
|
+
ppp Open.read(filename)
|
69
|
+
|
70
|
+
filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
71
|
+
ppp Open.read(filename)
|
72
|
+
|
73
|
+
filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
|
74
|
+
ppp Open.read(filename)
|
75
|
+
|
76
|
+
data = TSV.open Open.open(filename)
|
77
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
44
82
|
def test_tsv
|
45
83
|
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
46
84
|
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestEnsemblFTP < Test::Unit::TestCase
|
5
|
+
def test_ftp_for
|
6
|
+
assert_nothing_raised do
|
7
|
+
Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
|
|
5
5
|
|
6
6
|
class TestOrganism < Test::Unit::TestCase
|
7
7
|
|
8
|
-
def
|
8
|
+
def _test_known_ids
|
9
9
|
assert Organism.known_ids("Hsa").include?("Associated Gene Name")
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def _test_location
|
13
13
|
assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
16
|
+
def _test_identifiers
|
17
17
|
assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
18
18
|
assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
19
19
|
assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
20
20
|
end
|
21
21
|
|
22
|
-
def
|
22
|
+
def _test_lexicon
|
23
23
|
assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_guess_id
|
27
27
|
ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
|
28
28
|
gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
|
29
29
|
assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
|
30
30
|
assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
|
31
31
|
end
|
32
32
|
|
33
|
-
def
|
33
|
+
def _test_organisms
|
34
34
|
assert Organism.organisms.include? "Hsa"
|
35
35
|
assert_equal "Hsa", Organism.organism("Homo sapiens")
|
36
36
|
end
|
37
37
|
|
38
|
-
def
|
38
|
+
def _test_attach_translations
|
39
39
|
tsv = TSV.setup({"1020" => []}, :type => :list)
|
40
40
|
tsv.key_field = "Entrez Gene ID"
|
41
41
|
tsv.fields = []
|
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
|
|
47
47
|
assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
50
|
+
def _test_entrez_taxids
|
51
51
|
assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
54
|
+
def _test_lift_over
|
55
55
|
mutation_19 = "19:21131664:T"
|
56
56
|
mutation_18 = "19:20923504:T"
|
57
|
-
source_build =
|
57
|
+
source_build = "Hsa/feb2014"
|
58
58
|
target_build = "Hsa/may2009"
|
59
59
|
|
60
60
|
assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
|
61
61
|
assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
|
62
62
|
end
|
63
63
|
|
64
|
-
def
|
64
|
+
def _test_orhtolog
|
65
65
|
require 'rbbt/entity/gene'
|
66
66
|
assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
|
67
67
|
end
|
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
|
|
70
70
|
assert Organism.chromosome_sizes["2"].to_i > 10_000_000
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
73
|
+
def _test_build_organism
|
74
74
|
assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
|
75
75
|
assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
|
76
76
|
assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
|
77
77
|
end
|
78
78
|
|
79
|
-
#def
|
79
|
+
#def _test_genes_at_chromosome
|
80
80
|
# pos = [12, 117799500]
|
81
81
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
82
82
|
#end
|
83
83
|
|
84
|
-
#def
|
84
|
+
#def _test_genes_at_chromosome_array
|
85
85
|
# pos = [12, [117799500, 106903900]]
|
86
86
|
# assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
87
87
|
#end
|
88
88
|
|
89
|
-
#def
|
89
|
+
#def _test_genes_at_genomic_positions
|
90
90
|
# pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
|
91
91
|
# assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
|
92
92
|
#end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.4.
|
4
|
+
version: 3.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -120,10 +120,10 @@ files:
|
|
120
120
|
- share/install/KEGG/Rakefile
|
121
121
|
- share/install/Matador/Rakefile
|
122
122
|
- share/install/NCI/Rakefile
|
123
|
-
- share/install/Organism/Hsa
|
124
|
-
- share/install/Organism/Mmu
|
125
|
-
- share/install/Organism/Rno
|
126
|
-
- share/install/Organism/Sce
|
123
|
+
- share/install/Organism/Hsa.rake
|
124
|
+
- share/install/Organism/Mmu.rake
|
125
|
+
- share/install/Organism/Rno.rake
|
126
|
+
- share/install/Organism/Sce.rake
|
127
127
|
- share/install/Organism/organism_helpers.rb
|
128
128
|
- share/install/PharmaGKB/Rakefile
|
129
129
|
- share/install/Pina/Rakefile
|
@@ -133,6 +133,7 @@ files:
|
|
133
133
|
- share/install/lib/rake_helper.rb
|
134
134
|
- test/rbbt/sources/test_HPRD.rb
|
135
135
|
- test/rbbt/sources/test_biomart.rb
|
136
|
+
- test/rbbt/sources/test_ensembl_ftp.rb
|
136
137
|
- test/rbbt/sources/test_entrez.rb
|
137
138
|
- test/rbbt/sources/test_go.rb
|
138
139
|
- test/rbbt/sources/test_gscholar.rb
|
@@ -166,13 +167,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
167
|
- !ruby/object:Gem::Version
|
167
168
|
version: '0'
|
168
169
|
requirements: []
|
169
|
-
rubygems_version: 3.5.
|
170
|
+
rubygems_version: 3.5.23
|
170
171
|
signing_key:
|
171
172
|
specification_version: 4
|
172
173
|
summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
|
173
174
|
test_files:
|
174
175
|
- test/rbbt/sources/test_HPRD.rb
|
175
176
|
- test/rbbt/sources/test_biomart.rb
|
177
|
+
- test/rbbt/sources/test_ensembl_ftp.rb
|
176
178
|
- test/rbbt/sources/test_entrez.rb
|
177
179
|
- test/rbbt/sources/test_go.rb
|
178
180
|
- test/rbbt/sources/test_gscholar.rb
|
@@ -1,52 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
-
require 'rbbt/sources/biomart'
|
3
|
-
require 'rbbt/sources/entrez'
|
4
|
-
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
|
-
|
6
|
-
$taxs = [559292,4932]
|
7
|
-
$scientific_name = "Saccharomyces cerevisiae"
|
8
|
-
#$ortholog_key = "yeast_ensembl_gene"
|
9
|
-
|
10
|
-
$biomart_db = 'scerevisiae_gene_ensembl'
|
11
|
-
|
12
|
-
$biomart_lexicon = [
|
13
|
-
[ 'Associated Gene Name' , "external_gene_id"],
|
14
|
-
]
|
15
|
-
|
16
|
-
$biomart_protein_identifiers = [
|
17
|
-
[ 'Protein ID', "protein_id" ],
|
18
|
-
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
19
|
-
[ 'Unigene ID', "unigene" ],
|
20
|
-
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
21
|
-
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
22
|
-
]
|
23
|
-
|
24
|
-
$biomart_probe_identifiers = [
|
25
|
-
]
|
26
|
-
|
27
|
-
$biomart_identifiers = [
|
28
|
-
[ 'Entrez Gene ID', "entrezgene"],
|
29
|
-
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
30
|
-
[ 'Associated Gene Name', "external_gene_id" ],
|
31
|
-
[ 'Protein ID', "protein_id" ],
|
32
|
-
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
33
|
-
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
34
|
-
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
35
|
-
[ 'EMBL (Genbank) ID' , "embl"] ,
|
36
|
-
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
37
|
-
]
|
38
|
-
|
39
|
-
$biomart_go= [
|
40
|
-
["GO ID", 'go_id'],
|
41
|
-
["GO Namespace", 'namespace_1003'],
|
42
|
-
]
|
43
|
-
|
44
|
-
$biomart_go_2009= [
|
45
|
-
["GO BP ID", 'go_biological_process_id'],
|
46
|
-
["GO MF ID", 'go_molecular_function_id'],
|
47
|
-
["GO CC ID", 'go_cellular_component_id'],
|
48
|
-
]
|
49
|
-
|
50
|
-
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
51
|
-
Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
|
52
|
-
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|