rbbt-sources 3.1.38 → 3.1.39
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/etc/build_organism +8 -0
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +6 -6
- data/lib/rbbt/sources/organism.rb +36 -10
- data/share/install/Organism/organism_helpers.rb +22 -8
- data/test/rbbt/sources/test_organism.rb +7 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 160b144b03f1da99d829748d36b74f96d31cf2ee04fb8751b7a9fa826e5f35ad
|
4
|
+
data.tar.gz: 5228bffd99260248fc9a46ee773d17f7aaec2753f939814d5f9a35530f228276
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2c0702241dd63d5ee9858f3bab1c90c02de911f459ea2fb6c61e14e82f1e6f17ebb897f8ed9058eb002c89af415b16c54ed713127e154f3b8d87e8bfbe4cdde
|
7
|
+
data.tar.gz: 6544095d31861d13dfd89fed3289c8d2877e5e96ec35d7dbee4db84512f5b0a47bca7a6a99a3b9a7abfb8c106fbd2ec7ab61340290a78a1165b9793cdd50dc6c
|
data/etc/build_organism
ADDED
data/etc/organisms
CHANGED
@@ -33,7 +33,7 @@ module Ensembl
|
|
33
33
|
ftp.passive = true
|
34
34
|
ftp.login
|
35
35
|
ftp.chdir(File.join('pub', release, 'mysql'))
|
36
|
-
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
|
36
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
|
37
37
|
ftp.close
|
38
38
|
end
|
39
39
|
[release, file]
|
@@ -53,17 +53,17 @@ module Ensembl
|
|
53
53
|
end
|
54
54
|
|
55
55
|
def self.url_for(organism, table)
|
56
|
-
"#{base_url(organism)}/#{table}.txt.gz"
|
56
|
+
"#{base_url(organism)}/#{table}.txt.gz.bz2"
|
57
57
|
end
|
58
58
|
|
59
59
|
def self.has_table?(organism, table)
|
60
|
-
sql_file =
|
60
|
+
sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
|
61
61
|
! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
|
62
62
|
end
|
63
63
|
|
64
64
|
def self.fields_for(organism, table)
|
65
|
-
sql_file =
|
66
|
-
|
65
|
+
sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
|
66
|
+
|
67
67
|
chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
|
68
68
|
chunk.scan(/^\s+`(.*?)`/).flatten
|
69
69
|
end
|
@@ -78,7 +78,7 @@ module Ensembl
|
|
78
78
|
options[:key_field] = key_pos
|
79
79
|
options[:fields] = field_pos
|
80
80
|
end
|
81
|
-
tsv = TSV.open(url, options)
|
81
|
+
tsv = TSV.open(CMD.cmd("wget '#{url}' -O - |bunzip2|gunzip", :pipe => true), options)
|
82
82
|
tsv.key_field = key_field
|
83
83
|
tsv.fields = fields
|
84
84
|
tsv
|
@@ -86,24 +86,50 @@ module Organism
|
|
86
86
|
else
|
87
87
|
'hg38'
|
88
88
|
end
|
89
|
+
when "Mmu"
|
90
|
+
"mm10"
|
89
91
|
else
|
90
|
-
raise "Only organism 'Hsa' (Homo sapiens)
|
92
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
91
93
|
end
|
92
94
|
end
|
93
95
|
|
94
|
-
def self.
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
96
|
+
def self.GRC_build(organism)
|
97
|
+
require 'rbbt/sources/ensembl_ftp'
|
98
|
+
return organism if organism =~ /^hg\d\d$/
|
99
|
+
|
100
|
+
return 'hg19' unless organism =~ /\//
|
101
|
+
|
102
|
+
species, date = organism.split("/")
|
103
|
+
|
104
|
+
case species
|
105
|
+
when "Hsa"
|
106
|
+
date = organism.split("/")[1]
|
107
|
+
|
108
|
+
release = Ensembl.releases[date]
|
109
|
+
|
110
|
+
release_number = release.sub(/.*-/,'').to_i
|
111
|
+
if release_number <= 54
|
112
|
+
'GRCh36'
|
113
|
+
elsif release_number <= 75
|
114
|
+
'GRCh37'
|
115
|
+
else
|
116
|
+
'GRCh38'
|
117
|
+
end
|
118
|
+
when "Mmu"
|
119
|
+
"GRCm38"
|
102
120
|
else
|
103
|
-
raise
|
121
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
104
122
|
end
|
105
123
|
end
|
106
124
|
|
125
|
+
def self.organism_for_build(build)
|
126
|
+
build = build.sub('_noalt', '')
|
127
|
+
|
128
|
+
build_organism = Rbbt.etc.build_organism.tsv :type => :single
|
129
|
+
|
130
|
+
build_organism[build]
|
131
|
+
end
|
132
|
+
|
107
133
|
def self.liftOver(positions, source, target)
|
108
134
|
|
109
135
|
source_hg = hg_build(source)
|
@@ -648,12 +648,12 @@ end
|
|
648
648
|
file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
649
649
|
path = File.expand_path(t.name)
|
650
650
|
dirname = File.dirname(path)
|
651
|
-
organism = File.basename(dirname)
|
652
651
|
|
653
|
-
|
654
|
-
|
652
|
+
organism = File.basename(dirname)
|
653
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
654
|
+
archive = organism
|
655
655
|
organism = File.basename(File.dirname(dirname))
|
656
|
-
organism = File.join(organism,
|
656
|
+
organism = File.join(organism, archive)
|
657
657
|
end
|
658
658
|
|
659
659
|
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
|
@@ -788,12 +788,19 @@ end
|
|
788
788
|
file 'gene_set' do |t|
|
789
789
|
path = File.expand_path(t.name)
|
790
790
|
dirname = File.dirname(path)
|
791
|
+
|
791
792
|
organism = File.basename(dirname)
|
793
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
794
|
+
archive = organism
|
795
|
+
organism = File.basename(File.dirname(dirname))
|
796
|
+
organism = File.join(organism, archive)
|
797
|
+
end
|
792
798
|
|
793
799
|
release = Ensembl.org2release(organism)
|
794
800
|
num = release.split("-").last
|
795
|
-
build_code =
|
796
|
-
|
801
|
+
build_code = Organism.GRC_build(organism)
|
802
|
+
scientific_name = $scientific_name
|
803
|
+
url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
|
797
804
|
CMD.cmd("wget '#{url}' -O #{t.name}.gz")
|
798
805
|
nil
|
799
806
|
end
|
@@ -801,12 +808,19 @@ end
|
|
801
808
|
file 'cdna_fasta' do |t|
|
802
809
|
path = File.expand_path(t.name)
|
803
810
|
dirname = File.dirname(path)
|
811
|
+
|
804
812
|
organism = File.basename(dirname)
|
813
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
814
|
+
archive = organism
|
815
|
+
organism = File.basename(File.dirname(dirname))
|
816
|
+
organism = File.join(organism, archive)
|
817
|
+
end
|
805
818
|
|
806
819
|
release = Ensembl.org2release(organism)
|
807
820
|
num = release.split("-").last
|
808
|
-
build_code =
|
809
|
-
|
821
|
+
build_code = Organism.GRC_build(organism)
|
822
|
+
scientific_name = Organism.scientific_name(organism)
|
823
|
+
url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
|
810
824
|
CMD.cmd("wget '#{url}' -O #{t.name}.gz")
|
811
825
|
nil
|
812
826
|
end
|
@@ -14,7 +14,7 @@ class TestOrganism < Test::Unit::TestCase
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_identifiers
|
17
|
-
assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
17
|
+
assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
18
18
|
assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
19
19
|
assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
20
20
|
end
|
@@ -70,6 +70,12 @@ class TestOrganism < Test::Unit::TestCase
|
|
70
70
|
assert Organism.chromosome_sizes["2"].to_i > 10_000_000
|
71
71
|
end
|
72
72
|
|
73
|
+
def test_build_organism
|
74
|
+
assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
|
75
|
+
assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
|
76
|
+
assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
|
77
|
+
end
|
78
|
+
|
73
79
|
#def test_genes_at_chromosome
|
74
80
|
# pos = [12, 117799500]
|
75
81
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.38
|
4
|
+
version: 3.1.39
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-05
|
11
|
+
date: 2019-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -88,6 +88,7 @@ extra_rdoc_files: []
|
|
88
88
|
files:
|
89
89
|
- etc/allowed_biomart_archives
|
90
90
|
- etc/biomart/missing_in_archive
|
91
|
+
- etc/build_organism
|
91
92
|
- etc/organisms
|
92
93
|
- etc/xena_hubs
|
93
94
|
- lib/rbbt/sources/CASCADE.rb
|