rbbt-sources 3.1.38 → 3.1.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/etc/build_organism +8 -0
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +6 -6
- data/lib/rbbt/sources/organism.rb +36 -10
- data/share/install/Organism/organism_helpers.rb +22 -8
- data/test/rbbt/sources/test_organism.rb +7 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 160b144b03f1da99d829748d36b74f96d31cf2ee04fb8751b7a9fa826e5f35ad
|
4
|
+
data.tar.gz: 5228bffd99260248fc9a46ee773d17f7aaec2753f939814d5f9a35530f228276
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2c0702241dd63d5ee9858f3bab1c90c02de911f459ea2fb6c61e14e82f1e6f17ebb897f8ed9058eb002c89af415b16c54ed713127e154f3b8d87e8bfbe4cdde
|
7
|
+
data.tar.gz: 6544095d31861d13dfd89fed3289c8d2877e5e96ec35d7dbee4db84512f5b0a47bca7a6a99a3b9a7abfb8c106fbd2ec7ab61340290a78a1165b9793cdd50dc6c
|
data/etc/build_organism
ADDED
data/etc/organisms
CHANGED
@@ -33,7 +33,7 @@ module Ensembl
|
|
33
33
|
ftp.passive = true
|
34
34
|
ftp.login
|
35
35
|
ftp.chdir(File.join('pub', release, 'mysql'))
|
36
|
-
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
|
36
|
+
file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
|
37
37
|
ftp.close
|
38
38
|
end
|
39
39
|
[release, file]
|
@@ -53,17 +53,17 @@ module Ensembl
|
|
53
53
|
end
|
54
54
|
|
55
55
|
def self.url_for(organism, table)
|
56
|
-
"#{base_url(organism)}/#{table}.txt.gz"
|
56
|
+
"#{base_url(organism)}/#{table}.txt.gz.bz2"
|
57
57
|
end
|
58
58
|
|
59
59
|
def self.has_table?(organism, table)
|
60
|
-
sql_file =
|
60
|
+
sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
|
61
61
|
! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
|
62
62
|
end
|
63
63
|
|
64
64
|
def self.fields_for(organism, table)
|
65
|
-
sql_file =
|
66
|
-
|
65
|
+
sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
|
66
|
+
|
67
67
|
chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
|
68
68
|
chunk.scan(/^\s+`(.*?)`/).flatten
|
69
69
|
end
|
@@ -78,7 +78,7 @@ module Ensembl
|
|
78
78
|
options[:key_field] = key_pos
|
79
79
|
options[:fields] = field_pos
|
80
80
|
end
|
81
|
-
tsv = TSV.open(url, options)
|
81
|
+
tsv = TSV.open(CMD.cmd("wget '#{url}' -O - |bunzip2|gunzip", :pipe => true), options)
|
82
82
|
tsv.key_field = key_field
|
83
83
|
tsv.fields = fields
|
84
84
|
tsv
|
@@ -86,24 +86,50 @@ module Organism
|
|
86
86
|
else
|
87
87
|
'hg38'
|
88
88
|
end
|
89
|
+
when "Mmu"
|
90
|
+
"mm10"
|
89
91
|
else
|
90
|
-
raise "Only organism 'Hsa' (Homo sapiens)
|
92
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
91
93
|
end
|
92
94
|
end
|
93
95
|
|
94
|
-
def self.
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
96
|
+
def self.GRC_build(organism)
|
97
|
+
require 'rbbt/sources/ensembl_ftp'
|
98
|
+
return organism if organism =~ /^hg\d\d$/
|
99
|
+
|
100
|
+
return 'hg19' unless organism =~ /\//
|
101
|
+
|
102
|
+
species, date = organism.split("/")
|
103
|
+
|
104
|
+
case species
|
105
|
+
when "Hsa"
|
106
|
+
date = organism.split("/")[1]
|
107
|
+
|
108
|
+
release = Ensembl.releases[date]
|
109
|
+
|
110
|
+
release_number = release.sub(/.*-/,'').to_i
|
111
|
+
if release_number <= 54
|
112
|
+
'GRCh36'
|
113
|
+
elsif release_number <= 75
|
114
|
+
'GRCh37'
|
115
|
+
else
|
116
|
+
'GRCh38'
|
117
|
+
end
|
118
|
+
when "Mmu"
|
119
|
+
"GRCm38"
|
102
120
|
else
|
103
|
-
raise
|
121
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
104
122
|
end
|
105
123
|
end
|
106
124
|
|
125
|
+
def self.organism_for_build(build)
|
126
|
+
build = build.sub('_noalt', '')
|
127
|
+
|
128
|
+
build_organism = Rbbt.etc.build_organism.tsv :type => :single
|
129
|
+
|
130
|
+
build_organism[build]
|
131
|
+
end
|
132
|
+
|
107
133
|
def self.liftOver(positions, source, target)
|
108
134
|
|
109
135
|
source_hg = hg_build(source)
|
@@ -648,12 +648,12 @@ end
|
|
648
648
|
file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
649
649
|
path = File.expand_path(t.name)
|
650
650
|
dirname = File.dirname(path)
|
651
|
-
organism = File.basename(dirname)
|
652
651
|
|
653
|
-
|
654
|
-
|
652
|
+
organism = File.basename(dirname)
|
653
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
654
|
+
archive = organism
|
655
655
|
organism = File.basename(File.dirname(dirname))
|
656
|
-
organism = File.join(organism,
|
656
|
+
organism = File.join(organism, archive)
|
657
657
|
end
|
658
658
|
|
659
659
|
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
|
@@ -788,12 +788,19 @@ end
|
|
788
788
|
file 'gene_set' do |t|
|
789
789
|
path = File.expand_path(t.name)
|
790
790
|
dirname = File.dirname(path)
|
791
|
+
|
791
792
|
organism = File.basename(dirname)
|
793
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
794
|
+
archive = organism
|
795
|
+
organism = File.basename(File.dirname(dirname))
|
796
|
+
organism = File.join(organism, archive)
|
797
|
+
end
|
792
798
|
|
793
799
|
release = Ensembl.org2release(organism)
|
794
800
|
num = release.split("-").last
|
795
|
-
build_code =
|
796
|
-
|
801
|
+
build_code = Organism.GRC_build(organism)
|
802
|
+
scientific_name = $scientific_name
|
803
|
+
url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
|
797
804
|
CMD.cmd("wget '#{url}' -O #{t.name}.gz")
|
798
805
|
nil
|
799
806
|
end
|
@@ -801,12 +808,19 @@ end
|
|
801
808
|
file 'cdna_fasta' do |t|
|
802
809
|
path = File.expand_path(t.name)
|
803
810
|
dirname = File.dirname(path)
|
811
|
+
|
804
812
|
organism = File.basename(dirname)
|
813
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
814
|
+
archive = organism
|
815
|
+
organism = File.basename(File.dirname(dirname))
|
816
|
+
organism = File.join(organism, archive)
|
817
|
+
end
|
805
818
|
|
806
819
|
release = Ensembl.org2release(organism)
|
807
820
|
num = release.split("-").last
|
808
|
-
build_code =
|
809
|
-
|
821
|
+
build_code = Organism.GRC_build(organism)
|
822
|
+
scientific_name = Organism.scientific_name(organism)
|
823
|
+
url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
|
810
824
|
CMD.cmd("wget '#{url}' -O #{t.name}.gz")
|
811
825
|
nil
|
812
826
|
end
|
@@ -14,7 +14,7 @@ class TestOrganism < Test::Unit::TestCase
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_identifiers
|
17
|
-
assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
17
|
+
assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
18
18
|
assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
19
19
|
assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
20
20
|
end
|
@@ -70,6 +70,12 @@ class TestOrganism < Test::Unit::TestCase
|
|
70
70
|
assert Organism.chromosome_sizes["2"].to_i > 10_000_000
|
71
71
|
end
|
72
72
|
|
73
|
+
def test_build_organism
|
74
|
+
assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
|
75
|
+
assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
|
76
|
+
assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
|
77
|
+
end
|
78
|
+
|
73
79
|
#def test_genes_at_chromosome
|
74
80
|
# pos = [12, 117799500]
|
75
81
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.39
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-05
|
11
|
+
date: 2019-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -88,6 +88,7 @@ extra_rdoc_files: []
|
|
88
88
|
files:
|
89
89
|
- etc/allowed_biomart_archives
|
90
90
|
- etc/biomart/missing_in_archive
|
91
|
+
- etc/build_organism
|
91
92
|
- etc/organisms
|
92
93
|
- etc/xena_hubs
|
93
94
|
- lib/rbbt/sources/CASCADE.rb
|