rbbt-sources 3.1.38 → 3.1.39

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f0b45df8e4ef43f18147f001595817df05dddc6ef3b43dcea3b7c17a1d379163
4
- data.tar.gz: 3b1ac66222ca9dd9d0342f97917cc3d969fc566b2c068a319101ec6fe7133f58
3
+ metadata.gz: 160b144b03f1da99d829748d36b74f96d31cf2ee04fb8751b7a9fa826e5f35ad
4
+ data.tar.gz: 5228bffd99260248fc9a46ee773d17f7aaec2753f939814d5f9a35530f228276
5
5
  SHA512:
6
- metadata.gz: b75e51b2338a0b3c55545ba788db502156126327ed920f86e448fbe239c2bb2f5a053b1009dd8e42bd7e6f425614e504c83a8ea894b070a07e9275fb1e5ea399
7
- data.tar.gz: 35b7852629a3dc69df65d8e3c226c4995051a2dd7e0a4435f485c947014e88ee9a97455f7270de902d0058db0a4d55315731e29ce2195cb0b79ceef390d92676
6
+ metadata.gz: e2c0702241dd63d5ee9858f3bab1c90c02de911f459ea2fb6c61e14e82f1e6f17ebb897f8ed9058eb002c89af415b16c54ed713127e154f3b8d87e8bfbe4cdde
7
+ data.tar.gz: 6544095d31861d13dfd89fed3289c8d2877e5e96ec35d7dbee4db84512f5b0a47bca7a6a99a3b9a7abfb8c106fbd2ec7ab61340290a78a1165b9793cdd50dc6c
data/etc/build_organism ADDED
@@ -0,0 +1,8 @@
1
+ #Build Organism
2
+ hg18 Hsa/may2008
3
+ hg19 Hsa/feb2014
4
+ b37 Hsa/feb2014
5
+ hg38 Hsa/may2017
6
+ GRCh38 Hsa/may2017
7
+ mm10 Mmu/may2017
8
+ GRCm38 Mmu/may2017
data/etc/organisms CHANGED
@@ -1 +1,2 @@
1
1
  Hsa
2
+ Mmu
@@ -33,7 +33,7 @@ module Ensembl
33
33
  ftp.passive = true
34
34
  ftp.login
35
35
  ftp.chdir(File.join('pub', release, 'mysql'))
36
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
36
+ file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
37
37
  ftp.close
38
38
  end
39
39
  [release, file]
@@ -53,17 +53,17 @@ module Ensembl
53
53
  end
54
54
 
55
55
  def self.url_for(organism, table)
56
- "#{base_url(organism)}/#{table}.txt.gz"
56
+ "#{base_url(organism)}/#{table}.txt.gz.bz2"
57
57
  end
58
58
 
59
59
  def self.has_table?(organism, table)
60
- sql_file = Open.read("#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz")
60
+ sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
61
61
  ! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
62
62
  end
63
63
 
64
64
  def self.fields_for(organism, table)
65
- sql_file = Open.read("#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz")
66
-
65
+ sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
66
+
67
67
  chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
68
68
  chunk.scan(/^\s+`(.*?)`/).flatten
69
69
  end
@@ -78,7 +78,7 @@ module Ensembl
78
78
  options[:key_field] = key_pos
79
79
  options[:fields] = field_pos
80
80
  end
81
- tsv = TSV.open(url, options)
81
+ tsv = TSV.open(CMD.cmd("wget '#{url}' -O - |bunzip2|gunzip", :pipe => true), options)
82
82
  tsv.key_field = key_field
83
83
  tsv.fields = fields
84
84
  tsv
@@ -86,24 +86,50 @@ module Organism
86
86
  else
87
87
  'hg38'
88
88
  end
89
+ when "Mmu"
90
+ "mm10"
89
91
  else
90
- raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/
92
+ raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
91
93
  end
92
94
  end
93
95
 
94
- def self.organism_for_build(build)
95
- case build.to_s
96
- when 'hg18'
97
- "Hsa/may2008"
98
- when 'hg19', 'b37'
99
- "Hsa/feb2014"
100
- when 'hg38'
101
- "Hsa/may2017"
96
+ def self.GRC_build(organism)
97
+ require 'rbbt/sources/ensembl_ftp'
98
+ return organism if organism =~ /^hg\d\d$/
99
+
100
+ return 'hg19' unless organism =~ /\//
101
+
102
+ species, date = organism.split("/")
103
+
104
+ case species
105
+ when "Hsa"
106
+ date = organism.split("/")[1]
107
+
108
+ release = Ensembl.releases[date]
109
+
110
+ release_number = release.sub(/.*-/,'').to_i
111
+ if release_number <= 54
112
+ 'GRCh36'
113
+ elsif release_number <= 75
114
+ 'GRCh37'
115
+ else
116
+ 'GRCh38'
117
+ end
118
+ when "Mmu"
119
+ "GRCm38"
102
120
  else
103
- raise RbbtException, "Unknown organism build #{build}"
121
+ raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
104
122
  end
105
123
  end
106
124
 
125
+ def self.organism_for_build(build)
126
+ build = build.sub('_noalt', '')
127
+
128
+ build_organism = Rbbt.etc.build_organism.tsv :type => :single
129
+
130
+ build_organism[build]
131
+ end
132
+
107
133
  def self.liftOver(positions, source, target)
108
134
 
109
135
  source_hg = hg_build(source)
@@ -648,12 +648,12 @@ end
648
648
  file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
649
649
  path = File.expand_path(t.name)
650
650
  dirname = File.dirname(path)
651
- organism = File.basename(dirname)
652
651
 
653
- if organism =~ /[a-z]{3}20[0-9]{2}/
654
- build = organism
652
+ organism = File.basename(dirname)
653
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
654
+ archive = organism
655
655
  organism = File.basename(File.dirname(dirname))
656
- organism = File.join(organism, build)
656
+ organism = File.join(organism, archive)
657
657
  end
658
658
 
659
659
  translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
@@ -788,12 +788,19 @@ end
788
788
  file 'gene_set' do |t|
789
789
  path = File.expand_path(t.name)
790
790
  dirname = File.dirname(path)
791
+
791
792
  organism = File.basename(dirname)
793
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
794
+ archive = organism
795
+ organism = File.basename(File.dirname(dirname))
796
+ organism = File.join(organism, archive)
797
+ end
792
798
 
793
799
  release = Ensembl.org2release(organism)
794
800
  num = release.split("-").last
795
- build_code = num.to_i > 75 ? "GRCh38" : "GRCh37"
796
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/homo_sapiens/Homo_sapiens.#{build_code}.#{num}.gtf.gz"
801
+ build_code = Organism.GRC_build(organism)
802
+ scientific_name = $scientific_name
803
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
797
804
  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
798
805
  nil
799
806
  end
@@ -801,12 +808,19 @@ end
801
808
  file 'cdna_fasta' do |t|
802
809
  path = File.expand_path(t.name)
803
810
  dirname = File.dirname(path)
811
+
804
812
  organism = File.basename(dirname)
813
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
814
+ archive = organism
815
+ organism = File.basename(File.dirname(dirname))
816
+ organism = File.join(organism, archive)
817
+ end
805
818
 
806
819
  release = Ensembl.org2release(organism)
807
820
  num = release.split("-").last
808
- build_code = num.to_i > 75 ? "GRCh38" : "GRCh37"
809
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/homo_sapiens/cdna/Homo_sapiens.#{build_code}.#{num}.cdna.all.fa.gz"
821
+ build_code = Organism.GRC_build(organism)
822
+ scientific_name = Organism.scientific_name(organism)
823
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
810
824
  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
811
825
  nil
812
826
  end
@@ -14,7 +14,7 @@ class TestOrganism < Test::Unit::TestCase
14
14
  end
15
15
 
16
16
  def test_identifiers
17
- assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
17
+ assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
@@ -70,6 +70,12 @@ class TestOrganism < Test::Unit::TestCase
70
70
  assert Organism.chromosome_sizes["2"].to_i > 10_000_000
71
71
  end
72
72
 
73
+ def test_build_organism
74
+ assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
75
+ assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
76
+ assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
77
+ end
78
+
73
79
  #def test_genes_at_chromosome
74
80
  # pos = [12, 117799500]
75
81
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.38
4
+ version: 3.1.39
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-22 00:00:00.000000000 Z
11
+ date: 2019-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -88,6 +88,7 @@ extra_rdoc_files: []
88
88
  files:
89
89
  - etc/allowed_biomart_archives
90
90
  - etc/biomart/missing_in_archive
91
+ - etc/build_organism
91
92
  - etc/organisms
92
93
  - etc/xena_hubs
93
94
  - lib/rbbt/sources/CASCADE.rb