rbbt-sources 3.1.38 → 3.1.39

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f0b45df8e4ef43f18147f001595817df05dddc6ef3b43dcea3b7c17a1d379163
4
- data.tar.gz: 3b1ac66222ca9dd9d0342f97917cc3d969fc566b2c068a319101ec6fe7133f58
3
+ metadata.gz: 160b144b03f1da99d829748d36b74f96d31cf2ee04fb8751b7a9fa826e5f35ad
4
+ data.tar.gz: 5228bffd99260248fc9a46ee773d17f7aaec2753f939814d5f9a35530f228276
5
5
  SHA512:
6
- metadata.gz: b75e51b2338a0b3c55545ba788db502156126327ed920f86e448fbe239c2bb2f5a053b1009dd8e42bd7e6f425614e504c83a8ea894b070a07e9275fb1e5ea399
7
- data.tar.gz: 35b7852629a3dc69df65d8e3c226c4995051a2dd7e0a4435f485c947014e88ee9a97455f7270de902d0058db0a4d55315731e29ce2195cb0b79ceef390d92676
6
+ metadata.gz: e2c0702241dd63d5ee9858f3bab1c90c02de911f459ea2fb6c61e14e82f1e6f17ebb897f8ed9058eb002c89af415b16c54ed713127e154f3b8d87e8bfbe4cdde
7
+ data.tar.gz: 6544095d31861d13dfd89fed3289c8d2877e5e96ec35d7dbee4db84512f5b0a47bca7a6a99a3b9a7abfb8c106fbd2ec7ab61340290a78a1165b9793cdd50dc6c
@@ -0,0 +1,8 @@
1
+ #Build Organism
2
+ hg18 Hsa/may2008
3
+ hg19 Hsa/feb2014
4
+ b37 Hsa/feb2014
5
+ hg38 Hsa/may2017
6
+ GRCh38 Hsa/may2017
7
+ mm10 Mmu/may2017
8
+ GRCm38 Mmu/may2017
data/etc/organisms CHANGED
@@ -1 +1,2 @@
1
1
  Hsa
2
+ Mmu
@@ -33,7 +33,7 @@ module Ensembl
33
33
  ftp.passive = true
34
34
  ftp.login
35
35
  ftp.chdir(File.join('pub', release, 'mysql'))
36
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
36
+ file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
37
37
  ftp.close
38
38
  end
39
39
  [release, file]
@@ -53,17 +53,17 @@ module Ensembl
53
53
  end
54
54
 
55
55
  def self.url_for(organism, table)
56
- "#{base_url(organism)}/#{table}.txt.gz"
56
+ "#{base_url(organism)}/#{table}.txt.gz.bz2"
57
57
  end
58
58
 
59
59
  def self.has_table?(organism, table)
60
- sql_file = Open.read("#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz")
60
+ sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
61
61
  ! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
62
62
  end
63
63
 
64
64
  def self.fields_for(organism, table)
65
- sql_file = Open.read("#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz")
66
-
65
+ sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
66
+
67
67
  chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
68
68
  chunk.scan(/^\s+`(.*?)`/).flatten
69
69
  end
@@ -78,7 +78,7 @@ module Ensembl
78
78
  options[:key_field] = key_pos
79
79
  options[:fields] = field_pos
80
80
  end
81
- tsv = TSV.open(url, options)
81
+ tsv = TSV.open(CMD.cmd("wget '#{url}' -O - |bunzip2|gunzip", :pipe => true), options)
82
82
  tsv.key_field = key_field
83
83
  tsv.fields = fields
84
84
  tsv
@@ -86,24 +86,50 @@ module Organism
86
86
  else
87
87
  'hg38'
88
88
  end
89
+ when "Mmu"
90
+ "mm10"
89
91
  else
90
- raise "Only organism 'Hsa' (Homo sapiens) supported" unless organism =~ /^Hsa/
92
+ raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
91
93
  end
92
94
  end
93
95
 
94
- def self.organism_for_build(build)
95
- case build.to_s
96
- when 'hg18'
97
- "Hsa/may2008"
98
- when 'hg19', 'b37'
99
- "Hsa/feb2014"
100
- when 'hg38'
101
- "Hsa/may2017"
96
# Maps an organism code to a genome build name.
#
# * A value matching /^hg\d\d$/ is returned unchanged.
# * A value without a "/" yields 'hg19'.
# * "Hsa/<date>": the Ensembl release registered for <date> is looked up in
#   Ensembl.releases and the number after its last "-" selects the build:
#   up to 54 gives 'GRCh36', up to 75 gives 'GRCh37', anything later 'GRCh38'.
# * "Mmu/<date>" always yields "GRCm38".
#
# Raises RuntimeError for any other species prefix.
#
# NOTE(review): the first two cases return hg-style names rather than GRC
# names -- confirm callers expect that.
def self.GRC_build(organism)
  require 'rbbt/sources/ensembl_ftp'
  return organism if organism =~ /^hg\d\d$/
  return 'hg19' unless organism =~ /\//

  species, date = organism.split("/")

  return "GRCm38" if species == "Mmu"
  raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported" unless species == "Hsa"

  # Everything up to and including the last "-" is dropped before the
  # remainder is read as the release number.
  release_number = Ensembl.releases[date].sub(/.*-/,'').to_i

  if release_number > 75
    'GRCh38'
  elsif release_number > 54
    'GRCh37'
  else
    'GRCh36'
  end
end
106
124
 
125
# Looks up the organism code (e.g. "Hsa/may2017") registered for a genome
# build name in the Rbbt.etc.build_organism table.
#
# build - build name as a String or Symbol. The first "_noalt" occurrence is
#         removed before the lookup.
#
# Returns whatever the table holds for that build; presumably nil for builds
# that are not listed (depends on the TSV lookup -- confirm).
def self.organism_for_build(build)
  # to_s keeps Symbol arguments working, as the previous case-based
  # implementation did.
  build = build.to_s.sub('_noalt', '')

  build_organism = Rbbt.etc.build_organism.tsv :type => :single

  build_organism[build]
end
132
+
107
133
  def self.liftOver(positions, source, target)
108
134
 
109
135
  source_hg = hg_build(source)
@@ -648,12 +648,12 @@ end
648
648
  file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
649
649
  path = File.expand_path(t.name)
650
650
  dirname = File.dirname(path)
651
- organism = File.basename(dirname)
652
651
 
653
- if organism =~ /[a-z]{3}20[0-9]{2}/
654
- build = organism
652
+ organism = File.basename(dirname)
653
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
654
+ archive = organism
655
655
  organism = File.basename(File.dirname(dirname))
656
- organism = File.join(organism, build)
656
+ organism = File.join(organism, archive)
657
657
  end
658
658
 
659
659
  translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
@@ -788,12 +788,19 @@ end
788
788
  file 'gene_set' do |t|
789
789
  path = File.expand_path(t.name)
790
790
  dirname = File.dirname(path)
791
+
791
792
  organism = File.basename(dirname)
793
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
794
+ archive = organism
795
+ organism = File.basename(File.dirname(dirname))
796
+ organism = File.join(organism, archive)
797
+ end
792
798
 
793
799
  release = Ensembl.org2release(organism)
794
800
  num = release.split("-").last
795
- build_code = num.to_i > 75 ? "GRCh38" : "GRCh37"
796
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/homo_sapiens/Homo_sapiens.#{build_code}.#{num}.gtf.gz"
801
+ build_code = Organism.GRC_build(organism)
802
+ scientific_name = $scientific_name
803
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
797
804
  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
798
805
  nil
799
806
  end
@@ -801,12 +808,19 @@ end
801
808
  file 'cdna_fasta' do |t|
802
809
  path = File.expand_path(t.name)
803
810
  dirname = File.dirname(path)
811
+
804
812
  organism = File.basename(dirname)
813
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
814
+ archive = organism
815
+ organism = File.basename(File.dirname(dirname))
816
+ organism = File.join(organism, archive)
817
+ end
805
818
 
806
819
  release = Ensembl.org2release(organism)
807
820
  num = release.split("-").last
808
- build_code = num.to_i > 75 ? "GRCh38" : "GRCh37"
809
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/homo_sapiens/cdna/Homo_sapiens.#{build_code}.#{num}.cdna.all.fa.gz"
821
+ build_code = Organism.GRC_build(organism)
822
+ scientific_name = Organism.scientific_name(organism)
823
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
810
824
  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
811
825
  nil
812
826
  end
@@ -14,7 +14,7 @@ class TestOrganism < Test::Unit::TestCase
14
14
  end
15
15
 
16
16
  def test_identifiers
17
- assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
17
+ assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
@@ -70,6 +70,12 @@ class TestOrganism < Test::Unit::TestCase
70
70
  assert Organism.chromosome_sizes["2"].to_i > 10_000_000
71
71
  end
72
72
 
73
# Checks that genome build names resolve to the expected organism codes.
def test_build_organism
  expectations = {
    'hg38' => 'Hsa/may2017',
    'b37' => 'Hsa/feb2014',
    'mm10' => 'Mmu/may2017'
  }

  expectations.each do |build, organism|
    assert_equal organism, Organism.organism_for_build(build)
  end
end
78
+
73
79
  #def test_genes_at_chromosome
74
80
  # pos = [12, 117799500]
75
81
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.38
4
+ version: 3.1.39
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-22 00:00:00.000000000 Z
11
+ date: 2019-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -88,6 +88,7 @@ extra_rdoc_files: []
88
88
  files:
89
89
  - etc/allowed_biomart_archives
90
90
  - etc/biomart/missing_in_archive
91
+ - etc/build_organism
91
92
  - etc/organisms
92
93
  - etc/xena_hubs
93
94
  - lib/rbbt/sources/CASCADE.rb