rbbt-sources 3.4.0 → 3.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87f97c6af3dab4a1b39cb258acdf9bf4d105df5703a04d6264f960ff79e81faa
4
- data.tar.gz: ff91f67bc0775e0a20678ede8eeb312fa1e7a42d18095c2d9bcb1a5c0e4fc000
3
+ metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
4
+ data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
5
5
  SHA512:
6
- metadata.gz: 0b23136a81511a1ad55d5bb2af5784fd74512b9355bf40023a5197180bf25b69aefa966a3dafc8347f1864da174637fa0a2f95bb687a8973a4b23f5e6778398d
7
- data.tar.gz: 4fca8a03899b980a18da56d9cdd56bc0136ce126c718ca61836fa3cf55313f77664b7ddecadba2ad45491c2e309604da3ce7288c5de70c660f498bfcc2849aec
6
+ metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
7
+ data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
@@ -1,8 +1,6 @@
1
1
  may2009
2
2
  feb2014
3
- may2017
4
- oct2018
5
- apr2019
3
+ may2015
4
+ sep2019
6
5
  feb2021
7
6
  feb2023
8
- oct2016
@@ -19,6 +19,8 @@
19
19
  - refseq_ncrna_predicted
20
20
  ">jun2015":
21
21
  - uniprot_swissprot_accession~uniprot_swissprot
22
+ ">jan2023":
23
+ - external_transcript_id~external_transcript_name
22
24
  <aug2014:
23
25
  - external_gene_name~external_gene_id
24
26
  may2010:
data/etc/build_organism CHANGED
@@ -2,7 +2,7 @@
2
2
  hg18 Hsa/may2008
3
3
  hg19 Hsa/feb2014
4
4
  b37 Hsa/feb2014
5
- hg38 Hsa/may2017
6
- GRCh38 Hsa/may2017
7
- mm10 Mmu/may2017
8
- GRCm38 Mmu/may2017
5
+ hg38 Hsa/feb2023
6
+ GRCh38 Hsa/feb2023
7
+ mm10 Mmu/feb2023
8
+ GRCm38 Mmu/feb2023
data/etc/organisms CHANGED
@@ -1,2 +1,3 @@
1
1
  Hsa
2
2
  Mmu
3
+ Sce
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
3
3
  require 'rbbt/tsv/attach'
4
4
  require 'rbbt/util/log'
5
5
  require 'cgi'
6
+ require 'rbbt/sources/organism'
6
7
 
7
8
  # This module interacts with BioMart. It performs queries to BioMart and
8
9
  # synthesises a hash with the results. Note that this module connects to the
@@ -13,7 +14,7 @@ module BioMart
13
14
 
14
15
  class BioMart::QueryError < StandardError; end
15
16
 
16
- BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='
17
+ BIOMART_URL = 'ensembl.org/biomart/martservice'
17
18
 
18
19
  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
19
20
 
@@ -22,7 +23,7 @@ module BioMart
22
23
  @@biomart_query_xml = <<-EOT
23
24
  <?xml version="1.0" encoding="UTF-8"?>
24
25
  <!DOCTYPE Query>
25
- <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
+ <Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
26
27
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
28
  <!--FILTERS-->
28
29
  <!--MAIN-->
@@ -36,14 +37,10 @@ module BioMart
36
37
  raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
37
38
  end
38
39
  Thread.current['archive'] = date
39
- Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
40
- Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
41
40
  end
42
41
 
43
42
  def self.unset_archive
44
- Log.debug "Restoring current version URL #{BIOMART_URL}"
45
43
  Thread.current['archive'] = nil
46
- Thread.current['archive_url'] = nil
47
44
  end
48
45
 
49
46
  def self.with_archive(data)
@@ -55,6 +52,21 @@ module BioMart
55
52
  end
56
53
  end
57
54
 
55
+ def self.final_url(query, archive = nil, ensembl_domain = nil)
56
+ url_domain = if archive.nil?
57
+ if ensembl_domain.nil?
58
+ 'www'
59
+ else
60
+ ensembl_domain
61
+ end
62
+ elsif ensembl_domain
63
+ [archive, ensembl_domain] * "-"
64
+ else
65
+ [archive, 'archive'] * "."
66
+ end
67
+ "http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
68
+ end
69
+
58
70
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
59
71
  open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
60
72
  repeats = true
@@ -75,11 +87,17 @@ module BioMart
75
87
 
76
88
  query = @@biomart_query_xml.dup
77
89
  query.sub!(/<!--DATABASE-->/,database)
90
+ if Thread.current["ensembl_domain"]
91
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
92
+ else
93
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
94
+ end
78
95
  query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
79
96
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
80
97
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
81
98
 
82
- url = Thread.current['archive_url'] ? Thread.current['archive_url'] + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
99
+ url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
100
+
83
101
 
84
102
  begin
85
103
  response = Open.read(url, open_options.dup)
@@ -105,10 +123,17 @@ module BioMart
105
123
 
106
124
  new_datafile = TmpFile.tmp_file
107
125
  if data.nil?
108
- TSV.merge_row_fields Open.open(result_file), new_datafile
126
+ Open.open(result_file) do |file|
127
+ Open.write(new_datafile, Open.collapse_stream(file))
128
+ end
109
129
  data = new_datafile
110
130
  else
111
- TSV.merge_different_fields data, result_file, new_datafile
131
+ Open.open(result_file) do |stream_result|
132
+ Open.open(data) do |stream_data|
133
+ Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
134
+ end
135
+ end
136
+ #TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
112
137
  FileUtils.rm data
113
138
  data = new_datafile
114
139
  end
@@ -142,9 +167,9 @@ module BioMart
142
167
 
143
168
  IndiferentHash.setup(open_options)
144
169
 
145
- Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
170
+ Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
146
171
 
147
- max_items = 2
172
+ max_items = 1
148
173
  chunks = []
149
174
  chunk = []
150
175
  attrs.each{|a|
@@ -178,7 +203,7 @@ module BioMart
178
203
  results
179
204
  else
180
205
  Open.write(filename) do |f|
181
- f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
206
+ f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
182
207
  if field_names.nil?
183
208
  f.puts "#" << [main, attrs].flatten * "\t"
184
209
  else
@@ -211,7 +236,17 @@ module BioMart
211
236
  changes = {}
212
237
  missing.select{|m| m.include? "~" }.each do |str|
213
238
  orig,_sep, new = str.partition "~"
214
- changes[orig] = new
239
+ if orig.include?(":")
240
+ target_db, _sep, orig = orig.partition(":")
241
+ if target_db[0] == "-"
242
+ next if database == target_db[1..-1]
243
+ else
244
+ next unless database == target_db
245
+ end
246
+ changes[orig] = new
247
+ else
248
+ changes[orig] = new
249
+ end
215
250
  end
216
251
  changed = true
217
252
  while changed
@@ -9,11 +9,29 @@ module Ensembl
9
9
  module FTP
10
10
 
11
11
  SERVER = "ftp.ensembl.org"
12
+ DOMAIN_SERVER = "ftp.ensemblgenomes.org"
12
13
 
13
- def self.mysql_path(release)
14
+ def self.ftp_name_for_domain(domain, organism, subdir='mysql')
15
+ code, build = organism.split "/"
16
+ build ||= "current"
17
+
18
+ release = build == "current" ? 'current' : Ensembl.releases[build]
19
+ name = Organism.scientific_name(organism)
20
+ ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
21
+ ftp.passive = true
22
+ ftp.login
23
+ dir = File.join('pub', domain, 'current', subdir)
24
+ ftp.chdir(dir)
25
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
26
+ ftp.close
27
+ [release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
14
28
  end
15
29
 
16
- def self.ftp_name_for(organism)
30
+ def self.ftp_name_for(organism, subdir='mysql')
31
+ if domain = Thread.current["ensembl_domain"]
32
+ return ftp_name_for_domain(domain, organism,subdir)
33
+ end
34
+
17
35
  code, build = organism.split "/"
18
36
  build ||= "current"
19
37
 
@@ -23,8 +41,9 @@ module Ensembl
23
41
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
24
42
  ftp.passive = true
25
43
  ftp.login
26
- ftp.chdir(File.join('pub', 'current_mysql'))
27
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
44
+ dir = File.join('pub', "current_#{subdir}")
45
+ ftp.chdir(dir)
46
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
28
47
  ftp.close
29
48
  else
30
49
  release = Ensembl.releases[build]
@@ -32,24 +51,21 @@ module Ensembl
32
51
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
33
52
  ftp.passive = true
34
53
  ftp.login
35
- ftp.chdir(File.join('pub', release, 'mysql'))
36
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
54
+ dir = File.join('pub', release, subdir)
55
+ ftp.chdir(dir)
56
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
37
57
  ftp.close
38
58
  end
39
- [release, file]
59
+ [release, File.join(Ensembl::FTP::SERVER, dir, file)]
40
60
  end
41
61
 
42
- def self.ftp_directory_for(organism)
43
- release, ftp_name = ftp_name_for(organism)
44
- if release == 'current'
45
- File.join('/pub/', 'current_mysql', ftp_name)
46
- else
47
- File.join('/pub/', release, 'mysql', ftp_name)
48
- end
62
+ def self.ftp_url_for(organism)
63
+ release, ftp_url = ftp_name_for(organism)
64
+ ftp_url
49
65
  end
50
66
 
51
67
  def self.base_url(organism)
52
- File.join("ftp://" + SERVER, ftp_directory_for(organism) )
68
+ File.join("ftp://", ftp_url_for(organism) )
53
69
  end
54
70
 
55
71
  def self.url_for(organism, table, extension)
@@ -8,6 +8,19 @@ module Entrez
8
8
 
9
9
  Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
10
10
  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
11
+ Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
12
+ TmpFile.with_dir do |dir|
13
+ Misc.in_dir dir do
14
+ CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
15
+ CMD.cmd("tar xvfz taxdump.tar.gz")
16
+ CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
17
+ tsv = TSV.open('tmp.tsv', type: :single)
18
+ tsv.key_field = "Entrez Tax ID"
19
+ tsv.fields = ["Scientific Name"]
20
+ Open.write(filename, tsv.to_s)
21
+ end
22
+ end
23
+ end
11
24
 
12
25
  def self.entrez2native(taxs, options = {})
13
26
  options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
@@ -25,8 +25,8 @@ module GO
25
25
  # the gene_ontology.obo file and extracts all the fields, although right now,
26
26
  # only the name field is used.
27
27
  def self.init
28
- Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
29
- info.serializer = :marshal if info.respond_to? :serializer
28
+ Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
29
+ #info.serializer = :marshal if info.respond_to? :serializer
30
30
  Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
31
31
  term_info = {}
32
32
 
@@ -9,6 +9,10 @@ module Organism
9
9
  ARCHIVE_MONTH_INDEX = {}
10
10
  %w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
11
11
 
12
+ def self.rake_organism_helper
13
+ Rbbt.share.install.Organism["organism_helpers.rb"].find
14
+ end
15
+
12
16
  def self.compare_archives(a1, a2)
13
17
  a1 = a1.partition("/").last if a1 and a1.include? "/"
14
18
  a2 = a2.partition("/").last if a2 and a2.include? "/"
@@ -29,7 +33,8 @@ module Organism
29
33
  end
30
34
 
31
35
  def self.default_code(organism = "Hsa")
32
- organism.split("/").first << "/feb2014"
36
+ latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
37
+ organism.split("/").first << "/" << latest
33
38
  end
34
39
 
35
40
  def self.organism_codes(organism = nil)
@@ -43,7 +48,8 @@ module Organism
43
48
  end
44
49
 
45
50
  def self.installed_organisms
46
- Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
51
+ Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
52
+ Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
47
53
  end
48
54
 
49
55
  def self.prepared_organisms
@@ -62,25 +68,6 @@ module Organism
62
68
  nil
63
69
  end
64
70
 
65
- Organism.installable_organisms.each do |organism|
66
- claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
67
-
68
- module_eval "#{ organism } = with_key '#{organism}'"
69
- end
70
-
71
- Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
72
- Open.mkdir File.dirname(file) unless File.directory?(file)
73
- url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
74
- CMD.cmd_log("wget '#{url}' -O '#{file}'")
75
- CMD.cmd("chmod 0755 '#{file}'")
76
- Rbbt.set_software_env
77
- nil
78
- end
79
-
80
- CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
81
-
82
- Rbbt.set_software_env
83
-
84
71
  def self.hg_build(organism)
85
72
  require 'rbbt/sources/ensembl_ftp'
86
73
  organism = organism.strip
@@ -257,7 +244,16 @@ module Organism
257
244
  end
258
245
 
259
246
  def self.scientific_name(organism)
260
- Organism[organism]["scientific_name"].produce.read.strip
247
+ Organism[organism].scientific_name.read.strip
248
+ end
249
+
250
+ def self.make_organism(name, long = false)
251
+ first, _, second = name.partition(/[ _]/)
252
+ if long
253
+ first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
254
+ else
255
+ first[0].upcase + second[0..1].downcase
256
+ end
261
257
  end
262
258
 
263
259
  def self.organism(name)
@@ -295,7 +291,7 @@ module Organism
295
291
  organism ||= "Hsa"
296
292
 
297
293
  @@gene_start_end ||= {}
298
- gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unmamed => true)
294
+ gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
299
295
 
300
296
  ranges = genes.collect{|gene|
301
297
  start, eend = gene_start_end[gene]
@@ -339,7 +335,8 @@ module Organism
339
335
  def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
340
336
  chromosome_sizes = {}
341
337
 
342
- Organism[organism].glob_all("chromosome_*").each do |file|
338
+ Organism.chromosomes(organism).produce.tsv.each do |chr|
339
+ file = Organism[organism]["chromosome_#{chr}"].produce.find
343
340
  chromosome = file.split("_").last.split(".").first
344
341
  size = if Open.gzip?(file) || Open.bgzip?(file)
345
342
  CMD.cmd("zcat '#{ file }' | wc -c ").read
@@ -352,4 +349,28 @@ module Organism
352
349
  chromosome_sizes
353
350
  end
354
351
 
352
+ Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
353
+ Open.mkdir File.dirname(file) unless File.directory?(file)
354
+ url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
355
+ CMD.cmd_log("wget '#{url}' -O '#{file}'")
356
+ CMD.cmd("chmod 0755 '#{file}'")
357
+ Rbbt.set_software_env
358
+ nil
359
+ end
360
+
361
+ CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
362
+
363
+ Rbbt.set_software_env
364
+
365
+ Organism.installable_organisms.each do |organism|
366
+ if Rbbt.share.install.Organism[organism].Rakefile.exists?
367
+ rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
368
+ else
369
+ rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
370
+ end
371
+
372
+ claim Organism[organism], :rake, rakefile
373
+
374
+ module_eval "#{ organism } = with_key '#{organism}'"
375
+ end
355
376
  end
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [9606]
7
2
  $scientific_name = "Homo sapiens"
8
3
  $ortholog_key = "hsapiens_homolog_ensembl_gene"
@@ -95,17 +90,30 @@ $biomart_identifiers = [
95
90
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
96
91
  ]
97
92
 
98
- $biomart_go= [
99
- ["GO ID", 'go_id'],
100
- ["GO Namespace", 'namespace_1003'],
93
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
94
+ Thread.current["namespace"] = $namespace
95
+ load Organism.rake_organism_helper
96
+
97
+ file 'regulators' do |t|
98
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
99
+ regulatory_fields = [
100
+ ['Chromosome Name','chromosome_name'],
101
+ ['Region Start', 'chromosome_start'],
102
+ ['Region End', 'chromosome_end'],
103
+ ['Feature type', 'feature_type_name'],
101
104
  ]
105
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
106
+
107
+ Misc.sensiblewrite(t.name, regulators.to_s)
108
+ end
102
109
 
103
- $biomart_go_2009= [
104
- ["GO BP ID", 'go_biological_process_id'],
105
- ["GO MF ID", 'go_molecular_function_id'],
106
- ["GO CC ID", 'go_cellular_component_id'],
110
+ file 'regulator_activity' do |t|
111
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
112
+ regulatory_fields = [
113
+ ['Epigenome name','epigenome_name'],
114
+ ['Activity', 'activity'],
107
115
  ]
116
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
108
117
 
109
- #$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
110
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
111
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
118
+ Misc.sensiblewrite(t.name, regulators.to_s)
119
+ end
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10090]
7
2
  $scientific_name = "Mus musculus"
8
3
  $ortholog_key = "mmusculus_homolog_ensembl_gene"
@@ -43,18 +38,6 @@ $biomart_identifiers = [
43
38
  [ 'EMBL (Genbank) ID' , "embl"] ,
44
39
  ]
45
40
 
46
- $biomart_go= [
47
- ["GO ID", 'go_id'],
48
- ["GO Namespace", 'namespace_1003'],
49
- ]
50
-
51
- $biomart_go_2009= [
52
- ["GO BP ID", 'go_biological_process_id'],
53
- ["GO MF ID", 'go_molecular_function_id'],
54
- ["GO CC ID", 'go_cellular_component_id'],
55
- ]
56
-
57
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
58
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
59
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
60
-
41
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
42
+ Thread.current["namespace"] = $namespace
43
+ load Organism.rake_organism_helper
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10116]
7
2
  $scientific_name = "Rattus norvegicus"
8
3
 
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
50
45
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
51
46
  ]
52
47
 
53
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
54
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
55
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
48
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
49
+ Thread.current["namespace"] = $namespace
50
+ load Organism.rake_organism_helper
@@ -0,0 +1,38 @@
1
+ $taxs = [559292,4932]
2
+ $scientific_name = "Saccharomyces cerevisiae"
3
+ $ensembl_domain = 'fungi'
4
+ #$ortholog_key = "yeast_ensembl_gene"
5
+
6
+ $biomart_db = 'scerevisiae_eg_gene'
7
+
8
+ $biomart_lexicon = [
9
+ [ 'Associated Gene Name' , "external_gene_name"],
10
+ ]
11
+
12
+ $biomart_protein_identifiers = [
13
+ [ 'Protein ID', "protein_id" ],
14
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
15
+ [ 'Unigene ID', "unigene" ],
16
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
17
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
18
+ ]
19
+
20
+ $biomart_probe_identifiers = [
21
+ ]
22
+
23
+ $biomart_identifiers = [
24
+ [ 'Entrez Gene ID', "entrezgene"],
25
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
26
+ [ 'Associated Gene Name', "external_gene_name" ],
27
+ [ 'Protein ID', "protein_id" ],
28
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
29
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
30
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
31
+ [ 'EMBL (Genbank) ID' , "embl"] ,
32
+ [ 'RefSeq DNA' , "refseq_dna"] ,
33
+ ]
34
+
35
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
36
+ Thread.current["namespace"] = $namespace
37
+ Thread.current["ensembl_domain"] = $ensembl_domain
38
+ load Organism.rake_organism_helper
@@ -1,8 +1,11 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
2
+
1
3
  require 'net/ftp'
4
+ require 'rbbt/sources/biomart'
5
+ require 'rbbt/sources/entrez'
6
+ require File.join(File.dirname(__FILE__), '../lib/helpers')
2
7
  require 'rbbt/sources/ensembl_ftp'
3
8
 
4
- #Thread.current['namespace'] = $namespace
5
-
6
9
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
7
10
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
8
11
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -77,6 +80,17 @@ $biomart_pfam= [
77
80
  ["Pfam Domain", 'pfam'],
78
81
  ]
79
82
 
83
+ $biomart_go= [
84
+ ["GO ID", 'go_id'],
85
+ ["GO Namespace", 'namespace_1003'],
86
+ ]
87
+
88
+ $biomart_go_2009= [
89
+ ["GO BP ID", 'go_biological_process_id'],
90
+ ["GO MF ID", 'go_molecular_function_id'],
91
+ ["GO CC ID", 'go_cellular_component_id'],
92
+ ]
93
+
80
94
  $biomart_gene_biotype= [
81
95
  ["Biotype", 'gene_biotype'],
82
96
  ]
@@ -91,7 +105,13 @@ $biomart_exons = [
91
105
  #{{{ Rules
92
106
 
93
107
  file 'entrez_taxids' do |t|
94
- Misc.sensiblewrite(t.name, $taxs * "\n")
108
+ if $tax && $tax.any?
109
+ Misc.sensiblewrite(t.name, $taxs * "\n")
110
+ else
111
+ tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
112
+ taxs = tsv[$scientific_name] || []
113
+ Misc.sensiblewrite(t.name, taxs * "\n")
114
+ end
95
115
  end
96
116
 
97
117
  file 'scientific_name' do |t|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
104
124
  Misc.sensiblewrite(t.name, $ortholog_key)
105
125
  end
106
126
 
107
- file 'identifiers' do |t|
127
+ file 'identifiers' => 'entrez_taxids' do |t|
128
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
108
129
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
109
130
  identifiers.unnamed = true
110
131
 
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
116
137
  end
117
138
 
118
139
  name_pos = identifiers.identify_field "Associated Gene Name"
119
- entrez2name = Entrez.entrez2name($taxs)
120
- identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
121
- names = values[name_pos]
140
+ if tax_codes and tax_codes.any?
141
+ entrez2name = Entrez.entrez2name(tax_codes)
142
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
143
+ names = values[name_pos] || []
122
144
 
123
- matches = entrez.select do |e|
124
- entrez2name.include?(e) && (names & entrez2name[e]).any?
125
- end
145
+ matches = entrez.select do |e|
146
+ entrez2name.include?(e) && (names & entrez2name[e]).any?
147
+ end
126
148
 
127
- if matches.any?
128
- matches
129
- else
130
- entrez
149
+ if matches.any?
150
+ matches
151
+ else
152
+ entrez
153
+ end
131
154
  end
132
155
  end
133
156
 
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
147
170
  identifiers = identifiers.reorder(:key, ordered_fields)
148
171
  end
149
172
 
150
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
151
- entrez_synonyms.key_field = "Entrez Gene ID"
152
- entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
173
+ if tax_codes and tax_codes.any?
174
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
175
+ entrez_synonyms.key_field = "Entrez Gene ID"
176
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
153
177
 
154
- identifiers.attach entrez_synonyms
178
+ identifiers.attach entrez_synonyms
179
+ end
155
180
 
156
181
  identifiers.with_unnamed do
157
182
  identifiers.each do |key, values|
158
183
  values.each do |list|
184
+ list ||= []
159
185
  list.reject!{|v| v.nil? or v.empty?}
160
186
  list.uniq!
161
187
  end
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
166
192
  Misc.sensiblewrite(t.name, identifiers.to_s)
167
193
  end
168
194
 
169
- file 'lexicon' => 'identifiers' do |t|
195
+ file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
170
196
  tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
197
+ tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
171
198
 
172
- entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
199
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
173
200
  entrez_description.key_field = "Entrez Gene ID"
174
201
  entrez_description.fields = ["Entrez Gene Description"]
175
202
 
@@ -308,8 +335,9 @@ end
308
335
 
309
336
  # {{{ Other info
310
337
 
311
- file 'gene_pmids' do |t|
312
- tsv = Entrez.entrez2pubmed($taxs)
338
+ file 'gene_pmids' => 'entrez_taxids' do |t|
339
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
340
+ tsv = Entrez.entrez2pubmed(tax_codes)
313
341
  text = "#: :namespace=#{Thread.current['namespace']}\n"
314
342
  text += "#Entrez Gene ID\tPMID"
315
343
  tsv.each do |gene, pmids|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
417
445
 
418
446
  gene_go.monitor = true
419
447
  gene_go.process "GO ID" do |key, go_id, values|
420
- clean = values.zip_fields.select do |id, type|
448
+ clean = NamedArray.zip_fields(values).select do |id, type|
421
449
  type == "biological_process"
422
450
  end
423
451
  clean.collect{|id, type| id}
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
487
515
  end
488
516
 
489
517
  file 'chromosomes' do |t|
490
- goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
518
+ tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
491
519
 
492
- Misc.sensiblewrite(t.name, goterms.to_s)
520
+ Misc.sensiblewrite(t.name, tsv.keys * "\n")
493
521
  end
494
522
 
495
523
  file 'blacklist_chromosomes' => 'chromosomes' do |t|
@@ -511,6 +539,15 @@ end
511
539
 
512
540
  rule /^chromosome_.*/ do |t|
513
541
  chr = t.name.match(/chromosome_(.*)/)[1]
542
+ path = File.expand_path(t.name)
543
+ dirname = File.dirname(path)
544
+
545
+ organism = File.basename(dirname)
546
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
547
+ archive = organism
548
+ organism = File.basename(File.dirname(dirname))
549
+ organism = File.join(organism, archive)
550
+ end
514
551
 
515
552
  # HACK: Skip LRG chromosomes
516
553
  raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
519
556
 
520
557
  release = Ensembl.releases[archive]
521
558
 
522
- ftp = Net::FTP.new("ftp.ensembl.org")
559
+ fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
560
+ server, _, path = fasta_url.partition("/")
561
+ path = "/" + path
562
+
563
+ ftp = Net::FTP.new(server)
523
564
  ftp.passive = true
524
565
  ftp.login
525
- if release.nil? or release == 'current'
526
- ftp.chdir("pub/current_fasta/")
527
- else
528
- ftp.chdir("pub/#{ release }/fasta/")
529
- end
530
- ftp.chdir($scientific_name.downcase.sub(" ",'_'))
566
+ ftp.chdir(path)
531
567
  ftp.chdir('dna')
532
- file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
533
-
534
- raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
535
568
 
536
- Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
569
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
570
+ if file
571
+ Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
537
572
 
538
- Misc.lock t.name + '.rake' do
539
- TmpFile.with_file do |tmpfile|
540
- ftp.getbinaryfile(file, tmpfile)
541
- Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
542
- ftp.close
573
+ Misc.lock t.name + '.rake' do
574
+ TmpFile.with_file do |tmpfile|
575
+ ftp.getbinaryfile(file, tmpfile)
576
+ Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
577
+ ftp.close
578
+ end
543
579
  end
580
+ else
581
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
582
+ Misc.lock t.name + '.rake' do
583
+ TmpFile.with_file do |tmpfile|
584
+ ftp.getbinaryfile(file, tmpfile)
585
+ txt = Open.read(tmpfile, :gzip => true)
586
+
587
+ chr_txt = []
588
+
589
+ in_chr = false
590
+ txt.split("\n").each do |line|
591
+ if line.start_with?(">#{chr}")
592
+ in_chr = true
593
+ elsif line.start_with?(">")
594
+ in_chr = false
595
+ else
596
+ chr_txt << line if in_chr
597
+ end
598
+ end
599
+ Misc.sensiblewrite(t.name, chr_txt * "" )
600
+ ftp.close
601
+ end
602
+ end
603
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
544
604
  end
545
605
  end
546
606
 
@@ -584,6 +644,16 @@ end
584
644
  require 'bio'
585
645
 
586
646
  file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
647
+ path = File.expand_path(t.name)
648
+ dirname = File.dirname(path)
649
+
650
+ organism = File.basename(dirname)
651
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
652
+ archive = organism
653
+ organism = File.basename(File.dirname(dirname))
654
+ organism = File.join(organism, archive)
655
+ end
656
+
587
657
  exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
588
658
 
589
659
  chr_transcript_ranges ||= {}
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
616
686
  chr_transcript_ranges.each do |chr, transcript_ranges|
617
687
  begin
618
688
  raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
619
- p = File.expand_path("./chromosome_#{chr}")
620
- Organism.root.annotate p
621
- p.sub!(%r{.*/organisms/},'share/organisms/')
622
- chr_str = p.produce.read
689
+ pkgdir = Thread.current["resource"]
690
+ p = pkgdir[organism]["chromosome_#{chr}"]
691
+ p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
692
+ chr_str = p.read
623
693
  rescue Exception
624
694
  Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
625
695
  raise $! unless $!.message =~ /not supported/
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
656
726
  organism = File.join(organism, archive)
657
727
  end
658
728
 
659
- translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
729
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
660
730
 
661
731
  if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
662
732
  exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
670
740
  transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
671
741
  end
672
742
 
673
- transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
674
- transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
675
- exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
743
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
744
+ transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
745
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
676
746
 
677
747
  transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
678
748
  transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
@@ -719,12 +789,13 @@ end
719
789
  file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
720
790
  transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
721
791
  transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
722
- transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
792
+ transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
723
793
  transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
724
794
  transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
725
795
 
726
796
 
727
797
  protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
798
+ transcript_sequence.monitor = true
728
799
  transcript_sequence.through do |transcript, sequence|
729
800
  protein = transcript_protein[transcript]
730
801
  next if protein.nil? or protein.empty?
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
777
848
  uni_seq = UniProt.get_uniprot_sequence(uni)
778
849
  ensps = uni2ensps[uni]
779
850
  next if ensps.nil? or ensps.empty?
851
+
780
852
  best_ensp = ensps.sort_by do |ensp|
781
853
  ensp_seq = ensp2seq[ensp]
782
854
  if ensp_seq
@@ -829,3 +901,4 @@ file 'cdna_fasta' do |t|
829
901
  Open.download(url, "#{t.name}.gz")
830
902
  nil
831
903
  end
904
+
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
3
3
  require 'rbbt/sources/pubmed'
4
4
  require 'test/unit'
5
5
  require 'rbbt/sources/biomart'
6
+ require 'rbbt/sources/organism'
6
7
  require 'rbbt/util/tmpfile'
7
8
  require 'test/unit'
8
9
 
9
10
  class TestBioMart < Test::Unit::TestCase
10
11
 
11
12
  def setup
12
- BioMart.set_archive Organism.default_code("Hsa")
13
+ BioMart.set_archive "feb2014"
13
14
  end
14
15
 
15
16
  def teardown
16
17
  BioMart.unset_archive
17
18
  end
18
19
 
19
- def _test_get
20
+ def test_get_Sce
20
21
  assert_raise BioMart::QueryError do
21
22
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
22
23
  end
23
24
 
24
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :merge => true, :wget_options => {:quiet => false})
25
+ BioMart.set_archive "feb2023-fungi"
26
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
25
27
  tsv = TSV.open data, :double, :merge => true
26
- assert(tsv['852236'][0].include? 'CAA84864')
28
+ assert(tsv['852236'][0].include? 'CAA84864.1')
27
29
 
28
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
30
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
29
31
  tsv = TSV.open data, :double, :merge => true
30
32
  assert(tsv['852236'][1].include? 'YBL044W')
31
33
  end
32
34
 
33
- def _test_query
35
+ def test_get_Hsa
36
+ Log.severity = 0
37
+ data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
38
+ tsv = TSV.open data, :double, :merge => true
39
+ assert(tsv['852236'][0].include? 'CAA84864.1')
40
+ end
41
+
42
+
43
+ def test_query
34
44
  data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
35
45
  assert(data['852236']['external_gene_id'].include? 'YBL044W')
36
46
 
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
41
51
  end
42
52
  end
43
53
 
54
+ def __test_transcrip_exons
55
+ Log.with_severity 1 do
56
+ TmpFile.with_file do |f|
57
+ fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
58
+ main = fields[0]
59
+ attrs = fields.values_at(1, 2)
60
+ attrs_first = [attrs.first]
61
+ attrs_last = [attrs.last]
62
+ database = 'hsapiens_gene_ensembl'
63
+
64
+ filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
65
+ ppp Open.read(filename)
66
+
67
+ filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
68
+ ppp Open.read(filename)
69
+
70
+ filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
71
+ ppp Open.read(filename)
72
+
73
+ filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
74
+ ppp Open.read(filename)
75
+
76
+ data = TSV.open Open.open(filename)
77
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
78
+ end
79
+ end
80
+ end
81
+
44
82
  def test_tsv
45
83
  data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
46
84
  assert(data['852236']['Protein ID'].include? 'CAA84864')
@@ -0,0 +1,11 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ class TestEnsemblFTP < Test::Unit::TestCase
5
+ def test_ftp_for
6
+ assert_nothing_raised do
7
+ Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
8
+ end
9
+ end
10
+ end
11
+
@@ -14,6 +14,11 @@ class TestEntrez < Test::Unit::TestCase
14
14
  assert(lexicon['855611'].include? 'S000005056')
15
15
  end
16
16
 
17
+ def test_entrez2name
18
+ tax = $yeast_tax
19
+ Entrez.entrez2name(tax)
20
+ end
21
+
17
22
  def test_entrez2pubmed
18
23
  tax = $yeast_tax
19
24
 
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
5
5
 
6
6
  class TestOrganism < Test::Unit::TestCase
7
7
 
8
- def test_known_ids
8
+ def _test_known_ids
9
9
  assert Organism.known_ids("Hsa").include?("Associated Gene Name")
10
10
  end
11
11
 
12
- def test_location
12
+ def _test_location
13
13
  assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
14
14
  end
15
15
 
16
- def test_identifiers
16
+ def _test_identifiers
17
17
  assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
21
21
 
22
- def test_lexicon
22
+ def _test_lexicon
23
23
  assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
24
24
  end
25
25
 
26
- def test_guess_id
26
+ def _test_guess_id
27
27
  ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
28
28
  gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
29
29
  assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
30
30
  assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
31
31
  end
32
32
 
33
- def test_organisms
33
+ def _test_organisms
34
34
  assert Organism.organisms.include? "Hsa"
35
35
  assert_equal "Hsa", Organism.organism("Homo sapiens")
36
36
  end
37
37
 
38
- def test_attach_translations
38
+ def _test_attach_translations
39
39
  tsv = TSV.setup({"1020" => []}, :type => :list)
40
40
  tsv.key_field = "Entrez Gene ID"
41
41
  tsv.fields = []
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
47
47
  assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
48
48
  end
49
49
 
50
- def test_entrez_taxids
50
+ def _test_entrez_taxids
51
51
  assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
52
52
  end
53
53
 
54
- def test_lift_over
54
+ def _test_lift_over
55
55
  mutation_19 = "19:21131664:T"
56
56
  mutation_18 = "19:20923504:T"
57
- source_build = Organism.default_code("Hsa")
57
+ source_build = "Hsa/feb2014"
58
58
  target_build = "Hsa/may2009"
59
59
 
60
60
  assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
61
61
  assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
62
62
  end
63
63
 
64
- def test_orhtolog
64
+ def _test_orhtolog
65
65
  require 'rbbt/entity/gene'
66
66
  assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
67
67
  end
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
70
70
  assert Organism.chromosome_sizes["2"].to_i > 10_000_000
71
71
  end
72
72
 
73
- def test_build_organism
73
+ def _test_build_organism
74
74
  assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
75
75
  assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
76
76
  assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
77
77
  end
78
78
 
79
- #def test_genes_at_chromosome
79
+ #def _test_genes_at_chromosome
80
80
  # pos = [12, 117799500]
81
81
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
82
82
  #end
83
83
 
84
- #def test_genes_at_chromosome_array
84
+ #def _test_genes_at_chromosome_array
85
85
  # pos = [12, [117799500, 106903900]]
86
86
  # assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
87
87
  #end
88
88
 
89
- #def test_genes_at_genomic_positions
89
+ #def _test_genes_at_genomic_positions
90
90
  # pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
91
91
  # assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
92
92
  #end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.0
4
+ version: 3.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-05-08 00:00:00.000000000 Z
11
+ date: 2025-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -120,10 +120,10 @@ files:
120
120
  - share/install/KEGG/Rakefile
121
121
  - share/install/Matador/Rakefile
122
122
  - share/install/NCI/Rakefile
123
- - share/install/Organism/Hsa/Rakefile
124
- - share/install/Organism/Mmu/Rakefile
125
- - share/install/Organism/Rno/Rakefile
126
- - share/install/Organism/Sce/Rakefile
123
+ - share/install/Organism/Hsa.rake
124
+ - share/install/Organism/Mmu.rake
125
+ - share/install/Organism/Rno.rake
126
+ - share/install/Organism/Sce.rake
127
127
  - share/install/Organism/organism_helpers.rb
128
128
  - share/install/PharmaGKB/Rakefile
129
129
  - share/install/Pina/Rakefile
@@ -133,6 +133,7 @@ files:
133
133
  - share/install/lib/rake_helper.rb
134
134
  - test/rbbt/sources/test_HPRD.rb
135
135
  - test/rbbt/sources/test_biomart.rb
136
+ - test/rbbt/sources/test_ensembl_ftp.rb
136
137
  - test/rbbt/sources/test_entrez.rb
137
138
  - test/rbbt/sources/test_go.rb
138
139
  - test/rbbt/sources/test_gscholar.rb
@@ -166,13 +167,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
167
  - !ruby/object:Gem::Version
167
168
  version: '0'
168
169
  requirements: []
169
- rubygems_version: 3.5.9
170
+ rubygems_version: 3.5.23
170
171
  signing_key:
171
172
  specification_version: 4
172
173
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
173
174
  test_files:
174
175
  - test/rbbt/sources/test_HPRD.rb
175
176
  - test/rbbt/sources/test_biomart.rb
177
+ - test/rbbt/sources/test_ensembl_ftp.rb
176
178
  - test/rbbt/sources/test_entrez.rb
177
179
  - test/rbbt/sources/test_go.rb
178
180
  - test/rbbt/sources/test_gscholar.rb
@@ -1,52 +0,0 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
- $taxs = [559292,4932]
7
- $scientific_name = "Saccharomyces cerevisiae"
8
- #$ortholog_key = "yeast_ensembl_gene"
9
-
10
- $biomart_db = 'scerevisiae_gene_ensembl'
11
-
12
- $biomart_lexicon = [
13
- [ 'Associated Gene Name' , "external_gene_id"],
14
- ]
15
-
16
- $biomart_protein_identifiers = [
17
- [ 'Protein ID', "protein_id" ],
18
- [ 'RefSeq Protein ID', "refseq_peptide" ],
19
- [ 'Unigene ID', "unigene" ],
20
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
21
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
22
- ]
23
-
24
- $biomart_probe_identifiers = [
25
- ]
26
-
27
- $biomart_identifiers = [
28
- [ 'Entrez Gene ID', "entrezgene"],
29
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
30
- [ 'Associated Gene Name', "external_gene_id" ],
31
- [ 'Protein ID', "protein_id" ],
32
- [ 'RefSeq Protein ID', "refseq_peptide" ],
33
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
34
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
35
- [ 'EMBL (Genbank) ID' , "embl"] ,
36
- [ 'RefSeq mRNA' , "refseq_mrna"] ,
37
- ]
38
-
39
- $biomart_go= [
40
- ["GO ID", 'go_id'],
41
- ["GO Namespace", 'namespace_1003'],
42
- ]
43
-
44
- $biomart_go_2009= [
45
- ["GO BP ID", 'go_biological_process_id'],
46
- ["GO MF ID", 'go_molecular_function_id'],
47
- ["GO CC ID", 'go_cellular_component_id'],
48
- ]
49
-
50
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
51
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
52
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')