rbbt-sources 3.4.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87f97c6af3dab4a1b39cb258acdf9bf4d105df5703a04d6264f960ff79e81faa
4
- data.tar.gz: ff91f67bc0775e0a20678ede8eeb312fa1e7a42d18095c2d9bcb1a5c0e4fc000
3
+ metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
4
+ data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
5
5
  SHA512:
6
- metadata.gz: 0b23136a81511a1ad55d5bb2af5784fd74512b9355bf40023a5197180bf25b69aefa966a3dafc8347f1864da174637fa0a2f95bb687a8973a4b23f5e6778398d
7
- data.tar.gz: 4fca8a03899b980a18da56d9cdd56bc0136ce126c718ca61836fa3cf55313f77664b7ddecadba2ad45491c2e309604da3ce7288c5de70c660f498bfcc2849aec
6
+ metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
7
+ data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
@@ -1,8 +1,6 @@
1
1
  may2009
2
2
  feb2014
3
- may2017
4
- oct2018
5
- apr2019
3
+ may2015
4
+ sep2019
6
5
  feb2021
7
6
  feb2023
8
- oct2016
@@ -19,6 +19,8 @@
19
19
  - refseq_ncrna_predicted
20
20
  ">jun2015":
21
21
  - uniprot_swissprot_accession~uniprot_swissprot
22
+ ">jan2023":
23
+ - external_transcript_id~external_transcript_name
22
24
  <aug2014:
23
25
  - external_gene_name~external_gene_id
24
26
  may2010:
data/etc/build_organism CHANGED
@@ -2,7 +2,7 @@
2
2
  hg18 Hsa/may2008
3
3
  hg19 Hsa/feb2014
4
4
  b37 Hsa/feb2014
5
- hg38 Hsa/may2017
6
- GRCh38 Hsa/may2017
7
- mm10 Mmu/may2017
8
- GRCm38 Mmu/may2017
5
+ hg38 Hsa/feb2023
6
+ GRCh38 Hsa/feb2023
7
+ mm10 Mmu/feb2023
8
+ GRCm38 Mmu/feb2023
data/etc/organisms CHANGED
@@ -1,2 +1,3 @@
1
1
  Hsa
2
2
  Mmu
3
+ Sce
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
3
3
  require 'rbbt/tsv/attach'
4
4
  require 'rbbt/util/log'
5
5
  require 'cgi'
6
+ require 'rbbt/sources/organism'
6
7
 
7
8
  # This module interacts with BioMart. It performs queries to BioMart and
8
9
  # synthesises a hash with the results. Note that this module connects to the
@@ -13,7 +14,7 @@ module BioMart
13
14
 
14
15
  class BioMart::QueryError < StandardError; end
15
16
 
16
- BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='
17
+ BIOMART_URL = 'ensembl.org/biomart/martservice'
17
18
 
18
19
  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
19
20
 
@@ -22,7 +23,7 @@ module BioMart
22
23
  @@biomart_query_xml = <<-EOT
23
24
  <?xml version="1.0" encoding="UTF-8"?>
24
25
  <!DOCTYPE Query>
25
- <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
+ <Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
26
27
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
28
  <!--FILTERS-->
28
29
  <!--MAIN-->
@@ -36,14 +37,10 @@ module BioMart
36
37
  raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
37
38
  end
38
39
  Thread.current['archive'] = date
39
- Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
40
- Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
41
40
  end
42
41
 
43
42
  def self.unset_archive
44
- Log.debug "Restoring current version URL #{BIOMART_URL}"
45
43
  Thread.current['archive'] = nil
46
- Thread.current['archive_url'] = nil
47
44
  end
48
45
 
49
46
  def self.with_archive(data)
@@ -55,6 +52,21 @@ module BioMart
55
52
  end
56
53
  end
57
54
 
55
+ def self.final_url(query, archive = nil, ensembl_domain = nil)
56
+ url_domain = if archive.nil?
57
+ if ensembl_domain.nil?
58
+ 'www'
59
+ else
60
+ ensembl_domain
61
+ end
62
+ elsif ensembl_domain
63
+ [archive, ensembl_domain] * "-"
64
+ else
65
+ [archive, 'archive'] * "."
66
+ end
67
+ "http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
68
+ end
69
+
58
70
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
59
71
  open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
60
72
  repeats = true
@@ -75,11 +87,17 @@ module BioMart
75
87
 
76
88
  query = @@biomart_query_xml.dup
77
89
  query.sub!(/<!--DATABASE-->/,database)
90
+ if Thread.current["ensembl_domain"]
91
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
92
+ else
93
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
94
+ end
78
95
  query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
79
96
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
80
97
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
81
98
 
82
- url = Thread.current['archive_url'] ? Thread.current['archive_url'] + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
99
+ url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
100
+
83
101
 
84
102
  begin
85
103
  response = Open.read(url, open_options.dup)
@@ -105,10 +123,17 @@ module BioMart
105
123
 
106
124
  new_datafile = TmpFile.tmp_file
107
125
  if data.nil?
108
- TSV.merge_row_fields Open.open(result_file), new_datafile
126
+ Open.open(result_file) do |file|
127
+ Open.write(new_datafile, Open.collapse_stream(file))
128
+ end
109
129
  data = new_datafile
110
130
  else
111
- TSV.merge_different_fields data, result_file, new_datafile
131
+ Open.open(result_file) do |stream_result|
132
+ Open.open(data) do |stream_data|
133
+ Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
134
+ end
135
+ end
136
+ #TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
112
137
  FileUtils.rm data
113
138
  data = new_datafile
114
139
  end
@@ -142,9 +167,9 @@ module BioMart
142
167
 
143
168
  IndiferentHash.setup(open_options)
144
169
 
145
- Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
170
+ Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
146
171
 
147
- max_items = 2
172
+ max_items = 1
148
173
  chunks = []
149
174
  chunk = []
150
175
  attrs.each{|a|
@@ -178,7 +203,7 @@ module BioMart
178
203
  results
179
204
  else
180
205
  Open.write(filename) do |f|
181
- f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
206
+ f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
182
207
  if field_names.nil?
183
208
  f.puts "#" << [main, attrs].flatten * "\t"
184
209
  else
@@ -211,7 +236,17 @@ module BioMart
211
236
  changes = {}
212
237
  missing.select{|m| m.include? "~" }.each do |str|
213
238
  orig,_sep, new = str.partition "~"
214
- changes[orig] = new
239
+ if orig.include?(":")
240
+ target_db, _sep, orig = orig.partition(":")
241
+ if target_db[0] == "-"
242
+ next if database == target_db[1..-1]
243
+ else
244
+ next unless database == target_db
245
+ end
246
+ changes[orig] = new
247
+ else
248
+ changes[orig] = new
249
+ end
215
250
  end
216
251
  changed = true
217
252
  while changed
@@ -9,11 +9,29 @@ module Ensembl
9
9
  module FTP
10
10
 
11
11
  SERVER = "ftp.ensembl.org"
12
+ DOMAIN_SERVER = "ftp.ensemblgenomes.org"
12
13
 
13
- def self.mysql_path(release)
14
+ def self.ftp_name_for_domain(domain, organism, subdir='mysql')
15
+ code, build = organism.split "/"
16
+ build ||= "current"
17
+
18
+ release = build == "current" ? 'current' : Ensembl.releases[build]
19
+ name = Organism.scientific_name(organism)
20
+ ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
21
+ ftp.passive = true
22
+ ftp.login
23
+ dir = File.join('pub', domain, 'current', subdir)
24
+ ftp.chdir(dir)
25
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
26
+ ftp.close
27
+ [release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
14
28
  end
15
29
 
16
- def self.ftp_name_for(organism)
30
+ def self.ftp_name_for(organism, subdir='mysql')
31
+ if domain = Thread.current["ensembl_domain"]
32
+ return ftp_name_for_domain(domain, organism,subdir)
33
+ end
34
+
17
35
  code, build = organism.split "/"
18
36
  build ||= "current"
19
37
 
@@ -23,8 +41,9 @@ module Ensembl
23
41
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
24
42
  ftp.passive = true
25
43
  ftp.login
26
- ftp.chdir(File.join('pub', 'current_mysql'))
27
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
44
+ dir = File.join('pub', "current_#{subdir}")
45
+ ftp.chdir(dir)
46
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
28
47
  ftp.close
29
48
  else
30
49
  release = Ensembl.releases[build]
@@ -32,24 +51,21 @@ module Ensembl
32
51
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
33
52
  ftp.passive = true
34
53
  ftp.login
35
- ftp.chdir(File.join('pub', release, 'mysql'))
36
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
54
+ dir = File.join('pub', release, subdir)
55
+ ftp.chdir(dir)
56
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
37
57
  ftp.close
38
58
  end
39
- [release, file]
59
+ [release, File.join(Ensembl::FTP::SERVER, dir, file)]
40
60
  end
41
61
 
42
- def self.ftp_directory_for(organism)
43
- release, ftp_name = ftp_name_for(organism)
44
- if release == 'current'
45
- File.join('/pub/', 'current_mysql', ftp_name)
46
- else
47
- File.join('/pub/', release, 'mysql', ftp_name)
48
- end
62
+ def self.ftp_url_for(organism)
63
+ release, ftp_url = ftp_name_for(organism)
64
+ ftp_url
49
65
  end
50
66
 
51
67
  def self.base_url(organism)
52
- File.join("ftp://" + SERVER, ftp_directory_for(organism) )
68
+ File.join("ftp://", ftp_url_for(organism) )
53
69
  end
54
70
 
55
71
  def self.url_for(organism, table, extension)
@@ -8,6 +8,19 @@ module Entrez
8
8
 
9
9
  Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
10
10
  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
11
+ Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
12
+ TmpFile.with_dir do |dir|
13
+ Misc.in_dir dir do
14
+ CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
15
+ CMD.cmd("tar xvfz taxdump.tar.gz")
16
+ CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
17
+ tsv = TSV.open('tmp.tsv', type: :single)
18
+ tsv.key_field = "Entrez Tax ID"
19
+ tsv.fields = ["Scientific Name"]
20
+ Open.write(filename, tsv.to_s)
21
+ end
22
+ end
23
+ end
11
24
 
12
25
  def self.entrez2native(taxs, options = {})
13
26
  options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
@@ -25,8 +25,8 @@ module GO
25
25
  # the gene_ontology.obo file and extracts all the fields, although right now,
26
26
  # only the name field is used.
27
27
  def self.init
28
- Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
29
- info.serializer = :marshal if info.respond_to? :serializer
28
+ Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
29
+ #info.serializer = :marshal if info.respond_to? :serializer
30
30
  Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
31
31
  term_info = {}
32
32
 
@@ -9,6 +9,10 @@ module Organism
9
9
  ARCHIVE_MONTH_INDEX = {}
10
10
  %w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
11
11
 
12
+ def self.rake_organism_helper
13
+ Rbbt.share.install.Organism["organism_helpers.rb"].find
14
+ end
15
+
12
16
  def self.compare_archives(a1, a2)
13
17
  a1 = a1.partition("/").last if a1 and a1.include? "/"
14
18
  a2 = a2.partition("/").last if a2 and a2.include? "/"
@@ -29,7 +33,8 @@ module Organism
29
33
  end
30
34
 
31
35
  def self.default_code(organism = "Hsa")
32
- organism.split("/").first << "/feb2014"
36
+ latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
37
+ organism.split("/").first << "/" << latest
33
38
  end
34
39
 
35
40
  def self.organism_codes(organism = nil)
@@ -43,7 +48,8 @@ module Organism
43
48
  end
44
49
 
45
50
  def self.installed_organisms
46
- Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
51
+ Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
52
+ Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
47
53
  end
48
54
 
49
55
  def self.prepared_organisms
@@ -62,25 +68,6 @@ module Organism
62
68
  nil
63
69
  end
64
70
 
65
- Organism.installable_organisms.each do |organism|
66
- claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
67
-
68
- module_eval "#{ organism } = with_key '#{organism}'"
69
- end
70
-
71
- Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
72
- Open.mkdir File.dirname(file) unless File.directory?(file)
73
- url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
74
- CMD.cmd_log("wget '#{url}' -O '#{file}'")
75
- CMD.cmd("chmod 0755 '#{file}'")
76
- Rbbt.set_software_env
77
- nil
78
- end
79
-
80
- CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
81
-
82
- Rbbt.set_software_env
83
-
84
71
  def self.hg_build(organism)
85
72
  require 'rbbt/sources/ensembl_ftp'
86
73
  organism = organism.strip
@@ -257,7 +244,16 @@ module Organism
257
244
  end
258
245
 
259
246
  def self.scientific_name(organism)
260
- Organism[organism]["scientific_name"].produce.read.strip
247
+ Organism[organism].scientific_name.read.strip
248
+ end
249
+
250
+ def self.make_organism(name, long = false)
251
+ first, _, second = name.partition(/[ _]/)
252
+ if long
253
+ first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
254
+ else
255
+ first[0].upcase + second[0..1].downcase
256
+ end
261
257
  end
262
258
 
263
259
  def self.organism(name)
@@ -295,7 +291,7 @@ module Organism
295
291
  organism ||= "Hsa"
296
292
 
297
293
  @@gene_start_end ||= {}
298
- gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unmamed => true)
294
+ gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
299
295
 
300
296
  ranges = genes.collect{|gene|
301
297
  start, eend = gene_start_end[gene]
@@ -339,7 +335,8 @@ module Organism
339
335
  def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
340
336
  chromosome_sizes = {}
341
337
 
342
- Organism[organism].glob_all("chromosome_*").each do |file|
338
+ Organism.chromosomes(organism).produce.tsv.each do |chr|
339
+ file = Organism[organism]["chromosome_#{chr}"].produce.find
343
340
  chromosome = file.split("_").last.split(".").first
344
341
  size = if Open.gzip?(file) || Open.bgzip?(file)
345
342
  CMD.cmd("zcat '#{ file }' | wc -c ").read
@@ -352,4 +349,28 @@ module Organism
352
349
  chromosome_sizes
353
350
  end
354
351
 
352
+ Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
353
+ Open.mkdir File.dirname(file) unless File.directory?(file)
354
+ url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
355
+ CMD.cmd_log("wget '#{url}' -O '#{file}'")
356
+ CMD.cmd("chmod 0755 '#{file}'")
357
+ Rbbt.set_software_env
358
+ nil
359
+ end
360
+
361
+ CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
362
+
363
+ Rbbt.set_software_env
364
+
365
+ Organism.installable_organisms.each do |organism|
366
+ if Rbbt.share.install.Organism[organism].Rakefile.exists?
367
+ rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
368
+ else
369
+ rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
370
+ end
371
+
372
+ claim Organism[organism], :rake, rakefile
373
+
374
+ module_eval "#{ organism } = with_key '#{organism}'"
375
+ end
355
376
  end
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [9606]
7
2
  $scientific_name = "Homo sapiens"
8
3
  $ortholog_key = "hsapiens_homolog_ensembl_gene"
@@ -95,17 +90,30 @@ $biomart_identifiers = [
95
90
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
96
91
  ]
97
92
 
98
- $biomart_go= [
99
- ["GO ID", 'go_id'],
100
- ["GO Namespace", 'namespace_1003'],
93
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
94
+ Thread.current["namespace"] = $namespace
95
+ load Organism.rake_organism_helper
96
+
97
+ file 'regulators' do |t|
98
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
99
+ regulatory_fields = [
100
+ ['Chromosome Name','chromosome_name'],
101
+ ['Region Start', 'chromosome_start'],
102
+ ['Region End', 'chromosome_end'],
103
+ ['Feature type', 'feature_type_name'],
101
104
  ]
105
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
106
+
107
+ Misc.sensiblewrite(t.name, regulators.to_s)
108
+ end
102
109
 
103
- $biomart_go_2009= [
104
- ["GO BP ID", 'go_biological_process_id'],
105
- ["GO MF ID", 'go_molecular_function_id'],
106
- ["GO CC ID", 'go_cellular_component_id'],
110
+ file 'regulator_activity' do |t|
111
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
112
+ regulatory_fields = [
113
+ ['Epigenome name','epigenome_name'],
114
+ ['Activity', 'activity'],
107
115
  ]
116
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
108
117
 
109
- #$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
110
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
111
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
118
+ Misc.sensiblewrite(t.name, regulators.to_s)
119
+ end
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10090]
7
2
  $scientific_name = "Mus musculus"
8
3
  $ortholog_key = "mmusculus_homolog_ensembl_gene"
@@ -43,18 +38,6 @@ $biomart_identifiers = [
43
38
  [ 'EMBL (Genbank) ID' , "embl"] ,
44
39
  ]
45
40
 
46
- $biomart_go= [
47
- ["GO ID", 'go_id'],
48
- ["GO Namespace", 'namespace_1003'],
49
- ]
50
-
51
- $biomart_go_2009= [
52
- ["GO BP ID", 'go_biological_process_id'],
53
- ["GO MF ID", 'go_molecular_function_id'],
54
- ["GO CC ID", 'go_cellular_component_id'],
55
- ]
56
-
57
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
58
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
59
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
60
-
41
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
42
+ Thread.current["namespace"] = $namespace
43
+ load Organism.rake_organism_helper
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10116]
7
2
  $scientific_name = "Rattus norvegicus"
8
3
 
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
50
45
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
51
46
  ]
52
47
 
53
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
54
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
55
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
48
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
49
+ Thread.current["namespace"] = $namespace
50
+ load Organism.rake_organism_helper
@@ -0,0 +1,38 @@
1
+ $taxs = [559292,4932]
2
+ $scientific_name = "Saccharomyces cerevisiae"
3
+ $ensembl_domain = 'fungi'
4
+ #$ortholog_key = "yeast_ensembl_gene"
5
+
6
+ $biomart_db = 'scerevisiae_eg_gene'
7
+
8
+ $biomart_lexicon = [
9
+ [ 'Associated Gene Name' , "external_gene_name"],
10
+ ]
11
+
12
+ $biomart_protein_identifiers = [
13
+ [ 'Protein ID', "protein_id" ],
14
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
15
+ [ 'Unigene ID', "unigene" ],
16
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
17
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
18
+ ]
19
+
20
+ $biomart_probe_identifiers = [
21
+ ]
22
+
23
+ $biomart_identifiers = [
24
+ [ 'Entrez Gene ID', "entrezgene"],
25
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
26
+ [ 'Associated Gene Name', "external_gene_name" ],
27
+ [ 'Protein ID', "protein_id" ],
28
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
29
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
30
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
31
+ [ 'EMBL (Genbank) ID' , "embl"] ,
32
+ [ 'RefSeq DNA' , "refseq_dna"] ,
33
+ ]
34
+
35
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
36
+ Thread.current["namespace"] = $namespace
37
+ Thread.current["ensembl_domain"] = $ensembl_domain
38
+ load Organism.rake_organism_helper
@@ -1,8 +1,11 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
2
+
1
3
  require 'net/ftp'
4
+ require 'rbbt/sources/biomart'
5
+ require 'rbbt/sources/entrez'
6
+ require File.join(File.dirname(__FILE__), '../lib/helpers')
2
7
  require 'rbbt/sources/ensembl_ftp'
3
8
 
4
- #Thread.current['namespace'] = $namespace
5
-
6
9
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
7
10
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
8
11
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -77,6 +80,17 @@ $biomart_pfam= [
77
80
  ["Pfam Domain", 'pfam'],
78
81
  ]
79
82
 
83
+ $biomart_go= [
84
+ ["GO ID", 'go_id'],
85
+ ["GO Namespace", 'namespace_1003'],
86
+ ]
87
+
88
+ $biomart_go_2009= [
89
+ ["GO BP ID", 'go_biological_process_id'],
90
+ ["GO MF ID", 'go_molecular_function_id'],
91
+ ["GO CC ID", 'go_cellular_component_id'],
92
+ ]
93
+
80
94
  $biomart_gene_biotype= [
81
95
  ["Biotype", 'gene_biotype'],
82
96
  ]
@@ -91,7 +105,13 @@ $biomart_exons = [
91
105
  #{{{ Rules
92
106
 
93
107
  file 'entrez_taxids' do |t|
94
- Misc.sensiblewrite(t.name, $taxs * "\n")
108
+ if $tax && $tax.any?
109
+ Misc.sensiblewrite(t.name, $taxs * "\n")
110
+ else
111
+ tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
112
+ taxs = tsv[$scientific_name] || []
113
+ Misc.sensiblewrite(t.name, taxs * "\n")
114
+ end
95
115
  end
96
116
 
97
117
  file 'scientific_name' do |t|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
104
124
  Misc.sensiblewrite(t.name, $ortholog_key)
105
125
  end
106
126
 
107
- file 'identifiers' do |t|
127
+ file 'identifiers' => 'entrez_taxids' do |t|
128
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
108
129
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
109
130
  identifiers.unnamed = true
110
131
 
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
116
137
  end
117
138
 
118
139
  name_pos = identifiers.identify_field "Associated Gene Name"
119
- entrez2name = Entrez.entrez2name($taxs)
120
- identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
121
- names = values[name_pos]
140
+ if tax_codes and tax_codes.any?
141
+ entrez2name = Entrez.entrez2name(tax_codes)
142
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
143
+ names = values[name_pos] || []
122
144
 
123
- matches = entrez.select do |e|
124
- entrez2name.include?(e) && (names & entrez2name[e]).any?
125
- end
145
+ matches = entrez.select do |e|
146
+ entrez2name.include?(e) && (names & entrez2name[e]).any?
147
+ end
126
148
 
127
- if matches.any?
128
- matches
129
- else
130
- entrez
149
+ if matches.any?
150
+ matches
151
+ else
152
+ entrez
153
+ end
131
154
  end
132
155
  end
133
156
 
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
147
170
  identifiers = identifiers.reorder(:key, ordered_fields)
148
171
  end
149
172
 
150
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
151
- entrez_synonyms.key_field = "Entrez Gene ID"
152
- entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
173
+ if tax_codes and tax_codes.any?
174
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
175
+ entrez_synonyms.key_field = "Entrez Gene ID"
176
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
153
177
 
154
- identifiers.attach entrez_synonyms
178
+ identifiers.attach entrez_synonyms
179
+ end
155
180
 
156
181
  identifiers.with_unnamed do
157
182
  identifiers.each do |key, values|
158
183
  values.each do |list|
184
+ list ||= []
159
185
  list.reject!{|v| v.nil? or v.empty?}
160
186
  list.uniq!
161
187
  end
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
166
192
  Misc.sensiblewrite(t.name, identifiers.to_s)
167
193
  end
168
194
 
169
- file 'lexicon' => 'identifiers' do |t|
195
+ file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
170
196
  tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
197
+ tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
171
198
 
172
- entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
199
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
173
200
  entrez_description.key_field = "Entrez Gene ID"
174
201
  entrez_description.fields = ["Entrez Gene Description"]
175
202
 
@@ -308,8 +335,9 @@ end
308
335
 
309
336
  # {{{ Other info
310
337
 
311
- file 'gene_pmids' do |t|
312
- tsv = Entrez.entrez2pubmed($taxs)
338
+ file 'gene_pmids' => 'entrez_taxids' do |t|
339
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
340
+ tsv = Entrez.entrez2pubmed(tax_codes)
313
341
  text = "#: :namespace=#{Thread.current['namespace']}\n"
314
342
  text += "#Entrez Gene ID\tPMID"
315
343
  tsv.each do |gene, pmids|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
417
445
 
418
446
  gene_go.monitor = true
419
447
  gene_go.process "GO ID" do |key, go_id, values|
420
- clean = values.zip_fields.select do |id, type|
448
+ clean = NamedArray.zip_fields(values).select do |id, type|
421
449
  type == "biological_process"
422
450
  end
423
451
  clean.collect{|id, type| id}
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
487
515
  end
488
516
 
489
517
  file 'chromosomes' do |t|
490
- goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
518
+ tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
491
519
 
492
- Misc.sensiblewrite(t.name, goterms.to_s)
520
+ Misc.sensiblewrite(t.name, tsv.keys * "\n")
493
521
  end
494
522
 
495
523
  file 'blacklist_chromosomes' => 'chromosomes' do |t|
@@ -511,6 +539,15 @@ end
511
539
 
512
540
  rule /^chromosome_.*/ do |t|
513
541
  chr = t.name.match(/chromosome_(.*)/)[1]
542
+ path = File.expand_path(t.name)
543
+ dirname = File.dirname(path)
544
+
545
+ organism = File.basename(dirname)
546
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
547
+ archive = organism
548
+ organism = File.basename(File.dirname(dirname))
549
+ organism = File.join(organism, archive)
550
+ end
514
551
 
515
552
  # HACK: Skip LRG chromosomes
516
553
  raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
519
556
 
520
557
  release = Ensembl.releases[archive]
521
558
 
522
- ftp = Net::FTP.new("ftp.ensembl.org")
559
+ fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
560
+ server, _, path = fasta_url.partition("/")
561
+ path = "/" + path
562
+
563
+ ftp = Net::FTP.new(server)
523
564
  ftp.passive = true
524
565
  ftp.login
525
- if release.nil? or release == 'current'
526
- ftp.chdir("pub/current_fasta/")
527
- else
528
- ftp.chdir("pub/#{ release }/fasta/")
529
- end
530
- ftp.chdir($scientific_name.downcase.sub(" ",'_'))
566
+ ftp.chdir(path)
531
567
  ftp.chdir('dna')
532
- file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
533
-
534
- raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
535
568
 
536
- Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
569
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
570
+ if file
571
+ Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
537
572
 
538
- Misc.lock t.name + '.rake' do
539
- TmpFile.with_file do |tmpfile|
540
- ftp.getbinaryfile(file, tmpfile)
541
- Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
542
- ftp.close
573
+ Misc.lock t.name + '.rake' do
574
+ TmpFile.with_file do |tmpfile|
575
+ ftp.getbinaryfile(file, tmpfile)
576
+ Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
577
+ ftp.close
578
+ end
543
579
  end
580
+ else
581
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
582
+ Misc.lock t.name + '.rake' do
583
+ TmpFile.with_file do |tmpfile|
584
+ ftp.getbinaryfile(file, tmpfile)
585
+ txt = Open.read(tmpfile, :gzip => true)
586
+
587
+ chr_txt = []
588
+
589
+ in_chr = false
590
+ txt.split("\n").each do |line|
591
+ if line.start_with?(">#{chr}")
592
+ in_chr = true
593
+ elsif line.start_with?(">")
594
+ in_chr = false
595
+ else
596
+ chr_txt << line if in_chr
597
+ end
598
+ end
599
+ Misc.sensiblewrite(t.name, chr_txt * "" )
600
+ ftp.close
601
+ end
602
+ end
603
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
544
604
  end
545
605
  end
546
606
 
@@ -584,6 +644,16 @@ end
584
644
  require 'bio'
585
645
 
586
646
  file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
647
+ path = File.expand_path(t.name)
648
+ dirname = File.dirname(path)
649
+
650
+ organism = File.basename(dirname)
651
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
652
+ archive = organism
653
+ organism = File.basename(File.dirname(dirname))
654
+ organism = File.join(organism, archive)
655
+ end
656
+
587
657
  exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
588
658
 
589
659
  chr_transcript_ranges ||= {}
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
616
686
  chr_transcript_ranges.each do |chr, transcript_ranges|
617
687
  begin
618
688
  raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
619
- p = File.expand_path("./chromosome_#{chr}")
620
- Organism.root.annotate p
621
- p.sub!(%r{.*/organisms/},'share/organisms/')
622
- chr_str = p.produce.read
689
+ pkgdir = Thread.current["resource"]
690
+ p = pkgdir[organism]["chromosome_#{chr}"]
691
+ p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
692
+ chr_str = p.read
623
693
  rescue Exception
624
694
  Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
625
695
  raise $! unless $!.message =~ /not supported/
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
656
726
  organism = File.join(organism, archive)
657
727
  end
658
728
 
659
- translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
729
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
660
730
 
661
731
  if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
662
732
  exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
670
740
  transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
671
741
  end
672
742
 
673
- transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
674
- transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
675
- exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
743
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
744
+ transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
745
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
676
746
 
677
747
  transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
678
748
  transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
@@ -719,12 +789,13 @@ end
719
789
  file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
720
790
  transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
721
791
  transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
722
- transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
792
+ transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
723
793
  transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
724
794
  transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
725
795
 
726
796
 
727
797
  protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
798
+ transcript_sequence.monitor = true
728
799
  transcript_sequence.through do |transcript, sequence|
729
800
  protein = transcript_protein[transcript]
730
801
  next if protein.nil? or protein.empty?
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
777
848
  uni_seq = UniProt.get_uniprot_sequence(uni)
778
849
  ensps = uni2ensps[uni]
779
850
  next if ensps.nil? or ensps.empty?
851
+
780
852
  best_ensp = ensps.sort_by do |ensp|
781
853
  ensp_seq = ensp2seq[ensp]
782
854
  if ensp_seq
@@ -829,3 +901,4 @@ file 'cdna_fasta' do |t|
829
901
  Open.download(url, "#{t.name}.gz")
830
902
  nil
831
903
  end
904
+
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
3
3
  require 'rbbt/sources/pubmed'
4
4
  require 'test/unit'
5
5
  require 'rbbt/sources/biomart'
6
+ require 'rbbt/sources/organism'
6
7
  require 'rbbt/util/tmpfile'
7
8
  require 'test/unit'
8
9
 
9
10
  class TestBioMart < Test::Unit::TestCase
10
11
 
11
12
  def setup
12
- BioMart.set_archive Organism.default_code("Hsa")
13
+ BioMart.set_archive "feb2014"
13
14
  end
14
15
 
15
16
  def teardown
16
17
  BioMart.unset_archive
17
18
  end
18
19
 
19
- def _test_get
20
+ def test_get_Sce
20
21
  assert_raise BioMart::QueryError do
21
22
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
22
23
  end
23
24
 
24
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :merge => true, :wget_options => {:quiet => false})
25
+ BioMart.set_archive "feb2023-fungi"
26
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
25
27
  tsv = TSV.open data, :double, :merge => true
26
- assert(tsv['852236'][0].include? 'CAA84864')
28
+ assert(tsv['852236'][0].include? 'CAA84864.1')
27
29
 
28
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
30
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
29
31
  tsv = TSV.open data, :double, :merge => true
30
32
  assert(tsv['852236'][1].include? 'YBL044W')
31
33
  end
32
34
 
33
- def _test_query
35
+ def test_get_Hsa
36
+ Log.severity = 0
37
+ data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
38
+ tsv = TSV.open data, :double, :merge => true
39
+ assert(tsv['852236'][0].include? 'CAA84864.1')
40
+ end
41
+
42
+
43
+ def test_query
34
44
  data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
35
45
  assert(data['852236']['external_gene_id'].include? 'YBL044W')
36
46
 
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
41
51
  end
42
52
  end
43
53
 
54
+ def __test_transcrip_exons
55
+ Log.with_severity 1 do
56
+ TmpFile.with_file do |f|
57
+ fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
58
+ main = fields[0]
59
+ attrs = fields.values_at(1, 2)
60
+ attrs_first = [attrs.first]
61
+ attrs_last = [attrs.last]
62
+ database = 'hsapiens_gene_ensembl'
63
+
64
+ filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
65
+ ppp Open.read(filename)
66
+
67
+ filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
68
+ ppp Open.read(filename)
69
+
70
+ filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
71
+ ppp Open.read(filename)
72
+
73
+ filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
74
+ ppp Open.read(filename)
75
+
76
+ data = TSV.open Open.open(filename)
77
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
78
+ end
79
+ end
80
+ end
81
+
44
82
  def test_tsv
45
83
  data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
46
84
  assert(data['852236']['Protein ID'].include? 'CAA84864')
@@ -0,0 +1,11 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ class TestEnsemblFTP < Test::Unit::TestCase
5
+ def test_ftp_for
6
+ assert_nothing_raised do
7
+ Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
8
+ end
9
+ end
10
+ end
11
+
@@ -14,6 +14,11 @@ class TestEntrez < Test::Unit::TestCase
14
14
  assert(lexicon['855611'].include? 'S000005056')
15
15
  end
16
16
 
17
+ def test_entrez2name
18
+ tax = $yeast_tax
19
+ Entrez.entrez2name(tax)
20
+ end
21
+
17
22
  def test_entrez2pubmed
18
23
  tax = $yeast_tax
19
24
 
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
5
5
 
6
6
  class TestOrganism < Test::Unit::TestCase
7
7
 
8
- def test_known_ids
8
+ def _test_known_ids
9
9
  assert Organism.known_ids("Hsa").include?("Associated Gene Name")
10
10
  end
11
11
 
12
- def test_location
12
+ def _test_location
13
13
  assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
14
14
  end
15
15
 
16
- def test_identifiers
16
+ def _test_identifiers
17
17
  assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
21
21
 
22
- def test_lexicon
22
+ def _test_lexicon
23
23
  assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
24
24
  end
25
25
 
26
- def test_guess_id
26
+ def _test_guess_id
27
27
  ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
28
28
  gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
29
29
  assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
30
30
  assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
31
31
  end
32
32
 
33
- def test_organisms
33
+ def _test_organisms
34
34
  assert Organism.organisms.include? "Hsa"
35
35
  assert_equal "Hsa", Organism.organism("Homo sapiens")
36
36
  end
37
37
 
38
- def test_attach_translations
38
+ def _test_attach_translations
39
39
  tsv = TSV.setup({"1020" => []}, :type => :list)
40
40
  tsv.key_field = "Entrez Gene ID"
41
41
  tsv.fields = []
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
47
47
  assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
48
48
  end
49
49
 
50
- def test_entrez_taxids
50
+ def _test_entrez_taxids
51
51
  assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
52
52
  end
53
53
 
54
- def test_lift_over
54
+ def _test_lift_over
55
55
  mutation_19 = "19:21131664:T"
56
56
  mutation_18 = "19:20923504:T"
57
- source_build = Organism.default_code("Hsa")
57
+ source_build = "Hsa/feb2014"
58
58
  target_build = "Hsa/may2009"
59
59
 
60
60
  assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
61
61
  assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
62
62
  end
63
63
 
64
- def test_orhtolog
64
+ def _test_orhtolog
65
65
  require 'rbbt/entity/gene'
66
66
  assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
67
67
  end
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
70
70
  assert Organism.chromosome_sizes["2"].to_i > 10_000_000
71
71
  end
72
72
 
73
- def test_build_organism
73
+ def _test_build_organism
74
74
  assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
75
75
  assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
76
76
  assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
77
77
  end
78
78
 
79
- #def test_genes_at_chromosome
79
+ #def _test_genes_at_chromosome
80
80
  # pos = [12, 117799500]
81
81
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
82
82
  #end
83
83
 
84
- #def test_genes_at_chromosome_array
84
+ #def _test_genes_at_chromosome_array
85
85
  # pos = [12, [117799500, 106903900]]
86
86
  # assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
87
87
  #end
88
88
 
89
- #def test_genes_at_genomic_positions
89
+ #def _test_genes_at_genomic_positions
90
90
  # pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
91
91
  # assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
92
92
  #end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.0
4
+ version: 3.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-05-08 00:00:00.000000000 Z
11
+ date: 2025-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -120,10 +120,10 @@ files:
120
120
  - share/install/KEGG/Rakefile
121
121
  - share/install/Matador/Rakefile
122
122
  - share/install/NCI/Rakefile
123
- - share/install/Organism/Hsa/Rakefile
124
- - share/install/Organism/Mmu/Rakefile
125
- - share/install/Organism/Rno/Rakefile
126
- - share/install/Organism/Sce/Rakefile
123
+ - share/install/Organism/Hsa.rake
124
+ - share/install/Organism/Mmu.rake
125
+ - share/install/Organism/Rno.rake
126
+ - share/install/Organism/Sce.rake
127
127
  - share/install/Organism/organism_helpers.rb
128
128
  - share/install/PharmaGKB/Rakefile
129
129
  - share/install/Pina/Rakefile
@@ -133,6 +133,7 @@ files:
133
133
  - share/install/lib/rake_helper.rb
134
134
  - test/rbbt/sources/test_HPRD.rb
135
135
  - test/rbbt/sources/test_biomart.rb
136
+ - test/rbbt/sources/test_ensembl_ftp.rb
136
137
  - test/rbbt/sources/test_entrez.rb
137
138
  - test/rbbt/sources/test_go.rb
138
139
  - test/rbbt/sources/test_gscholar.rb
@@ -166,13 +167,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
167
  - !ruby/object:Gem::Version
167
168
  version: '0'
168
169
  requirements: []
169
- rubygems_version: 3.5.9
170
+ rubygems_version: 3.5.23
170
171
  signing_key:
171
172
  specification_version: 4
172
173
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
173
174
  test_files:
174
175
  - test/rbbt/sources/test_HPRD.rb
175
176
  - test/rbbt/sources/test_biomart.rb
177
+ - test/rbbt/sources/test_ensembl_ftp.rb
176
178
  - test/rbbt/sources/test_entrez.rb
177
179
  - test/rbbt/sources/test_go.rb
178
180
  - test/rbbt/sources/test_gscholar.rb
@@ -1,52 +0,0 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
- $taxs = [559292,4932]
7
- $scientific_name = "Saccharomyces cerevisiae"
8
- #$ortholog_key = "yeast_ensembl_gene"
9
-
10
- $biomart_db = 'scerevisiae_gene_ensembl'
11
-
12
- $biomart_lexicon = [
13
- [ 'Associated Gene Name' , "external_gene_id"],
14
- ]
15
-
16
- $biomart_protein_identifiers = [
17
- [ 'Protein ID', "protein_id" ],
18
- [ 'RefSeq Protein ID', "refseq_peptide" ],
19
- [ 'Unigene ID', "unigene" ],
20
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
21
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
22
- ]
23
-
24
- $biomart_probe_identifiers = [
25
- ]
26
-
27
- $biomart_identifiers = [
28
- [ 'Entrez Gene ID', "entrezgene"],
29
- [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
30
- [ 'Associated Gene Name', "external_gene_id" ],
31
- [ 'Protein ID', "protein_id" ],
32
- [ 'RefSeq Protein ID', "refseq_peptide" ],
33
- [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
34
- [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
35
- [ 'EMBL (Genbank) ID' , "embl"] ,
36
- [ 'RefSeq mRNA' , "refseq_mrna"] ,
37
- ]
38
-
39
- $biomart_go= [
40
- ["GO ID", 'go_id'],
41
- ["GO Namespace", 'namespace_1003'],
42
- ]
43
-
44
- $biomart_go_2009= [
45
- ["GO BP ID", 'go_biological_process_id'],
46
- ["GO MF ID", 'go_molecular_function_id'],
47
- ["GO CC ID", 'go_cellular_component_id'],
48
- ]
49
-
50
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
51
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
52
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')