bio-sra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +20 -0
- data/README.md +66 -0
- data/Rakefile +634 -0
- data/VERSION +1 -0
- data/bin/sra_download +170 -0
- data/config/database.yml +7 -0
- data/lib/bio-sra.rb +19 -0
- data/lib/bio/sra/connect.rb +39 -0
- data/lib/bio/sra/sra.rb +152 -0
- data/lib/bio/sra/tables.rb +467 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-sra.rb +78 -0
- data/test/test_sra_download.rb +86 -0
- metadata +216 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/sra_download
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line tool that downloads run files from the SRA.
#
# Accessions are given either as arguments or in a file (--file). Run
# accessions are downloaded directly; any other accession type (submission,
# study, sample, experiment) is first expanded into its runs by querying the
# SRA metadata database. Files that already exist in the current directory
# are skipped.

require 'optparse'
require 'bio-logger'
require 'csv'
require 'bio-commandeer'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
require 'bio-sra'

SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :format => :sra,
  :accessions_file => nil,
  :download_all_from_study => false,
  :treat_input_as_runs => false,
  :dry_run => false,
}
o = OptionParser.new do |opts|
  opts.banner = "
    Usage: #{SCRIPT_NAME} <SRA_ACCESSION>

    Download data from SRA \n"

  opts.on('-f', "--file FILENAME", "Provide a file of accession numbers, separated by whitespace or commas [default: not used, use the first argument <SRA_ACCESSION>]") do |f|
    options[:accessions_file] = f
  end
  opts.on("--format FORMAT", "format for download [default: 'sra']") do |f|
    format_string_to_sym = {
      'sralite' => :sralite, # no longer supported by NCBI?
      'sra' => :sra,
    }
    options[:format] = format_string_to_sym[f]

    if options[:format].nil?
      raise "Unexpected file format specified '#{f}'. I require one of #{format_string_to_sym.keys.join(', ') }"
    end
  end
  opts.on("--dry-run", "Don't download any instead print the URLs to download to the command line [default: not this, do download]") do
    options[:dry_run] = true
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
    Bio::Log::CLI.trace('error')
  end
  opts.on("--logger filename",String,"Log to file [default #{options[:logger] }]") do |name|
    options[:logger] = name
  end
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |s|
    Bio::Log::CLI.trace(s)
  end
end
o.parse!

# Without an accessions file or at least one argument there is nothing to do
if options[:accessions_file].nil? && ARGV.empty?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger]) #bio-logger defaults to STDERR not STDOUT, I disagree
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)

# Gather the raw accession tokens, split on whitespace and/or commas
almost_accessions =
  if options[:accessions_file]
    log.debug "Reading SRA accessions from file #{options[:accessions_file] }"
    # File.read closes the file handle, unlike File.open(...).read
    File.read(options[:accessions_file]).split(/[\s,]+/)
  else
    ARGV.collect { |r| r.split(/[\s,]+/) }.flatten
  end
# Remove empty strings and extra digits at the end e.g. SRA029325.1 => SRA029325
accessions = almost_accessions.reject { |a| a.empty? }.collect { |a| a.sub(/\.\d+\z/, '') }
log.info "Read in #{accessions.length} accessions"

# Do we need to connect to the database? Only yes if there are accessions that are not runs
first_non_run = accessions.find do |acc|
  Bio::SRA::Accession.classify_accession_type(acc) != Bio::SRA::RUN
end
options[:treat_input_as_runs] = first_non_run.nil?

# Connect to the database if required
unless options[:treat_input_as_runs]
  log.debug "Found accession number #{first_non_run} that does not appear to be a run accession, so need to connect to database" if log.debug?
  log.info "Connecting to database.."
  Bio::SRA::Connection.connect
end

log.info "Collecting a list of runs to download.."
# Each entry is [run_accession, accession_the_user_asked_for]
runs = []
accessions.each do |acc|
  if options[:treat_input_as_runs]
    runs.push [acc, acc]
    next
  end

  # Look the accession up in the column of the sra table that matches its type
  column = Bio::SRA::Accession.accession_to_column_name(acc)
  sra_runs = Bio::SRA::Tables::SRA.where(column => acc)

  if sra_runs.empty?
    log.warn "Unable to find accession number #{acc} in the metadata database, skipping"
    next
  end
  log.debug "Found #{sra_runs.length} runs to download for accession number #{acc}"

  # Convert Run ActiveRecords into simple accessions
  sra_runs.each do |r|
    runs.push [r.run_accession, acc]
  end
end

# Remove duplicate runs
runs.uniq! { |run_acc| run_acc[0] }
log.info "Found #{runs.length} unique run files to download, downloading them now.."

num_skipped = 0
num_downloaded = 0
num_failed = 0
runs.each_with_index do |(run, acc), index|
  # Use the same extension as the download URL so the already-downloaded
  # check looks for the file name wget actually writes (e.g. .lite.sra)
  download_path = "#{run}#{Bio::SRA::Accession.format_symbol_to_extension(options[:format])}"
  log.debug "Downloading to #{download_path}"

  if File.exist?(download_path)
    log.debug "Skipping download of run #{download_path} since a file of that accession already exists"
    num_skipped += 1
    next
  end

  if run == acc
    log.info "Downloading run #{run} (#{index+1}/#{runs.length})"
  else
    log.info "Downloading run #{run} from #{acc} (#{index+1}/#{runs.length})"
  end

  url = Bio::SRA::Accession.run_download_url(run, :format => options[:format])

  if options[:dry_run]
    puts url
    num_downloaded += 1
  elsif system('wget', url)
    # The multi-argument form of system bypasses the shell, so characters in
    # a user-supplied accession cannot be interpreted as shell syntax
    num_downloaded += 1
  else
    num_failed += 1
    log.error "Failed to download #{url}"
  end
end

if options[:dry_run]
  log.info "Finished printing #{num_downloaded} URLs, ignoring #{num_skipped} already downloaded"
else
  log.info "Finished downloading #{num_downloaded}, ignoring #{num_skipped} already downloaded"
end

# Signal failure to the calling shell when any download did not succeed
if num_failed > 0
  log.error "#{num_failed} downloads failed"
  exit 1
end
|
data/config/database.yml
ADDED
data/lib/bio-sra.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
require 'active_record'
|
11
|
+
|
12
|
+
require 'bio-logger'
|
13
|
+
Bio::Log::LoggerPlus.new('bio-sra')
|
14
|
+
|
15
|
+
require 'bio/sra/connect'
|
16
|
+
require 'bio/sra/sra'
|
17
|
+
require 'bio/sra/tables'
|
18
|
+
|
19
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Bio
  module SRA
    # Shortcut for Connection.connect using the default database path.
    def self.connect
      Connection.connect
    end

    # Abstract ActiveRecord base class holding the connection to the
    # SRAmetadb SQLite database.
    class Connection < ActiveRecord::Base
      self.abstract_class = true

      # Connect to a metadata database.
      #
      # * sra_metadb_path: path to the gunzipped SRAmetadb.sqlite file. The
      #   default points at db/SRAmetadb.sqlite three directories above this
      #   file, which probably isn't where the db file actually is.
      #
      # You can download the file like so:
      #
      #   $ wget http://watson.nci.nih.gov/~zhujack/SRAmetadb.sqlite.gz
      #   $ gunzip SRAmetadb.sqlite.gz
      def self.connect(sra_metadb_path=File.join(File.dirname(__FILE__),'..','..','..','db','SRAmetadb.sqlite'))
        Bio::Log::LoggerPlus['bio-sra'].info "Attempting to connect to database #{sra_metadb_path}"

        # sqlite3 adapter with a pool of 5 connections and a timeout of 5000
        establish_connection(
          :adapter => 'sqlite3',
          :database => sra_metadb_path,
          :pool => 5,
          :timeout => 5000
        )
      end
    end
  end
end
|
data/lib/bio/sra/sra.rb
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
module Bio
  module SRA
    # The five kinds of SRA accession, as the strings returned by
    # Accession.classify_accession_type
    SUBMISSION = 'submission'
    STUDY = 'study'
    SAMPLE = 'sample'
    EXPERIMENT = 'experiment'
    RUN = 'run'

    # Helpers for classifying SRA accession strings and building download
    # URLs for run accessions.
    class Accession
      # Three-letter accession prefix => accession type.
      #
      # valid_in_type <- c(SRA = "submission", ERA = "submission",
      #   DRA = "submission", SRP = "study", ERP = "study", DRP = "study",
      #   SRS = "sample", ERS = "sample", DRS = "sample", SRX = "experiment",
      #   ERX = "experiment", DRX = "experiment", SRR = "run",
      #   ERR = "run", DRR = "run")
      ACCESSION_TO_TYPE = {
        'SRA' => Bio::SRA::SUBMISSION,
        'ERA' => Bio::SRA::SUBMISSION,
        'DRA' => Bio::SRA::SUBMISSION,
        'SRP' => Bio::SRA::STUDY,
        'ERP' => Bio::SRA::STUDY,
        'DRP' => Bio::SRA::STUDY,
        'SRS' => Bio::SRA::SAMPLE,
        'ERS' => Bio::SRA::SAMPLE,
        'DRS' => Bio::SRA::SAMPLE,
        'SRX' => Bio::SRA::EXPERIMENT,
        'ERX' => Bio::SRA::EXPERIMENT,
        'DRX' => Bio::SRA::EXPERIMENT,
        'SRR' => Bio::SRA::RUN,
        'ERR' => Bio::SRA::RUN,
        'DRR' => Bio::SRA::RUN,
      }.freeze

      # Accession type => column of the sra table holding that accession
      TYPE_TO_COLUMN = {
        Bio::SRA::SUBMISSION => :submission_accession,
        Bio::SRA::STUDY => :study_accession,
        Bio::SRA::SAMPLE => :sample_accession,
        Bio::SRA::EXPERIMENT => :experiment_accession,
        Bio::SRA::RUN => :run_accession,
      }.freeze

      # Format symbol => directory name used for that format in download URLs
      FORMAT_TO_STANDARD_TEXT = {
        :sralite => 'litesra',
        :sra => 'sra',
        :fastq_gz => 'fastq',
        :sff => 'sff'
      }.freeze

      # Formats whose file extension is not simply ".<standard text>"
      NON_STANDARD_EXTENSIONS = {
        :sralite => '.lite.sra',
        :fastq_gz => '.fastq.gz',
      }.freeze

      # EBI layout => suffix inserted between the run accession and extension
      EBI_LAYOUT_SUFFIXES = {
        :single => '',
        :paired1 => '_1',
        :paired2 => '_2',
      }.freeze

      # The 'bio-sra' logger, looked up on first use (and created if it does
      # not exist yet) so this file does not depend on the logger having been
      # registered before it is loaded.
      def self.log
        @log ||= Bio::Log::LoggerPlus['bio-sra'] || Bio::Log::LoggerPlus.new('bio-sra')
      end
      private_class_method :log

      # Return the type of an accession (one of Bio::SRA::SUBMISSION, STUDY,
      # SAMPLE, EXPERIMENT or RUN), decided by its first three characters.
      #
      # Raises a RuntimeError when the prefix is not recognised.
      def self.classify_accession_type(accession)
        type = ACCESSION_TO_TYPE[accession.to_s[0..2]]
        raise "Unrecognised accession string '#{accession}'" if type.nil?

        log.debug "Classified #{accession} as SRA type '#{type}'" if log.debug?
        type
      end

      # Given an accession, return the name of the column in the SRA table that contains it, as a symbol.
      # e.g. accession_to_column_name('SRR617581') => :run_accession
      def self.accession_to_column_name(accession)
        TYPE_TO_COLUMN[classify_accession_type(accession)]
      end

      # Return the file extension (including the leading dot) for a format
      # symbol, e.g. :sra => '.sra', :sralite => '.lite.sra'.
      #
      # Raises a RuntimeError when the format is unknown.
      def self.format_symbol_to_extension(format_symbol)
        style = format_symbol_to_standard_text(format_symbol)

        # Default extension is the same as the format
        NON_STANDARD_EXTENSIONS[format_symbol] || ".#{style}"
      end

      # Return the directory name used in download URLs for a format symbol,
      # e.g. :sralite => 'litesra'.
      #
      # Raises a RuntimeError when the format is unknown.
      def self.format_symbol_to_standard_text(format_symbol)
        style = FORMAT_TO_STANDARD_TEXT[format_symbol]
        if style.nil?
          raise "Unexpected download format detected #{format_symbol}, I need one of '#{FORMAT_TO_STANDARD_TEXT.keys.join(', ')}'"
        end
        style
      end

      # Return the URL where a run can be downloaded. Only works if the accession is a run accession e.g. SRR000002 or DRR000002. To get run accessions from other accession type e.g. SRP000002, try Bio::SRA::Tables::SRA
      #
      # Options:
      # :source: either :ncbi (default), or :ebi
      # :format: either :sralite (default if :source is :ncbi), :fastq_gz (default if :source is :ebi), :sra
      # :layout: either :single (default), :paired1, or :paired2. :paired1 for the first half, :paired2 for the second half. Only required when :source => :ebi, otherwise not used
      #
      # The options hash passed in is not modified. Raises a RuntimeError for
      # a non-run accession, an unknown format, source or layout.
      def self.run_download_url(run_accession, options={})
        # Work on a copy so defaults are not written into the caller's hash
        opts = (options || {}).dup
        opts[:source] ||= :ncbi
        if opts[:source] == :ebi
          opts[:format] ||= :fastq_gz
          opts[:layout] ||= :single
        else
          opts[:format] ||= :sralite #default to sralite
        end

        type = classify_accession_type(run_accession)
        unless type == Bio::SRA::RUN
          raise "Unexpected type of accession for '#{run_accession}': Expected #{Bio::SRA::RUN} but was #{type}"
        end

        style = format_symbol_to_standard_text(opts[:format])
        style_extension = format_symbol_to_extension(opts[:format])

        case opts[:source]
        when :ncbi
          # e.g.
          # ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/litesra/DRR/DRR000/DRR000002/DRR000002/DRR000002.lite.sra
          [
            "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun",
            style,
            run_accession[0..2],
            run_accession[0..5],
            run_accession,
            "#{run_accession}#{style_extension}"
          ].join('/')
        when :ebi
          unless style == 'fastq'
            raise "Unexpected format for download detected #{opts[:format]} in combination with :source => :ebi. Require :fastq_gz"
          end
          layout_suffix = EBI_LAYOUT_SUFFIXES[opts[:layout]]
          if layout_suffix.nil?
            raise "Unexpected layout for download detected #{opts[:layout]} in combination with :source => :ebi. Require one of #{EBI_LAYOUT_SUFFIXES.keys.join(', ')}."
          end
          # e.g. for paired ended
          # ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR069/SRR069027/SRR069027_1.fastq.gz
          # e.g. for single end
          # ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR404/SRR404303/SRR404303.fastq.gz
          [
            'ftp://ftp.sra.ebi.ac.uk/vol1',
            style,
            run_accession[0..5],
            run_accession,
            "#{run_accession}#{layout_suffix}#{style_extension}"
          ].join('/')
        else
          raise "Unexpected download source detected #{opts[:source]}. Require one of ncbi, ebi"
        end
      end
    end
  end
end
|
@@ -0,0 +1,467 @@
|
|
1
|
+
module Bio
|
2
|
+
module SRA
|
3
|
+
module Tables
|
4
|
+
# > pp Bio::SRA::Tables::SRA.column_names
|
5
|
+
# ["sra_ID",
|
6
|
+
# "SRR_bamFile",
|
7
|
+
# "SRX_bamFile",
|
8
|
+
# "SRX_fastqFTP",
|
9
|
+
# "run_ID",
|
10
|
+
# "run_alias",
|
11
|
+
# "run_accession",
|
12
|
+
# "run_date",
|
13
|
+
# "updated_date",
|
14
|
+
# "spots",
|
15
|
+
# "bases",
|
16
|
+
# "run_center",
|
17
|
+
# "experiment_name",
|
18
|
+
# "run_url_link",
|
19
|
+
# "run_entrez_link",
|
20
|
+
# "run_attribute",
|
21
|
+
# "experiment_ID",
|
22
|
+
# "experiment_alias",
|
23
|
+
# "experiment_accession",
|
24
|
+
# "experiment_title",
|
25
|
+
# "study_name",
|
26
|
+
# "sample_name",
|
27
|
+
# "design_description",
|
28
|
+
# "library_name",
|
29
|
+
# "library_strategy",
|
30
|
+
# "library_source",
|
31
|
+
# "library_selection",
|
32
|
+
# "library_layout",
|
33
|
+
# "library_construction_protocol",
|
34
|
+
# "adapter_spec",
|
35
|
+
# "read_spec",
|
36
|
+
# "platform",
|
37
|
+
# "instrument_model",
|
38
|
+
# "instrument_name",
|
39
|
+
# "platform_parameters",
|
40
|
+
# "sequence_space",
|
41
|
+
# "base_caller",
|
42
|
+
# "quality_scorer",
|
43
|
+
# "number_of_levels",
|
44
|
+
# "multiplier",
|
45
|
+
# "qtype",
|
46
|
+
# "experiment_url_link",
|
47
|
+
# "experiment_entrez_link",
|
48
|
+
# "experiment_attribute",
|
49
|
+
# "sample_ID",
|
50
|
+
# "sample_alias",
|
51
|
+
# "sample_accession",
|
52
|
+
# "taxon_id",
|
53
|
+
# "common_name",
|
54
|
+
# "anonymized_name",
|
55
|
+
# "individual_name",
|
56
|
+
# "description",
|
57
|
+
# "sample_url_link",
|
58
|
+
# "sample_entrez_link",
|
59
|
+
# "sample_attribute",
|
60
|
+
# "study_ID",
|
61
|
+
# "study_alias",
|
62
|
+
# "study_accession",
|
63
|
+
# "study_title",
|
64
|
+
# "study_type",
|
65
|
+
# "study_abstract",
|
66
|
+
# "center_project_name",
|
67
|
+
# "study_description",
|
68
|
+
# "study_url_link",
|
69
|
+
# "study_entrez_link",
|
70
|
+
# "study_attribute",
|
71
|
+
# "related_studies",
|
72
|
+
# "primary_study",
|
73
|
+
# "submission_ID",
|
74
|
+
# "submission_accession",
|
75
|
+
# "submission_comment",
|
76
|
+
# "submission_center",
|
77
|
+
# "submission_lab",
|
78
|
+
# "submission_date"]
|
79
|
+
# ActiveRecord model for the 'sra' table, keyed by 'sra_ID'. Each row carries
# run, experiment, sample, study and submission accessions and IDs.
class SRA < Connection
  self.table_name = 'sra'
  self.primary_key = 'sra_ID'

  # Foreign keys
  belongs_to :submission, :foreign_key => 'submission_ID', :class_name => 'Submission', :primary_key => 'submission_ID'
  belongs_to :experiment, :foreign_key => 'experiment_ID', :class_name => 'Experiment', :primary_key => 'experiment_ID'
  belongs_to :study, :foreign_key => 'study_ID', :class_name => 'Study', :primary_key => 'study_ID'
  belongs_to :sample, :foreign_key => 'sample_ID', :class_name => 'Sample', :primary_key => 'sample_ID'
  belongs_to :run, :foreign_key => 'run_ID', :class_name => 'Run', :primary_key => 'run_ID'

  # Scope for finding by an arbitrary SRA accession number e.g.
  #   SRA.accession('SRA049809').all #=> SRA objects that are part of the SRA049809 submission
  #   SRA.accession('SRR404303').first #=> SRA object for the SRR404303 run (there is only 1 since this is a run accession)
  #
  # The accession type picks the column to match against. The body returns a
  # relation (via where) rather than a :conditions hash, so the scope stays
  # chainable and works on ActiveRecord versions that require a relation.
  scope :accession, lambda { |accession|
    where(Bio::SRA::Accession.accession_to_column_name(accession) => accession)
  }

  # URLs of all the runs in this project, i.e. of every row of the sra table
  # sharing this record's study_accession. options are passed on to
  # download_url for each of them.
  def study_download_urls(options = {})
    SRA.where(:study_accession => study_accession).collect do |run|
      run.download_url(options)
    end
  end

  # Return the URL where this SRA entry can be downloaded, built from its
  # run_accession by Bio::SRA::Accession.run_download_url (see there for the
  # accepted options).
  #
  # For reference, the R snippet below was left here by the original author:
  # sraFileDir <- paste("ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/",
  #   sraType, "/", substring(sra_acc$experiment[i], 1,
  #   3), "/", substring(sra_acc$experiment[i], 1,
  #   6), "/", sra_acc$experiment[i], "/", sra_acc$run[i],
  #   "/", sep = "")
  def download_url(options = {})
    Bio::SRA::Accession.run_download_url(run_accession, options)
  end
end
|
115
|
+
|
116
|
+
# pp Bio::SRA::Tables::Submission.column_names
|
117
|
+
# ["submission_ID",
|
118
|
+
# "submission_alias",
|
119
|
+
# "submission_accession",
|
120
|
+
# "submission_comment",
|
121
|
+
# "files",
|
122
|
+
# "broker_name",
|
123
|
+
# "center_name",
|
124
|
+
# "lab_name",
|
125
|
+
# "submission_date",
|
126
|
+
# "sra_link",
|
127
|
+
# "submission_url_link",
|
128
|
+
# "xref_link",
|
129
|
+
# "submission_entrez_link",
|
130
|
+
# "ddbj_link",
|
131
|
+
# "ena_link",
|
132
|
+
# "submission_attribute",
|
133
|
+
# "sradb_updated"]
|
134
|
+
# ActiveRecord model for the 'submission' table, keyed by 'submission_ID'.
# A submission is linked to the rows of the sra table with the same
# submission_ID.
class Submission < Connection
  self.table_name  = 'submission'
  self.primary_key = 'submission_ID'

  has_many :sras, class_name: 'SRA', foreign_key: 'submission_ID'
end
|
139
|
+
|
140
|
+
# pp Bio::SRA::Tables::Experiment.column_names
|
141
|
+
# ["experiment_ID",
|
142
|
+
# "bamFile",
|
143
|
+
# "fastqFTP",
|
144
|
+
# "experiment_alias",
|
145
|
+
# "experiment_accession",
|
146
|
+
# "broker_name",
|
147
|
+
# "center_name",
|
148
|
+
# "title",
|
149
|
+
# "study_name",
|
150
|
+
# "study_accession",
|
151
|
+
# "design_description",
|
152
|
+
# "sample_name",
|
153
|
+
# "sample_accession",
|
154
|
+
# "sample_member",
|
155
|
+
# "library_name",
|
156
|
+
# "library_strategy",
|
157
|
+
# "library_source",
|
158
|
+
# "library_selection",
|
159
|
+
# "library_layout",
|
160
|
+
# "targeted_loci",
|
161
|
+
# "library_construction_protocol",
|
162
|
+
# "spot_length",
|
163
|
+
# "adapter_spec",
|
164
|
+
# "read_spec",
|
165
|
+
# "platform",
|
166
|
+
# "instrument_model",
|
167
|
+
# "platform_parameters",
|
168
|
+
# "sequence_space",
|
169
|
+
# "base_caller",
|
170
|
+
# "quality_scorer",
|
171
|
+
# "number_of_levels",
|
172
|
+
# "multiplier",
|
173
|
+
# "qtype",
|
174
|
+
# "sra_link",
|
175
|
+
# "experiment_url_link",
|
176
|
+
# "xref_link",
|
177
|
+
# "experiment_entrez_link",
|
178
|
+
# "ddbj_link",
|
179
|
+
# "ena_link",
|
180
|
+
# "experiment_attribute",
|
181
|
+
# "submission_accession",
|
182
|
+
# "sradb_updated"]
|
183
|
+
# ActiveRecord model for the 'experiment' table, keyed by 'experiment_ID'.
# An experiment is linked to the rows of the sra table with the same
# experiment_ID.
class Experiment < Connection
  self.table_name  = 'experiment'
  self.primary_key = 'experiment_ID'

  has_many :sras, class_name: 'SRA', foreign_key: 'experiment_ID'
end
|
188
|
+
|
189
|
+
# pp Bio::SRA::Tables::Run.column_names
|
190
|
+
# ["run_ID",
|
191
|
+
# "bamFile",
|
192
|
+
# "run_alias",
|
193
|
+
# "run_accession",
|
194
|
+
# "broker_name",
|
195
|
+
# "instrument_name",
|
196
|
+
# "run_date",
|
197
|
+
# "run_file",
|
198
|
+
# "run_center",
|
199
|
+
# "total_data_blocks",
|
200
|
+
# "experiment_accession",
|
201
|
+
# "experiment_name",
|
202
|
+
# "sra_link",
|
203
|
+
# "run_url_link",
|
204
|
+
# "xref_link",
|
205
|
+
# "run_entrez_link",
|
206
|
+
# "ddbj_link",
|
207
|
+
# "ena_link",
|
208
|
+
# "run_attribute",
|
209
|
+
# "submission_accession",
|
210
|
+
# "sradb_updated"]
|
211
|
+
# ActiveRecord model for the 'run' table, keyed by 'run_ID'.
# A run is linked to the rows of the sra table with the same run_ID.
class Run < Connection
  self.table_name  = 'run'
  self.primary_key = 'run_ID'

  has_many :sras, class_name: 'SRA', foreign_key: 'run_ID'
end
|
216
|
+
|
217
|
+
# pp Bio::SRA::Tables::Sample.column_names
|
218
|
+
# ["sample_ID",
|
219
|
+
# "sample_alias",
|
220
|
+
# "sample_accession",
|
221
|
+
# "broker_name",
|
222
|
+
# "center_name",
|
223
|
+
# "taxon_id",
|
224
|
+
# "scientific_name",
|
225
|
+
# "common_name",
|
226
|
+
# "anonymized_name",
|
227
|
+
# "individual_name",
|
228
|
+
# "description",
|
229
|
+
# "sra_link",
|
230
|
+
# "sample_url_link",
|
231
|
+
# "xref_link",
|
232
|
+
# "sample_entrez_link",
|
233
|
+
# "ddbj_link",
|
234
|
+
# "ena_link",
|
235
|
+
# "sample_attribute",
|
236
|
+
# "submission_accession",
|
237
|
+
# "sradb_updated"]
|
238
|
+
# ActiveRecord model for the 'sample' table, keyed by 'sample_ID'.
# A sample is linked to the rows of the sra table with the same sample_ID.
class Sample < Connection
  self.table_name  = 'sample'
  self.primary_key = 'sample_ID'

  has_many :sras, class_name: 'SRA', foreign_key: 'sample_ID'
end
|
243
|
+
|
244
|
+
# pp Bio::SRA::Tables::Study.column_names
|
245
|
+
# ["study_ID",
|
246
|
+
# "study_alias",
|
247
|
+
# "study_accession",
|
248
|
+
# "study_title",
|
249
|
+
# "study_type",
|
250
|
+
# "study_abstract",
|
251
|
+
# "broker_name",
|
252
|
+
# "center_name",
|
253
|
+
# "center_project_name",
|
254
|
+
# "study_description",
|
255
|
+
# "related_studies",
|
256
|
+
# "primary_study",
|
257
|
+
# "sra_link",
|
258
|
+
# "study_url_link",
|
259
|
+
# "xref_link",
|
260
|
+
# "study_entrez_link",
|
261
|
+
# "ddbj_link",
|
262
|
+
# "ena_link",
|
263
|
+
# "study_attribute",
|
264
|
+
# "submission_accession",
|
265
|
+
# "sradb_updated"]
|
266
|
+
# ActiveRecord model for the 'study' table, keyed by 'study_ID'.
# A study is linked to the rows of the sra table with the same study_ID.
class Study < Connection
  self.table_name  = 'study'
  self.primary_key = 'study_ID'

  has_many :sras, class_name: 'SRA', foreign_key: 'study_ID'
end
|
271
|
+
|
272
|
+
# > pp Bio::SRA::Tables::SRAFt.column_names
|
273
|
+
# ["SRR_bamFile",
|
274
|
+
# "SRX_bamFile",
|
275
|
+
# "SRX_fastqFTP",
|
276
|
+
# "run_ID",
|
277
|
+
# "run_alias",
|
278
|
+
# "run_accession",
|
279
|
+
# "run_date",
|
280
|
+
# "updated_date",
|
281
|
+
# "spots",
|
282
|
+
# "bases",
|
283
|
+
# "run_center",
|
284
|
+
# "experiment_name",
|
285
|
+
# "run_url_link",
|
286
|
+
# "run_entrez_link",
|
287
|
+
# "run_attribute",
|
288
|
+
# "experiment_ID",
|
289
|
+
# "experiment_alias",
|
290
|
+
# "experiment_accession",
|
291
|
+
# "experiment_title",
|
292
|
+
# "study_name",
|
293
|
+
# "sample_name",
|
294
|
+
# "design_description",
|
295
|
+
# "library_name",
|
296
|
+
# "library_strategy",
|
297
|
+
# "library_source",
|
298
|
+
# "library_selection",
|
299
|
+
# "library_layout",
|
300
|
+
# "library_construction_protocol",
|
301
|
+
# "adapter_spec",
|
302
|
+
# "read_spec",
|
303
|
+
# "platform",
|
304
|
+
# "instrument_model",
|
305
|
+
# "instrument_name",
|
306
|
+
# "platform_parameters",
|
307
|
+
# "sequence_space",
|
308
|
+
# "base_caller",
|
309
|
+
# "quality_scorer",
|
310
|
+
# "number_of_levels",
|
311
|
+
# "multiplier",
|
312
|
+
# "qtype",
|
313
|
+
# "experiment_url_link",
|
314
|
+
# "experiment_entrez_link",
|
315
|
+
# "experiment_attribute",
|
316
|
+
# "sample_ID",
|
317
|
+
# "sample_alias",
|
318
|
+
# "sample_accession",
|
319
|
+
# "taxon_id",
|
320
|
+
# "common_name",
|
321
|
+
# "anonymized_name",
|
322
|
+
# "individual_name",
|
323
|
+
# "description",
|
324
|
+
# "sample_url_link",
|
325
|
+
# "sample_entrez_link",
|
326
|
+
# "sample_attribute",
|
327
|
+
# "study_ID",
|
328
|
+
# "study_alias",
|
329
|
+
# "study_accession",
|
330
|
+
# "study_title",
|
331
|
+
# "study_type",
|
332
|
+
# "study_abstract",
|
333
|
+
# "center_project_name",
|
334
|
+
# "study_description",
|
335
|
+
# "study_url_link",
|
336
|
+
# "study_entrez_link",
|
337
|
+
# "study_attribute",
|
338
|
+
# "related_studies",
|
339
|
+
# "primary_study",
|
340
|
+
# "submission_ID",
|
341
|
+
# "submission_accession",
|
342
|
+
# "submission_comment",
|
343
|
+
# "submission_center",
|
344
|
+
# "submission_lab",
|
345
|
+
# "submission_date",
|
346
|
+
# "sradb_updated"]
|
347
|
+
# ActiveRecord model mapped onto the 'sra_ft' table.
# NOTE(review): presumably the full-text search counterpart of the sra
# table - confirm against the SRAmetadb schema.
class SRAFt < Connection
  self.table_name = 'sra_ft'
end
|
350
|
+
|
351
|
+
# pp Bio::SRA::Tables::SRAFtContent.column_names
|
352
|
+
# ["docid",
|
353
|
+
# "c0SRR_bamFile",
|
354
|
+
# "c1SRX_bamFile",
|
355
|
+
# "c2SRX_fastqFTP",
|
356
|
+
# "c3run_ID",
|
357
|
+
# "c4run_alias",
|
358
|
+
# "c5run_accession",
|
359
|
+
# "c6run_date",
|
360
|
+
# "c7updated_date",
|
361
|
+
# "c8spots",
|
362
|
+
# "c9bases",
|
363
|
+
# "c10run_center",
|
364
|
+
# "c11experiment_name",
|
365
|
+
# "c12run_url_link",
|
366
|
+
# "c13run_entrez_link",
|
367
|
+
# "c14run_attribute",
|
368
|
+
# "c15experiment_ID",
|
369
|
+
# "c16experiment_alias",
|
370
|
+
# "c17experiment_accession",
|
371
|
+
# "c18experiment_title",
|
372
|
+
# "c19study_name",
|
373
|
+
# "c20sample_name",
|
374
|
+
# "c21design_description",
|
375
|
+
# "c22library_name",
|
376
|
+
# "c23library_strategy",
|
377
|
+
# "c24library_source",
|
378
|
+
# "c25library_selection",
|
379
|
+
# "c26library_layout",
|
380
|
+
# "c27library_construction_protocol",
|
381
|
+
# "c28adapter_spec",
|
382
|
+
# "c29read_spec",
|
383
|
+
# "c30platform",
|
384
|
+
# "c31instrument_model",
|
385
|
+
# "c32instrument_name",
|
386
|
+
# "c33platform_parameters",
|
387
|
+
# "c34sequence_space",
|
388
|
+
# "c35base_caller",
|
389
|
+
# "c36quality_scorer",
|
390
|
+
# "c37number_of_levels",
|
391
|
+
# "c38multiplier",
|
392
|
+
# "c39qtype",
|
393
|
+
# "c40experiment_url_link",
|
394
|
+
# "c41experiment_entrez_link",
|
395
|
+
# "c42experiment_attribute",
|
396
|
+
# "c43sample_ID",
|
397
|
+
# "c44sample_alias",
|
398
|
+
# "c45sample_accession",
|
399
|
+
# "c46taxon_id",
|
400
|
+
# "c47common_name",
|
401
|
+
# "c48anonymized_name",
|
402
|
+
# "c49individual_name",
|
403
|
+
# "c50description",
|
404
|
+
# "c51sample_url_link",
|
405
|
+
# "c52sample_entrez_link",
|
406
|
+
# "c53sample_attribute",
|
407
|
+
# "c54study_ID",
|
408
|
+
# "c55study_alias",
|
409
|
+
# "c56study_accession",
|
410
|
+
# "c57study_title",
|
411
|
+
# "c58study_type",
|
412
|
+
# "c59study_abstract",
|
413
|
+
# "c60center_project_name",
|
414
|
+
# "c61study_description",
|
415
|
+
# "c62study_url_link",
|
416
|
+
# "c63study_entrez_link",
|
417
|
+
# "c64study_attribute",
|
418
|
+
# "c65related_studies",
|
419
|
+
# "c66primary_study",
|
420
|
+
# "c67submission_ID",
|
421
|
+
# "c68submission_accession",
|
422
|
+
# "c69submission_comment",
|
423
|
+
# "c70submission_center",
|
424
|
+
# "c71submission_lab",
|
425
|
+
# "c72submission_date",
|
426
|
+
# "c73sradb_updated"]
|
427
|
+
# ActiveRecord model mapped onto the 'sra_ft_content' table, whose columns
# are a docid followed by numbered copies (c0..., c1..., ...) of the sra_ft
# columns.
class SRAFtContent < Connection
  self.table_name = 'sra_ft_content'
end
|
430
|
+
|
431
|
+
# pp Bio::SRA::Tables::SRAFtSegDir.column_names
|
432
|
+
# ["level", "idx", "start_block", "leaves_end_block", "end_block", "root"]
|
433
|
+
# ActiveRecord model mapped onto the 'sra_ft_segdir' table.
class SRAFtSegDir < Connection
  self.table_name = 'sra_ft_segdir'
end
|
436
|
+
|
437
|
+
# pp Bio::SRA::Tables::SRAFtSegments.column_names
|
438
|
+
# ["blockid", "block"]
|
439
|
+
# ActiveRecord model mapped onto the 'sra_ft_segments' table.
class SRAFtSegments < Connection
  self.table_name = 'sra_ft_segments'
end
|
442
|
+
|
443
|
+
# pp Bio::SRA::Tables::MetaInfo.column_names
|
444
|
+
# ["name", "value"]
|
445
|
+
# ActiveRecord model mapped onto the 'metaInfo' table of name/value pairs.
class MetaInfo < Connection
  self.table_name = 'metaInfo'
end
|
448
|
+
|
449
|
+
# This table holds information about each of the columns
|
450
|
+
# in this SRAmetadb database
|
451
|
+
#
|
452
|
+
# pp Bio::SRA::Tables::ColDesc.column_names
|
453
|
+
# ["col_desc_ID",
|
454
|
+
# "table_name",
|
455
|
+
# "field_name",
|
456
|
+
# "type",
|
457
|
+
# "description",
|
458
|
+
# "value_list",
|
459
|
+
# "sradb_updated"]
|
460
|
+
# ActiveRecord model for the 'col_desc' table, keyed by 'col_desc_ID', which
# describes the columns of the SRAmetadb database.
class ColDesc < Connection
  self.table_name  = 'col_desc'
  self.primary_key = 'col_desc_ID'

  # The table has a plain data column called "type"; clearing the
  # inheritance column stops ActiveRecord from using it for single table
  # inheritance.
  self.inheritance_column = nil
end
|
465
|
+
end
|
466
|
+
end
|
467
|
+
end
|