miga-base 0.3.3.1 → 0.3.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/about.rb +1 -1
- data/actions/ls.rb +5 -2
- data/actions/ncbi_get.rb +19 -9
- data/actions/stats.rb +18 -10
- data/lib/miga.rb +5 -5
- data/lib/miga/common/format.rb +10 -8
- data/lib/miga/dataset/result.rb +32 -38
- data/lib/miga/remote_dataset.rb +104 -113
- data/lib/miga/remote_dataset/base.rb +23 -17
- data/lib/miga/remote_dataset/download.rb +84 -0
- data/lib/miga/result/dates.rb +13 -11
- data/lib/miga/version.rb +1 -1
- data/scripts/assembly.bash +1 -1
- data/scripts/read_quality.bash +1 -0
- data/scripts/trimmed_fasta.bash +3 -0
- data/scripts/trimmed_reads.bash +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b7140af2e2cb8525f5231ea065c79cb362348a9f
|
4
|
+
data.tar.gz: 257de78dbf14f7e01f4c239e7311883f98ccab21
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a08c21c03a045d369078614dc41223db249b4105d20ece5198e4ef126dab468f7490278f50d7101652bba5433e4eee4653ea0fd67e85de55a78881d842d25ac
|
7
|
+
data.tar.gz: 62b07a2e62dbf801f5afbe87ac6bbb7c5a004ccc52153c640daf675e5e50e211fd568b7496d884a67759006d47e4ffd264dc280d28e299fb8351ba5ba14e5a6f
|
data/actions/about.rb
CHANGED
@@ -27,7 +27,7 @@ raise "Impossible to load project: #{o[:project]}" if p.nil?
|
|
27
27
|
|
28
28
|
if not o[:datum].nil?
|
29
29
|
v = p.metadata[ o[:datum] ]
|
30
|
-
puts v.nil? ?
|
30
|
+
puts v.nil? ? '?' : v
|
31
31
|
elsif o[:processing]
|
32
32
|
keys = MiGA::Project.DISTANCE_TASKS + MiGA::Project.INCLADE_TASKS
|
33
33
|
puts MiGA::MiGA.tabulate([:task, :status], keys.map do |k|
|
data/actions/ls.rb
CHANGED
@@ -13,7 +13,7 @@ OptionParser.new do |opt|
|
|
13
13
|
opt.on("-p", "--processing",
|
14
14
|
"Print information on processing advance."){ |v| o[:processing]=v }
|
15
15
|
opt.on("-m", "--metadata STRING",
|
16
|
-
"Print name and metadata field only. If set, ignores -i."
|
16
|
+
"Print name and metadata field only. If set, ignores -i and assumes --tab."
|
17
17
|
){ |v| o[:datum]=v }
|
18
18
|
opt.on("--tab",
|
19
19
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
@@ -43,7 +43,10 @@ ds = filter_datasets!(ds, o)
|
|
43
43
|
exit(1) if o[:silent] and ds.empty?
|
44
44
|
|
45
45
|
if not o[:datum].nil?
|
46
|
-
ds.each
|
46
|
+
ds.each do |d|
|
47
|
+
v = d.metadata[ o[:datum] ]
|
48
|
+
puts "#{d.name}\t#{v.nil? ? '?' : v}"
|
49
|
+
end
|
47
50
|
elsif o[:info]
|
48
51
|
puts MiGA::MiGA.tabulate(
|
49
52
|
MiGA::Dataset.INFO_FIELDS, ds.map{ |d| d.info }, o[:tabular])
|
data/actions/ncbi_get.rb
CHANGED
@@ -8,7 +8,8 @@ require 'miga/remote_dataset'
|
|
8
8
|
o = {q:true, query:false, unlink:false,
|
9
9
|
reference: false, ignore_plasmids: false,
|
10
10
|
complete: false, chromosome: false,
|
11
|
-
scaffold: false, contig: false, add_version: true, dry: false
|
11
|
+
scaffold: false, contig: false, add_version: true, dry: false,
|
12
|
+
get_md: false}
|
12
13
|
OptionParser.new do |opt|
|
13
14
|
opt_banner(opt)
|
14
15
|
opt_object(opt, o, [:project])
|
@@ -37,6 +38,9 @@ OptionParser.new do |opt|
|
|
37
38
|
opt.on('--blacklist PATH',
|
38
39
|
'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
|
39
40
|
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
41
|
+
opt.on('--get-metadata',
|
42
|
+
'Only download and update metadata for existing datasets'
|
43
|
+
){ |v| o[:get_md] = v }
|
40
44
|
opt.on('-q', '--query',
|
41
45
|
'Register the datasets as queries, not reference datasets.'
|
42
46
|
){ |v| o[:query]=v }
|
@@ -131,8 +135,7 @@ if o[:scaffold] or o[:contig]
|
|
131
135
|
map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
|
132
136
|
next if ids.empty?
|
133
137
|
n = "#{r[0]}_#{asm}".miga_name
|
134
|
-
|
135
|
-
ds[n] = {ids: ids, md: {type: :genome, comments: comm},
|
138
|
+
ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
|
136
139
|
db: :assembly_gz, universe: :web}
|
137
140
|
end
|
138
141
|
end
|
@@ -144,23 +147,30 @@ unless o[:blacklist].nil?
|
|
144
147
|
end
|
145
148
|
|
146
149
|
# Download entries
|
147
|
-
$stderr.puts "Downloading #{ds.size}
|
150
|
+
$stderr.puts "Downloading #{ds.size} " +
|
151
|
+
(ds.size > 1 ? "entries" : "entry") unless o[:q]
|
148
152
|
ds.each do |name,body|
|
149
153
|
d << name
|
150
154
|
puts name
|
151
|
-
next
|
155
|
+
next if p.dataset(name).nil? == o[:get_md]
|
152
156
|
downloaded += 1
|
153
157
|
next if o[:dry]
|
154
158
|
$stderr.puts ' Locating remote dataset.' unless o[:q]
|
155
159
|
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
156
|
-
|
157
|
-
|
158
|
-
|
160
|
+
if o[:get_md]
|
161
|
+
$stderr.puts ' Updating dataset.' unless o[:q]
|
162
|
+
rd.update_metadata(p.dataset(name), body[:md])
|
163
|
+
else
|
164
|
+
$stderr.puts ' Creating dataset.' unless o[:q]
|
165
|
+
rd.save_to(p, name, !o[:query], body[:md])
|
166
|
+
p.add_dataset(name)
|
167
|
+
end
|
159
168
|
end
|
160
169
|
|
161
170
|
# Finalize
|
162
171
|
$stderr.puts "Datasets listed: #{d.size}" unless o[:q]
|
163
|
-
$stderr.puts "Datasets #{
|
172
|
+
$stderr.puts "Datasets #{o[:dry] ? 'to download' : 'downloaded'}: " +
|
173
|
+
downloaded.to_s unless o[:q]
|
164
174
|
unless o[:remote_list].nil?
|
165
175
|
File.open(o[:remote_list], 'w') do |fh|
|
166
176
|
d.each { |i| fh.puts i }
|
data/actions/stats.rb
CHANGED
@@ -45,14 +45,16 @@ if o[:compute]
|
|
45
45
|
when :raw_reads
|
46
46
|
if r[:files][:pair1].nil?
|
47
47
|
s = MiGA::MiGA.seqs_length(r.file_path(:single), :fastq, gc: true)
|
48
|
-
stats = {
|
48
|
+
stats = {
|
49
|
+
reads: s[:n],
|
49
50
|
length_average: [s[:avg], "bp"],
|
50
51
|
length_standard_deviation: [s[:sd], "bp"],
|
51
52
|
g_c_content: [s[:gc], "%"]}
|
52
53
|
else
|
53
54
|
s1 = MiGA::MiGA.seqs_length(r.file_path(:pair1), :fastq, gc: true)
|
54
55
|
s2 = MiGA::MiGA.seqs_length(r.file_path(:pair2), :fastq, gc: true)
|
55
|
-
stats = {
|
56
|
+
stats = {
|
57
|
+
read_pairs: s1[:n],
|
56
58
|
forward_length_average: [s1[:avg], "bp"],
|
57
59
|
forward_length_standard_deviation: [s1[:sd], "bp"],
|
58
60
|
forward_g_c_content: [s1[:gc], "%"],
|
@@ -63,22 +65,28 @@ if o[:compute]
|
|
63
65
|
when :trimmed_fasta
|
64
66
|
f = r[:files][:coupled].nil? ? r.file_path(:single) : r.file_path(:coupled)
|
65
67
|
s = MiGA::MiGA.seqs_length(f, :fasta, gc: true)
|
66
|
-
stats = {
|
68
|
+
stats = {
|
69
|
+
reads: s[:n],
|
67
70
|
length_average: [s[:avg], "bp"],
|
68
71
|
length_standard_deviation: [s[:sd], "bp"],
|
69
72
|
g_c_content: [s[:gc], "%"]}
|
70
73
|
when :assembly
|
71
74
|
s = MiGA::MiGA.seqs_length(r.file_path(:largecontigs), :fasta,
|
72
|
-
n50:true, gc:true)
|
73
|
-
stats = {
|
74
|
-
|
75
|
+
n50: true, gc: true)
|
76
|
+
stats = {
|
77
|
+
contigs: s[:n],
|
78
|
+
n50: [s[:n50], "bp"],
|
79
|
+
total_length: [s[:tot], "bp"],
|
80
|
+
g_c_content: [s[:gc], "%"]}
|
75
81
|
when :cds
|
76
82
|
s = MiGA::MiGA.seqs_length(r.file_path(:proteins), :fasta)
|
77
|
-
stats = {
|
83
|
+
stats = {
|
84
|
+
predicted_proteins: s[:n],
|
85
|
+
average_length: [s[:avg], "aa"]}
|
78
86
|
asm = d.add_result(:assembly, false)
|
79
87
|
unless asm.nil? or asm[:stats][:total_length].nil?
|
80
88
|
stats[:coding_density] =
|
81
|
-
[300.0*s[:tot]/asm[:stats][:total_length][0], "%"]
|
89
|
+
[300.0 * s[:tot] / asm[:stats][:total_length][0], "%"]
|
82
90
|
end
|
83
91
|
when :essential_genes
|
84
92
|
if d.is_multi?
|
@@ -102,7 +110,7 @@ if o[:compute]
|
|
102
110
|
r.add_file(:report, "#{d.name}.ess/log.archaea")
|
103
111
|
end
|
104
112
|
# Extract/compute quality values
|
105
|
-
stats = {completeness:[0.0,"%"], contamination:[0.0,"%"]}
|
113
|
+
stats = {completeness: [0.0,"%"], contamination: [0.0,"%"]}
|
106
114
|
File.open(r.file_path(:report), "r") do |fh|
|
107
115
|
fh.each_line do |ln|
|
108
116
|
if /^! (Completeness|Contamination): (.*)%/.match(ln)
|
@@ -110,7 +118,7 @@ if o[:compute]
|
|
110
118
|
end
|
111
119
|
end
|
112
120
|
end
|
113
|
-
stats[:quality] = stats[:completeness][0] - stats[:contamination][0]*5
|
121
|
+
stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
|
114
122
|
d.metadata[:quality] = case stats[:quality]
|
115
123
|
when 80..100 ; :excellent
|
116
124
|
when 50..80 ; :high
|
data/lib/miga.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
4
|
+
require 'json'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'miga/common'
|
7
|
+
require 'miga/project'
|
8
|
+
require 'miga/taxonomy'
|
data/lib/miga/common/format.rb
CHANGED
@@ -2,18 +2,20 @@
|
|
2
2
|
require 'tempfile'
|
3
3
|
require 'zlib'
|
4
4
|
|
5
|
+
##
|
6
|
+
# General formatting functions shared throughout MiGA.
|
5
7
|
module MiGA::Common::Format
|
6
8
|
##
|
7
9
|
# Tabulates an +values+, and Array of Arrays, all with the same number of
|
8
10
|
# entries as +header+. Returns an Array of String, one per line.
|
9
|
-
def tabulate(header, values, tabular=false)
|
11
|
+
def tabulate(header, values, tabular = false)
|
10
12
|
fields = [header.map(&:to_s)]
|
11
13
|
fields << fields.first.map { |h| h.gsub(/\S/, '-') } unless tabular
|
12
14
|
fields += values.map { |r| r.map { |cell| cell.nil? ? '?' : cell.to_s } }
|
13
15
|
clen = tabular ? Array.new(header.size, 0) :
|
14
16
|
fields.map { |r| r.map(&:length) }.transpose.map(&:max)
|
15
17
|
fields.map do |r|
|
16
|
-
(0
|
18
|
+
(0..(clen.size - 1)).map do |col_n|
|
17
19
|
col_n == 0 ? r[col_n].rjust(clen[col_n]) : r[col_n].ljust(clen[col_n])
|
18
20
|
end.join(tabular ? "\t" : ' ')
|
19
21
|
end
|
@@ -37,7 +39,7 @@ module MiGA::Common::Format
|
|
37
39
|
fh.each_line do |ln|
|
38
40
|
ln.chomp!
|
39
41
|
if ln =~ /^>\s*(\S+)(.*)/
|
40
|
-
|
42
|
+
id, df = $1, $2
|
41
43
|
tmp_fh.print buffer.wrap_width(80)
|
42
44
|
buffer = ''
|
43
45
|
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}"
|
@@ -66,16 +68,17 @@ module MiGA::Common::Format
|
|
66
68
|
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
67
69
|
# - +gc+: If true, it also returns the G+C content (in %).
|
68
70
|
def seqs_length(file, format, opts = {})
|
69
|
-
fh =
|
71
|
+
fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
70
72
|
l = []
|
71
73
|
gc = 0
|
72
|
-
i = 0 # <- Zlib::GzipReader doesn't set
|
74
|
+
i = 0 # <- Zlib::GzipReader doesn't set `$.`
|
73
75
|
fh.each_line do |ln|
|
74
76
|
i += 1
|
75
|
-
if (format == :fasta and ln =~ /^>/) or
|
77
|
+
if (format == :fasta and ln =~ /^>/) or
|
78
|
+
(format == :fastq and (i % 4) == 1)
|
76
79
|
l << 0
|
77
80
|
elsif format == :fasta or (i % 4) == 2
|
78
|
-
l[l.size-1] += ln.chomp.size
|
81
|
+
l[l.size - 1] += ln.chomp.size
|
79
82
|
gc += ln.scan(/[GCgc]/).count if opts[:gc]
|
80
83
|
end
|
81
84
|
end
|
@@ -131,4 +134,3 @@ class String
|
|
131
134
|
gsub(/([^\n\r]{1,#{width}})/, "\\1\n")
|
132
135
|
end
|
133
136
|
end
|
134
|
-
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -92,7 +92,7 @@ module MiGA::Dataset::Result
|
|
92
92
|
|
93
93
|
##
|
94
94
|
# Are all the dataset-specific tasks done? Passes +save+ to #add_result.
|
95
|
-
def done_preprocessing?(save=false)
|
95
|
+
def done_preprocessing?(save = false)
|
96
96
|
!first_preprocessing(save).nil? and next_preprocessing(save).nil?
|
97
97
|
end
|
98
98
|
|
@@ -103,7 +103,7 @@ module MiGA::Dataset::Result
|
|
103
103
|
# - 1 for a registered result (a completed task).
|
104
104
|
# - 2 for a queued result (a task yet to be executed).
|
105
105
|
# It passes +save+ to #add_result
|
106
|
-
def profile_advance(save=false)
|
106
|
+
def profile_advance(save = false)
|
107
107
|
first_task = first_preprocessing(save)
|
108
108
|
return Array.new(@@PREPROCESSING_TASKS.size, 0) if first_task.nil?
|
109
109
|
adv = []
|
@@ -141,12 +141,12 @@ module MiGA::Dataset::Result
|
|
141
141
|
##
|
142
142
|
# Add result type +:raw_reads+ at +base+ (no +_opts+ supported).
|
143
143
|
def add_result_raw_reads(base, _opts)
|
144
|
-
return nil unless result_files_exist?(base,
|
144
|
+
return nil unless result_files_exist?(base, '.1.fastq')
|
145
145
|
r = MiGA::Result.new("#{base}.json")
|
146
146
|
add_files_to_ds_result(r, name,
|
147
|
-
( result_files_exist?(base,
|
148
|
-
{pair1:
|
149
|
-
{single:
|
147
|
+
( result_files_exist?(base, '.2.fastq') ?
|
148
|
+
{pair1: '.1.fastq', pair2: '.2.fastq'} :
|
149
|
+
{single: '.1.fastq'} ))
|
150
150
|
end
|
151
151
|
|
152
152
|
##
|
@@ -156,13 +156,12 @@ module MiGA::Dataset::Result
|
|
156
156
|
r = MiGA::Result.new("#{base}.json")
|
157
157
|
if result_files_exist?(base, ".2.clipped.fastq")
|
158
158
|
r = add_files_to_ds_result(r, name,
|
159
|
-
pair1:".1.clipped.fastq", pair2:".2.clipped.fastq",
|
160
|
-
single:".1.clipped.single.fastq")
|
159
|
+
pair1: ".1.clipped.fastq", pair2: ".2.clipped.fastq",
|
160
|
+
single: ".1.clipped.single.fastq")
|
161
161
|
else
|
162
|
-
r = add_files_to_ds_result(r, name, single:".1.clipped.fastq")
|
162
|
+
r = add_files_to_ds_result(r, name, single: ".1.clipped.fastq")
|
163
163
|
end
|
164
164
|
r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
|
165
|
-
add_result(:raw_reads) #-> Post gunzip
|
166
165
|
r
|
167
166
|
end
|
168
167
|
|
@@ -171,10 +170,8 @@ module MiGA::Dataset::Result
|
|
171
170
|
def add_result_read_quality(base, _opts)
|
172
171
|
return nil unless result_files_exist?(base, %w[.solexaqa .fastqc])
|
173
172
|
r = MiGA::Result.new("#{base}.json")
|
174
|
-
|
175
|
-
solexaqa:".solexaqa", fastqc:".fastqc")
|
176
|
-
add_result(:trimmed_reads) #-> Post cleaning
|
177
|
-
r
|
173
|
+
add_files_to_ds_result(r, name,
|
174
|
+
solexaqa: ".solexaqa", fastqc: ".fastqc")
|
178
175
|
end
|
179
176
|
|
180
177
|
##
|
@@ -185,10 +182,8 @@ module MiGA::Dataset::Result
|
|
185
182
|
result_files_exist?(base, ".SingleReads.fa") or
|
186
183
|
result_files_exist?(base, %w[.1.fasta .2.fasta])
|
187
184
|
r = MiGA::Result.new("#{base}.json")
|
188
|
-
|
189
|
-
single:".SingleReads.fa", pair1:".1.fasta", pair2:".2.fasta")
|
190
|
-
add_result(:raw_reads) #-> Post gzip
|
191
|
-
r
|
185
|
+
add_files_to_ds_result(r, name, coupled: ".CoupledReads.fa",
|
186
|
+
single: ".SingleReads.fa", pair1: ".1.fasta", pair2: ".2.fasta")
|
192
187
|
end
|
193
188
|
|
194
189
|
##
|
@@ -197,15 +192,14 @@ module MiGA::Dataset::Result
|
|
197
192
|
def add_result_assembly(base, opts)
|
198
193
|
return nil unless result_files_exist?(base, ".LargeContigs.fna")
|
199
194
|
r = MiGA::Result.new("#{base}.json")
|
200
|
-
r = add_files_to_ds_result(r, name, largecontigs:".LargeContigs.fna",
|
201
|
-
allcontigs:".AllContigs.fna", assembly_data:
|
195
|
+
r = add_files_to_ds_result(r, name, largecontigs: ".LargeContigs.fna",
|
196
|
+
allcontigs: ".AllContigs.fna", assembly_data: '')
|
202
197
|
opts[:is_clean] ||= false
|
203
198
|
r.clean! if opts[:is_clean]
|
204
199
|
unless r.clean?
|
205
200
|
MiGA::MiGA.clean_fasta_file(r.file_path :largecontigs)
|
206
201
|
r.clean!
|
207
202
|
end
|
208
|
-
add_result(:trimmed_fasta) #-> Post interposing
|
209
203
|
r
|
210
204
|
end
|
211
205
|
|
@@ -214,8 +208,8 @@ module MiGA::Dataset::Result
|
|
214
208
|
def add_result_cds(base, opts)
|
215
209
|
return nil unless result_files_exist?(base, %w[.faa])
|
216
210
|
r = MiGA::Result.new("#{base}.json")
|
217
|
-
r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
|
218
|
-
gff2:".gff2", gff3:".gff3", tab:".tab")
|
211
|
+
r = add_files_to_ds_result(r, name, proteins: ".faa", genes: ".fna",
|
212
|
+
gff2: ".gff2", gff3: ".gff3", tab: ".tab")
|
219
213
|
opts[:is_clean] ||= false
|
220
214
|
r.clean! if opts[:is_clean]
|
221
215
|
unless r.clean?
|
@@ -231,8 +225,8 @@ module MiGA::Dataset::Result
|
|
231
225
|
def add_result_essential_genes(base, _opts)
|
232
226
|
return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
|
233
227
|
r = MiGA::Result.new("#{base}.json")
|
234
|
-
add_files_to_ds_result(r, name, ess_genes:".ess.faa",
|
235
|
-
collection:".ess", report:".ess/log")
|
228
|
+
add_files_to_ds_result(r, name, ess_genes: ".ess.faa",
|
229
|
+
collection: ".ess", report: ".ess/log")
|
236
230
|
end
|
237
231
|
|
238
232
|
##
|
@@ -241,8 +235,8 @@ module MiGA::Dataset::Result
|
|
241
235
|
return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
|
242
236
|
return nil unless result_files_exist?(base, ".ssu.fa")
|
243
237
|
r = MiGA::Result.new("#{base}.json")
|
244
|
-
r = add_files_to_ds_result(r, name, longest_ssu_gene:".ssu.fa",
|
245
|
-
gff:".ssu.gff", all_ssu_genes:".ssu.all.fa")
|
238
|
+
r = add_files_to_ds_result(r, name, longest_ssu_gene: ".ssu.fa",
|
239
|
+
gff: ".ssu.gff", all_ssu_genes: ".ssu.all.fa")
|
246
240
|
opts[:is_clean] ||= false
|
247
241
|
r.clean! if opts[:is_clean]
|
248
242
|
unless r.clean?
|
@@ -259,11 +253,11 @@ module MiGA::Dataset::Result
|
|
259
253
|
return nil unless result_files_exist?(base, ".mytaxa") or
|
260
254
|
result_files_exist?(base, ".nomytaxa.txt")
|
261
255
|
r = MiGA::Result.new("#{base}.json")
|
262
|
-
add_files_to_ds_result(r, name, mytaxa:".mytaxa", blast:".blast",
|
263
|
-
mytaxain:".mytaxain", nomytaxa:".nomytaxa.txt",
|
264
|
-
species:".mytaxa.Species.txt", genus:".mytaxa.Genus.txt",
|
265
|
-
phylum:".mytaxa.Phylum.txt", innominate:".mytaxa.innominate",
|
266
|
-
kronain:".mytaxa.krona", krona:".html")
|
256
|
+
add_files_to_ds_result(r, name, mytaxa: ".mytaxa", blast: ".blast",
|
257
|
+
mytaxain: ".mytaxain", nomytaxa: ".nomytaxa.txt",
|
258
|
+
species: ".mytaxa.Species.txt", genus: ".mytaxa.Genus.txt",
|
259
|
+
phylum: ".mytaxa.Phylum.txt", innominate: ".mytaxa.innominate",
|
260
|
+
kronain: ".mytaxa.krona", krona: ".html")
|
267
261
|
else
|
268
262
|
MiGA::Result.new("#{base}.json")
|
269
263
|
end
|
@@ -327,8 +321,8 @@ module MiGA::Dataset::Result
|
|
327
321
|
return nil unless
|
328
322
|
File.exist?("#{pref}/01.haai/#{name}.db")
|
329
323
|
r = MiGA::Result.new("#{base}.json")
|
330
|
-
r.add_files(haai_db:"01.haai/#{name}.db", aai_db:"02.aai/#{name}.db",
|
331
|
-
ani_db:"03.ani/#{name}.db")
|
324
|
+
r.add_files(haai_db: "01.haai/#{name}.db", aai_db: "02.aai/#{name}.db",
|
325
|
+
ani_db: "03.ani/#{name}.db")
|
332
326
|
r
|
333
327
|
end
|
334
328
|
|
@@ -339,10 +333,10 @@ module MiGA::Dataset::Result
|
|
339
333
|
result_files_exist?(base, %w[.aai-medoids.tsv .aai.db]) or
|
340
334
|
result_files_exist?(base, %w[.ani-medoids.tsv .ani.db])
|
341
335
|
r = MiGA::Result.new("#{base}.json")
|
342
|
-
add_files_to_ds_result(r, name, aai_medoids:".aai-medoids.tsv",
|
343
|
-
haai_db:".haai.db", aai_db:".aai.db", ani_medoids:".ani-medoids.tsv",
|
344
|
-
ani_db:".ani.db", ref_tree:".nwk", ref_tree_pdf:".nwk.pdf",
|
345
|
-
intax_test:".intax.txt")
|
336
|
+
add_files_to_ds_result(r, name, aai_medoids: ".aai-medoids.tsv",
|
337
|
+
haai_db: ".haai.db", aai_db: ".aai.db", ani_medoids: ".ani-medoids.tsv",
|
338
|
+
ani_db: ".ani.db", ref_tree: ".nwk", ref_tree_pdf: ".nwk.pdf",
|
339
|
+
intax_test: ".intax.txt")
|
346
340
|
end
|
347
341
|
|
348
342
|
##
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -1,68 +1,12 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require 'miga/remote_dataset/
|
4
|
+
require 'miga/remote_dataset/download'
|
5
5
|
|
6
6
|
##
|
7
7
|
# MiGA representation of datasets with data in remote locations.
|
8
8
|
class MiGA::RemoteDataset < MiGA::MiGA
|
9
|
-
|
10
|
-
include MiGA::RemoteDataset::Base
|
11
|
-
|
12
|
-
# Class-level
|
13
|
-
|
14
|
-
##
|
15
|
-
# Download data from the +universe+ in the database +db+ with IDs +ids+ and
|
16
|
-
# in +format+. If passed, it saves the result in +file+. Returns String.
|
17
|
-
def self.download(universe, db, ids, format, file=nil)
|
18
|
-
ids = [ids] unless ids.is_a? Array
|
19
|
-
case @@UNIVERSE[universe][:method]
|
20
|
-
when :rest
|
21
|
-
doc = download_rest(universe, db, ids, format)
|
22
|
-
when :net
|
23
|
-
doc = download_net(universe, db, ids, format)
|
24
|
-
end
|
25
|
-
unless file.nil?
|
26
|
-
ofh = File.open(file, "w")
|
27
|
-
ofh.print doc
|
28
|
-
ofh.close
|
29
|
-
end
|
30
|
-
doc
|
31
|
-
end
|
32
|
-
|
33
|
-
##
|
34
|
-
# Download data using a REST method from the +universe+ in the database +db+
|
35
|
-
# with IDs +ids+ and in +format+. Returns the doc as String.
|
36
|
-
def self.download_rest(universe, db, ids, format)
|
37
|
-
u = @@UNIVERSE[universe]
|
38
|
-
map_to = u[:dbs][db].nil? ? nil : u[:dbs][db][:map_to]
|
39
|
-
url = sprintf(u[:url], db, ids.join(","), format, map_to)
|
40
|
-
response = RestClient::Request.execute(method: :get, url:url, timeout:600)
|
41
|
-
unless response.code == 200
|
42
|
-
raise "Unable to reach #{universe} client, error code #{response.code}."
|
43
|
-
end
|
44
|
-
response.to_s
|
45
|
-
end
|
46
|
-
|
47
|
-
##
|
48
|
-
# Download data using a GET request from the +universe+ in the database +db+
|
49
|
-
# with IDs +ids+ and in +format+. Returns the doc as String.
|
50
|
-
def self.download_net(universe, db, ids, format)
|
51
|
-
u = @@UNIVERSE[universe]
|
52
|
-
map_to = u[:dbs][db].nil? ? nil : u[:dbs][db][:map_to]
|
53
|
-
url = sprintf(u[:url], db, ids.join(","), format, map_to)
|
54
|
-
doc = ""
|
55
|
-
@timeout_try = 0
|
56
|
-
begin
|
57
|
-
open(url) { |f| doc = f.read }
|
58
|
-
rescue Net::ReadTimeout
|
59
|
-
@timeout_try += 1
|
60
|
-
if @timeout_try > 3 ; raise Net::ReadTimeout
|
61
|
-
else ; retry
|
62
|
-
end
|
63
|
-
end
|
64
|
-
doc
|
65
|
-
end
|
9
|
+
include MiGA::RemoteDataset::Download
|
66
10
|
|
67
11
|
# Instance-level
|
68
12
|
|
@@ -71,7 +15,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
71
15
|
attr_reader :universe
|
72
16
|
# Database storing the dataset.
|
73
17
|
attr_reader :db
|
74
|
-
# IDs of the entries composing the dataset.
|
18
|
+
# Array of IDs of the entries composing the dataset.
|
75
19
|
attr_reader :ids
|
76
20
|
|
77
21
|
##
|
@@ -81,92 +25,91 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
81
25
|
@ids = (ids.is_a?(Array) ? ids : [ids])
|
82
26
|
@db = db.to_sym
|
83
27
|
@universe = universe.to_sym
|
84
|
-
|
85
|
-
"#{
|
86
|
-
|
87
|
-
"#{@@UNIVERSE[@universe][:dbs]}"
|
88
|
-
|
89
|
-
#
|
90
|
-
#
|
91
|
-
#
|
92
|
-
#end
|
28
|
+
@@UNIVERSE.keys.include?(@universe) or
|
29
|
+
raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
|
30
|
+
@@UNIVERSE[@universe][:dbs].include?(@db) or
|
31
|
+
raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
|
32
|
+
# FIXME: Part of the +map_to+ support:
|
33
|
+
# unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
|
34
|
+
# MiGA::RemoteDataset.download
|
35
|
+
# end
|
93
36
|
end
|
94
37
|
|
95
38
|
##
|
96
39
|
# Save dataset to the MiGA::Project +project+ identified with +name+. +is_ref+
|
97
40
|
# indicates if it should be a reference dataset, and contains +metadata+.
|
98
|
-
def save_to(project, name=nil, is_ref=true, metadata={})
|
99
|
-
name ||= ids.join(
|
41
|
+
def save_to(project, name = nil, is_ref = true, metadata = {})
|
42
|
+
name ||= ids.join('_').miga_name
|
100
43
|
project = MiGA::Project.new(project) if project.is_a? String
|
101
|
-
|
44
|
+
MiGA::Dataset.exist?(project, name) and
|
102
45
|
raise "Dataset #{name} exists in the project, aborting..."
|
103
|
-
end
|
104
46
|
metadata = get_metadata(metadata)
|
105
47
|
udb = @@UNIVERSE[universe][:dbs][db]
|
106
|
-
metadata["#{universe}_#{db}"] = ids.join(
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
base = "#{project.path}/data/#{dir}/#{name}"
|
111
|
-
l_ctg = "#{base}.LargeContigs.fna"
|
112
|
-
a_ctg = "#{base}.AllContigs.fna"
|
113
|
-
File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
|
114
|
-
if udb[:format] == :fasta_gz
|
115
|
-
download "#{l_ctg}.gz"
|
116
|
-
system "gzip -d '#{l_ctg}.gz'"
|
117
|
-
else
|
118
|
-
download l_ctg
|
119
|
-
end
|
120
|
-
File.unlink(a_ctg) if File.exist? a_ctg
|
121
|
-
File.symlink(File.basename(l_ctg), a_ctg)
|
122
|
-
File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
|
123
|
-
else
|
124
|
-
raise "Unexpected error: Unsupported result for database #{db}."
|
125
|
-
end
|
48
|
+
metadata["#{universe}_#{db}"] = ids.join(',')
|
49
|
+
respond_to?("save_#{udb[:stage]}_to", true) or
|
50
|
+
raise "Unexpected error: Unsupported stage #{udb[:stage]} for #{db}."
|
51
|
+
send "save_#{udb[:stage]}_to", project, name, udb
|
126
52
|
dataset = MiGA::Dataset.new(project, name, is_ref, metadata)
|
127
53
|
project.add_dataset(dataset.name)
|
128
|
-
result = dataset.add_result(udb[:stage], true, is_clean:true)
|
129
|
-
|
130
|
-
|
54
|
+
result = dataset.add_result(udb[:stage], true, is_clean: true)
|
55
|
+
result.nil? and
|
56
|
+
raise 'Empty dataset: seed result not added due to incomplete files.'
|
131
57
|
result.clean!
|
132
58
|
result.save
|
133
59
|
dataset
|
134
60
|
end
|
135
61
|
|
62
|
+
##
|
63
|
+
# Updates the MiGA::Dataset +dataset+ with the remotely available metadata,
|
64
|
+
# and optionally the Hash +metadata+.
|
65
|
+
def update_metadata(dataset, metadata = {})
|
66
|
+
metadata = get_metadata(metadata)
|
67
|
+
metadata.each { |k,v| dataset.metadata[k] = v }
|
68
|
+
dataset.save
|
69
|
+
end
|
70
|
+
|
136
71
|
##
|
137
72
|
# Get metadata from the remote location.
|
138
|
-
def get_metadata(metadata={})
|
73
|
+
def get_metadata(metadata = {})
|
139
74
|
case universe
|
140
75
|
when :ebi, :ncbi
|
141
76
|
# Get taxonomy
|
142
77
|
metadata[:tax] = get_ncbi_taxonomy
|
143
78
|
end
|
79
|
+
metadata[:"#{universe}_#{db}"] = ids.join(",")
|
80
|
+
metadata = get_type_status(metadata)
|
144
81
|
metadata
|
145
82
|
end
|
146
83
|
|
147
|
-
##
|
148
|
-
# Download data into +file+.
|
149
|
-
def download(file)
|
150
|
-
MiGA::RemoteDataset.download(universe, db, ids,
|
151
|
-
@@UNIVERSE[universe][:dbs][db][:format], file)
|
152
|
-
end
|
153
|
-
|
154
84
|
##
|
155
85
|
# Get NCBI Taxonomy ID.
|
156
86
|
def get_ncbi_taxid
|
157
87
|
send("get_ncbi_taxid_from_#{universe}")
|
158
88
|
end
|
159
89
|
|
90
|
+
##
|
91
|
+
# Get the type material status and return an (updated)
|
92
|
+
# +metadata+ hash.
|
93
|
+
def get_type_status(metadata)
|
94
|
+
if metadata[:ncbi_asm]
|
95
|
+
get_type_status_ncbi_asm metadata
|
96
|
+
elsif metadata[:ncbi_nuccore]
|
97
|
+
get_type_status_ncbi_nuccore metadata
|
98
|
+
else
|
99
|
+
metadata
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
160
103
|
##
|
161
104
|
# Get NCBI taxonomy as MiGA::Taxonomy.
|
162
105
|
def get_ncbi_taxonomy
|
163
106
|
lineage = {}
|
164
107
|
tax_id = get_ncbi_taxid
|
165
|
-
|
166
|
-
doc = MiGA::RemoteDataset.download(:ebi, :taxonomy, tax_id,
|
108
|
+
until [nil, '0', '1'].include? tax_id
|
109
|
+
doc = MiGA::RemoteDataset.download(:ebi, :taxonomy, tax_id, '')
|
167
110
|
name = doc.scan(/SCIENTIFIC NAME\s+:\s+(.+)/).first.to_a.first
|
168
111
|
rank = doc.scan(/RANK\s+:\s+(.+)/).first.to_a.first
|
169
|
-
rank =
|
112
|
+
rank = 'dataset' if lineage.empty? and rank == 'no rank'
|
170
113
|
lineage[rank] = name unless rank.nil?
|
171
114
|
tax_id = doc.scan(/PARENT ID\s+:\s+(.+)/).first.to_a.first
|
172
115
|
end
|
@@ -174,24 +117,72 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
174
117
|
end
|
175
118
|
|
176
119
|
private
|
177
|
-
|
120
|
+
|
178
121
|
def get_ncbi_taxid_from_ncbi
|
179
|
-
doc =
|
180
|
-
ln = doc.grep(
|
122
|
+
doc = self.class.download(universe, db, ids, :gb).split(/\n/)
|
123
|
+
ln = doc.grep(%r{^\s+/db_xref="taxon:}).first
|
181
124
|
return nil if ln.nil?
|
182
|
-
ln.sub!(/.*(?:"taxon:)(\d+)["; ].*/,
|
125
|
+
ln.sub!(/.*(?:"taxon:)(\d+)["; ].*/, '\\1')
|
183
126
|
return nil unless ln =~ /^\d+$/
|
184
127
|
ln
|
185
128
|
end
|
186
129
|
|
187
130
|
def get_ncbi_taxid_from_ebi
|
188
|
-
doc =
|
189
|
-
ln = doc.grep(
|
131
|
+
doc = self.class.download(universe, db, ids, :annot).split(/\n/)
|
132
|
+
ln = doc.grep(%r{^FT\s+/db_xref="taxon:}).first
|
190
133
|
ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
|
191
134
|
return nil if ln.nil?
|
192
|
-
ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/,
|
135
|
+
ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, '\\1')
|
193
136
|
return nil unless ln =~ /^\d+$/
|
194
137
|
ln
|
195
138
|
end
|
196
139
|
|
140
|
+
def get_type_status_ncbi_nuccore(metadata)
|
141
|
+
return metadata if metadata[:ncbi_nuccore].nil?
|
142
|
+
biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
|
143
|
+
:nuccore, :biosample)
|
144
|
+
return metadata if biosample.nil?
|
145
|
+
asm = self.class.ncbi_map(biosample,
|
146
|
+
:biosample, :assembly)
|
147
|
+
metadata[:ncbi_asm] = asm.to_s unless asm.nil?
|
148
|
+
get_type_status_ncbi_asm metadata
|
149
|
+
end
|
150
|
+
|
151
|
+
def get_type_status_ncbi_asm(metadata)
|
152
|
+
return metadata if metadata[:ncbi_asm].nil?
|
153
|
+
doc = CGI.unescapeHTML(self.class.download(:web, :text,
|
154
|
+
"https://www.ncbi.nlm.nih.gov/assembly/" \
|
155
|
+
"#{metadata[:ncbi_asm]}?report=xml", :xml)).each_line
|
156
|
+
from_type = doc.grep(%r{<FromType/?>}).first or return metadata
|
157
|
+
if from_type =~ %r{<FromType/>}
|
158
|
+
metadata[:is_type] = false
|
159
|
+
metadata[:is_ref_type] = false
|
160
|
+
elsif from_type =~ %r{<FromType>(.*)</FromType>}
|
161
|
+
if $1 == 'assembly from reference material'
|
162
|
+
metadata[:is_type] = false
|
163
|
+
metadata[:is_ref_type] = true
|
164
|
+
else
|
165
|
+
metadata[:is_type] = true
|
166
|
+
end
|
167
|
+
metadata[:type_rel] = $1
|
168
|
+
end
|
169
|
+
metadata
|
170
|
+
end
|
171
|
+
|
172
|
+
def save_assembly_to(project, name, udb)
|
173
|
+
dir = MiGA::Dataset.RESULT_DIRS[:assembly]
|
174
|
+
base = "#{project.path}/data/#{dir}/#{name}"
|
175
|
+
l_ctg = "#{base}.LargeContigs.fna"
|
176
|
+
a_ctg = "#{base}.AllContigs.fna"
|
177
|
+
File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s }
|
178
|
+
if udb[:format] == :fasta_gz
|
179
|
+
download "#{l_ctg}.gz"
|
180
|
+
system "gzip -d '#{l_ctg}.gz'"
|
181
|
+
else
|
182
|
+
download l_ctg
|
183
|
+
end
|
184
|
+
File.unlink(a_ctg) if File.exist? a_ctg
|
185
|
+
File.symlink(File.basename(l_ctg), a_ctg)
|
186
|
+
File.open("#{base}.done", 'w') { |ofh| ofh.puts Time.now.to_s }
|
187
|
+
end
|
197
188
|
end
|
data/lib/miga/remote_dataset/base.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
|
2
|
-
require '
|
2
|
+
require 'rest-client'
|
3
3
|
require 'open-uri'
|
4
|
+
require 'cgi'
|
4
5
|
|
5
6
|
class MiGA::RemoteDataset < MiGA::MiGA
|
6
7
|
|
@@ -13,7 +14,7 @@ end
|
|
13
14
|
|
14
15
|
module MiGA::RemoteDataset::Base
|
15
16
|
|
16
|
-
@@_EUTILS =
|
17
|
+
@@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
17
18
|
|
18
19
|
##
|
19
20
|
# Structure of the different database Universes or containers. The structure
|
@@ -23,33 +24,38 @@ module MiGA::RemoteDataset::Base
|
|
23
24
|
# properties such as +stage+, +format+, and +map_to+.
|
24
25
|
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
25
26
|
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
26
|
-
#
|
27
|
-
#
|
27
|
+
# Additional parameters can be passed to certain functions using the +extra+
|
28
|
+
# option.
|
29
|
+
# - +method+ => Method used to query the URL. Only +:rest+ and +:net+ are
|
30
|
+
# currently supported.
|
28
31
|
# - +map_to_universe+ => Universe where results map to. Currently unsupported.
|
29
32
|
@@UNIVERSE = {
|
30
|
-
web:{
|
33
|
+
web: {
|
31
34
|
dbs: {
|
32
|
-
assembly:{stage: :assembly, format: :fasta},
|
33
|
-
assembly_gz:{stage: :assembly, format: :fasta_gz}
|
35
|
+
assembly: {stage: :assembly, format: :fasta},
|
36
|
+
assembly_gz: {stage: :assembly, format: :fasta_gz},
|
37
|
+
text: {stage: :metadata, format: :text}
|
34
38
|
},
|
35
39
|
url: "%2$s",
|
36
40
|
method: :net
|
37
41
|
},
|
38
|
-
ebi:{
|
39
|
-
dbs: { embl:{stage: :assembly, format: :fasta} },
|
40
|
-
url: "
|
42
|
+
ebi: {
|
43
|
+
dbs: { embl: {stage: :assembly, format: :fasta} },
|
44
|
+
url: "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
41
45
|
method: :rest
|
42
46
|
},
|
43
|
-
ncbi:{
|
44
|
-
dbs: { nuccore:{stage: :assembly, format: :fasta} },
|
47
|
+
ncbi: {
|
48
|
+
dbs: { nuccore: {stage: :assembly, format: :fasta} },
|
45
49
|
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
46
50
|
method: :rest
|
47
51
|
},
|
48
|
-
ncbi_map:{
|
49
|
-
dbs: {
|
50
|
-
|
51
|
-
|
52
|
-
|
52
|
+
ncbi_map: {
|
53
|
+
dbs: {
|
54
|
+
nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
|
55
|
+
format: :json},
|
56
|
+
biosample: {stage: :metadata, map_to: [:assembly], format: :json}
|
57
|
+
},
|
58
|
+
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
|
53
59
|
method: :rest,
|
54
60
|
map_to_universe: :ncbi
|
55
61
|
}
|
@@ -0,0 +1,84 @@
|
|
1
|
+
|
2
|
+
require 'miga/remote_dataset/base'
|
3
|
+
|
4
|
+
class MiGA::RemoteDataset
|
5
|
+
include MiGA::RemoteDataset::Base
|
6
|
+
|
7
|
+
# Class-level
|
8
|
+
class << self
|
9
|
+
##
|
10
|
+
# Download data from the +universe+ in the database +db+ with IDs +ids+ and
|
11
|
+
# in +format+. If passed, it saves the result in +file+. Additional
|
12
|
+
# parameters specific to the download method can be passed using +extra+.
|
13
|
+
# Returns String.
|
14
|
+
def download(universe, db, ids, format, file = nil, extra = [])
|
15
|
+
ids = [ids] unless ids.is_a? Array
|
16
|
+
case @@UNIVERSE[universe][:method]
|
17
|
+
when :rest
|
18
|
+
doc = download_rest(universe, db, ids, format, extra)
|
19
|
+
when :net
|
20
|
+
doc = download_net(universe, db, ids, format, extra)
|
21
|
+
end
|
22
|
+
unless file.nil?
|
23
|
+
ofh = File.open(file, 'w')
|
24
|
+
ofh.print doc
|
25
|
+
ofh.close
|
26
|
+
end
|
27
|
+
doc
|
28
|
+
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Download data using a REST method from the +universe+ in the database +db+
|
32
|
+
# with IDs +ids+ and in +format+. Additional URL parameters can be passed
|
33
|
+
# using +extra+. Returns the doc as String.
|
34
|
+
def download_rest(universe, db, ids, format, extra = [])
|
35
|
+
u = @@UNIVERSE[universe]
|
36
|
+
url ||= sprintf(u[:url], db, ids.join(","), format, *extra)
|
37
|
+
response = RestClient::Request.execute(method: :get, url:url, timeout:600)
|
38
|
+
unless response.code == 200
|
39
|
+
raise "Unable to reach #{universe} client, error code #{response.code}."
|
40
|
+
end
|
41
|
+
response.to_s
|
42
|
+
end
|
43
|
+
|
44
|
+
##
|
45
|
+
# Download data using a GET request from the +universe+ in the database +db+
|
46
|
+
# with IDs +ids+ and in +format+. Additional URL parameters can be passed
|
47
|
+
# using +extra+. Returns the doc as String.
|
48
|
+
def download_net(universe, db, ids, format, extra = [])
|
49
|
+
u = @@UNIVERSE[universe]
|
50
|
+
url = sprintf(u[:url], db, ids.join(","), format, *extra)
|
51
|
+
doc = ""
|
52
|
+
@timeout_try = 0
|
53
|
+
begin
|
54
|
+
open(url) { |f| doc = f.read }
|
55
|
+
rescue Net::ReadTimeout
|
56
|
+
@timeout_try += 1
|
57
|
+
if @timeout_try > 3 ; raise Net::ReadTimeout
|
58
|
+
else ; retry
|
59
|
+
end
|
60
|
+
end
|
61
|
+
doc
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Looks for the entry +id+ in +dbfrom+, and returns the linked
|
66
|
+
# identifier in +db+ (or nil).
|
67
|
+
def ncbi_map(id, dbfrom, db)
|
68
|
+
doc = download(:ncbi_map, dbfrom, id, :json, nil, [db])
|
69
|
+
return if doc.empty?
|
70
|
+
tree = JSON.parse(doc, symbolize_names: true)
|
71
|
+
tree.dig(:linksets, 0, :linksetdbs, 0, :links, 0)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
module MiGA::RemoteDataset::Download
|
77
|
+
|
78
|
+
##
|
79
|
+
# Download data into +file+.
|
80
|
+
def download(file)
|
81
|
+
self.class.download(universe, db, ids,
|
82
|
+
self.class.UNIVERSE[universe][:dbs][db][:format], file)
|
83
|
+
end
|
84
|
+
end
|
data/lib/miga/result/dates.rb
CHANGED
@@ -1,24 +1,25 @@
|
|
1
1
|
|
2
|
-
require
|
2
|
+
require 'miga/result/base'
|
3
3
|
|
4
4
|
##
|
5
5
|
# Helper module including date-specific functions for results.
|
6
6
|
module MiGA::Result::Dates
|
7
|
-
|
8
7
|
include MiGA::Result::Base
|
9
|
-
|
8
|
+
|
10
9
|
##
|
11
|
-
# Returns the start date of processing as DateTime or +nil+ if it doesn't
|
10
|
+
# Returns the start date of processing as DateTime or +nil+ if it doesn't
|
11
|
+
# exist.
|
12
12
|
def started_at
|
13
13
|
date_at :start
|
14
14
|
end
|
15
15
|
|
16
16
|
##
|
17
|
-
# Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
|
17
|
+
# Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
|
18
|
+
# exist.
|
18
19
|
def done_at
|
19
20
|
date_at :done
|
20
21
|
end
|
21
|
-
|
22
|
+
|
22
23
|
##
|
23
24
|
# Time it took for the result to complete as Float in minutes.
|
24
25
|
def running_time
|
@@ -27,16 +28,17 @@ module MiGA::Result::Dates
|
|
27
28
|
(b - a).to_f * 24 * 60
|
28
29
|
end
|
29
30
|
|
30
|
-
|
31
31
|
private
|
32
32
|
|
33
33
|
##
|
34
34
|
# Internal function to detect start and end dates
|
35
35
|
def date_at(event)
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
date = self[event]
|
37
|
+
if date.nil?
|
38
|
+
f = path event
|
39
|
+
date = File.read(f) if File.size? f
|
40
|
+
end
|
41
|
+
date.nil? ? nil : DateTime.parse(date)
|
39
42
|
end
|
40
|
-
|
41
43
|
end
|
42
44
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 4, 1]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/scripts/assembly.bash
CHANGED
@@ -19,7 +19,7 @@ if [[ -s "$TF/$DATASET.1.fasta" \
|
|
19
19
|
FastA.interpose.pl "$TF/$DATASET.CoupledReads.fa" "$TF/$DATASET".[12].fasta
|
20
20
|
gzip -9 -f "$TF/$DATASET.1.fasta"
|
21
21
|
gzip -9 -f "$TF/$DATASET.2.fasta"
|
22
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta
|
22
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
|
23
23
|
fi
|
24
24
|
|
25
25
|
# Assemble
|
data/scripts/read_quality.bash
CHANGED
@@ -28,6 +28,7 @@ rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.paired
|
|
28
28
|
rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.single
|
29
29
|
rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed
|
30
30
|
rm -f "../02.trimmed_reads/$b".[12].fastq
|
31
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
31
32
|
|
32
33
|
# Finalize
|
33
34
|
miga date > "$DATASET.done"
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -19,6 +19,7 @@ for sis in 1 2 ; do
|
|
19
19
|
&& ! -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
|
20
20
|
&& gunzip "../02.trimmed_reads/$b.$sis.clipped.fastq.gz"
|
21
21
|
done
|
22
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
22
23
|
|
23
24
|
# FastQ -> FastA
|
24
25
|
FQ2A="$MIGA/utils/enveomics/Scripts/FastQ.toFastA.awk"
|
@@ -44,6 +45,8 @@ for sis in 1 2 ; do
|
|
44
45
|
[[ -e "../02.trimmed_reads/$b.$sis.clipped.single.fastq" ]] \
|
45
46
|
&& gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.single.fastq"
|
46
47
|
done
|
48
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
|
49
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
47
50
|
|
48
51
|
# Finalize
|
49
52
|
miga date > "$DATASET.done"
|
data/scripts/trimmed_reads.bash
CHANGED
@@ -18,6 +18,7 @@ miga date > "$DATASET.start"
|
|
18
18
|
&& gunzip "../01.raw_reads/$b.1.fastq.gz"
|
19
19
|
[[ -e "../01.raw_reads/$b.2.fastq.gz" && ! -e "../01.raw_reads/$b.2.fastq" ]] \
|
20
20
|
&& gunzip "../01.raw_reads/$b.2.fastq.gz"
|
21
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
|
21
22
|
|
22
23
|
# Clean existing files
|
23
24
|
exists "$b".[12].* && rm "$b".[12].*
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- lib/miga/project/result.rb
|
163
163
|
- lib/miga/remote_dataset.rb
|
164
164
|
- lib/miga/remote_dataset/base.rb
|
165
|
+
- lib/miga/remote_dataset/download.rb
|
165
166
|
- lib/miga/result.rb
|
166
167
|
- lib/miga/result/base.rb
|
167
168
|
- lib/miga/result/dates.rb
|
@@ -499,7 +500,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
499
500
|
requirements:
|
500
501
|
- - ">="
|
501
502
|
- !ruby/object:Gem::Version
|
502
|
-
version: '
|
503
|
+
version: '2.3'
|
503
504
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
504
505
|
requirements:
|
505
506
|
- - ">="
|