miga-base 0.3.3.1 → 0.3.4.1
- checksums.yaml +4 -4
- data/actions/about.rb +1 -1
- data/actions/ls.rb +5 -2
- data/actions/ncbi_get.rb +19 -9
- data/actions/stats.rb +18 -10
- data/lib/miga.rb +5 -5
- data/lib/miga/common/format.rb +10 -8
- data/lib/miga/dataset/result.rb +32 -38
- data/lib/miga/remote_dataset.rb +104 -113
- data/lib/miga/remote_dataset/base.rb +23 -17
- data/lib/miga/remote_dataset/download.rb +84 -0
- data/lib/miga/result/dates.rb +13 -11
- data/lib/miga/version.rb +1 -1
- data/scripts/assembly.bash +1 -1
- data/scripts/read_quality.bash +1 -0
- data/scripts/trimmed_fasta.bash +3 -0
- data/scripts/trimmed_reads.bash +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b7140af2e2cb8525f5231ea065c79cb362348a9f
+  data.tar.gz: 257de78dbf14f7e01f4c239e7311883f98ccab21
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a08c21c03a045d369078614dc41223db249b4105d20ece5198e4ef126dab468f7490278f50d7101652bba5433e4eee4653ea0fd67e85de55a78881d842d25ac
+  data.tar.gz: 62b07a2e62dbf801f5afbe87ac6bbb7c5a004ccc52153c640daf675e5e50e211fd568b7496d884a67759006d47e4ffd264dc280d28e299fb8351ba5ba14e5a6f
data/actions/about.rb
CHANGED
@@ -27,7 +27,7 @@ raise "Impossible to load project: #{o[:project]}" if p.nil?
 
 if not o[:datum].nil?
   v = p.metadata[ o[:datum] ]
-  puts v.nil? ?
+  puts v.nil? ? '?' : v
 elsif o[:processing]
   keys = MiGA::Project.DISTANCE_TASKS + MiGA::Project.INCLADE_TASKS
   puts MiGA::MiGA.tabulate([:task, :status], keys.map do |k|
data/actions/ls.rb
CHANGED
@@ -13,7 +13,7 @@ OptionParser.new do |opt|
   opt.on("-p", "--processing",
     "Print information on processing advance."){ |v| o[:processing]=v }
   opt.on("-m", "--metadata STRING",
-    "Print name and metadata field only. If set, ignores -i."
+    "Print name and metadata field only. If set, ignores -i and assumes --tab."
     ){ |v| o[:datum]=v }
   opt.on("--tab",
     "Returns a tab-delimited table."){ |v| o[:tabular] = v }
@@ -43,7 +43,10 @@ ds = filter_datasets!(ds, o)
 exit(1) if o[:silent] and ds.empty?
 
 if not o[:datum].nil?
-  ds.each
+  ds.each do |d|
+    v = d.metadata[ o[:datum] ]
+    puts "#{d.name}\t#{v.nil? ? '?' : v}"
+  end
 elsif o[:info]
   puts MiGA::MiGA.tabulate(
     MiGA::Dataset.INFO_FIELDS, ds.map{ |d| d.info }, o[:tabular])
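
Note: with the new loop above, `miga ls -m FIELD` prints one `name<TAB>value` row per dataset and substitutes '?' for missing fields (hence the updated help text: `-m` now implies `--tab`). A minimal standalone sketch of the output format, with hypothetical dataset names and values:

  # Hypothetical data; the real action reads d.name and d.metadata[field].
  { 'dataset_a' => 'Complete', 'dataset_b' => nil }.each do |name, v|
    puts "#{name}\t#{v.nil? ? '?' : v}"
  end
  # => dataset_a	Complete
  # => dataset_b	?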
data/actions/ncbi_get.rb
CHANGED
@@ -8,7 +8,8 @@ require 'miga/remote_dataset'
 o = {q:true, query:false, unlink:false,
   reference: false, ignore_plasmids: false,
   complete: false, chromosome: false,
-  scaffold: false, contig: false, add_version: true, dry: false
+  scaffold: false, contig: false, add_version: true, dry: false,
+  get_md: false}
 OptionParser.new do |opt|
   opt_banner(opt)
   opt_object(opt, o, [:project])
@@ -37,6 +38,9 @@ OptionParser.new do |opt|
   opt.on('--blacklist PATH',
     'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
   opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
+  opt.on('--get-metadata',
+    'Only download and update metadata for existing datasets'
+    ){ |v| o[:get_md] = v }
   opt.on('-q', '--query',
     'Register the datasets as queries, not reference datasets.'
     ){ |v| o[:query]=v }
@@ -131,8 +135,7 @@ if o[:scaffold] or o[:contig]
       map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
     next if ids.empty?
     n = "#{r[0]}_#{asm}".miga_name
-
-    ds[n] = {ids: ids, md: {type: :genome, comments: comm},
+    ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
       db: :assembly_gz, universe: :web}
   end
 end
@@ -144,23 +147,30 @@ unless o[:blacklist].nil?
 end
 
 # Download entries
-$stderr.puts "Downloading #{ds.size}
+$stderr.puts "Downloading #{ds.size} " +
+  (ds.size > 1 ? "entries" : "entry") unless o[:q]
 ds.each do |name,body|
   d << name
   puts name
-  next
+  next if p.dataset(name).nil? == o[:get_md]
   downloaded += 1
   next if o[:dry]
   $stderr.puts ' Locating remote dataset.' unless o[:q]
   rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
-
-
-
+  if o[:get_md]
+    $stderr.puts ' Updating dataset.' unless o[:q]
+    rd.update_metadata(p.dataset(name), body[:md])
+  else
+    $stderr.puts ' Creating dataset.' unless o[:q]
+    rd.save_to(p, name, !o[:query], body[:md])
+    p.add_dataset(name)
+  end
 end
 
 # Finalize
 $stderr.puts "Datasets listed: #{d.size}" unless o[:q]
-$stderr.puts "Datasets #{
+$stderr.puts "Datasets #{o[:dry] ? 'to download' : 'downloaded'}: " +
+  downloaded.to_s unless o[:q]
 unless o[:remote_list].nil?
   File.open(o[:remote_list], 'w') do |fh|
     d.each { |i| fh.puts i }
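
Note: the whole `--get-metadata` mode hinges on the one-line guard `next if p.dataset(name).nil? == o[:get_md]`. A sketch of its truth table (standalone illustration; `missing` stands in for `p.dataset(name).nil?`):

  [false, true].each do |get_md|
    [false, true].each do |missing|
      action = (missing == get_md) ? 'skipped' : 'processed'
      puts "get_md=#{get_md} missing=#{missing} -> #{action}"
    end
  end
  # get_md=false: only datasets missing from the project are downloaded;
  # get_md=true:  only datasets already in the project get metadata updates.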
data/actions/stats.rb
CHANGED
@@ -45,14 +45,16 @@ if o[:compute]
     when :raw_reads
       if r[:files][:pair1].nil?
         s = MiGA::MiGA.seqs_length(r.file_path(:single), :fastq, gc: true)
-        stats = {
+        stats = {
+          reads: s[:n],
           length_average: [s[:avg], "bp"],
           length_standard_deviation: [s[:sd], "bp"],
           g_c_content: [s[:gc], "%"]}
       else
         s1 = MiGA::MiGA.seqs_length(r.file_path(:pair1), :fastq, gc: true)
         s2 = MiGA::MiGA.seqs_length(r.file_path(:pair2), :fastq, gc: true)
-        stats = {
+        stats = {
+          read_pairs: s1[:n],
           forward_length_average: [s1[:avg], "bp"],
           forward_length_standard_deviation: [s1[:sd], "bp"],
           forward_g_c_content: [s1[:gc], "%"],
@@ -63,22 +65,28 @@ if o[:compute]
     when :trimmed_fasta
       f = r[:files][:coupled].nil? ? r.file_path(:single) : r.file_path(:coupled)
       s = MiGA::MiGA.seqs_length(f, :fasta, gc: true)
-      stats = {
+      stats = {
+        reads: s[:n],
         length_average: [s[:avg], "bp"],
         length_standard_deviation: [s[:sd], "bp"],
         g_c_content: [s[:gc], "%"]}
     when :assembly
       s = MiGA::MiGA.seqs_length(r.file_path(:largecontigs), :fasta,
-        n50:true, gc:true)
-      stats = {
-
+        n50: true, gc: true)
+      stats = {
+        contigs: s[:n],
+        n50: [s[:n50], "bp"],
+        total_length: [s[:tot], "bp"],
+        g_c_content: [s[:gc], "%"]}
     when :cds
       s = MiGA::MiGA.seqs_length(r.file_path(:proteins), :fasta)
-      stats = {
+      stats = {
+        predicted_proteins: s[:n],
+        average_length: [s[:avg], "aa"]}
       asm = d.add_result(:assembly, false)
       unless asm.nil? or asm[:stats][:total_length].nil?
         stats[:coding_density] =
-          [300.0*s[:tot]/asm[:stats][:total_length][0], "%"]
+          [300.0 * s[:tot] / asm[:stats][:total_length][0], "%"]
       end
     when :essential_genes
       if d.is_multi?
@@ -102,7 +110,7 @@ if o[:compute]
         r.add_file(:report, "#{d.name}.ess/log.archaea")
       end
       # Extract/compute quality values
-      stats = {completeness:[0.0,"%"], contamination:[0.0,"%"]}
+      stats = {completeness: [0.0,"%"], contamination: [0.0,"%"]}
      File.open(r.file_path(:report), "r") do |fh|
        fh.each_line do |ln|
          if /^! (Completeness|Contamination): (.*)%/.match(ln)
@@ -110,7 +118,7 @@ if o[:compute]
          end
        end
      end
-      stats[:quality] = stats[:completeness][0] - stats[:contamination][0]*5
+      stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
      d.metadata[:quality] = case stats[:quality]
        when 80..100 ; :excellent
        when 50..80 ; :high
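
Note: the quality score introduced above is completeness minus five times contamination, both in percent, and the score is then bucketed (`:excellent`, `:high`, ...). A worked example with hypothetical report values:

  completeness, contamination = 95.0, 2.1
  quality = completeness - contamination * 5   # => 84.5
  bucket = case quality
           when 80..100 then :excellent
           when 50..80  then :high
           else :other  # remaining buckets fall outside this hunk
           end
  puts "#{quality} -> #{bucket}"               # => 84.5 -> excellent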
data/lib/miga.rb
CHANGED
@@ -1,8 +1,8 @@
 # @package MiGA
 # @license Artistic-2.0
 
-require
-require
-require
-require
-require
+require 'json'
+require 'fileutils'
+require 'miga/common'
+require 'miga/project'
+require 'miga/taxonomy'
data/lib/miga/common/format.rb
CHANGED
@@ -2,18 +2,20 @@
 require 'tempfile'
 require 'zlib'
 
+##
+# General formatting functions shared throughout MiGA.
 module MiGA::Common::Format
   ##
   # Tabulates an +values+, and Array of Arrays, all with the same number of
   # entries as +header+. Returns an Array of String, one per line.
-  def tabulate(header, values, tabular=false)
+  def tabulate(header, values, tabular = false)
     fields = [header.map(&:to_s)]
     fields << fields.first.map { |h| h.gsub(/\S/, '-') } unless tabular
     fields += values.map { |r| r.map { |cell| cell.nil? ? '?' : cell.to_s } }
     clen = tabular ? Array.new(header.size, 0) :
       fields.map { |r| r.map(&:length) }.transpose.map(&:max)
     fields.map do |r|
-      (0
+      (0..(clen.size - 1)).map do |col_n|
         col_n == 0 ? r[col_n].rjust(clen[col_n]) : r[col_n].ljust(clen[col_n])
       end.join(tabular ? "\t" : ' ')
     end
@@ -37,7 +39,7 @@ module MiGA::Common::Format
     fh.each_line do |ln|
       ln.chomp!
       if ln =~ /^>\s*(\S+)(.*)/
-
+        id, df = $1, $2
         tmp_fh.print buffer.wrap_width(80)
         buffer = ''
         tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}"
@@ -66,16 +68,17 @@ module MiGA::Common::Format
   # - +:n50+: If true, it also returns the N50 and the median (in bp).
   # - +gc+: If true, it also returns the G+C content (in %).
   def seqs_length(file, format, opts = {})
-    fh =
+    fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
     l = []
     gc = 0
-    i = 0 # <- Zlib::GzipReader doesn't set
+    i = 0 # <- Zlib::GzipReader doesn't set `$.`
     fh.each_line do |ln|
       i += 1
-      if (format == :fasta and ln =~ /^>/) or
+      if (format == :fasta and ln =~ /^>/) or
+          (format == :fastq and (i % 4) == 1)
         l << 0
       elsif format == :fasta or (i % 4) == 2
-        l[l.size-1] += ln.chomp.size
+        l[l.size - 1] += ln.chomp.size
         gc += ln.scan(/[GCgc]/).count if opts[:gc]
       end
     end
@@ -131,4 +134,3 @@ class String
     gsub(/([^\n\r]{1,#{width}})/, "\\1\n")
   end
 end
-
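
Note: `seqs_length` returns a Hash of summary statistics (`:n`, `:avg`, `:sd`, `:gc` as used by the stats action above; `n50: true` additionally yields at least `:n50` and `:tot`). A minimal usage sketch combining it with `tabulate`, assuming a FastA file `contigs.fna` exists on disk:

  require 'miga'

  s = MiGA::MiGA.seqs_length('contigs.fna', :fasta, n50: true, gc: true)
  puts MiGA::MiGA.tabulate(
    [:metric, :value],
    [[:contigs, s[:n]], [:n50, s[:n50]], [:gc, s[:gc]]])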
data/lib/miga/dataset/result.rb
CHANGED
@@ -92,7 +92,7 @@ module MiGA::Dataset::Result
 
   ##
   # Are all the dataset-specific tasks done? Passes +save+ to #add_result.
-  def done_preprocessing?(save=false)
+  def done_preprocessing?(save = false)
     !first_preprocessing(save).nil? and next_preprocessing(save).nil?
   end
 
@@ -103,7 +103,7 @@ module MiGA::Dataset::Result
   # - 1 for a registered result (a completed task).
   # - 2 for a queued result (a task yet to be executed).
   # It passes +save+ to #add_result
-  def profile_advance(save=false)
+  def profile_advance(save = false)
     first_task = first_preprocessing(save)
     return Array.new(@@PREPROCESSING_TASKS.size, 0) if first_task.nil?
     adv = []
@@ -141,12 +141,12 @@ module MiGA::Dataset::Result
   ##
   # Add result type +:raw_reads+ at +base+ (no +_opts+ supported).
   def add_result_raw_reads(base, _opts)
-    return nil unless result_files_exist?(base,
+    return nil unless result_files_exist?(base, '.1.fastq')
     r = MiGA::Result.new("#{base}.json")
     add_files_to_ds_result(r, name,
-      ( result_files_exist?(base,
-        {pair1:
-        {single:
+      ( result_files_exist?(base, '.2.fastq') ?
+        {pair1: '.1.fastq', pair2: '.2.fastq'} :
+        {single: '.1.fastq'} ))
   end
 
   ##
@@ -156,13 +156,12 @@ module MiGA::Dataset::Result
     r = MiGA::Result.new("#{base}.json")
     if result_files_exist?(base, ".2.clipped.fastq")
       r = add_files_to_ds_result(r, name,
-        pair1:".1.clipped.fastq", pair2:".2.clipped.fastq",
-        single:".1.clipped.single.fastq")
+        pair1: ".1.clipped.fastq", pair2: ".2.clipped.fastq",
+        single: ".1.clipped.single.fastq")
     else
-      r = add_files_to_ds_result(r, name, single:".1.clipped.fastq")
+      r = add_files_to_ds_result(r, name, single: ".1.clipped.fastq")
     end
     r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
-    add_result(:raw_reads) #-> Post gunzip
     r
   end
 
@@ -171,10 +170,8 @@ module MiGA::Dataset::Result
   def add_result_read_quality(base, _opts)
     return nil unless result_files_exist?(base, %w[.solexaqa .fastqc])
     r = MiGA::Result.new("#{base}.json")
-
-      solexaqa:".solexaqa", fastqc:".fastqc")
-    add_result(:trimmed_reads) #-> Post cleaning
-    r
+    add_files_to_ds_result(r, name,
+      solexaqa: ".solexaqa", fastqc: ".fastqc")
   end
 
   ##
@@ -185,10 +182,8 @@ module MiGA::Dataset::Result
       result_files_exist?(base, ".SingleReads.fa") or
       result_files_exist?(base, %w[.1.fasta .2.fasta])
     r = MiGA::Result.new("#{base}.json")
-
-      single:".SingleReads.fa", pair1:".1.fasta", pair2:".2.fasta")
-    add_result(:raw_reads) #-> Post gzip
-    r
+    add_files_to_ds_result(r, name, coupled: ".CoupledReads.fa",
+      single: ".SingleReads.fa", pair1: ".1.fasta", pair2: ".2.fasta")
   end
 
   ##
@@ -197,15 +192,14 @@ module MiGA::Dataset::Result
   def add_result_assembly(base, opts)
     return nil unless result_files_exist?(base, ".LargeContigs.fna")
     r = MiGA::Result.new("#{base}.json")
-    r = add_files_to_ds_result(r, name, largecontigs:".LargeContigs.fna",
-      allcontigs:".AllContigs.fna", assembly_data:
+    r = add_files_to_ds_result(r, name, largecontigs: ".LargeContigs.fna",
+      allcontigs: ".AllContigs.fna", assembly_data: '')
     opts[:is_clean] ||= false
     r.clean! if opts[:is_clean]
     unless r.clean?
       MiGA::MiGA.clean_fasta_file(r.file_path :largecontigs)
       r.clean!
     end
-    add_result(:trimmed_fasta) #-> Post interposing
     r
   end
 
@@ -214,8 +208,8 @@ module MiGA::Dataset::Result
   def add_result_cds(base, opts)
     return nil unless result_files_exist?(base, %w[.faa])
     r = MiGA::Result.new("#{base}.json")
-    r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
-      gff2:".gff2", gff3:".gff3", tab:".tab")
+    r = add_files_to_ds_result(r, name, proteins: ".faa", genes: ".fna",
+      gff2: ".gff2", gff3: ".gff3", tab: ".tab")
     opts[:is_clean] ||= false
     r.clean! if opts[:is_clean]
     unless r.clean?
@@ -231,8 +225,8 @@ module MiGA::Dataset::Result
   def add_result_essential_genes(base, _opts)
     return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
     r = MiGA::Result.new("#{base}.json")
-    add_files_to_ds_result(r, name, ess_genes:".ess.faa",
-      collection:".ess", report:".ess/log")
+    add_files_to_ds_result(r, name, ess_genes: ".ess.faa",
+      collection: ".ess", report: ".ess/log")
   end
 
   ##
@@ -241,8 +235,8 @@ module MiGA::Dataset::Result
     return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
     return nil unless result_files_exist?(base, ".ssu.fa")
     r = MiGA::Result.new("#{base}.json")
-    r = add_files_to_ds_result(r, name, longest_ssu_gene:".ssu.fa",
-      gff:".ssu.gff", all_ssu_genes:".ssu.all.fa")
+    r = add_files_to_ds_result(r, name, longest_ssu_gene: ".ssu.fa",
+      gff: ".ssu.gff", all_ssu_genes: ".ssu.all.fa")
     opts[:is_clean] ||= false
     r.clean! if opts[:is_clean]
     unless r.clean?
@@ -259,11 +253,11 @@ module MiGA::Dataset::Result
     return nil unless result_files_exist?(base, ".mytaxa") or
       result_files_exist?(base, ".nomytaxa.txt")
     r = MiGA::Result.new("#{base}.json")
-      add_files_to_ds_result(r, name, mytaxa:".mytaxa", blast:".blast",
-        mytaxain:".mytaxain", nomytaxa:".nomytaxa.txt",
-        species:".mytaxa.Species.txt", genus:".mytaxa.Genus.txt",
-        phylum:".mytaxa.Phylum.txt", innominate:".mytaxa.innominate",
-        kronain:".mytaxa.krona", krona:".html")
+      add_files_to_ds_result(r, name, mytaxa: ".mytaxa", blast: ".blast",
+        mytaxain: ".mytaxain", nomytaxa: ".nomytaxa.txt",
+        species: ".mytaxa.Species.txt", genus: ".mytaxa.Genus.txt",
+        phylum: ".mytaxa.Phylum.txt", innominate: ".mytaxa.innominate",
+        kronain: ".mytaxa.krona", krona: ".html")
     else
       MiGA::Result.new("#{base}.json")
     end
@@ -327,8 +321,8 @@ module MiGA::Dataset::Result
     return nil unless
       File.exist?("#{pref}/01.haai/#{name}.db")
     r = MiGA::Result.new("#{base}.json")
-    r.add_files(haai_db:"01.haai/#{name}.db", aai_db:"02.aai/#{name}.db",
-      ani_db:"03.ani/#{name}.db")
+    r.add_files(haai_db: "01.haai/#{name}.db", aai_db: "02.aai/#{name}.db",
+      ani_db: "03.ani/#{name}.db")
     r
   end
 
@@ -339,10 +333,10 @@ module MiGA::Dataset::Result
     result_files_exist?(base, %w[.aai-medoids.tsv .aai.db]) or
     result_files_exist?(base, %w[.ani-medoids.tsv .ani.db])
     r = MiGA::Result.new("#{base}.json")
-    add_files_to_ds_result(r, name, aai_medoids:".aai-medoids.tsv",
-      haai_db:".haai.db", aai_db:".aai.db", ani_medoids:".ani-medoids.tsv",
-      ani_db:".ani.db", ref_tree:".nwk", ref_tree_pdf:".nwk.pdf",
-      intax_test:".intax.txt")
+    add_files_to_ds_result(r, name, aai_medoids: ".aai-medoids.tsv",
+      haai_db: ".haai.db", aai_db: ".aai.db", ani_medoids: ".ani-medoids.tsv",
+      ani_db: ".ani.db", ref_tree: ".nwk", ref_tree_pdf: ".nwk.pdf",
+      intax_test: ".intax.txt")
   end
 
   ##
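
Note: besides the keyword-argument spacing cleanup, the substantive change in this file is the removal of the chained `add_result(...)` calls (the `#-> Post gunzip` / `#-> Post cleaning` lines): registering the upstream result is now done explicitly by the pipeline scripts with `miga add_result ... -f` (see the script diffs below). The paired/single detection in `add_result_raw_reads` is otherwise unchanged; condensed into a hypothetical standalone helper, it reads roughly:

  # Condensed sketch (hypothetical helper; the real method wraps MiGA::Result).
  def raw_read_files(base)
    return nil unless File.exist?("#{base}.1.fastq")
    if File.exist?("#{base}.2.fastq")
      { pair1: '.1.fastq', pair2: '.2.fastq' }  # paired-end run
    else
      { single: '.1.fastq' }                    # single-end run
    end
  end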
data/lib/miga/remote_dataset.rb
CHANGED
@@ -1,68 +1,12 @@
 # @package MiGA
 # @license Artistic-2.0
 
-require 'miga/remote_dataset/
+require 'miga/remote_dataset/download'
 
 ##
 # MiGA representation of datasets with data in remote locations.
 class MiGA::RemoteDataset < MiGA::MiGA
-
-  include MiGA::RemoteDataset::Base
-
-  # Class-level
-
-  ##
-  # Download data from the +universe+ in the database +db+ with IDs +ids+ and
-  # in +format+. If passed, it saves the result in +file+. Returns String.
-  def self.download(universe, db, ids, format, file=nil)
-    ids = [ids] unless ids.is_a? Array
-    case @@UNIVERSE[universe][:method]
-    when :rest
-      doc = download_rest(universe, db, ids, format)
-    when :net
-      doc = download_net(universe, db, ids, format)
-    end
-    unless file.nil?
-      ofh = File.open(file, "w")
-      ofh.print doc
-      ofh.close
-    end
-    doc
-  end
-
-  ##
-  # Download data using a REST method from the +universe+ in the database +db+
-  # with IDs +ids+ and in +format+. Returns the doc as String.
-  def self.download_rest(universe, db, ids, format)
-    u = @@UNIVERSE[universe]
-    map_to = u[:dbs][db].nil? ? nil : u[:dbs][db][:map_to]
-    url = sprintf(u[:url], db, ids.join(","), format, map_to)
-    response = RestClient::Request.execute(method: :get, url:url, timeout:600)
-    unless response.code == 200
-      raise "Unable to reach #{universe} client, error code #{response.code}."
-    end
-    response.to_s
-  end
-
-  ##
-  # Download data using a GET request from the +universe+ in the database +db+
-  # with IDs +ids+ and in +format+. Returns the doc as String.
-  def self.download_net(universe, db, ids, format)
-    u = @@UNIVERSE[universe]
-    map_to = u[:dbs][db].nil? ? nil : u[:dbs][db][:map_to]
-    url = sprintf(u[:url], db, ids.join(","), format, map_to)
-    doc = ""
-    @timeout_try = 0
-    begin
-      open(url) { |f| doc = f.read }
-    rescue Net::ReadTimeout
-      @timeout_try += 1
-      if @timeout_try > 3 ; raise Net::ReadTimeout
-      else ; retry
-      end
-    end
-    doc
-  end
+  include MiGA::RemoteDataset::Download
 
   # Instance-level
 
@@ -71,7 +15,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
   attr_reader :universe
   # Database storing the dataset.
   attr_reader :db
-  # IDs of the entries composing the dataset.
+  # Array of IDs of the entries composing the dataset.
   attr_reader :ids
 
   ##
@@ -81,92 +25,91 @@ class MiGA::RemoteDataset < MiGA::MiGA
     @ids = (ids.is_a?(Array) ? ids : [ids])
     @db = db.to_sym
     @universe = universe.to_sym
-
-      "#{
-
-      "#{@@UNIVERSE[@universe][:dbs]}"
-
-    #
-    #
-    #
-    #end
+    @@UNIVERSE.keys.include?(@universe) or
+      raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
+    @@UNIVERSE[@universe][:dbs].include?(@db) or
+      raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
+    # FIXME: Part of the +map_to+ support:
+    # unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
+    #   MiGA::RemoteDataset.download
+    # end
   end
 
   ##
   # Save dataset to the MiGA::Project +project+ identified with +name+. +is_ref+
   # indicates if it should be a reference dataset, and contains +metadata+.
-  def save_to(project, name=nil, is_ref=true, metadata={})
-    name ||= ids.join(
+  def save_to(project, name = nil, is_ref = true, metadata = {})
+    name ||= ids.join('_').miga_name
     project = MiGA::Project.new(project) if project.is_a? String
-
+    MiGA::Dataset.exist?(project, name) and
       raise "Dataset #{name} exists in the project, aborting..."
-    end
     metadata = get_metadata(metadata)
     udb = @@UNIVERSE[universe][:dbs][db]
-    metadata["#{universe}_#{db}"] = ids.join(
-
-
-
-    base = "#{project.path}/data/#{dir}/#{name}"
-    l_ctg = "#{base}.LargeContigs.fna"
-    a_ctg = "#{base}.AllContigs.fna"
-    File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
-    if udb[:format] == :fasta_gz
-      download "#{l_ctg}.gz"
-      system "gzip -d '#{l_ctg}.gz'"
-    else
-      download l_ctg
-    end
-    File.unlink(a_ctg) if File.exist? a_ctg
-    File.symlink(File.basename(l_ctg), a_ctg)
-    File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
-    else
-      raise "Unexpected error: Unsupported result for database #{db}."
-    end
+    metadata["#{universe}_#{db}"] = ids.join(',')
+    respond_to?("save_#{udb[:stage]}_to", true) or
+      raise "Unexpected error: Unsupported stage #{udb[:stage]} for #{db}."
+    send "save_#{udb[:stage]}_to", project, name, udb
     dataset = MiGA::Dataset.new(project, name, is_ref, metadata)
     project.add_dataset(dataset.name)
-    result = dataset.add_result(udb[:stage], true, is_clean:true)
-
-
+    result = dataset.add_result(udb[:stage], true, is_clean: true)
+    result.nil? and
+      raise 'Empty dataset: seed result not added due to incomplete files.'
     result.clean!
     result.save
     dataset
   end
 
+  ##
+  # Updates the MiGA::Dataset +dataset+ with the remotely available metadata,
+  # and optionally the Hash +metadata+.
+  def update_metadata(dataset, metadata = {})
+    metadata = get_metadata(metadata)
+    metadata.each { |k,v| dataset.metadata[k] = v }
+    dataset.save
+  end
+
   ##
   # Get metadata from the remote location.
-  def get_metadata(metadata={})
+  def get_metadata(metadata = {})
     case universe
     when :ebi, :ncbi
       # Get taxonomy
       metadata[:tax] = get_ncbi_taxonomy
     end
+    metadata[:"#{universe}_#{db}"] = ids.join(",")
+    metadata = get_type_status(metadata)
     metadata
   end
 
-  ##
-  # Download data into +file+.
-  def download(file)
-    MiGA::RemoteDataset.download(universe, db, ids,
-      @@UNIVERSE[universe][:dbs][db][:format], file)
-  end
-
   ##
   # Get NCBI Taxonomy ID.
   def get_ncbi_taxid
     send("get_ncbi_taxid_from_#{universe}")
   end
 
+  ##
+  # Get the type material status and return an (updated)
+  # +metadata+ hash.
+  def get_type_status(metadata)
+    if metadata[:ncbi_asm]
+      get_type_status_ncbi_asm metadata
+    elsif metadata[:ncbi_nuccore]
+      get_type_status_ncbi_nuccore metadata
+    else
+      metadata
+    end
+  end
+
   ##
   # Get NCBI taxonomy as MiGA::Taxonomy.
   def get_ncbi_taxonomy
     lineage = {}
     tax_id = get_ncbi_taxid
-
-      doc = MiGA::RemoteDataset.download(:ebi, :taxonomy, tax_id,
+    until [nil, '0', '1'].include? tax_id
+      doc = MiGA::RemoteDataset.download(:ebi, :taxonomy, tax_id, '')
       name = doc.scan(/SCIENTIFIC NAME\s+:\s+(.+)/).first.to_a.first
       rank = doc.scan(/RANK\s+:\s+(.+)/).first.to_a.first
-      rank =
+      rank = 'dataset' if lineage.empty? and rank == 'no rank'
      lineage[rank] = name unless rank.nil?
      tax_id = doc.scan(/PARENT ID\s+:\s+(.+)/).first.to_a.first
    end
@@ -174,24 +117,72 @@ class MiGA::RemoteDataset < MiGA::MiGA
   end
 
   private
-
+
   def get_ncbi_taxid_from_ncbi
-    doc =
-    ln = doc.grep(
+    doc = self.class.download(universe, db, ids, :gb).split(/\n/)
+    ln = doc.grep(%r{^\s+/db_xref="taxon:}).first
     return nil if ln.nil?
-    ln.sub!(/.*(?:"taxon:)(\d+)["; ].*/,
+    ln.sub!(/.*(?:"taxon:)(\d+)["; ].*/, '\\1')
     return nil unless ln =~ /^\d+$/
     ln
   end
 
   def get_ncbi_taxid_from_ebi
-    doc =
-    ln = doc.grep(
+    doc = self.class.download(universe, db, ids, :annot).split(/\n/)
+    ln = doc.grep(%r{^FT\s+/db_xref="taxon:}).first
     ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
     return nil if ln.nil?
-    ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/,
+    ln.sub!(/.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/, '\\1')
     return nil unless ln =~ /^\d+$/
     ln
   end
 
+  def get_type_status_ncbi_nuccore(metadata)
+    return metadata if metadata[:ncbi_nuccore].nil?
+    biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
+      :nuccore, :biosample)
+    return metadata if biosample.nil?
+    asm = self.class.ncbi_map(biosample,
+      :biosample, :assembly)
+    metadata[:ncbi_asm] = asm.to_s unless asm.nil?
+    get_type_status_ncbi_asm metadata
+  end
+
+  def get_type_status_ncbi_asm(metadata)
+    return metadata if metadata[:ncbi_asm].nil?
+    doc = CGI.unescapeHTML(self.class.download(:web, :text,
+      "https://www.ncbi.nlm.nih.gov/assembly/" \
+      "#{metadata[:ncbi_asm]}?report=xml", :xml)).each_line
+    from_type = doc.grep(%r{<FromType/?>}).first or return metadata
+    if from_type =~ %r{<FromType/>}
+      metadata[:is_type] = false
+      metadata[:is_ref_type] = false
+    elsif from_type =~ %r{<FromType>(.*)</FromType>}
+      if $1 == 'assembly from reference material'
+        metadata[:is_type] = false
+        metadata[:is_ref_type] = true
+      else
+        metadata[:is_type] = true
+      end
+      metadata[:type_rel] = $1
+    end
+    metadata
+  end
+
+  def save_assembly_to(project, name, udb)
+    dir = MiGA::Dataset.RESULT_DIRS[:assembly]
+    base = "#{project.path}/data/#{dir}/#{name}"
+    l_ctg = "#{base}.LargeContigs.fna"
+    a_ctg = "#{base}.AllContigs.fna"
+    File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s }
+    if udb[:format] == :fasta_gz
+      download "#{l_ctg}.gz"
+      system "gzip -d '#{l_ctg}.gz'"
+    else
+      download l_ctg
+    end
+    File.unlink(a_ctg) if File.exist? a_ctg
+    File.symlink(File.basename(l_ctg), a_ctg)
+    File.open("#{base}.done", 'w') { |ofh| ofh.puts Time.now.to_s }
+  end
 end
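
Note: the headline addition here is type-material detection. `get_type_status_ncbi_asm` fetches the NCBI assembly report as XML and maps its `<FromType>` element onto the new metadata flags. Condensed into a hypothetical standalone function (an empty string stands in for the self-closing `<FromType/>`):

  def type_flags(from_type)
    case from_type
    when nil then {}  # no element at all: metadata left untouched
    when ''  then { is_type: false, is_ref_type: false }
    when 'assembly from reference material'
      { is_type: false, is_ref_type: true, type_rel: from_type }
    else
      { is_type: true, type_rel: from_type }
    end
  end
  p type_flags('assembly from type material')
  # => {:is_type=>true, :type_rel=>"assembly from type material"}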
data/lib/miga/remote_dataset/base.rb
CHANGED
@@ -1,6 +1,7 @@
 
-require '
+require 'rest-client'
 require 'open-uri'
+require 'cgi'
 
 class MiGA::RemoteDataset < MiGA::MiGA
 
@@ -13,7 +14,7 @@ end
 
 module MiGA::RemoteDataset::Base
 
-  @@_EUTILS =
+  @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
 
 ##
 # Structure of the different database Universes or containers. The structure
@@ -23,33 +24,38 @@ module MiGA::RemoteDataset::Base
 #   properties such as +stage+, +format+, and +map_to+.
 # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
 #   is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
-#
-#
+#   Additional parameters can be passed to certain functions using the +extra+
+#   option.
+# - +method+ => Method used to query the URL. Only +:rest+ and +:net+ are
+#   currently supported.
 # - +map_to_universe+ => Universe where results map to. Currently unsupported.
 @@UNIVERSE = {
-  web:{
+  web: {
     dbs: {
-      assembly:{stage: :assembly, format: :fasta},
-      assembly_gz:{stage: :assembly, format: :fasta_gz}
+      assembly: {stage: :assembly, format: :fasta},
+      assembly_gz: {stage: :assembly, format: :fasta_gz},
+      text: {stage: :metadata, format: :text}
    },
    url: "%2$s",
    method: :net
  },
-  ebi:{
-    dbs: { embl:{stage: :assembly, format: :fasta} },
-    url: "
+  ebi: {
+    dbs: { embl: {stage: :assembly, format: :fasta} },
+    url: "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
    method: :rest
  },
-  ncbi:{
-    dbs: { nuccore:{stage: :assembly, format: :fasta} },
+  ncbi: {
+    dbs: { nuccore: {stage: :assembly, format: :fasta} },
    url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
    method: :rest
  },
-  ncbi_map:{
-    dbs: {
-
-
-
+  ncbi_map: {
+    dbs: {
+      nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
+        format: :json},
+      biosample: {stage: :metadata, map_to: [:assembly], format: :json}
+    },
+    url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
    method: :rest,
    map_to_universe: :ncbi
  }
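
Note: the `ncbi_map` universe's URL pattern now takes a fourth positional argument (`%4$s`, the target database), which `download_rest`/`download_net` fill from the new `extra` parameter. For a hypothetical nuccore accession mapped to biosample, the elink URL would come out roughly as:

  eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
  pattern = "#{eutils}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s"
  puts sprintf(pattern, :nuccore, 'X00000', :json, :biosample)  # 'X00000' is a placeholder ID
  # => .../elink.fcgi?dbfrom=nuccore&id=X00000&db=biosample&retmode=json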
data/lib/miga/remote_dataset/download.rb
ADDED
@@ -0,0 +1,84 @@
+
+require 'miga/remote_dataset/base'
+
+class MiGA::RemoteDataset
+  include MiGA::RemoteDataset::Base
+
+  # Class-level
+  class << self
+    ##
+    # Download data from the +universe+ in the database +db+ with IDs +ids+ and
+    # in +format+. If passed, it saves the result in +file+. Additional
+    # parameters specific to the download method can be passed using +extra+.
+    # Returns String.
+    def download(universe, db, ids, format, file = nil, extra = [])
+      ids = [ids] unless ids.is_a? Array
+      case @@UNIVERSE[universe][:method]
+      when :rest
+        doc = download_rest(universe, db, ids, format, extra)
+      when :net
+        doc = download_net(universe, db, ids, format, extra)
+      end
+      unless file.nil?
+        ofh = File.open(file, 'w')
+        ofh.print doc
+        ofh.close
+      end
+      doc
+    end
+
+    ##
+    # Download data using a REST method from the +universe+ in the database +db+
+    # with IDs +ids+ and in +format+. Additional URL parameters can be passed
+    # using +extra+. Returns the doc as String.
+    def download_rest(universe, db, ids, format, extra = [])
+      u = @@UNIVERSE[universe]
+      url ||= sprintf(u[:url], db, ids.join(","), format, *extra)
+      response = RestClient::Request.execute(method: :get, url:url, timeout:600)
+      unless response.code == 200
+        raise "Unable to reach #{universe} client, error code #{response.code}."
+      end
+      response.to_s
+    end
+
+    ##
+    # Download data using a GET request from the +universe+ in the database +db+
+    # with IDs +ids+ and in +format+. Additional URL parameters can be passed
+    # using +extra+. Returns the doc as String.
+    def download_net(universe, db, ids, format, extra = [])
+      u = @@UNIVERSE[universe]
+      url = sprintf(u[:url], db, ids.join(","), format, *extra)
+      doc = ""
+      @timeout_try = 0
+      begin
+        open(url) { |f| doc = f.read }
+      rescue Net::ReadTimeout
+        @timeout_try += 1
+        if @timeout_try > 3 ; raise Net::ReadTimeout
+        else ; retry
+        end
+      end
+      doc
+    end
+
+    ##
+    # Looks for the entry +id+ in +dbfrom+, and returns the linked
+    # identifier in +db+ (or nil).
+    def ncbi_map(id, dbfrom, db)
+      doc = download(:ncbi_map, dbfrom, id, :json, nil, [db])
+      return if doc.empty?
+      tree = JSON.parse(doc, symbolize_names: true)
+      tree.dig(:linksets, 0, :linksetdbs, 0, :links, 0)
+    end
+  end
+end
+
+module MiGA::RemoteDataset::Download
+
+  ##
+  # Download data into +file+.
+  def download(file)
+    self.class.download(universe, db, ids,
+      self.class.UNIVERSE[universe][:dbs][db][:format], file)
+  end
+end
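
Note: `ncbi_map` assumes the elink JSON shape `linksets[0].linksetdbs[0].links[0]` and uses `Hash#dig`, which requires Ruby >= 2.3 — matching the `required_ruby_version` bump in the gemspec metadata below. A sketch against a hypothetical, abbreviated payload:

  require 'json'

  doc = '{"linksets":[{"linksetdbs":[{"links":[1234567]}]}]}'
  tree = JSON.parse(doc, symbolize_names: true)
  p tree.dig(:linksets, 0, :linksetdbs, 0, :links, 0)  # => 1234567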
data/lib/miga/result/dates.rb
CHANGED
@@ -1,24 +1,25 @@
 
-require
+require 'miga/result/base'
 
 ##
 # Helper module including date-specific functions for results.
 module MiGA::Result::Dates
-
   include MiGA::Result::Base
-
+
   ##
-  # Returns the start date of processing as DateTime or +nil+ if it doesn't
+  # Returns the start date of processing as DateTime or +nil+ if it doesn't
+  # exist.
   def started_at
     date_at :start
   end
 
   ##
-  # Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
+  # Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
+  # exist.
   def done_at
     date_at :done
   end
-
+
   ##
   # Time it took for the result to complete as Float in minutes.
   def running_time
@@ -27,16 +28,17 @@ module MiGA::Result::Dates
     (b - a).to_f * 24 * 60
   end
 
-
   private
 
   ##
   # Internal function to detect start and end dates
   def date_at(event)
-
-
-
+    date = self[event]
+    if date.nil?
+      f = path event
+      date = File.read(f) if File.size? f
+    end
+    date.nil? ? nil : DateTime.parse(date)
   end
-
 end
 
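
Note: `running_time` relies on DateTime subtraction returning the elapsed time in days (as a Rational), hence the `* 24 * 60` conversion to minutes. A worked example with hypothetical timestamps:

  require 'date'

  a = DateTime.parse('2018-09-06 10:00:00')
  b = DateTime.parse('2018-09-06 10:45:30')
  puts (b - a).to_f * 24 * 60  # => 45.5 (minutes)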
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3,
+  VERSION = [0.3, 4, 1]
 
   ##
   # Nickname for the current major.minor version.
data/scripts/assembly.bash
CHANGED
@@ -19,7 +19,7 @@ if [[ -s "$TF/$DATASET.1.fasta" \
   FastA.interpose.pl "$TF/$DATASET.CoupledReads.fa" "$TF/$DATASET".[12].fasta
   gzip -9 -f "$TF/$DATASET.1.fasta"
   gzip -9 -f "$TF/$DATASET.2.fasta"
-  miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta
+  miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
 fi
 
 # Assemble
data/scripts/read_quality.bash
CHANGED
@@ -28,6 +28,7 @@ rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.paired
 rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.single
 rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed
 rm -f "../02.trimmed_reads/$b".[12].fastq
+miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
 
 # Finalize
 miga date > "$DATASET.done"
data/scripts/trimmed_fasta.bash
CHANGED
@@ -19,6 +19,7 @@ for sis in 1 2 ; do
     && ! -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
     && gunzip "../02.trimmed_reads/$b.$sis.clipped.fastq.gz"
 done
+miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
 
 # FastQ -> FastA
 FQ2A="$MIGA/utils/enveomics/Scripts/FastQ.toFastA.awk"
@@ -44,6 +45,8 @@ for sis in 1 2 ; do
   [[ -e "../02.trimmed_reads/$b.$sis.clipped.single.fastq" ]] \
     && gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.single.fastq"
 done
+miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
+miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
 
 # Finalize
 miga date > "$DATASET.done"
data/scripts/trimmed_reads.bash
CHANGED
@@ -18,6 +18,7 @@ miga date > "$DATASET.start"
   && gunzip "../01.raw_reads/$b.1.fastq.gz"
 [[ -e "../01.raw_reads/$b.2.fastq.gz" && ! -e "../01.raw_reads/$b.2.fastq" ]] \
   && gunzip "../01.raw_reads/$b.2.fastq.gz"
+miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
 
 # Clean existing files
 exists "$b".[12].* && rm "$b".[12].*
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 0.3.
+  version: 0.3.4.1
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-
+date: 2018-09-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rest-client
@@ -162,6 +162,7 @@ files:
 - lib/miga/project/result.rb
 - lib/miga/remote_dataset.rb
 - lib/miga/remote_dataset/base.rb
+- lib/miga/remote_dataset/download.rb
 - lib/miga/result.rb
 - lib/miga/result/base.rb
 - lib/miga/result/dates.rb
@@ -499,7 +500,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
   - !ruby/object:Gem::Version
-    version: '
+    version: '2.3'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="