miga-base 1.3.8.2 → 1.3.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/miga/cli/action/add_result.rb +22 -1
- data/lib/miga/cli/action/browse/about.html +4 -2
- data/lib/miga/cli/action/download/gtdb.rb +1 -1
- data/lib/miga/cli/action/download/ncbi.rb +43 -68
- data/lib/miga/cli/action/download/seqcode.rb +1 -2
- data/lib/miga/cli/action/ncbi_get.rb +1 -8
- data/lib/miga/cli/action/wf.rb +15 -6
- data/lib/miga/cli/objects_helper.rb +3 -0
- data/lib/miga/cli/opt_helper.rb +8 -2
- data/lib/miga/common/net.rb +100 -18
- data/lib/miga/dataset/base.rb +40 -12
- data/lib/miga/dataset/hooks.rb +8 -0
- data/lib/miga/dataset/result/ignore.rb +14 -2
- data/lib/miga/dataset/type.rb +51 -0
- data/lib/miga/dataset.rb +3 -22
- data/lib/miga/json.rb +9 -0
- data/lib/miga/project/base.rb +15 -9
- data/lib/miga/project.rb +7 -1
- data/lib/miga/remote_dataset/base.rb +117 -36
- data/lib/miga/remote_dataset/download.rb +121 -54
- data/lib/miga/remote_dataset.rb +34 -13
- data/lib/miga/result/stats.rb +2 -0
- data/lib/miga/result/versions.rb +23 -0
- data/lib/miga/result.rb +7 -1
- data/lib/miga/taxonomy/base.rb +3 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/assembly.bash +15 -1
- data/scripts/cds.bash +9 -3
- data/scripts/distances.bash +103 -5
- data/scripts/essential_genes.bash +14 -1
- data/scripts/mytaxa.bash +18 -3
- data/scripts/mytaxa_scan.bash +16 -3
- data/scripts/read_quality.bash +6 -2
- data/scripts/ssu.bash +19 -1
- data/scripts/stats.bash +9 -3
- data/scripts/taxonomy.bash +98 -2
- data/scripts/trimmed_fasta.bash +10 -2
- data/scripts/trimmed_reads.bash +26 -6
- data/test/dataset_test.rb +17 -2
- data/test/hook_test.rb +3 -2
- data/test/net_test.rb +21 -5
- data/test/project_test.rb +13 -0
- data/test/remote_dataset_test.rb +106 -7
- data/test/result_test.rb +47 -21
- data/test/taxonomy_test.rb +9 -3
- data/utils/distance/runner.rb +3 -1
- data/utils/distances.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a4aa208ac4dfe7ff6edbedb7aa7b3444c909c77e47b2ef1b93282adc83192d6
|
4
|
+
data.tar.gz: 78bdce8752b3a1a1a281123a99cccdc002d0f95a2a345aa16b1c4ee684220c29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 335e7a8715a6c561b618e21d1e273381f88d384daf16360c01f9b6818ec374f05f6ae824a36628fbe4dcb0c7d6e082f633e8372137edfa035389cb92922363fa
|
7
|
+
data.tar.gz: 76bdf75348e711bdfcca6b973acd3768559acde7ee80233833ae8b8caa595020599756b7913774dc53c038290592d20710b1b67bd5fa8bb93fd334206bf21b06
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@
|
|
12
12
|
For additional information on MiGA, visit:
|
13
13
|
|
14
14
|
* [MiGA Online][miga-online]: The Microbial Genomes Atlas Online
|
15
|
-
* [MiGA
|
15
|
+
* [MiGA Gateway][miga-gateway]: The MiGA Science Gateway
|
16
16
|
* [MiGA users list][mailing-list]:
|
17
17
|
Forum to discuss with other users and developers
|
18
18
|
* [MiGA manual][manual]: The definitive guide to MiGA
|
@@ -57,6 +57,6 @@ See [LICENSE](LICENSE).
|
|
57
57
|
[miga-web]: https://github.com/bio-miga/miga-web
|
58
58
|
[miga-gui]: https://github.com/bio-miga/miga-gui
|
59
59
|
[miga-online]: http://microbial-genomes.org/
|
60
|
-
[miga-
|
60
|
+
[miga-gateway]: https://gateway.microbial-genomes.org/
|
61
61
|
[kostas]: http://enve-omics.gatech.edu/
|
62
62
|
[rdp]: http://rdp.cme.msu.edu/
|
@@ -5,13 +5,17 @@ require 'miga/cli/action'
|
|
5
5
|
|
6
6
|
class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
|
7
7
|
def parse_cli
|
8
|
-
cli.defaults = { force: false }
|
8
|
+
cli.defaults = { force: false, stdin_versions: false }
|
9
9
|
cli.parse do |opt|
|
10
10
|
cli.opt_object(opt, [:project, :dataset_opt, :result])
|
11
11
|
opt.on(
|
12
12
|
'-f', '--force',
|
13
13
|
'Force re-indexing of the result even if it\'s already registered'
|
14
14
|
) { |v| cli[:force] = v }
|
15
|
+
opt.on(
|
16
|
+
'--stdin-versions',
|
17
|
+
'Read Software versions from STDIN'
|
18
|
+
) { |v| cli[:stdin_versions] = v }
|
15
19
|
end
|
16
20
|
end
|
17
21
|
|
@@ -21,5 +25,22 @@ class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
|
|
21
25
|
cli.say "Registering result: #{cli[:result]}"
|
22
26
|
r = obj.add_result(cli[:result], true, force: cli[:force])
|
23
27
|
raise 'Cannot add result, incomplete expected files' if r.nil?
|
28
|
+
|
29
|
+
# Add Software version data
|
30
|
+
if cli[:stdin_versions]
|
31
|
+
versions = {}
|
32
|
+
sw = nil
|
33
|
+
$stdin.each do |ln|
|
34
|
+
ln = ln.chomp.strip
|
35
|
+
if ln =~ /^=> (.*)/
|
36
|
+
sw = $1
|
37
|
+
versions[sw] = ''
|
38
|
+
else
|
39
|
+
versions[sw] += ln
|
40
|
+
end
|
41
|
+
end
|
42
|
+
r.add_versions(versions)
|
43
|
+
r.save
|
44
|
+
end
|
24
45
|
end
|
25
46
|
end
|
@@ -12,10 +12,12 @@ terms of the terms of the
|
|
12
12
|
<p>
|
13
13
|
MiGA is the result of a collaboration between the
|
14
14
|
<a href='http://enve-omics.gatech.edu/'>Kostas Lab</a>
|
15
|
-
(<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>)
|
15
|
+
(<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>), the
|
16
16
|
<a href='http://rdp.cme.msu.edu/'>RDP team</a>
|
17
17
|
(<a href='http://cme.msu.edu/'>Center for Microbial Ecology</a>,
|
18
|
-
<a href='https://msu.edu/'>Michigan State University</a>)
|
18
|
+
<a href='https://msu.edu/'>Michigan State University</a>), and the
|
19
|
+
<a href="https://disc-genomics.uibk.ac.at/">Rodriguez-R lab</a>
|
20
|
+
(<a href="https://uibk.ac.at/">University of Innsbruck</a>).
|
19
21
|
The MiGA project is funded by the
|
20
22
|
<a href='http://nsf.gov/'>US National Science Foundation</a>
|
21
23
|
(Awards <a href='http://nsf.gov/awardsearch/showAward?AWD_ID=1356288'>#1356288</a> &
|
@@ -31,7 +31,7 @@ module MiGA::Cli::Action::Download::Gtdb
|
|
31
31
|
|
32
32
|
def remote_list
|
33
33
|
cli.say 'Downloading genome list'
|
34
|
-
extra =
|
34
|
+
extra = { sp_reps_only: cli[:reference].to_s }
|
35
35
|
json = MiGA::RemoteDataset.download(
|
36
36
|
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
|
37
37
|
)
|
@@ -34,11 +34,8 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
34
34
|
'Do not add sequence version to the dataset name',
|
35
35
|
'Only affects --complete and --chromosome'
|
36
36
|
) { |v| cli[:add_version] = v }
|
37
|
-
|
38
|
-
|
39
|
-
'Use dataset names based on chromosome entries instead of assembly',
|
40
|
-
:legacy_name
|
41
|
-
)
|
37
|
+
# For backwards compatibility
|
38
|
+
cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
|
42
39
|
end
|
43
40
|
|
44
41
|
def sanitize_cli
|
@@ -52,89 +49,67 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
52
49
|
end
|
53
50
|
|
54
51
|
def remote_list
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
52
|
+
list = {}
|
53
|
+
query = remote_list_query
|
54
|
+
loop do
|
55
|
+
# Query the remote collection
|
56
|
+
page = MiGA::Json.parse(
|
57
|
+
MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
|
58
|
+
contents: true
|
59
|
+
)
|
60
|
+
break unless page&.any? && page[:reports]&.any?
|
61
|
+
|
62
|
+
# Process reports in this page
|
63
|
+
list.merge!(parse_reports_as_datasets(page[:reports]))
|
64
|
+
|
65
|
+
# Next page
|
66
|
+
break unless page[:next_page_token]
|
67
|
+
query[:page_token] = page[:next_page_token]
|
68
|
+
end
|
69
|
+
list
|
67
70
|
end
|
68
71
|
|
69
|
-
def
|
72
|
+
def parse_reports_as_datasets(reports)
|
70
73
|
ds = {}
|
71
|
-
|
72
|
-
asm = r[
|
74
|
+
reports.each do |r|
|
75
|
+
asm = r[:accession]
|
73
76
|
next if asm.nil? || asm.empty? || asm == '-'
|
74
77
|
|
75
|
-
rep = remote_row_replicons(r)
|
76
|
-
n = remote_row_name(r, rep, asm)
|
77
|
-
|
78
78
|
# Register for download
|
79
|
+
n = remote_report_name(r, asm)
|
79
80
|
ds[n] = {
|
80
81
|
ids: [asm], db: :assembly, universe: :ncbi,
|
81
82
|
md: {
|
82
|
-
type: :genome, ncbi_asm: asm, strain: r
|
83
|
+
type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain)
|
83
84
|
}
|
84
85
|
}
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
end
|
86
|
+
date = r.dig(:assembly_info, :release_date)
|
87
|
+
ds[n][:md][:release_date] = Time.parse(date).to_s if date
|
88
|
+
ds[n][:md][:ncbi_dataset] = r
|
89
89
|
end
|
90
90
|
ds
|
91
91
|
end
|
92
92
|
|
93
|
-
def
|
94
|
-
|
95
|
-
|
96
|
-
r['replicons']
|
97
|
-
.split('; ')
|
98
|
-
.map { |i| i.gsub(/.*:/, '') }
|
99
|
-
.map { |i| i.gsub(%r{/.*}, '') }
|
100
|
-
end
|
101
|
-
|
102
|
-
def remote_row_name(r, rep, asm)
|
103
|
-
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
104
|
-
|
105
|
-
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
106
|
-
acc = rep.nil? ? '' : rep.first
|
107
|
-
else
|
108
|
-
acc = asm
|
109
|
-
end
|
93
|
+
def remote_report_name(r, asm)
|
94
|
+
acc = "#{asm}"
|
110
95
|
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
111
|
-
|
96
|
+
org = r.dig(:organism, :organism_name)
|
97
|
+
acc = "#{org}_#{acc}" if org
|
98
|
+
acc.miga_name
|
112
99
|
end
|
113
100
|
|
114
|
-
def
|
115
|
-
|
116
|
-
url_param = {
|
117
|
-
q: '[display()].' \
|
118
|
-
'from(GenomeAssemblies).' \
|
119
|
-
'usingschema(/schema/GenomeAssemblies).' \
|
120
|
-
'matching(tab==["Prokaryotes"] and q=="' \
|
121
|
-
"#{cli[:taxon]&.tr('"', "'")}\"",
|
122
|
-
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
123
|
-
'level|level,release_date|release_date,strain|strain',
|
124
|
-
nolimit: 'on'
|
125
|
-
}
|
101
|
+
def remote_list_query
|
102
|
+
q = { taxons: [cli[:taxon]], filters: {} }
|
126
103
|
if cli[:reference]
|
127
|
-
|
104
|
+
q[:filters][:reference_only] = true
|
128
105
|
else
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
}.map { |k, v| '"' + v + '"' if cli[k] }.compact
|
135
|
-
url_param[:q] += ' and level==[' + status + ']'
|
106
|
+
q[:assembly_level] = {
|
107
|
+
contig: 'contig',
|
108
|
+
scaffold: 'scaffold',
|
109
|
+
chromosome: 'chromosome',
|
110
|
+
complete: 'complete_genome'
|
111
|
+
}.map { |k, v| '"' + v + '"' if cli[k] }.compact
|
136
112
|
end
|
137
|
-
|
138
|
-
url_base + URI.encode_www_form(url_param)
|
113
|
+
q
|
139
114
|
end
|
140
115
|
end
|
@@ -29,8 +29,7 @@ module MiGA::Cli::Action::Download::Seqcode
|
|
29
29
|
|
30
30
|
while current_page <= total_pages
|
31
31
|
json = MiGA::RemoteDataset.download(
|
32
|
-
:seqcode, :'type-genomes', nil, :json, nil,
|
33
|
-
["page=#{current_page}"]
|
32
|
+
:seqcode, :'type-genomes', nil, :json, nil, page: current_page
|
34
33
|
)
|
35
34
|
doc = MiGA::Json.parse(json, contents: true)
|
36
35
|
current_page = doc[:current_page] + 1
|
@@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
8
8
|
|
9
9
|
def parse_cli
|
10
10
|
cli.defaults = {
|
11
|
-
query: false, unlink: false,
|
12
|
-
reference: false, legacy_name: false,
|
11
|
+
query: false, unlink: false, reference: false,
|
13
12
|
complete: false, chromosome: false,
|
14
13
|
scaffold: false, contig: false, add_version: true, dry: false,
|
15
14
|
get_md: false, only_md: false, save_every: 1
|
@@ -29,12 +28,6 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
29
28
|
'--api-key STRING',
|
30
29
|
'::HIDE::' # For backwards compatibility
|
31
30
|
) { |v| ENV['NCBI_API_KEY'] = v }
|
32
|
-
opt.on(
|
33
|
-
'--ncbi-table-file STRING',
|
34
|
-
'::HIDE::' # Only meant for debugging
|
35
|
-
# It can take the table returned by NCBI and parse it from a file
|
36
|
-
# instead of downloading it directly
|
37
|
-
) { |v| cli[:ncbi_table_file] = v }
|
38
31
|
opt.on(
|
39
32
|
'--ncbi-api-key STRING',
|
40
33
|
'NCBI API key'
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -8,7 +8,8 @@ module MiGA::Cli::Action::Wf
|
|
8
8
|
cli.expect_files = true
|
9
9
|
cli.defaults = {
|
10
10
|
clean: false, project_type: :genomes, dataset_type: :popgenome,
|
11
|
-
ncbi_draft: true,
|
11
|
+
ncbi_draft: true, ncbi_ref: false,
|
12
|
+
min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
|
12
13
|
prepare_and_exit: false
|
13
14
|
}
|
14
15
|
end
|
@@ -39,14 +40,21 @@ module MiGA::Cli::Action::Wf
|
|
39
40
|
'-T', '--ncbi-taxon STRING',
|
40
41
|
'Download all the genomes in NCBI classified as this taxon'
|
41
42
|
) { |v| cli[:ncbi_taxon] = v }
|
43
|
+
opt.on(
|
44
|
+
'--no-draft', '::HIDE::' # Deprecated
|
45
|
+
) { |v| cli[:ncbi_draft] = v }
|
46
|
+
opt.on(
|
47
|
+
'--ncbi-complete',
|
48
|
+
'Only download complete genomes, not drafts (requires -T)'
|
49
|
+
) { |v| cli[:ncbi_draft] = !v }
|
50
|
+
opt.on(
|
51
|
+
'--ncbi-ref',
|
52
|
+
'Only download RefSeq reference genomes (requires -T)'
|
53
|
+
) { |v| cli[:ncbi_ref] = v }
|
42
54
|
opt.on(
|
43
55
|
'-G', '--gtdb-taxon STRING',
|
44
56
|
'Download all the genomes in GTDB classified as this taxon'
|
45
57
|
) { |v| cli[:gtdb_taxon] = v }
|
46
|
-
opt.on(
|
47
|
-
'--no-draft',
|
48
|
-
'Only download complete genomes, not drafts (requires -T)'
|
49
|
-
) { |v| cli[:ncbi_draft] = v }
|
50
58
|
opt.on(
|
51
59
|
'--gtdb-ref',
|
52
60
|
'Only download reference anchor genomes in GTDB (requires -G)'
|
@@ -170,7 +178,8 @@ module MiGA::Cli::Action::Wf
|
|
170
178
|
def download_datasets
|
171
179
|
# Download datasets from NCBI
|
172
180
|
unless cli[:ncbi_taxon].nil?
|
173
|
-
what = cli[:
|
181
|
+
what = cli[:ncbi_ref] ? '--reference' :
|
182
|
+
cli[:ncbi_draft] ? '--all' : '--complete'
|
174
183
|
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
|
175
184
|
cmd += ['--max', cli[:max_download]] if cli[:max_download]
|
176
185
|
call_cli(cmd)
|
@@ -60,6 +60,9 @@ module MiGA::Cli::ObjectsHelper
|
|
60
60
|
o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
|
61
61
|
o &&= (d.active? == self[:active]) unless self[:active].nil?
|
62
62
|
o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
|
63
|
+
unless self[:markers].nil?
|
64
|
+
o &&= (self[:markers] ? d.markers? : !d.markers?)
|
65
|
+
end
|
63
66
|
unless self[:taxonomy].nil?
|
64
67
|
o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
|
65
68
|
end
|
data/lib/miga/cli/opt_helper.rb
CHANGED
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
|
|
43
43
|
'-h', '--help',
|
44
44
|
'Display this screen'
|
45
45
|
) do
|
46
|
-
puts opt.
|
46
|
+
puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
|
47
47
|
exit
|
48
48
|
end
|
49
49
|
opt.separator ''
|
@@ -120,10 +120,11 @@ module MiGA::Cli::OptHelper
|
|
120
120
|
# as determined by +what+ an Array with any combination of:
|
121
121
|
# - :ref To filter by reference (--ref) or query (--no-ref)
|
122
122
|
# - :multi To filter by multiple (--multi) or single (--no-multi) species
|
123
|
+
# - :markers To filter by datasets with (--markers) or without (--no-markers) markers
|
123
124
|
# - :active To filter by active (--active) or inactive (--no-active)
|
124
125
|
# - :taxonomy To filter by taxonomy (--taxonomy)
|
125
126
|
# The "k-th" filter (--dataset-k) is always included
|
126
|
-
def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
|
127
|
+
def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
|
127
128
|
what.each do |w|
|
128
129
|
case w
|
129
130
|
when :ref
|
@@ -136,6 +137,11 @@ module MiGA::Cli::OptHelper
|
|
136
137
|
'--[no-]multi',
|
137
138
|
'Use only multi-species (or only single-species) datasets'
|
138
139
|
) { |v| self[:multi] = v }
|
140
|
+
when :markers
|
141
|
+
opt.on(
|
142
|
+
'--[no-]markers',
|
143
|
+
'Use only datasets with (or without) markers'
|
144
|
+
) { |v| self[:markers] = v }
|
139
145
|
when :active
|
140
146
|
opt.on(
|
141
147
|
'--[no-]active',
|
data/lib/miga/common/net.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'net/http'
|
3
4
|
require 'net/ftp'
|
4
5
|
require 'open-uri'
|
5
6
|
require 'fileutils'
|
@@ -10,6 +11,8 @@ Net::FTP.const_set('FTP_PORT', 21)
|
|
10
11
|
##
|
11
12
|
# General web-access functions shared throughout MiGA.
|
12
13
|
module MiGA::Common::Net
|
14
|
+
attr_accessor :remote_connection_uri
|
15
|
+
|
13
16
|
##
|
14
17
|
# Returns the URL of the host +name+ (Symbol)
|
15
18
|
def known_hosts(name)
|
@@ -21,7 +24,7 @@ module MiGA::Common::Net
|
|
21
24
|
when :miga_dist
|
22
25
|
"ftp://#{main_server}/dist"
|
23
26
|
else
|
24
|
-
raise "Unrecognized server name: #{
|
27
|
+
raise "Unrecognized server name: #{name}"
|
25
28
|
end
|
26
29
|
end
|
27
30
|
|
@@ -32,49 +35,128 @@ module MiGA::Common::Net
|
|
32
35
|
end
|
33
36
|
|
34
37
|
##
|
35
|
-
# Connect to an FTP +host+ (String)
|
36
|
-
# +.known_hosts+)
|
38
|
+
# Connect to an FTP +host+ (String), a known host name (Symbol, see
|
39
|
+
# +.known_hosts+), or a parsed +URI+ object
|
40
|
+
#
|
41
|
+
# Sets the attribute +remote_connection_uri+ to the parsed +URI+ object
|
42
|
+
# silently
|
37
43
|
def remote_connection(host)
|
38
44
|
host = known_hosts(host) if host.is_a?(Symbol)
|
39
|
-
uri = URI.parse(host)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
ftp
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
uri = host.is_a?(URI) ? host : URI.parse(host)
|
46
|
+
@remote_connection_uri = uri
|
47
|
+
|
48
|
+
case uri.scheme
|
49
|
+
when 'ftp'
|
50
|
+
ftp = Net::FTP.new(uri.host)
|
51
|
+
ftp.passive = true
|
52
|
+
ftp.login
|
53
|
+
ftp.chdir(uri.path) unless host.is_a?(URI)
|
54
|
+
ftp
|
55
|
+
when 'http', 'https'
|
56
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
57
|
+
http.read_timeout = 600
|
58
|
+
http.use_ssl = uri.scheme == 'https'
|
59
|
+
http
|
60
|
+
else
|
61
|
+
raise 'Only FTP, HTTP, and HTTPS are currently supported'
|
62
|
+
end
|
47
63
|
end
|
48
64
|
|
49
65
|
##
|
50
66
|
# Download a file via FTP using the +connection+ (returned by
|
51
|
-
# +.remote_connection+) with remote name +file+ into local +target+.
|
67
|
+
# +.remote_connection+) with remote name +file+ into local +target+. If +file+
|
68
|
+
# is +nil+, it tries to guess the file from +connection+. If +target+ is
|
69
|
+
# +nil+, it returns the read data instead
|
52
70
|
#
|
53
|
-
# Alternatively, +connection+ can simply be the host (String)
|
54
|
-
# Symbol (see +.remote_connection+),
|
55
|
-
# connection automatically
|
71
|
+
# Alternatively, +connection+ can simply be the host (String), a recognized
|
72
|
+
# Symbol (see +.remote_connection+), or a parsed +URI+ object, in which case
|
73
|
+
# the function opens the connection automatically
|
56
74
|
#
|
57
75
|
# Reports progress to the function block with two arguments: the
|
58
76
|
# currently transferred size and the total file size
|
59
|
-
def download_file_ftp(connection, file, target)
|
77
|
+
def download_file_ftp(connection, file = nil, target = nil)
|
60
78
|
# Open connection unless passed
|
61
79
|
close_conn = false
|
62
|
-
if connection.is_a?(String) || connection.is_a?(Symbol)
|
80
|
+
if connection.is_a?(String) || connection.is_a?(Symbol) ||
|
81
|
+
connection.is_a?(URI)
|
63
82
|
connection = remote_connection(connection)
|
83
|
+
file ||= remote_connection_uri.path
|
64
84
|
close_conn = true
|
65
85
|
end
|
66
86
|
|
67
87
|
# Prepare download
|
68
|
-
FileUtils.mkdir_p(File.dirname(target))
|
88
|
+
FileUtils.mkdir_p(File.dirname(target)) if target
|
69
89
|
filesize = connection.size(file)
|
70
90
|
transferred = 0
|
71
91
|
|
72
92
|
# Get in chunks of 1KiB
|
93
|
+
ret = ''
|
73
94
|
connection.getbinaryfile(file, target, 1024) do |data|
|
74
95
|
yield(transferred += data.size, filesize) if block_given?
|
96
|
+
ret += data unless target
|
75
97
|
end
|
76
98
|
|
77
99
|
# Close connection if automatically opened
|
78
100
|
connection.close if close_conn
|
101
|
+
ret unless target
|
102
|
+
end
|
103
|
+
|
104
|
+
##
|
105
|
+
# Submit an HTTP or HTTPS request using +url+, which should be a URL
|
106
|
+
# either as String or parsed URI. The request follows the +method+, which
|
107
|
+
# should be a Net::HTTP verb such as +:get+, +:post+, or +:patch+. All
|
108
|
+
# additional parameters for the corresponding method should be passed as
|
109
|
+
# +opts+.
|
110
|
+
def http_request(method, url, *opts)
|
111
|
+
doc = nil
|
112
|
+
remote_connection(url).start do |http|
|
113
|
+
res = http.send(method, remote_connection_uri.to_s, *opts)
|
114
|
+
if %w[301 302].include?(res.code)
|
115
|
+
DEBUG "REDIRECTION #{res.code}: #{res['location']}"
|
116
|
+
return http_request(method, res['location'], *opts)
|
117
|
+
end
|
118
|
+
res.value # To force exception unless success
|
119
|
+
doc = res.body
|
120
|
+
end
|
121
|
+
doc
|
122
|
+
end
|
123
|
+
|
124
|
+
def net_method(method, uri, *opts)
|
125
|
+
attempts ||= 0
|
126
|
+
DEBUG "#{method.to_s.upcase}: #{uri} #{opts}"
|
127
|
+
case method.to_sym
|
128
|
+
when :ftp
|
129
|
+
download_file_ftp(uri)
|
130
|
+
else
|
131
|
+
http_request(method, uri, *opts)
|
132
|
+
end
|
133
|
+
rescue => e
|
134
|
+
raise e if (attempts += 1) >= 3
|
135
|
+
|
136
|
+
sleep 5 # <- For: 429 Too Many Requests
|
137
|
+
DEBUG "RETRYING after: #{e}"
|
138
|
+
retry
|
139
|
+
end
|
140
|
+
|
141
|
+
alias :https_request :http_request
|
142
|
+
|
143
|
+
##
|
144
|
+
# Normalize the encoding of +body+ to UTF-8 by attempting several
|
145
|
+
# common recodings. Code from https://github.com/seq-code/registry
|
146
|
+
def normalize_encoding(body)
|
147
|
+
# Test encodings
|
148
|
+
body.force_encoding('utf-8')
|
149
|
+
%w[iso8859-1 windows-1252 us-ascii ascii-8bit].each do |enc|
|
150
|
+
break if body.valid_encoding?
|
151
|
+
recode = body.force_encoding(enc).encode('utf-8')
|
152
|
+
body = recode if recode.valid_encoding?
|
153
|
+
end
|
154
|
+
# If nothing works, replace offending characters with '?'
|
155
|
+
unless body.valid_encoding?
|
156
|
+
body = body.encode(
|
157
|
+
'utf-8', invalid: :replace, undef: :replace, replace: '?'
|
158
|
+
)
|
159
|
+
end
|
160
|
+
body
|
79
161
|
end
|
80
162
|
end
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -32,6 +32,12 @@ class MiGA::Dataset < MiGA::MiGA
|
|
32
32
|
@@EXCLUDE_NOREF_TASKS
|
33
33
|
end
|
34
34
|
|
35
|
+
##
|
36
|
+
# Tasks to be excluded from datasets without markers
|
37
|
+
def EXCLUDE_NOMARKER_TASKS
|
38
|
+
@@EXCLUDE_NOMARKER_TASKS
|
39
|
+
end
|
40
|
+
|
35
41
|
##
|
36
42
|
# Tasks to be executed only in datasets that are single-organism. These
|
37
43
|
# tasks are ignored for multi-organism datasets or for unknown types
|
@@ -81,45 +87,67 @@ module MiGA::Dataset::Base
|
|
81
87
|
# Supported dataset types
|
82
88
|
@@KNOWN_TYPES = {
|
83
89
|
genome: {
|
84
|
-
description: 'The genome from an isolate',
|
90
|
+
description: 'The genome from an isolate',
|
91
|
+
multi: false, markers: true,
|
92
|
+
project_types: %i[mixed genomes clade]
|
85
93
|
},
|
86
94
|
scgenome: {
|
87
|
-
description: 'A Single-cell Amplified Genome (SAG)',
|
95
|
+
description: 'A Single-cell Amplified Genome (SAG)',
|
96
|
+
multi: false, markers: true,
|
97
|
+
project_types: %i[mixed genomes clade]
|
88
98
|
},
|
89
99
|
popgenome: {
|
90
|
-
description: 'A Metagenome-Assembled Genome (MAG)',
|
100
|
+
description: 'A Metagenome-Assembled Genome (MAG)',
|
101
|
+
multi: false, markers: true,
|
102
|
+
project_types: %i[mixed genomes clade]
|
91
103
|
},
|
92
104
|
metagenome: {
|
93
|
-
description: 'A metagenome (excluding viromes)',
|
105
|
+
description: 'A metagenome (excluding viromes)',
|
106
|
+
multi: true, markers: true,
|
107
|
+
project_types: %i[mixed metagenomes]
|
94
108
|
},
|
95
109
|
virome: {
|
96
|
-
description: 'A viral metagenome',
|
110
|
+
description: 'A viral metagenome',
|
111
|
+
multi: true,
|
112
|
+
markers: true, # <- We don't expect, but can be useful for contamination
|
113
|
+
project_types: %i[mixed metagenomes]
|
114
|
+
},
|
115
|
+
plasmid: {
|
116
|
+
description: 'An individual plasmid',
|
117
|
+
multi: false, markers: false,
|
118
|
+
project_types: %i[mixed plasmids]
|
97
119
|
}
|
98
120
|
}
|
99
121
|
|
100
122
|
##
|
101
123
|
# Returns an Array of tasks (Symbols) to be executed before project-wide tasks
|
102
|
-
@@PREPROCESSING_TASKS = [
|
103
|
-
|
104
|
-
|
105
|
-
|
124
|
+
@@PREPROCESSING_TASKS = %i[
|
125
|
+
raw_reads trimmed_reads read_quality trimmed_fasta
|
126
|
+
assembly cds essential_genes mytaxa mytaxa_scan
|
127
|
+
taxonomy distances ssu stats
|
106
128
|
]
|
107
129
|
|
108
130
|
##
|
109
131
|
# Tasks to be excluded from query datasets
|
110
|
-
@@EXCLUDE_NOREF_TASKS = [
|
132
|
+
@@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy]
|
111
133
|
@@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
|
112
134
|
|
135
|
+
##
|
136
|
+
# Tasks to be excluded from datasets without markers
|
137
|
+
@@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu]
|
138
|
+
@@_EXCLUDE_NOMARKER_TASKS_H =
|
139
|
+
Hash[@@EXCLUDE_NOMARKER_TASKS.map { |i| [i, true] }]
|
140
|
+
|
113
141
|
##
|
114
142
|
# Tasks to be executed only in datasets that are single-organism. These
|
115
143
|
# tasks are ignored for multi-organism datasets or for unknown types
|
116
|
-
@@ONLY_NONMULTI_TASKS = [
|
144
|
+
@@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances]
|
117
145
|
@@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
|
118
146
|
|
119
147
|
##
|
120
148
|
# Tasks to be executed only in datasets that are multi-organism. These
|
121
149
|
# tasks are ignored for single-organism datasets or for unknown types
|
122
|
-
@@ONLY_MULTI_TASKS = [
|
150
|
+
@@ONLY_MULTI_TASKS = %i[mytaxa]
|
123
151
|
@@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
|
124
152
|
|
125
153
|
##
|
data/lib/miga/dataset/hooks.rb
CHANGED
@@ -15,6 +15,7 @@ require 'miga/common/hooks'
|
|
15
15
|
# Supported hooks:
|
16
16
|
# - run_lambda(lambda, args...)
|
17
17
|
# - recalculate_status()
|
18
|
+
# - check_type()
|
18
19
|
# - clear_run_counts()
|
19
20
|
# - run_cmd(cmd)
|
20
21
|
# Internal hooks:
|
@@ -27,6 +28,7 @@ module MiGA::Dataset::Hooks
|
|
27
28
|
def default_hooks
|
28
29
|
{
|
29
30
|
on_create: [[:recalculate_status]],
|
31
|
+
on_save: [[:check_type]],
|
30
32
|
on_activate: [[:clear_run_counts], [:recalculate_status]],
|
31
33
|
on_inactivate: [[:recalculate_status]],
|
32
34
|
on_result_ready: [[:_pull_result_hooks]],
|
@@ -51,6 +53,12 @@ module MiGA::Dataset::Hooks
|
|
51
53
|
recalculate_status
|
52
54
|
end
|
53
55
|
|
56
|
+
##
|
57
|
+
# Ensure that the dataset type exists and is compatible with the project type
|
58
|
+
def hook_check_type(_hook_args, _event_args)
|
59
|
+
check_type
|
60
|
+
end
|
61
|
+
|
54
62
|
##
|
55
63
|
# Run +cmd+ in the command-line with {{variables}}:
|
56
64
|
# dataset, project, project_name, miga, object (if defined for the event)
|