miga-base 1.3.8.2 → 1.3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/miga/cli/action/add_result.rb +22 -1
- data/lib/miga/cli/action/browse/about.html +4 -2
- data/lib/miga/cli/action/download/gtdb.rb +1 -1
- data/lib/miga/cli/action/download/ncbi.rb +43 -68
- data/lib/miga/cli/action/download/seqcode.rb +1 -2
- data/lib/miga/cli/action/ncbi_get.rb +1 -8
- data/lib/miga/cli/action/wf.rb +15 -6
- data/lib/miga/cli/objects_helper.rb +3 -0
- data/lib/miga/cli/opt_helper.rb +8 -2
- data/lib/miga/common/net.rb +100 -18
- data/lib/miga/dataset/base.rb +40 -12
- data/lib/miga/dataset/hooks.rb +8 -0
- data/lib/miga/dataset/result/ignore.rb +14 -2
- data/lib/miga/dataset/type.rb +51 -0
- data/lib/miga/dataset.rb +3 -22
- data/lib/miga/json.rb +9 -0
- data/lib/miga/project/base.rb +15 -9
- data/lib/miga/project.rb +7 -1
- data/lib/miga/remote_dataset/base.rb +117 -36
- data/lib/miga/remote_dataset/download.rb +121 -54
- data/lib/miga/remote_dataset.rb +34 -13
- data/lib/miga/result/stats.rb +2 -0
- data/lib/miga/result/versions.rb +23 -0
- data/lib/miga/result.rb +7 -1
- data/lib/miga/taxonomy/base.rb +3 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/assembly.bash +15 -1
- data/scripts/cds.bash +9 -3
- data/scripts/distances.bash +103 -5
- data/scripts/essential_genes.bash +14 -1
- data/scripts/mytaxa.bash +18 -3
- data/scripts/mytaxa_scan.bash +16 -3
- data/scripts/read_quality.bash +6 -2
- data/scripts/ssu.bash +19 -1
- data/scripts/stats.bash +9 -3
- data/scripts/taxonomy.bash +98 -2
- data/scripts/trimmed_fasta.bash +10 -2
- data/scripts/trimmed_reads.bash +26 -6
- data/test/dataset_test.rb +17 -2
- data/test/hook_test.rb +3 -2
- data/test/net_test.rb +21 -5
- data/test/project_test.rb +13 -0
- data/test/remote_dataset_test.rb +106 -7
- data/test/result_test.rb +47 -21
- data/test/taxonomy_test.rb +9 -3
- data/utils/distance/runner.rb +3 -1
- data/utils/distances.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a4aa208ac4dfe7ff6edbedb7aa7b3444c909c77e47b2ef1b93282adc83192d6
|
4
|
+
data.tar.gz: 78bdce8752b3a1a1a281123a99cccdc002d0f95a2a345aa16b1c4ee684220c29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 335e7a8715a6c561b618e21d1e273381f88d384daf16360c01f9b6818ec374f05f6ae824a36628fbe4dcb0c7d6e082f633e8372137edfa035389cb92922363fa
|
7
|
+
data.tar.gz: 76bdf75348e711bdfcca6b973acd3768559acde7ee80233833ae8b8caa595020599756b7913774dc53c038290592d20710b1b67bd5fa8bb93fd334206bf21b06
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@
|
|
12
12
|
For additional information on MiGA, visit:
|
13
13
|
|
14
14
|
* [MiGA Online][miga-online]: The Microbial Genomes Atlas Online
|
15
|
-
* [MiGA
|
15
|
+
* [MiGA Gateway][miga-gateway]: The MiGA Science Gateway
|
16
16
|
* [MiGA users list][mailing-list]:
|
17
17
|
Forum to discuss with other users and developers
|
18
18
|
* [MiGA manual][manual]: The definitive guide to MiGA
|
@@ -57,6 +57,6 @@ See [LICENSE](LICENSE).
|
|
57
57
|
[miga-web]: https://github.com/bio-miga/miga-web
|
58
58
|
[miga-gui]: https://github.com/bio-miga/miga-gui
|
59
59
|
[miga-online]: http://microbial-genomes.org/
|
60
|
-
[miga-
|
60
|
+
[miga-gateway]: https://gateway.microbial-genomes.org/
|
61
61
|
[kostas]: http://enve-omics.gatech.edu/
|
62
62
|
[rdp]: http://rdp.cme.msu.edu/
|
@@ -5,13 +5,17 @@ require 'miga/cli/action'
|
|
5
5
|
|
6
6
|
class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
|
7
7
|
def parse_cli
|
8
|
-
cli.defaults = { force: false }
|
8
|
+
cli.defaults = { force: false, stdin_versions: false }
|
9
9
|
cli.parse do |opt|
|
10
10
|
cli.opt_object(opt, [:project, :dataset_opt, :result])
|
11
11
|
opt.on(
|
12
12
|
'-f', '--force',
|
13
13
|
'Force re-indexing of the result even if it\'s already registered'
|
14
14
|
) { |v| cli[:force] = v }
|
15
|
+
opt.on(
|
16
|
+
'--stdin-versions',
|
17
|
+
'Read Software versions from STDIN'
|
18
|
+
) { |v| cli[:stdin_versions] = v }
|
15
19
|
end
|
16
20
|
end
|
17
21
|
|
@@ -21,5 +25,22 @@ class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
|
|
21
25
|
cli.say "Registering result: #{cli[:result]}"
|
22
26
|
r = obj.add_result(cli[:result], true, force: cli[:force])
|
23
27
|
raise 'Cannot add result, incomplete expected files' if r.nil?
|
28
|
+
|
29
|
+
# Add Software version data
|
30
|
+
if cli[:stdin_versions]
|
31
|
+
versions = {}
|
32
|
+
sw = nil
|
33
|
+
$stdin.each do |ln|
|
34
|
+
ln = ln.chomp.strip
|
35
|
+
if ln =~ /^=> (.*)/
|
36
|
+
sw = $1
|
37
|
+
versions[sw] = ''
|
38
|
+
else
|
39
|
+
versions[sw] += ln
|
40
|
+
end
|
41
|
+
end
|
42
|
+
r.add_versions(versions)
|
43
|
+
r.save
|
44
|
+
end
|
24
45
|
end
|
25
46
|
end
|
@@ -12,10 +12,12 @@ terms of the terms of the
|
|
12
12
|
<p>
|
13
13
|
MiGA is the result of a collaboration between the
|
14
14
|
<a href='http://enve-omics.gatech.edu/'>Kostas Lab</a>
|
15
|
-
(<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>)
|
15
|
+
(<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>), the
|
16
16
|
<a href='http://rdp.cme.msu.edu/'>RDP team</a>
|
17
17
|
(<a href='http://cme.msu.edu/'>Center for Microbial Ecology</a>,
|
18
|
-
<a href='https://msu.edu/'>Michigan State University</a>)
|
18
|
+
<a href='https://msu.edu/'>Michigan State University</a>), and the
|
19
|
+
<a href="https://disc-genomics.uibk.ac.at/">Rodriguez-R lab</a>
|
20
|
+
(<a href="https://uibk.ac.at/">University of Innsbruck</a>).
|
19
21
|
The MiGA project is funded by the
|
20
22
|
<a href='http://nsf.gov/'>US National Science Foundation</a>
|
21
23
|
(Awards <a href='http://nsf.gov/awardsearch/showAward?AWD_ID=1356288'>#1356288</a> &
|
@@ -31,7 +31,7 @@ module MiGA::Cli::Action::Download::Gtdb
|
|
31
31
|
|
32
32
|
def remote_list
|
33
33
|
cli.say 'Downloading genome list'
|
34
|
-
extra =
|
34
|
+
extra = { sp_reps_only: cli[:reference].to_s }
|
35
35
|
json = MiGA::RemoteDataset.download(
|
36
36
|
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
|
37
37
|
)
|
@@ -34,11 +34,8 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
34
34
|
'Do not add sequence version to the dataset name',
|
35
35
|
'Only affects --complete and --chromosome'
|
36
36
|
) { |v| cli[:add_version] = v }
|
37
|
-
|
38
|
-
|
39
|
-
'Use dataset names based on chromosome entries instead of assembly',
|
40
|
-
:legacy_name
|
41
|
-
)
|
37
|
+
# For backwards compatibility
|
38
|
+
cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
|
42
39
|
end
|
43
40
|
|
44
41
|
def sanitize_cli
|
@@ -52,89 +49,67 @@ module MiGA::Cli::Action::Download::Ncbi
|
|
52
49
|
end
|
53
50
|
|
54
51
|
def remote_list
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
52
|
+
list = {}
|
53
|
+
query = remote_list_query
|
54
|
+
loop do
|
55
|
+
# Query the remote collection
|
56
|
+
page = MiGA::Json.parse(
|
57
|
+
MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
|
58
|
+
contents: true
|
59
|
+
)
|
60
|
+
break unless page&.any? && page[:reports]&.any?
|
61
|
+
|
62
|
+
# Process reports in this page
|
63
|
+
list.merge!(parse_reports_as_datasets(page[:reports]))
|
64
|
+
|
65
|
+
# Next page
|
66
|
+
break unless page[:next_page_token]
|
67
|
+
query[:page_token] = page[:next_page_token]
|
68
|
+
end
|
69
|
+
list
|
67
70
|
end
|
68
71
|
|
69
|
-
def
|
72
|
+
def parse_reports_as_datasets(reports)
|
70
73
|
ds = {}
|
71
|
-
|
72
|
-
asm = r[
|
74
|
+
reports.each do |r|
|
75
|
+
asm = r[:accession]
|
73
76
|
next if asm.nil? || asm.empty? || asm == '-'
|
74
77
|
|
75
|
-
rep = remote_row_replicons(r)
|
76
|
-
n = remote_row_name(r, rep, asm)
|
77
|
-
|
78
78
|
# Register for download
|
79
|
+
n = remote_report_name(r, asm)
|
79
80
|
ds[n] = {
|
80
81
|
ids: [asm], db: :assembly, universe: :ncbi,
|
81
82
|
md: {
|
82
|
-
type: :genome, ncbi_asm: asm, strain: r
|
83
|
+
type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain)
|
83
84
|
}
|
84
85
|
}
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
end
|
86
|
+
date = r.dig(:assembly_info, :release_date)
|
87
|
+
ds[n][:md][:release_date] = Time.parse(date).to_s if date
|
88
|
+
ds[n][:md][:ncbi_dataset] = r
|
89
89
|
end
|
90
90
|
ds
|
91
91
|
end
|
92
92
|
|
93
|
-
def
|
94
|
-
|
95
|
-
|
96
|
-
r['replicons']
|
97
|
-
.split('; ')
|
98
|
-
.map { |i| i.gsub(/.*:/, '') }
|
99
|
-
.map { |i| i.gsub(%r{/.*}, '') }
|
100
|
-
end
|
101
|
-
|
102
|
-
def remote_row_name(r, rep, asm)
|
103
|
-
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
104
|
-
|
105
|
-
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
106
|
-
acc = rep.nil? ? '' : rep.first
|
107
|
-
else
|
108
|
-
acc = asm
|
109
|
-
end
|
93
|
+
def remote_report_name(r, asm)
|
94
|
+
acc = "#{asm}"
|
110
95
|
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
111
|
-
|
96
|
+
org = r.dig(:organism, :organism_name)
|
97
|
+
acc = "#{org}_#{acc}" if org
|
98
|
+
acc.miga_name
|
112
99
|
end
|
113
100
|
|
114
|
-
def
|
115
|
-
|
116
|
-
url_param = {
|
117
|
-
q: '[display()].' \
|
118
|
-
'from(GenomeAssemblies).' \
|
119
|
-
'usingschema(/schema/GenomeAssemblies).' \
|
120
|
-
'matching(tab==["Prokaryotes"] and q=="' \
|
121
|
-
"#{cli[:taxon]&.tr('"', "'")}\"",
|
122
|
-
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
123
|
-
'level|level,release_date|release_date,strain|strain',
|
124
|
-
nolimit: 'on'
|
125
|
-
}
|
101
|
+
def remote_list_query
|
102
|
+
q = { taxons: [cli[:taxon]], filters: {} }
|
126
103
|
if cli[:reference]
|
127
|
-
|
104
|
+
q[:filters][:reference_only] = true
|
128
105
|
else
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
}.map { |k, v| '"' + v + '"' if cli[k] }.compact
|
135
|
-
url_param[:q] += ' and level==[' + status + ']'
|
106
|
+
q[:assembly_level] = {
|
107
|
+
contig: 'contig',
|
108
|
+
scaffold: 'scaffold',
|
109
|
+
chromosome: 'chromosome',
|
110
|
+
complete: 'complete_genome'
|
111
|
+
}.map { |k, v| '"' + v + '"' if cli[k] }.compact
|
136
112
|
end
|
137
|
-
|
138
|
-
url_base + URI.encode_www_form(url_param)
|
113
|
+
q
|
139
114
|
end
|
140
115
|
end
|
@@ -29,8 +29,7 @@ module MiGA::Cli::Action::Download::Seqcode
|
|
29
29
|
|
30
30
|
while current_page <= total_pages
|
31
31
|
json = MiGA::RemoteDataset.download(
|
32
|
-
:seqcode, :'type-genomes', nil, :json, nil,
|
33
|
-
["page=#{current_page}"]
|
32
|
+
:seqcode, :'type-genomes', nil, :json, nil, page: current_page
|
34
33
|
)
|
35
34
|
doc = MiGA::Json.parse(json, contents: true)
|
36
35
|
current_page = doc[:current_page] + 1
|
@@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
8
8
|
|
9
9
|
def parse_cli
|
10
10
|
cli.defaults = {
|
11
|
-
query: false, unlink: false,
|
12
|
-
reference: false, legacy_name: false,
|
11
|
+
query: false, unlink: false, reference: false,
|
13
12
|
complete: false, chromosome: false,
|
14
13
|
scaffold: false, contig: false, add_version: true, dry: false,
|
15
14
|
get_md: false, only_md: false, save_every: 1
|
@@ -29,12 +28,6 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
29
28
|
'--api-key STRING',
|
30
29
|
'::HIDE::' # For backwards compatibility
|
31
30
|
) { |v| ENV['NCBI_API_KEY'] = v }
|
32
|
-
opt.on(
|
33
|
-
'--ncbi-table-file STRING',
|
34
|
-
'::HIDE::' # Only meant for debugging
|
35
|
-
# It can take the table returned by NCBI and parse it from a file
|
36
|
-
# instead of downloading it directly
|
37
|
-
) { |v| cli[:ncbi_table_file] = v }
|
38
31
|
opt.on(
|
39
32
|
'--ncbi-api-key STRING',
|
40
33
|
'NCBI API key'
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -8,7 +8,8 @@ module MiGA::Cli::Action::Wf
|
|
8
8
|
cli.expect_files = true
|
9
9
|
cli.defaults = {
|
10
10
|
clean: false, project_type: :genomes, dataset_type: :popgenome,
|
11
|
-
ncbi_draft: true,
|
11
|
+
ncbi_draft: true, ncbi_ref: false,
|
12
|
+
min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
|
12
13
|
prepare_and_exit: false
|
13
14
|
}
|
14
15
|
end
|
@@ -39,14 +40,21 @@ module MiGA::Cli::Action::Wf
|
|
39
40
|
'-T', '--ncbi-taxon STRING',
|
40
41
|
'Download all the genomes in NCBI classified as this taxon'
|
41
42
|
) { |v| cli[:ncbi_taxon] = v }
|
43
|
+
opt.on(
|
44
|
+
'--no-draft', '::HIDE::' # Deprecated
|
45
|
+
) { |v| cli[:ncbi_draft] = v }
|
46
|
+
opt.on(
|
47
|
+
'--ncbi-complete',
|
48
|
+
'Only download complete genomes, not drafts (requires -T)'
|
49
|
+
) { |v| cli[:ncbi_draft] = !v }
|
50
|
+
opt.on(
|
51
|
+
'--ncbi-ref',
|
52
|
+
'Only download RefSeq reference genomes (requires -T)'
|
53
|
+
) { |v| cli[:ncbi_ref] = v }
|
42
54
|
opt.on(
|
43
55
|
'-G', '--gtdb-taxon STRING',
|
44
56
|
'Download all the genomes in GTDB classified as this taxon'
|
45
57
|
) { |v| cli[:gtdb_taxon] = v }
|
46
|
-
opt.on(
|
47
|
-
'--no-draft',
|
48
|
-
'Only download complete genomes, not drafts (requires -T)'
|
49
|
-
) { |v| cli[:ncbi_draft] = v }
|
50
58
|
opt.on(
|
51
59
|
'--gtdb-ref',
|
52
60
|
'Only download reference anchor genomes in GTDB (requires -G)'
|
@@ -170,7 +178,8 @@ module MiGA::Cli::Action::Wf
|
|
170
178
|
def download_datasets
|
171
179
|
# Download datasets from NCBI
|
172
180
|
unless cli[:ncbi_taxon].nil?
|
173
|
-
what = cli[:
|
181
|
+
what = cli[:ncbi_ref] ? '--reference' :
|
182
|
+
cli[:ncbi_draft] ? '--all' : '--complete'
|
174
183
|
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
|
175
184
|
cmd += ['--max', cli[:max_download]] if cli[:max_download]
|
176
185
|
call_cli(cmd)
|
@@ -60,6 +60,9 @@ module MiGA::Cli::ObjectsHelper
|
|
60
60
|
o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
|
61
61
|
o &&= (d.active? == self[:active]) unless self[:active].nil?
|
62
62
|
o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
|
63
|
+
unless self[:markers].nil?
|
64
|
+
o &&= (self[:markers] ? d.markers? : !d.markers?)
|
65
|
+
end
|
63
66
|
unless self[:taxonomy].nil?
|
64
67
|
o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
|
65
68
|
end
|
data/lib/miga/cli/opt_helper.rb
CHANGED
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
|
|
43
43
|
'-h', '--help',
|
44
44
|
'Display this screen'
|
45
45
|
) do
|
46
|
-
puts opt.
|
46
|
+
puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
|
47
47
|
exit
|
48
48
|
end
|
49
49
|
opt.separator ''
|
@@ -120,10 +120,11 @@ module MiGA::Cli::OptHelper
|
|
120
120
|
# as determined by +what+ an Array with any combination of:
|
121
121
|
# - :ref To filter by reference (--ref) or query (--no-ref)
|
122
122
|
# - :multi To filter by multiple (--multi) or single (--no-multi) species
|
123
|
+
# - :markers To filter by datasets with (--markers) or without (--no-markers) markers
|
123
124
|
# - :active To filter by active (--active) or inactive (--no-active)
|
124
125
|
# - :taxonomy To filter by taxonomy (--taxonomy)
|
125
126
|
# The "k-th" filter (--dataset-k) is always included
|
126
|
-
def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
|
127
|
+
def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
|
127
128
|
what.each do |w|
|
128
129
|
case w
|
129
130
|
when :ref
|
@@ -136,6 +137,11 @@ module MiGA::Cli::OptHelper
|
|
136
137
|
'--[no-]multi',
|
137
138
|
'Use only multi-species (or only single-species) datasets'
|
138
139
|
) { |v| self[:multi] = v }
|
140
|
+
when :markers
|
141
|
+
opt.on(
|
142
|
+
'--[no-]markers',
|
143
|
+
'Use only datasets with (or without) markers'
|
144
|
+
) { |v| self[:markers] = v }
|
139
145
|
when :active
|
140
146
|
opt.on(
|
141
147
|
'--[no-]active',
|
data/lib/miga/common/net.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'net/http'
|
3
4
|
require 'net/ftp'
|
4
5
|
require 'open-uri'
|
5
6
|
require 'fileutils'
|
@@ -10,6 +11,8 @@ Net::FTP.const_set('FTP_PORT', 21)
|
|
10
11
|
##
|
11
12
|
# General web-access functions shared throughout MiGA.
|
12
13
|
module MiGA::Common::Net
|
14
|
+
attr_accessor :remote_connection_uri
|
15
|
+
|
13
16
|
##
|
14
17
|
# Returns the URL of the host +name+ (Symbol)
|
15
18
|
def known_hosts(name)
|
@@ -21,7 +24,7 @@ module MiGA::Common::Net
|
|
21
24
|
when :miga_dist
|
22
25
|
"ftp://#{main_server}/dist"
|
23
26
|
else
|
24
|
-
raise "Unrecognized server name: #{
|
27
|
+
raise "Unrecognized server name: #{name}"
|
25
28
|
end
|
26
29
|
end
|
27
30
|
|
@@ -32,49 +35,128 @@ module MiGA::Common::Net
|
|
32
35
|
end
|
33
36
|
|
34
37
|
##
|
35
|
-
# Connect to an FTP +host+ (String)
|
36
|
-
# +.known_hosts+)
|
38
|
+
# Connect to an FTP, HTTP, or HTTPS +host+ (String), a known host name (Symbol, see
|
39
|
+
# +.known_hosts+), or a parsed +URI+ object
|
40
|
+
#
|
41
|
+
# Sets the attribute +remote_connection_uri+ to the parsed +URI+ object
|
42
|
+
# silently
|
37
43
|
def remote_connection(host)
|
38
44
|
host = known_hosts(host) if host.is_a?(Symbol)
|
39
|
-
uri = URI.parse(host)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
ftp
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
uri = host.is_a?(URI) ? host : URI.parse(host)
|
46
|
+
@remote_connection_uri = uri
|
47
|
+
|
48
|
+
case uri.scheme
|
49
|
+
when 'ftp'
|
50
|
+
ftp = Net::FTP.new(uri.host)
|
51
|
+
ftp.passive = true
|
52
|
+
ftp.login
|
53
|
+
ftp.chdir(uri.path) unless host.is_a?(URI)
|
54
|
+
ftp
|
55
|
+
when 'http', 'https'
|
56
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
57
|
+
http.read_timeout = 600
|
58
|
+
http.use_ssl = uri.scheme == 'https'
|
59
|
+
http
|
60
|
+
else
|
61
|
+
raise 'Only FTP, HTTP, and HTTPS are currently supported'
|
62
|
+
end
|
47
63
|
end
|
48
64
|
|
49
65
|
##
|
50
66
|
# Download a file via FTP using the +connection+ (returned by
|
51
|
-
# +.remote_connection+) with remote name +file+ into local +target+.
|
67
|
+
# +.remote_connection+) with remote name +file+ into local +target+. If +file+
|
68
|
+
# is +nil+, it tries to guess the file from +connection+. If +target+ is
|
69
|
+
# +nil+, it returns the read data instead
|
52
70
|
#
|
53
|
-
# Alternatively, +connection+ can simply be the host (String)
|
54
|
-
# Symbol (see +.remote_connection+),
|
55
|
-
# connection automatically
|
71
|
+
# Alternatively, +connection+ can simply be the host (String), a recognized
|
72
|
+
# Symbol (see +.remote_connection+), or a parsed +URI+ object, in which case
|
73
|
+
# the function opens the connection automatically
|
56
74
|
#
|
57
75
|
# Reports progress to the function block with two arguments: the
|
58
76
|
# currently transferred size and the total file size
|
59
|
-
def download_file_ftp(connection, file, target)
|
77
|
+
def download_file_ftp(connection, file = nil, target = nil)
|
60
78
|
# Open connection unless passed
|
61
79
|
close_conn = false
|
62
|
-
if connection.is_a?(String) || connection.is_a?(Symbol)
|
80
|
+
if connection.is_a?(String) || connection.is_a?(Symbol) ||
|
81
|
+
connection.is_a?(URI)
|
63
82
|
connection = remote_connection(connection)
|
83
|
+
file ||= remote_connection_uri.path
|
64
84
|
close_conn = true
|
65
85
|
end
|
66
86
|
|
67
87
|
# Prepare download
|
68
|
-
FileUtils.mkdir_p(File.dirname(target))
|
88
|
+
FileUtils.mkdir_p(File.dirname(target)) if target
|
69
89
|
filesize = connection.size(file)
|
70
90
|
transferred = 0
|
71
91
|
|
72
92
|
# Get in chunks of 1KiB
|
93
|
+
ret = ''
|
73
94
|
connection.getbinaryfile(file, target, 1024) do |data|
|
74
95
|
yield(transferred += data.size, filesize) if block_given?
|
96
|
+
ret += data unless target
|
75
97
|
end
|
76
98
|
|
77
99
|
# Close connection if automatically opened
|
78
100
|
connection.close if close_conn
|
101
|
+
ret unless target
|
102
|
+
end
|
103
|
+
|
104
|
+
##
|
105
|
+
# Submit an HTTP or HTTPS request using +url+, which should be a URL
|
106
|
+
# either as String or parsed URI. The request follows the +method+, which
|
107
|
+
# should be a Net::HTTP verb such as +:get+, +:post+, or +:patch+. All
|
108
|
+
# additional parameters for the corresponding method should be passed as
|
109
|
+
# +opts+.
|
110
|
+
def http_request(method, url, *opts)
|
111
|
+
doc = nil
|
112
|
+
remote_connection(url).start do |http|
|
113
|
+
res = http.send(method, remote_connection_uri.to_s, *opts)
|
114
|
+
if %w[301 302].include?(res.code)
|
115
|
+
DEBUG "REDIRECTION #{res.code}: #{res['location']}"
|
116
|
+
return http_request(method, res['location'], *opts)
|
117
|
+
end
|
118
|
+
res.value # To force exception unless success
|
119
|
+
doc = res.body
|
120
|
+
end
|
121
|
+
doc
|
122
|
+
end
|
123
|
+
|
124
|
+
def net_method(method, uri, *opts)
|
125
|
+
attempts ||= 0
|
126
|
+
DEBUG "#{method.to_s.upcase}: #{uri} #{opts}"
|
127
|
+
case method.to_sym
|
128
|
+
when :ftp
|
129
|
+
download_file_ftp(uri)
|
130
|
+
else
|
131
|
+
http_request(method, uri, *opts)
|
132
|
+
end
|
133
|
+
rescue => e
|
134
|
+
raise e if (attempts += 1) >= 3
|
135
|
+
|
136
|
+
sleep 5 # <- For: 429 Too Many Requests
|
137
|
+
DEBUG "RETRYING after: #{e}"
|
138
|
+
retry
|
139
|
+
end
|
140
|
+
|
141
|
+
alias :https_request :http_request
|
142
|
+
|
143
|
+
##
|
144
|
+
# Normalize the encoding of +body+ to UTF-8 by attempting several
|
145
|
+
# common recodings. Code from https://github.com/seq-code/registry
|
146
|
+
def normalize_encoding(body)
|
147
|
+
# Test encodings
|
148
|
+
body.force_encoding('utf-8')
|
149
|
+
%w[iso8859-1 windows-1252 us-ascii ascii-8bit].each do |enc|
|
150
|
+
break if body.valid_encoding?
|
151
|
+
recode = body.force_encoding(enc).encode('utf-8')
|
152
|
+
body = recode if recode.valid_encoding?
|
153
|
+
end
|
154
|
+
# If nothing works, replace offending characters with '?'
|
155
|
+
unless body.valid_encoding?
|
156
|
+
body = body.encode(
|
157
|
+
'utf-8', invalid: :replace, undef: :replace, replace: '?'
|
158
|
+
)
|
159
|
+
end
|
160
|
+
body
|
79
161
|
end
|
80
162
|
end
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -32,6 +32,12 @@ class MiGA::Dataset < MiGA::MiGA
|
|
32
32
|
@@EXCLUDE_NOREF_TASKS
|
33
33
|
end
|
34
34
|
|
35
|
+
##
|
36
|
+
# Tasks to be excluded from datasets without markers
|
37
|
+
def EXCLUDE_NOMARKER_TASKS
|
38
|
+
@@EXCLUDE_NOMARKER_TASKS
|
39
|
+
end
|
40
|
+
|
35
41
|
##
|
36
42
|
# Tasks to be executed only in datasets that are single-organism. These
|
37
43
|
# tasks are ignored for multi-organism datasets or for unknown types
|
@@ -81,45 +87,67 @@ module MiGA::Dataset::Base
|
|
81
87
|
# Supported dataset types
|
82
88
|
@@KNOWN_TYPES = {
|
83
89
|
genome: {
|
84
|
-
description: 'The genome from an isolate',
|
90
|
+
description: 'The genome from an isolate',
|
91
|
+
multi: false, markers: true,
|
92
|
+
project_types: %i[mixed genomes clade]
|
85
93
|
},
|
86
94
|
scgenome: {
|
87
|
-
description: 'A Single-cell Amplified Genome (SAG)',
|
95
|
+
description: 'A Single-cell Amplified Genome (SAG)',
|
96
|
+
multi: false, markers: true,
|
97
|
+
project_types: %i[mixed genomes clade]
|
88
98
|
},
|
89
99
|
popgenome: {
|
90
|
-
description: 'A Metagenome-Assembled Genome (MAG)',
|
100
|
+
description: 'A Metagenome-Assembled Genome (MAG)',
|
101
|
+
multi: false, markers: true,
|
102
|
+
project_types: %i[mixed genomes clade]
|
91
103
|
},
|
92
104
|
metagenome: {
|
93
|
-
description: 'A metagenome (excluding viromes)',
|
105
|
+
description: 'A metagenome (excluding viromes)',
|
106
|
+
multi: true, markers: true,
|
107
|
+
project_types: %i[mixed metagenomes]
|
94
108
|
},
|
95
109
|
virome: {
|
96
|
-
description: 'A viral metagenome',
|
110
|
+
description: 'A viral metagenome',
|
111
|
+
multi: true,
|
112
|
+
markers: true, # <- We don't expect, but can be useful for contamination
|
113
|
+
project_types: %i[mixed metagenomes]
|
114
|
+
},
|
115
|
+
plasmid: {
|
116
|
+
description: 'An individual plasmid',
|
117
|
+
multi: false, markers: false,
|
118
|
+
project_types: %i[mixed plasmids]
|
97
119
|
}
|
98
120
|
}
|
99
121
|
|
100
122
|
##
|
101
123
|
# Returns an Array of tasks (Symbols) to be executed before project-wide tasks
|
102
|
-
@@PREPROCESSING_TASKS = [
|
103
|
-
|
104
|
-
|
105
|
-
|
124
|
+
@@PREPROCESSING_TASKS = %i[
|
125
|
+
raw_reads trimmed_reads read_quality trimmed_fasta
|
126
|
+
assembly cds essential_genes mytaxa mytaxa_scan
|
127
|
+
taxonomy distances ssu stats
|
106
128
|
]
|
107
129
|
|
108
130
|
##
|
109
131
|
# Tasks to be excluded from query datasets
|
110
|
-
@@EXCLUDE_NOREF_TASKS = [
|
132
|
+
@@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy]
|
111
133
|
@@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
|
112
134
|
|
135
|
+
##
|
136
|
+
# Tasks to be excluded from datasets without markers
|
137
|
+
@@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu]
|
138
|
+
@@_EXCLUDE_NOMARKER_TASKS_H =
|
139
|
+
Hash[@@EXCLUDE_NOMARKER_TASKS.map { |i| [i, true] }]
|
140
|
+
|
113
141
|
##
|
114
142
|
# Tasks to be executed only in datasets that are single-organism. These
|
115
143
|
# tasks are ignored for multi-organism datasets or for unknown types
|
116
|
-
@@ONLY_NONMULTI_TASKS = [
|
144
|
+
@@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances]
|
117
145
|
@@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
|
118
146
|
|
119
147
|
##
|
120
148
|
# Tasks to be executed only in datasets that are multi-organism. These
|
121
149
|
# tasks are ignored for single-organism datasets or for unknown types
|
122
|
-
@@ONLY_MULTI_TASKS = [
|
150
|
+
@@ONLY_MULTI_TASKS = %i[mytaxa]
|
123
151
|
@@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
|
124
152
|
|
125
153
|
##
|
data/lib/miga/dataset/hooks.rb
CHANGED
@@ -15,6 +15,7 @@ require 'miga/common/hooks'
|
|
15
15
|
# Supported hooks:
|
16
16
|
# - run_lambda(lambda, args...)
|
17
17
|
# - recalculate_status()
|
18
|
+
# - check_type()
|
18
19
|
# - clear_run_counts()
|
19
20
|
# - run_cmd(cmd)
|
20
21
|
# Internal hooks:
|
@@ -27,6 +28,7 @@ module MiGA::Dataset::Hooks
|
|
27
28
|
def default_hooks
|
28
29
|
{
|
29
30
|
on_create: [[:recalculate_status]],
|
31
|
+
on_save: [[:check_type]],
|
30
32
|
on_activate: [[:clear_run_counts], [:recalculate_status]],
|
31
33
|
on_inactivate: [[:recalculate_status]],
|
32
34
|
on_result_ready: [[:_pull_result_hooks]],
|
@@ -51,6 +53,12 @@ module MiGA::Dataset::Hooks
|
|
51
53
|
recalculate_status
|
52
54
|
end
|
53
55
|
|
56
|
+
##
|
57
|
+
# Ensure that the dataset type exists and is compatible with the project type
|
58
|
+
def hook_check_type(_hook_args, _event_args)
|
59
|
+
check_type
|
60
|
+
end
|
61
|
+
|
54
62
|
##
|
55
63
|
# Run +cmd+ in the command-line with {{variables}}:
|
56
64
|
# dataset, project, project_name, miga, object (if defined for the event)
|