miga-base 0.3.1.6 → 0.3.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +57 -42
- data/lib/miga/result/base.rb +7 -0
- data/lib/miga/result/dates.rb +42 -0
- data/lib/miga/result.rb +4 -0
- data/lib/miga/version.rb +1 -1
- data/scripts/essential_genes.bash +5 -4
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/aasubs.json +75 -75
- data/utils/enveomics/Manifest/Tasks/blasttab.json +194 -185
- data/utils/enveomics/Manifest/Tasks/distances.json +130 -130
- data/utils/enveomics/Manifest/Tasks/fasta.json +51 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +161 -126
- data/utils/enveomics/Manifest/Tasks/graphics.json +111 -111
- data/utils/enveomics/Manifest/Tasks/mapping.json +30 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +308 -265
- data/utils/enveomics/Manifest/Tasks/other.json +451 -449
- data/utils/enveomics/Manifest/Tasks/remote.json +1 -1
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +18 -10
- data/utils/enveomics/Manifest/Tasks/tables.json +250 -250
- data/utils/enveomics/Manifest/Tasks/trees.json +52 -52
- data/utils/enveomics/Manifest/Tasks/variants.json +4 -4
- data/utils/enveomics/Manifest/categories.json +12 -4
- data/utils/enveomics/Manifest/examples.json +1 -1
- data/utils/enveomics/Scripts/BedGraph.tad.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +23 -22
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +272 -258
- data/utils/enveomics/Scripts/aai.rb +13 -6
- data/utils/enveomics/Scripts/ani.rb +2 -2
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +12 -14
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +2 -2
- data/utils/enveomics/Scripts/rbm.rb +23 -14
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/R/barplot.R +2 -2
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b53d716162f9aedbc64f1e54e02ffc293b16a7e7
|
4
|
+
data.tar.gz: a5c46555329c2da1ba1fd165d423e513a27562ef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c94add412b17de6a932ee247e90ef5682afdf5b61cf09a3b6b9baa64d401da09d29915f6b7a6f39a9e8e6e67ba6e7afb5ed2a982e488805e22f16974cedc9ad7
|
7
|
+
data.tar.gz: 2b8e6fcbdc0b4f1b72e43bb02d47e3fb4618773ef4e86c9644b77926429b6f117cc2d965437704c2540fffd26a8576bdfc299fdd927f7fe2c0f2f33a4c961727
|
data/actions/ncbi_get.rb
CHANGED
@@ -7,45 +7,50 @@ require 'miga/remote_dataset'
|
|
7
7
|
|
8
8
|
o = {q:true, query:false, unlink:false,
|
9
9
|
reference: false, ignore_plasmids: false,
|
10
|
-
complete:false, chromosome:false,
|
11
|
-
scaffold:false, contig:false,}
|
10
|
+
complete: false, chromosome: false,
|
11
|
+
scaffold: false, contig: false, add_version: true, dry: false}
|
12
12
|
OptionParser.new do |opt|
|
13
13
|
opt_banner(opt)
|
14
14
|
opt_object(opt, o, [:project])
|
15
|
-
opt.on(
|
16
|
-
|
15
|
+
opt.on('-T', '--taxon STRING',
|
16
|
+
'(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
|
17
17
|
){ |v| o[:taxon]=v }
|
18
|
-
opt.on(
|
19
|
-
|
20
|
-
opt.on(
|
21
|
-
|
18
|
+
opt.on('--reference',
|
19
|
+
'Download all reference genomes (ignores -T).'){ |v| o[:reference]=v }
|
20
|
+
opt.on('--ref-no-plasmids',
|
21
|
+
'If passed, ignores plasmids (only for --reference).'
|
22
22
|
){ |v| o[:ignore_plasmids]=v }
|
23
|
-
opt.on(
|
24
|
-
opt.on(
|
25
|
-
|
26
|
-
opt.on(
|
27
|
-
opt.on(
|
23
|
+
opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
|
24
|
+
opt.on('--chromosome',
|
25
|
+
'Download complete chromosomes.'){ |v| o[:chromosome]=v }
|
26
|
+
opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
|
27
|
+
opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
|
28
|
+
opt.on('--all', 'Download all genomes (in any status).') do
|
28
29
|
o[:complete] = true
|
29
30
|
o[:chromosome] = true
|
30
31
|
o[:scaffold] = true
|
31
32
|
o[:contig] = true
|
32
33
|
end
|
33
|
-
opt.on(
|
34
|
-
|
34
|
+
opt.on('--no-version-name',
|
35
|
+
'Do not add sequence version to the dataset name.',
|
36
|
+
'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
|
37
|
+
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
38
|
+
opt.on('-q', '--query',
|
39
|
+
'Register the datasets as queries, not reference datasets.'
|
35
40
|
){ |v| o[:query]=v }
|
36
|
-
opt.on(
|
37
|
-
|
41
|
+
opt.on('-u', '--unlink',
|
42
|
+
'Unlink all datasets in the project missing from the download list.'
|
38
43
|
){ |v| o[:unlink]=v }
|
39
|
-
opt.on(
|
40
|
-
|
44
|
+
opt.on('-R', '--remote-list PATH',
|
45
|
+
'Path to an output file with the list of all datasets listed remotely.'
|
41
46
|
){ |v| o[:remote_list]=v }
|
42
47
|
opt_common(opt, o)
|
43
48
|
end.parse!
|
44
49
|
|
45
|
-
opt_require(o, project:
|
46
|
-
opt_require(o, taxon:
|
50
|
+
opt_require(o, project: '-P')
|
51
|
+
opt_require(o, taxon: '-T') unless o[:reference]
|
47
52
|
unless %w[reference complete chromosome scaffold contig].any?{ |i| o[i.to_sym] }
|
48
|
-
raise
|
53
|
+
raise 'No action requested. Pick at least one type of genome.'
|
49
54
|
end
|
50
55
|
|
51
56
|
##=> Main <=
|
@@ -57,12 +62,12 @@ ds = {}
|
|
57
62
|
downloaded = 0
|
58
63
|
|
59
64
|
def get_list(taxon, status)
|
60
|
-
url_base =
|
65
|
+
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?'
|
61
66
|
url_param = if status==:reference
|
62
|
-
{ action:
|
67
|
+
{ action: 'refgenomes', download: 'on' }
|
63
68
|
else
|
64
|
-
{ action:
|
65
|
-
subgroup:
|
69
|
+
{ action: 'download', report: 'proks', group: '-- All Prokaryotes --',
|
70
|
+
subgroup: '-- All Prokaryotes --', orgn: "#{taxon}[orgn]",
|
66
71
|
status: status }
|
67
72
|
end
|
68
73
|
url = url_base + URI.encode_www_form(url_param)
|
@@ -75,14 +80,16 @@ end
|
|
75
80
|
|
76
81
|
# Download IDs with reference status
|
77
82
|
if o[:reference]
|
78
|
-
$stderr.puts
|
83
|
+
$stderr.puts 'Downloading reference genomes' unless o[:q]
|
79
84
|
lineno = 0
|
80
85
|
get_list(nil, :reference).each_line do |ln|
|
81
86
|
next if (lineno+=1)==1
|
82
87
|
r = ln.chomp.split("\t")
|
83
88
|
next if r[3].nil? or r[3].empty?
|
84
|
-
ids = r[3].split(
|
85
|
-
ids += r[5].split(
|
89
|
+
ids = r[3].split(',')
|
90
|
+
ids += r[5].split(',') unless o[:ignore_plasmids] or r[5].empty?
|
91
|
+
ids.delete_if{ |i| i =~ /\A\-*\z/ }
|
92
|
+
next if ids.empty?
|
86
93
|
n = r[2].miga_name
|
87
94
|
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
88
95
|
end
|
@@ -90,22 +97,26 @@ end
|
|
90
97
|
|
91
98
|
# Download IDs with complete or chromosome status
|
92
99
|
if o[:complete] or o[:chromosome]
|
93
|
-
status = (o[:complete] and o[:chromosome] ?
|
94
|
-
|
100
|
+
status = (o[:complete] and o[:chromosome] ?
|
101
|
+
'50|40' : o[:complete] ? '50' : '40')
|
102
|
+
$stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
|
95
103
|
lineno = 0
|
96
104
|
get_list(o[:taxon], status).each_line do |ln|
|
97
105
|
next if (lineno+=1)==1
|
98
106
|
r = ln.chomp.split("\t")
|
99
107
|
next if r[10].nil? or r[10].empty?
|
100
|
-
ids = r[10].gsub(/[^:;]*:/,
|
101
|
-
|
108
|
+
ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
|
109
|
+
ids.delete_if{ |i| i =~ /\A\-*\z/ }
|
110
|
+
next if ids.empty?
|
111
|
+
acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
|
112
|
+
n = "#{r[0]}_#{acc}".miga_name
|
102
113
|
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
103
114
|
end
|
104
115
|
end
|
105
116
|
|
106
117
|
# Download IDs with scaffold or contig status
|
107
118
|
if o[:scaffold] or o[:contig]
|
108
|
-
status = (o[:scaffold] and o[:contig] ?
|
119
|
+
status = (o[:scaffold] and o[:contig] ? '30|20' : o[:scaffold] ? '30' : '20')
|
109
120
|
$stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
|
110
121
|
lineno = 0
|
111
122
|
get_list(o[:taxon], status).each_line do |ln|
|
@@ -113,11 +124,14 @@ if o[:scaffold] or o[:contig]
|
|
113
124
|
r = ln.chomp.split("\t")
|
114
125
|
next if r[7].nil? or r[7].empty?
|
115
126
|
next if r[19].nil? or r[19].empty?
|
116
|
-
asm = r[7].gsub(/[^:;]*:/,
|
117
|
-
ids = r[19].gsub(/\s/,
|
118
|
-
|
127
|
+
asm = r[7].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').gsub(/\s/,'')
|
128
|
+
ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
|
129
|
+
map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
|
130
|
+
next if ids.empty?
|
131
|
+
n = "#{r[0]}_#{asm}".miga_name
|
119
132
|
comm = "Assembly: #{asm}"
|
120
|
-
ds[n] = {ids: ids, md: {type: :genome, comments: comm},
|
133
|
+
ds[n] = {ids: ids, md: {type: :genome, comments: comm},
|
134
|
+
db: :assembly_gz, universe: :web}
|
121
135
|
end
|
122
136
|
end
|
123
137
|
|
@@ -127,17 +141,18 @@ ds.each do |name,body|
|
|
127
141
|
d << name
|
128
142
|
puts name
|
129
143
|
next unless p.dataset(name).nil?
|
130
|
-
|
144
|
+
downloaded += 1
|
145
|
+
next if o[:dry]
|
146
|
+
$stderr.puts ' Locating remote dataset.' unless o[:q]
|
131
147
|
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
132
|
-
$stderr.puts
|
148
|
+
$stderr.puts ' Creating dataset.' unless o[:q]
|
133
149
|
rd.save_to(p, name, !o[:query], body[:md])
|
134
150
|
p.add_dataset(name)
|
135
|
-
downloaded += 1
|
136
151
|
end
|
137
152
|
|
138
153
|
# Finalize
|
139
154
|
$stderr.puts "Datasets listed: #{d.size}" unless o[:q]
|
140
|
-
$stderr.puts "Datasets downloaded: #{downloaded}" unless o[:q]
|
155
|
+
$stderr.puts "Datasets #{"to be " if o[:dry]}downloaded: #{downloaded}" unless o[:q]
|
141
156
|
unless o[:remote_list].nil?
|
142
157
|
File.open(o[:remote_list], 'w') do |fh|
|
143
158
|
d.each { |i| fh.puts i }
|
@@ -0,0 +1,42 @@
|
|
1
|
+
|
2
|
+
require "miga/result/base"
|
3
|
+
|
4
|
+
##
|
5
|
+
# Helper module including date-specific functions for results.
|
6
|
+
module MiGA::Result::Dates
|
7
|
+
|
8
|
+
include MiGA::Result::Base
|
9
|
+
|
10
|
+
##
|
11
|
+
# Returns the start date of processing as DateTime or +nil+ if it doesn't exist.
|
12
|
+
def started_at
|
13
|
+
date_at :start
|
14
|
+
end
|
15
|
+
|
16
|
+
##
|
17
|
+
# Returns the end (done) date of processing as DateTime or +nil+ if it doesn't exist.
|
18
|
+
def done_at
|
19
|
+
date_at :done
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Time it took for the result to complete as Float in minutes.
|
24
|
+
def running_time
|
25
|
+
a = started_at or return nil
|
26
|
+
b = done_at or return nil
|
27
|
+
(b - a).to_f * 24 * 60
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
##
|
34
|
+
# Internal function to detect start and end dates
|
35
|
+
def date_at(event)
|
36
|
+
f = path event
|
37
|
+
return nil unless File.size? f
|
38
|
+
DateTime.parse File.read(f)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
data/lib/miga/result.rb
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
+
require "miga/result/dates"
|
5
|
+
|
4
6
|
##
|
5
7
|
# The result from a task run. It can be project-wide or dataset-specific.
|
6
8
|
class MiGA::Result < MiGA::MiGA
|
7
9
|
|
10
|
+
include MiGA::Result::Dates
|
11
|
+
|
8
12
|
# Class-level
|
9
13
|
|
10
14
|
##
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 1,
|
13
|
+
VERSION = [0.3, 1, 7]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -37,10 +37,11 @@ else
|
|
37
37
|
fi
|
38
38
|
|
39
39
|
# Reduce files
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
if exists "$DATASET".ess/*.faa ; then
|
41
|
+
( cd "${DATASET}.ess" \
|
42
|
+
&& tar -zcf proteins.tar.gz *.faa \
|
43
|
+
&& rm *.faa )
|
44
|
+
fi
|
44
45
|
|
45
46
|
# Finalize
|
46
47
|
miga date > "$DATASET.done"
|
data/utils/enveomics/Makefile
CHANGED
@@ -4,99 +4,99 @@
|
|
4
4
|
"task": "AAsubs.log2ratio.rb",
|
5
5
|
"description": ["Estimates the log2-ratio of different amino acids in",
|
6
6
|
"homologous sites using an AAsubs file (see BlastPairwise.AAsubs.pl).",
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
"It provides the point estimation (.obs file), the bootstrap of the",
|
8
|
+
"estimation (.boot file) and the null model based on label-permutation",
|
9
|
+
"(.null file)."],
|
10
10
|
"see_also": ["BlastPairwise.AAsubs.pl"],
|
11
11
|
"cite": [["Konstantinidis et al, 2009, AEM",
|
12
12
|
"http://dx.doi.org/10.1128%2FAEM.00473-09"]],
|
13
13
|
"help_arg": "--help",
|
14
14
|
"options": [
|
15
15
|
{
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
16
|
+
"opt": "--input",
|
17
|
+
"arg": "in_file",
|
18
|
+
"mandatory": true,
|
19
|
+
"description": ["Input file in AAsubs format. It's a tab-delimited",
|
20
|
+
"table where each line corresponds to a substitution, the first",
|
21
|
+
"column corresponds to the compared protein IDs, the second",
|
22
|
+
"and third columns correspond to the AA on each protein, and the",
|
23
|
+
"fourth column indicates the length of the protein (not used by",
|
24
|
+
"this script)."]
|
25
|
+
},
|
26
|
+
{
|
27
|
+
"opt": "--obs-file",
|
28
|
+
"arg": "out_file",
|
29
|
+
"description": ["Output file with the log2-ratios per amino acid.",
|
30
|
+
"By default, 'Input value'.obs."]
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"opt": "--bootstrap-file",
|
34
|
+
"arg": "out_file",
|
35
|
+
"description": ["Output file with the bootstrap results of",
|
36
|
+
"log2-ratios per amino acid. By default, 'Input value'.boot."]
|
37
|
+
},
|
38
|
+
{
|
39
|
+
"opt": "--null-file",
|
40
|
+
"arg": "out_file",
|
41
|
+
"description": ["Output file with the permutation results of",
|
42
|
+
"log2-ratios per amino acid. By default, 'Input value'.null."]
|
43
|
+
},
|
44
|
+
{
|
45
|
+
"opt": "--overwrite",
|
46
|
+
"description": ["Overwrite existing files. By default, skip steps if",
|
47
|
+
"the files already exist."]
|
48
|
+
},
|
49
|
+
{
|
50
|
+
"opt": "--bootstraps",
|
51
|
+
"arg": "integer",
|
52
|
+
"default": 1000,
|
53
|
+
"description": "Number of bootstraps to run."
|
54
|
+
},
|
55
|
+
{
|
56
|
+
"opt": "--permutations",
|
57
|
+
"arg": "integer",
|
58
|
+
"default": 1000,
|
59
|
+
"description": "Number of permutations to run."
|
60
|
+
},
|
61
|
+
{
|
62
|
+
"opt": "--quiet",
|
63
|
+
"description": "Run quietly (no STDERR output)."
|
64
|
+
}
|
65
65
|
]
|
66
66
|
},
|
67
67
|
{
|
68
68
|
"task": "BlastPairwise.AAsubs.pl",
|
69
69
|
"description": ["Counts the different AA substitutions in the best hit",
|
70
70
|
"blast alignments, from a BLASTP pairwise format output (-outfmt 0 in",
|
71
|
-
|
71
|
+
"BLAST+, -m 0 in legacy BLAST)."],
|
72
72
|
"see_also": ["AAsubs.log2ratio.rb"],
|
73
73
|
"cite": [["Konstantinidis et al, 2009, AEM",
|
74
74
|
"http://dx.doi.org/10.1128%2FAEM.00473-09"]],
|
75
75
|
"help_arg": "",
|
76
76
|
"options": [
|
77
77
|
{
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
78
|
+
"name": "Cigar char",
|
79
|
+
"arg": "select",
|
80
|
+
"values": ["+","_"],
|
81
|
+
"mandatory": true,
|
82
|
+
"description": ["Use '+' for similar substitutions, use '_' for non",
|
83
|
+
"similar substitutions."]
|
84
|
+
},
|
85
|
+
{
|
86
|
+
"name": "Blast M0",
|
87
|
+
"arg": "in_file",
|
88
|
+
"mandatory": true,
|
89
|
+
"description": "Blast in 'pairwise text' format (-outfmt/-m 0)."
|
90
|
+
},
|
91
|
+
">",
|
92
|
+
{
|
93
|
+
"name": "AA subs",
|
94
|
+
"arg": "out_file",
|
95
|
+
"mandatory": true,
|
96
|
+
"description": ["A tab-delimited raw file with one substitution per",
|
97
|
+
"row and columns: (1) Name-of-query_Name-of-subject, (2)",
|
98
|
+
"AA-in-subject, (3) AA-in-query, (4) Total-Align-Length."]
|
99
|
+
}
|
100
100
|
]
|
101
101
|
}
|
102
102
|
]
|