miga-base 0.3.1.6 → 0.3.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +57 -42
- data/lib/miga/result/base.rb +7 -0
- data/lib/miga/result/dates.rb +42 -0
- data/lib/miga/result.rb +4 -0
- data/lib/miga/version.rb +1 -1
- data/scripts/essential_genes.bash +5 -4
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/aasubs.json +75 -75
- data/utils/enveomics/Manifest/Tasks/blasttab.json +194 -185
- data/utils/enveomics/Manifest/Tasks/distances.json +130 -130
- data/utils/enveomics/Manifest/Tasks/fasta.json +51 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +161 -126
- data/utils/enveomics/Manifest/Tasks/graphics.json +111 -111
- data/utils/enveomics/Manifest/Tasks/mapping.json +30 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +308 -265
- data/utils/enveomics/Manifest/Tasks/other.json +451 -449
- data/utils/enveomics/Manifest/Tasks/remote.json +1 -1
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +18 -10
- data/utils/enveomics/Manifest/Tasks/tables.json +250 -250
- data/utils/enveomics/Manifest/Tasks/trees.json +52 -52
- data/utils/enveomics/Manifest/Tasks/variants.json +4 -4
- data/utils/enveomics/Manifest/categories.json +12 -4
- data/utils/enveomics/Manifest/examples.json +1 -1
- data/utils/enveomics/Scripts/BedGraph.tad.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +23 -22
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +272 -258
- data/utils/enveomics/Scripts/aai.rb +13 -6
- data/utils/enveomics/Scripts/ani.rb +2 -2
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +12 -14
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +2 -2
- data/utils/enveomics/Scripts/rbm.rb +23 -14
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/R/barplot.R +2 -2
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b53d716162f9aedbc64f1e54e02ffc293b16a7e7
|
4
|
+
data.tar.gz: a5c46555329c2da1ba1fd165d423e513a27562ef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c94add412b17de6a932ee247e90ef5682afdf5b61cf09a3b6b9baa64d401da09d29915f6b7a6f39a9e8e6e67ba6e7afb5ed2a982e488805e22f16974cedc9ad7
|
7
|
+
data.tar.gz: 2b8e6fcbdc0b4f1b72e43bb02d47e3fb4618773ef4e86c9644b77926429b6f117cc2d965437704c2540fffd26a8576bdfc299fdd927f7fe2c0f2f33a4c961727
|
data/actions/ncbi_get.rb
CHANGED
@@ -7,45 +7,50 @@ require 'miga/remote_dataset'
|
|
7
7
|
|
8
8
|
o = {q:true, query:false, unlink:false,
|
9
9
|
reference: false, ignore_plasmids: false,
|
10
|
-
complete:false, chromosome:false,
|
11
|
-
scaffold:false, contig:false,}
|
10
|
+
complete: false, chromosome: false,
|
11
|
+
scaffold: false, contig: false, add_version: true, dry: false}
|
12
12
|
OptionParser.new do |opt|
|
13
13
|
opt_banner(opt)
|
14
14
|
opt_object(opt, o, [:project])
|
15
|
-
opt.on(
|
16
|
-
|
15
|
+
opt.on('-T', '--taxon STRING',
|
16
|
+
'(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
|
17
17
|
){ |v| o[:taxon]=v }
|
18
|
-
opt.on(
|
19
|
-
|
20
|
-
opt.on(
|
21
|
-
|
18
|
+
opt.on('--reference',
|
19
|
+
'Download all reference genomes (ignores -T).'){ |v| o[:reference]=v }
|
20
|
+
opt.on('--ref-no-plasmids',
|
21
|
+
'If passed, ignores plasmids (only for --reference).'
|
22
22
|
){ |v| o[:ignore_plasmids]=v }
|
23
|
-
opt.on(
|
24
|
-
opt.on(
|
25
|
-
|
26
|
-
opt.on(
|
27
|
-
opt.on(
|
23
|
+
opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
|
24
|
+
opt.on('--chromosome',
|
25
|
+
'Download complete chromosomes.'){ |v| o[:chromosome]=v }
|
26
|
+
opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
|
27
|
+
opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
|
28
|
+
opt.on('--all', 'Download all genomes (in any status).') do
|
28
29
|
o[:complete] = true
|
29
30
|
o[:chromosome] = true
|
30
31
|
o[:scaffold] = true
|
31
32
|
o[:contig] = true
|
32
33
|
end
|
33
|
-
opt.on(
|
34
|
-
|
34
|
+
opt.on('--no-version-name',
|
35
|
+
'Do not add sequence version to the dataset name.',
|
36
|
+
'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
|
37
|
+
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
38
|
+
opt.on('-q', '--query',
|
39
|
+
'Register the datasets as queries, not reference datasets.'
|
35
40
|
){ |v| o[:query]=v }
|
36
|
-
opt.on(
|
37
|
-
|
41
|
+
opt.on('-u', '--unlink',
|
42
|
+
'Unlink all datasets in the project missing from the download list.'
|
38
43
|
){ |v| o[:unlink]=v }
|
39
|
-
opt.on(
|
40
|
-
|
44
|
+
opt.on('-R', '--remote-list PATH',
|
45
|
+
'Path to an output file with the list of all datasets listed remotely.'
|
41
46
|
){ |v| o[:remote_list]=v }
|
42
47
|
opt_common(opt, o)
|
43
48
|
end.parse!
|
44
49
|
|
45
|
-
opt_require(o, project:
|
46
|
-
opt_require(o, taxon:
|
50
|
+
opt_require(o, project: '-P')
|
51
|
+
opt_require(o, taxon: '-T') unless o[:reference]
|
47
52
|
unless %w[reference complete chromosome scaffold contig].any?{ |i| o[i.to_sym] }
|
48
|
-
raise
|
53
|
+
raise 'No action requested. Pick at least one type of genome.'
|
49
54
|
end
|
50
55
|
|
51
56
|
##=> Main <=
|
@@ -57,12 +62,12 @@ ds = {}
|
|
57
62
|
downloaded = 0
|
58
63
|
|
59
64
|
def get_list(taxon, status)
|
60
|
-
url_base =
|
65
|
+
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?'
|
61
66
|
url_param = if status==:reference
|
62
|
-
{ action:
|
67
|
+
{ action: 'refgenomes', download: 'on' }
|
63
68
|
else
|
64
|
-
{ action:
|
65
|
-
subgroup:
|
69
|
+
{ action: 'download', report: 'proks', group: '-- All Prokaryotes --',
|
70
|
+
subgroup: '-- All Prokaryotes --', orgn: "#{taxon}[orgn]",
|
66
71
|
status: status }
|
67
72
|
end
|
68
73
|
url = url_base + URI.encode_www_form(url_param)
|
@@ -75,14 +80,16 @@ end
|
|
75
80
|
|
76
81
|
# Download IDs with reference status
|
77
82
|
if o[:reference]
|
78
|
-
$stderr.puts
|
83
|
+
$stderr.puts 'Downloading reference genomes' unless o[:q]
|
79
84
|
lineno = 0
|
80
85
|
get_list(nil, :reference).each_line do |ln|
|
81
86
|
next if (lineno+=1)==1
|
82
87
|
r = ln.chomp.split("\t")
|
83
88
|
next if r[3].nil? or r[3].empty?
|
84
|
-
ids = r[3].split(
|
85
|
-
ids += r[5].split(
|
89
|
+
ids = r[3].split(',')
|
90
|
+
ids += r[5].split(',') unless o[:ignore_plasmids] or r[5].empty?
|
91
|
+
ids.delete_if{ |i| i =~ /\A\-*\z/ }
|
92
|
+
next if ids.empty?
|
86
93
|
n = r[2].miga_name
|
87
94
|
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
88
95
|
end
|
@@ -90,22 +97,26 @@ end
|
|
90
97
|
|
91
98
|
# Download IDs with complete or chromosome status
|
92
99
|
if o[:complete] or o[:chromosome]
|
93
|
-
status = (o[:complete] and o[:chromosome] ?
|
94
|
-
|
100
|
+
status = (o[:complete] and o[:chromosome] ?
|
101
|
+
'50|40' : o[:complete] ? '50' : '40')
|
102
|
+
$stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
|
95
103
|
lineno = 0
|
96
104
|
get_list(o[:taxon], status).each_line do |ln|
|
97
105
|
next if (lineno+=1)==1
|
98
106
|
r = ln.chomp.split("\t")
|
99
107
|
next if r[10].nil? or r[10].empty?
|
100
|
-
ids = r[10].gsub(/[^:;]*:/,
|
101
|
-
|
108
|
+
ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
|
109
|
+
ids.delete_if{ |i| i =~ /\A\-*\z/ }
|
110
|
+
next if ids.empty?
|
111
|
+
acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
|
112
|
+
n = "#{r[0]}_#{acc}".miga_name
|
102
113
|
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
103
114
|
end
|
104
115
|
end
|
105
116
|
|
106
117
|
# Download IDs with scaffold or contig status
|
107
118
|
if o[:scaffold] or o[:contig]
|
108
|
-
status = (o[:scaffold] and o[:contig] ?
|
119
|
+
status = (o[:scaffold] and o[:contig] ? '30|20' : o[:scaffold] ? '30' : '20')
|
109
120
|
$stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
|
110
121
|
lineno = 0
|
111
122
|
get_list(o[:taxon], status).each_line do |ln|
|
@@ -113,11 +124,14 @@ if o[:scaffold] or o[:contig]
|
|
113
124
|
r = ln.chomp.split("\t")
|
114
125
|
next if r[7].nil? or r[7].empty?
|
115
126
|
next if r[19].nil? or r[19].empty?
|
116
|
-
asm = r[7].gsub(/[^:;]*:/,
|
117
|
-
ids = r[19].gsub(/\s/,
|
118
|
-
|
127
|
+
asm = r[7].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').gsub(/\s/,'')
|
128
|
+
ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
|
129
|
+
map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
|
130
|
+
next if ids.empty?
|
131
|
+
n = "#{r[0]}_#{asm}".miga_name
|
119
132
|
comm = "Assembly: #{asm}"
|
120
|
-
ds[n] = {ids: ids, md: {type: :genome, comments: comm},
|
133
|
+
ds[n] = {ids: ids, md: {type: :genome, comments: comm},
|
134
|
+
db: :assembly_gz, universe: :web}
|
121
135
|
end
|
122
136
|
end
|
123
137
|
|
@@ -127,17 +141,18 @@ ds.each do |name,body|
|
|
127
141
|
d << name
|
128
142
|
puts name
|
129
143
|
next unless p.dataset(name).nil?
|
130
|
-
|
144
|
+
downloaded += 1
|
145
|
+
next if o[:dry]
|
146
|
+
$stderr.puts ' Locating remote dataset.' unless o[:q]
|
131
147
|
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
132
|
-
$stderr.puts
|
148
|
+
$stderr.puts ' Creating dataset.' unless o[:q]
|
133
149
|
rd.save_to(p, name, !o[:query], body[:md])
|
134
150
|
p.add_dataset(name)
|
135
|
-
downloaded += 1
|
136
151
|
end
|
137
152
|
|
138
153
|
# Finalize
|
139
154
|
$stderr.puts "Datasets listed: #{d.size}" unless o[:q]
|
140
|
-
$stderr.puts "Datasets downloaded: #{downloaded}" unless o[:q]
|
155
|
+
$stderr.puts "Datasets #{"to be " if o[:dry]}downloaded: #{downloaded}" unless o[:q]
|
141
156
|
unless o[:remote_list].nil?
|
142
157
|
File.open(o[:remote_list], 'w') do |fh|
|
143
158
|
d.each { |i| fh.puts i }
|
@@ -0,0 +1,42 @@
|
|
1
|
+
|
2
|
+
require "miga/result/base"
|
3
|
+
|
4
|
+
##
|
5
|
+
# Helper module including date-specific functions for results.
|
6
|
+
module MiGA::Result::Dates
|
7
|
+
|
8
|
+
include MiGA::Result::Base
|
9
|
+
|
10
|
+
##
|
11
|
+
# Returns the start date of processing as DateTime or +nil+ if it doesn't exist.
|
12
|
+
def started_at
|
13
|
+
date_at :start
|
14
|
+
end
|
15
|
+
|
16
|
+
##
|
17
|
+
# Returns the end (done) date of processing as DateTime or +nil+ if it doesn't exist.
|
18
|
+
def done_at
|
19
|
+
date_at :done
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Time it took for the result to complete as Float in minutes.
|
24
|
+
def running_time
|
25
|
+
a = started_at or return nil
|
26
|
+
b = done_at or return nil
|
27
|
+
(b - a).to_f * 24 * 60
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
##
|
34
|
+
# Internal function to detect start and end dates
|
35
|
+
def date_at(event)
|
36
|
+
f = path event
|
37
|
+
return nil unless File.size? f
|
38
|
+
DateTime.parse File.read(f)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
data/lib/miga/result.rb
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
+
require "miga/result/dates"
|
5
|
+
|
4
6
|
##
|
5
7
|
# The result from a task run. It can be project-wide or dataset-specific.
|
6
8
|
class MiGA::Result < MiGA::MiGA
|
7
9
|
|
10
|
+
include MiGA::Result::Dates
|
11
|
+
|
8
12
|
# Class-level
|
9
13
|
|
10
14
|
##
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 1,
|
13
|
+
VERSION = [0.3, 1, 7]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -37,10 +37,11 @@ else
|
|
37
37
|
fi
|
38
38
|
|
39
39
|
# Reduce files
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
if exists "$DATASET".ess/*.faa ; then
|
41
|
+
( cd "${DATASET}.ess" \
|
42
|
+
&& tar -zcf proteins.tar.gz *.faa \
|
43
|
+
&& rm *.faa )
|
44
|
+
fi
|
44
45
|
|
45
46
|
# Finalize
|
46
47
|
miga date > "$DATASET.done"
|
data/utils/enveomics/Makefile
CHANGED
@@ -4,99 +4,99 @@
|
|
4
4
|
"task": "AAsubs.log2ratio.rb",
|
5
5
|
"description": ["Estimates the log2-ratio of different amino acids in",
|
6
6
|
"homologous sites using an AAsubs file (see BlastPairwise.AAsubs.pl).",
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
"It provides the point estimation (.obs file), the bootstrap of the",
|
8
|
+
"estimation (.boot file) and the null model based on label-permutation",
|
9
|
+
"(.null file)."],
|
10
10
|
"see_also": ["BlastPairwise.AAsubs.pl"],
|
11
11
|
"cite": [["Konstantinidis et al, 2009, AEM",
|
12
12
|
"http://dx.doi.org/10.1128%2FAEM.00473-09"]],
|
13
13
|
"help_arg": "--help",
|
14
14
|
"options": [
|
15
15
|
{
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
16
|
+
"opt": "--input",
|
17
|
+
"arg": "in_file",
|
18
|
+
"mandatory": true,
|
19
|
+
"description": ["Input file in AAsubs format. It's a tab-delimited",
|
20
|
+
"table where each line corresponds to a substitution, the first",
|
21
|
+
"column corresponds to the compared protein IDs, the second",
|
22
|
+
"and third columns correspond to the AA on each protein, and the",
|
23
|
+
"fourth column indicates the length of the protein (not used by",
|
24
|
+
"this script."]
|
25
|
+
},
|
26
|
+
{
|
27
|
+
"opt": "--obs-file",
|
28
|
+
"arg": "out_file",
|
29
|
+
"description": ["Output file with the log2-ratios per amino acid.",
|
30
|
+
"By default, 'Input value'.obs."]
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"opt": "--bootstrap-file",
|
34
|
+
"arg": "out_file",
|
35
|
+
"description": ["Output file with the bootstrap results of",
|
36
|
+
"log2-ratios per amino acid. By default, 'Input value'.boot."]
|
37
|
+
},
|
38
|
+
{
|
39
|
+
"opt": "--null-file",
|
40
|
+
"arg": "out_file",
|
41
|
+
"description": ["Output file with the permutation results of",
|
42
|
+
"log2-ratios per amino acid. By default, 'Input value'.null."]
|
43
|
+
},
|
44
|
+
{
|
45
|
+
"opt": "--overwrite",
|
46
|
+
"description": ["Overwrite existing files. By default, skip steps if",
|
47
|
+
"the files already exist."]
|
48
|
+
},
|
49
|
+
{
|
50
|
+
"opt": "--bootstraps",
|
51
|
+
"arg": "integer",
|
52
|
+
"default": 1000,
|
53
|
+
"description": "Number of bootstraps to run."
|
54
|
+
},
|
55
|
+
{
|
56
|
+
"opt": "--permutations",
|
57
|
+
"arg": "integer",
|
58
|
+
"default": 1000,
|
59
|
+
"description": "Number of permutations to run."
|
60
|
+
},
|
61
|
+
{
|
62
|
+
"opt": "--quiet",
|
63
|
+
"description": "Run quietly (no STDERR output)."
|
64
|
+
}
|
65
65
|
]
|
66
66
|
},
|
67
67
|
{
|
68
68
|
"task": "BlastPairwise.AAsubs.pl",
|
69
69
|
"description": ["Counts the different AA substitutions in the best hit",
|
70
70
|
"blast alignments, from a BLASTP pairwise format output (-outfmt 0 in",
|
71
|
-
|
71
|
+
"BLAST+, -m 0 in legacy BLAST)."],
|
72
72
|
"see_also": ["AAsubs.log2ratio.rb"],
|
73
73
|
"cite": [["Konstantinidis et al, 2009, AEM",
|
74
74
|
"http://dx.doi.org/10.1128%2FAEM.00473-09"]],
|
75
75
|
"help_arg": "",
|
76
76
|
"options": [
|
77
77
|
{
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
78
|
+
"name": "Cigar char",
|
79
|
+
"arg": "select",
|
80
|
+
"values": ["+","_"],
|
81
|
+
"mandatory": true,
|
82
|
+
"description": ["Use '+' for similar substitutions, use '_' for non",
|
83
|
+
"similar substitutions."]
|
84
|
+
},
|
85
|
+
{
|
86
|
+
"name": "Blast M0",
|
87
|
+
"arg": "in_file",
|
88
|
+
"mandatory": true,
|
89
|
+
"description": "Blast in 'pairwise text' format (-outfmt/-m 0)."
|
90
|
+
},
|
91
|
+
">",
|
92
|
+
{
|
93
|
+
"name": "AA subs",
|
94
|
+
"arg": "out_file",
|
95
|
+
"mandatory": true,
|
96
|
+
"description": ["A tab-delimited raw file with one substitution per",
|
97
|
+
"row and columns: (1) Name-of-query_Name-of-subject, (2)",
|
98
|
+
"AA-in-subject, (3) AA-in-query, (4) Total-Align-Length."]
|
99
|
+
}
|
100
100
|
]
|
101
101
|
}
|
102
102
|
]
|