miga-base 0.3.3.0 → 0.3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +7 -0
- data/actions/about.rb +1 -1
- data/actions/ls.rb +1 -1
- data/actions/summary.rb +4 -2
- data/lib/miga/remote_dataset/base.rb +59 -0
- data/lib/miga/remote_dataset.rb +4 -44
- data/lib/miga/version.rb +2 -2
- data/scripts/clade_finding.bash +2 -30
- data/scripts/subclades.bash +1 -9
- data/utils/distance/runner.rb +3 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/subclade/base.rb +6 -0
- data/utils/subclade/pipeline.rb +54 -0
- data/utils/subclade/runner.rb +51 -0
- data/utils/subclade/temporal.rb +14 -0
- data/utils/subclades-compile.rb +8 -9
- data/utils/subclades.R +80 -31
- data/utils/subclades.rb +9 -0
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b1b32b7800278dc330c5c8e01f4b94dfd1d97750
|
4
|
+
data.tar.gz: 2c3b6ef0e73568df8775fb98c65d454cdcf0f411
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 590a41c7bc94f5d36a53e0b9eb4f096211ccdae8724e63948480e0b57c8b7fa24a5779534868c7cb13405b3360f35be00d977728886ba7b7491ef5aeebb0bc0d
|
7
|
+
data.tar.gz: 64a273f14eea3aec6f9c8cfb388bdae4bdf2027d7ce95d89b1e8a27799e51420d2f58b42a58871037ee4a0e7f616f92e8ac485a12dbe226d7ab5cda99792f286
|
data/README.md
CHANGED
@@ -30,6 +30,13 @@ You have two options:
|
|
30
30
|
[installation instructions](manual/part2/installation.md). Once you have MiGA
|
31
31
|
installed, you can [deploy some examples](manual/part4.md).
|
32
32
|
|
33
|
+
# How to cite MiGA
|
34
|
+
|
35
|
+
> Rodriguez-R *et al*. 2018. The Microbial Genomes Atlas (MiGA) webserver:
|
36
|
+
> taxonomic and gene diversity analysis of Archaea and Bacteria at the whole
|
37
|
+
> genome level. *Nucleic Acids Research* 46(W1):W282-W288.
|
38
|
+
> [doi:10.1093/nar/gky467](https://doi.org/10.1093/nar/gky467).
|
39
|
+
|
33
40
|
# Authors
|
34
41
|
|
35
42
|
Developed and maintained by [Luis M. Rodriguez-R][lrr]. MiGA is the result of a
|
data/actions/about.rb
CHANGED
@@ -12,7 +12,7 @@ OptionParser.new do |opt|
|
|
12
12
|
opt.on("-m", "--metadata STRING",
|
13
13
|
"Print name and metadata field only."
|
14
14
|
){ |v| o[:datum]=v }
|
15
|
-
opt.on("--tab
|
15
|
+
opt.on("--tab",
|
16
16
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
17
17
|
opt_common(opt, o)
|
18
18
|
end.parse!
|
data/actions/ls.rb
CHANGED
@@ -15,7 +15,7 @@ OptionParser.new do |opt|
|
|
15
15
|
opt.on("-m", "--metadata STRING",
|
16
16
|
"Print name and metadata field only. If set, ignores -i."
|
17
17
|
){ |v| o[:datum]=v }
|
18
|
-
opt.on("--tab
|
18
|
+
opt.on("--tab",
|
19
19
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
20
20
|
opt.on("-s", "--silent",
|
21
21
|
"No output and exit with non-zero status if the dataset list is empty."
|
data/actions/summary.rb
CHANGED
@@ -6,8 +6,10 @@
|
|
6
6
|
o = {q:true, units:false, tabular:false}
|
7
7
|
opts = OptionParser.new do |opt|
|
8
8
|
opt_banner(opt)
|
9
|
-
opt_object(opt, o, [:project, :dataset_opt
|
10
|
-
opt
|
9
|
+
opt_object(opt, o, [:project, :dataset_opt])
|
10
|
+
opt_filter_datasets(opt, o)
|
11
|
+
opt_object(opt, o, [:result_dataset])
|
12
|
+
opt.on("--tab",
|
11
13
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
12
14
|
opt.on("--key STRING",
|
13
15
|
"Returns only the value of the requested key."){ |v| o[:key] = v }
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
require 'restclient'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
class MiGA::RemoteDataset < MiGA::MiGA
|
6
|
+
|
7
|
+
# Class-level
|
8
|
+
class << self
|
9
|
+
def UNIVERSE ; @@UNIVERSE ; end
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
module MiGA::RemoteDataset::Base
|
15
|
+
|
16
|
+
@@_EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
17
|
+
|
18
|
+
##
|
19
|
+
# Structure of the different database Universes or containers. The structure
|
20
|
+
# is a Hash with universe names as keys as Symbol and values being a Hash with
|
21
|
+
# supported keys as Symbol:
|
22
|
+
# - +:dbs+ => Hash with keys being the database name and the values a Hash of
|
23
|
+
# properties such as +stage+, +format+, and +map_to+.
|
24
|
+
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
25
|
+
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
26
|
+
# - +method+ => Method used to query the URL. Only +:rest+ is currently
|
27
|
+
# supported.
|
28
|
+
# - +map_to_universe+ => Universe where results map to. Currently unsupported.
|
29
|
+
@@UNIVERSE = {
|
30
|
+
web:{
|
31
|
+
dbs: {
|
32
|
+
assembly:{stage: :assembly, format: :fasta},
|
33
|
+
assembly_gz:{stage: :assembly, format: :fasta_gz}
|
34
|
+
},
|
35
|
+
url: "%2$s",
|
36
|
+
method: :net
|
37
|
+
},
|
38
|
+
ebi:{
|
39
|
+
dbs: { embl:{stage: :assembly, format: :fasta} },
|
40
|
+
url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
41
|
+
method: :rest
|
42
|
+
},
|
43
|
+
ncbi:{
|
44
|
+
dbs: { nuccore:{stage: :assembly, format: :fasta} },
|
45
|
+
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
46
|
+
method: :rest
|
47
|
+
},
|
48
|
+
ncbi_map:{
|
49
|
+
dbs: { assembly:{map_to: :nuccore, format: :text} },
|
50
|
+
# FIXME ncbi_map is intended to do internal NCBI mapping between
|
51
|
+
# databases.
|
52
|
+
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
|
53
|
+
method: :rest,
|
54
|
+
map_to_universe: :ncbi
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
end
|
59
|
+
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -1,55 +1,15 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require "open-uri"
|
4
|
+
require 'miga/remote_dataset/base'
|
6
5
|
|
7
6
|
##
|
8
7
|
# MiGA representation of datasets with data in remote locations.
|
9
8
|
class MiGA::RemoteDataset < MiGA::MiGA
|
10
|
-
# Class-level
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
# is a Hash with universe names as keys as Symbol and values being a Hash with
|
16
|
-
# supported keys as Symbol:
|
17
|
-
# - +:dbs+ => Hash with keys being the database name and the values a Hash of
|
18
|
-
# properties such as +stage+, +format+, and +map_to+.
|
19
|
-
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
20
|
-
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
21
|
-
# - +method+ => Method used to query the URL. Only +:rest+ is currently
|
22
|
-
# supported.
|
23
|
-
# - +map_to_universe+ => Universe where results map to. Currently unsupported.
|
24
|
-
def self.UNIVERSE ; @@UNIVERSE ; end
|
25
|
-
@@UNIVERSE = {
|
26
|
-
web:{
|
27
|
-
dbs: {
|
28
|
-
assembly:{stage: :assembly, format: :fasta},
|
29
|
-
assembly_gz:{stage: :assembly, format: :fasta_gz}
|
30
|
-
},
|
31
|
-
url: "%2$s",
|
32
|
-
method: :net
|
33
|
-
},
|
34
|
-
ebi:{
|
35
|
-
dbs: { embl:{stage: :assembly, format: :fasta} },
|
36
|
-
url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
37
|
-
method: :rest
|
38
|
-
},
|
39
|
-
ncbi:{
|
40
|
-
dbs: { nuccore:{stage: :assembly, format: :fasta} },
|
41
|
-
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
42
|
-
method: :rest
|
43
|
-
},
|
44
|
-
ncbi_map:{
|
45
|
-
dbs: { assembly:{map_to: :nuccore, format: :text} },
|
46
|
-
# FIXME ncbi_map is intended to do internal NCBI mapping between
|
47
|
-
# databases.
|
48
|
-
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
|
49
|
-
method: :rest,
|
50
|
-
map_to_universe: :ncbi
|
51
|
-
}
|
52
|
-
}
|
10
|
+
include MiGA::RemoteDataset::Base
|
11
|
+
|
12
|
+
# Class-level
|
53
13
|
|
54
14
|
##
|
55
15
|
# Download data from the +universe+ in the database +db+ with IDs +ids+ and
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 3,
|
13
|
+
VERSION = [0.3, 3, 1]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -25,7 +25,7 @@ module MiGA
|
|
25
25
|
CITATION = "Rodriguez-R et al (2018). " +
|
26
26
|
"The Microbial Genomes Atlas (MiGA) webserver: taxonomic and gene " +
|
27
27
|
"diversity analysis of Archaea and Bacteria at the whole genome level. " +
|
28
|
-
"Nucleic Acids Research
|
28
|
+
"Nucleic Acids Research 46(W1):W282-W288. doi:10.1093/nar/gky467."
|
29
29
|
|
30
30
|
end
|
31
31
|
|
data/scripts/clade_finding.bash
CHANGED
@@ -11,36 +11,8 @@ cd "$PROJECT/data/10.clades/01.find"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
-
#
|
15
|
-
|
16
|
-
| awk -F"\\t" '$4>=90{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
|
17
|
-
> genome-genome.aai90.rbm
|
18
|
-
ogs.mcl.rb -d . -o miga-project.aai90-clades -t "$CORES" -i \
|
19
|
-
-f "(\\S+)-(\\S+)\\.aai90\\.rbm"
|
20
|
-
rm genome-genome.aai90.rbm
|
21
|
-
gunzip -c ../../09.distances/02.aai/miga-project.txt.gz | tail -n+2 \
|
22
|
-
| awk -F"\\t" '$4>=95{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
|
23
|
-
> genome-genome.ani95.rbm
|
24
|
-
ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
|
25
|
-
-f "(\\S+)-(\\S+)\\.ani95\\.rbm"
|
26
|
-
rm genome-genome.ani95.rbm
|
27
|
-
|
28
|
-
# Propose clade projects
|
29
|
-
tail -n +2 miga-project.ani95-clades | tr "," "\\t" | awk 'NF >= 5' \
|
30
|
-
> miga-project.proposed-clades
|
31
|
-
|
32
|
-
# Run R code (except in projects type clade)
|
33
|
-
if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
|
34
|
-
"$MIGA/utils/subclades.R" \
|
35
|
-
../../09.distances/02.aai/miga-project.txt.gz \
|
36
|
-
miga-project "$CORES"
|
37
|
-
mv miga-project.nwk miga-project.aai.nwk
|
38
|
-
|
39
|
-
# Compile
|
40
|
-
ruby "$MIGA/utils/subclades-compile.rb" . \
|
41
|
-
> miga-project.class.tsv \
|
42
|
-
2> miga-project.class.nwk
|
43
|
-
fi
|
14
|
+
# Run
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
44
16
|
|
45
17
|
# Finalize
|
46
18
|
miga date > "miga-project.done"
|
data/scripts/subclades.bash
CHANGED
@@ -12,15 +12,7 @@ cd "$PROJECT/data/10.clades/02.ani"
|
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
14
|
# Run R code
|
15
|
-
"$MIGA/utils/subclades.
|
16
|
-
../../09.distances/03.ani/miga-project.txt.gz \
|
17
|
-
miga-project "$CORES"
|
18
|
-
mv miga-project.nwk miga-project.ani.nwk
|
19
|
-
|
20
|
-
# Compile
|
21
|
-
ruby "$MIGA/utils/subclades-compile.rb" . \
|
22
|
-
> miga-project.class.tsv \
|
23
|
-
2> miga-project.class.nwk
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
24
16
|
|
25
17
|
# Finalize
|
26
18
|
miga date > "miga-project.done"
|
data/utils/distance/runner.rb
CHANGED
@@ -30,8 +30,10 @@ class MiGA::DistanceRunner
|
|
30
30
|
if opts[:run_taxonomy] && project.metadata[:ref_project]
|
31
31
|
@home = File.expand_path('05.taxonomy', @home)
|
32
32
|
@ref_project = MiGA::Project.load(project.metadata[:ref_project])
|
33
|
+
raise "Cannot load reference project: #{project.metadata[:ref_project]}" if @ref_project.nil?
|
34
|
+
else
|
35
|
+
@ref_project = project
|
33
36
|
end
|
34
|
-
@ref_project ||= project
|
35
37
|
[:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |m|
|
36
38
|
@opts[m] ||= ref_project.metadata[m]
|
37
39
|
end
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.N50.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.filterN.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.length.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.split.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../enveomics.R
|
@@ -0,0 +1,54 @@
|
|
1
|
+
|
2
|
+
# High-end pipelines for SubcladeRunner
|
3
|
+
module MiGA::SubcladeRunner::Pipeline
|
4
|
+
|
5
|
+
# Run species-level clusterings using ANI>95% / AAI>90%
|
6
|
+
def cluster_species
|
7
|
+
tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
|
8
|
+
tasks.each do |k, par|
|
9
|
+
# Final output
|
10
|
+
ogs_file = "miga-project.#{k}-clades"
|
11
|
+
next if File.size? ogs_file
|
12
|
+
|
13
|
+
# Build ABC files
|
14
|
+
abc_path = tmp_file("#{k}.abc")
|
15
|
+
ofh = File.open(abc_path, 'w')
|
16
|
+
metric_res = project.result(par[0]) or raise "Incomplete step #{par[0]}"
|
17
|
+
Zlib::GzipReader.open(metric_res.file_path(:matrix)) do |ifh|
|
18
|
+
ifh.each_line do |ln|
|
19
|
+
next if ln =~ /^metric\t/
|
20
|
+
r = ln.chomp.split("\t")
|
21
|
+
ofh.puts "G>#{r[1]}\tG>#{r[2]}\t#{r[3]}" if r[3].to_f >= par[1]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
ofh.close
|
25
|
+
# Cluster genomes
|
26
|
+
`ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
|
+
end
|
28
|
+
# Propose clades
|
29
|
+
ofh = File.open('miga-project.proposed-clades', 'w')
|
30
|
+
File.open('miga-project.ani95-clades', 'r') do |ifh|
|
31
|
+
ifh.each_line do |ln|
|
32
|
+
next if $.==1
|
33
|
+
r = ln.chomp.split(',')
|
34
|
+
ofh.puts r.join("\t") if r.size >= 5
|
35
|
+
end
|
36
|
+
end
|
37
|
+
ofh.close
|
38
|
+
end
|
39
|
+
|
40
|
+
def subclades metric
|
41
|
+
src = File.expand_path('utils/subclades.R', MiGA::MiGA.root_path)
|
42
|
+
step = :"#{metric}_distances"
|
43
|
+
metric_res = project.result(step) or raise "Incomplete step #{step}"
|
44
|
+
matrix = metric_res.file_path(:matrix)
|
45
|
+
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'`
|
46
|
+
File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
|
47
|
+
File.exist? 'miga-project.nwk'
|
48
|
+
end
|
49
|
+
|
50
|
+
def compile
|
51
|
+
src = File.expand_path('utils/subclades-compile.rb', MiGA::MiGA.root_path)
|
52
|
+
`ruby '#{src}' '.' 'miga-project.class'`
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
|
2
|
+
require_relative 'base.rb'
|
3
|
+
require_relative 'temporal.rb'
|
4
|
+
require_relative 'pipeline.rb'
|
5
|
+
|
6
|
+
|
7
|
+
class MiGA::SubcladeRunner
|
8
|
+
|
9
|
+
include MiGA::SubcladeRunner::Temporal
|
10
|
+
include MiGA::SubcladeRunner::Pipeline
|
11
|
+
|
12
|
+
attr_reader :project, :step, :opts, :home, :tmp
|
13
|
+
|
14
|
+
def initialize(project_path, step, opts_hash={})
|
15
|
+
@opts = opts_hash
|
16
|
+
@project = MiGA::Project.load(project_path) or
|
17
|
+
raise "No project at #{project_path}"
|
18
|
+
@step = step.to_sym
|
19
|
+
clades_dir = File.expand_path('data/10.clades', project.path)
|
20
|
+
@home = File.expand_path(@step==:clade_finding ? '01.find' : '02.ani',
|
21
|
+
clades_dir)
|
22
|
+
@opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
|
23
|
+
end
|
24
|
+
|
25
|
+
# Launch the appropriate analysis
|
26
|
+
def go!
|
27
|
+
return if project.type == :metagenomes
|
28
|
+
Dir.chdir home
|
29
|
+
Dir.mktmpdir do |tmp_dir|
|
30
|
+
@tmp = tmp_dir
|
31
|
+
create_temporals
|
32
|
+
step==:clade_finding ? go_clade_finding! : go_subclades!
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Launch analysis for clade_finding
|
37
|
+
def go_clade_finding!
|
38
|
+
cluster_species
|
39
|
+
unless project.is_clade?
|
40
|
+
subclades :aai
|
41
|
+
compile
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# Launch analysis for subclades
|
46
|
+
def go_subclades!
|
47
|
+
subclades :ani
|
48
|
+
compile
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/utils/subclades-compile.rb
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R
|
4
|
-
# @update Jan-15-2016
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
2
|
|
8
3
|
$:.push File.expand_path(File.dirname(__FILE__) + "/lib")
|
9
|
-
dir = ARGV.shift
|
4
|
+
dir = ARGV.shift
|
5
|
+
out = ARGV.shift or abort "Usage: #{$0} <classif.dir> <out.base>"
|
10
6
|
|
11
7
|
def read_classif(dir, classif={})
|
12
8
|
classif_file = File.expand_path("miga-project.classif", dir)
|
13
9
|
return classif unless File.exist? classif_file
|
10
|
+
ready = File.expand_path('miga-project.ready', dir)
|
11
|
+
File.size?(ready) or raise "Incomplete recursion found at #{dir}"
|
12
|
+
File.unlink ready
|
14
13
|
fh = File.open(classif_file, "r")
|
15
14
|
klass = []
|
16
15
|
while ln = fh.gets
|
@@ -44,7 +43,7 @@ end
|
|
44
43
|
|
45
44
|
c = read_classif(dir)
|
46
45
|
max_depth = c.values.map{|i| i.count}.max
|
47
|
-
|
48
|
-
|
46
|
+
File.open("#{out}.tsv", 'w') do |fh|
|
47
|
+
c.each { |k,v| fh.puts ([k] + v + ["0"]*(max_depth-v.count)).join("\t") }
|
49
48
|
end
|
50
|
-
|
49
|
+
File.open("#{out}.nwk", 'w') { |fh| fh.puts print_tree(c) + ";" }
|
data/utils/subclades.R
CHANGED
@@ -13,24 +13,82 @@ suppressPackageStartupMessages(library(parallel))
|
|
13
13
|
suppressPackageStartupMessages(library(enveomics.R))
|
14
14
|
|
15
15
|
#= Main function
|
16
|
-
subclades <- function(ani_file, out_base, thr=1, ani=
|
16
|
+
subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
|
17
17
|
say("==> Out base:", out_base, "<==")
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
18
|
+
|
19
|
+
# Normalize input matrix
|
20
|
+
dist_rdata = paste(out_base, "dist.rdata", sep=".")
|
21
|
+
if(!missing(ani_file)){
|
22
|
+
if(length(ani.d)==0 && !file.exists(dist_rdata)){
|
23
|
+
# Read from ani_file
|
24
|
+
a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
|
25
|
+
if(nrow(a)==0){
|
26
|
+
generate_empty_files(out_base)
|
27
|
+
return(NULL)
|
28
|
+
}
|
29
|
+
say("Distances")
|
30
|
+
a$d <- 1 - (a$value/100)
|
31
|
+
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
|
32
|
+
save(ani.d, file=dist_rdata)
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
# Read result if the subclade is ready, run it otherwise
|
37
|
+
if(file.exists(paste(out_base,"classif",sep="."))){
|
38
|
+
say("Loading")
|
39
|
+
ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
|
40
|
+
sep=' ', as.is=TRUE)[,1]
|
41
|
+
a <- read.table(paste(out_base,"classif",sep="."), sep="\t", as.is=TRUE)
|
42
|
+
ani.types <- a[,2]
|
43
|
+
names(ani.types) <- a[,1]
|
44
|
+
if(length(ani.d)==0) load(dist_rdata)
|
22
45
|
}else{
|
23
|
-
|
46
|
+
res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
|
47
|
+
if(length(res)==0) return(NULL)
|
48
|
+
ani.medoids <- res[['ani.medoids']]
|
49
|
+
ani.types <- res[['ani.types']]
|
50
|
+
ani.d <- res[['ani.d']]
|
24
51
|
}
|
25
|
-
|
26
|
-
|
27
|
-
|
52
|
+
|
53
|
+
# Recursive search
|
54
|
+
say("Recursive search")
|
55
|
+
for(i in 1:length(ani.medoids)){
|
56
|
+
medoid <- ani.medoids[i]
|
57
|
+
ds_f <- names(ani.types)[ ani.types==i ]
|
58
|
+
say("Analyzing subclade", i, "with medoid:", medoid)
|
59
|
+
dir_f <- paste(out_base, ".sc-", i, sep="")
|
60
|
+
if(!dir.exists(dir_f)) dir.create(dir_f)
|
61
|
+
write.table(ds_f,
|
62
|
+
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
63
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
64
|
+
if(length(ds_f) > 8L){
|
65
|
+
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
66
|
+
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
67
|
+
thr=thr, ani.d=ani_subset)
|
68
|
+
}
|
28
69
|
}
|
29
70
|
|
71
|
+
# Declare recursion up-to-here complete
|
72
|
+
write.table(date(), paste(out_base, 'ready', sep='.'),
|
73
|
+
quote=FALSE, row.names=FALSE, col.names=FALSE)
|
74
|
+
}
|
75
|
+
|
76
|
+
#= Heavy-lifter
|
77
|
+
subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
30
78
|
# Get ANI distances
|
31
|
-
|
32
|
-
|
33
|
-
|
79
|
+
if(length(ani.d) > 0){
|
80
|
+
# Just use ani.d (and save in dist_rdata_
|
81
|
+
save(ani.d, file=dist_rdata)
|
82
|
+
}else if(file.exists(dist_rdata)){
|
83
|
+
# Read from dist_rdata
|
84
|
+
load(dist_rdata)
|
85
|
+
}else{
|
86
|
+
stop("Cannot find input matrix", out_base)
|
87
|
+
}
|
88
|
+
if(length(labels(ani.d)) <= 8L) return(list())
|
89
|
+
|
90
|
+
# Build tree
|
91
|
+
say("Tree")
|
34
92
|
ani.ph <- bionj(ani.d)
|
35
93
|
express.ori <- options('expressions')$expressions
|
36
94
|
if(express.ori < ani.ph$Nnode*4){
|
@@ -75,7 +133,6 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
75
133
|
say("Text report")
|
76
134
|
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
77
135
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
78
|
-
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
79
136
|
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
80
137
|
ani.d.m <- 100 - as.matrix(ani.d)*100
|
81
138
|
for(j in 1:nrow(classif)){
|
@@ -83,27 +140,18 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
83
140
|
}
|
84
141
|
write.table(classif, paste(out_base,"classif",sep="."),
|
85
142
|
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
86
|
-
|
87
|
-
#
|
88
|
-
say("
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
write.table(ds_f,
|
95
|
-
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
96
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
97
|
-
if(length(ds_f) > 5){
|
98
|
-
a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
|
99
|
-
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
100
|
-
thr=thr, ani=a_f)
|
101
|
-
}
|
102
|
-
}
|
143
|
+
|
144
|
+
# Return data
|
145
|
+
say("Cluster ready")
|
146
|
+
return(list(
|
147
|
+
ani.medoids=ani.medoids,
|
148
|
+
ani.types=ani.types,
|
149
|
+
ani.d=ani.d
|
150
|
+
))
|
103
151
|
}
|
104
152
|
|
105
153
|
#= Helper functions
|
106
|
-
say <- function(...) {
|
154
|
+
say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
|
107
155
|
|
108
156
|
generate_empty_files <- function(out_base) {
|
109
157
|
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
@@ -182,6 +230,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
|
182
230
|
}
|
183
231
|
|
184
232
|
#= Main
|
233
|
+
options(warn=1)
|
185
234
|
subclades(ani_file=argv[1], out_base=argv[2],
|
186
235
|
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
187
236
|
|
data/utils/subclades.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3.
|
4
|
+
version: 0.3.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- lib/miga/project/plugins.rb
|
162
162
|
- lib/miga/project/result.rb
|
163
163
|
- lib/miga/remote_dataset.rb
|
164
|
+
- lib/miga/remote_dataset/base.rb
|
164
165
|
- lib/miga/result.rb
|
165
166
|
- lib/miga/result/base.rb
|
166
167
|
- lib/miga/result/dates.rb
|
@@ -472,9 +473,14 @@ files:
|
|
472
473
|
- utils/plot-taxdist.R
|
473
474
|
- utils/ref-tree.R
|
474
475
|
- utils/requirements.txt
|
476
|
+
- utils/subclade/base.rb
|
477
|
+
- utils/subclade/pipeline.rb
|
478
|
+
- utils/subclade/runner.rb
|
479
|
+
- utils/subclade/temporal.rb
|
475
480
|
- utils/subclades-compile.rb
|
476
481
|
- utils/subclades-nj.R
|
477
482
|
- utils/subclades.R
|
483
|
+
- utils/subclades.rb
|
478
484
|
homepage: http://enve-omics.ce.gatech.edu/miga
|
479
485
|
licenses:
|
480
486
|
- Artistic-2.0
|
@@ -501,7 +507,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
501
507
|
version: '0'
|
502
508
|
requirements: []
|
503
509
|
rubyforge_project:
|
504
|
-
rubygems_version: 2.
|
510
|
+
rubygems_version: 2.5.2.3
|
505
511
|
signing_key:
|
506
512
|
specification_version: 4
|
507
513
|
summary: MiGA
|