miga-base 0.3.3.0 → 0.3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +7 -0
- data/actions/about.rb +1 -1
- data/actions/ls.rb +1 -1
- data/actions/summary.rb +4 -2
- data/lib/miga/remote_dataset/base.rb +59 -0
- data/lib/miga/remote_dataset.rb +4 -44
- data/lib/miga/version.rb +2 -2
- data/scripts/clade_finding.bash +2 -30
- data/scripts/subclades.bash +1 -9
- data/utils/distance/runner.rb +3 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/subclade/base.rb +6 -0
- data/utils/subclade/pipeline.rb +54 -0
- data/utils/subclade/runner.rb +51 -0
- data/utils/subclade/temporal.rb +14 -0
- data/utils/subclades-compile.rb +8 -9
- data/utils/subclades.R +80 -31
- data/utils/subclades.rb +9 -0
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b1b32b7800278dc330c5c8e01f4b94dfd1d97750
|
4
|
+
data.tar.gz: 2c3b6ef0e73568df8775fb98c65d454cdcf0f411
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 590a41c7bc94f5d36a53e0b9eb4f096211ccdae8724e63948480e0b57c8b7fa24a5779534868c7cb13405b3360f35be00d977728886ba7b7491ef5aeebb0bc0d
|
7
|
+
data.tar.gz: 64a273f14eea3aec6f9c8cfb388bdae4bdf2027d7ce95d89b1e8a27799e51420d2f58b42a58871037ee4a0e7f616f92e8ac485a12dbe226d7ab5cda99792f286
|
data/README.md
CHANGED
@@ -30,6 +30,13 @@ You have two options:
|
|
30
30
|
[installation instructions](manual/part2/installation.md). Once you have MiGA
|
31
31
|
installed, you can [deploy some examples](manual/part4.md).
|
32
32
|
|
33
|
+
# How to cite MiGA
|
34
|
+
|
35
|
+
> Rodriguez-R *et al*. 2018. The Microbial Genomes Atlas (MiGA) webserver:
|
36
|
+
> taxonomic and gene diversity analysis of Archaea and Bacteria at the whole
|
37
|
+
> genome level. *Nucleic Acids Research* 46(W1):W282-W288.
|
38
|
+
> [doi:10.1093/nar/gky467](https://doi.org/10.1093/nar/gky467).
|
39
|
+
|
33
40
|
# Authors
|
34
41
|
|
35
42
|
Developed and maintained by [Luis M. Rodriguez-R][lrr]. MiGA is the result of a
|
data/actions/about.rb
CHANGED
@@ -12,7 +12,7 @@ OptionParser.new do |opt|
|
|
12
12
|
opt.on("-m", "--metadata STRING",
|
13
13
|
"Print name and metadata field only."
|
14
14
|
){ |v| o[:datum]=v }
|
15
|
-
opt.on("--tab
|
15
|
+
opt.on("--tab",
|
16
16
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
17
17
|
opt_common(opt, o)
|
18
18
|
end.parse!
|
data/actions/ls.rb
CHANGED
@@ -15,7 +15,7 @@ OptionParser.new do |opt|
|
|
15
15
|
opt.on("-m", "--metadata STRING",
|
16
16
|
"Print name and metadata field only. If set, ignores -i."
|
17
17
|
){ |v| o[:datum]=v }
|
18
|
-
opt.on("--tab
|
18
|
+
opt.on("--tab",
|
19
19
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
20
20
|
opt.on("-s", "--silent",
|
21
21
|
"No output and exit with non-zero status if the dataset list is empty."
|
data/actions/summary.rb
CHANGED
@@ -6,8 +6,10 @@
|
|
6
6
|
o = {q:true, units:false, tabular:false}
|
7
7
|
opts = OptionParser.new do |opt|
|
8
8
|
opt_banner(opt)
|
9
|
-
opt_object(opt, o, [:project, :dataset_opt
|
10
|
-
opt
|
9
|
+
opt_object(opt, o, [:project, :dataset_opt])
|
10
|
+
opt_filter_datasets(opt, o)
|
11
|
+
opt_object(opt, o, [:result_dataset])
|
12
|
+
opt.on("--tab",
|
11
13
|
"Returns a tab-delimited table."){ |v| o[:tabular] = v }
|
12
14
|
opt.on("--key STRING",
|
13
15
|
"Returns only the value of the requested key."){ |v| o[:key] = v }
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
require 'restclient'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
class MiGA::RemoteDataset < MiGA::MiGA
|
6
|
+
|
7
|
+
# Class-level
|
8
|
+
class << self
|
9
|
+
def UNIVERSE ; @@UNIVERSE ; end
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
module MiGA::RemoteDataset::Base
|
15
|
+
|
16
|
+
@@_EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
17
|
+
|
18
|
+
##
|
19
|
+
# Structure of the different database Universes or containers. The structure
|
20
|
+
# is a Hash with universe names as keys as Symbol and values being a Hash with
|
21
|
+
# supported keys as Symbol:
|
22
|
+
# - +:dbs+ => Hash with keys being the database name and the values a Hash of
|
23
|
+
# properties such as +stage+, +format+, and +map_to+.
|
24
|
+
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
25
|
+
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
26
|
+
# - +method+ => Method used to query the URL. Only +:rest+ is currently
|
27
|
+
# supported.
|
28
|
+
# - +map_to_universe+ => Universe where results map to. Currently unsupported.
|
29
|
+
@@UNIVERSE = {
|
30
|
+
web:{
|
31
|
+
dbs: {
|
32
|
+
assembly:{stage: :assembly, format: :fasta},
|
33
|
+
assembly_gz:{stage: :assembly, format: :fasta_gz}
|
34
|
+
},
|
35
|
+
url: "%2$s",
|
36
|
+
method: :net
|
37
|
+
},
|
38
|
+
ebi:{
|
39
|
+
dbs: { embl:{stage: :assembly, format: :fasta} },
|
40
|
+
url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
41
|
+
method: :rest
|
42
|
+
},
|
43
|
+
ncbi:{
|
44
|
+
dbs: { nuccore:{stage: :assembly, format: :fasta} },
|
45
|
+
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
46
|
+
method: :rest
|
47
|
+
},
|
48
|
+
ncbi_map:{
|
49
|
+
dbs: { assembly:{map_to: :nuccore, format: :text} },
|
50
|
+
# FIXME ncbi_map is intended to do internal NCBI mapping between
|
51
|
+
# databases.
|
52
|
+
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
|
53
|
+
method: :rest,
|
54
|
+
map_to_universe: :ncbi
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
end
|
59
|
+
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -1,55 +1,15 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require "open-uri"
|
4
|
+
require 'miga/remote_dataset/base'
|
6
5
|
|
7
6
|
##
|
8
7
|
# MiGA representation of datasets with data in remote locations.
|
9
8
|
class MiGA::RemoteDataset < MiGA::MiGA
|
10
|
-
# Class-level
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
# is a Hash with universe names as keys as Symbol and values being a Hash with
|
16
|
-
# supported keys as Symbol:
|
17
|
-
# - +:dbs+ => Hash with keys being the database name and the values a Hash of
|
18
|
-
# properties such as +stage+, +format+, and +map_to+.
|
19
|
-
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
20
|
-
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
21
|
-
# - +method+ => Method used to query the URL. Only +:rest+ is currently
|
22
|
-
# supported.
|
23
|
-
# - +map_to_universe+ => Universe where results map to. Currently unsupported.
|
24
|
-
def self.UNIVERSE ; @@UNIVERSE ; end
|
25
|
-
@@UNIVERSE = {
|
26
|
-
web:{
|
27
|
-
dbs: {
|
28
|
-
assembly:{stage: :assembly, format: :fasta},
|
29
|
-
assembly_gz:{stage: :assembly, format: :fasta_gz}
|
30
|
-
},
|
31
|
-
url: "%2$s",
|
32
|
-
method: :net
|
33
|
-
},
|
34
|
-
ebi:{
|
35
|
-
dbs: { embl:{stage: :assembly, format: :fasta} },
|
36
|
-
url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
37
|
-
method: :rest
|
38
|
-
},
|
39
|
-
ncbi:{
|
40
|
-
dbs: { nuccore:{stage: :assembly, format: :fasta} },
|
41
|
-
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
42
|
-
method: :rest
|
43
|
-
},
|
44
|
-
ncbi_map:{
|
45
|
-
dbs: { assembly:{map_to: :nuccore, format: :text} },
|
46
|
-
# FIXME ncbi_map is intended to do internal NCBI mapping between
|
47
|
-
# databases.
|
48
|
-
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
|
49
|
-
method: :rest,
|
50
|
-
map_to_universe: :ncbi
|
51
|
-
}
|
52
|
-
}
|
10
|
+
include MiGA::RemoteDataset::Base
|
11
|
+
|
12
|
+
# Class-level
|
53
13
|
|
54
14
|
##
|
55
15
|
# Download data from the +universe+ in the database +db+ with IDs +ids+ and
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 3,
|
13
|
+
VERSION = [0.3, 3, 1]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -25,7 +25,7 @@ module MiGA
|
|
25
25
|
CITATION = "Rodriguez-R et al (2018). " +
|
26
26
|
"The Microbial Genomes Atlas (MiGA) webserver: taxonomic and gene " +
|
27
27
|
"diversity analysis of Archaea and Bacteria at the whole genome level. " +
|
28
|
-
"Nucleic Acids Research
|
28
|
+
"Nucleic Acids Research 46(W1):W282-W288. doi:10.1093/nar/gky467."
|
29
29
|
|
30
30
|
end
|
31
31
|
|
data/scripts/clade_finding.bash
CHANGED
@@ -11,36 +11,8 @@ cd "$PROJECT/data/10.clades/01.find"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
-
#
|
15
|
-
|
16
|
-
| awk -F"\\t" '$4>=90{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
|
17
|
-
> genome-genome.aai90.rbm
|
18
|
-
ogs.mcl.rb -d . -o miga-project.aai90-clades -t "$CORES" -i \
|
19
|
-
-f "(\\S+)-(\\S+)\\.aai90\\.rbm"
|
20
|
-
rm genome-genome.aai90.rbm
|
21
|
-
gunzip -c ../../09.distances/02.aai/miga-project.txt.gz | tail -n+2 \
|
22
|
-
| awk -F"\\t" '$4>=95{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
|
23
|
-
> genome-genome.ani95.rbm
|
24
|
-
ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
|
25
|
-
-f "(\\S+)-(\\S+)\\.ani95\\.rbm"
|
26
|
-
rm genome-genome.ani95.rbm
|
27
|
-
|
28
|
-
# Propose clade projects
|
29
|
-
tail -n +2 miga-project.ani95-clades | tr "," "\\t" | awk 'NF >= 5' \
|
30
|
-
> miga-project.proposed-clades
|
31
|
-
|
32
|
-
# Run R code (except in projects type clade)
|
33
|
-
if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
|
34
|
-
"$MIGA/utils/subclades.R" \
|
35
|
-
../../09.distances/02.aai/miga-project.txt.gz \
|
36
|
-
miga-project "$CORES"
|
37
|
-
mv miga-project.nwk miga-project.aai.nwk
|
38
|
-
|
39
|
-
# Compile
|
40
|
-
ruby "$MIGA/utils/subclades-compile.rb" . \
|
41
|
-
> miga-project.class.tsv \
|
42
|
-
2> miga-project.class.nwk
|
43
|
-
fi
|
14
|
+
# Run
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
44
16
|
|
45
17
|
# Finalize
|
46
18
|
miga date > "miga-project.done"
|
data/scripts/subclades.bash
CHANGED
@@ -12,15 +12,7 @@ cd "$PROJECT/data/10.clades/02.ani"
|
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
14
|
# Run R code
|
15
|
-
"$MIGA/utils/subclades.
|
16
|
-
../../09.distances/03.ani/miga-project.txt.gz \
|
17
|
-
miga-project "$CORES"
|
18
|
-
mv miga-project.nwk miga-project.ani.nwk
|
19
|
-
|
20
|
-
# Compile
|
21
|
-
ruby "$MIGA/utils/subclades-compile.rb" . \
|
22
|
-
> miga-project.class.tsv \
|
23
|
-
2> miga-project.class.nwk
|
15
|
+
ruby -I "$MIGA/lib" "$MIGA/utils/subclades.rb" "$PROJECT" "$SCRIPT"
|
24
16
|
|
25
17
|
# Finalize
|
26
18
|
miga date > "miga-project.done"
|
data/utils/distance/runner.rb
CHANGED
@@ -30,8 +30,10 @@ class MiGA::DistanceRunner
|
|
30
30
|
if opts[:run_taxonomy] && project.metadata[:ref_project]
|
31
31
|
@home = File.expand_path('05.taxonomy', @home)
|
32
32
|
@ref_project = MiGA::Project.load(project.metadata[:ref_project])
|
33
|
+
raise "Cannot load reference project: #{project.metadata[:ref_project]}" if @ref_project.nil?
|
34
|
+
else
|
35
|
+
@ref_project = project
|
33
36
|
end
|
34
|
-
@ref_project ||= project
|
35
37
|
[:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |m|
|
36
38
|
@opts[m] ||= ref_project.metadata[m]
|
37
39
|
end
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.N50.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.filterN.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.length.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../Scripts/FastA.split.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
../../enveomics.R
|
@@ -0,0 +1,54 @@
|
|
1
|
+
|
2
|
+
# High-end pipelines for SubcladeRunner
|
3
|
+
module MiGA::SubcladeRunner::Pipeline
|
4
|
+
|
5
|
+
# Run species-level clusterings using ANI >= 95% / AAI >= 90%
|
6
|
+
def cluster_species
|
7
|
+
tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
|
8
|
+
tasks.each do |k, par|
|
9
|
+
# Final output
|
10
|
+
ogs_file = "miga-project.#{k}-clades"
|
11
|
+
next if File.size? ogs_file
|
12
|
+
|
13
|
+
# Build ABC files
|
14
|
+
abc_path = tmp_file("#{k}.abc")
|
15
|
+
ofh = File.open(abc_path, 'w')
|
16
|
+
metric_res = project.result(par[0]) or raise "Incomplete step #{par[0]}"
|
17
|
+
Zlib::GzipReader.open(metric_res.file_path(:matrix)) do |ifh|
|
18
|
+
ifh.each_line do |ln|
|
19
|
+
next if ln =~ /^metric\t/
|
20
|
+
r = ln.chomp.split("\t")
|
21
|
+
ofh.puts "G>#{r[1]}\tG>#{r[2]}\t#{r[3]}" if r[3].to_f >= par[1]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
ofh.close
|
25
|
+
# Cluster genomes
|
26
|
+
`ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
|
+
end
|
28
|
+
# Propose clades
|
29
|
+
ofh = File.open('miga-project.proposed-clades', 'w')
|
30
|
+
File.open('miga-project.ani95-clades', 'r') do |ifh|
|
31
|
+
ifh.each_line do |ln|
|
32
|
+
next if $.==1
|
33
|
+
r = ln.chomp.split(',')
|
34
|
+
ofh.puts r.join("\t") if r.size >= 5
|
35
|
+
end
|
36
|
+
end
|
37
|
+
ofh.close
|
38
|
+
end
|
39
|
+
|
40
|
+
def subclades metric
|
41
|
+
src = File.expand_path('utils/subclades.R', MiGA::MiGA.root_path)
|
42
|
+
step = :"#{metric}_distances"
|
43
|
+
metric_res = project.result(step) or raise "Incomplete step #{step}"
|
44
|
+
matrix = metric_res.file_path(:matrix)
|
45
|
+
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'`
|
46
|
+
File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
|
47
|
+
File.exist? 'miga-project.nwk'
|
48
|
+
end
|
49
|
+
|
50
|
+
def compile
|
51
|
+
src = File.expand_path('utils/subclades-compile.rb', MiGA::MiGA.root_path)
|
52
|
+
`ruby '#{src}' '.' 'miga-project.class'`
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
|
2
|
+
require_relative 'base.rb'
|
3
|
+
require_relative 'temporal.rb'
|
4
|
+
require_relative 'pipeline.rb'
|
5
|
+
|
6
|
+
|
7
|
+
class MiGA::SubcladeRunner
|
8
|
+
|
9
|
+
include MiGA::SubcladeRunner::Temporal
|
10
|
+
include MiGA::SubcladeRunner::Pipeline
|
11
|
+
|
12
|
+
attr_reader :project, :step, :opts, :home, :tmp
|
13
|
+
|
14
|
+
def initialize(project_path, step, opts_hash={})
|
15
|
+
@opts = opts_hash
|
16
|
+
@project = MiGA::Project.load(project_path) or
|
17
|
+
raise "No project at #{project_path}"
|
18
|
+
@step = step.to_sym
|
19
|
+
clades_dir = File.expand_path('data/10.clades', project.path)
|
20
|
+
@home = File.expand_path(@step==:clade_finding ? '01.find' : '02.ani',
|
21
|
+
clades_dir)
|
22
|
+
@opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
|
23
|
+
end
|
24
|
+
|
25
|
+
# Launch the appropriate analysis
|
26
|
+
def go!
|
27
|
+
return if project.type == :metagenomes
|
28
|
+
Dir.chdir home
|
29
|
+
Dir.mktmpdir do |tmp_dir|
|
30
|
+
@tmp = tmp_dir
|
31
|
+
create_temporals
|
32
|
+
step==:clade_finding ? go_clade_finding! : go_subclades!
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Launch analysis for clade_finding
|
37
|
+
def go_clade_finding!
|
38
|
+
cluster_species
|
39
|
+
unless project.is_clade?
|
40
|
+
subclades :aai
|
41
|
+
compile
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# Launch analysis for subclades
|
46
|
+
def go_subclades!
|
47
|
+
subclades :ani
|
48
|
+
compile
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/utils/subclades-compile.rb
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# @author Luis M. Rodriguez-R
|
4
|
-
# @update Jan-15-2016
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
2
|
|
8
3
|
$:.push File.expand_path(File.dirname(__FILE__) + "/lib")
|
9
|
-
dir = ARGV.shift
|
4
|
+
dir = ARGV.shift
|
5
|
+
out = ARGV.shift or abort "Usage: #{$0} <classif.dir> <out.base>"
|
10
6
|
|
11
7
|
def read_classif(dir, classif={})
|
12
8
|
classif_file = File.expand_path("miga-project.classif", dir)
|
13
9
|
return classif unless File.exist? classif_file
|
10
|
+
ready = File.expand_path('miga-project.ready', dir)
|
11
|
+
File.size?(ready) or raise "Incomplete recursion found at #{dir}"
|
12
|
+
File.unlink ready
|
14
13
|
fh = File.open(classif_file, "r")
|
15
14
|
klass = []
|
16
15
|
while ln = fh.gets
|
@@ -44,7 +43,7 @@ end
|
|
44
43
|
|
45
44
|
c = read_classif(dir)
|
46
45
|
max_depth = c.values.map{|i| i.count}.max
|
47
|
-
|
48
|
-
|
46
|
+
File.open("#{out}.tsv", 'w') do |fh|
|
47
|
+
c.each { |k,v| fh.puts ([k] + v + ["0"]*(max_depth-v.count)).join("\t") }
|
49
48
|
end
|
50
|
-
|
49
|
+
File.open("#{out}.nwk", 'w') { |fh| fh.puts print_tree(c) + ";" }
|
data/utils/subclades.R
CHANGED
@@ -13,24 +13,82 @@ suppressPackageStartupMessages(library(parallel))
|
|
13
13
|
suppressPackageStartupMessages(library(enveomics.R))
|
14
14
|
|
15
15
|
#= Main function
|
16
|
-
subclades <- function(ani_file, out_base, thr=1, ani=
|
16
|
+
subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
|
17
17
|
say("==> Out base:", out_base, "<==")
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
18
|
+
|
19
|
+
# Normalize input matrix
|
20
|
+
dist_rdata = paste(out_base, "dist.rdata", sep=".")
|
21
|
+
if(!missing(ani_file)){
|
22
|
+
if(length(ani.d)==0 && !file.exists(dist_rdata)){
|
23
|
+
# Read from ani_file
|
24
|
+
a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
|
25
|
+
if(nrow(a)==0){
|
26
|
+
generate_empty_files(out_base)
|
27
|
+
return(NULL)
|
28
|
+
}
|
29
|
+
say("Distances")
|
30
|
+
a$d <- 1 - (a$value/100)
|
31
|
+
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
|
32
|
+
save(ani.d, file=dist_rdata)
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
# Read result if the subclade is ready, run it otherwise
|
37
|
+
if(file.exists(paste(out_base,"classif",sep="."))){
|
38
|
+
say("Loading")
|
39
|
+
ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
|
40
|
+
sep=' ', as.is=TRUE)[,1]
|
41
|
+
a <- read.table(paste(out_base,"classif",sep="."), sep="\t", as.is=TRUE)
|
42
|
+
ani.types <- a[,2]
|
43
|
+
names(ani.types) <- a[,1]
|
44
|
+
if(length(ani.d)==0) load(dist_rdata)
|
22
45
|
}else{
|
23
|
-
|
46
|
+
res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
|
47
|
+
if(length(res)==0) return(NULL)
|
48
|
+
ani.medoids <- res[['ani.medoids']]
|
49
|
+
ani.types <- res[['ani.types']]
|
50
|
+
ani.d <- res[['ani.d']]
|
24
51
|
}
|
25
|
-
|
26
|
-
|
27
|
-
|
52
|
+
|
53
|
+
# Recursive search
|
54
|
+
say("Recursive search")
|
55
|
+
for(i in 1:length(ani.medoids)){
|
56
|
+
medoid <- ani.medoids[i]
|
57
|
+
ds_f <- names(ani.types)[ ani.types==i ]
|
58
|
+
say("Analyzing subclade", i, "with medoid:", medoid)
|
59
|
+
dir_f <- paste(out_base, ".sc-", i, sep="")
|
60
|
+
if(!dir.exists(dir_f)) dir.create(dir_f)
|
61
|
+
write.table(ds_f,
|
62
|
+
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
63
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
64
|
+
if(length(ds_f) > 8L){
|
65
|
+
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
66
|
+
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
67
|
+
thr=thr, ani.d=ani_subset)
|
68
|
+
}
|
28
69
|
}
|
29
70
|
|
71
|
+
# Declare recursion up-to-here complete
|
72
|
+
write.table(date(), paste(out_base, 'ready', sep='.'),
|
73
|
+
quote=FALSE, row.names=FALSE, col.names=FALSE)
|
74
|
+
}
|
75
|
+
|
76
|
+
#= Heavy-lifter
|
77
|
+
subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
30
78
|
# Get ANI distances
|
31
|
-
|
32
|
-
|
33
|
-
|
79
|
+
if(length(ani.d) > 0){
|
80
|
+
# Just use ani.d (and save it in dist_rdata)
|
81
|
+
save(ani.d, file=dist_rdata)
|
82
|
+
}else if(file.exists(dist_rdata)){
|
83
|
+
# Read from dist_rdata
|
84
|
+
load(dist_rdata)
|
85
|
+
}else{
|
86
|
+
stop("Cannot find input matrix", out_base)
|
87
|
+
}
|
88
|
+
if(length(labels(ani.d)) <= 8L) return(list())
|
89
|
+
|
90
|
+
# Build tree
|
91
|
+
say("Tree")
|
34
92
|
ani.ph <- bionj(ani.d)
|
35
93
|
express.ori <- options('expressions')$expressions
|
36
94
|
if(express.ori < ani.ph$Nnode*4){
|
@@ -75,7 +133,6 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
75
133
|
say("Text report")
|
76
134
|
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
77
135
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
78
|
-
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
79
136
|
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
80
137
|
ani.d.m <- 100 - as.matrix(ani.d)*100
|
81
138
|
for(j in 1:nrow(classif)){
|
@@ -83,27 +140,18 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
83
140
|
}
|
84
141
|
write.table(classif, paste(out_base,"classif",sep="."),
|
85
142
|
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
86
|
-
|
87
|
-
#
|
88
|
-
say("
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
write.table(ds_f,
|
95
|
-
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
96
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
97
|
-
if(length(ds_f) > 5){
|
98
|
-
a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
|
99
|
-
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
100
|
-
thr=thr, ani=a_f)
|
101
|
-
}
|
102
|
-
}
|
143
|
+
|
144
|
+
# Return data
|
145
|
+
say("Cluster ready")
|
146
|
+
return(list(
|
147
|
+
ani.medoids=ani.medoids,
|
148
|
+
ani.types=ani.types,
|
149
|
+
ani.d=ani.d
|
150
|
+
))
|
103
151
|
}
|
104
152
|
|
105
153
|
#= Helper functions
|
106
|
-
say <- function(...) {
|
154
|
+
say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
|
107
155
|
|
108
156
|
generate_empty_files <- function(out_base) {
|
109
157
|
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
@@ -182,6 +230,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
|
182
230
|
}
|
183
231
|
|
184
232
|
#= Main
|
233
|
+
options(warn=1)
|
185
234
|
subclades(ani_file=argv[1], out_base=argv[2],
|
186
235
|
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
187
236
|
|
data/utils/subclades.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3.
|
4
|
+
version: 0.3.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- lib/miga/project/plugins.rb
|
162
162
|
- lib/miga/project/result.rb
|
163
163
|
- lib/miga/remote_dataset.rb
|
164
|
+
- lib/miga/remote_dataset/base.rb
|
164
165
|
- lib/miga/result.rb
|
165
166
|
- lib/miga/result/base.rb
|
166
167
|
- lib/miga/result/dates.rb
|
@@ -472,9 +473,14 @@ files:
|
|
472
473
|
- utils/plot-taxdist.R
|
473
474
|
- utils/ref-tree.R
|
474
475
|
- utils/requirements.txt
|
476
|
+
- utils/subclade/base.rb
|
477
|
+
- utils/subclade/pipeline.rb
|
478
|
+
- utils/subclade/runner.rb
|
479
|
+
- utils/subclade/temporal.rb
|
475
480
|
- utils/subclades-compile.rb
|
476
481
|
- utils/subclades-nj.R
|
477
482
|
- utils/subclades.R
|
483
|
+
- utils/subclades.rb
|
478
484
|
homepage: http://enve-omics.ce.gatech.edu/miga
|
479
485
|
licenses:
|
480
486
|
- Artistic-2.0
|
@@ -501,7 +507,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
501
507
|
version: '0'
|
502
508
|
requirements: []
|
503
509
|
rubyforge_project:
|
504
|
-
rubygems_version: 2.
|
510
|
+
rubygems_version: 2.5.2.3
|
505
511
|
signing_key:
|
506
512
|
specification_version: 4
|
507
513
|
summary: MiGA
|