miga-base 0.2.0.9 → 0.2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Rakefile +3 -0
  3. data/actions/add_result.rb +37 -0
  4. data/actions/add_taxonomy.rb +63 -0
  5. data/actions/create_dataset.rb +49 -0
  6. data/actions/create_project.rb +46 -0
  7. data/actions/daemon.rb +50 -0
  8. data/actions/date.rb +14 -0
  9. data/actions/{download_dataset → download_dataset.rb} +5 -28
  10. data/actions/find_datasets.rb +41 -0
  11. data/actions/import_datasets.rb +47 -0
  12. data/actions/index_taxonomy.rb +46 -0
  13. data/actions/list_datasets.rb +50 -0
  14. data/actions/list_files.rb +43 -0
  15. data/actions/project_info.rb +40 -0
  16. data/actions/unlink_dataset.rb +28 -0
  17. data/bin/miga +129 -33
  18. data/lib/miga/daemon.rb +48 -34
  19. data/lib/miga/dataset.rb +7 -123
  20. data/lib/miga/dataset_result.rb +177 -0
  21. data/lib/miga/project.rb +32 -12
  22. data/lib/miga/version.rb +2 -2
  23. data/scripts/_distances_functions.bash +82 -0
  24. data/scripts/_distances_noref_nomulti.bash +96 -67
  25. data/scripts/_distances_ref_nomulti.bash +54 -85
  26. data/scripts/assembly.bash +16 -3
  27. data/scripts/clade_finding.bash +20 -18
  28. data/scripts/distances.bash +2 -1
  29. data/scripts/init.bash +2 -6
  30. data/scripts/subclades.bash +4 -5
  31. data/test/common_test.rb +2 -2
  32. data/test/daemon_test.rb +73 -1
  33. data/test/project_test.rb +26 -2
  34. data/test/taxonomy_test.rb +10 -0
  35. data/test/test_helper.rb +1 -1
  36. data/utils/subclades-compile.rb +4 -2
  37. data/utils/subclades.R +140 -158
  38. metadata +48 -44
  39. data/actions/add_result +0 -58
  40. data/actions/add_taxonomy +0 -83
  41. data/actions/create_dataset +0 -61
  42. data/actions/create_project +0 -67
  43. data/actions/daemon +0 -66
  44. data/actions/find_datasets +0 -61
  45. data/actions/import_datasets +0 -83
  46. data/actions/index_taxonomy +0 -68
  47. data/actions/list_datasets +0 -81
  48. data/actions/list_files +0 -63
  49. data/actions/unlink_dataset +0 -49
@@ -11,33 +11,35 @@ date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"
11
11
 
12
12
  # Markov-cluster genomes by ANI
13
13
  gunzip -c ../../09.distances/03.ani/miga-project.txt.gz | tail -n+2 \
14
- | awk -F"\\t" '{print $2"'"\\t"'"$3"'"\\t"'"$4}' > genome-genome.aai90.rbm
14
+ | awk -F"\\t" '$4>=90{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
15
+ > genome-genome.aai90.rbm
15
16
  ogs.mcl.rb -d . -o miga-project.aai90-clades -t "$CORES" -i \
16
- -f "(\\S+)-(\\S+)\\.aai90\\.rbm"
17
+ -f "(\\S+)-(\\S+)\\.aai90\\.rbm"
17
18
  rm genome-genome.aai90.rbm
18
- cat genome-genome.aai90.rbm | awk -F"\\t" '$3>=95' > genome-genome.ani95.rbm
19
+ gunzip -c ../../09.distances/02.aai/miga-project.txt.gz | tail -n+2 \
20
+ | awk -F"\\t" '$4>=95{print $2"'"\\t"'"$3"'"\\t"'"$4}' \
21
+ > genome-genome.ani95.rbm
19
22
  ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
20
- -f "(\\S+)-(\\S+)\\.ani95\\.rbm"
23
+ -f "(\\S+)-(\\S+)\\.ani95\\.rbm"
21
24
  rm genome-genome.ani95.rbm
22
25
 
23
26
  # Propose clade projects
24
27
  cat miga-project.ani95-clades | tail -n +2 | tr "," "\\t" | awk 'NF >= 5' \
25
- > miga-project.proposed-clades
28
+ > miga-project.proposed-clades
26
29
 
27
- # Run R code
28
- echo "
29
- source('$MIGA/utils/subclades.R');
30
- subclades('../../09.distances/02.aai/miga-project.txt.gz',
31
- 'miga-project', $CORES);
32
- " | R --vanilla
33
- mv miga-project.ani.nwk miga-project.aai.nwk
34
-
35
- # Compile
36
- ruby "$MIGA/utils/subclades-compile.rb" . \
37
- > miga-project.class.tsv \
38
- 2> miga-project.class.nwk
30
+ # Run R code (except in projects type clade)
31
+ if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
32
+ $MIGA/utils/subclades.R \
33
+ ../../09.distances/02.aai/miga-project.txt.gz \
34
+ miga-project $CORES
35
+ mv miga-project.nwk miga-project.aai.nwk
36
+
37
+ # Compile
38
+ ruby "$MIGA/utils/subclades-compile.rb" . \
39
+ > miga-project.class.tsv \
40
+ 2> miga-project.class.nwk
41
+ fi
39
42
 
40
43
  # Finalize
41
44
  date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
42
45
  miga add_result -P "$PROJECT" -r clade_finding
43
-
@@ -17,7 +17,8 @@ NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \
17
17
  REF=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --ref \
18
18
  | wc -l | awk '{print $1}')
19
19
 
20
- # Call submodule
20
+ # Call submodules
21
+ source "$MIGA/scripts/_distances_functions.bash"
21
22
  if [[ "$NOMULTI" -eq "1" && "$REF" -eq "1" ]] ; then
22
23
  source "$MIGA/scripts/_distances_ref_nomulti.bash"
23
24
  elif [[ "$NOMULTI" -eq "1" ]] ; then
data/scripts/init.bash CHANGED
@@ -106,11 +106,7 @@ echo "export PATH=\$MIGA_PATH\$PATH" >> "$HOME/.miga_rc"
106
106
  # Check for R packages
107
107
  echo "
108
108
  Looking for R packages:" >&2
109
- if ! check_rlib enveomics.R ; then
110
- echo "+ Installing enveomics.R" >&2
111
- R CMD INSTALL $(dirname "$(which "FastQ.tag.rb")")/../enveomics.R
112
- fi
113
- RLIBS="ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
109
+ RLIBS="enveomics.R ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
114
110
  for lib in $RLIBS ; do
115
111
  if ! check_rlib $lib ; then
116
112
  echo "+ Installing $lib" >&2
@@ -122,7 +118,7 @@ done
122
118
  # Check for ruby gems
123
119
  echo "
124
120
  Looking for Ruby gems:" >&2
125
- GEMS="rest_client sqlite3 daemons json"
121
+ GEMS="rest-client sqlite3 daemons json"
126
122
  for gem in $GEMS ; do
127
123
  if ! check_gem $gem ; then
128
124
  echo "+ Installing $gem (user-only)" >&2
@@ -10,11 +10,10 @@ cd "$PROJECT/data/10.clades/02.ani"
10
10
  date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"
11
11
 
12
12
  # Run R code
13
- echo "
14
- source('$MIGA/utils/subclades.R');
15
- subclades('../../09.distances/03.ani/miga-project.txt.gz',
16
- 'miga-project', $CORES);
17
- " | R --vanilla
13
+ $MIGA/utils/subclades.R \
14
+ ../../09.distances/03.ani/miga-project.txt.gz \
15
+ miga-project $CORES
16
+ mv miga-project.nwk miga-project.ani.nwk
18
17
 
19
18
  # Compile
20
19
  ruby "$MIGA/utils/subclades-compile.rb" . \
data/test/common_test.rb CHANGED
@@ -3,7 +3,7 @@ require "test_helper"
3
3
  class CommonTest < Test::Unit::TestCase
4
4
 
5
5
  def setup
6
- $jruby_tests = !ENV["JRUBY_TESTS"].nil?
6
+ #$jruby_tests = !ENV["JRUBY_TESTS"].nil?
7
7
  end
8
8
 
9
9
  def test_debug
@@ -28,7 +28,7 @@ class CommonTest < Test::Unit::TestCase
28
28
  assert_respond_to(MiGA::MiGA, :DEBUG)
29
29
  assert_respond_to(MiGA::MiGA, :DEBUG_ON)
30
30
  assert_respond_to(MiGA::MiGA, :DEBUG_OFF)
31
- omit_if($jruby_tests, "JRuby doesn't like interceptions.")
31
+ #omit_if($jruby_tests, "JRuby doesn't like interceptions.")
32
32
  MiGA::MiGA.DEBUG_TRACE_ON
33
33
  err = capture_stderr do
34
34
  MiGA::MiGA.DEBUG "Dandadi"
data/test/daemon_test.rb CHANGED
@@ -4,11 +4,13 @@ require "miga/daemon"
4
4
  class DaemonTest < Test::Unit::TestCase
5
5
 
6
6
  def setup
7
+ $jruby_tests = !ENV["JRUBY_TESTS"].nil?
7
8
  $tmp = Dir.mktmpdir
8
9
  ENV["MIGA_HOME"] = $tmp
9
10
  FileUtils.touch("#{ENV["MIGA_HOME"]}/.miga_rc")
10
11
  File.open("#{ENV["MIGA_HOME"]}/.miga_daemon.json", "w") do |fh|
11
- fh.puts '{"maxjobs":1,"ppn":1,"latency":2}'
12
+ fh.puts '{"maxjobs":1,"ppn":1,"latency":2,"varsep":" ","var":"%s=%s",
13
+ "cmd":"%5$s","alive":"echo 1 # %s"}'
12
14
  end
13
15
  $p1 = MiGA::Project.new(File.expand_path("project1", $tmp))
14
16
  $d1 = MiGA::Daemon.new($p1)
@@ -18,7 +20,77 @@ class DaemonTest < Test::Unit::TestCase
18
20
  FileUtils.rm_rf $tmp
19
21
  ENV["MIGA_HOME"] = nil
20
22
  end
23
+
24
+ def test_check_project
21
25
 
26
+ end
27
+
28
+ def test_check_datasets
29
+ p = $p1
30
+ d = $d1
31
+ d.runopts(:maxjobs, 0, true)
32
+ assert(d.jobs_to_run.empty?)
33
+ ds = p.add_dataset("ds1")
34
+ d.check_datasets
35
+ assert(d.jobs_to_run.empty?)
36
+ FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
37
+ File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
38
+ FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
39
+ File.expand_path("data/01.raw_reads/ds1.done", p.path))
40
+ out = capture_stdout do
41
+ d.check_datasets
42
+ end
43
+ assert(out.string =~ /Queueing #{ds.name}:trimmed_reads/)
44
+ assert_equal(1, d.jobs_to_run.size)
45
+ assert_equal("project1:trimmed_reads:ds1", d.jobs_to_run.first[:cmd])
46
+ assert_equal(d.jobs_to_run.first, d.get_job(:trimmed_reads, ds))
47
+ end
48
+
49
+ def test_in_loop
50
+ p = $p1
51
+ d = $d1
52
+ d.runopts(:latency, 0, true)
53
+ assert_equal(-1, d.loop_i)
54
+ assert_nil(d.last_alive)
55
+ out = capture_stdout do
56
+ d.in_loop
57
+ end
58
+ assert_equal(DateTime, d.last_alive.class)
59
+ assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
60
+ 10.times{ d.in_loop }
61
+ assert_equal(11, d.loop_i)
62
+ out = capture_stdout do
63
+ d.in_loop
64
+ end
65
+ assert(out.string =~ /Housekeeping for sanity/)
66
+ assert_equal(0, d.loop_i)
67
+ end
68
+
69
+ def test_start
70
+ p = $p1
71
+ d = $d1
72
+ d.runopts(:latency, 0, true)
73
+ assert_equal(0, d.latency)
74
+ omit_if($jruby_tests, "JRuby doesn't implement fork.")
75
+ $child = fork { d.start }
76
+ sleep(3)
77
+ dpath = File.expand_path("daemon/MiGA:#{p.name}",p.path)
78
+ assert(File.exist?("#{dpath}.pid"))
79
+ out = capture_stdout { d.stop }
80
+ assert(out.string =~ /MiGA:#{p.name}: trying to stop process with pid \d+/)
81
+ assert(!File.exist?("#{dpath}.pid"))
82
+ assert(File.exist?("#{dpath}.output"))
83
+ File.open("#{dpath}.output", "r") do |fh|
84
+ l = fh.each_line.to_a
85
+ assert(l[0] =~ /-{20}\n/)
86
+ assert(l[1] =~ /MiGA:#{p.name} launched/)
87
+ assert(l[2] =~ /-{20}\n/)
88
+ assert(l[3] =~ /Housekeeping for sanity\n/)
89
+ end
90
+ ensure
91
+ Process.kill("KILL", $child) unless $child.nil?
92
+ end
93
+
22
94
  def test_last_alive
23
95
  p = MiGA::Project.new(File.expand_path("last_alive", $tmp))
24
96
  d = MiGA::Daemon.new(p)
data/test/project_test.rb CHANGED
@@ -22,7 +22,8 @@ class ProjectTest < Test::Unit::TestCase
22
22
  end
23
23
 
24
24
  def test_create
25
- assert_equal($tmp + "create", MiGA::Project.new($tmp + "create").path)
25
+ assert_equal("#{$tmp}/create", MiGA::Project.new("#{$tmp}/create").path)
26
+ assert(Dir.exist?("#{$tmp}/create"))
26
27
  assert_raise do
27
28
  ENV["MIGA_HOME"] = $tmp + "/chez-moi"
28
29
  MiGA::Project.new($tmp + "/cuckoo")
@@ -41,14 +42,37 @@ class ProjectTest < Test::Unit::TestCase
41
42
  end
42
43
 
43
44
  def test_datasets
44
- p = MiGA::Project.new(File.expand_path("datasets", $tmp))
45
+ p = $p1
45
46
  d = p.add_dataset("d1")
46
47
  assert_equal(MiGA::Dataset, d.class)
47
48
  assert_equal([d], p.datasets)
49
+ assert_equal(["d1"], p.dataset_names)
48
50
  p.each_dataset{ |ds| assert_equal(d, ds) }
49
51
  dr = p.unlink_dataset("d1")
50
52
  assert_equal(d, dr)
51
53
  assert_equal([], p.datasets)
54
+ assert_equal([], p.dataset_names)
55
+ end
56
+
57
+ def test_import_dataset
58
+ p1 = $p1
59
+ d1 = p1.add_dataset("d1")
60
+ File.open("#{p1.path}/data/01.raw_reads/#{d1.name}.1.fastq",
61
+ "w") { |f| f.puts ":-)" }
62
+ File.open("#{p1.path}/data/01.raw_reads/#{d1.name}.done",
63
+ "w") { |f| f.puts ":-)" }
64
+ d1.next_preprocessing(true)
65
+ p2 = MiGA::Project.new(File.expand_path("import_dataset", $tmp))
66
+ assert(p2.datasets.empty?)
67
+ assert_nil(p2.dataset("d1"))
68
+ p2.import_dataset(d1)
69
+ assert_equal(1, p2.datasets.size)
70
+ assert_equal(MiGA::Dataset, p2.dataset("d1").class)
71
+ assert_equal(1, p2.dataset("d1").results.size)
72
+ assert(File.exist?(
73
+ File.expand_path("data/01.raw_reads/#{d1.name}.1.fastq", p2.path)))
74
+ assert(File.exist?(
75
+ File.expand_path("metadata/#{d1.name}.json", p2.path)))
52
76
  end
53
77
 
54
78
  end
@@ -33,4 +33,14 @@ class TaxonomyTest < Test::Unit::TestCase
33
33
  assert(tx.is_in? MiGA::Taxonomy.new("species:v3_0"))
34
34
  end
35
35
 
36
+ def test_init_methods
37
+ tx = MiGA::Taxonomy.new({:k=>"Mascot", :c=>"Cereal", :s=>"Melvin"})
38
+ assert_equal("k:Mascot c:Cereal s:Melvin", tx.to_s)
39
+ tx = MiGA::Taxonomy.new("Mascot College Buzz", "k c s")
40
+ assert_equal("k:Mascot c:College s:Buzz", tx.to_s)
41
+ assert_raise do
42
+ tx = MiGA::Taxonomy.new("Mascot State Georgia Peach", "k c s")
43
+ end
44
+ end
45
+
36
46
  end
data/test/test_helper.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require "codeclimate-test-reporter"
2
- CodeClimate::TestReporter.start
2
+ CodeClimate::TestReporter.start unless ENV["REMOTE_TESTS"].nil?
3
3
 
4
4
  require "rubygems"
5
5
  require "test/unit"
@@ -9,7 +9,9 @@ $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
9
9
  dir = ARGV.shift or abort "Usage: #{$0} <classif.dir>"
10
10
 
11
11
  def read_classif(dir, classif={})
12
- fh = File.open(File.expand_path("miga-project.1.classif", dir), "r")
12
+ classif_file = File.expand_path("miga-project.classif", dir)
13
+ return classif unless File.exist? classif_file
14
+ fh = File.open(classif_file, "r")
13
15
  klass = []
14
16
  while ln = fh.gets
15
17
  r = ln.chomp.split("\t")
@@ -19,7 +21,7 @@ def read_classif(dir, classif={})
19
21
  end
20
22
  fh.close
21
23
  klass.each do |i|
22
- d = File.expand_path("miga-project.1.sc-#{i}", dir)
24
+ d = File.expand_path("miga-project.sc-#{i}", dir)
23
25
  classif = read_classif(d, classif) if Dir.exist? d
24
26
  end
25
27
  classif
data/utils/subclades.R CHANGED
@@ -1,171 +1,153 @@
1
- library(enveomics.R)
2
- library(ape)
3
- library(ggdendro)
4
- library(ggplot2)
5
- library(grid)
6
- library(gridExtra)
7
- library(cluster)
8
- library(dendextend)
9
- library(vegan)
10
- library(scatterplot3d)
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+ #
11
6
 
12
- # Main function
13
- subclades <- function(ani_file, out_base, thr=1, ani=c()){
14
- # Get ANI distances
15
- cat("====", out_base, "\n")
16
- if(missing(ani_file)){
17
- a <- as.data.frame(ani)
18
- } else {
19
- a <- read.table(gzfile(ani_file), sep='\t', h=TRUE, as.is=T)
20
- }
21
- if(nrow(a)==0){
22
- pdf(paste(out_base,'.pdf',sep=''), 7, 12)
23
- plot(1,t='n',axes=F)
24
- legend('center','No ANI data',bty='n')
25
- dev.off()
26
- file.create(paste(out_base,'.1.classif',sep=''))
27
- file.create(paste(out_base,'.1.medoids',sep=''))
28
- return(NULL)
29
- }
30
- ani.d <- enve.df2dist(cbind(a$a, a$b, 1-a$value/100), default.d=0.3)
31
- ani.hc <- hclust(ani.d, method='ward.D2')
32
- write.tree(as.phylo(ani.hc), 'miga-project.ani.nwk')
33
-
34
- # Silhouette
35
- k <- 2:(length(labels(ani.d))-1)
36
- s <- sapply(k, function(x) summary(silhouette(pam(ani.d, x)))$avg.width)
37
- ds <- 10^(s[-c(1,length(s))]-(s[-length(s)+c(0,1)]+s[-c(1,2)])/2)
38
- top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=T)],n=6)
7
+ #= Load stuff
8
+ argv <- commandArgs(trailingOnly=T)
9
+ suppressPackageStartupMessages(library(ape))
10
+ suppressPackageStartupMessages(library(vegan))
11
+ suppressPackageStartupMessages(library(cluster))
12
+ suppressPackageStartupMessages(library(parallel))
13
+ suppressPackageStartupMessages(library(enveomics.R))
39
14
 
40
- # Save "ANI-types"
41
- ani.types <- c()
42
- ani.medoids <- list()
43
- for(i in 1:length(top.n)){
44
- k_i <- top.n[i]
45
- ani.cl <- pam(ani.d, k_i)
46
- ani.types <- cbind(ani.types, ani.cl$clustering)
47
- ani.medoids[[ i ]] <- ani.cl$medoids
48
- }
15
+ #= Main function
16
+ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
17
+ say("==> Out base:", out_base, "<==")
18
+
19
+ # Input arguments
20
+ if(missing(ani_file)){
21
+ a <- as.data.frame(ani)
22
+ }else{
23
+ a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
24
+ }
25
+ if(nrow(a)==0){
26
+ generate_empty_files(out_base)
27
+ return(NULL)
28
+ }
29
+
30
+ # Get ANI distances
31
+ say("Distances")
32
+ a$d <- 1-a$value/100
33
+ ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.1)
34
+ ani.hc <- hclust(ani.d, method="ward.D2")
35
+ ani.ph <- as.phylo(ani.hc)
36
+ write.tree(as.phylo(ani.hc), paste(out_base, ".nwk", sep=""))
37
+
38
+ # Silhouette
39
+ say("Silhouette")
40
+ k <- 2:min(length(labels(ani.d))-1, 100)
41
+ cl <- makeCluster(thr)
42
+ s <- parSapply(cl, k, function(x) {
43
+ library(cluster)
44
+ pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo$avg.width
45
+ })
46
+ stopCluster(cl)
47
+ ds <- (s[-c(1,length(s))]-pmax(s[-length(s)+c(0,1)],s[-c(1,2)]))
48
+ top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=T)], n=1)
49
+
50
+ # Classify genomes
51
+ say("Classify")
52
+ ani.cl <- pam(ani.d, top.n)
53
+ ani.types <- ani.cl$clustering
54
+ ani.medoids <- ani.cl$medoids
55
+
56
+ # Generate graphic report
57
+ say("Graphic report")
58
+ pdf(paste(out_base, ".pdf", sep=""), 7, 12)
59
+ layout(1:4)
60
+ plot_distances(ani.d)
61
+ plot_silhouette(k, s, ds, top.n)
62
+ plot_clustering(ani.cl, ani.d, ani.types)
63
+ plot_tree(ani.ph, ani.types, ani.medoids)
64
+ dev.off()
49
65
 
50
- # Generate graphic reports
51
- pdf(paste(out_base,'.pdf',sep=''), 7, 12)
52
- plotClusterAndMetadata(as.dendrogram(ani.hc), ani.types, main='ANI types')
53
- ani.mds <- metaMDS(ani.d, k=3, autotransform=FALSE, parallel=thr, wascores=F)
54
- layout(matrix(1:6, ncol=2))
55
- for(i in 1:length(top.n)){
56
- s <- scatterplot3d(ani.mds$points, pch=19, type='h',
57
- color=ggplotColours(top.n[i], alpha=1/2)[ani.types[,i]],
58
- cex.symbols=1/2, box=FALSE, lty.hplot=3,
59
- main=paste('NMDS of ANI distances with', top.n[i] ,'clusters'),
60
- angle=80, scale.y=3/2, las=2, xlab='', ylab='', zlab='')
61
- for(cl in 1:top.n[i]){
62
- col <- ggplotColours(top.n[i])[cl]
63
- med <- s$xyz.convert(matrix(ani.mds$points[ ani.medoids[[i]][cl] , ],
64
- ncol=3))
65
- if(sum(ani.types[,i]==cl)>1){
66
- val <- s$xyz.convert(matrix(ani.mds$points[ ani.types[,i]==cl , ],
67
- ncol=3))
68
- arrows(x0=med$x, y0=med$y, x1=val$x, y1=val$y, length=0, col=col)
69
- }
70
- points(med, col=col, pch=19, cex=3/2)
71
- text(med, labels=cl, col='white', cex=2/3)
72
- }
73
- }
74
- dev.off()
66
+ # Save results
67
+ say("Text report")
68
+ write.table(ani.medoids, paste(out_base, "medoids", sep="."),
69
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
70
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
71
+ for(j in 1:nrow(classif)){
72
+ classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
73
+ }
74
+ write.table(classif, paste(out_base,"classif",sep="."),
75
+ quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
75
76
 
76
- # Save results
77
- for(i in 1:length(top.n)){
78
- write.table(ani.medoids[[i]], paste(out_base,i,'medoids',sep='.'),
79
- quote=FALSE, col.names=FALSE, row.names=FALSE)
80
- classif <- cbind(rownames(ani.types), ani.types[,i],
81
- ani.medoids[[i]][ ani.types[,i] ], NA)
82
- for(j in 1:nrow(classif))
83
- classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
84
- write.table(classif, paste(out_base,i,'classif',sep='.'),
85
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep='\t')
86
- }
77
+ # Recursive search
78
+ for(i in 1:top.n){
79
+ medoid <- ani.medoids[i]
80
+ ds_f <- names(ani.types)[ ani.types==i ]
81
+ say("Analyzing subclade", i, "with medoid:", medoid)
82
+ dir.create(paste(out_base, ".sc-", i, sep=""))
83
+ write.table(ds_f,
84
+ paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
85
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
86
+ if(length(ds_f) > 5){
87
+ a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
88
+ subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
89
+ thr=thr, ani=a_f)
90
+ }
91
+ }
92
+ }
93
+
94
+ #= Helper functions
95
+ say <- function(...) { cat("[", date(), "]", ..., "\n") }
96
+
97
+ generate_empty_files <- function(out_base) {
98
+ pdf(paste(out_base, ".pdf", sep=""), 7, 12)
99
+ plot(1, t="n", axes=F)
100
+ legend("center", "No data", bty="n")
101
+ dev.off()
102
+ file.create(paste(out_base,".1.classif",sep=""))
103
+ file.create(paste(out_base,".1.medoids",sep=""))
104
+ }
105
+
106
+ plot_silhouette <- function(k, s, ds, top.n) {
107
+ par(mar=c(4,5,1,5)+0.1)
108
+ plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
109
+ ylim=range(s), bty="n", xaxs="i", yaxt="n")
110
+ polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
111
+ axis(2, fg="grey60", col.axis="grey60")
112
+ mtext("Average silhouette", side=2, line=3, col="grey60")
113
+ par(new=TRUE)
114
+ plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
115
+ ylim=range(ds), bty="n", xaxs="i")
116
+ points(k[-c(1,length(k))], ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
117
+ axis(4, fg="darkred", col.axis="darkred")
118
+ mtext("Silhouette gain", side=4, line=3, col="darkred")
119
+ abline(v=top.n, lty=2)
120
+ }
121
+
122
+ plot_distances <- function(dist) {
123
+ par(mar=c(5,4,1,2)+0.1)
124
+ hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
125
+ }
87
126
 
88
- # Explore subclades
89
- for(i in 1:top.n[1]){
90
- medoid <- ani.medoids[[1]][i]
91
- ds_f <- rownames(ani.types)[ ani.types[,1]==i ]
92
- cat("Analyzing subclade", i, "with medoid:", medoid, "\n")
93
- cat(" ds_f: ", ds_f, "\n")
94
- if(length(ds_f) > 5){
95
- a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
96
- dir.create(paste(out_base,'.1.sc-',i,sep=''))
97
- write.table(ds_f,
98
- paste(out_base,'.1.sc-',i,'/miga-project.all',sep=''),
99
- quote=FALSE, col.names=FALSE, row.names=FALSE)
100
- cat(" looking for subclades within: ",
101
- out_base, ".1.sc-", i, "\n", sep="")
102
- subclades(
103
- out_base=paste(out_base,'.1.sc-',i,'/miga-project',sep=''),
104
- thr=thr, ani=a_f)
105
- }
106
- }
127
+ plot_clustering <- function(cl, dist, types) {
128
+ par(mar=c(5,4,4,2)+0.1)
129
+ top.n <- length(cl$medoids)
130
+ col <- ggplotColours(top.n)
131
+ plot(silhouette(cl), col=col)
132
+ clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
107
133
  }
108
134
 
109
- # Ancillary functions
110
- plotClusterAndMetadata <- function(c,m,addLabels=TRUE,main='',type='factor'){
111
- ps <- list()
112
- ps[[1]] <- rectGrob(gp=gpar(col="white"))
113
- if(length(type)==1) type <- rep(type, ncol(m))
114
- if(addLabels){
115
- m <- cbind(m, NA)
116
- m[labels(c),ncol(m)] <- labels(c)
117
- type[ncol(m)] <- 'label'
118
- }
119
- for(i in 1:ncol(m)){
120
- df <- data.frame(lab=factor(labels(c),levels=labels(c)),
121
- feat=m[labels(c),i])
122
- if(type[i]=='factor'){
123
- ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1, lab, fill=factor(feat))) +
124
- geom_tile() + geom_text(size=3/4, label=df$feat, x=.8) +
125
- scale_x_continuous(expand=c(0,0)) +
126
- theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
127
- plot.margin=unit(c(40,-12,20,-12),'points'),
128
- axis.ticks=element_blank(), axis.text=element_blank(),
129
- legend.position="none"))
130
- }else if(type[i]=='numeric'){
131
- ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1,lab,fill=as.numeric(feat))) +
132
- geom_tile() + geom_text(size=3/4, label=df$feat, x=.8) +
133
- scale_x_continuous(expand=c(0,0)) +
134
- theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
135
- plot.margin=unit(c(40,-12,20,-12),'points'),
136
- axis.ticks=element_blank(), axis.text=element_blank(),
137
- legend.position="none"))
138
- }else if(type[i]=='label'){
139
- ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1, lab)) +
140
- geom_tile(fill='white') + geom_text(size=3/4, label=df$feat, x=.8) +
141
- theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
142
- plot.margin=unit(c(40,-12,20,-12),'points'),
143
- axis.ticks=element_blank(), axis.text=element_blank(),
144
- legend.position="none"))
145
- }else{
146
- stop('Unsupported type: ', type[i])
147
- }
148
- }
149
- ps[[i+2]] <- ggplotGrob(ggplot(segment(dendro_data(c, type="rectangle"))) +
150
- geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
151
- scale_x_continuous(expand=c(0,.5)) +
152
- coord_flip() + theme_dendro() +
153
- theme(axis.title=element_blank(), axis.ticks=element_blank(),
154
- plot.margin=unit(c(40,20,20,ifelse(addLabels,-35,-30)),'points'),
155
- panel.margin=unit(0,'points'), axis.text=element_blank(),
156
- legend.position="none"))
157
- maxHeights = do.call(grid::unit.pmax, lapply(ps, function(x) x$heights[2:5]))
158
- for(g in ps) g$heights[2:5] <- as.list(maxHeights)
159
- ps$nrow <- 1
160
- ps$widths <- c(0.1,rep(.07,ncol(m)),1)
161
- ps$main <- main
162
- do.call(grid.arrange, ps)
163
- return(ps)
135
+ plot_tree <- function(phy, types, medoids){
136
+ layout(1)
137
+ top.n <- length(unique(types))
138
+ col <- ggplotColours(top.n)
139
+ is.medoid <- phy$tip.label %in% medoids
140
+ plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
141
+ font=ifelse(is.medoid, 2, 1),
142
+ tip.color=col[types[phy$tip.label]])
164
143
  }
165
144
 
166
145
  ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
167
- if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
168
- hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
146
+ if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
147
+ hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
169
148
  }
170
149
 
150
+ #= Main
151
+ subclades(ani_file=argv[1], out_base=argv[2],
152
+ thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
171
153