miga-base 1.1.2.2 → 1.1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4603bf75db8f82a1a30c1d6da1ab045139b13bf1eec5fd2d5be5c79fb2ac0442
4
- data.tar.gz: 24324b35a885453c01fada6fb07527616b3a952ab4ccb9b9c818c5d06e580acd
3
+ metadata.gz: 8d9d8a12b9eaa48b63df43b871f3cc7da598997f3d208cde9fcf31f2d605d66c
4
+ data.tar.gz: 72d58adbbbea43886e1e60a608f3f7e0da542c26b3b29ecf202a3b34b9f8ac35
5
5
  SHA512:
6
- metadata.gz: d9e555d736e3987d4afc45707307b7f95d2164cb8db822ad639e0f71e919c67972bff76aefdc7b0a1a1e7120e3100b863a8dde14470066ac7d4ab0b5be5c20f7
7
- data.tar.gz: 54d1823817cb2d4220d885e84e941cf5e4df88f850824cf3837ed716879aec4733f2d2df98bedc03aa29878a1e5bd7641b015c76738eed841d08046c9f50d1bb
6
+ metadata.gz: cc2c81a38d915c7bb13d53853e0be3c822a0ba7703d2e97cf16615ac4d8f8b4fc3203bc83b76584b48ddd7f13787545d5327d9dd5cd846bbc1be1db868b120a4
7
+ data.tar.gz: 8cf8bd65e94e6bb551143ebb4fbb4ca37a74e58f777011e3c2c5823f7c9b6a242de63ddda1c496d9c91df01f2471446989267828b69e0de7fd024d3e84d70afa
@@ -5,7 +5,7 @@
5
5
  module MiGA::Cli::Action::Init::DaemonHelper
6
6
  def configure_daemon
7
7
  cli.puts 'Default daemon configuration:'
8
- daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
8
+ daemon_f = File.expand_path('.miga_daemon.json', ENV['MIGA_HOME'])
9
9
  unless File.exist?(daemon_f) and cli.ask_user(
10
10
  'A template daemon already exists, do you want to preserve it?',
11
11
  'yes', %w(yes no)
@@ -4,7 +4,7 @@
4
4
  # Helper module with files configuration functions for MiGA::Cli::Action::Init
5
5
  module MiGA::Cli::Action::Init::FilesHelper
6
6
  def open_rc_file
7
- rc_path = File.expand_path('.miga_rc', ENV['HOME'])
7
+ rc_path = File.expand_path('.miga_rc', ENV['MIGA_HOME'])
8
8
  if File.exist? rc_path
9
9
  if cli.ask_user(
10
10
  'I found a previous configuration. Do you want to continue?',
@@ -55,10 +55,11 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .txt])
58
+ return nil unless result_files_exist?(base, %w[.rds .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
- r.add_file(:rdata, 'miga-project.Rdata')
61
+ r.add_file(:rds, 'miga-project.rds')
62
+ r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
62
63
  r.add_file(:matrix, 'miga-project.txt')
63
64
  r.add_file(:log, 'miga-project.log') # Legacy file
64
65
  r.add_file(:hist, 'miga-project.hist')
@@ -82,12 +83,13 @@ module MiGA::Project::Result
82
83
  end
83
84
 
84
85
  r = add_result_iter_clades(base)
85
- r.add_file(:aai_tree, 'miga-project.aai.nwk')
86
- r.add_file(:proposal, 'miga-project.proposed-clades')
87
- r.add_file(:clades_aai90, 'miga-project.aai90-clades')
88
- r.add_file(:clades_ani95, 'miga-project.ani95-clades')
89
- r.add_file(:clades_gsp, 'miga-project.gsp-clades')
90
- r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
86
+ r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
87
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
88
+ r.add_file(:proposal, 'miga-project.proposed-clades')
89
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
90
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
91
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
92
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
91
93
  r
92
94
  end
93
95
 
@@ -105,6 +107,7 @@ module MiGA::Project::Result
105
107
 
106
108
  r = add_result_iter_clades(base)
107
109
  r.add_file(:ani_tree, 'miga-project.ani.nwk')
110
+ r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
108
111
  r
109
112
  end
110
113
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.1, 2, 2].freeze
15
+ VERSION = [1.1, 3, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2021, 11, 9)
23
+ VERSION_DATE = Date.new(2021, 11, 24)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(aai, file = 'miga-project.Rdata')
28
- if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
- h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(aai, file = "miga-project.rds")
28
+ if(sum(aai[, "a"] != aai[, "b"]) > 0) {
29
+ h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(ani, file = 'miga-project.Rdata')
28
- if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
- h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(ani, file = "miga-project.rds")
28
+ if(sum(ani[, "a"] != ani[, "b"]) > 0) {
29
+ h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
15
15
  # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
17
  echo -n "" > miga-project.txt
18
- echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
18
+ echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
19
19
 
20
20
  # Finalize
21
21
  miga_end_project_step "$DIR"
data/test/project_test.rb CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
82
82
  def test_add_result
83
83
  p1 = project
84
84
  assert_nil(p1.add_result(:doom))
85
- %w[.Rdata .log .txt .done].each do |x|
85
+ %w[.rds .log .txt .done].each do |x|
86
86
  assert_nil(p1.add_result(:haai_distances))
87
87
  FileUtils.touch(
88
88
  File.join(
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
117
117
  # Project tasks
118
118
  expected_files = {
119
119
  project_stats: %w[.taxonomy.json .metadata.db],
120
- haai_distances: %w[.Rdata .log .txt],
121
- aai_distances: %w[.Rdata .log .txt],
122
- ani_distances: %w[.Rdata .log .txt],
123
- clade_finding: %w[.pdf .classif .medoids
124
- .class.tsv .class.nwk .proposed-clades],
120
+ haai_distances: %w[.rds .log .txt],
121
+ aai_distances: %w[.rds .log .txt],
122
+ ani_distances: %w[.rds .log .txt],
123
+ clade_finding: %w[
124
+ .pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
125
+ ],
125
126
  subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
126
127
  ogs: %w[.ogs .stats]
127
128
  }
@@ -2702,6 +2702,8 @@ def merge_db_opts():
2702
2702
 
2703
2703
  parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
2704
2704
 
2705
+ parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')
2706
+
2705
2707
  parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
2706
2708
 
2707
2709
  parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
@@ -2720,16 +2722,23 @@ def merge_db_thread_starter(rev_index, per_db_accs):
2720
2722
 
2721
2723
 
2722
2724
 
2723
- def merge_db(recipient, donors, verbose, threads):
2725
+ def merge_db(recipient, donors, donor_file, verbose, threads):
2724
2726
  #Prettier on the CLI
2725
-
2727
+
2728
+ if donor_file is not None:
2729
+ fh = agnostic_reader(donor_file)
2730
+ donors = [line.strip() for line in fh]
2731
+ fh.close()
2732
+
2726
2733
  if donors is None or recipient is None:
2727
2734
  print("Either donor or target not given. FastAAI is exiting.")
2728
2735
  return None
2729
2736
 
2730
2737
  print("")
2731
2738
 
2732
- donors = donors.split(",")
2739
+ if donor_file is None:
2740
+ donors = donors.split(",")
2741
+
2733
2742
  valid_donors = []
2734
2743
  for d in donors:
2735
2744
  if os.path.exists(d):
@@ -3454,10 +3463,11 @@ def main():
3454
3463
 
3455
3464
  recipient = opts.recipient
3456
3465
  donors = opts.donors
3466
+ donor_file = opts.donor_file
3457
3467
  verbose = opts.verbose
3458
3468
  threads = opts.threads
3459
3469
 
3460
- merge_db(recipient, donors, verbose, threads)
3470
+ merge_db(recipient, donors, donor_file, verbose, threads)
3461
3471
 
3462
3472
  #################### Query files vs DB ########################
3463
3473
 
@@ -151,8 +151,11 @@ module MiGA::DistanceRunner::Commands
151
151
  donors << tgt_idx if tgt_idx
152
152
  end
153
153
  return nil if donors.empty?
154
+
155
+ # Build target database
156
+ File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
154
157
  run_cmd <<~CMD
155
- FastAAI merge_db --donors "#{donors.join(',')}" \
158
+ FastAAI merge_db --donor_file "#{f0}" \
156
159
  --recipient "#{f1 = tmp_file}" --threads #{opts[:thr]}
157
160
  CMD
158
161
 
@@ -166,7 +169,7 @@ module MiGA::DistanceRunner::Commands
166
169
  # Save values in the databases
167
170
  haai_data = {}
168
171
  aai_data = {}
169
- # Ugly workaround to the insistence of FastAAI to not provide the files
172
+ # Ugly workaround to the insistence of FastAAI not to provide the files
170
173
  # I ask for ;-)
171
174
  qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
172
175
  out_file = File.join(f2, 'results', qry_results)
@@ -127,6 +127,7 @@ module MiGA::DistanceRunner::Database
127
127
  db = tmp_dbs[metric]
128
128
  table = metric == :haai ? :aai : metric
129
129
  SQLite3::Database.new(db) do |conn|
130
+ conn.execute('BEGIN TRANSACTION')
130
131
  data.each do |k, v|
131
132
  sql = <<~SQL
132
133
  insert into #{table} (
@@ -135,6 +136,7 @@ module MiGA::DistanceRunner::Database
135
136
  SQL
136
137
  conn.execute(sql, [dataset.name, k] + v)
137
138
  end
139
+ conn.execute('COMMIT')
138
140
  end
139
141
  checkpoint(metric)
140
142
  end
data/utils/find-medoid.R CHANGED
@@ -5,26 +5,28 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly = T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
- if(Sys.getenv('MIGA') == ''){
10
+ if(Sys.getenv("MIGA") == ""){
11
11
  suppressPackageStartupMessages(library(enveomics.R))
12
12
  }else{
13
- source(file.path(Sys.getenv('MIGA'),
14
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
13
+ source(file.path(
14
+ Sys.getenv("MIGA"),
15
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
16
+ ))
15
17
  }
16
18
 
17
- find_medoids <- function(ani.df, out, clades) {
19
+ find_medoids <- function (ani.df, out, clades) {
18
20
  if(nrow(ani.df) == 0) return(NULL)
19
21
  ani.df$d <- 1 - (ani.df$value/100)
20
- dist <- enve.df2dist(ani.df, 'a', 'b', 'd', default.d = max(ani.df$d)*1.2)
22
+ dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
21
23
  dist <- as.matrix(dist)
22
- cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
24
+ cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
23
25
  cl.s <- c()
24
26
  medoids <- c()
25
27
  for(i in cl){
26
- lab <- strsplit(i, ',')[[1]]
27
- cat('Clade of:', lab[1], '\n')
28
+ lab <- strsplit(i, ",")[[1]]
29
+ cat("Clade of:", lab[1], "\n")
28
30
  if(length(lab) == 1) {
29
31
  lab.s <- lab
30
32
  } else {
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
32
34
  }
33
35
  med <- lab.s[1]
34
36
  medoids <- c(medoids, med)
35
- cl.s <- c(cl.s, paste(lab.s, collapse = ','))
37
+ cl.s <- c(cl.s, paste(lab.s, collapse = ","))
36
38
  }
37
39
  write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
38
- write.table(cl.s, paste(clades, '.sorted', sep = ''), quote = FALSE,
39
- row.names = FALSE, col.names = FALSE)
40
+ write.table(
41
+ cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
42
+ row.names = FALSE, col.names = FALSE
43
+ )
40
44
  }
41
45
 
42
46
  #= Main
43
- load(argv[1])
44
- if(! exists('ani')) ani <- aai
47
+ cat("Finding Medoids")
48
+ ani <- readRDS(argv[1])
45
49
  find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
46
50
 
@@ -44,7 +44,7 @@ module MiGA::SubcladeRunner::Pipeline
44
44
  # Find genomospecies medoids
45
45
  src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
46
46
  dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
47
- `Rscript '#{src}' ../../09.distances/#{dir}/miga-project.Rdata \
47
+ `Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
48
48
  miga-project.gsp-medoids miga-project.gsp-clades`
49
49
  if File.exist? 'miga-project.gsp-clades.sorted'
50
50
  File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
@@ -54,8 +54,6 @@ module MiGA::SubcladeRunner::Pipeline
54
54
  ofh = File.open('miga-project.proposed-clades', 'w')
55
55
  File.open('miga-project.gsp-clades', 'r') do |ifh|
56
56
  ifh.each_line do |ln|
57
- next if $. == 1
58
-
59
57
  r = ln.chomp.split(',')
60
58
  ofh.puts r.join("\t") if r.size >= 5
61
59
  end
data/utils/subclades.R CHANGED
@@ -10,56 +10,51 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- if(Sys.getenv('MIGA') == ''){
13
+ if(Sys.getenv("MIGA") == ""){
14
14
  suppressPackageStartupMessages(library(enveomics.R))
15
15
  }else{
16
- source(file.path(Sys.getenv('MIGA'),
17
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
16
+ source(file.path(
17
+ Sys.getenv("MIGA"),
18
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
19
+ ))
18
20
  }
19
21
 
20
22
  #= Main function
21
23
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
22
- say('==> Out base:', out_base, '<==')
24
+ say("==> Out base:", out_base, "<==")
23
25
 
24
26
  # Normalize input matrix
25
- dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
26
- if(!missing(ani_file)){
27
- if(length(ani.d) == 0 && !file.exists(dist_rdata)){
27
+ dist_rds <- paste(out_base, "dist.rds", sep = ".")
28
+ if (!missing(ani_file)) {
29
+ if(length(ani.d) == 0 && !file.exists(dist_rds)){
28
30
  # Read from ani_file
29
- a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
30
- if(nrow(a) == 0){
31
+ ani.d <- ani_distance(ani_file, sel)
32
+ if (is.null(ani.d)) {
31
33
  generate_empty_files(out_base)
32
34
  return(NULL)
35
+ } else {
36
+ saveRDS(ani.d, dist_rds)
33
37
  }
34
- if(!is.na(sel) && file.exists(sel)){
35
- say('Filter selection')
36
- lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
37
- a <- a[a$a %in% lab & a$b %in% lab, ]
38
- }
39
- say('Distances')
40
- a$d <- 1 - (a$value/100)
41
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
42
- save(ani.d, file = dist_rdata)
43
38
  }
44
39
  }
45
40
 
46
41
  # Read result if the subclade is ready, run it otherwise
47
- if(file.exists(paste(out_base, 'classif', sep = '.'))){
42
+ if (file.exists(paste(out_base, "classif", sep = "."))) {
48
43
  say("Loading")
49
44
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
50
- sep = ' ', as.is = TRUE)[,1]
45
+ sep = " ", as.is = TRUE)[,1]
51
46
  a <- read.table(paste(out_base, "classif", sep="."),
52
- sep = '\t', as.is = TRUE)
47
+ sep = "\t", as.is = TRUE)
53
48
  ani.types <- a[,2]
54
49
  names(ani.types) <- a[,1]
55
- if(length(ani.d) == 0) load(dist_rdata)
56
- }else if(length(labels(ani.d)) > 8L){
57
- res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
58
- if(length(res) == 0) return(NULL)
59
- ani.medoids <- res[['ani.medoids']]
60
- ani.types <- res[['ani.types']]
61
- ani.d <- res[['ani.d']]
62
- }else{
50
+ if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
51
+ } else if (length(labels(ani.d)) > 8L) {
52
+ res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
53
+ if (length(res) == 0) return(NULL)
54
+ ani.medoids <- res[["ani.medoids"]]
55
+ ani.types <- res[["ani.types"]]
56
+ ani.d <- res[["ani.d"]]
57
+ } else {
63
58
  ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
64
59
  ani.types <- rep(1, length(labels(ani.d)))
65
60
  names(ani.types) <- labels(ani.d)
@@ -69,66 +64,80 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
69
64
 
70
65
  # Recursive search
71
66
  say("Recursive search")
72
- for(i in 1:length(ani.medoids)){
67
+ for (i in 1:length(ani.medoids)) {
73
68
  medoid <- ani.medoids[i]
74
69
  ds_f <- names(ani.types)[ ani.types==i ]
75
70
  say("Analyzing subclade", i, "with medoid:", medoid)
76
71
  dir_f <- paste(out_base, ".sc-", i, sep="")
77
- if(!dir.exists(dir_f)) dir.create(dir_f)
72
+ if (!dir.exists(dir_f)) dir.create(dir_f)
78
73
  write.table(ds_f,
79
74
  paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
80
75
  quote=FALSE, col.names=FALSE, row.names=FALSE)
81
- if(length(ds_f) > 8L){
76
+ if (length(ds_f) > 8L) {
82
77
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
83
- subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
- thr=thr, ani.d=ani_subset)
78
+ subclades(
79
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
80
+ thr = thr,
81
+ ani.d = ani_subset
82
+ )
85
83
  }
86
84
  }
87
85
 
88
86
  # Declare recursion up-to-here complete
89
- write.table(date(), paste(out_base, 'ready', sep='.'),
90
- quote=FALSE, row.names=FALSE, col.names=FALSE)
87
+ write.table(
88
+ date(), paste(out_base, "ready", sep = "."),
89
+ quote = FALSE, row.names = FALSE, col.names = FALSE
90
+ )
91
91
  }
92
92
 
93
93
  #= Heavy-lifter
94
- subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
94
+ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
95
95
  # Get ANI distances
96
- if(length(ani.d) > 0){
97
- # Just use ani.d (and save in dist_rdata_
98
- save(ani.d, file=dist_rdata)
99
- }else if(file.exists(dist_rdata)){
100
- # Read from dist_rdata
101
- load(dist_rdata)
102
- }else{
96
+ if (length(ani.d) > 0) {
97
+ # Just use ani.d (and save in dist_rds)
98
+ if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
99
+ } else if (file.exists(dist_rds)) {
100
+ # Read from dist_rds
101
+ ani.d <- readRDS(dist_rds)
102
+ } else {
103
103
  stop("Cannot find input matrix", out_base)
104
104
  }
105
- if(length(labels(ani.d)) <= 8L) return(list())
105
+ if (length(labels(ani.d)) <= 8L) return(list())
106
106
 
107
107
  # Build tree
108
108
  say("Tree")
109
109
  ani.ph <- bionj(ani.d)
110
- express.ori <- options('expressions')$expressions
111
- if(express.ori < ani.ph$Nnode*4){
112
- options(expressions=min(c(5e7,ani.ph$Nnode*4)))
110
+ say("- Write")
111
+ express.ori <- options("expressions")$expressions
112
+ if(express.ori < ani.ph$Nnode * 4){
113
+ options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
113
114
  }
114
- write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
115
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
115
116
  options(expressions=express.ori)
116
117
 
117
118
  # Silhouette
118
119
  say("Silhouette")
119
120
  nn <- length(labels(ani.d))
120
121
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
122
+ say("- Make cluster")
121
123
  cl <- makeCluster(thr)
122
- s <- parSapply(cl, k, function(x) {
124
+ say("- Launch parallel jobs")
125
+ s <- parSapply(
126
+ cl, k,
127
+ function(x) {
123
128
  library(cluster)
124
- s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
125
- c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
126
- })
129
+ s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
130
+ c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
131
+ }
132
+ )
133
+ say("- Stop cluster")
127
134
  stopCluster(cl)
128
- s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
129
- s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
130
- ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
131
- if(mean(s[1,]<0)<0.75) ds[s[1,]<0] <- mean(ds) # <- k's with negative average
135
+ say("- Calculate custom criteria")
136
+ s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
137
+ s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
138
+ ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
139
+ if(mean(s[1,] < 0) < 0.75)
140
+ ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
132
141
  top.n <- k[which.max(ds)]
133
142
 
134
143
  # Classify genomes
@@ -139,8 +148,8 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
139
148
 
140
149
  # Generate graphic report
141
150
  say("Graphic report")
142
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
143
- layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
151
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
152
+ layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
144
153
  plot_distances(ani.d)
145
154
  plot_silhouette(k, s[1,], s[2,], ds, top.n)
146
155
  plot_clustering(ani.cl, ani.d, ani.types)
@@ -153,112 +162,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
153
162
  # Return data
154
163
  say("Cluster ready")
155
164
  return(list(
156
- ani.medoids=ani.medoids,
157
- ani.types=ani.types,
158
- ani.d=ani.d
165
+ ani.medoids = ani.medoids,
166
+ ani.types = ani.types,
167
+ ani.d = ani.d
159
168
  ))
160
169
  }
161
170
 
162
171
  #= Helper functions
163
- say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
172
+ say <- function (...) {
173
+ message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
174
+ }
164
175
 
165
- generate_empty_files <- function(out_base) {
166
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
167
- plot(1, t="n", axes=F)
168
- legend("center", "No data", bty="n")
176
+ generate_empty_files <- function (out_base) {
177
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
178
+ plot(1, t = "n", axes = F)
179
+ legend("center", "No data", bty = "n")
169
180
  dev.off()
170
- file.create(paste(out_base,".1.classif",sep=""))
171
- file.create(paste(out_base,".1.medoids",sep=""))
181
+ file.create(paste(out_base, ".1.classif", sep = ""))
182
+ file.create(paste(out_base, ".1.medoids", sep = ""))
172
183
  }
173
184
 
174
- write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
175
- say('Text report')
176
- write.table(ani.medoids, paste(out_base, 'medoids', sep='.'),
177
- quote=FALSE, col.names=FALSE, row.names=FALSE)
178
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
179
- ani.d.m <- 100 - as.matrix(ani.d)*100
180
- for(j in 1:nrow(classif)){
185
+ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
186
+ say("Text report")
187
+ write.table(
188
+ ani.medoids, paste(out_base, "medoids", sep = "."),
189
+ quote = FALSE, col.names = FALSE, row.names = FALSE
190
+ )
191
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
192
+ ani.d.m <- 100 - as.matrix(ani.d) * 100
193
+ for (j in 1:nrow(classif)) {
181
194
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
182
195
  }
183
- write.table(classif, paste(out_base,"classif",sep="."),
184
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
196
+ write.table(
197
+ classif, paste(out_base, "classif", sep="."),
198
+ quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
199
+ )
185
200
  }
186
201
 
187
- plot_silhouette <- function(k, s, ns, ds, top.n) {
202
+ plot_silhouette <- function (k, s, ns, ds, top.n) {
188
203
  # s
189
- par(mar=c(4,5,1,5)+0.1)
190
- plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
191
- ylim=range(s), bty="n", xaxs="i", yaxt="n")
192
- polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
193
- axis(2, fg="grey60", col.axis="grey60")
194
- mtext("Mean silhouette", side=2, line=3, col="grey60")
204
+ par(mar = c(4,5,1,5)+0.1)
205
+ plot(
206
+ 1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
207
+ ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
208
+ )
209
+ polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
210
+ axis(2, fg = "grey60", col.axis = "grey60")
211
+ mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
212
+
195
213
  # ns
196
- par(new=TRUE)
197
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
198
- ylim=range(ns), bty="n", xaxs="i")
199
- points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
200
- axis(4, fg="darkred", col.axis="darkred")
201
- mtext("Negative silhouette area", side=4, line=3, col="darkred")
214
+ par(new = TRUE)
215
+ plot(
216
+ 1, t = "n", bty = "n",
217
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
218
+ xlim = range(c(0,k)), ylim = range(ns)
219
+ )
220
+ points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
221
+ axis(4, fg = "darkred", col.axis = "darkred")
222
+ mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
223
+
202
224
  # ds
203
- par(new=TRUE)
204
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
205
- ylim=range(ds), bty="n", xaxs="i")
225
+ par(new = TRUE)
226
+ plot(
227
+ 1, t = "n", bty = "n",
228
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
229
+ xlim = range(c(0,k)), ylim = range(ds)
230
+ )
206
231
  lines(k, ds)
207
- abline(v=top.n, lty=2)
232
+ abline(v = top.n, lty = 2)
208
233
  }
209
234
 
210
- plot_distances <- function(dist) {
211
- par(mar=c(5,4,1,2)+0.1)
212
- hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
235
+ plot_distances <- function (dist) {
236
+ par(mar = c(5,4,1,2) + 0.1)
237
+ hist(
238
+ dist, border = NA, col = "grey60", breaks = 50,
239
+ xlab = "Distances", main = ""
240
+ )
213
241
  }
214
242
 
215
- plot_clustering <- function(cl, dist, types) {
216
- par(mar=c(5,4,4,2)+0.1)
243
+ plot_clustering <- function (cl, dist, types) {
244
+ par(mar = c(5,4,4,2) + 0.1)
217
245
  top.n <- length(cl$medoids)
218
246
  col <- ggplotColours(top.n)
219
- plot(silhouette(cl), col=col)
220
- if(length(labels(dist))<=15){
221
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
222
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
223
- }else{
224
- ani.mds <- cmdscale(dist, k=4)
225
- if(ncol(ani.mds)==4){
226
- plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
227
- xlab='Component 1', ylab='Component 2')
228
- plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
229
- xlab='Component 3', ylab='Component 4')
247
+ plot(silhouette(cl), col = col)
248
+ if (length(labels(dist)) <= 15) {
249
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
250
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
251
+ } else {
252
+ ani.mds <- cmdscale(dist, k = 4)
253
+ if (ncol(ani.mds) == 4) {
254
+ plot(
255
+ ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
256
+ xlab = "Component 1", ylab = "Component 2"
257
+ )
258
+ plot(
259
+ ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
260
+ xlab = "Component 3", ylab="Component 4"
261
+ )
230
262
  }else{
231
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
232
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
263
+ for (i in 1:2)
264
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
233
265
  }
234
266
  }
235
267
  }
236
268
 
237
- plot_tree <- function(phy, types, medoids){
269
+ plot_tree <- function (phy, types, medoids) {
238
270
  layout(1)
239
271
  top.n <- length(unique(types))
240
272
  col <- ggplotColours(top.n)
241
273
  is.medoid <- phy$tip.label %in% medoids
242
- phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
243
- " [", types[phy$tip.label[is.medoid]], "]", sep='')
244
- plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
245
- font=ifelse(is.medoid, 2, 1),
246
- tip.color=col[types[phy$tip.label]])
274
+ phy$tip.label[is.medoid] <- paste(
275
+ phy$tip.label[is.medoid],
276
+ " [", types[phy$tip.label[is.medoid]], "]",
277
+ sep = ""
278
+ )
279
+ plot(
280
+ phy, cex = ifelse(is.medoid, 1/3, 1/6),
281
+ font = ifelse(is.medoid, 2, 1),
282
+ tip.color = col[types[phy$tip.label]]
283
+ )
284
+ }
285
+
286
+ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
287
+ if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
288
+ hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
247
289
  }
248
290
 
249
- ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
250
- if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
251
- hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
291
+ ani_distance <- function (ani_file, sel) {
292
+ # Try to locate rds, otherwise read gzipped table
293
+ rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
294
+ if (file.exists(rds)) {
295
+ sim <- readRDS(rds)
296
+ } else {
297
+ sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
298
+ }
299
+
300
+ # If there is not data end process
301
+ if (nrow(sim) == 0) return(NULL)
302
+
303
+ # Apply filter (if requested)
304
+ if (!is.na(sel) && file.exists(sel)) {
305
+ say("Filter selection")
306
+ lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
307
+ sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
308
+ }
309
+
310
+ # Transform to distances
311
+ say("Distances")
312
+ sim$d <- 1 - (sim$value / 100)
313
+ return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
252
314
  }
253
315
 
254
316
  #= Main
255
317
  options(warn = 1)
256
- if(length(argv) >= 5 & argv[5] == 'empty'){
318
+ if (length(argv) >= 5 & argv[5] == "empty") {
257
319
  generate_empty_files(argv[2])
258
- write.table(NULL, paste(argv[2], "medoids", sep="."))
259
- write.table(NULL, paste(argv[2], "classif", sep="."))
260
- write.table(date(), paste(argv[2], "ready", sep="."))
320
+ write.table(NULL, paste(argv[2], "medoids", sep = "."))
321
+ write.table(NULL, paste(argv[2], "classif", sep = "."))
322
+ write.table(date(), paste(argv[2], "ready", sep = "."))
261
323
  }else{
262
- subclades(ani_file = argv[1], out_base = argv[2],
263
- thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
324
+ subclades(
325
+ ani_file = argv[1],
326
+ out_base = argv[2],
327
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
328
+ sel = argv[4]
329
+ )
264
330
  }
331
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2.2
4
+ version: 1.1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-09 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons