miga-base 1.1.2.2 → 1.1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4603bf75db8f82a1a30c1d6da1ab045139b13bf1eec5fd2d5be5c79fb2ac0442
4
- data.tar.gz: 24324b35a885453c01fada6fb07527616b3a952ab4ccb9b9c818c5d06e580acd
3
+ metadata.gz: 8d9d8a12b9eaa48b63df43b871f3cc7da598997f3d208cde9fcf31f2d605d66c
4
+ data.tar.gz: 72d58adbbbea43886e1e60a608f3f7e0da542c26b3b29ecf202a3b34b9f8ac35
5
5
  SHA512:
6
- metadata.gz: d9e555d736e3987d4afc45707307b7f95d2164cb8db822ad639e0f71e919c67972bff76aefdc7b0a1a1e7120e3100b863a8dde14470066ac7d4ab0b5be5c20f7
7
- data.tar.gz: 54d1823817cb2d4220d885e84e941cf5e4df88f850824cf3837ed716879aec4733f2d2df98bedc03aa29878a1e5bd7641b015c76738eed841d08046c9f50d1bb
6
+ metadata.gz: cc2c81a38d915c7bb13d53853e0be3c822a0ba7703d2e97cf16615ac4d8f8b4fc3203bc83b76584b48ddd7f13787545d5327d9dd5cd846bbc1be1db868b120a4
7
+ data.tar.gz: 8cf8bd65e94e6bb551143ebb4fbb4ca37a74e58f777011e3c2c5823f7c9b6a242de63ddda1c496d9c91df01f2471446989267828b69e0de7fd024d3e84d70afa
@@ -5,7 +5,7 @@
5
5
  module MiGA::Cli::Action::Init::DaemonHelper
6
6
  def configure_daemon
7
7
  cli.puts 'Default daemon configuration:'
8
- daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
8
+ daemon_f = File.expand_path('.miga_daemon.json', ENV['MIGA_HOME'])
9
9
  unless File.exist?(daemon_f) and cli.ask_user(
10
10
  'A template daemon already exists, do you want to preserve it?',
11
11
  'yes', %w(yes no)
@@ -4,7 +4,7 @@
4
4
  # Helper module with files configuration functions for MiGA::Cli::Action::Init
5
5
  module MiGA::Cli::Action::Init::FilesHelper
6
6
  def open_rc_file
7
- rc_path = File.expand_path('.miga_rc', ENV['HOME'])
7
+ rc_path = File.expand_path('.miga_rc', ENV['MIGA_HOME'])
8
8
  if File.exist? rc_path
9
9
  if cli.ask_user(
10
10
  'I found a previous configuration. Do you want to continue?',
@@ -55,10 +55,11 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .txt])
58
+ return nil unless result_files_exist?(base, %w[.rds .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
- r.add_file(:rdata, 'miga-project.Rdata')
61
+ r.add_file(:rds, 'miga-project.rds')
62
+ r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
62
63
  r.add_file(:matrix, 'miga-project.txt')
63
64
  r.add_file(:log, 'miga-project.log') # Legacy file
64
65
  r.add_file(:hist, 'miga-project.hist')
@@ -82,12 +83,13 @@ module MiGA::Project::Result
82
83
  end
83
84
 
84
85
  r = add_result_iter_clades(base)
85
- r.add_file(:aai_tree, 'miga-project.aai.nwk')
86
- r.add_file(:proposal, 'miga-project.proposed-clades')
87
- r.add_file(:clades_aai90, 'miga-project.aai90-clades')
88
- r.add_file(:clades_ani95, 'miga-project.ani95-clades')
89
- r.add_file(:clades_gsp, 'miga-project.gsp-clades')
90
- r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
86
+ r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
87
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
88
+ r.add_file(:proposal, 'miga-project.proposed-clades')
89
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
90
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
91
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
92
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
91
93
  r
92
94
  end
93
95
 
@@ -105,6 +107,7 @@ module MiGA::Project::Result
105
107
 
106
108
  r = add_result_iter_clades(base)
107
109
  r.add_file(:ani_tree, 'miga-project.ani.nwk')
110
+ r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
108
111
  r
109
112
  end
110
113
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.1, 2, 2].freeze
15
+ VERSION = [1.1, 3, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2021, 11, 9)
23
+ VERSION_DATE = Date.new(2021, 11, 24)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(aai, file = 'miga-project.Rdata')
28
- if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
- h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(aai, file = "miga-project.rds")
28
+ if(sum(aai[, "a"] != aai[, "b"]) > 0) {
29
+ h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(ani, file = 'miga-project.Rdata')
28
- if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
- h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(ani, file = "miga-project.rds")
28
+ if(sum(ani[, "a"] != ani[, "b"]) > 0) {
29
+ h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
15
15
  # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
17
  echo -n "" > miga-project.txt
18
- echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
18
+ echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
19
19
 
20
20
  # Finalize
21
21
  miga_end_project_step "$DIR"
data/test/project_test.rb CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
82
82
  def test_add_result
83
83
  p1 = project
84
84
  assert_nil(p1.add_result(:doom))
85
- %w[.Rdata .log .txt .done].each do |x|
85
+ %w[.rds .log .txt .done].each do |x|
86
86
  assert_nil(p1.add_result(:haai_distances))
87
87
  FileUtils.touch(
88
88
  File.join(
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
117
117
  # Project tasks
118
118
  expected_files = {
119
119
  project_stats: %w[.taxonomy.json .metadata.db],
120
- haai_distances: %w[.Rdata .log .txt],
121
- aai_distances: %w[.Rdata .log .txt],
122
- ani_distances: %w[.Rdata .log .txt],
123
- clade_finding: %w[.pdf .classif .medoids
124
- .class.tsv .class.nwk .proposed-clades],
120
+ haai_distances: %w[.rds .log .txt],
121
+ aai_distances: %w[.rds .log .txt],
122
+ ani_distances: %w[.rds .log .txt],
123
+ clade_finding: %w[
124
+ .pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
125
+ ],
125
126
  subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
126
127
  ogs: %w[.ogs .stats]
127
128
  }
@@ -2702,6 +2702,8 @@ def merge_db_opts():
2702
2702
 
2703
2703
  parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
2704
2704
 
2705
+ parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')
2706
+
2705
2707
  parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
2706
2708
 
2707
2709
  parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
@@ -2720,16 +2722,23 @@ def merge_db_thread_starter(rev_index, per_db_accs):
2720
2722
 
2721
2723
 
2722
2724
 
2723
- def merge_db(recipient, donors, verbose, threads):
2725
+ def merge_db(recipient, donors, donor_file, verbose, threads):
2724
2726
  #Prettier on the CLI
2725
-
2727
+
2728
+ if donor_file is not None:
2729
+ fh = agnostic_reader(donor_file)
2730
+ donors = [line.strip() for line in fh]
2731
+ fh.close()
2732
+
2726
2733
  if donors is None or recipient is None:
2727
2734
  print("Either donor or target not given. FastAAI is exiting.")
2728
2735
  return None
2729
2736
 
2730
2737
  print("")
2731
2738
 
2732
- donors = donors.split(",")
2739
+ if donor_file is None:
2740
+ donors = donors.split(",")
2741
+
2733
2742
  valid_donors = []
2734
2743
  for d in donors:
2735
2744
  if os.path.exists(d):
@@ -3454,10 +3463,11 @@ def main():
3454
3463
 
3455
3464
  recipient = opts.recipient
3456
3465
  donors = opts.donors
3466
+ donor_file = opts.donor_file
3457
3467
  verbose = opts.verbose
3458
3468
  threads = opts.threads
3459
3469
 
3460
- merge_db(recipient, donors, verbose, threads)
3470
+ merge_db(recipient, donors, donor_file, verbose, threads)
3461
3471
 
3462
3472
  #################### Query files vs DB ########################
3463
3473
 
@@ -151,8 +151,11 @@ module MiGA::DistanceRunner::Commands
151
151
  donors << tgt_idx if tgt_idx
152
152
  end
153
153
  return nil if donors.empty?
154
+
155
+ # Build target database
156
+ File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
154
157
  run_cmd <<~CMD
155
- FastAAI merge_db --donors "#{donors.join(',')}" \
158
+ FastAAI merge_db --donor_file "#{f0}" \
156
159
  --recipient "#{f1 = tmp_file}" --threads #{opts[:thr]}
157
160
  CMD
158
161
 
@@ -166,7 +169,7 @@ module MiGA::DistanceRunner::Commands
166
169
  # Save values in the databases
167
170
  haai_data = {}
168
171
  aai_data = {}
169
- # Ugly workaround to the insistence of FastAAI to not provide the files
172
+ # Ugly workaround to the insistence of FastAAI not to provide the files
170
173
  # I ask for ;-)
171
174
  qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
172
175
  out_file = File.join(f2, 'results', qry_results)
@@ -127,6 +127,7 @@ module MiGA::DistanceRunner::Database
127
127
  db = tmp_dbs[metric]
128
128
  table = metric == :haai ? :aai : metric
129
129
  SQLite3::Database.new(db) do |conn|
130
+ conn.execute('BEGIN TRANSACTION')
130
131
  data.each do |k, v|
131
132
  sql = <<~SQL
132
133
  insert into #{table} (
@@ -135,6 +136,7 @@ module MiGA::DistanceRunner::Database
135
136
  SQL
136
137
  conn.execute(sql, [dataset.name, k] + v)
137
138
  end
139
+ conn.execute('COMMIT')
138
140
  end
139
141
  checkpoint(metric)
140
142
  end
data/utils/find-medoid.R CHANGED
@@ -5,26 +5,28 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly = T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
- if(Sys.getenv('MIGA') == ''){
10
+ if(Sys.getenv("MIGA") == ""){
11
11
  suppressPackageStartupMessages(library(enveomics.R))
12
12
  }else{
13
- source(file.path(Sys.getenv('MIGA'),
14
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
13
+ source(file.path(
14
+ Sys.getenv("MIGA"),
15
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
16
+ ))
15
17
  }
16
18
 
17
- find_medoids <- function(ani.df, out, clades) {
19
+ find_medoids <- function (ani.df, out, clades) {
18
20
  if(nrow(ani.df) == 0) return(NULL)
19
21
  ani.df$d <- 1 - (ani.df$value/100)
20
- dist <- enve.df2dist(ani.df, 'a', 'b', 'd', default.d = max(ani.df$d)*1.2)
22
+ dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
21
23
  dist <- as.matrix(dist)
22
- cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
24
+ cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
23
25
  cl.s <- c()
24
26
  medoids <- c()
25
27
  for(i in cl){
26
- lab <- strsplit(i, ',')[[1]]
27
- cat('Clade of:', lab[1], '\n')
28
+ lab <- strsplit(i, ",")[[1]]
29
+ cat("Clade of:", lab[1], "\n")
28
30
  if(length(lab) == 1) {
29
31
  lab.s <- lab
30
32
  } else {
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
32
34
  }
33
35
  med <- lab.s[1]
34
36
  medoids <- c(medoids, med)
35
- cl.s <- c(cl.s, paste(lab.s, collapse = ','))
37
+ cl.s <- c(cl.s, paste(lab.s, collapse = ","))
36
38
  }
37
39
  write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
38
- write.table(cl.s, paste(clades, '.sorted', sep = ''), quote = FALSE,
39
- row.names = FALSE, col.names = FALSE)
40
+ write.table(
41
+ cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
42
+ row.names = FALSE, col.names = FALSE
43
+ )
40
44
  }
41
45
 
42
46
  #= Main
43
- load(argv[1])
44
- if(! exists('ani')) ani <- aai
47
+ cat("Finding Medoids")
48
+ ani <- readRDS(argv[1])
45
49
  find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
46
50
 
@@ -44,7 +44,7 @@ module MiGA::SubcladeRunner::Pipeline
44
44
  # Find genomospecies medoids
45
45
  src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
46
46
  dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
47
- `Rscript '#{src}' ../../09.distances/#{dir}/miga-project.Rdata \
47
+ `Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
48
48
  miga-project.gsp-medoids miga-project.gsp-clades`
49
49
  if File.exist? 'miga-project.gsp-clades.sorted'
50
50
  File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
@@ -54,8 +54,6 @@ module MiGA::SubcladeRunner::Pipeline
54
54
  ofh = File.open('miga-project.proposed-clades', 'w')
55
55
  File.open('miga-project.gsp-clades', 'r') do |ifh|
56
56
  ifh.each_line do |ln|
57
- next if $. == 1
58
-
59
57
  r = ln.chomp.split(',')
60
58
  ofh.puts r.join("\t") if r.size >= 5
61
59
  end
data/utils/subclades.R CHANGED
@@ -10,56 +10,51 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- if(Sys.getenv('MIGA') == ''){
13
+ if(Sys.getenv("MIGA") == ""){
14
14
  suppressPackageStartupMessages(library(enveomics.R))
15
15
  }else{
16
- source(file.path(Sys.getenv('MIGA'),
17
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
16
+ source(file.path(
17
+ Sys.getenv("MIGA"),
18
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
19
+ ))
18
20
  }
19
21
 
20
22
  #= Main function
21
23
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
22
- say('==> Out base:', out_base, '<==')
24
+ say("==> Out base:", out_base, "<==")
23
25
 
24
26
  # Normalize input matrix
25
- dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
26
- if(!missing(ani_file)){
27
- if(length(ani.d) == 0 && !file.exists(dist_rdata)){
27
+ dist_rds <- paste(out_base, "dist.rds", sep = ".")
28
+ if (!missing(ani_file)) {
29
+ if(length(ani.d) == 0 && !file.exists(dist_rds)){
28
30
  # Read from ani_file
29
- a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
30
- if(nrow(a) == 0){
31
+ ani.d <- ani_distance(ani_file, sel)
32
+ if (is.null(ani.d)) {
31
33
  generate_empty_files(out_base)
32
34
  return(NULL)
35
+ } else {
36
+ saveRDS(ani.d, dist_rds)
33
37
  }
34
- if(!is.na(sel) && file.exists(sel)){
35
- say('Filter selection')
36
- lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
37
- a <- a[a$a %in% lab & a$b %in% lab, ]
38
- }
39
- say('Distances')
40
- a$d <- 1 - (a$value/100)
41
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
42
- save(ani.d, file = dist_rdata)
43
38
  }
44
39
  }
45
40
 
46
41
  # Read result if the subclade is ready, run it otherwise
47
- if(file.exists(paste(out_base, 'classif', sep = '.'))){
42
+ if (file.exists(paste(out_base, "classif", sep = "."))) {
48
43
  say("Loading")
49
44
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
50
- sep = ' ', as.is = TRUE)[,1]
45
+ sep = " ", as.is = TRUE)[,1]
51
46
  a <- read.table(paste(out_base, "classif", sep="."),
52
- sep = '\t', as.is = TRUE)
47
+ sep = "\t", as.is = TRUE)
53
48
  ani.types <- a[,2]
54
49
  names(ani.types) <- a[,1]
55
- if(length(ani.d) == 0) load(dist_rdata)
56
- }else if(length(labels(ani.d)) > 8L){
57
- res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
58
- if(length(res) == 0) return(NULL)
59
- ani.medoids <- res[['ani.medoids']]
60
- ani.types <- res[['ani.types']]
61
- ani.d <- res[['ani.d']]
62
- }else{
50
+ if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
51
+ } else if (length(labels(ani.d)) > 8L) {
52
+ res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
53
+ if (length(res) == 0) return(NULL)
54
+ ani.medoids <- res[["ani.medoids"]]
55
+ ani.types <- res[["ani.types"]]
56
+ ani.d <- res[["ani.d"]]
57
+ } else {
63
58
  ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
64
59
  ani.types <- rep(1, length(labels(ani.d)))
65
60
  names(ani.types) <- labels(ani.d)
@@ -69,66 +64,80 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
69
64
 
70
65
  # Recursive search
71
66
  say("Recursive search")
72
- for(i in 1:length(ani.medoids)){
67
+ for (i in 1:length(ani.medoids)) {
73
68
  medoid <- ani.medoids[i]
74
69
  ds_f <- names(ani.types)[ ani.types==i ]
75
70
  say("Analyzing subclade", i, "with medoid:", medoid)
76
71
  dir_f <- paste(out_base, ".sc-", i, sep="")
77
- if(!dir.exists(dir_f)) dir.create(dir_f)
72
+ if (!dir.exists(dir_f)) dir.create(dir_f)
78
73
  write.table(ds_f,
79
74
  paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
80
75
  quote=FALSE, col.names=FALSE, row.names=FALSE)
81
- if(length(ds_f) > 8L){
76
+ if (length(ds_f) > 8L) {
82
77
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
83
- subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
- thr=thr, ani.d=ani_subset)
78
+ subclades(
79
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
80
+ thr = thr,
81
+ ani.d = ani_subset
82
+ )
85
83
  }
86
84
  }
87
85
 
88
86
  # Declare recursion up-to-here complete
89
- write.table(date(), paste(out_base, 'ready', sep='.'),
90
- quote=FALSE, row.names=FALSE, col.names=FALSE)
87
+ write.table(
88
+ date(), paste(out_base, "ready", sep = "."),
89
+ quote = FALSE, row.names = FALSE, col.names = FALSE
90
+ )
91
91
  }
92
92
 
93
93
  #= Heavy-lifter
94
- subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
94
+ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
95
95
  # Get ANI distances
96
- if(length(ani.d) > 0){
97
- # Just use ani.d (and save in dist_rdata_
98
- save(ani.d, file=dist_rdata)
99
- }else if(file.exists(dist_rdata)){
100
- # Read from dist_rdata
101
- load(dist_rdata)
102
- }else{
96
+ if (length(ani.d) > 0) {
97
+ # Just use ani.d (and save in dist_rds)
98
+ if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
99
+ } else if (file.exists(dist_rds)) {
100
+ # Read from dist_rds
101
+ ani.d <- readRDS(dist_rds)
102
+ } else {
103
103
  stop("Cannot find input matrix", out_base)
104
104
  }
105
- if(length(labels(ani.d)) <= 8L) return(list())
105
+ if (length(labels(ani.d)) <= 8L) return(list())
106
106
 
107
107
  # Build tree
108
108
  say("Tree")
109
109
  ani.ph <- bionj(ani.d)
110
- express.ori <- options('expressions')$expressions
111
- if(express.ori < ani.ph$Nnode*4){
112
- options(expressions=min(c(5e7,ani.ph$Nnode*4)))
110
+ say("- Write")
111
+ express.ori <- options("expressions")$expressions
112
+ if(express.ori < ani.ph$Nnode * 4){
113
+ options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
113
114
  }
114
- write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
115
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
115
116
  options(expressions=express.ori)
116
117
 
117
118
  # Silhouette
118
119
  say("Silhouette")
119
120
  nn <- length(labels(ani.d))
120
121
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
122
+ say("- Make cluster")
121
123
  cl <- makeCluster(thr)
122
- s <- parSapply(cl, k, function(x) {
124
+ say("- Launch parallel jobs")
125
+ s <- parSapply(
126
+ cl, k,
127
+ function(x) {
123
128
  library(cluster)
124
- s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
125
- c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
126
- })
129
+ s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
130
+ c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
131
+ }
132
+ )
133
+ say("- Stop cluster")
127
134
  stopCluster(cl)
128
- s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
129
- s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
130
- ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
131
- if(mean(s[1,]<0)<0.75) ds[s[1,]<0] <- mean(ds) # <- k's with negative average
135
+ say("- Calculate custom criteria")
136
+ s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
137
+ s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
138
+ ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
139
+ if(mean(s[1,] < 0) < 0.75)
140
+ ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
132
141
  top.n <- k[which.max(ds)]
133
142
 
134
143
  # Classify genomes
@@ -139,8 +148,8 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
139
148
 
140
149
  # Generate graphic report
141
150
  say("Graphic report")
142
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
143
- layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
151
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
152
+ layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
144
153
  plot_distances(ani.d)
145
154
  plot_silhouette(k, s[1,], s[2,], ds, top.n)
146
155
  plot_clustering(ani.cl, ani.d, ani.types)
@@ -153,112 +162,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
153
162
  # Return data
154
163
  say("Cluster ready")
155
164
  return(list(
156
- ani.medoids=ani.medoids,
157
- ani.types=ani.types,
158
- ani.d=ani.d
165
+ ani.medoids = ani.medoids,
166
+ ani.types = ani.types,
167
+ ani.d = ani.d
159
168
  ))
160
169
  }
161
170
 
162
171
  #= Helper functions
163
- say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
172
+ say <- function (...) {
173
+ message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
174
+ }
164
175
 
165
- generate_empty_files <- function(out_base) {
166
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
167
- plot(1, t="n", axes=F)
168
- legend("center", "No data", bty="n")
176
+ generate_empty_files <- function (out_base) {
177
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
178
+ plot(1, t = "n", axes = F)
179
+ legend("center", "No data", bty = "n")
169
180
  dev.off()
170
- file.create(paste(out_base,".1.classif",sep=""))
171
- file.create(paste(out_base,".1.medoids",sep=""))
181
+ file.create(paste(out_base, ".1.classif", sep = ""))
182
+ file.create(paste(out_base, ".1.medoids", sep = ""))
172
183
  }
173
184
 
174
- write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
175
- say('Text report')
176
- write.table(ani.medoids, paste(out_base, 'medoids', sep='.'),
177
- quote=FALSE, col.names=FALSE, row.names=FALSE)
178
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
179
- ani.d.m <- 100 - as.matrix(ani.d)*100
180
- for(j in 1:nrow(classif)){
185
+ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
186
+ say("Text report")
187
+ write.table(
188
+ ani.medoids, paste(out_base, "medoids", sep = "."),
189
+ quote = FALSE, col.names = FALSE, row.names = FALSE
190
+ )
191
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
192
+ ani.d.m <- 100 - as.matrix(ani.d) * 100
193
+ for (j in 1:nrow(classif)) {
181
194
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
182
195
  }
183
- write.table(classif, paste(out_base,"classif",sep="."),
184
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
196
+ write.table(
197
+ classif, paste(out_base, "classif", sep="."),
198
+ quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
199
+ )
185
200
  }
186
201
 
187
- plot_silhouette <- function(k, s, ns, ds, top.n) {
202
+ plot_silhouette <- function (k, s, ns, ds, top.n) {
188
203
  # s
189
- par(mar=c(4,5,1,5)+0.1)
190
- plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
191
- ylim=range(s), bty="n", xaxs="i", yaxt="n")
192
- polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
193
- axis(2, fg="grey60", col.axis="grey60")
194
- mtext("Mean silhouette", side=2, line=3, col="grey60")
204
+ par(mar = c(4,5,1,5)+0.1)
205
+ plot(
206
+ 1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
207
+ ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
208
+ )
209
+ polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
210
+ axis(2, fg = "grey60", col.axis = "grey60")
211
+ mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
212
+
195
213
  # ns
196
- par(new=TRUE)
197
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
198
- ylim=range(ns), bty="n", xaxs="i")
199
- points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
200
- axis(4, fg="darkred", col.axis="darkred")
201
- mtext("Negative silhouette area", side=4, line=3, col="darkred")
214
+ par(new = TRUE)
215
+ plot(
216
+ 1, t = "n", bty = "n",
217
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
218
+ xlim = range(c(0,k)), ylim = range(ns)
219
+ )
220
+ points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
221
+ axis(4, fg = "darkred", col.axis = "darkred")
222
+ mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
223
+
202
224
  # ds
203
- par(new=TRUE)
204
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
205
- ylim=range(ds), bty="n", xaxs="i")
225
+ par(new = TRUE)
226
+ plot(
227
+ 1, t = "n", bty = "n",
228
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
229
+ xlim = range(c(0,k)), ylim = range(ds)
230
+ )
206
231
  lines(k, ds)
207
- abline(v=top.n, lty=2)
232
+ abline(v = top.n, lty = 2)
208
233
  }
209
234
 
210
- plot_distances <- function(dist) {
211
- par(mar=c(5,4,1,2)+0.1)
212
- hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
235
+ plot_distances <- function (dist) {
236
+ par(mar = c(5,4,1,2) + 0.1)
237
+ hist(
238
+ dist, border = NA, col = "grey60", breaks = 50,
239
+ xlab = "Distances", main = ""
240
+ )
213
241
  }
214
242
 
215
- plot_clustering <- function(cl, dist, types) {
216
- par(mar=c(5,4,4,2)+0.1)
243
+ plot_clustering <- function (cl, dist, types) {
244
+ par(mar = c(5,4,4,2) + 0.1)
217
245
  top.n <- length(cl$medoids)
218
246
  col <- ggplotColours(top.n)
219
- plot(silhouette(cl), col=col)
220
- if(length(labels(dist))<=15){
221
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
222
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
223
- }else{
224
- ani.mds <- cmdscale(dist, k=4)
225
- if(ncol(ani.mds)==4){
226
- plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
227
- xlab='Component 1', ylab='Component 2')
228
- plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
229
- xlab='Component 3', ylab='Component 4')
247
+ plot(silhouette(cl), col = col)
248
+ if (length(labels(dist)) <= 15) {
249
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
250
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
251
+ } else {
252
+ ani.mds <- cmdscale(dist, k = 4)
253
+ if (ncol(ani.mds) == 4) {
254
+ plot(
255
+ ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
256
+ xlab = "Component 1", ylab = "Component 2"
257
+ )
258
+ plot(
259
+ ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
260
+ xlab = "Component 3", ylab="Component 4"
261
+ )
230
262
  }else{
231
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
232
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
263
+ for (i in 1:2)
264
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
233
265
  }
234
266
  }
235
267
  }
236
268
 
237
- plot_tree <- function(phy, types, medoids){
269
+ plot_tree <- function (phy, types, medoids) {
238
270
  layout(1)
239
271
  top.n <- length(unique(types))
240
272
  col <- ggplotColours(top.n)
241
273
  is.medoid <- phy$tip.label %in% medoids
242
- phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
243
- " [", types[phy$tip.label[is.medoid]], "]", sep='')
244
- plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
245
- font=ifelse(is.medoid, 2, 1),
246
- tip.color=col[types[phy$tip.label]])
274
+ phy$tip.label[is.medoid] <- paste(
275
+ phy$tip.label[is.medoid],
276
+ " [", types[phy$tip.label[is.medoid]], "]",
277
+ sep = ""
278
+ )
279
+ plot(
280
+ phy, cex = ifelse(is.medoid, 1/3, 1/6),
281
+ font = ifelse(is.medoid, 2, 1),
282
+ tip.color = col[types[phy$tip.label]]
283
+ )
284
+ }
285
+
286
+ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
287
+ if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
288
+ hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
247
289
  }
248
290
 
249
- ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
250
- if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
251
- hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
291
+ ani_distance <- function (ani_file, sel) {
292
+ # Try to locate rds, otherwise read gzipped table
293
+ rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
294
+ if (file.exists(rds)) {
295
+ sim <- readRDS(rds)
296
+ } else {
297
+ sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
298
+ }
299
+
300
+ # If there is no data, end the process
301
+ if (nrow(sim) == 0) return(NULL)
302
+
303
+ # Apply filter (if requested)
304
+ if (!is.na(sel) && file.exists(sel)) {
305
+ say("Filter selection")
306
+ lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
307
+ sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
308
+ }
309
+
310
+ # Transform to distances
311
+ say("Distances")
312
+ sim$d <- 1 - (sim$value / 100)
313
+ return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
252
314
  }
253
315
 
254
316
  #= Main
255
317
  options(warn = 1)
256
- if(length(argv) >= 5 & argv[5] == 'empty'){
318
+ if (length(argv) >= 5 & argv[5] == "empty") {
257
319
  generate_empty_files(argv[2])
258
- write.table(NULL, paste(argv[2], "medoids", sep="."))
259
- write.table(NULL, paste(argv[2], "classif", sep="."))
260
- write.table(date(), paste(argv[2], "ready", sep="."))
320
+ write.table(NULL, paste(argv[2], "medoids", sep = "."))
321
+ write.table(NULL, paste(argv[2], "classif", sep = "."))
322
+ write.table(date(), paste(argv[2], "ready", sep = "."))
261
323
  }else{
262
- subclades(ani_file = argv[1], out_base = argv[2],
263
- thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
324
+ subclades(
325
+ ani_file = argv[1],
326
+ out_base = argv[2],
327
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
328
+ sel = argv[4]
329
+ )
264
330
  }
331
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2.2
4
+ version: 1.1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-09 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons