miga-base 1.1.2.4 → 1.1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: beb13a42d514a4cf731b3b042096728cd6d615124f1412046ad16fb8de2b709e
4
- data.tar.gz: 64e32095ac59863d4005e602e55a1396530aac12da40fe88316985c95a90eb12
3
+ metadata.gz: 683a93a244b106eb24f94cff47695b28cc051701ecd4ea83b289a6650fdf3985
4
+ data.tar.gz: 6ce192df7a66502535a5c24406d9d3f1d0a1d51c11f3fb6e08f353547661c053
5
5
  SHA512:
6
- metadata.gz: d8f813d3b32cfdfb86da746e63c8e7a3a5c142822dc44a7c2588219368bed31851a77cd05ce30ee2d1dc02d930beaab836cda58a122cc5955d6ec0235b2edb42
7
- data.tar.gz: 81d773c3d4078ec3fd27d389bb1159aa9585dafdd2efdb5a917471ec784d9ebf24e67f2fba5358a4c4c589d03a12ab9f3e1b120e83390e0f0354a059a0cbb568
6
+ metadata.gz: cd07ba5ea8d3088ba9e7752c13bcdc07a0cf73175687c021e93c6df9361abd6dab361ffaed562f421f551999deee3de4f32d388f96f050ff45750d22bbe3eaf3
7
+ data.tar.gz: 18ba878127bc0abb7aed7b63398091cc2ed0fbee0d8629850a25d26ab9690f4489a8aca4fa8a3e2f2a8d4e80bdf1fd065511c23774327e988fac9ff0df645041
@@ -59,13 +59,14 @@ module MiGA::Cli::Action::Doctor::Base
59
59
  next if (lineno += 1) == 1
60
60
 
61
61
  r = ln.split("\t")
62
- next unless [1, 2].map { |i| p.dataset(r[i]).nil? }.any?
62
+ names = [r[0], r[1]]
63
+ next unless names.any? { |i| p.dataset(i).nil? }
63
64
 
64
- [1, 2].each do |i|
65
- if p.dataset(r[i]).nil? || !p.dataset(r[i]).active?
66
- notok[r[i]] = true
65
+ names.each do |i|
66
+ if p.dataset(i).nil? || !p.dataset(i).active?
67
+ notok[i] = true
67
68
  else
68
- fix[r[i]] = true
69
+ fix[i] = true
69
70
  end
70
71
  end
71
72
  end
@@ -55,10 +55,11 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .txt])
58
+ return nil unless result_files_exist?(base, %w[.rds .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
- r.add_file(:rdata, 'miga-project.Rdata')
61
+ r.add_file(:rds, 'miga-project.rds')
62
+ r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
62
63
  r.add_file(:matrix, 'miga-project.txt')
63
64
  r.add_file(:log, 'miga-project.log') # Legacy file
64
65
  r.add_file(:hist, 'miga-project.hist')
@@ -82,12 +83,13 @@ module MiGA::Project::Result
82
83
  end
83
84
 
84
85
  r = add_result_iter_clades(base)
85
- r.add_file(:aai_tree, 'miga-project.aai.nwk')
86
- r.add_file(:proposal, 'miga-project.proposed-clades')
87
- r.add_file(:clades_aai90, 'miga-project.aai90-clades')
88
- r.add_file(:clades_ani95, 'miga-project.ani95-clades')
89
- r.add_file(:clades_gsp, 'miga-project.gsp-clades')
90
- r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
86
+ r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
87
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
88
+ r.add_file(:proposal, 'miga-project.proposed-clades')
89
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
90
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
91
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
92
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
91
93
  r
92
94
  end
93
95
 
@@ -105,6 +107,7 @@ module MiGA::Project::Result
105
107
 
106
108
  r = add_result_iter_clades(base)
107
109
  r.add_file(:ani_tree, 'miga-project.ani.nwk')
110
+ r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
108
111
  r
109
112
  end
110
113
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.1, 2, 4].freeze
15
+ VERSION = [1.1, 3, 3].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2021, 11, 16)
23
+ VERSION_DATE = Date.new(2021, 11, 29)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(aai, file = 'miga-project.Rdata')
28
- if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
- h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(aai, file = "miga-project.rds")
28
+ if(sum(aai[, "a"] != aai[, "b"]) > 0) {
29
+ h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(ani, file = 'miga-project.Rdata')
28
- if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
- h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(ani, file = "miga-project.rds")
28
+ if(sum(ani[, "a"] != ani[, "b"]) > 0) {
29
+ h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
15
15
  # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
17
  echo -n "" > miga-project.txt
18
- echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
18
+ echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
19
19
 
20
20
  # Finalize
21
21
  miga_end_project_step "$DIR"
data/test/project_test.rb CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
82
82
  def test_add_result
83
83
  p1 = project
84
84
  assert_nil(p1.add_result(:doom))
85
- %w[.Rdata .log .txt .done].each do |x|
85
+ %w[.rds .log .txt .done].each do |x|
86
86
  assert_nil(p1.add_result(:haai_distances))
87
87
  FileUtils.touch(
88
88
  File.join(
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
117
117
  # Project tasks
118
118
  expected_files = {
119
119
  project_stats: %w[.taxonomy.json .metadata.db],
120
- haai_distances: %w[.Rdata .log .txt],
121
- aai_distances: %w[.Rdata .log .txt],
122
- ani_distances: %w[.Rdata .log .txt],
123
- clade_finding: %w[.pdf .classif .medoids
124
- .class.tsv .class.nwk .proposed-clades],
120
+ haai_distances: %w[.rds .log .txt],
121
+ aai_distances: %w[.rds .log .txt],
122
+ ani_distances: %w[.rds .log .txt],
123
+ clade_finding: %w[
124
+ .pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
125
+ ],
125
126
  subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
126
127
  ogs: %w[.ogs .stats]
127
128
  }
@@ -2702,6 +2702,8 @@ def merge_db_opts():
2702
2702
 
2703
2703
  parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
2704
2704
 
2705
+ parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')
2706
+
2705
2707
  parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
2706
2708
 
2707
2709
  parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
@@ -2720,16 +2722,23 @@ def merge_db_thread_starter(rev_index, per_db_accs):
2720
2722
 
2721
2723
 
2722
2724
 
2723
- def merge_db(recipient, donors, verbose, threads):
2725
+ def merge_db(recipient, donors, donor_file, verbose, threads):
2724
2726
  #Prettier on the CLI
2725
-
2727
+
2728
+ if donor_file is not None:
2729
+ fh = agnostic_reader(donor_file)
2730
+ donors = [line.strip() for line in fh]
2731
+ fh.close()
2732
+
2726
2733
  if donors is None or recipient is None:
2727
2734
  print("Either donor or target not given. FastAAI is exiting.")
2728
2735
  return None
2729
2736
 
2730
2737
  print("")
2731
2738
 
2732
- donors = donors.split(",")
2739
+ if donor_file is None:
2740
+ donors = donors.split(",")
2741
+
2733
2742
  valid_donors = []
2734
2743
  for d in donors:
2735
2744
  if os.path.exists(d):
@@ -3454,10 +3463,11 @@ def main():
3454
3463
 
3455
3464
  recipient = opts.recipient
3456
3465
  donors = opts.donors
3466
+ donor_file = opts.donor_file
3457
3467
  verbose = opts.verbose
3458
3468
  threads = opts.threads
3459
3469
 
3460
- merge_db(recipient, donors, verbose, threads)
3470
+ merge_db(recipient, donors, donor_file, verbose, threads)
3461
3471
 
3462
3472
  #################### Query files vs DB ########################
3463
3473
 
@@ -151,22 +151,36 @@ module MiGA::DistanceRunner::Commands
151
151
  donors << tgt_idx if tgt_idx
152
152
  end
153
153
  return nil if donors.empty?
154
- run_cmd <<~CMD
155
- FastAAI merge_db --donors "#{donors.join(',')}" \
156
- --recipient "#{f1 = tmp_file}" --threads #{opts[:thr]}
157
- CMD
154
+
155
+ # Build target database
156
+ f1 = tmp_file
157
+ if donors.size == 1
158
+ File.copy(donors.first, f1)
159
+ else
160
+ File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
161
+ run_cmd(
162
+ <<~CMD
163
+ FastAAI merge_db --threads #{opts[:thr]} \
164
+ --donor_file "#{f0}" --recipient "#{f1}"
165
+ CMD
166
+ )
167
+ raise "Cannot merge databases into: #{f1}" unless File.size?(f1)
168
+ end
158
169
 
159
170
  # Run FastAAI
160
- run_cmd <<~CMD
161
- FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
162
- --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
163
- --do_stdev
164
- CMD
171
+ run_cmd(
172
+ <<~CMD
173
+ FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
174
+ --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
175
+ --do_stdev
176
+ CMD
177
+ )
178
+ raise "Cannot find FastAAI output directory: #{f2}" unless Dir.exist?(f2)
165
179
 
166
180
  # Save values in the databases
167
181
  haai_data = {}
168
182
  aai_data = {}
169
- # Ugly workaround to the insistence of FastAAI to not provide the files
183
+ # Ugly workaround to the insistence of FastAAI not to provide the files
170
184
  # I ask for ;-)
171
185
  qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
172
186
  out_file = File.join(f2, 'results', qry_results)
@@ -214,6 +228,6 @@ module MiGA::DistanceRunner::Commands
214
228
 
215
229
  def run_cmd(cmd)
216
230
  puts "CMD: #{cmd}"
217
- `#{cmd}`
231
+ puts `#{cmd} 2>&1`
218
232
  end
219
233
  end
@@ -127,6 +127,7 @@ module MiGA::DistanceRunner::Database
127
127
  db = tmp_dbs[metric]
128
128
  table = metric == :haai ? :aai : metric
129
129
  SQLite3::Database.new(db) do |conn|
130
+ conn.execute('BEGIN TRANSACTION')
130
131
  data.each do |k, v|
131
132
  sql = <<~SQL
132
133
  insert into #{table} (
@@ -135,6 +136,7 @@ module MiGA::DistanceRunner::Database
135
136
  SQL
136
137
  conn.execute(sql, [dataset.name, k] + v)
137
138
  end
139
+ conn.execute('COMMIT')
138
140
  end
139
141
  checkpoint(metric)
140
142
  end
data/utils/find-medoid.R CHANGED
@@ -5,26 +5,28 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly = T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
- if(Sys.getenv('MIGA') == ''){
10
+ if(Sys.getenv("MIGA") == ""){
11
11
  suppressPackageStartupMessages(library(enveomics.R))
12
12
  }else{
13
- source(file.path(Sys.getenv('MIGA'),
14
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
13
+ source(file.path(
14
+ Sys.getenv("MIGA"),
15
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
16
+ ))
15
17
  }
16
18
 
17
- find_medoids <- function(ani.df, out, clades) {
19
+ find_medoids <- function (ani.df, out, clades) {
18
20
  if(nrow(ani.df) == 0) return(NULL)
19
21
  ani.df$d <- 1 - (ani.df$value/100)
20
- dist <- enve.df2dist(ani.df, 'a', 'b', 'd', default.d = max(ani.df$d)*1.2)
22
+ dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
21
23
  dist <- as.matrix(dist)
22
- cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
24
+ cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
23
25
  cl.s <- c()
24
26
  medoids <- c()
25
27
  for(i in cl){
26
- lab <- strsplit(i, ',')[[1]]
27
- cat('Clade of:', lab[1], '\n')
28
+ lab <- strsplit(i, ",")[[1]]
29
+ cat("Clade of:", lab[1], "\n")
28
30
  if(length(lab) == 1) {
29
31
  lab.s <- lab
30
32
  } else {
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
32
34
  }
33
35
  med <- lab.s[1]
34
36
  medoids <- c(medoids, med)
35
- cl.s <- c(cl.s, paste(lab.s, collapse = ','))
37
+ cl.s <- c(cl.s, paste(lab.s, collapse = ","))
36
38
  }
37
39
  write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
38
- write.table(cl.s, paste(clades, '.sorted', sep = ''), quote = FALSE,
39
- row.names = FALSE, col.names = FALSE)
40
+ write.table(
41
+ cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
42
+ row.names = FALSE, col.names = FALSE
43
+ )
40
44
  }
41
45
 
42
46
  #= Main
43
- load(argv[1])
44
- if(! exists('ani')) ani <- aai
47
+ cat("Finding Medoids")
48
+ ani <- readRDS(argv[1])
45
49
  find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
46
50
 
@@ -7,9 +7,12 @@ module MiGA::SubcladeRunner::Pipeline
7
7
  aai90: [:aai_distances, opts[:gsp_aai], :aai]
8
8
  }
9
9
  tasks.each do |k, par|
10
+ # Run only the requested metric
11
+ next unless par[2].to_s == opts[:gsp_metric]
12
+
10
13
  # Final output
11
14
  ogs_file = "miga-project.#{k}-clades"
12
- next if File.size? ogs_file
15
+ next if File.size?(ogs_file)
13
16
 
14
17
  # Build ABC files
15
18
  abc_path = tmp_file("#{k}.abc")
@@ -20,7 +23,7 @@ module MiGA::SubcladeRunner::Pipeline
20
23
  next if ln =~ /^a\tb\tvalue\t/
21
24
 
22
25
  r = ln.chomp.split("\t")
23
- ofh.puts "G>#{r[0]}\tG>#{r[1]}\t#{r[2]}" if r[2].to_f >= par[1]
26
+ ofh.puts("G>#{r[0]}\tG>#{r[1]}\t#{r[2]}") if r[2].to_f >= par[1]
24
27
  end
25
28
  end
26
29
  ofh.close
@@ -29,22 +32,20 @@ module MiGA::SubcladeRunner::Pipeline
29
32
  `ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
30
33
  File.open(ogs_file, 'w') do |fh|
31
34
  File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
32
- fh.puts ln if lno > 0
35
+ fh.puts(ln) if lno > 0
33
36
  end
34
37
  end
35
38
  File.unlink "#{ogs_file}.tmp"
36
39
  else
37
- FileUtils.touch ogs_file
38
- end
39
- if par[2].to_s == opts[:gsp_metric]
40
- FileUtils.cp(ogs_file, "miga-project.gsp-clades")
40
+ FileUtils.touch(ogs_file)
41
41
  end
42
+ FileUtils.cp(ogs_file, 'miga-project.gsp-clades')
42
43
  end
43
44
 
44
45
  # Find genomospecies medoids
45
46
  src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
46
47
  dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
47
- `Rscript '#{src}' ../../09.distances/#{dir}/miga-project.Rdata \
48
+ `Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
48
49
  miga-project.gsp-medoids miga-project.gsp-clades`
49
50
  if File.exist? 'miga-project.gsp-clades.sorted'
50
51
  File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
@@ -54,8 +55,6 @@ module MiGA::SubcladeRunner::Pipeline
54
55
  ofh = File.open('miga-project.proposed-clades', 'w')
55
56
  File.open('miga-project.gsp-clades', 'r') do |ifh|
56
57
  ifh.each_line do |ln|
57
- next if $. == 1
58
-
59
58
  r = ln.chomp.split(',')
60
59
  ofh.puts r.join("\t") if r.size >= 5
61
60
  end
@@ -69,7 +68,7 @@ module MiGA::SubcladeRunner::Pipeline
69
68
  metric_res = project.result(step) or raise "Incomplete step #{step}"
70
69
  matrix = metric_res.file_path(:matrix)
71
70
  `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
72
- miga-project.ani95-medoids '#{opts[:run_clades] ? 'cluster' : 'empty'}'`
71
+ miga-project.gsp-medoids '#{opts[:run_clades] ? 'cluster' : 'empty'}'`
73
72
  if File.exist? 'miga-project.nwk'
74
73
  File.rename('miga-project.nwk', "miga-project.#{metric}.nwk")
75
74
  end
data/utils/subclades.R CHANGED
@@ -10,56 +10,55 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- if(Sys.getenv('MIGA') == ''){
13
+ if(Sys.getenv("MIGA") == ""){
14
14
  suppressPackageStartupMessages(library(enveomics.R))
15
15
  }else{
16
- source(file.path(Sys.getenv('MIGA'),
17
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
16
+ source(file.path(
17
+ Sys.getenv("MIGA"),
18
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
19
+ ))
18
20
  }
19
21
 
20
22
  #= Main function
21
23
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
22
- say('==> Out base:', out_base, '<==')
24
+ say("==> Out base:", out_base, "<==")
23
25
 
24
26
  # Normalize input matrix
25
- dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
26
- if(!missing(ani_file)){
27
- if(length(ani.d) == 0 && !file.exists(dist_rdata)){
28
- # Read from ani_file
29
- a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
30
- if(nrow(a) == 0){
31
- generate_empty_files(out_base)
32
- return(NULL)
33
- }
34
- if(!is.na(sel) && file.exists(sel)){
35
- say('Filter selection')
36
- lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
37
- a <- a[a$a %in% lab & a$b %in% lab, ]
27
+ dist_rds <- paste(out_base, "dist.rds", sep = ".")
28
+ if (!missing(ani_file)) {
29
+ if (length(ani.d) == 0) {
30
+ if (file.exists(dist_rds)) {
31
+ ani.d <- readRDS(dist_rds)
32
+ } else {
33
+ # Read from ani_file
34
+ ani.d <- ani_distance(ani_file, sel)
35
+ if (is.null(ani.d)) {
36
+ generate_empty_files(out_base)
37
+ return(NULL)
38
+ } else {
39
+ saveRDS(ani.d, dist_rds)
40
+ }
38
41
  }
39
- say('Distances')
40
- a$d <- 1 - (a$value/100)
41
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
42
- save(ani.d, file = dist_rdata)
43
42
  }
44
43
  }
45
44
 
46
45
  # Read result if the subclade is ready, run it otherwise
47
- if(file.exists(paste(out_base, 'classif', sep = '.'))){
46
+ if (file.exists(paste(out_base, "classif", sep = "."))) {
48
47
  say("Loading")
49
48
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
50
- sep = ' ', as.is = TRUE)[,1]
49
+ sep = " ", as.is = TRUE)[,1]
51
50
  a <- read.table(paste(out_base, "classif", sep="."),
52
- sep = '\t', as.is = TRUE)
51
+ sep = "\t", as.is = TRUE)
53
52
  ani.types <- a[,2]
54
53
  names(ani.types) <- a[,1]
55
- if(length(ani.d) == 0) load(dist_rdata)
56
- }else if(length(labels(ani.d)) > 8L){
57
- res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
58
- if(length(res) == 0) return(NULL)
59
- ani.medoids <- res[['ani.medoids']]
60
- ani.types <- res[['ani.types']]
61
- ani.d <- res[['ani.d']]
62
- }else{
54
+ if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
55
+ } else if (length(labels(ani.d)) > 8L) {
56
+ res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
57
+ if (length(res) == 0) return(NULL)
58
+ ani.medoids <- res[["ani.medoids"]]
59
+ ani.types <- res[["ani.types"]]
60
+ ani.d <- res[["ani.d"]]
61
+ } else {
63
62
  ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
64
63
  ani.types <- rep(1, length(labels(ani.d)))
65
64
  names(ani.types) <- labels(ani.d)
@@ -69,66 +68,69 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
69
68
 
70
69
  # Recursive search
71
70
  say("Recursive search")
72
- for(i in 1:length(ani.medoids)){
71
+ for (i in 1:length(ani.medoids)) {
73
72
  medoid <- ani.medoids[i]
74
73
  ds_f <- names(ani.types)[ ani.types==i ]
75
74
  say("Analyzing subclade", i, "with medoid:", medoid)
76
75
  dir_f <- paste(out_base, ".sc-", i, sep="")
77
- if(!dir.exists(dir_f)) dir.create(dir_f)
76
+ if (!dir.exists(dir_f)) dir.create(dir_f)
78
77
  write.table(ds_f,
79
78
  paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
80
79
  quote=FALSE, col.names=FALSE, row.names=FALSE)
81
- if(length(ds_f) > 8L){
80
+ if (length(ds_f) > 8L) {
82
81
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
83
- subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
- thr=thr, ani.d=ani_subset)
82
+ subclades(
83
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
+ thr = thr,
85
+ ani.d = ani_subset
86
+ )
85
87
  }
86
88
  }
87
89
 
88
90
  # Declare recursion up-to-here complete
89
- write.table(date(), paste(out_base, 'ready', sep='.'),
90
- quote=FALSE, row.names=FALSE, col.names=FALSE)
91
+ write.table(
92
+ date(), paste(out_base, "ready", sep = "."),
93
+ quote = FALSE, row.names = FALSE, col.names = FALSE
94
+ )
91
95
  }
92
96
 
93
97
  #= Heavy-lifter
94
- subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
98
+ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
95
99
  # Get ANI distances
96
- if(length(ani.d) > 0){
97
- # Just use ani.d (and save in dist_rdata_
98
- save(ani.d, file=dist_rdata)
99
- }else if(file.exists(dist_rdata)){
100
- # Read from dist_rdata
101
- load(dist_rdata)
102
- }else{
100
+ if (length(ani.d) > 0) {
101
+ # Just use ani.d (and save in dist_rds)
102
+ if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
103
+ } else if (file.exists(dist_rds)) {
104
+ # Read from dist_rds
105
+ ani.d <- readRDS(dist_rds)
106
+ } else {
103
107
  stop("Cannot find input matrix", out_base)
104
108
  }
105
- if(length(labels(ani.d)) <= 8L) return(list())
109
+ if (length(labels(ani.d)) <= 8L) return(list())
106
110
 
107
- # Build tree
108
- say("Tree")
109
- ani.ph <- bionj(ani.d)
110
- express.ori <- options('expressions')$expressions
111
- if(express.ori < ani.ph$Nnode*4){
112
- options(expressions=min(c(5e7,ani.ph$Nnode*4)))
113
- }
114
- write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
115
- options(expressions=express.ori)
116
-
117
111
  # Silhouette
118
112
  say("Silhouette")
119
113
  nn <- length(labels(ani.d))
120
114
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
115
+ say("- Make cluster")
121
116
  cl <- makeCluster(thr)
122
- s <- parSapply(cl, k, function(x) {
117
+ say("- Launch parallel jobs")
118
+ s <- parSapply(
119
+ cl, k,
120
+ function(x) {
123
121
  library(cluster)
124
- s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
125
- c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
126
- })
122
+ s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
123
+ c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
124
+ }
125
+ )
126
+ say("- Stop cluster")
127
127
  stopCluster(cl)
128
- s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
129
- s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
130
- ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
131
- if(mean(s[1,]<0)<0.75) ds[s[1,]<0] <- mean(ds) # <- k's with negative average
128
+ say("- Calculate custom criteria")
129
+ s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
130
+ s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
131
+ ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
132
+ if(mean(s[1,] < 0) < 0.75)
133
+ ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
132
134
  top.n <- k[which.max(ds)]
133
135
 
134
136
  # Classify genomes
@@ -137,10 +139,21 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
137
139
  ani.types <- ani.cl$clustering
138
140
  ani.medoids <- ani.cl$medoids
139
141
 
142
+ # Build tree
143
+ say("Tree")
144
+ ani.ph <- bionj(ani.d)
145
+ say("- Write")
146
+ express.ori <- options("expressions")$expressions
147
+ if(express.ori < ani.ph$Nnode * 4){
148
+ options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
149
+ }
150
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
151
+ options(expressions=express.ori)
152
+
140
153
  # Generate graphic report
141
154
  say("Graphic report")
142
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
143
- layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
155
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
156
+ layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
144
157
  plot_distances(ani.d)
145
158
  plot_silhouette(k, s[1,], s[2,], ds, top.n)
146
159
  plot_clustering(ani.cl, ani.d, ani.types)
@@ -153,112 +166,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
153
166
  # Return data
154
167
  say("Cluster ready")
155
168
  return(list(
156
- ani.medoids=ani.medoids,
157
- ani.types=ani.types,
158
- ani.d=ani.d
169
+ ani.medoids = ani.medoids,
170
+ ani.types = ani.types,
171
+ ani.d = ani.d
159
172
  ))
160
173
  }
161
174
 
162
175
  #= Helper functions
163
- say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
176
+ say <- function (...) {
177
+ message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
178
+ }
164
179
 
165
- generate_empty_files <- function(out_base) {
166
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
167
- plot(1, t="n", axes=F)
168
- legend("center", "No data", bty="n")
180
+ generate_empty_files <- function (out_base) {
181
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
182
+ plot(1, t = "n", axes = F)
183
+ legend("center", "No data", bty = "n")
169
184
  dev.off()
170
- file.create(paste(out_base,".1.classif",sep=""))
171
- file.create(paste(out_base,".1.medoids",sep=""))
185
+ file.create(paste(out_base, ".1.classif", sep = ""))
186
+ file.create(paste(out_base, ".1.medoids", sep = ""))
172
187
  }
173
188
 
174
- write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
175
- say('Text report')
176
- write.table(ani.medoids, paste(out_base, 'medoids', sep='.'),
177
- quote=FALSE, col.names=FALSE, row.names=FALSE)
178
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
179
- ani.d.m <- 100 - as.matrix(ani.d)*100
180
- for(j in 1:nrow(classif)){
189
+ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
190
+ say("Text report")
191
+ write.table(
192
+ ani.medoids, paste(out_base, "medoids", sep = "."),
193
+ quote = FALSE, col.names = FALSE, row.names = FALSE
194
+ )
195
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
196
+ ani.d.m <- 100 - as.matrix(ani.d) * 100
197
+ for (j in 1:nrow(classif)) {
181
198
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
182
199
  }
183
- write.table(classif, paste(out_base,"classif",sep="."),
184
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
200
+ write.table(
201
+ classif, paste(out_base, "classif", sep="."),
202
+ quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
203
+ )
185
204
  }
186
205
 
187
- plot_silhouette <- function(k, s, ns, ds, top.n) {
206
+ plot_silhouette <- function (k, s, ns, ds, top.n) {
188
207
  # s
189
- par(mar=c(4,5,1,5)+0.1)
190
- plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
191
- ylim=range(s), bty="n", xaxs="i", yaxt="n")
192
- polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
193
- axis(2, fg="grey60", col.axis="grey60")
194
- mtext("Mean silhouette", side=2, line=3, col="grey60")
208
+ par(mar = c(4,5,1,5)+0.1)
209
+ plot(
210
+ 1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
211
+ ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
212
+ )
213
+ polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
214
+ axis(2, fg = "grey60", col.axis = "grey60")
215
+ mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
216
+
195
217
  # ns
196
- par(new=TRUE)
197
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
198
- ylim=range(ns), bty="n", xaxs="i")
199
- points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
200
- axis(4, fg="darkred", col.axis="darkred")
201
- mtext("Negative silhouette area", side=4, line=3, col="darkred")
218
+ par(new = TRUE)
219
+ plot(
220
+ 1, t = "n", bty = "n",
221
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
222
+ xlim = range(c(0,k)), ylim = range(ns)
223
+ )
224
+ points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
225
+ axis(4, fg = "darkred", col.axis = "darkred")
226
+ mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
227
+
202
228
  # ds
203
- par(new=TRUE)
204
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
205
- ylim=range(ds), bty="n", xaxs="i")
229
+ par(new = TRUE)
230
+ plot(
231
+ 1, t = "n", bty = "n",
232
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
233
+ xlim = range(c(0,k)), ylim = range(ds)
234
+ )
206
235
  lines(k, ds)
207
- abline(v=top.n, lty=2)
236
+ abline(v = top.n, lty = 2)
208
237
  }
209
238
 
210
- plot_distances <- function(dist) {
211
- par(mar=c(5,4,1,2)+0.1)
212
- hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
239
+ plot_distances <- function (dist) {
240
+ par(mar = c(5,4,1,2) + 0.1)
241
+ hist(
242
+ dist, border = NA, col = "grey60", breaks = 50,
243
+ xlab = "Distances", main = ""
244
+ )
213
245
  }
214
246
 
215
- plot_clustering <- function(cl, dist, types) {
216
- par(mar=c(5,4,4,2)+0.1)
247
+ plot_clustering <- function (cl, dist, types) {
248
+ par(mar = c(5,4,4,2) + 0.1)
217
249
  top.n <- length(cl$medoids)
218
250
  col <- ggplotColours(top.n)
219
- plot(silhouette(cl), col=col)
220
- if(length(labels(dist))<=15){
221
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
222
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
223
- }else{
224
- ani.mds <- cmdscale(dist, k=4)
225
- if(ncol(ani.mds)==4){
226
- plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
227
- xlab='Component 1', ylab='Component 2')
228
- plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
229
- xlab='Component 3', ylab='Component 4')
251
+ plot(silhouette(cl), col = col)
252
+ if (length(labels(dist)) <= 15) {
253
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
254
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
255
+ } else {
256
+ ani.mds <- cmdscale(dist, k = 4)
257
+ if (ncol(ani.mds) == 4) {
258
+ plot(
259
+ ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
260
+ xlab = "Component 1", ylab = "Component 2"
261
+ )
262
+ plot(
263
+ ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
264
+ xlab = "Component 3", ylab="Component 4"
265
+ )
230
266
  }else{
231
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
232
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
267
+ for (i in 1:2)
268
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
233
269
  }
234
270
  }
235
271
  }
236
272
 
237
- plot_tree <- function(phy, types, medoids){
273
+ plot_tree <- function (phy, types, medoids) {
238
274
  layout(1)
239
275
  top.n <- length(unique(types))
240
276
  col <- ggplotColours(top.n)
241
277
  is.medoid <- phy$tip.label %in% medoids
242
- phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
243
- " [", types[phy$tip.label[is.medoid]], "]", sep='')
244
- plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
245
- font=ifelse(is.medoid, 2, 1),
246
- tip.color=col[types[phy$tip.label]])
278
+ phy$tip.label[is.medoid] <- paste(
279
+ phy$tip.label[is.medoid],
280
+ " [", types[phy$tip.label[is.medoid]], "]",
281
+ sep = ""
282
+ )
283
+ plot(
284
+ phy, cex = ifelse(is.medoid, 1/3, 1/6),
285
+ font = ifelse(is.medoid, 2, 1),
286
+ tip.color = col[types[phy$tip.label]]
287
+ )
247
288
  }
248
289
 
249
- ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
250
- if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
251
- hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
290
+ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
291
+ if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
292
+ hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
293
+ }
294
+
295
+ ani_distance <- function (ani_file, sel) {
296
+ # Try to locate rds, otherwise read gzipped table
297
+ rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
298
+ if (file.exists(rds)) {
299
+ sim <- readRDS(rds)
300
+ } else {
301
+ sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
302
+ }
303
+
304
+ # If there is no data, end the process
305
+ if (nrow(sim) == 0) return(NULL)
306
+
307
+ # Apply filter (if requested)
308
+ if (!is.na(sel) && file.exists(sel)) {
309
+ say("Filter selection")
310
+ lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
311
+ sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
312
+ }
313
+
314
+ # Transform to distances
315
+ say("Distances")
316
+ sim$d <- 1 - (sim$value / 100)
317
+ return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
252
318
  }
253
319
 
254
320
  #= Main
255
321
  options(warn = 1)
256
- if(length(argv) >= 5 & argv[5] == 'empty'){
322
+ if (length(argv) >= 5 & argv[5] == "empty") {
257
323
  generate_empty_files(argv[2])
258
- write.table(NULL, paste(argv[2], "medoids", sep="."))
259
- write.table(NULL, paste(argv[2], "classif", sep="."))
260
- write.table(date(), paste(argv[2], "ready", sep="."))
324
+ write.table(NULL, paste(argv[2], "medoids", sep = "."))
325
+ write.table(NULL, paste(argv[2], "classif", sep = "."))
326
+ write.table(date(), paste(argv[2], "ready", sep = "."))
261
327
  }else{
262
- subclades(ani_file = argv[1], out_base = argv[2],
263
- thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
328
+ subclades(
329
+ ani_file = argv[1],
330
+ out_base = argv[2],
331
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
332
+ sel = argv[4]
333
+ )
264
334
  }
335
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2.4
4
+ version: 1.1.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-16 00:00:00.000000000 Z
11
+ date: 2021-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons