miga-base 1.1.2.4 → 1.1.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: beb13a42d514a4cf731b3b042096728cd6d615124f1412046ad16fb8de2b709e
4
- data.tar.gz: 64e32095ac59863d4005e602e55a1396530aac12da40fe88316985c95a90eb12
3
+ metadata.gz: 683a93a244b106eb24f94cff47695b28cc051701ecd4ea83b289a6650fdf3985
4
+ data.tar.gz: 6ce192df7a66502535a5c24406d9d3f1d0a1d51c11f3fb6e08f353547661c053
5
5
  SHA512:
6
- metadata.gz: d8f813d3b32cfdfb86da746e63c8e7a3a5c142822dc44a7c2588219368bed31851a77cd05ce30ee2d1dc02d930beaab836cda58a122cc5955d6ec0235b2edb42
7
- data.tar.gz: 81d773c3d4078ec3fd27d389bb1159aa9585dafdd2efdb5a917471ec784d9ebf24e67f2fba5358a4c4c589d03a12ab9f3e1b120e83390e0f0354a059a0cbb568
6
+ metadata.gz: cd07ba5ea8d3088ba9e7752c13bcdc07a0cf73175687c021e93c6df9361abd6dab361ffaed562f421f551999deee3de4f32d388f96f050ff45750d22bbe3eaf3
7
+ data.tar.gz: 18ba878127bc0abb7aed7b63398091cc2ed0fbee0d8629850a25d26ab9690f4489a8aca4fa8a3e2f2a8d4e80bdf1fd065511c23774327e988fac9ff0df645041
@@ -59,13 +59,14 @@ module MiGA::Cli::Action::Doctor::Base
59
59
  next if (lineno += 1) == 1
60
60
 
61
61
  r = ln.split("\t")
62
- next unless [1, 2].map { |i| p.dataset(r[i]).nil? }.any?
62
+ names = [r[0], r[1]]
63
+ next unless names.any? { |i| p.dataset(i).nil? }
63
64
 
64
- [1, 2].each do |i|
65
- if p.dataset(r[i]).nil? || !p.dataset(r[i]).active?
66
- notok[r[i]] = true
65
+ names.each do |i|
66
+ if p.dataset(i).nil? || !p.dataset(i).active?
67
+ notok[i] = true
67
68
  else
68
- fix[r[i]] = true
69
+ fix[i] = true
69
70
  end
70
71
  end
71
72
  end
@@ -55,10 +55,11 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .txt])
58
+ return nil unless result_files_exist?(base, %w[.rds .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
- r.add_file(:rdata, 'miga-project.Rdata')
61
+ r.add_file(:rds, 'miga-project.rds')
62
+ r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
62
63
  r.add_file(:matrix, 'miga-project.txt')
63
64
  r.add_file(:log, 'miga-project.log') # Legacy file
64
65
  r.add_file(:hist, 'miga-project.hist')
@@ -82,12 +83,13 @@ module MiGA::Project::Result
82
83
  end
83
84
 
84
85
  r = add_result_iter_clades(base)
85
- r.add_file(:aai_tree, 'miga-project.aai.nwk')
86
- r.add_file(:proposal, 'miga-project.proposed-clades')
87
- r.add_file(:clades_aai90, 'miga-project.aai90-clades')
88
- r.add_file(:clades_ani95, 'miga-project.ani95-clades')
89
- r.add_file(:clades_gsp, 'miga-project.gsp-clades')
90
- r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
86
+ r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
87
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
88
+ r.add_file(:proposal, 'miga-project.proposed-clades')
89
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
90
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
91
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
92
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
91
93
  r
92
94
  end
93
95
 
@@ -105,6 +107,7 @@ module MiGA::Project::Result
105
107
 
106
108
  r = add_result_iter_clades(base)
107
109
  r.add_file(:ani_tree, 'miga-project.ani.nwk')
110
+ r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
108
111
  r
109
112
  end
110
113
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.1, 2, 4].freeze
15
+ VERSION = [1.1, 3, 3].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2021, 11, 16)
23
+ VERSION_DATE = Date.new(2021, 11, 29)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(aai, file = 'miga-project.Rdata')
28
- if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
- h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(aai, file = "miga-project.rds")
28
+ if(sum(aai[, "a"] != aai[, "b"]) > 0) {
29
+ h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
22
22
 
23
23
  # R-ify
24
24
  cat <<R | R --vanilla
25
- file <- gzfile('miga-project.txt.gz')
26
- ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
- save(ani, file = 'miga-project.Rdata')
28
- if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
- h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
- len <- length(h[['breaks']])
25
+ file <- gzfile("miga-project.txt.gz")
26
+ ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
27
+ saveRDS(ani, file = "miga-project.rds")
28
+ if(sum(ani[, "a"] != ani[, "b"]) > 0) {
29
+ h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
30
+ len <- length(h[["breaks"]])
31
31
  write.table(
32
- cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
- file = 'miga-project.hist', quote = FALSE, sep = '\t',
32
+ cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
33
+ file = "miga-project.hist", quote = FALSE, sep = "\t",
34
34
  col.names = FALSE, row.names = FALSE
35
35
  )
36
36
  }
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
15
15
  # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
17
  echo -n "" > miga-project.txt
18
- echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
18
+ echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
19
19
 
20
20
  # Finalize
21
21
  miga_end_project_step "$DIR"
data/test/project_test.rb CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
82
82
  def test_add_result
83
83
  p1 = project
84
84
  assert_nil(p1.add_result(:doom))
85
- %w[.Rdata .log .txt .done].each do |x|
85
+ %w[.rds .log .txt .done].each do |x|
86
86
  assert_nil(p1.add_result(:haai_distances))
87
87
  FileUtils.touch(
88
88
  File.join(
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
117
117
  # Project tasks
118
118
  expected_files = {
119
119
  project_stats: %w[.taxonomy.json .metadata.db],
120
- haai_distances: %w[.Rdata .log .txt],
121
- aai_distances: %w[.Rdata .log .txt],
122
- ani_distances: %w[.Rdata .log .txt],
123
- clade_finding: %w[.pdf .classif .medoids
124
- .class.tsv .class.nwk .proposed-clades],
120
+ haai_distances: %w[.rds .log .txt],
121
+ aai_distances: %w[.rds .log .txt],
122
+ ani_distances: %w[.rds .log .txt],
123
+ clade_finding: %w[
124
+ .pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
125
+ ],
125
126
  subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
126
127
  ogs: %w[.ogs .stats]
127
128
  }
@@ -2702,6 +2702,8 @@ def merge_db_opts():
2702
2702
 
2703
2703
  parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
2704
2704
 
2705
+ parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')
2706
+
2705
2707
  parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
2706
2708
 
2707
2709
  parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
@@ -2720,16 +2722,23 @@ def merge_db_thread_starter(rev_index, per_db_accs):
2720
2722
 
2721
2723
 
2722
2724
 
2723
- def merge_db(recipient, donors, verbose, threads):
2725
+ def merge_db(recipient, donors, donor_file, verbose, threads):
2724
2726
  #Prettier on the CLI
2725
-
2727
+
2728
+ if donor_file is not None:
2729
+ fh = agnostic_reader(donor_file)
2730
+ donors = [line.strip() for line in fh]
2731
+ fh.close()
2732
+
2726
2733
  if donors is None or recipient is None:
2727
2734
  print("Either donor or target not given. FastAAI is exiting.")
2728
2735
  return None
2729
2736
 
2730
2737
  print("")
2731
2738
 
2732
- donors = donors.split(",")
2739
+ if donor_file is None:
2740
+ donors = donors.split(",")
2741
+
2733
2742
  valid_donors = []
2734
2743
  for d in donors:
2735
2744
  if os.path.exists(d):
@@ -3454,10 +3463,11 @@ def main():
3454
3463
 
3455
3464
  recipient = opts.recipient
3456
3465
  donors = opts.donors
3466
+ donor_file = opts.donor_file
3457
3467
  verbose = opts.verbose
3458
3468
  threads = opts.threads
3459
3469
 
3460
- merge_db(recipient, donors, verbose, threads)
3470
+ merge_db(recipient, donors, donor_file, verbose, threads)
3461
3471
 
3462
3472
  #################### Query files vs DB ########################
3463
3473
 
@@ -151,22 +151,36 @@ module MiGA::DistanceRunner::Commands
151
151
  donors << tgt_idx if tgt_idx
152
152
  end
153
153
  return nil if donors.empty?
154
- run_cmd <<~CMD
155
- FastAAI merge_db --donors "#{donors.join(',')}" \
156
- --recipient "#{f1 = tmp_file}" --threads #{opts[:thr]}
157
- CMD
154
+
155
+ # Build target database
156
+ f1 = tmp_file
157
+ if donors.size == 1
158
+ File.copy(donors.first, f1)
159
+ else
160
+ File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
161
+ run_cmd(
162
+ <<~CMD
163
+ FastAAI merge_db --threads #{opts[:thr]} \
164
+ --donor_file "#{f0}" --recipient "#{f1}"
165
+ CMD
166
+ )
167
+ raise "Cannot merge databases into: #{f1}" unless File.size?(f1)
168
+ end
158
169
 
159
170
  # Run FastAAI
160
- run_cmd <<~CMD
161
- FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
162
- --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
163
- --do_stdev
164
- CMD
171
+ run_cmd(
172
+ <<~CMD
173
+ FastAAI db_query --query "#{qry_idx}" --target "#{f1}" \
174
+ --output "#{f2 = tmp_file}" --threads #{opts[:thr]} \
175
+ --do_stdev
176
+ CMD
177
+ )
178
+ raise "Cannot find FastAAI output directory: #{f2}" unless Dir.exist?(f2)
165
179
 
166
180
  # Save values in the databases
167
181
  haai_data = {}
168
182
  aai_data = {}
169
- # Ugly workaround to the insistence of FastAAI to not provide the files
183
+ # Ugly workaround to the insistence of FastAAI not to provide the files
170
184
  # I ask for ;-)
171
185
  qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
172
186
  out_file = File.join(f2, 'results', qry_results)
@@ -214,6 +228,6 @@ module MiGA::DistanceRunner::Commands
214
228
 
215
229
  def run_cmd(cmd)
216
230
  puts "CMD: #{cmd}"
217
- `#{cmd}`
231
+ puts `#{cmd} 2>&1`
218
232
  end
219
233
  end
@@ -127,6 +127,7 @@ module MiGA::DistanceRunner::Database
127
127
  db = tmp_dbs[metric]
128
128
  table = metric == :haai ? :aai : metric
129
129
  SQLite3::Database.new(db) do |conn|
130
+ conn.execute('BEGIN TRANSACTION')
130
131
  data.each do |k, v|
131
132
  sql = <<~SQL
132
133
  insert into #{table} (
@@ -135,6 +136,7 @@ module MiGA::DistanceRunner::Database
135
136
  SQL
136
137
  conn.execute(sql, [dataset.name, k] + v)
137
138
  end
139
+ conn.execute('COMMIT')
138
140
  end
139
141
  checkpoint(metric)
140
142
  end
data/utils/find-medoid.R CHANGED
@@ -5,26 +5,28 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly = T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
- if(Sys.getenv('MIGA') == ''){
10
+ if(Sys.getenv("MIGA") == ""){
11
11
  suppressPackageStartupMessages(library(enveomics.R))
12
12
  }else{
13
- source(file.path(Sys.getenv('MIGA'),
14
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
13
+ source(file.path(
14
+ Sys.getenv("MIGA"),
15
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
16
+ ))
15
17
  }
16
18
 
17
- find_medoids <- function(ani.df, out, clades) {
19
+ find_medoids <- function (ani.df, out, clades) {
18
20
  if(nrow(ani.df) == 0) return(NULL)
19
21
  ani.df$d <- 1 - (ani.df$value/100)
20
- dist <- enve.df2dist(ani.df, 'a', 'b', 'd', default.d = max(ani.df$d)*1.2)
22
+ dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
21
23
  dist <- as.matrix(dist)
22
- cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
24
+ cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
23
25
  cl.s <- c()
24
26
  medoids <- c()
25
27
  for(i in cl){
26
- lab <- strsplit(i, ',')[[1]]
27
- cat('Clade of:', lab[1], '\n')
28
+ lab <- strsplit(i, ",")[[1]]
29
+ cat("Clade of:", lab[1], "\n")
28
30
  if(length(lab) == 1) {
29
31
  lab.s <- lab
30
32
  } else {
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
32
34
  }
33
35
  med <- lab.s[1]
34
36
  medoids <- c(medoids, med)
35
- cl.s <- c(cl.s, paste(lab.s, collapse = ','))
37
+ cl.s <- c(cl.s, paste(lab.s, collapse = ","))
36
38
  }
37
39
  write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
38
- write.table(cl.s, paste(clades, '.sorted', sep = ''), quote = FALSE,
39
- row.names = FALSE, col.names = FALSE)
40
+ write.table(
41
+ cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
42
+ row.names = FALSE, col.names = FALSE
43
+ )
40
44
  }
41
45
 
42
46
  #= Main
43
- load(argv[1])
44
- if(! exists('ani')) ani <- aai
47
+ cat("Finding Medoids")
48
+ ani <- readRDS(argv[1])
45
49
  find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
46
50
 
@@ -7,9 +7,12 @@ module MiGA::SubcladeRunner::Pipeline
7
7
  aai90: [:aai_distances, opts[:gsp_aai], :aai]
8
8
  }
9
9
  tasks.each do |k, par|
10
+ # Run only the requested metric
11
+ next unless par[2].to_s == opts[:gsp_metric]
12
+
10
13
  # Final output
11
14
  ogs_file = "miga-project.#{k}-clades"
12
- next if File.size? ogs_file
15
+ next if File.size?(ogs_file)
13
16
 
14
17
  # Build ABC files
15
18
  abc_path = tmp_file("#{k}.abc")
@@ -20,7 +23,7 @@ module MiGA::SubcladeRunner::Pipeline
20
23
  next if ln =~ /^a\tb\tvalue\t/
21
24
 
22
25
  r = ln.chomp.split("\t")
23
- ofh.puts "G>#{r[0]}\tG>#{r[1]}\t#{r[2]}" if r[2].to_f >= par[1]
26
+ ofh.puts("G>#{r[0]}\tG>#{r[1]}\t#{r[2]}") if r[2].to_f >= par[1]
24
27
  end
25
28
  end
26
29
  ofh.close
@@ -29,22 +32,20 @@ module MiGA::SubcladeRunner::Pipeline
29
32
  `ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
30
33
  File.open(ogs_file, 'w') do |fh|
31
34
  File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
32
- fh.puts ln if lno > 0
35
+ fh.puts(ln) if lno > 0
33
36
  end
34
37
  end
35
38
  File.unlink "#{ogs_file}.tmp"
36
39
  else
37
- FileUtils.touch ogs_file
38
- end
39
- if par[2].to_s == opts[:gsp_metric]
40
- FileUtils.cp(ogs_file, "miga-project.gsp-clades")
40
+ FileUtils.touch(ogs_file)
41
41
  end
42
+ FileUtils.cp(ogs_file, 'miga-project.gsp-clades')
42
43
  end
43
44
 
44
45
  # Find genomospecies medoids
45
46
  src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
46
47
  dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
47
- `Rscript '#{src}' ../../09.distances/#{dir}/miga-project.Rdata \
48
+ `Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
48
49
  miga-project.gsp-medoids miga-project.gsp-clades`
49
50
  if File.exist? 'miga-project.gsp-clades.sorted'
50
51
  File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
@@ -54,8 +55,6 @@ module MiGA::SubcladeRunner::Pipeline
54
55
  ofh = File.open('miga-project.proposed-clades', 'w')
55
56
  File.open('miga-project.gsp-clades', 'r') do |ifh|
56
57
  ifh.each_line do |ln|
57
- next if $. == 1
58
-
59
58
  r = ln.chomp.split(',')
60
59
  ofh.puts r.join("\t") if r.size >= 5
61
60
  end
@@ -69,7 +68,7 @@ module MiGA::SubcladeRunner::Pipeline
69
68
  metric_res = project.result(step) or raise "Incomplete step #{step}"
70
69
  matrix = metric_res.file_path(:matrix)
71
70
  `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
72
- miga-project.ani95-medoids '#{opts[:run_clades] ? 'cluster' : 'empty'}'`
71
+ miga-project.gsp-medoids '#{opts[:run_clades] ? 'cluster' : 'empty'}'`
73
72
  if File.exist? 'miga-project.nwk'
74
73
  File.rename('miga-project.nwk', "miga-project.#{metric}.nwk")
75
74
  end
data/utils/subclades.R CHANGED
@@ -10,56 +10,55 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- if(Sys.getenv('MIGA') == ''){
13
+ if(Sys.getenv("MIGA") == ""){
14
14
  suppressPackageStartupMessages(library(enveomics.R))
15
15
  }else{
16
- source(file.path(Sys.getenv('MIGA'),
17
- 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
16
+ source(file.path(
17
+ Sys.getenv("MIGA"),
18
+ "utils", "enveomics", "enveomics.R", "R", "df2dist.R"
19
+ ))
18
20
  }
19
21
 
20
22
  #= Main function
21
23
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
22
- say('==> Out base:', out_base, '<==')
24
+ say("==> Out base:", out_base, "<==")
23
25
 
24
26
  # Normalize input matrix
25
- dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
26
- if(!missing(ani_file)){
27
- if(length(ani.d) == 0 && !file.exists(dist_rdata)){
28
- # Read from ani_file
29
- a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
30
- if(nrow(a) == 0){
31
- generate_empty_files(out_base)
32
- return(NULL)
33
- }
34
- if(!is.na(sel) && file.exists(sel)){
35
- say('Filter selection')
36
- lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
37
- a <- a[a$a %in% lab & a$b %in% lab, ]
27
+ dist_rds <- paste(out_base, "dist.rds", sep = ".")
28
+ if (!missing(ani_file)) {
29
+ if (length(ani.d) == 0) {
30
+ if (file.exists(dist_rds)) {
31
+ ani.d <- readRDS(dist_rds)
32
+ } else {
33
+ # Read from ani_file
34
+ ani.d <- ani_distance(ani_file, sel)
35
+ if (is.null(ani.d)) {
36
+ generate_empty_files(out_base)
37
+ return(NULL)
38
+ } else {
39
+ saveRDS(ani.d, dist_rds)
40
+ }
38
41
  }
39
- say('Distances')
40
- a$d <- 1 - (a$value/100)
41
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
42
- save(ani.d, file = dist_rdata)
43
42
  }
44
43
  }
45
44
 
46
45
  # Read result if the subclade is ready, run it otherwise
47
- if(file.exists(paste(out_base, 'classif', sep = '.'))){
46
+ if (file.exists(paste(out_base, "classif", sep = "."))) {
48
47
  say("Loading")
49
48
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
50
- sep = ' ', as.is = TRUE)[,1]
49
+ sep = " ", as.is = TRUE)[,1]
51
50
  a <- read.table(paste(out_base, "classif", sep="."),
52
- sep = '\t', as.is = TRUE)
51
+ sep = "\t", as.is = TRUE)
53
52
  ani.types <- a[,2]
54
53
  names(ani.types) <- a[,1]
55
- if(length(ani.d) == 0) load(dist_rdata)
56
- }else if(length(labels(ani.d)) > 8L){
57
- res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
58
- if(length(res) == 0) return(NULL)
59
- ani.medoids <- res[['ani.medoids']]
60
- ani.types <- res[['ani.types']]
61
- ani.d <- res[['ani.d']]
62
- }else{
54
+ if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
55
+ } else if (length(labels(ani.d)) > 8L) {
56
+ res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
57
+ if (length(res) == 0) return(NULL)
58
+ ani.medoids <- res[["ani.medoids"]]
59
+ ani.types <- res[["ani.types"]]
60
+ ani.d <- res[["ani.d"]]
61
+ } else {
63
62
  ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
64
63
  ani.types <- rep(1, length(labels(ani.d)))
65
64
  names(ani.types) <- labels(ani.d)
@@ -69,66 +68,69 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
69
68
 
70
69
  # Recursive search
71
70
  say("Recursive search")
72
- for(i in 1:length(ani.medoids)){
71
+ for (i in 1:length(ani.medoids)) {
73
72
  medoid <- ani.medoids[i]
74
73
  ds_f <- names(ani.types)[ ani.types==i ]
75
74
  say("Analyzing subclade", i, "with medoid:", medoid)
76
75
  dir_f <- paste(out_base, ".sc-", i, sep="")
77
- if(!dir.exists(dir_f)) dir.create(dir_f)
76
+ if (!dir.exists(dir_f)) dir.create(dir_f)
78
77
  write.table(ds_f,
79
78
  paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
80
79
  quote=FALSE, col.names=FALSE, row.names=FALSE)
81
- if(length(ds_f) > 8L){
80
+ if (length(ds_f) > 8L) {
82
81
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
83
- subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
- thr=thr, ani.d=ani_subset)
82
+ subclades(
83
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
84
+ thr = thr,
85
+ ani.d = ani_subset
86
+ )
85
87
  }
86
88
  }
87
89
 
88
90
  # Declare recursion up-to-here complete
89
- write.table(date(), paste(out_base, 'ready', sep='.'),
90
- quote=FALSE, row.names=FALSE, col.names=FALSE)
91
+ write.table(
92
+ date(), paste(out_base, "ready", sep = "."),
93
+ quote = FALSE, row.names = FALSE, col.names = FALSE
94
+ )
91
95
  }
92
96
 
93
97
  #= Heavy-lifter
94
- subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
98
+ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
95
99
  # Get ANI distances
96
- if(length(ani.d) > 0){
97
- # Just use ani.d (and save in dist_rdata_
98
- save(ani.d, file=dist_rdata)
99
- }else if(file.exists(dist_rdata)){
100
- # Read from dist_rdata
101
- load(dist_rdata)
102
- }else{
100
+ if (length(ani.d) > 0) {
101
+ # Just use ani.d (and save in dist_rds)
102
+ if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
103
+ } else if (file.exists(dist_rds)) {
104
+ # Read from dist_rds
105
+ ani.d <- readRDS(dist_rds)
106
+ } else {
103
107
  stop("Cannot find input matrix", out_base)
104
108
  }
105
- if(length(labels(ani.d)) <= 8L) return(list())
109
+ if (length(labels(ani.d)) <= 8L) return(list())
106
110
 
107
- # Build tree
108
- say("Tree")
109
- ani.ph <- bionj(ani.d)
110
- express.ori <- options('expressions')$expressions
111
- if(express.ori < ani.ph$Nnode*4){
112
- options(expressions=min(c(5e7,ani.ph$Nnode*4)))
113
- }
114
- write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
115
- options(expressions=express.ori)
116
-
117
111
  # Silhouette
118
112
  say("Silhouette")
119
113
  nn <- length(labels(ani.d))
120
114
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
115
+ say("- Make cluster")
121
116
  cl <- makeCluster(thr)
122
- s <- parSapply(cl, k, function(x) {
117
+ say("- Launch parallel jobs")
118
+ s <- parSapply(
119
+ cl, k,
120
+ function(x) {
123
121
  library(cluster)
124
- s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
125
- c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
126
- })
122
+ s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
123
+ c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
124
+ }
125
+ )
126
+ say("- Stop cluster")
127
127
  stopCluster(cl)
128
- s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
129
- s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
130
- ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
131
- if(mean(s[1,]<0)<0.75) ds[s[1,]<0] <- mean(ds) # <- k's with negative average
128
+ say("- Calculate custom criteria")
129
+ s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
130
+ s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
131
+ ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
132
+ if(mean(s[1,] < 0) < 0.75)
133
+ ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
132
134
  top.n <- k[which.max(ds)]
133
135
 
134
136
  # Classify genomes
@@ -137,10 +139,21 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
137
139
  ani.types <- ani.cl$clustering
138
140
  ani.medoids <- ani.cl$medoids
139
141
 
142
+ # Build tree
143
+ say("Tree")
144
+ ani.ph <- bionj(ani.d)
145
+ say("- Write")
146
+ express.ori <- options("expressions")$expressions
147
+ if(express.ori < ani.ph$Nnode * 4){
148
+ options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
149
+ }
150
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
151
+ options(expressions=express.ori)
152
+
140
153
  # Generate graphic report
141
154
  say("Graphic report")
142
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
143
- layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
155
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
156
+ layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
144
157
  plot_distances(ani.d)
145
158
  plot_silhouette(k, s[1,], s[2,], ds, top.n)
146
159
  plot_clustering(ani.cl, ani.d, ani.types)
@@ -153,112 +166,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
153
166
  # Return data
154
167
  say("Cluster ready")
155
168
  return(list(
156
- ani.medoids=ani.medoids,
157
- ani.types=ani.types,
158
- ani.d=ani.d
169
+ ani.medoids = ani.medoids,
170
+ ani.types = ani.types,
171
+ ani.d = ani.d
159
172
  ))
160
173
  }
161
174
 
162
175
  #= Helper functions
163
- say <- function(...) { message(paste("[",date(),"]",...,"\n"),appendLF=FALSE) }
176
+ say <- function (...) {
177
+ message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
178
+ }
164
179
 
165
- generate_empty_files <- function(out_base) {
166
- pdf(paste(out_base, ".pdf", sep=""), 7, 12)
167
- plot(1, t="n", axes=F)
168
- legend("center", "No data", bty="n")
180
+ generate_empty_files <- function (out_base) {
181
+ pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
182
+ plot(1, t = "n", axes = F)
183
+ legend("center", "No data", bty = "n")
169
184
  dev.off()
170
- file.create(paste(out_base,".1.classif",sep=""))
171
- file.create(paste(out_base,".1.medoids",sep=""))
185
+ file.create(paste(out_base, ".1.classif", sep = ""))
186
+ file.create(paste(out_base, ".1.medoids", sep = ""))
172
187
  }
173
188
 
174
- write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
175
- say('Text report')
176
- write.table(ani.medoids, paste(out_base, 'medoids', sep='.'),
177
- quote=FALSE, col.names=FALSE, row.names=FALSE)
178
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
179
- ani.d.m <- 100 - as.matrix(ani.d)*100
180
- for(j in 1:nrow(classif)){
189
+ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
190
+ say("Text report")
191
+ write.table(
192
+ ani.medoids, paste(out_base, "medoids", sep = "."),
193
+ quote = FALSE, col.names = FALSE, row.names = FALSE
194
+ )
195
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
196
+ ani.d.m <- 100 - as.matrix(ani.d) * 100
197
+ for (j in 1:nrow(classif)) {
181
198
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
182
199
  }
183
- write.table(classif, paste(out_base,"classif",sep="."),
184
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
200
+ write.table(
201
+ classif, paste(out_base, "classif", sep="."),
202
+ quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
203
+ )
185
204
  }
186
205
 
187
- plot_silhouette <- function(k, s, ns, ds, top.n) {
206
+ plot_silhouette <- function (k, s, ns, ds, top.n) {
188
207
  # s
189
- par(mar=c(4,5,1,5)+0.1)
190
- plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
191
- ylim=range(s), bty="n", xaxs="i", yaxt="n")
192
- polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
193
- axis(2, fg="grey60", col.axis="grey60")
194
- mtext("Mean silhouette", side=2, line=3, col="grey60")
208
+ par(mar = c(4,5,1,5)+0.1)
209
+ plot(
210
+ 1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
211
+ ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
212
+ )
213
+ polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
214
+ axis(2, fg = "grey60", col.axis = "grey60")
215
+ mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
216
+
195
217
  # ns
196
- par(new=TRUE)
197
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
198
- ylim=range(ns), bty="n", xaxs="i")
199
- points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
200
- axis(4, fg="darkred", col.axis="darkred")
201
- mtext("Negative silhouette area", side=4, line=3, col="darkred")
218
+ par(new = TRUE)
219
+ plot(
220
+ 1, t = "n", bty = "n",
221
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
222
+ xlim = range(c(0,k)), ylim = range(ns)
223
+ )
224
+ points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
225
+ axis(4, fg = "darkred", col.axis = "darkred")
226
+ mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
227
+
202
228
  # ds
203
- par(new=TRUE)
204
- plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
205
- ylim=range(ds), bty="n", xaxs="i")
229
+ par(new = TRUE)
230
+ plot(
231
+ 1, t = "n", bty = "n",
232
+ xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
233
+ xlim = range(c(0,k)), ylim = range(ds)
234
+ )
206
235
  lines(k, ds)
207
- abline(v=top.n, lty=2)
236
+ abline(v = top.n, lty = 2)
208
237
  }
209
238
 
210
- plot_distances <- function(dist) {
211
- par(mar=c(5,4,1,2)+0.1)
212
- hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
239
+ plot_distances <- function (dist) {
240
+ par(mar = c(5,4,1,2) + 0.1)
241
+ hist(
242
+ dist, border = NA, col = "grey60", breaks = 50,
243
+ xlab = "Distances", main = ""
244
+ )
213
245
  }
214
246
 
215
- plot_clustering <- function(cl, dist, types) {
216
- par(mar=c(5,4,4,2)+0.1)
247
+ plot_clustering <- function (cl, dist, types) {
248
+ par(mar = c(5,4,4,2) + 0.1)
217
249
  top.n <- length(cl$medoids)
218
250
  col <- ggplotColours(top.n)
219
- plot(silhouette(cl), col=col)
220
- if(length(labels(dist))<=15){
221
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
222
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
223
- }else{
224
- ani.mds <- cmdscale(dist, k=4)
225
- if(ncol(ani.mds)==4){
226
- plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
227
- xlab='Component 1', ylab='Component 2')
228
- plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
229
- xlab='Component 3', ylab='Component 4')
251
+ plot(silhouette(cl), col = col)
252
+ if (length(labels(dist)) <= 15) {
253
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
254
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
255
+ } else {
256
+ ani.mds <- cmdscale(dist, k = 4)
257
+ if (ncol(ani.mds) == 4) {
258
+ plot(
259
+ ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
260
+ xlab = "Component 1", ylab = "Component 2"
261
+ )
262
+ plot(
263
+ ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
264
+ xlab = "Component 3", ylab="Component 4"
265
+ )
230
266
  }else{
231
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
232
- plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
267
+ for (i in 1:2)
268
+ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
233
269
  }
234
270
  }
235
271
  }
236
272
 
237
- plot_tree <- function(phy, types, medoids){
273
+ plot_tree <- function (phy, types, medoids) {
238
274
  layout(1)
239
275
  top.n <- length(unique(types))
240
276
  col <- ggplotColours(top.n)
241
277
  is.medoid <- phy$tip.label %in% medoids
242
- phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
243
- " [", types[phy$tip.label[is.medoid]], "]", sep='')
244
- plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
245
- font=ifelse(is.medoid, 2, 1),
246
- tip.color=col[types[phy$tip.label]])
278
+ phy$tip.label[is.medoid] <- paste(
279
+ phy$tip.label[is.medoid],
280
+ " [", types[phy$tip.label[is.medoid]], "]",
281
+ sep = ""
282
+ )
283
+ plot(
284
+ phy, cex = ifelse(is.medoid, 1/3, 1/6),
285
+ font = ifelse(is.medoid, 2, 1),
286
+ tip.color = col[types[phy$tip.label]]
287
+ )
247
288
  }
248
289
 
249
- ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
250
- if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
251
- hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
290
+ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
291
+ if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
292
+ hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
293
+ }
294
+
295
+ ani_distance <- function (ani_file, sel) {
296
+ # Try to locate rds, otherwise read gzipped table
297
+ rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
298
+ if (file.exists(rds)) {
299
+ sim <- readRDS(rds)
300
+ } else {
301
+ sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
302
+ }
303
+
304
+ # If there is not data end process
305
+ if (nrow(sim) == 0) return(NULL)
306
+
307
+ # Apply filter (if requested)
308
+ if (!is.na(sel) && file.exists(sel)) {
309
+ say("Filter selection")
310
+ lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
311
+ sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
312
+ }
313
+
314
+ # Transform to distances
315
+ say("Distances")
316
+ sim$d <- 1 - (sim$value / 100)
317
+ return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
252
318
  }
253
319
 
254
320
  #= Main
255
321
  options(warn = 1)
256
- if(length(argv) >= 5 & argv[5] == 'empty'){
322
+ if (length(argv) >= 5 & argv[5] == "empty") {
257
323
  generate_empty_files(argv[2])
258
- write.table(NULL, paste(argv[2], "medoids", sep="."))
259
- write.table(NULL, paste(argv[2], "classif", sep="."))
260
- write.table(date(), paste(argv[2], "ready", sep="."))
324
+ write.table(NULL, paste(argv[2], "medoids", sep = "."))
325
+ write.table(NULL, paste(argv[2], "classif", sep = "."))
326
+ write.table(date(), paste(argv[2], "ready", sep = "."))
261
327
  }else{
262
- subclades(ani_file = argv[1], out_base = argv[2],
263
- thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
328
+ subclades(
329
+ ani_file = argv[1],
330
+ out_base = argv[2],
331
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
332
+ sel = argv[4]
333
+ )
264
334
  }
335
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2.4
4
+ version: 1.1.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-16 00:00:00.000000000 Z
11
+ date: 2021-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons