miga-base 0.3.5.1 → 0.3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 25f6a339288dbdbeda1f84f5da6b5de697d3790c0b38d138a4416ff762cae936
4
- data.tar.gz: 60187223750e983fafd6088935a912d016bbf4acafaf79b548e206c50076cc04
2
+ SHA1:
3
+ metadata.gz: 8b285b9906876a9f1b5366f929a4776d1689dbc1
4
+ data.tar.gz: 83b6843d00417fef4a8de18e4a102ad4d1899f0d
5
5
  SHA512:
6
- metadata.gz: 60b59ccc8fc3bf9f5a584f3221c268839ba4cd41eb3a6c16911f1808944c20100758f74908797f13ada9d2475e4fda382c1e204b257cf6c6adcfd17db694efcf
7
- data.tar.gz: e62de30119c51dc4e92e64aec954c263f84a4820f1541f014c49d93dadd8de1efb5d331c35ecbda6ccfa79c402393894b7b6ffd52edf2362b8e117fe505ff230
6
+ metadata.gz: acfa6eb243f7fa8985cb649ab3b701db68515e8a18c221d94cb149e51cddeec49642b6176da46956765cca8f1961aa88d71b5cb9625131cf64f9287e79e173c6
7
+ data.tar.gz: 4ac1f2f81854959679b53d4865efba3a36ddca14216ff9f2aef06e1c27b7b415b47c833387e73ddf3cae41369f382146f1041b9a833743d58d3aad0a954bd1ab
@@ -1,8 +1,8 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require "miga/result"
5
- require "miga/project/base"
4
+ require 'miga/result'
5
+ require 'miga/project/base'
6
6
 
7
7
  ##
8
8
  # Helper module including specific functions to add project results.
@@ -31,7 +31,7 @@ module MiGA::Project::Result
31
31
  # Supported values include:
32
32
  # - +force+: A Boolean indicating if the result must be re-indexed. If true,
33
33
  # it implies save=true.
34
- def add_result(name, save=true, opts={})
34
+ def add_result(name, save = true, opts = {})
35
35
  return nil if @@RESULT_DIRS[name].nil?
36
36
  base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
37
37
  if opts[:force]
@@ -49,21 +49,21 @@ module MiGA::Project::Result
49
49
  ##
50
50
  # Get the next distances task, saving intermediate results if +save+. Returns
51
51
  # a Symbol.
52
- def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
52
+ def next_distances(save = true) ; next_task(@@DISTANCE_TASKS, save) ; end
53
53
 
54
54
  ##
55
55
  # Get the next inclade task, saving intermediate results if +save+. Returns a
56
56
  # Symbol.
57
- def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
57
+ def next_inclade(save = true) ; next_task(@@INCLADE_TASKS, save) ; end
58
58
 
59
59
  ##
60
60
  # Get the next task from +tasks+, saving intermediate results if +save+.
61
61
  # Returns a Symbol.
62
- def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
62
+ def next_task(tasks = @@DISTANCE_TASKS+@@INCLADE_TASKS, save = true)
63
63
  tasks.find do |t|
64
- if metadata["run_#{t}"]==false or
64
+ if metadata["run_#{t}"] == false or
65
65
  (!is_clade? and @@INCLADE_TASKS.include?(t) and
66
- metadata["run_#{t}"]!=true)
66
+ metadata["run_#{t}"] != true)
67
67
  false
68
68
  else
69
69
  add_result(t, save).nil?
@@ -79,10 +79,10 @@ module MiGA::Project::Result
79
79
  def add_result_distances(base)
80
80
  return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
81
81
  r = MiGA::Result.new("#{base}.json")
82
- r.add_file(:rdata, "miga-project.Rdata")
83
- r.add_file(:matrix, "miga-project.txt")
84
- r.add_file(:log, "miga-project.log")
85
- r.add_file(:hist, "miga-project.hist")
82
+ r.add_file(:rdata, 'miga-project.Rdata')
83
+ r.add_file(:matrix, 'miga-project.txt')
84
+ r.add_file(:log, 'miga-project.log')
85
+ r.add_file(:hist, 'miga-project.hist')
86
86
  r
87
87
  end
88
88
 
@@ -91,10 +91,11 @@ module MiGA::Project::Result
91
91
  return nil unless is_clade? or result_files_exist?(base,
92
92
  %w[.pdf .classif .medoids .class.tsv .class.nwk])
93
93
  r = add_result_iter_clades(base)
94
- r.add_file(:aai_tree, "miga-project.aai.nwk")
95
- r.add_file(:proposal, "miga-project.proposed-clades")
96
- r.add_file(:clades_aai90, "miga-project.aai90-clades")
97
- r.add_file(:clades_ani95, "miga-project.ani95-clades")
94
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
95
+ r.add_file(:proposal, 'miga-project.proposed-clades')
96
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
97
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
98
+ r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
98
99
  r
99
100
  end
100
101
 
@@ -102,28 +103,28 @@ module MiGA::Project::Result
102
103
  return nil unless result_files_exist?(base,
103
104
  %w[.pdf .classif .medoids .class.tsv .class.nwk])
104
105
  r = add_result_iter_clades(base)
105
- r.add_file(:ani_tree, "miga-project.ani.nwk")
106
+ r.add_file(:ani_tree, 'miga-project.ani.nwk')
106
107
  r
107
108
  end
108
109
 
109
110
  def add_result_iter_clades(base)
110
111
  r = MiGA::Result.new("#{base}.json")
111
- r.add_file(:report, "miga-project.pdf")
112
- r.add_file(:class_table, "miga-project.class.tsv")
113
- r.add_file(:class_tree, "miga-project.class.nwk")
114
- r.add_file(:classif, "miga-project.classif")
115
- r.add_file(:medoids, "miga-project.medoids")
112
+ r.add_file(:report, 'miga-project.pdf')
113
+ r.add_file(:class_table, 'miga-project.class.tsv')
114
+ r.add_file(:class_tree, 'miga-project.class.nwk')
115
+ r.add_file(:classif, 'miga-project.classif')
116
+ r.add_file(:medoids, 'miga-project.medoids')
116
117
  r
117
118
  end
118
119
 
119
120
  def add_result_ogs(base)
120
121
  return nil unless result_files_exist?(base, %w[.ogs .stats])
121
122
  r = MiGA::Result.new("#{base}.json")
122
- r.add_file(:ogs, "miga-project.ogs")
123
- r.add_file(:abc, "miga-project.abc")
124
- r.add_file(:stats, "miga-project.stats")
125
- r.add_file(:core_pan, "miga-project.core-pan.tsv")
126
- r.add_file(:core_pan_plot, "miga-project.core-pan.pdf")
123
+ r.add_file(:ogs, 'miga-project.ogs')
124
+ r.add_file(:abc, 'miga-project.abc')
125
+ r.add_file(:stats, 'miga-project.stats')
126
+ r.add_file(:core_pan, 'miga-project.core-pan.tsv')
127
+ r.add_file(:core_pan_plot, 'miga-project.core-pan.pdf')
127
128
  r
128
129
  end
129
130
 
@@ -131,8 +132,8 @@ module MiGA::Project::Result
131
132
  return nil unless
132
133
  result_files_exist?(base, %w[.taxonomy.json .metadata.db])
133
134
  r = MiGA::Result.new("#{base}.json")
134
- r.add_file(:taxonomy_index, "miga-project.taxonomy.json")
135
- r.add_file(:metadata_index, "miga-project.metadata.db")
135
+ r.add_file(:taxonomy_index, 'miga-project.taxonomy.json')
136
+ r.add_file(:metadata_index, 'miga-project.metadata.db')
136
137
  r
137
138
  end
138
139
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 5, 1]
13
+ VERSION = [0.3, 6, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,6 +18,7 @@ module MiGA::DistanceRunner::Commands
18
18
  dataset.name, target.name, tmp_dbs[:aai]).tap{ checkpoint :aai }
19
19
  end
20
20
 
21
+ ##
21
22
  # Estimates AAI against +target+ using hAAI
22
23
  def haai(target)
23
24
  haai = aai_cmd(tmp_file("ess_genes.fa"),
@@ -34,6 +35,7 @@ module MiGA::DistanceRunner::Commands
34
35
  aai
35
36
  end
36
37
 
38
+ ##
37
39
  # Calculates ANI against +target+
38
40
  def ani(target)
39
41
  # Check if the request makes sense
@@ -49,6 +51,15 @@ module MiGA::DistanceRunner::Commands
49
51
  dataset.name, target.name, tmp_dbs[:ani]).tap{ checkpoint :ani }
50
52
  end
51
53
 
54
+ ##
55
+ # Calculates and returns ANI against +target+ if AAI >= 85%. Returns
56
+ # +nil+ otherwise
57
+ def ani_after_aai(target)
58
+ aai = aai(target)
59
+ ani(target) unless aai.nil? or aai < 85.0
60
+ end
61
+
62
+ ##
52
63
  # Execute an AAI command
53
64
  def aai_cmd(f1, f2, n1, n2, db, o={})
54
65
  o = opts.merge(o)
@@ -59,6 +70,7 @@ module MiGA::DistanceRunner::Commands
59
70
  (v.nil? or v.empty?) ? 0 : v.to_f
60
71
  end
61
72
 
73
+ ##
62
74
  # Execute an ANI command
63
75
  def ani_cmd(f1, f2, n1, n2, db, o={})
64
76
  o = opts.merge(o)
@@ -2,6 +2,7 @@
2
2
  require 'sqlite3'
3
3
 
4
4
  module MiGA::DistanceRunner::Database
5
+ ##
5
6
  # Check for corrupt files and create empty databases
6
7
  def initialize_dbs!(for_ref)
7
8
  @dbs = {}
@@ -33,6 +34,7 @@ module MiGA::DistanceRunner::Database
33
34
  end
34
35
  end
35
36
 
37
+ ##
36
38
  # Path to the database +metric+ for +dataset_name+ in +project+
37
39
  # (assumes that +dataset_name+ is a reference dataset)
38
40
  def ref_db(metric, dataset_name=nil)
@@ -48,35 +50,66 @@ module MiGA::DistanceRunner::Database
48
50
  File.expand_path(b, home)
49
51
  end
50
52
 
53
+ ##
51
54
  # Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
52
55
  # query dataset)
53
56
  def query_db(metric)
54
57
  File.expand_path("#{dataset.name}.#{metric}.db", home)
55
58
  end
56
59
 
60
+ ##
57
61
  # Get the stored +metric+ value against +target+
58
62
  def stored_value(target, metric)
59
63
  # Check if self.dataset -> target is done (previous run)
60
64
  y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
61
65
  return y unless y.nil? or y.zero?
66
+
62
67
  # Check if self.dataset <- target is done (another thread)
63
- if dataset.is_ref? and project.path==ref_project.path
64
- y = value_from_db(target.name, dataset.name, ref_db(metric, target.name), metric)
65
- return y unless y.nil? or y.zero?
68
+ if dataset.is_ref? and project.path == ref_project.path
69
+ y = data_from_db(
70
+ target.name, dataset.name, ref_db(metric, target.name), metric)
71
+ unless y.nil? or y.first.zero?
72
+ # Store a copy
73
+ data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
74
+ return y.first
75
+ end
66
76
  end
67
77
  nil
68
78
  end
69
79
 
80
+ ##
70
81
  # Get the value of +metric+ in the +db+ database between +n1+ and +n2+
71
82
  def value_from_db(n1, n2, db, metric)
83
+ y = data_from_db(n1, n2, db, metric)
84
+ y.first unless y.nil?
85
+ end
86
+
87
+ ##
88
+ # Get the +metric+ data in the +db+ database between +n1+ and +n2+. Returns an
89
+ # Array with the metric, standard deviation, number of matches, and maximum
90
+ # possible number of matches
91
+ def data_from_db(n1, n2, db, metric)
72
92
  y = nil
73
93
  SQLite3::Database.new(db) do |conn|
74
- y = conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first
75
- y = y.first unless y.nil?
94
+ y = conn.execute(
95
+ "select #{metric}, sd, n, omega from #{metric} where seq1=? and seq2=?",
96
+ [n1, n2]).first
76
97
  end if File.size? db
77
98
  y
78
99
  end
79
100
 
101
+ ##
102
+ # Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
103
+ def data_to_db(n1, n2, db, metric, data)
104
+ SQLite3::Database.new(db) do |conn|
105
+ conn.execute(
106
+ "insert into #{metric} (seq1, seq2, #{metric}, sd, n, omega) " +
107
+ "values (?, ?, ?, ?, ?, ?)", [n1, n2] + data)
108
+ end
109
+ checkpoint metric
110
+ end
111
+
112
+ ##
80
113
  # Iterates for each entry in +db+
81
114
  def foreach_in_db(db, metric, &blk)
82
115
  SQLite3::Database.new(db) do |conn|
@@ -30,7 +30,9 @@ class MiGA::DistanceRunner
30
30
  if opts[:run_taxonomy] && project.metadata[:ref_project]
31
31
  @home = File.expand_path('05.taxonomy', @home)
32
32
  @ref_project = MiGA::Project.load(project.metadata[:ref_project])
33
- raise "Cannot load reference project: #{project.metadata[:ref_project]}" if @ref_project.nil?
33
+ if @ref_project.nil?
34
+ raise "Cannot load reference project: #{project.metadata[:ref_project]}"
35
+ end
34
36
  else
35
37
  @ref_project = project
36
38
  end
@@ -55,47 +57,61 @@ class MiGA::DistanceRunner
55
57
  def go_ref!
56
58
  # Initialize databases
57
59
  initialize_dbs! true
60
+
58
61
  # first-come-first-serve traverse
59
62
  ref_project.each_dataset do |ds|
60
63
  next if !ds.is_ref? or ds.is_multi? or ds.result(:essential_genes).nil?
61
64
  puts "[ #{Time.now} ] #{ds.name}"
62
- aai = aai(ds)
63
- ani(ds) unless aai.nil? or aai < 90.0
65
+ ani_after_aai(ds)
64
66
  end
67
+
65
68
  # Finalize
66
69
  [:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
67
70
  end
68
71
 
72
+ ##
69
73
  # Launch analysis for query datasets
70
74
  def go_query!
71
75
  # Check if project is ready
72
- v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
73
- res = ref_project.result(v[0])
76
+ tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
77
+ res = ref_project.result(tsk[0])
74
78
  return if res.nil?
79
+
75
80
  # Initialize the databases
76
81
  initialize_dbs! false
77
82
  # Calculate the classification-informed AAI/ANI traverse
78
- results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
79
- fh = File.open(results, "w")
80
- classif, val_cls = *classify(res.dir, ".", v[1], fh)
83
+ results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
84
+ fh = File.open(results, 'w')
85
+ classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
81
86
  fh.close
87
+
82
88
  # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
83
89
  par_dir = File.dirname(File.expand_path(classif, res.dir))
84
- par = File.expand_path("miga-project.classif", par_dir)
90
+ par = File.expand_path('miga-project.classif', par_dir)
91
+ closest = {ds: nil, ani: 0.0} # :ds is the key written and read further down in this method
85
92
  if File.size? par
86
- File.open(par, "r") do |fh|
93
+ File.open(par, 'r') do |fh|
87
94
  fh.each_line do |ln|
88
95
  r = ln.chomp.split("\t")
89
- next unless r[1].to_i==val_cls
90
- target = ref_project.dataset(r[0])
91
- aai = (v[1]==:aai) ? aai(target) : 100.0
92
- ani(target) if aai >= 90.0
96
+ next unless r[1].to_i == val_cls
97
+ ani = ani_after_aai(ref_project.dataset(r[0]))
98
+ closest = {ds: r[0], ani: ani} unless ani.nil? or ani < closest[:ani]
93
99
  end
94
100
  end
95
101
  end
102
+
103
+ # Calculate all the AAIs/ANIs against the closest ANI95-clade (if ANI > 95%)
104
+ cl_path = File.expand_path('miga-project.ani95-clades', home)
105
+ if File.size? cl_path and tsk[0] == :clade_finding and closest[:ani] >= 95.0
106
+ File.foreach(cl_path).
107
+ map { |i| i.chomp.split(',') }.
108
+ find { |i| i.include? closest[:ds] }.
109
+ each { |i| ani_after_aai(ref_project.dataset(i)) }
110
+ end
111
+
96
112
  # Finalize
97
113
  [:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
98
- build_medoids_tree(v[1])
114
+ build_medoids_tree(tsk[1])
99
115
  transfer_taxonomy(tax_test)
100
116
  end
101
117
 
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update: Oct 07 2015
5
+ # @license: artistic license 2.0
6
+ #
7
+ use strict;
8
+ use warnings;
9
+ use List::Util qw/sum min max/;
10
+
11
+ my ($seqs, $minlen, $n__) = @ARGV;
12
+ $seqs or die "
13
+ Description:
14
+ Calculates the N50 value of a set of sequences. Alternatively, it
15
+ can calculate other N** values. It also calculates the total number
16
+ of sequences and the total added length.
17
+
18
+ Usage:
19
+ $0 seqs.fa[ minlen[ **]]
20
+
21
+ seqs.fa A FastA file containing the sequences.
22
+ minlen (optional) The minimum length to take into consideration.
23
+ By default: 0.
24
+ ** Value N** to calculate. By default: 50 (N50).
25
+ ";
26
+ $minlen ||= 0;
27
+ $n__ ||= 50;
28
+
29
+ my @len = ();
30
+ open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
+ while(<SEQ>){
32
+ if(/^>/){
33
+ push @len, 0;
34
+ }else{
35
+ next if /^;/;
36
+ chomp;
37
+ s/\W//g;
38
+ $len[-1]+=length $_;
39
+ }
40
+ }
41
+ close SEQ;
42
+ @len = sort { $b <=> $a } map { $_>=$minlen?$_:() } @len; # longest first: N** is accumulated from the largest sequences down
43
+ my $tot = (sum(@len) || 0);
44
+
45
+ my $thr = $n__*$tot/100;
46
+ my $pos = 0;
47
+ for(@len){
48
+ $pos+= $_;
49
+ if($pos>=$thr){
50
+ print "N$n__: $_\n";
51
+ last;
52
+ }
53
+ }
54
+ print "Sequences: ".scalar(@len)."\n";
55
+ print "Total length: $tot\n";
56
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ my($file, $content, $stretch) = @ARGV;
12
+ $file or die <<HELP
13
+
14
+ Description:
15
+ Filter sequences by N-content and presence of long homopolymers.
16
+ Usage:
17
+ $0 sequences.fa [content [stretch]] > filtered.fa
18
+ Where:
19
+ sequences.fa Input file in FastA format
20
+ content A number between 0 and 1 indicating the maximum proportion of Ns
21
+ (1 to turn off, 0.5 by default)
22
+ stretch A number indicating the maximum number of consecutive identical
23
+ nucleotides allowed (0 to turn off, 100 by default)
24
+ filtered.fa Filtered set of sequences.
25
+
26
+ HELP
27
+ ;
28
+ ($content ||= 0.5)+=0;
29
+ ($stretch ||= 100)+=0;
30
+
31
+ my $good = 0;
32
+ my $N = 0;
33
+
34
+ FASTA: {
35
+ local $/ = "\n>";
36
+ open FILE, "<", $file or die "I can not open the file: $file: $!\n";
37
+ SEQ: while(<FILE>){
38
+ $N++;
39
+ s/^;.*//gm;
40
+ s/>//g;
41
+ my($n,$s) = split /\n/, $_, 2;
42
+ (my $clean = $s) =~ s/[^ACTGN]//g;
43
+ if($content < 1){
44
+ (my $Ns = $clean) =~ s/[^N]//g;
45
+ next SEQ if length($Ns)>length($clean)*$content;
46
+ }
47
+ if($stretch > 0){
48
+ for my $nuc (qw(A C T G N)){
49
+ next SEQ if $clean =~ m/[$nuc]{$stretch}/;
50
+ }
51
+ }
52
+ print ">$n\n$s\n";
53
+ $good++;
54
+ }
55
+ close FILE;
56
+ print STDERR "Total sequences: $N\nAfter filtering: $good\n";
57
+ }
58
+
59
+
60
+
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ $#ARGV>=0 or die "
12
+ Usage:
13
+ $0 seqs.fa... > length.txt
14
+
15
+ seqs.fa One or more FastA files.
16
+ length.txt A table with the lengths of the sequences.
17
+
18
+ ";
19
+
20
+ for my $fa (@ARGV){
21
+ open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
22
+ my $def = '';
23
+ my $len = 0;
24
+ while(<FA>){
25
+ next if /^;/;
26
+ if(m/^>(\S+)\s?/){
27
+ print "$def\t$len\n" if $def;
28
+ $def = $1;
29
+ $len = 0;
30
+ }else{
31
+ s/[^A-Za-z]//g;
32
+ $len+= length $_;
33
+ }
34
+ }
35
+ print "$def\t$len\n" if $def;
36
+ close FA;
37
+ }
38
+
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update Oct-13-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+ use Symbol;
11
+
12
+ my ($file, $base, $outN) = @ARGV;
13
+
14
+ $outN ||= 12;
15
+ ($file and $base) or die "
16
+ Usage
17
+ $0 in_file.fa out_base[ no_files]
18
+
19
+ in_file.fa Input file in FastA format.
20
+ out_base Prefix for the name of the output files. It will
21
+ be appended with .<i>.fa, where <i> is a consecutive
22
+ number starting in 1.
23
+ no_files Number of files to generate. By default: 12.
24
+
25
+ ";
26
+
27
+
28
+ my @outSym = ();
29
+ for my $i (1 .. $outN){
30
+ $outSym[$i-1] = gensym;
31
+ open $outSym[$i-1], ">", "$base.$i.fa" or
32
+ die "I can not create the file: $base.$i.fa: $!\n";
33
+ }
34
+
35
+
36
+ my($i, $seq) = (-1, '');
37
+ open FILE, "<", $file or die "I can not read the file: $file: $!\n";
38
+ while(my $ln=<FILE>){
39
+ next if $ln=~/^;/;
40
+ if($ln =~ m/^>/){
41
+ print { $outSym[$i % $outN] } $seq if $seq;
42
+ $i++;
43
+ $seq = '';
44
+ }
45
+ $seq.=$ln;
46
+ }
47
+ print { $outSym[$i % $outN] } $seq if $seq;
48
+ close FILE;
49
+
50
+ for(my $j=0; $j<$outN; $j++){
51
+ close $outSym[$j];
52
+ }
53
+
54
+ print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
55
+
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+ #
6
+
7
+ #= Load stuff
8
+ argv <- commandArgs(trailingOnly=T)
9
+ suppressPackageStartupMessages(library(ape))
10
+
11
+ find_medoids <- function(dist, out, clades) {
12
+ dist <- as.matrix(dist)
13
+ cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
14
+ medoids <- c()
15
+ for(i in cl){
16
+ lab <- strsplit(i, ',')[[1]]
17
+ cat('Clade of:', lab[1], '\n')
18
+ if(length(lab) == 1) {
19
+ med <- lab
20
+ } else {
21
+ med <- lab[which.min(colSums(dist[lab, lab], na.rm = TRUE))]
22
+ }
23
+ medoids <- c(medoids, med)
24
+ }
25
+ write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
26
+ }
27
+
28
+ #= Main
29
+ load(argv[1])
30
+ find_medoids(dist = ani.d, out = argv[2], clades = argv[3])
31
+
@@ -6,15 +6,18 @@ require 'miga'
6
6
  proj_path = ARGV.shift or raise "Usage: #{$0} path/to/project"
7
7
 
8
8
  # Load MiGA object
9
- p = MiGA::Project.load(proj_path) or raise "Cannot load project: #{proj_path}"
10
- pr = p.result(:clade_finding) or raise "Unavailable result: clade_finding"
11
- pf = pr.file_path(:clades_ani95) or raise "Unavailable result file: proposal"
9
+ p = MiGA::Project.load(proj_path) or
10
+ raise "Cannot load project: #{proj_path}"
11
+ pr = p.result(:clade_finding) or
12
+ raise "Unavailable result: clade_finding"
13
+ pf = pr.file_path(:clades_ani95) or
14
+ raise "Unavailable result file: clades_ani95"
12
15
 
13
16
  # Read ANIspp
14
17
  ani_spp = []
15
18
  File.open(pf, 'r') do |fh|
16
19
  fh.each_line do |ln|
17
- next if $.==1
20
+ next if $.==1 and ln.chomp == 'G' # <- Legacy check
18
21
  ani_spp << ln.chomp.split(',')
19
22
  end
20
23
  end
@@ -2,7 +2,7 @@
2
2
  # High-end pipelines for SubcladeRunner
3
3
  module MiGA::SubcladeRunner::Pipeline
4
4
 
5
- # Run species-level clusterings using ANI>95% / AAI>90%
5
+ # Run species-level clusterings using ANI > 95% / AAI > 90%
6
6
  def cluster_species
7
7
  tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
8
8
  tasks.each do |k, par|
@@ -23,8 +23,20 @@ module MiGA::SubcladeRunner::Pipeline
23
23
  end
24
24
  ofh.close
25
25
  # Cluster genomes
26
- `ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
26
+ `ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
27
+ File.open(ogs_file, 'w') do |fh|
28
+ File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
29
+ fh.puts ln if lno != 0
30
+ end
31
+ end
32
+ File.unlink "#{ogs_file}.tmp"
27
33
  end
34
+
35
+ # Find species medoids
36
+ src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
37
+ `Rscript '#{src}' miga-project.dist.rdata \
38
+ miga-project.ani95-medoids miga-project.ani95-clades`
39
+
28
40
  # Propose clades
29
41
  ofh = File.open('miga-project.proposed-clades', 'w')
30
42
  File.open('miga-project.ani95-clades', 'r') do |ifh|
@@ -42,7 +54,8 @@ module MiGA::SubcladeRunner::Pipeline
42
54
  step = :"#{metric}_distances"
43
55
  metric_res = project.result(step) or raise "Incomplete step #{step}"
44
56
  matrix = metric_res.file_path(:matrix)
45
- `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'`
57
+ `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
58
+ miga-project.ani95-medoids`
46
59
  File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
47
60
  File.exist? 'miga-project.nwk'
48
61
  end
@@ -3,7 +3,6 @@ require_relative 'base.rb'
3
3
  require_relative 'temporal.rb'
4
4
  require_relative 'pipeline.rb'
5
5
 
6
-
7
6
  class MiGA::SubcladeRunner
8
7
 
9
8
  include MiGA::SubcladeRunner::Temporal
@@ -17,7 +16,7 @@ class MiGA::SubcladeRunner
17
16
  raise "No project at #{project_path}"
18
17
  @step = step.to_sym
19
18
  clades_dir = File.expand_path('data/10.clades', project.path)
20
- @home = File.expand_path(@step==:clade_finding ? '01.find' : '02.ani',
19
+ @home = File.expand_path(@step == :clade_finding ? '01.find' : '02.ani',
21
20
  clades_dir)
22
21
  @opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
23
22
  end
@@ -29,7 +28,7 @@ class MiGA::SubcladeRunner
29
28
  Dir.mktmpdir do |tmp_dir|
30
29
  @tmp = tmp_dir
31
30
  create_temporals
32
- step==:clade_finding ? go_clade_finding! : go_subclades!
31
+ step == :clade_finding ? go_clade_finding! : go_subclades!
33
32
  end
34
33
  end
35
34
 
data/utils/subclades.R CHANGED
@@ -5,7 +5,7 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly=T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
@@ -13,38 +13,44 @@ suppressPackageStartupMessages(library(parallel))
13
13
  suppressPackageStartupMessages(library(enveomics.R))
14
14
 
15
15
  #= Main function
16
- subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
17
- say("==> Out base:", out_base, "<==")
16
+ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
17
+ say('==> Out base:', out_base, '<==')
18
18
 
19
19
  # Normalize input matrix
20
- dist_rdata = paste(out_base, "dist.rdata", sep=".")
20
+ dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
21
21
  if(!missing(ani_file)){
22
- if(length(ani.d)==0 && !file.exists(dist_rdata)){
22
+ if(length(ani.d) == 0 && !file.exists(dist_rdata)){
23
23
  # Read from ani_file
24
- a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
24
+ a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
25
25
  if(nrow(a)==0){
26
26
  generate_empty_files(out_base)
27
27
  return(NULL)
28
28
  }
29
- say("Distances")
29
+ if(!is.na(sel) && file.exists(sel)){ # `and` is not an R operator; && also skips file.exists() when sel is NA
30
+ say('Filter selection')
31
+ lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
32
+ a <- a[a$a %in% lab & a$b %in% lab, ]
33
+ }
34
+ say('Distances')
30
35
  a$d <- 1 - (a$value/100)
31
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
32
- save(ani.d, file=dist_rdata)
36
+ ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
37
+ save(ani.d, file = dist_rdata)
33
38
  }
34
39
  }
35
40
 
36
41
  # Read result if the subclade is ready, run it otherwise
37
- if(file.exists(paste(out_base,"classif",sep="."))){
42
+ if(file.exists(paste(out_base, 'classif', sep = '.'))){
38
43
  say("Loading")
39
- ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
40
- sep=' ', as.is=TRUE)[,1]
41
- a <- read.table(paste(out_base,"classif",sep="."), sep="\t", as.is=TRUE)
44
+ ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
45
+ sep = ' ', as.is = TRUE)[,1]
46
+ a <- read.table(paste(out_base, "classif", sep="."),
47
+ sep = '\t', as.is = TRUE)
42
48
  ani.types <- a[,2]
43
49
  names(ani.types) <- a[,1]
44
- if(length(ani.d)==0) load(dist_rdata)
50
+ if(length(ani.d) == 0) load(dist_rdata)
45
51
  }else{
46
52
  res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
47
- if(length(res)==0) return(NULL)
53
+ if(length(res) == 0) return(NULL)
48
54
  ani.medoids <- res[['ani.medoids']]
49
55
  ani.types <- res[['ani.types']]
50
56
  ani.d <- res[['ani.d']]
@@ -230,7 +236,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
230
236
  }
231
237
 
232
238
  #= Main
233
- options(warn=1)
234
- subclades(ani_file=argv[1], out_base=argv[2],
235
- thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
239
+ options(warn = 1)
240
+ subclades(ani_file = argv[1], out_base = argv[2],
241
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
236
242
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5.1
4
+ version: 0.3.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-11-18 00:00:00.000000000 Z
11
+ date: 2018-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -341,7 +341,6 @@ files:
341
341
  - utils/enveomics/Scripts/gi2tax.rb
342
342
  - utils/enveomics/Scripts/in_silico_GA_GI.pl
343
343
  - utils/enveomics/Scripts/lib/data/essential.hmm.gz
344
- - utils/enveomics/Scripts/lib/enveomics.R
345
344
  - utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
346
345
  - utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
347
346
  - utils/enveomics/Scripts/lib/enveomics_rb/og.rb
@@ -454,6 +453,7 @@ files:
454
453
  - utils/enveomics/enveomics.R/man/z$-methods.Rd
455
454
  - utils/enveomics/globals.mk
456
455
  - utils/enveomics/manifest.json
456
+ - utils/find-medoid.R
457
457
  - utils/index_metadata.rb
458
458
  - utils/mytaxa_scan.R
459
459
  - utils/mytaxa_scan.rb
@@ -495,7 +495,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
495
495
  version: '0'
496
496
  requirements: []
497
497
  rubyforge_project:
498
- rubygems_version: 2.7.7
498
+ rubygems_version: 2.2.2
499
499
  signing_key:
500
500
  specification_version: 4
501
501
  summary: MiGA
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Scripts/lib/../../enveomics.R