miga-base 0.3.5.1 → 0.3.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 25f6a339288dbdbeda1f84f5da6b5de697d3790c0b38d138a4416ff762cae936
4
- data.tar.gz: 60187223750e983fafd6088935a912d016bbf4acafaf79b548e206c50076cc04
2
+ SHA1:
3
+ metadata.gz: 8b285b9906876a9f1b5366f929a4776d1689dbc1
4
+ data.tar.gz: 83b6843d00417fef4a8de18e4a102ad4d1899f0d
5
5
  SHA512:
6
- metadata.gz: 60b59ccc8fc3bf9f5a584f3221c268839ba4cd41eb3a6c16911f1808944c20100758f74908797f13ada9d2475e4fda382c1e204b257cf6c6adcfd17db694efcf
7
- data.tar.gz: e62de30119c51dc4e92e64aec954c263f84a4820f1541f014c49d93dadd8de1efb5d331c35ecbda6ccfa79c402393894b7b6ffd52edf2362b8e117fe505ff230
6
+ metadata.gz: acfa6eb243f7fa8985cb649ab3b701db68515e8a18c221d94cb149e51cddeec49642b6176da46956765cca8f1961aa88d71b5cb9625131cf64f9287e79e173c6
7
+ data.tar.gz: 4ac1f2f81854959679b53d4865efba3a36ddca14216ff9f2aef06e1c27b7b415b47c833387e73ddf3cae41369f382146f1041b9a833743d58d3aad0a954bd1ab
@@ -1,8 +1,8 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require "miga/result"
5
- require "miga/project/base"
4
+ require 'miga/result'
5
+ require 'miga/project/base'
6
6
 
7
7
  ##
8
8
  # Helper module including specific functions to add project results.
@@ -31,7 +31,7 @@ module MiGA::Project::Result
31
31
  # Supported values include:
32
32
  # - +force+: A Boolean indicating if the result must be re-indexed. If true,
33
33
  # it implies save=true.
34
- def add_result(name, save=true, opts={})
34
+ def add_result(name, save = true, opts = {})
35
35
  return nil if @@RESULT_DIRS[name].nil?
36
36
  base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
37
37
  if opts[:force]
@@ -49,21 +49,21 @@ module MiGA::Project::Result
49
49
  ##
50
50
  # Get the next distances task, saving intermediate results if +save+. Returns
51
51
  # a Symbol.
52
- def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
52
+ def next_distances(save = true) ; next_task(@@DISTANCE_TASKS, save) ; end
53
53
 
54
54
  ##
55
55
  # Get the next inclade task, saving intermediate results if +save+. Returns a
56
56
  # Symbol.
57
- def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
57
+ def next_inclade(save = true) ; next_task(@@INCLADE_TASKS, save) ; end
58
58
 
59
59
  ##
60
60
  # Get the next task from +tasks+, saving intermediate results if +save+.
61
61
  # Returns a Symbol.
62
- def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
62
+ def next_task(tasks = @@DISTANCE_TASKS+@@INCLADE_TASKS, save = true)
63
63
  tasks.find do |t|
64
- if metadata["run_#{t}"]==false or
64
+ if metadata["run_#{t}"] == false or
65
65
  (!is_clade? and @@INCLADE_TASKS.include?(t) and
66
- metadata["run_#{t}"]!=true)
66
+ metadata["run_#{t}"] != true)
67
67
  false
68
68
  else
69
69
  add_result(t, save).nil?
@@ -79,10 +79,10 @@ module MiGA::Project::Result
79
79
  def add_result_distances(base)
80
80
  return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
81
81
  r = MiGA::Result.new("#{base}.json")
82
- r.add_file(:rdata, "miga-project.Rdata")
83
- r.add_file(:matrix, "miga-project.txt")
84
- r.add_file(:log, "miga-project.log")
85
- r.add_file(:hist, "miga-project.hist")
82
+ r.add_file(:rdata, 'miga-project.Rdata')
83
+ r.add_file(:matrix, 'miga-project.txt')
84
+ r.add_file(:log, 'miga-project.log')
85
+ r.add_file(:hist, 'miga-project.hist')
86
86
  r
87
87
  end
88
88
 
@@ -91,10 +91,11 @@ module MiGA::Project::Result
91
91
  return nil unless is_clade? or result_files_exist?(base,
92
92
  %w[.pdf .classif .medoids .class.tsv .class.nwk])
93
93
  r = add_result_iter_clades(base)
94
- r.add_file(:aai_tree, "miga-project.aai.nwk")
95
- r.add_file(:proposal, "miga-project.proposed-clades")
96
- r.add_file(:clades_aai90, "miga-project.aai90-clades")
97
- r.add_file(:clades_ani95, "miga-project.ani95-clades")
94
+ r.add_file(:aai_tree, 'miga-project.aai.nwk')
95
+ r.add_file(:proposal, 'miga-project.proposed-clades')
96
+ r.add_file(:clades_aai90, 'miga-project.aai90-clades')
97
+ r.add_file(:clades_ani95, 'miga-project.ani95-clades')
98
+ r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
98
99
  r
99
100
  end
100
101
 
@@ -102,28 +103,28 @@ module MiGA::Project::Result
102
103
  return nil unless result_files_exist?(base,
103
104
  %w[.pdf .classif .medoids .class.tsv .class.nwk])
104
105
  r = add_result_iter_clades(base)
105
- r.add_file(:ani_tree, "miga-project.ani.nwk")
106
+ r.add_file(:ani_tree, 'miga-project.ani.nwk')
106
107
  r
107
108
  end
108
109
 
109
110
  def add_result_iter_clades(base)
110
111
  r = MiGA::Result.new("#{base}.json")
111
- r.add_file(:report, "miga-project.pdf")
112
- r.add_file(:class_table, "miga-project.class.tsv")
113
- r.add_file(:class_tree, "miga-project.class.nwk")
114
- r.add_file(:classif, "miga-project.classif")
115
- r.add_file(:medoids, "miga-project.medoids")
112
+ r.add_file(:report, 'miga-project.pdf')
113
+ r.add_file(:class_table, 'miga-project.class.tsv')
114
+ r.add_file(:class_tree, 'miga-project.class.nwk')
115
+ r.add_file(:classif, 'miga-project.classif')
116
+ r.add_file(:medoids, 'miga-project.medoids')
116
117
  r
117
118
  end
118
119
 
119
120
  def add_result_ogs(base)
120
121
  return nil unless result_files_exist?(base, %w[.ogs .stats])
121
122
  r = MiGA::Result.new("#{base}.json")
122
- r.add_file(:ogs, "miga-project.ogs")
123
- r.add_file(:abc, "miga-project.abc")
124
- r.add_file(:stats, "miga-project.stats")
125
- r.add_file(:core_pan, "miga-project.core-pan.tsv")
126
- r.add_file(:core_pan_plot, "miga-project.core-pan.pdf")
123
+ r.add_file(:ogs, 'miga-project.ogs')
124
+ r.add_file(:abc, 'miga-project.abc')
125
+ r.add_file(:stats, 'miga-project.stats')
126
+ r.add_file(:core_pan, 'miga-project.core-pan.tsv')
127
+ r.add_file(:core_pan_plot, 'miga-project.core-pan.pdf')
127
128
  r
128
129
  end
129
130
 
@@ -131,8 +132,8 @@ module MiGA::Project::Result
131
132
  return nil unless
132
133
  result_files_exist?(base, %w[.taxonomy.json .metadata.db])
133
134
  r = MiGA::Result.new("#{base}.json")
134
- r.add_file(:taxonomy_index, "miga-project.taxonomy.json")
135
- r.add_file(:metadata_index, "miga-project.metadata.db")
135
+ r.add_file(:taxonomy_index, 'miga-project.taxonomy.json')
136
+ r.add_file(:metadata_index, 'miga-project.metadata.db')
136
137
  r
137
138
  end
138
139
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 5, 1]
13
+ VERSION = [0.3, 6, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,6 +18,7 @@ module MiGA::DistanceRunner::Commands
18
18
  dataset.name, target.name, tmp_dbs[:aai]).tap{ checkpoint :aai }
19
19
  end
20
20
 
21
+ ##
21
22
  # Estimates AAI against +target+ using hAAI
22
23
  def haai(target)
23
24
  haai = aai_cmd(tmp_file("ess_genes.fa"),
@@ -34,6 +35,7 @@ module MiGA::DistanceRunner::Commands
34
35
  aai
35
36
  end
36
37
 
38
+ ##
37
39
  # Calculates ANI against +target+
38
40
  def ani(target)
39
41
  # Check if the request makes sense
@@ -49,6 +51,15 @@ module MiGA::DistanceRunner::Commands
49
51
  dataset.name, target.name, tmp_dbs[:ani]).tap{ checkpoint :ani }
50
52
  end
51
53
 
54
+ ##
55
+ # Calculates and returns ANI against +target+ if AAI >= 85%. Returns
56
+ # +nil+ otherwise
57
+ def ani_after_aai(target)
58
+ aai = aai(target)
59
+ ani(target) unless aai.nil? or aai < 85.0
60
+ end
61
+
62
+ ##
52
63
  # Execute an AAI command
53
64
  def aai_cmd(f1, f2, n1, n2, db, o={})
54
65
  o = opts.merge(o)
@@ -59,6 +70,7 @@ module MiGA::DistanceRunner::Commands
59
70
  (v.nil? or v.empty?) ? 0 : v.to_f
60
71
  end
61
72
 
73
+ ##
62
74
  # Execute an ANI command
63
75
  def ani_cmd(f1, f2, n1, n2, db, o={})
64
76
  o = opts.merge(o)
@@ -2,6 +2,7 @@
2
2
  require 'sqlite3'
3
3
 
4
4
  module MiGA::DistanceRunner::Database
5
+ ##
5
6
  # Check for corrupt files and create empty databases
6
7
  def initialize_dbs!(for_ref)
7
8
  @dbs = {}
@@ -33,6 +34,7 @@ module MiGA::DistanceRunner::Database
33
34
  end
34
35
  end
35
36
 
37
+ ##
36
38
  # Path to the database +metric+ for +dataset_name+ in +project+
37
39
  # (assumes that +dataset_name+ is a reference dataset)
38
40
  def ref_db(metric, dataset_name=nil)
@@ -48,35 +50,66 @@ module MiGA::DistanceRunner::Database
48
50
  File.expand_path(b, home)
49
51
  end
50
52
 
53
+ ##
51
54
  # Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
52
55
  # query dataset)
53
56
  def query_db(metric)
54
57
  File.expand_path("#{dataset.name}.#{metric}.db", home)
55
58
  end
56
59
 
60
+ ##
57
61
  # Get the stored +metric+ value against +target+
58
62
  def stored_value(target, metric)
59
63
  # Check if self.dataset -> target is done (previous run)
60
64
  y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
61
65
  return y unless y.nil? or y.zero?
66
+
62
67
  # Check if self.dataset <- target is done (another thread)
63
- if dataset.is_ref? and project.path==ref_project.path
64
- y = value_from_db(target.name, dataset.name, ref_db(metric, target.name), metric)
65
- return y unless y.nil? or y.zero?
68
+ if dataset.is_ref? and project.path == ref_project.path
69
+ y = data_from_db(
70
+ target.name, dataset.name, ref_db(metric, target.name), metric)
71
+ unless y.nil? or y.first.zero?
72
+ # Store a copy
73
+ data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
74
+ return y.first
75
+ end
66
76
  end
67
77
  nil
68
78
  end
69
79
 
80
+ ##
70
81
  # Get the value of +metric+ in the +db+ database between +n1+ and +n2+
71
82
  def value_from_db(n1, n2, db, metric)
83
+ y = data_from_db(n1, n2, db, metric)
84
+ y.first unless y.nil?
85
+ end
86
+
87
+ ##
88
+ # Get the +metric+ data in the +db+ database between +n1+ and +n2+. Returns an
89
+ # Array with the metric, standard deviation, number of matches, and maximum
90
+ # possible number of matches
91
+ def data_from_db(n1, n2, db, metric)
72
92
  y = nil
73
93
  SQLite3::Database.new(db) do |conn|
74
- y = conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first
75
- y = y.first unless y.nil?
94
+ y = conn.execute(
95
+ "select #{metric}, sd, n, omega from #{metric} where seq1=? and seq2=?",
96
+ [n1, n2]).first
76
97
  end if File.size? db
77
98
  y
78
99
  end
79
100
 
101
+ ##
102
+ # Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
103
+ def data_to_db(n1, n2, db, metric, data)
104
+ SQLite3::Database.new(db) do |conn|
105
+ conn.execute(
106
+ "insert into #{metric} (seq1, seq2, #{metric}, sd, n, omega) " +
107
+ "values (?, ?, ?, ?, ?, ?)", [n1, n2] + data)
108
+ end
109
+ checkpoint metric
110
+ end
111
+
112
+ ##
80
113
  # Iterates for each entry in +db+
81
114
  def foreach_in_db(db, metric, &blk)
82
115
  SQLite3::Database.new(db) do |conn|
@@ -30,7 +30,9 @@ class MiGA::DistanceRunner
30
30
  if opts[:run_taxonomy] && project.metadata[:ref_project]
31
31
  @home = File.expand_path('05.taxonomy', @home)
32
32
  @ref_project = MiGA::Project.load(project.metadata[:ref_project])
33
- raise "Cannot load reference project: #{project.metadata[:ref_project]}" if @ref_project.nil?
33
+ if @ref_project.nil?
34
+ raise "Cannot load reference project: #{project.metadata[:ref_project]}"
35
+ end
34
36
  else
35
37
  @ref_project = project
36
38
  end
@@ -55,47 +57,61 @@ class MiGA::DistanceRunner
55
57
  def go_ref!
56
58
  # Initialize databases
57
59
  initialize_dbs! true
60
+
58
61
  # first-come-first-serve traverse
59
62
  ref_project.each_dataset do |ds|
60
63
  next if !ds.is_ref? or ds.is_multi? or ds.result(:essential_genes).nil?
61
64
  puts "[ #{Time.now} ] #{ds.name}"
62
- aai = aai(ds)
63
- ani(ds) unless aai.nil? or aai < 90.0
65
+ ani_after_aai(ds)
64
66
  end
67
+
65
68
  # Finalize
66
69
  [:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
67
70
  end
68
71
 
72
+ ##
69
73
  # Launch analysis for query datasets
70
74
  def go_query!
71
75
  # Check if project is ready
72
- v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
73
- res = ref_project.result(v[0])
76
+ tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
77
+ res = ref_project.result(tsk[0])
74
78
  return if res.nil?
79
+
75
80
  # Initialize the databases
76
81
  initialize_dbs! false
77
82
  # Calculate the classification-informed AAI/ANI traverse
78
- results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
79
- fh = File.open(results, "w")
80
- classif, val_cls = *classify(res.dir, ".", v[1], fh)
83
+ results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
84
+ fh = File.open(results, 'w')
85
+ classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
81
86
  fh.close
87
+
82
88
  # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
83
89
  par_dir = File.dirname(File.expand_path(classif, res.dir))
84
- par = File.expand_path("miga-project.classif", par_dir)
90
+ par = File.expand_path('miga-project.classif', par_dir)
91
+ closest = {ds: nil, ani: 0.0} # same keys as the per-dataset update below (:ds, :ani)
85
92
  if File.size? par
86
- File.open(par, "r") do |fh|
93
+ File.open(par, 'r') do |fh|
87
94
  fh.each_line do |ln|
88
95
  r = ln.chomp.split("\t")
89
- next unless r[1].to_i==val_cls
90
- target = ref_project.dataset(r[0])
91
- aai = (v[1]==:aai) ? aai(target) : 100.0
92
- ani(target) if aai >= 90.0
96
+ next unless r[1].to_i == val_cls
97
+ ani = ani_after_aai(ref_project.dataset(r[0]))
98
+ closest = {ds: r[0], ani: ani} unless ani.nil? or ani < closest[:ani]
93
99
  end
94
100
  end
95
101
  end
102
+
103
+ # Calculate all the AAIs/ANIs against the closest ANI95-clade (if ANI > 95%)
104
+ cl_path = File.expand_path('miga-project.ani95-clades', home)
105
+ if File.size? cl_path and tsk[0] == :clade_finding and closest[:ani] >= 95.0
106
+ File.foreach(cl_path).
107
+ map { |i| i.chomp.split(',') }.
108
+ find { |i| i.include? closest[:ds] }.
109
+ each { |i| ani_after_aai(ref_project.dataset(i)) }
110
+ end
111
+
96
112
  # Finalize
97
113
  [:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
98
- build_medoids_tree(v[1])
114
+ build_medoids_tree(tsk[1])
99
115
  transfer_taxonomy(tax_test)
100
116
  end
101
117
 
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update: Oct 07 2015
5
+ # @license: artistic license 2.0
6
+ #
7
+ use strict;
8
+ use warnings;
9
+ use List::Util qw/sum min max/;
10
+
11
+ my ($seqs, $minlen, $n__) = @ARGV;
12
+ $seqs or die "
13
+ Description:
14
+ Calculates the N50 value of a set of sequences. Alternatively, it
15
+ can calculate other N** values. It also calculates the total number
16
+ of sequences and the total added length.
17
+
18
+ Usage:
19
+ $0 seqs.fa[ minlen[ **]]
20
+
21
+ seqs.fa A FastA file containing the sequences.
22
+ minlen (optional) The minimum length to take into consideration.
23
+ By default: 0.
24
+ ** Value N** to calculate. By default: 50 (N50).
25
+ ";
26
+ $minlen ||= 0;
27
+ $n__ ||= 50;
28
+
29
+ my @len = ();
30
+ open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
+ while(<SEQ>){
32
+ if(/^>/){
33
+ push @len, 0;
34
+ }else{
35
+ next if /^;/;
36
+ chomp;
37
+ s/\W//g;
38
+ $len[-1]+=length $_;
39
+ }
40
+ }
41
+ close SEQ;
42
+ @len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
43
+ my $tot = (sum(@len) || 0);
44
+
45
+ my $thr = $n__*$tot/100;
46
+ my $pos = 0;
47
+ for(@len){
48
+ $pos+= $_;
49
+ if($pos>=$thr){
50
+ print "N$n__: $_\n";
51
+ last;
52
+ }
53
+ }
54
+ print "Sequences: ".scalar(@len)."\n";
55
+ print "Total length: $tot\n";
56
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ my($file, $content, $stretch) = @ARGV;
12
+ $file or die <<HELP
13
+
14
+ Description:
15
+ Filter sequences by N-content and presence of long homopolymers.
16
+ Usage:
17
+ $0 sequences.fa [content [stretch]] > filtered.fa
18
+ Where:
19
+ sequences.fa Input file in FastA format
20
+ content A number between 0 and 1 indicating the maximum proportion of Ns
21
+ (1 to turn off, 0.5 by default)
22
+ stretch A number indicating the maximum number of consecutive identical
23
+ nucleotides allowed (0 to turn off, 100 by default)
24
+ filtered.fa Filtered set of sequences.
25
+
26
+ HELP
27
+ ;
28
+ ($content ||= 0.5)+=0;
29
+ ($stretch ||= 100)+=0;
30
+
31
+ my $good = 0;
32
+ my $N = 0;
33
+
34
+ FASTA: {
35
+ local $/ = "\n>";
36
+ open FILE, "<", $file or die "I can not open the file: $file: $!\n";
37
+ SEQ: while(<FILE>){
38
+ $N++;
39
+ s/^;.*//gm;
40
+ s/>//g;
41
+ my($n,$s) = split /\n/, $_, 2;
42
+ (my $clean = $s) =~ s/[^ACTGN]//g;
43
+ if($content < 1){
44
+ (my $Ns = $clean) =~ s/[^N]//g;
45
+ next SEQ if length($Ns)>length($clean)*$content;
46
+ }
47
+ if($stretch > 0){
48
+ for my $nuc (qw(A C T G N)){
49
+ next SEQ if $clean =~ m/[$nuc]{$stretch}/;
50
+ }
51
+ }
52
+ print ">$n\n$s\n";
53
+ $good++;
54
+ }
55
+ close FILE;
56
+ print STDERR "Total sequences: $N\nAfter filtering: $good\n";
57
+ }
58
+
59
+
60
+
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
+ $#ARGV>=0 or die "
12
+ Usage:
13
+ $0 seqs.fa... > length.txt
14
+
15
+ seqs.fa One or more FastA files.
16
+ length.txt A table with the lengths of the sequences.
17
+
18
+ ";
19
+
20
+ for my $fa (@ARGV){
21
+ open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
22
+ my $def = '';
23
+ my $len = 0;
24
+ while(<FA>){
25
+ next if /^;/;
26
+ if(m/^>(\S+)\s?/){
27
+ print "$def\t$len\n" if $def;
28
+ $def = $1;
29
+ $len = 0;
30
+ }else{
31
+ s/[^A-Za-z]//g;
32
+ $len+= length $_;
33
+ }
34
+ }
35
+ print "$def\t$len\n" if $def;
36
+ close FA;
37
+ }
38
+
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update Oct-13-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+ use Symbol;
11
+
12
+ my ($file, $base, $outN) = @ARGV;
13
+
14
+ $outN ||= 12;
15
+ ($file and $base) or die "
16
+ Usage
17
+ $0 in_file.fa out_base[ no_files]
18
+
19
+ in_file.fa Input file in FastA format.
20
+ out_base Prefix for the name of the output files. It will
21
+ be appended with .<i>.fa, where <i> is a consecutive
22
+ number starting in 1.
23
+ no_files Number of files to generate. By default: 12.
24
+
25
+ ";
26
+
27
+
28
+ my @outSym = ();
29
+ for my $i (1 .. $outN){
30
+ $outSym[$i-1] = gensym;
31
+ open $outSym[$i-1], ">", "$base.$i.fa" or
32
+ die "I can not create the file: $base.$i.fa: $!\n";
33
+ }
34
+
35
+
36
+ my($i, $seq) = (-1, '');
37
+ open FILE, "<", $file or die "I can not read the file: $file: $!\n";
38
+ while(my $ln=<FILE>){
39
+ next if $ln=~/^;/;
40
+ if($ln =~ m/^>/){
41
+ print { $outSym[$i % $outN] } $seq if $seq;
42
+ $i++;
43
+ $seq = '';
44
+ }
45
+ $seq.=$ln;
46
+ }
47
+ print { $outSym[$i % $outN] } $seq if $seq;
48
+ close FILE;
49
+
50
+ for(my $j=0; $j<$outN; $j++){
51
+ close $outSym[$j];
52
+ }
53
+
54
+ print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
55
+
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+ #
6
+
7
+ #= Load stuff
8
+ argv <- commandArgs(trailingOnly=T)
9
+ suppressPackageStartupMessages(library(ape))
10
+
11
+ find_medoids <- function(dist, out, clades) {
12
+ dist <- as.matrix(dist)
13
+ cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
14
+ medoids <- c()
15
+ for(i in cl){
16
+ lab <- strsplit(i, ',')[[1]]
17
+ cat('Clade of:', lab[1], '\n')
18
+ if(length(lab) == 1) {
19
+ med <- lab
20
+ } else {
21
+ med <- lab[which.min(colSums(dist[lab, lab], na.rm = TRUE))]
22
+ }
23
+ medoids <- c(medoids, med)
24
+ }
25
+ write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
26
+ }
27
+
28
+ #= Main
29
+ load(argv[1])
30
+ find_medoids(dist = ani.d, out = argv[2], clades = argv[3])
31
+
@@ -6,15 +6,18 @@ require 'miga'
6
6
  proj_path = ARGV.shift or raise "Usage: #{$0} path/to/project"
7
7
 
8
8
  # Load MiGA object
9
- p = MiGA::Project.load(proj_path) or raise "Cannot load project: #{proj_path}"
10
- pr = p.result(:clade_finding) or raise "Unavailable result: clade_finding"
11
- pf = pr.file_path(:clades_ani95) or raise "Unavailable result file: proposal"
9
+ p = MiGA::Project.load(proj_path) or
10
+ raise "Cannot load project: #{proj_path}"
11
+ pr = p.result(:clade_finding) or
12
+ raise "Unavailable result: clade_finding"
13
+ pf = pr.file_path(:clades_ani95) or
14
+ raise "Unavailable result file: clades_ani95"
12
15
 
13
16
  # Read ANIspp
14
17
  ani_spp = []
15
18
  File.open(pf, 'r') do |fh|
16
19
  fh.each_line do |ln|
17
- next if $.==1
20
+ next if $.==1 and ln.chomp == 'G' # <- Legacy check
18
21
  ani_spp << ln.chomp.split(',')
19
22
  end
20
23
  end
@@ -2,7 +2,7 @@
2
2
  # High-end pipelines for SubcladeRunner
3
3
  module MiGA::SubcladeRunner::Pipeline
4
4
 
5
- # Run species-level clusterings using ANI>95% / AAI>90%
5
+ # Run species-level clusterings using ANI > 95% / AAI > 90%
6
6
  def cluster_species
7
7
  tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
8
8
  tasks.each do |k, par|
@@ -23,8 +23,20 @@ module MiGA::SubcladeRunner::Pipeline
23
23
  end
24
24
  ofh.close
25
25
  # Cluster genomes
26
- `ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
26
+ `ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
27
+ File.open(ogs_file, 'w') do |fh|
28
+ File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
29
+ fh.puts ln if lno != 0
30
+ end
31
+ end
32
+ File.unlink "#{ogs_file}.tmp"
27
33
  end
34
+
35
+ # Find species medoids
36
+ src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
37
+ `Rscript '#{src}' miga-project.dist.rdata \
38
+ miga-project.ani95-medoids miga-project.ani95-clades`
39
+
28
40
  # Propose clades
29
41
  ofh = File.open('miga-project.proposed-clades', 'w')
30
42
  File.open('miga-project.ani95-clades', 'r') do |ifh|
@@ -42,7 +54,8 @@ module MiGA::SubcladeRunner::Pipeline
42
54
  step = :"#{metric}_distances"
43
55
  metric_res = project.result(step) or raise "Incomplete step #{step}"
44
56
  matrix = metric_res.file_path(:matrix)
45
- `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'`
57
+ `Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
58
+ miga-project.ani95-medoids`
46
59
  File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
47
60
  File.exist? 'miga-project.nwk'
48
61
  end
@@ -3,7 +3,6 @@ require_relative 'base.rb'
3
3
  require_relative 'temporal.rb'
4
4
  require_relative 'pipeline.rb'
5
5
 
6
-
7
6
  class MiGA::SubcladeRunner
8
7
 
9
8
  include MiGA::SubcladeRunner::Temporal
@@ -17,7 +16,7 @@ class MiGA::SubcladeRunner
17
16
  raise "No project at #{project_path}"
18
17
  @step = step.to_sym
19
18
  clades_dir = File.expand_path('data/10.clades', project.path)
20
- @home = File.expand_path(@step==:clade_finding ? '01.find' : '02.ani',
19
+ @home = File.expand_path(@step == :clade_finding ? '01.find' : '02.ani',
21
20
  clades_dir)
22
21
  @opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
23
22
  end
@@ -29,7 +28,7 @@ class MiGA::SubcladeRunner
29
28
  Dir.mktmpdir do |tmp_dir|
30
29
  @tmp = tmp_dir
31
30
  create_temporals
32
- step==:clade_finding ? go_clade_finding! : go_subclades!
31
+ step == :clade_finding ? go_clade_finding! : go_subclades!
33
32
  end
34
33
  end
35
34
 
data/utils/subclades.R CHANGED
@@ -5,7 +5,7 @@
5
5
  #
6
6
 
7
7
  #= Load stuff
8
- argv <- commandArgs(trailingOnly=T)
8
+ argv <- commandArgs(trailingOnly = TRUE)
9
9
  suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
@@ -13,38 +13,44 @@ suppressPackageStartupMessages(library(parallel))
13
13
  suppressPackageStartupMessages(library(enveomics.R))
14
14
 
15
15
  #= Main function
16
- subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
17
- say("==> Out base:", out_base, "<==")
16
+ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
17
+ say('==> Out base:', out_base, '<==')
18
18
 
19
19
  # Normalize input matrix
20
- dist_rdata = paste(out_base, "dist.rdata", sep=".")
20
+ dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
21
21
  if(!missing(ani_file)){
22
- if(length(ani.d)==0 && !file.exists(dist_rdata)){
22
+ if(length(ani.d) == 0 && !file.exists(dist_rdata)){
23
23
  # Read from ani_file
24
- a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
24
+ a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
25
25
  if(nrow(a)==0){
26
26
  generate_empty_files(out_base)
27
27
  return(NULL)
28
28
  }
29
- say("Distances")
29
+ if(!is.na(sel) && file.exists(sel)){ # filter only when a selection file was given and exists
30
+ say('Filter selection')
31
+ lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
32
+ a <- a[a$a %in% lab & a$b %in% lab, ]
33
+ }
34
+ say('Distances')
30
35
  a$d <- 1 - (a$value/100)
31
- ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
32
- save(ani.d, file=dist_rdata)
36
+ ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
37
+ save(ani.d, file = dist_rdata)
33
38
  }
34
39
  }
35
40
 
36
41
  # Read result if the subclade is ready, run it otherwise
37
- if(file.exists(paste(out_base,"classif",sep="."))){
42
+ if(file.exists(paste(out_base, 'classif', sep = '.'))){
38
43
  say("Loading")
39
- ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
40
- sep=' ', as.is=TRUE)[,1]
41
- a <- read.table(paste(out_base,"classif",sep="."), sep="\t", as.is=TRUE)
44
+ ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
45
+ sep = ' ', as.is = TRUE)[,1]
46
+ a <- read.table(paste(out_base, "classif", sep="."),
47
+ sep = '\t', as.is = TRUE)
42
48
  ani.types <- a[,2]
43
49
  names(ani.types) <- a[,1]
44
- if(length(ani.d)==0) load(dist_rdata)
50
+ if(length(ani.d) == 0) load(dist_rdata)
45
51
  }else{
46
52
  res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
47
- if(length(res)==0) return(NULL)
53
+ if(length(res) == 0) return(NULL)
48
54
  ani.medoids <- res[['ani.medoids']]
49
55
  ani.types <- res[['ani.types']]
50
56
  ani.d <- res[['ani.d']]
@@ -230,7 +236,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
230
236
  }
231
237
 
232
238
  #= Main
233
- options(warn=1)
234
- subclades(ani_file=argv[1], out_base=argv[2],
235
- thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
239
+ options(warn = 1)
240
+ subclades(ani_file = argv[1], out_base = argv[2],
241
+ thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
236
242
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5.1
4
+ version: 0.3.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-11-18 00:00:00.000000000 Z
11
+ date: 2018-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -341,7 +341,6 @@ files:
341
341
  - utils/enveomics/Scripts/gi2tax.rb
342
342
  - utils/enveomics/Scripts/in_silico_GA_GI.pl
343
343
  - utils/enveomics/Scripts/lib/data/essential.hmm.gz
344
- - utils/enveomics/Scripts/lib/enveomics.R
345
344
  - utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
346
345
  - utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
347
346
  - utils/enveomics/Scripts/lib/enveomics_rb/og.rb
@@ -454,6 +453,7 @@ files:
454
453
  - utils/enveomics/enveomics.R/man/z$-methods.Rd
455
454
  - utils/enveomics/globals.mk
456
455
  - utils/enveomics/manifest.json
456
+ - utils/find-medoid.R
457
457
  - utils/index_metadata.rb
458
458
  - utils/mytaxa_scan.R
459
459
  - utils/mytaxa_scan.rb
@@ -495,7 +495,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
495
495
  version: '0'
496
496
  requirements: []
497
497
  rubyforge_project:
498
- rubygems_version: 2.7.7
498
+ rubygems_version: 2.2.2
499
499
  signing_key:
500
500
  specification_version: 4
501
501
  summary: MiGA
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
@@ -1 +0,0 @@
1
- utils/enveomics/Scripts/lib/../../enveomics.R