miga-base 0.3.5.1 → 0.3.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/miga/project/result.rb +30 -29
- data/lib/miga/version.rb +1 -1
- data/utils/distance/commands.rb +12 -0
- data/utils/distance/database.rb +38 -5
- data/utils/distance/runner.rb +31 -15
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
- data/utils/find-medoid.R +31 -0
- data/utils/representatives.rb +7 -4
- data/utils/subclade/pipeline.rb +16 -3
- data/utils/subclade/runner.rb +2 -3
- data/utils/subclades.R +24 -18
- metadata +4 -4
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8b285b9906876a9f1b5366f929a4776d1689dbc1
|
4
|
+
data.tar.gz: 83b6843d00417fef4a8de18e4a102ad4d1899f0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: acfa6eb243f7fa8985cb649ab3b701db68515e8a18c221d94cb149e51cddeec49642b6176da46956765cca8f1961aa88d71b5cb9625131cf64f9287e79e173c6
|
7
|
+
data.tar.gz: 4ac1f2f81854959679b53d4865efba3a36ddca14216ff9f2aef06e1c27b7b415b47c833387e73ddf3cae41369f382146f1041b9a833743d58d3aad0a954bd1ab
|
data/lib/miga/project/result.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
4
|
+
require 'miga/result'
|
5
|
+
require 'miga/project/base'
|
6
6
|
|
7
7
|
##
|
8
8
|
# Helper module including specific functions to add project results.
|
@@ -31,7 +31,7 @@ module MiGA::Project::Result
|
|
31
31
|
# Supported values include:
|
32
32
|
# - +force+: A Boolean indicating if the result must be re-indexed. If true,
|
33
33
|
# it implies save=true.
|
34
|
-
def add_result(name, save=true, opts={})
|
34
|
+
def add_result(name, save = true, opts = {})
|
35
35
|
return nil if @@RESULT_DIRS[name].nil?
|
36
36
|
base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
|
37
37
|
if opts[:force]
|
@@ -49,21 +49,21 @@ module MiGA::Project::Result
|
|
49
49
|
##
|
50
50
|
# Get the next distances task, saving intermediate results if +save+. Returns
|
51
51
|
# a Symbol.
|
52
|
-
def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
|
52
|
+
def next_distances(save = true) ; next_task(@@DISTANCE_TASKS, save) ; end
|
53
53
|
|
54
54
|
##
|
55
55
|
# Get the next inclade task, saving intermediate results if +save+. Returns a
|
56
56
|
# Symbol.
|
57
|
-
def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
|
57
|
+
def next_inclade(save = true) ; next_task(@@INCLADE_TASKS, save) ; end
|
58
58
|
|
59
59
|
##
|
60
60
|
# Get the next task from +tasks+, saving intermediate results if +save+.
|
61
61
|
# Returns a Symbol.
|
62
|
-
def next_task(tasks
|
62
|
+
def next_task(tasks = @@DISTANCE_TASKS+@@INCLADE_TASKS, save = true)
|
63
63
|
tasks.find do |t|
|
64
|
-
if metadata["run_#{t}"]==false or
|
64
|
+
if metadata["run_#{t}"] == false or
|
65
65
|
(!is_clade? and @@INCLADE_TASKS.include?(t) and
|
66
|
-
metadata["run_#{t}"]!=true)
|
66
|
+
metadata["run_#{t}"] != true)
|
67
67
|
false
|
68
68
|
else
|
69
69
|
add_result(t, save).nil?
|
@@ -79,10 +79,10 @@ module MiGA::Project::Result
|
|
79
79
|
def add_result_distances(base)
|
80
80
|
return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
|
81
81
|
r = MiGA::Result.new("#{base}.json")
|
82
|
-
r.add_file(:rdata,
|
83
|
-
r.add_file(:matrix,
|
84
|
-
r.add_file(:log,
|
85
|
-
r.add_file(:hist,
|
82
|
+
r.add_file(:rdata, 'miga-project.Rdata')
|
83
|
+
r.add_file(:matrix, 'miga-project.txt')
|
84
|
+
r.add_file(:log, 'miga-project.log')
|
85
|
+
r.add_file(:hist, 'miga-project.hist')
|
86
86
|
r
|
87
87
|
end
|
88
88
|
|
@@ -91,10 +91,11 @@ module MiGA::Project::Result
|
|
91
91
|
return nil unless is_clade? or result_files_exist?(base,
|
92
92
|
%w[.pdf .classif .medoids .class.tsv .class.nwk])
|
93
93
|
r = add_result_iter_clades(base)
|
94
|
-
r.add_file(:aai_tree,
|
95
|
-
r.add_file(:proposal,
|
96
|
-
r.add_file(:clades_aai90,
|
97
|
-
r.add_file(:clades_ani95,
|
94
|
+
r.add_file(:aai_tree, 'miga-project.aai.nwk')
|
95
|
+
r.add_file(:proposal, 'miga-project.proposed-clades')
|
96
|
+
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
97
|
+
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
98
|
+
r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
|
98
99
|
r
|
99
100
|
end
|
100
101
|
|
@@ -102,28 +103,28 @@ module MiGA::Project::Result
|
|
102
103
|
return nil unless result_files_exist?(base,
|
103
104
|
%w[.pdf .classif .medoids .class.tsv .class.nwk])
|
104
105
|
r = add_result_iter_clades(base)
|
105
|
-
r.add_file(:ani_tree,
|
106
|
+
r.add_file(:ani_tree, 'miga-project.ani.nwk')
|
106
107
|
r
|
107
108
|
end
|
108
109
|
|
109
110
|
def add_result_iter_clades(base)
|
110
111
|
r = MiGA::Result.new("#{base}.json")
|
111
|
-
r.add_file(:report,
|
112
|
-
r.add_file(:class_table,
|
113
|
-
r.add_file(:class_tree,
|
114
|
-
r.add_file(:classif,
|
115
|
-
r.add_file(:medoids,
|
112
|
+
r.add_file(:report, 'miga-project.pdf')
|
113
|
+
r.add_file(:class_table, 'miga-project.class.tsv')
|
114
|
+
r.add_file(:class_tree, 'miga-project.class.nwk')
|
115
|
+
r.add_file(:classif, 'miga-project.classif')
|
116
|
+
r.add_file(:medoids, 'miga-project.medoids')
|
116
117
|
r
|
117
118
|
end
|
118
119
|
|
119
120
|
def add_result_ogs(base)
|
120
121
|
return nil unless result_files_exist?(base, %w[.ogs .stats])
|
121
122
|
r = MiGA::Result.new("#{base}.json")
|
122
|
-
r.add_file(:ogs,
|
123
|
-
r.add_file(:abc,
|
124
|
-
r.add_file(:stats,
|
125
|
-
r.add_file(:core_pan,
|
126
|
-
r.add_file(:core_pan_plot,
|
123
|
+
r.add_file(:ogs, 'miga-project.ogs')
|
124
|
+
r.add_file(:abc, 'miga-project.abc')
|
125
|
+
r.add_file(:stats, 'miga-project.stats')
|
126
|
+
r.add_file(:core_pan, 'miga-project.core-pan.tsv')
|
127
|
+
r.add_file(:core_pan_plot, 'miga-project.core-pan.pdf')
|
127
128
|
r
|
128
129
|
end
|
129
130
|
|
@@ -131,8 +132,8 @@ module MiGA::Project::Result
|
|
131
132
|
return nil unless
|
132
133
|
result_files_exist?(base, %w[.taxonomy.json .metadata.db])
|
133
134
|
r = MiGA::Result.new("#{base}.json")
|
134
|
-
r.add_file(:taxonomy_index,
|
135
|
-
r.add_file(:metadata_index,
|
135
|
+
r.add_file(:taxonomy_index, 'miga-project.taxonomy.json')
|
136
|
+
r.add_file(:metadata_index, 'miga-project.metadata.db')
|
136
137
|
r
|
137
138
|
end
|
138
139
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 6, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/utils/distance/commands.rb
CHANGED
@@ -18,6 +18,7 @@ module MiGA::DistanceRunner::Commands
|
|
18
18
|
dataset.name, target.name, tmp_dbs[:aai]).tap{ checkpoint :aai }
|
19
19
|
end
|
20
20
|
|
21
|
+
##
|
21
22
|
# Estimates AAI against +target+ using hAAI
|
22
23
|
def haai(target)
|
23
24
|
haai = aai_cmd(tmp_file("ess_genes.fa"),
|
@@ -34,6 +35,7 @@ module MiGA::DistanceRunner::Commands
|
|
34
35
|
aai
|
35
36
|
end
|
36
37
|
|
38
|
+
##
|
37
39
|
# Calculates ANI against +target+
|
38
40
|
def ani(target)
|
39
41
|
# Check if the request makes sense
|
@@ -49,6 +51,15 @@ module MiGA::DistanceRunner::Commands
|
|
49
51
|
dataset.name, target.name, tmp_dbs[:ani]).tap{ checkpoint :ani }
|
50
52
|
end
|
51
53
|
|
54
|
+
##
|
55
|
+
# Calculates and returns ANI against +target+ if AAI >= 85%. Returns
|
56
|
+
# +nil+ otherwise
|
57
|
+
def ani_after_aai(target)
|
58
|
+
aai = aai(target)
|
59
|
+
ani(target) unless aai.nil? or aai < 85.0
|
60
|
+
end
|
61
|
+
|
62
|
+
##
|
52
63
|
# Execute an AAI command
|
53
64
|
def aai_cmd(f1, f2, n1, n2, db, o={})
|
54
65
|
o = opts.merge(o)
|
@@ -59,6 +70,7 @@ module MiGA::DistanceRunner::Commands
|
|
59
70
|
(v.nil? or v.empty?) ? 0 : v.to_f
|
60
71
|
end
|
61
72
|
|
73
|
+
##
|
62
74
|
# Execute an ANI command
|
63
75
|
def ani_cmd(f1, f2, n1, n2, db, o={})
|
64
76
|
o = opts.merge(o)
|
data/utils/distance/database.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'sqlite3'
|
3
3
|
|
4
4
|
module MiGA::DistanceRunner::Database
|
5
|
+
##
|
5
6
|
# Check for corrupt files and create empty databases
|
6
7
|
def initialize_dbs!(for_ref)
|
7
8
|
@dbs = {}
|
@@ -33,6 +34,7 @@ module MiGA::DistanceRunner::Database
|
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
37
|
+
##
|
36
38
|
# Path to the database +metric+ for +dataset_name+ in +project+
|
37
39
|
# (assumes that +dataset_name+ is a reference dataset)
|
38
40
|
def ref_db(metric, dataset_name=nil)
|
@@ -48,35 +50,66 @@ module MiGA::DistanceRunner::Database
|
|
48
50
|
File.expand_path(b, home)
|
49
51
|
end
|
50
52
|
|
53
|
+
##
|
51
54
|
# Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
|
52
55
|
# query dataset)
|
53
56
|
def query_db(metric)
|
54
57
|
File.expand_path("#{dataset.name}.#{metric}.db", home)
|
55
58
|
end
|
56
59
|
|
60
|
+
##
|
57
61
|
# Get the stored +metric+ value against +target+
|
58
62
|
def stored_value(target, metric)
|
59
63
|
# Check if self.dataset -> target is done (previous run)
|
60
64
|
y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
|
61
65
|
return y unless y.nil? or y.zero?
|
66
|
+
|
62
67
|
# Check if self.dataset <- target is done (another thread)
|
63
|
-
if dataset.is_ref? and project.path==ref_project.path
|
64
|
-
y =
|
65
|
-
|
68
|
+
if dataset.is_ref? and project.path == ref_project.path
|
69
|
+
y = data_from_db(
|
70
|
+
target.name, dataset.name, ref_db(metric, target.name), metric)
|
71
|
+
unless y.nil? or y.first.zero?
|
72
|
+
# Store a copy
|
73
|
+
data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
|
74
|
+
return y.first
|
75
|
+
end
|
66
76
|
end
|
67
77
|
nil
|
68
78
|
end
|
69
79
|
|
80
|
+
##
|
70
81
|
# Get the value of +metric+ in the +db+ database between +n1+ and +n2+
|
71
82
|
def value_from_db(n1, n2, db, metric)
|
83
|
+
y = data_from_db(n1, n2, db, metric)
|
84
|
+
y.first unless y.nil?
|
85
|
+
end
|
86
|
+
|
87
|
+
##
|
88
|
+
# Get the +metric+ data in the +db+ database between +n1+ and +n2+. Returns an
|
89
|
+
# Array with the metric, standard deviation, number of matches, and maximum
|
90
|
+
# possible number of matches
|
91
|
+
def data_from_db(n1, n2, db, metric)
|
72
92
|
y = nil
|
73
93
|
SQLite3::Database.new(db) do |conn|
|
74
|
-
y = conn.execute(
|
75
|
-
|
94
|
+
y = conn.execute(
|
95
|
+
"select #{metric}, sd, n, omega from #{metric} where seq1=? and seq2=?",
|
96
|
+
[n1, n2]).first
|
76
97
|
end if File.size? db
|
77
98
|
y
|
78
99
|
end
|
79
100
|
|
101
|
+
##
|
102
|
+
# Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
|
103
|
+
def data_to_db(n1, n2, db, metric, data)
|
104
|
+
SQLite3::Database.new(db) do |conn|
|
105
|
+
conn.execute(
|
106
|
+
"insert into #{metric} (seq1, seq2, #{metric}, sd, n, omega) " +
|
107
|
+
"values (?, ?, ?, ?, ?, ?)", [n1, n2] + data)
|
108
|
+
end
|
109
|
+
checkpoint metric
|
110
|
+
end
|
111
|
+
|
112
|
+
##
|
80
113
|
# Iterates for each entry in +db+
|
81
114
|
def foreach_in_db(db, metric, &blk)
|
82
115
|
SQLite3::Database.new(db) do |conn|
|
data/utils/distance/runner.rb
CHANGED
@@ -30,7 +30,9 @@ class MiGA::DistanceRunner
|
|
30
30
|
if opts[:run_taxonomy] && project.metadata[:ref_project]
|
31
31
|
@home = File.expand_path('05.taxonomy', @home)
|
32
32
|
@ref_project = MiGA::Project.load(project.metadata[:ref_project])
|
33
|
-
|
33
|
+
if @ref_project.nil?
|
34
|
+
raise "Cannot load reference project: #{project.metadata[:ref_project]}"
|
35
|
+
end
|
34
36
|
else
|
35
37
|
@ref_project = project
|
36
38
|
end
|
@@ -55,47 +57,61 @@ class MiGA::DistanceRunner
|
|
55
57
|
def go_ref!
|
56
58
|
# Initialize databases
|
57
59
|
initialize_dbs! true
|
60
|
+
|
58
61
|
# first-come-first-serve traverse
|
59
62
|
ref_project.each_dataset do |ds|
|
60
63
|
next if !ds.is_ref? or ds.is_multi? or ds.result(:essential_genes).nil?
|
61
64
|
puts "[ #{Time.now} ] #{ds.name}"
|
62
|
-
|
63
|
-
ani(ds) unless aai.nil? or aai < 90.0
|
65
|
+
ani_after_aai(ds)
|
64
66
|
end
|
67
|
+
|
65
68
|
# Finalize
|
66
69
|
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
|
67
70
|
end
|
68
71
|
|
72
|
+
##
|
69
73
|
# Launch analysis for query datasets
|
70
74
|
def go_query!
|
71
75
|
# Check if project is ready
|
72
|
-
|
73
|
-
res = ref_project.result(
|
76
|
+
tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
|
77
|
+
res = ref_project.result(tsk[0])
|
74
78
|
return if res.nil?
|
79
|
+
|
75
80
|
# Initialize the databases
|
76
81
|
initialize_dbs! false
|
77
82
|
# Calculate the classification-informed AAI/ANI traverse
|
78
|
-
results = File.expand_path("#{dataset.name}.#{
|
79
|
-
fh = File.open(results,
|
80
|
-
classif, val_cls = *classify(res.dir,
|
83
|
+
results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
|
84
|
+
fh = File.open(results, 'w')
|
85
|
+
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
|
81
86
|
fh.close
|
87
|
+
|
82
88
|
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
83
89
|
par_dir = File.dirname(File.expand_path(classif, res.dir))
|
84
|
-
par = File.expand_path(
|
90
|
+
par = File.expand_path('miga-project.classif', par_dir)
|
91
|
+
closest = {dataset: nil, ani: 0.0}
|
85
92
|
if File.size? par
|
86
|
-
File.open(par,
|
93
|
+
File.open(par, 'r') do |fh|
|
87
94
|
fh.each_line do |ln|
|
88
95
|
r = ln.chomp.split("\t")
|
89
|
-
next unless r[1].to_i==val_cls
|
90
|
-
|
91
|
-
|
92
|
-
ani(target) if aai >= 90.0
|
96
|
+
next unless r[1].to_i == val_cls
|
97
|
+
ani = ani_after_aai(ref_project.dataset(r[0]))
|
98
|
+
closest = {ds: r[0], ani: ani} unless ani.nil? or ani < closest[:ani]
|
93
99
|
end
|
94
100
|
end
|
95
101
|
end
|
102
|
+
|
103
|
+
# Calculate all the AAIs/ANIs against the closest ANI95-clade (if ANI > 95%)
|
104
|
+
cl_path = File.expand_path('miga-project.ani95-clades', home)
|
105
|
+
if File.size? cl_path and tsk[0] == :clade_finding and closest[:ani] >= 95.0
|
106
|
+
File.foreach(cl_path).
|
107
|
+
map { |i| i.chomp.split(',') }.
|
108
|
+
find { |i| i.include? closest[:ds] }.
|
109
|
+
each { |i| ani_after_aai(ref_project.dataset(i)) }
|
110
|
+
end
|
111
|
+
|
96
112
|
# Finalize
|
97
113
|
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
|
98
|
-
build_medoids_tree(
|
114
|
+
build_medoids_tree(tsk[1])
|
99
115
|
transfer_taxonomy(tax_test)
|
100
116
|
end
|
101
117
|
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update: Oct 07 2015
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
#
|
7
|
+
use strict;
|
8
|
+
use warnings;
|
9
|
+
use List::Util qw/sum min max/;
|
10
|
+
|
11
|
+
my ($seqs, $minlen, $n__) = @ARGV;
|
12
|
+
$seqs or die "
|
13
|
+
Description:
|
14
|
+
Calculates the N50 value of a set of sequences. Alternatively, it
|
15
|
+
can calculate other N** values. It also calculates the total number
|
16
|
+
of sequences and the total added length.
|
17
|
+
|
18
|
+
Usage:
|
19
|
+
$0 seqs.fa[ minlen[ **]]
|
20
|
+
|
21
|
+
seqs.fa A FastA file containing the sequences.
|
22
|
+
minlen (optional) The minimum length to take into consideration.
|
23
|
+
By default: 0.
|
24
|
+
** Value N** to calculate. By default: 50 (N50).
|
25
|
+
";
|
26
|
+
$minlen ||= 0;
|
27
|
+
$n__ ||= 50;
|
28
|
+
|
29
|
+
my @len = ();
|
30
|
+
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
|
+
while(<SEQ>){
|
32
|
+
if(/^>/){
|
33
|
+
push @len, 0;
|
34
|
+
}else{
|
35
|
+
next if /^;/;
|
36
|
+
chomp;
|
37
|
+
s/\W//g;
|
38
|
+
$len[-1]+=length $_;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
close SEQ;
|
42
|
+
@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
|
43
|
+
my $tot = (sum(@len) || 0);
|
44
|
+
|
45
|
+
my $thr = $n__*$tot/100;
|
46
|
+
my $pos = 0;
|
47
|
+
for(@len){
|
48
|
+
$pos+= $_;
|
49
|
+
if($pos>=$thr){
|
50
|
+
print "N$n__: $_\n";
|
51
|
+
last;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
print "Sequences: ".scalar(@len)."\n";
|
55
|
+
print "Total length: $tot\n";
|
56
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
my($file, $content, $stretch) = @ARGV;
|
12
|
+
$file or die <<HELP
|
13
|
+
|
14
|
+
Description:
|
15
|
+
Filter sequences by N-content and presence of long homopolymers.
|
16
|
+
Usage:
|
17
|
+
$0 sequences.fa [content [stretch]] > filtered.fa
|
18
|
+
Where:
|
19
|
+
sequences.fa Input file in FastA format
|
20
|
+
content A number between 0 and 1 indicating the maximum proportion of Ns
|
21
|
+
(1 to turn off, 0.5 by default)
|
22
|
+
stretch A number indicating the maximum number of consecutive identical
|
23
|
+
nucleotides allowed (0 to turn off, 100 by default)
|
24
|
+
filtered.fa Filtered set of sequences.
|
25
|
+
|
26
|
+
HELP
|
27
|
+
;
|
28
|
+
($content ||= 0.5)+=0;
|
29
|
+
($stretch ||= 100)+=0;
|
30
|
+
|
31
|
+
my $good = 0;
|
32
|
+
my $N = 0;
|
33
|
+
|
34
|
+
FASTA: {
|
35
|
+
local $/ = "\n>";
|
36
|
+
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
37
|
+
SEQ: while(<FILE>){
|
38
|
+
$N++;
|
39
|
+
s/^;.*//gm;
|
40
|
+
s/>//g;
|
41
|
+
my($n,$s) = split /\n/, $_, 2;
|
42
|
+
(my $clean = $s) =~ s/[^ACTGN]//g;
|
43
|
+
if($content < 1){
|
44
|
+
(my $Ns = $clean) =~ s/[^N]//g;
|
45
|
+
next SEQ if length($Ns)>length($clean)*$content;
|
46
|
+
}
|
47
|
+
if($stretch > 0){
|
48
|
+
for my $nuc (qw(A C T G N)){
|
49
|
+
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
50
|
+
}
|
51
|
+
}
|
52
|
+
print ">$n\n$s\n";
|
53
|
+
$good++;
|
54
|
+
}
|
55
|
+
close FILE;
|
56
|
+
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
57
|
+
}
|
58
|
+
|
59
|
+
|
60
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
$#ARGV>=0 or die "
|
12
|
+
Usage:
|
13
|
+
$0 seqs.fa... > length.txt
|
14
|
+
|
15
|
+
seqs.fa One or more FastA files.
|
16
|
+
length.txt A table with the lengths of the sequences.
|
17
|
+
|
18
|
+
";
|
19
|
+
|
20
|
+
for my $fa (@ARGV){
|
21
|
+
open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
|
22
|
+
my $def = '';
|
23
|
+
my $len = 0;
|
24
|
+
while(<FA>){
|
25
|
+
next if /^;/;
|
26
|
+
if(m/^>(\S+)\s?/){
|
27
|
+
print "$def\t$len\n" if $def;
|
28
|
+
$def = $1;
|
29
|
+
$len = 0;
|
30
|
+
}else{
|
31
|
+
s/[^A-Za-z]//g;
|
32
|
+
$len+= length $_;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
print "$def\t$len\n" if $def;
|
36
|
+
close FA;
|
37
|
+
}
|
38
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update Oct-13-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
use Symbol;
|
11
|
+
|
12
|
+
my ($file, $base, $outN) = @ARGV;
|
13
|
+
|
14
|
+
$outN ||= 12;
|
15
|
+
($file and $base) or die "
|
16
|
+
Usage
|
17
|
+
$0 in_file.fa out_base[ no_files]
|
18
|
+
|
19
|
+
in_file.fa Input file in FastA format.
|
20
|
+
out_base Prefix for the name of the output files. It will
|
21
|
+
be appended with .<i>.fa, where <i> is a consecutive
|
22
|
+
number starting in 1.
|
23
|
+
no_files Number of files to generate. By default: 12.
|
24
|
+
|
25
|
+
";
|
26
|
+
|
27
|
+
|
28
|
+
my @outSym = ();
|
29
|
+
for my $i (1 .. $outN){
|
30
|
+
$outSym[$i-1] = gensym;
|
31
|
+
open $outSym[$i-1], ">", "$base.$i.fa" or
|
32
|
+
die "I can not create the file: $base.$i.fa: $!\n";
|
33
|
+
}
|
34
|
+
|
35
|
+
|
36
|
+
my($i, $seq) = (-1, '');
|
37
|
+
open FILE, "<", $file or die "I can not read the file: $file: $!\n";
|
38
|
+
while(my $ln=<FILE>){
|
39
|
+
next if $ln=~/^;/;
|
40
|
+
if($ln =~ m/^>/){
|
41
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
42
|
+
$i++;
|
43
|
+
$seq = '';
|
44
|
+
}
|
45
|
+
$seq.=$ln;
|
46
|
+
}
|
47
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
48
|
+
close FILE;
|
49
|
+
|
50
|
+
for(my $j=0; $j<$outN; $j++){
|
51
|
+
close $outSym[$j];
|
52
|
+
}
|
53
|
+
|
54
|
+
print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
|
55
|
+
|
data/utils/find-medoid.R
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
#= Load stuff
|
8
|
+
argv <- commandArgs(trailingOnly=T)
|
9
|
+
suppressPackageStartupMessages(library(ape))
|
10
|
+
|
11
|
+
find_medoids <- function(dist, out, clades) {
|
12
|
+
dist <- as.matrix(dist)
|
13
|
+
cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
|
14
|
+
medoids <- c()
|
15
|
+
for(i in cl){
|
16
|
+
lab <- strsplit(i, ',')[[1]]
|
17
|
+
cat('Clade of:', lab[1], '\n')
|
18
|
+
if(length(lab) == 1) {
|
19
|
+
med <- lab
|
20
|
+
} else {
|
21
|
+
med <- lab[which.min(colSums(dist[lab, lab], na.rm = TRUE))]
|
22
|
+
}
|
23
|
+
medoids <- c(medoids, med)
|
24
|
+
}
|
25
|
+
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
26
|
+
}
|
27
|
+
|
28
|
+
#= Main
|
29
|
+
load(argv[1])
|
30
|
+
find_medoids(dist = ani.d, out = argv[2], clades = argv[3])
|
31
|
+
|
data/utils/representatives.rb
CHANGED
@@ -6,15 +6,18 @@ require 'miga'
|
|
6
6
|
proj_path = ARGV.shift or raise "Usage: #{$0} path/to/project"
|
7
7
|
|
8
8
|
# Load MiGA object
|
9
|
-
p = MiGA::Project.load(proj_path) or
|
10
|
-
|
11
|
-
|
9
|
+
p = MiGA::Project.load(proj_path) or
|
10
|
+
raise "Cannot load project: #{proj_path}"
|
11
|
+
pr = p.result(:clade_finding) or
|
12
|
+
raise "Unavailable result: clade_finding"
|
13
|
+
pf = pr.file_path(:clades_ani95) or
|
14
|
+
raise "Unavailable result file: clades_ani95"
|
12
15
|
|
13
16
|
# Read ANIspp
|
14
17
|
ani_spp = []
|
15
18
|
File.open(pf, 'r') do |fh|
|
16
19
|
fh.each_line do |ln|
|
17
|
-
next if $.==1
|
20
|
+
next if $.==1 and ln.chomp == 'G' # <- Legacy check
|
18
21
|
ani_spp << ln.chomp.split(',')
|
19
22
|
end
|
20
23
|
end
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# High-end pipelines for SubcladeRunner
|
3
3
|
module MiGA::SubcladeRunner::Pipeline
|
4
4
|
|
5
|
-
# Run species-level clusterings using ANI>95% / AAI>90%
|
5
|
+
# Run species-level clusterings using ANI > 95% / AAI > 90%
|
6
6
|
def cluster_species
|
7
7
|
tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
|
8
8
|
tasks.each do |k, par|
|
@@ -23,8 +23,20 @@ module MiGA::SubcladeRunner::Pipeline
|
|
23
23
|
end
|
24
24
|
ofh.close
|
25
25
|
# Cluster genomes
|
26
|
-
`ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
26
|
+
`ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
|
+
File.open(ogs_file, 'w') do |fh|
|
28
|
+
File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
|
29
|
+
fh.puts ln if lno != 0
|
30
|
+
end
|
31
|
+
end
|
32
|
+
File.unlink "#{ogs_file}.tmp"
|
27
33
|
end
|
34
|
+
|
35
|
+
# Find species medoids
|
36
|
+
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
37
|
+
`Rscript '#{src}' miga-project.dist.rdata \
|
38
|
+
miga-project.ani95-medoids miga-project.ani95-clades`
|
39
|
+
|
28
40
|
# Propose clades
|
29
41
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
30
42
|
File.open('miga-project.ani95-clades', 'r') do |ifh|
|
@@ -42,7 +54,8 @@ module MiGA::SubcladeRunner::Pipeline
|
|
42
54
|
step = :"#{metric}_distances"
|
43
55
|
metric_res = project.result(step) or raise "Incomplete step #{step}"
|
44
56
|
matrix = metric_res.file_path(:matrix)
|
45
|
-
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'
|
57
|
+
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
|
58
|
+
miga-project.ani95-medoids`
|
46
59
|
File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
|
47
60
|
File.exist? 'miga-project.nwk'
|
48
61
|
end
|
data/utils/subclade/runner.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'base.rb'
|
|
3
3
|
require_relative 'temporal.rb'
|
4
4
|
require_relative 'pipeline.rb'
|
5
5
|
|
6
|
-
|
7
6
|
class MiGA::SubcladeRunner
|
8
7
|
|
9
8
|
include MiGA::SubcladeRunner::Temporal
|
@@ -17,7 +16,7 @@ class MiGA::SubcladeRunner
|
|
17
16
|
raise "No project at #{project_path}"
|
18
17
|
@step = step.to_sym
|
19
18
|
clades_dir = File.expand_path('data/10.clades', project.path)
|
20
|
-
@home = File.expand_path(@step
|
19
|
+
@home = File.expand_path(@step == :clade_finding ? '01.find' : '02.ani',
|
21
20
|
clades_dir)
|
22
21
|
@opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
|
23
22
|
end
|
@@ -29,7 +28,7 @@ class MiGA::SubcladeRunner
|
|
29
28
|
Dir.mktmpdir do |tmp_dir|
|
30
29
|
@tmp = tmp_dir
|
31
30
|
create_temporals
|
32
|
-
step
|
31
|
+
step == :clade_finding ? go_clade_finding! : go_subclades!
|
33
32
|
end
|
34
33
|
end
|
35
34
|
|
data/utils/subclades.R
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#
|
6
6
|
|
7
7
|
#= Load stuff
|
8
|
-
argv <- commandArgs(trailingOnly=
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
@@ -13,38 +13,44 @@ suppressPackageStartupMessages(library(parallel))
|
|
13
13
|
suppressPackageStartupMessages(library(enveomics.R))
|
14
14
|
|
15
15
|
#= Main function
|
16
|
-
subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
|
17
|
-
say(
|
16
|
+
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
17
|
+
say('==> Out base:', out_base, '<==')
|
18
18
|
|
19
19
|
# Normalize input matrix
|
20
|
-
dist_rdata = paste(out_base,
|
20
|
+
dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
|
21
21
|
if(!missing(ani_file)){
|
22
|
-
if(length(ani.d)==0 && !file.exists(dist_rdata)){
|
22
|
+
if(length(ani.d) == 0 && !file.exists(dist_rdata)){
|
23
23
|
# Read from ani_file
|
24
|
-
a <- read.table(gzfile(ani_file), sep=
|
24
|
+
a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
|
25
25
|
if(nrow(a)==0){
|
26
26
|
generate_empty_files(out_base)
|
27
27
|
return(NULL)
|
28
28
|
}
|
29
|
-
|
29
|
+
if(!is.na(sel) and file.exists(sel)){
|
30
|
+
say('Filter selection')
|
31
|
+
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
32
|
+
a <- a[a$a %in% lab & a$b %in% lab, ]
|
33
|
+
}
|
34
|
+
say('Distances')
|
30
35
|
a$d <- 1 - (a$value/100)
|
31
|
-
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
|
32
|
-
save(ani.d, file=dist_rdata)
|
36
|
+
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
|
37
|
+
save(ani.d, file = dist_rdata)
|
33
38
|
}
|
34
39
|
}
|
35
40
|
|
36
41
|
# Read result if the subclade is ready, run it otherwise
|
37
|
-
if(file.exists(paste(out_base,
|
42
|
+
if(file.exists(paste(out_base, 'classif', sep = '.'))){
|
38
43
|
say("Loading")
|
39
|
-
ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
|
40
|
-
|
41
|
-
a <- read.table(paste(out_base,"classif",sep="."),
|
44
|
+
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
45
|
+
sep = ' ', as.is = TRUE)[,1]
|
46
|
+
a <- read.table(paste(out_base, "classif", sep="."),
|
47
|
+
sep = '\t', as.is = TRUE)
|
42
48
|
ani.types <- a[,2]
|
43
49
|
names(ani.types) <- a[,1]
|
44
|
-
if(length(ani.d)==0) load(dist_rdata)
|
50
|
+
if(length(ani.d) == 0) load(dist_rdata)
|
45
51
|
}else{
|
46
52
|
res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
|
47
|
-
if(length(res)==0) return(NULL)
|
53
|
+
if(length(res) == 0) return(NULL)
|
48
54
|
ani.medoids <- res[['ani.medoids']]
|
49
55
|
ani.types <- res[['ani.types']]
|
50
56
|
ani.d <- res[['ani.d']]
|
@@ -230,7 +236,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
|
230
236
|
}
|
231
237
|
|
232
238
|
#= Main
|
233
|
-
options(warn=1)
|
234
|
-
subclades(ani_file=argv[1], out_base=argv[2],
|
235
|
-
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
239
|
+
options(warn = 1)
|
240
|
+
subclades(ani_file = argv[1], out_base = argv[2],
|
241
|
+
thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
|
236
242
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -341,7 +341,6 @@ files:
|
|
341
341
|
- utils/enveomics/Scripts/gi2tax.rb
|
342
342
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
343
343
|
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
344
|
-
- utils/enveomics/Scripts/lib/enveomics.R
|
345
344
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
346
345
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
347
346
|
- utils/enveomics/Scripts/lib/enveomics_rb/og.rb
|
@@ -454,6 +453,7 @@ files:
|
|
454
453
|
- utils/enveomics/enveomics.R/man/z$-methods.Rd
|
455
454
|
- utils/enveomics/globals.mk
|
456
455
|
- utils/enveomics/manifest.json
|
456
|
+
- utils/find-medoid.R
|
457
457
|
- utils/index_metadata.rb
|
458
458
|
- utils/mytaxa_scan.R
|
459
459
|
- utils/mytaxa_scan.rb
|
@@ -495,7 +495,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
495
495
|
version: '0'
|
496
496
|
requirements: []
|
497
497
|
rubyforge_project:
|
498
|
-
rubygems_version: 2.
|
498
|
+
rubygems_version: 2.2.2
|
499
499
|
signing_key:
|
500
500
|
specification_version: 4
|
501
501
|
summary: MiGA
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Scripts/lib/../../enveomics.R
|