miga-base 0.3.5.1 → 0.3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/miga/project/result.rb +30 -29
- data/lib/miga/version.rb +1 -1
- data/utils/distance/commands.rb +12 -0
- data/utils/distance/database.rb +38 -5
- data/utils/distance/runner.rb +31 -15
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
- data/utils/find-medoid.R +31 -0
- data/utils/representatives.rb +7 -4
- data/utils/subclade/pipeline.rb +16 -3
- data/utils/subclade/runner.rb +2 -3
- data/utils/subclades.R +24 -18
- metadata +4 -4
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8b285b9906876a9f1b5366f929a4776d1689dbc1
|
4
|
+
data.tar.gz: 83b6843d00417fef4a8de18e4a102ad4d1899f0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: acfa6eb243f7fa8985cb649ab3b701db68515e8a18c221d94cb149e51cddeec49642b6176da46956765cca8f1961aa88d71b5cb9625131cf64f9287e79e173c6
|
7
|
+
data.tar.gz: 4ac1f2f81854959679b53d4865efba3a36ddca14216ff9f2aef06e1c27b7b415b47c833387e73ddf3cae41369f382146f1041b9a833743d58d3aad0a954bd1ab
|
data/lib/miga/project/result.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
4
|
+
require 'miga/result'
|
5
|
+
require 'miga/project/base'
|
6
6
|
|
7
7
|
##
|
8
8
|
# Helper module including specific functions to add project results.
|
@@ -31,7 +31,7 @@ module MiGA::Project::Result
|
|
31
31
|
# Supported values include:
|
32
32
|
# - +force+: A Boolean indicating if the result must be re-indexed. If true,
|
33
33
|
# it implies save=true.
|
34
|
-
def add_result(name, save=true, opts={})
|
34
|
+
def add_result(name, save = true, opts = {})
|
35
35
|
return nil if @@RESULT_DIRS[name].nil?
|
36
36
|
base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
|
37
37
|
if opts[:force]
|
@@ -49,21 +49,21 @@ module MiGA::Project::Result
|
|
49
49
|
##
|
50
50
|
# Get the next distances task, saving intermediate results if +save+. Returns
|
51
51
|
# a Symbol.
|
52
|
-
def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
|
52
|
+
def next_distances(save = true) ; next_task(@@DISTANCE_TASKS, save) ; end
|
53
53
|
|
54
54
|
##
|
55
55
|
# Get the next inclade task, saving intermediate results if +save+. Returns a
|
56
56
|
# Symbol.
|
57
|
-
def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
|
57
|
+
def next_inclade(save = true) ; next_task(@@INCLADE_TASKS, save) ; end
|
58
58
|
|
59
59
|
##
|
60
60
|
# Get the next task from +tasks+, saving intermediate results if +save+.
|
61
61
|
# Returns a Symbol.
|
62
|
-
def next_task(tasks
|
62
|
+
def next_task(tasks = @@DISTANCE_TASKS+@@INCLADE_TASKS, save = true)
|
63
63
|
tasks.find do |t|
|
64
|
-
if metadata["run_#{t}"]==false or
|
64
|
+
if metadata["run_#{t}"] == false or
|
65
65
|
(!is_clade? and @@INCLADE_TASKS.include?(t) and
|
66
|
-
metadata["run_#{t}"]!=true)
|
66
|
+
metadata["run_#{t}"] != true)
|
67
67
|
false
|
68
68
|
else
|
69
69
|
add_result(t, save).nil?
|
@@ -79,10 +79,10 @@ module MiGA::Project::Result
|
|
79
79
|
def add_result_distances(base)
|
80
80
|
return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
|
81
81
|
r = MiGA::Result.new("#{base}.json")
|
82
|
-
r.add_file(:rdata,
|
83
|
-
r.add_file(:matrix,
|
84
|
-
r.add_file(:log,
|
85
|
-
r.add_file(:hist,
|
82
|
+
r.add_file(:rdata, 'miga-project.Rdata')
|
83
|
+
r.add_file(:matrix, 'miga-project.txt')
|
84
|
+
r.add_file(:log, 'miga-project.log')
|
85
|
+
r.add_file(:hist, 'miga-project.hist')
|
86
86
|
r
|
87
87
|
end
|
88
88
|
|
@@ -91,10 +91,11 @@ module MiGA::Project::Result
|
|
91
91
|
return nil unless is_clade? or result_files_exist?(base,
|
92
92
|
%w[.pdf .classif .medoids .class.tsv .class.nwk])
|
93
93
|
r = add_result_iter_clades(base)
|
94
|
-
r.add_file(:aai_tree,
|
95
|
-
r.add_file(:proposal,
|
96
|
-
r.add_file(:clades_aai90,
|
97
|
-
r.add_file(:clades_ani95,
|
94
|
+
r.add_file(:aai_tree, 'miga-project.aai.nwk')
|
95
|
+
r.add_file(:proposal, 'miga-project.proposed-clades')
|
96
|
+
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
97
|
+
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
98
|
+
r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
|
98
99
|
r
|
99
100
|
end
|
100
101
|
|
@@ -102,28 +103,28 @@ module MiGA::Project::Result
|
|
102
103
|
return nil unless result_files_exist?(base,
|
103
104
|
%w[.pdf .classif .medoids .class.tsv .class.nwk])
|
104
105
|
r = add_result_iter_clades(base)
|
105
|
-
r.add_file(:ani_tree,
|
106
|
+
r.add_file(:ani_tree, 'miga-project.ani.nwk')
|
106
107
|
r
|
107
108
|
end
|
108
109
|
|
109
110
|
def add_result_iter_clades(base)
|
110
111
|
r = MiGA::Result.new("#{base}.json")
|
111
|
-
r.add_file(:report,
|
112
|
-
r.add_file(:class_table,
|
113
|
-
r.add_file(:class_tree,
|
114
|
-
r.add_file(:classif,
|
115
|
-
r.add_file(:medoids,
|
112
|
+
r.add_file(:report, 'miga-project.pdf')
|
113
|
+
r.add_file(:class_table, 'miga-project.class.tsv')
|
114
|
+
r.add_file(:class_tree, 'miga-project.class.nwk')
|
115
|
+
r.add_file(:classif, 'miga-project.classif')
|
116
|
+
r.add_file(:medoids, 'miga-project.medoids')
|
116
117
|
r
|
117
118
|
end
|
118
119
|
|
119
120
|
def add_result_ogs(base)
|
120
121
|
return nil unless result_files_exist?(base, %w[.ogs .stats])
|
121
122
|
r = MiGA::Result.new("#{base}.json")
|
122
|
-
r.add_file(:ogs,
|
123
|
-
r.add_file(:abc,
|
124
|
-
r.add_file(:stats,
|
125
|
-
r.add_file(:core_pan,
|
126
|
-
r.add_file(:core_pan_plot,
|
123
|
+
r.add_file(:ogs, 'miga-project.ogs')
|
124
|
+
r.add_file(:abc, 'miga-project.abc')
|
125
|
+
r.add_file(:stats, 'miga-project.stats')
|
126
|
+
r.add_file(:core_pan, 'miga-project.core-pan.tsv')
|
127
|
+
r.add_file(:core_pan_plot, 'miga-project.core-pan.pdf')
|
127
128
|
r
|
128
129
|
end
|
129
130
|
|
@@ -131,8 +132,8 @@ module MiGA::Project::Result
|
|
131
132
|
return nil unless
|
132
133
|
result_files_exist?(base, %w[.taxonomy.json .metadata.db])
|
133
134
|
r = MiGA::Result.new("#{base}.json")
|
134
|
-
r.add_file(:taxonomy_index,
|
135
|
-
r.add_file(:metadata_index,
|
135
|
+
r.add_file(:taxonomy_index, 'miga-project.taxonomy.json')
|
136
|
+
r.add_file(:metadata_index, 'miga-project.metadata.db')
|
136
137
|
r
|
137
138
|
end
|
138
139
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 6, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/utils/distance/commands.rb
CHANGED
@@ -18,6 +18,7 @@ module MiGA::DistanceRunner::Commands
|
|
18
18
|
dataset.name, target.name, tmp_dbs[:aai]).tap{ checkpoint :aai }
|
19
19
|
end
|
20
20
|
|
21
|
+
##
|
21
22
|
# Estimates AAI against +target+ using hAAI
|
22
23
|
def haai(target)
|
23
24
|
haai = aai_cmd(tmp_file("ess_genes.fa"),
|
@@ -34,6 +35,7 @@ module MiGA::DistanceRunner::Commands
|
|
34
35
|
aai
|
35
36
|
end
|
36
37
|
|
38
|
+
##
|
37
39
|
# Calculates ANI against +target+
|
38
40
|
def ani(target)
|
39
41
|
# Check if the request makes sense
|
@@ -49,6 +51,15 @@ module MiGA::DistanceRunner::Commands
|
|
49
51
|
dataset.name, target.name, tmp_dbs[:ani]).tap{ checkpoint :ani }
|
50
52
|
end
|
51
53
|
|
54
|
+
##
|
55
|
+
# Calculates and returns ANI against +target+ if AAI >= 85%. Returns
|
56
|
+
# +nil+ otherwise
|
57
|
+
def ani_after_aai(target)
|
58
|
+
aai = aai(target)
|
59
|
+
ani(target) unless aai.nil? or aai < 85.0
|
60
|
+
end
|
61
|
+
|
62
|
+
##
|
52
63
|
# Execute an AAI command
|
53
64
|
def aai_cmd(f1, f2, n1, n2, db, o={})
|
54
65
|
o = opts.merge(o)
|
@@ -59,6 +70,7 @@ module MiGA::DistanceRunner::Commands
|
|
59
70
|
(v.nil? or v.empty?) ? 0 : v.to_f
|
60
71
|
end
|
61
72
|
|
73
|
+
##
|
62
74
|
# Execute an ANI command
|
63
75
|
def ani_cmd(f1, f2, n1, n2, db, o={})
|
64
76
|
o = opts.merge(o)
|
data/utils/distance/database.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'sqlite3'
|
3
3
|
|
4
4
|
module MiGA::DistanceRunner::Database
|
5
|
+
##
|
5
6
|
# Check for corrupt files and create empty databases
|
6
7
|
def initialize_dbs!(for_ref)
|
7
8
|
@dbs = {}
|
@@ -33,6 +34,7 @@ module MiGA::DistanceRunner::Database
|
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
37
|
+
##
|
36
38
|
# Path to the database +metric+ for +dataset_name+ in +project+
|
37
39
|
# (assumes that +dataset_name+ is a reference dataset)
|
38
40
|
def ref_db(metric, dataset_name=nil)
|
@@ -48,35 +50,66 @@ module MiGA::DistanceRunner::Database
|
|
48
50
|
File.expand_path(b, home)
|
49
51
|
end
|
50
52
|
|
53
|
+
##
|
51
54
|
# Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
|
52
55
|
# query dataset)
|
53
56
|
def query_db(metric)
|
54
57
|
File.expand_path("#{dataset.name}.#{metric}.db", home)
|
55
58
|
end
|
56
59
|
|
60
|
+
##
|
57
61
|
# Get the stored +metric+ value against +target+
|
58
62
|
def stored_value(target, metric)
|
59
63
|
# Check if self.dataset -> target is done (previous run)
|
60
64
|
y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
|
61
65
|
return y unless y.nil? or y.zero?
|
66
|
+
|
62
67
|
# Check if self.dataset <- target is done (another thread)
|
63
|
-
if dataset.is_ref? and project.path==ref_project.path
|
64
|
-
y =
|
65
|
-
|
68
|
+
if dataset.is_ref? and project.path == ref_project.path
|
69
|
+
y = data_from_db(
|
70
|
+
target.name, dataset.name, ref_db(metric, target.name), metric)
|
71
|
+
unless y.nil? or y.first.zero?
|
72
|
+
# Store a copy
|
73
|
+
data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
|
74
|
+
return y.first
|
75
|
+
end
|
66
76
|
end
|
67
77
|
nil
|
68
78
|
end
|
69
79
|
|
80
|
+
##
|
70
81
|
# Get the value of +metric+ in the +db+ database between +n1+ and +n2+
|
71
82
|
def value_from_db(n1, n2, db, metric)
|
83
|
+
y = data_from_db(n1, n2, db, metric)
|
84
|
+
y.first unless y.nil?
|
85
|
+
end
|
86
|
+
|
87
|
+
##
|
88
|
+
# Get the +metric+ data in the +db+ database between +n1+ and +n2+. Returns an
|
89
|
+
# Array with the metric, standard deviation, number of matches, and maximum
|
90
|
+
# possible number of matches
|
91
|
+
def data_from_db(n1, n2, db, metric)
|
72
92
|
y = nil
|
73
93
|
SQLite3::Database.new(db) do |conn|
|
74
|
-
y = conn.execute(
|
75
|
-
|
94
|
+
y = conn.execute(
|
95
|
+
"select #{metric}, sd, n, omega from #{metric} where seq1=? and seq2=?",
|
96
|
+
[n1, n2]).first
|
76
97
|
end if File.size? db
|
77
98
|
y
|
78
99
|
end
|
79
100
|
|
101
|
+
##
|
102
|
+
# Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
|
103
|
+
def data_to_db(n1, n2, db, metric, data)
|
104
|
+
SQLite3::Database.new(db) do |conn|
|
105
|
+
conn.execute(
|
106
|
+
"insert into #{metric} (seq1, seq2, #{metric}, sd, n, omega) " +
|
107
|
+
"values (?, ?, ?, ?, ?, ?)", [n1, n2] + data)
|
108
|
+
end
|
109
|
+
checkpoint metric
|
110
|
+
end
|
111
|
+
|
112
|
+
##
|
80
113
|
# Iterates for each entry in +db+
|
81
114
|
def foreach_in_db(db, metric, &blk)
|
82
115
|
SQLite3::Database.new(db) do |conn|
|
data/utils/distance/runner.rb
CHANGED
@@ -30,7 +30,9 @@ class MiGA::DistanceRunner
|
|
30
30
|
if opts[:run_taxonomy] && project.metadata[:ref_project]
|
31
31
|
@home = File.expand_path('05.taxonomy', @home)
|
32
32
|
@ref_project = MiGA::Project.load(project.metadata[:ref_project])
|
33
|
-
|
33
|
+
if @ref_project.nil?
|
34
|
+
raise "Cannot load reference project: #{project.metadata[:ref_project]}"
|
35
|
+
end
|
34
36
|
else
|
35
37
|
@ref_project = project
|
36
38
|
end
|
@@ -55,47 +57,61 @@ class MiGA::DistanceRunner
|
|
55
57
|
def go_ref!
|
56
58
|
# Initialize databases
|
57
59
|
initialize_dbs! true
|
60
|
+
|
58
61
|
# first-come-first-serve traverse
|
59
62
|
ref_project.each_dataset do |ds|
|
60
63
|
next if !ds.is_ref? or ds.is_multi? or ds.result(:essential_genes).nil?
|
61
64
|
puts "[ #{Time.now} ] #{ds.name}"
|
62
|
-
|
63
|
-
ani(ds) unless aai.nil? or aai < 90.0
|
65
|
+
ani_after_aai(ds)
|
64
66
|
end
|
67
|
+
|
65
68
|
# Finalize
|
66
69
|
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
|
67
70
|
end
|
68
71
|
|
72
|
+
##
|
69
73
|
# Launch analysis for query datasets
|
70
74
|
def go_query!
|
71
75
|
# Check if project is ready
|
72
|
-
|
73
|
-
res = ref_project.result(
|
76
|
+
tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
|
77
|
+
res = ref_project.result(tsk[0])
|
74
78
|
return if res.nil?
|
79
|
+
|
75
80
|
# Initialize the databases
|
76
81
|
initialize_dbs! false
|
77
82
|
# Calculate the classification-informed AAI/ANI traverse
|
78
|
-
results = File.expand_path("#{dataset.name}.#{
|
79
|
-
fh = File.open(results,
|
80
|
-
classif, val_cls = *classify(res.dir,
|
83
|
+
results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
|
84
|
+
fh = File.open(results, 'w')
|
85
|
+
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
|
81
86
|
fh.close
|
87
|
+
|
82
88
|
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
83
89
|
par_dir = File.dirname(File.expand_path(classif, res.dir))
|
84
|
-
par = File.expand_path(
|
90
|
+
par = File.expand_path('miga-project.classif', par_dir)
|
91
|
+
closest = {ds: nil, ani: 0.0} # :ds is the key used when the closest dataset is updated and looked up
|
85
92
|
if File.size? par
|
86
|
-
File.open(par,
|
93
|
+
File.open(par, 'r') do |fh|
|
87
94
|
fh.each_line do |ln|
|
88
95
|
r = ln.chomp.split("\t")
|
89
|
-
next unless r[1].to_i==val_cls
|
90
|
-
|
91
|
-
|
92
|
-
ani(target) if aai >= 90.0
|
96
|
+
next unless r[1].to_i == val_cls
|
97
|
+
ani = ani_after_aai(ref_project.dataset(r[0]))
|
98
|
+
closest = {ds: r[0], ani: ani} unless ani.nil? or ani < closest[:ani]
|
93
99
|
end
|
94
100
|
end
|
95
101
|
end
|
102
|
+
|
103
|
+
# Calculate all the AAIs/ANIs against the closest ANI95-clade (if ANI >= 95%)
|
104
|
+
cl_path = File.expand_path('miga-project.ani95-clades', home)
|
105
|
+
if File.size? cl_path and tsk[0] == :clade_finding and closest[:ani] >= 95.0
|
106
|
+
File.foreach(cl_path).
|
107
|
+
map { |i| i.chomp.split(',') }.
|
108
|
+
find { |i| i.include? closest[:ds] }.to_a.
|
109
|
+
each { |i| ani_after_aai(ref_project.dataset(i)) }
|
110
|
+
end
|
111
|
+
|
96
112
|
# Finalize
|
97
113
|
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
|
98
|
-
build_medoids_tree(
|
114
|
+
build_medoids_tree(tsk[1])
|
99
115
|
transfer_taxonomy(tax_test)
|
100
116
|
end
|
101
117
|
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update: Oct 07 2015
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
#
|
7
|
+
use strict;
|
8
|
+
use warnings;
|
9
|
+
use List::Util qw/sum min max/;
|
10
|
+
|
11
|
+
my ($seqs, $minlen, $n__) = @ARGV;
|
12
|
+
$seqs or die "
|
13
|
+
Description:
|
14
|
+
Calculates the N50 value of a set of sequences. Alternatively, it
|
15
|
+
can calculate other N** values. It also calculates the total number
|
16
|
+
of sequences and the total added length.
|
17
|
+
|
18
|
+
Usage:
|
19
|
+
$0 seqs.fa[ minlen[ **]]
|
20
|
+
|
21
|
+
seqs.fa A FastA file containing the sequences.
|
22
|
+
minlen (optional) The minimum length to take into consideration.
|
23
|
+
By default: 0.
|
24
|
+
** Value N** to calculate. By default: 50 (N50).
|
25
|
+
";
|
26
|
+
$minlen ||= 0;
|
27
|
+
$n__ ||= 50;
|
28
|
+
|
29
|
+
my @len = ();
|
30
|
+
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
|
+
while(<SEQ>){
|
32
|
+
if(/^>/){
|
33
|
+
push @len, 0;
|
34
|
+
}else{
|
35
|
+
next if /^;/;
|
36
|
+
chomp;
|
37
|
+
s/\W//g;
|
38
|
+
$len[-1]+=length $_;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
close SEQ;
|
42
|
+
@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
|
43
|
+
my $tot = (sum(@len) || 0);
|
44
|
+
|
45
|
+
my $thr = $n__*$tot/100;
|
46
|
+
my $pos = 0;
|
47
|
+
for(@len){
|
48
|
+
$pos+= $_;
|
49
|
+
if($pos>=$thr){
|
50
|
+
print "N$n__: $_\n";
|
51
|
+
last;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
print "Sequences: ".scalar(@len)."\n";
|
55
|
+
print "Total length: $tot\n";
|
56
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
my($file, $content, $stretch) = @ARGV;
|
12
|
+
$file or die <<HELP
|
13
|
+
|
14
|
+
Description:
|
15
|
+
Filter sequences by N-content and presence of long homopolymers.
|
16
|
+
Usage:
|
17
|
+
$0 sequences.fa [content [stretch]] > filtered.fa
|
18
|
+
Where:
|
19
|
+
sequences.fa Input file in FastA format
|
20
|
+
content A number between 0 and 1 indicating the maximum proportion of Ns
|
21
|
+
(1 to turn off, 0.5 by default)
|
22
|
+
stretch A number indicating the maximum number of consecutive identical
|
23
|
+
nucleotides allowed (0 to turn off, 100 by default)
|
24
|
+
filtered.fa Filtered set of sequences.
|
25
|
+
|
26
|
+
HELP
|
27
|
+
;
|
28
|
+
$content = 0.5 unless defined($content) && length($content); $content += 0; # default only when missing/empty, so an explicit 0 is honored
|
29
|
+
$stretch = 100 unless defined($stretch) && length($stretch); $stretch += 0; # keep an explicit 0 ("turn off") instead of replacing it with the default
|
30
|
+
|
31
|
+
my $good = 0;
|
32
|
+
my $N = 0;
|
33
|
+
|
34
|
+
FASTA: {
|
35
|
+
local $/ = "\n>";
|
36
|
+
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
37
|
+
SEQ: while(<FILE>){
|
38
|
+
$N++;
|
39
|
+
s/^;.*//gm;
|
40
|
+
s/>//g;
|
41
|
+
my($n,$s) = split /\n/, $_, 2;
|
42
|
+
(my $clean = $s) =~ s/[^ACTGN]//g;
|
43
|
+
if($content < 1){
|
44
|
+
(my $Ns = $clean) =~ s/[^N]//g;
|
45
|
+
next SEQ if length($Ns)>length($clean)*$content;
|
46
|
+
}
|
47
|
+
if($stretch > 0){
|
48
|
+
for my $nuc (qw(A C T G N)){
|
49
|
+
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
50
|
+
}
|
51
|
+
}
|
52
|
+
print ">$n\n$s\n";
|
53
|
+
$good++;
|
54
|
+
}
|
55
|
+
close FILE;
|
56
|
+
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
57
|
+
}
|
58
|
+
|
59
|
+
|
60
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
$#ARGV>=0 or die "
|
12
|
+
Usage:
|
13
|
+
$0 seqs.fa... > length.txt
|
14
|
+
|
15
|
+
seqs.fa One or more FastA files.
|
16
|
+
length.txt A table with the lengths of the sequences.
|
17
|
+
|
18
|
+
";
|
19
|
+
|
20
|
+
for my $fa (@ARGV){
|
21
|
+
open FA, "<", $fa or die "Cannot open file: $fa: $!\n";
|
22
|
+
my $def = '';
|
23
|
+
my $len = 0;
|
24
|
+
while(<FA>){
|
25
|
+
next if /^;/;
|
26
|
+
if(m/^>(\S+)\s?/){
|
27
|
+
print "$def\t$len\n" if $def;
|
28
|
+
$def = $1;
|
29
|
+
$len = 0;
|
30
|
+
}else{
|
31
|
+
s/[^A-Za-z]//g;
|
32
|
+
$len+= length $_;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
print "$def\t$len\n" if $def;
|
36
|
+
close FA;
|
37
|
+
}
|
38
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update Oct-13-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
use Symbol;
|
11
|
+
|
12
|
+
my ($file, $base, $outN) = @ARGV;
|
13
|
+
|
14
|
+
$outN ||= 12;
|
15
|
+
($file and $base) or die "
|
16
|
+
Usage
|
17
|
+
$0 in_file.fa out_base[ no_files]
|
18
|
+
|
19
|
+
in_file.fa Input file in FastA format.
|
20
|
+
out_base Prefix for the name of the output files. It will
|
21
|
+
be appended with .<i>.fa, where <i> is a consecutive
|
22
|
+
number starting in 1.
|
23
|
+
no_files Number of files to generate. By default: 12.
|
24
|
+
|
25
|
+
";
|
26
|
+
|
27
|
+
|
28
|
+
my @outSym = ();
|
29
|
+
for my $i (1 .. $outN){
|
30
|
+
$outSym[$i-1] = gensym;
|
31
|
+
open $outSym[$i-1], ">", "$base.$i.fa" or
|
32
|
+
die "I can not create the file: $base.$i.fa: $!\n";
|
33
|
+
}
|
34
|
+
|
35
|
+
|
36
|
+
my($i, $seq) = (-1, '');
|
37
|
+
open FILE, "<", $file or die "I can not read the file: $file: $!\n";
|
38
|
+
while(my $ln=<FILE>){
|
39
|
+
next if $ln=~/^;/;
|
40
|
+
if($ln =~ m/^>/){
|
41
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
42
|
+
$i++;
|
43
|
+
$seq = '';
|
44
|
+
}
|
45
|
+
$seq.=$ln;
|
46
|
+
}
|
47
|
+
print { $outSym[$i % $outN] } $seq if $seq;
|
48
|
+
close FILE;
|
49
|
+
|
50
|
+
for(my $j=0; $j<$outN; $j++){
|
51
|
+
close $outSym[$j];
|
52
|
+
}
|
53
|
+
|
54
|
+
print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n";
|
55
|
+
|
data/utils/find-medoid.R
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
#= Load stuff
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
|
+
suppressPackageStartupMessages(library(ape))
|
10
|
+
|
11
|
+
find_medoids <- function(dist, out, clades) {
|
12
|
+
dist <- as.matrix(dist)
|
13
|
+
cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
|
14
|
+
medoids <- c()
|
15
|
+
for(i in cl){
|
16
|
+
lab <- strsplit(i, ',')[[1]]
|
17
|
+
cat('Clade of:', lab[1], '\n')
|
18
|
+
if(length(lab) == 1) {
|
19
|
+
med <- lab
|
20
|
+
} else {
|
21
|
+
med <- lab[which.min(colSums(dist[lab, lab], na.rm = TRUE))]
|
22
|
+
}
|
23
|
+
medoids <- c(medoids, med)
|
24
|
+
}
|
25
|
+
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
26
|
+
}
|
27
|
+
|
28
|
+
#= Main
|
29
|
+
load(argv[1])
|
30
|
+
find_medoids(dist = ani.d, out = argv[2], clades = argv[3])
|
31
|
+
|
data/utils/representatives.rb
CHANGED
@@ -6,15 +6,18 @@ require 'miga'
|
|
6
6
|
proj_path = ARGV.shift or raise "Usage: #{$0} path/to/project"
|
7
7
|
|
8
8
|
# Load MiGA object
|
9
|
-
p = MiGA::Project.load(proj_path) or
|
10
|
-
|
11
|
-
|
9
|
+
p = MiGA::Project.load(proj_path) or
|
10
|
+
raise "Cannot load project: #{proj_path}"
|
11
|
+
pr = p.result(:clade_finding) or
|
12
|
+
raise "Unavailable result: clade_finding"
|
13
|
+
pf = pr.file_path(:clades_ani95) or
|
14
|
+
raise "Unavailable result file: clades_ani95"
|
12
15
|
|
13
16
|
# Read ANIspp
|
14
17
|
ani_spp = []
|
15
18
|
File.open(pf, 'r') do |fh|
|
16
19
|
fh.each_line do |ln|
|
17
|
-
next if $.==1
|
20
|
+
next if $.==1 and ln.chomp == 'G' # <- Legacy check
|
18
21
|
ani_spp << ln.chomp.split(',')
|
19
22
|
end
|
20
23
|
end
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# High-end pipelines for SubcladeRunner
|
3
3
|
module MiGA::SubcladeRunner::Pipeline
|
4
4
|
|
5
|
-
# Run species-level clusterings using ANI>95% / AAI>90%
|
5
|
+
# Run species-level clusterings using ANI > 95% / AAI > 90%
|
6
6
|
def cluster_species
|
7
7
|
tasks = {ani95: [:ani_distances, 95.0], aai90: [:aai_distances, 90.0]}
|
8
8
|
tasks.each do |k, par|
|
@@ -23,8 +23,20 @@ module MiGA::SubcladeRunner::Pipeline
|
|
23
23
|
end
|
24
24
|
ofh.close
|
25
25
|
# Cluster genomes
|
26
|
-
`ogs.mcl.rb -o '#{ogs_file}' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
26
|
+
`ogs.mcl.rb -o '#{ogs_file}.tmp' --abc '#{abc_path}' -t '#{opts[:thr]}'`
|
27
|
+
File.open(ogs_file, 'w') do |fh|
|
28
|
+
File.foreach("#{ogs_file}.tmp").with_index do |ln, lno|
|
29
|
+
fh.puts ln if lno != 0
|
30
|
+
end
|
31
|
+
end
|
32
|
+
File.unlink "#{ogs_file}.tmp"
|
27
33
|
end
|
34
|
+
|
35
|
+
# Find species medoids
|
36
|
+
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
37
|
+
`Rscript '#{src}' miga-project.dist.rdata \
|
38
|
+
miga-project.ani95-medoids miga-project.ani95-clades`
|
39
|
+
|
28
40
|
# Propose clades
|
29
41
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
30
42
|
File.open('miga-project.ani95-clades', 'r') do |ifh|
|
@@ -42,7 +54,8 @@ module MiGA::SubcladeRunner::Pipeline
|
|
42
54
|
step = :"#{metric}_distances"
|
43
55
|
metric_res = project.result(step) or raise "Incomplete step #{step}"
|
44
56
|
matrix = metric_res.file_path(:matrix)
|
45
|
-
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}'
|
57
|
+
`Rscript '#{src}' '#{matrix}' miga-project '#{opts[:thr]}' \
|
58
|
+
miga-project.ani95-medoids`
|
46
59
|
File.rename('miga-project.nwk',"miga-project.#{metric}.nwk") if
|
47
60
|
File.exist? 'miga-project.nwk'
|
48
61
|
end
|
data/utils/subclade/runner.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'base.rb'
|
|
3
3
|
require_relative 'temporal.rb'
|
4
4
|
require_relative 'pipeline.rb'
|
5
5
|
|
6
|
-
|
7
6
|
class MiGA::SubcladeRunner
|
8
7
|
|
9
8
|
include MiGA::SubcladeRunner::Temporal
|
@@ -17,7 +16,7 @@ class MiGA::SubcladeRunner
|
|
17
16
|
raise "No project at #{project_path}"
|
18
17
|
@step = step.to_sym
|
19
18
|
clades_dir = File.expand_path('data/10.clades', project.path)
|
20
|
-
@home = File.expand_path(@step
|
19
|
+
@home = File.expand_path(@step == :clade_finding ? '01.find' : '02.ani',
|
21
20
|
clades_dir)
|
22
21
|
@opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
|
23
22
|
end
|
@@ -29,7 +28,7 @@ class MiGA::SubcladeRunner
|
|
29
28
|
Dir.mktmpdir do |tmp_dir|
|
30
29
|
@tmp = tmp_dir
|
31
30
|
create_temporals
|
32
|
-
step
|
31
|
+
step == :clade_finding ? go_clade_finding! : go_subclades!
|
33
32
|
end
|
34
33
|
end
|
35
34
|
|
data/utils/subclades.R
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#
|
6
6
|
|
7
7
|
#= Load stuff
|
8
|
-
argv <- commandArgs(trailingOnly=
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
@@ -13,38 +13,44 @@ suppressPackageStartupMessages(library(parallel))
|
|
13
13
|
suppressPackageStartupMessages(library(enveomics.R))
|
14
14
|
|
15
15
|
#= Main function
|
16
|
-
subclades <- function(ani_file, out_base, thr=1, ani.d=dist(0)) {
|
17
|
-
say(
|
16
|
+
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
17
|
+
say('==> Out base:', out_base, '<==')
|
18
18
|
|
19
19
|
# Normalize input matrix
|
20
|
-
dist_rdata = paste(out_base,
|
20
|
+
dist_rdata = paste(out_base, 'dist.rdata', sep = '.')
|
21
21
|
if(!missing(ani_file)){
|
22
|
-
if(length(ani.d)==0 && !file.exists(dist_rdata)){
|
22
|
+
if(length(ani.d) == 0 && !file.exists(dist_rdata)){
|
23
23
|
# Read from ani_file
|
24
|
-
a <- read.table(gzfile(ani_file), sep=
|
24
|
+
a <- read.table(gzfile(ani_file), sep = '\t', header = TRUE, as.is = TRUE)
|
25
25
|
if(nrow(a)==0){
|
26
26
|
generate_empty_files(out_base)
|
27
27
|
return(NULL)
|
28
28
|
}
|
29
|
-
|
29
|
+
if(!is.na(sel) && file.exists(sel)){ # 'and' is not an R operator; && also skips file.exists() when sel is NA
|
30
|
+
say('Filter selection')
|
31
|
+
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
32
|
+
a <- a[a$a %in% lab & a$b %in% lab, ]
|
33
|
+
}
|
34
|
+
say('Distances')
|
30
35
|
a$d <- 1 - (a$value/100)
|
31
|
-
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d=max(a$d)*1.2)
|
32
|
-
save(ani.d, file=dist_rdata)
|
36
|
+
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
|
37
|
+
save(ani.d, file = dist_rdata)
|
33
38
|
}
|
34
39
|
}
|
35
40
|
|
36
41
|
# Read result if the subclade is ready, run it otherwise
|
37
|
-
if(file.exists(paste(out_base,
|
42
|
+
if(file.exists(paste(out_base, 'classif', sep = '.'))){
|
38
43
|
say("Loading")
|
39
|
-
ani.medoids <- read.table(paste(out_base, "medoids", sep="."),
|
40
|
-
|
41
|
-
a <- read.table(paste(out_base,"classif",sep="."),
|
44
|
+
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
45
|
+
sep = ' ', as.is = TRUE)[,1]
|
46
|
+
a <- read.table(paste(out_base, "classif", sep="."),
|
47
|
+
sep = '\t', as.is = TRUE)
|
42
48
|
ani.types <- a[,2]
|
43
49
|
names(ani.types) <- a[,1]
|
44
|
-
if(length(ani.d)==0) load(dist_rdata)
|
50
|
+
if(length(ani.d) == 0) load(dist_rdata)
|
45
51
|
}else{
|
46
52
|
res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
|
47
|
-
if(length(res)==0) return(NULL)
|
53
|
+
if(length(res) == 0) return(NULL)
|
48
54
|
ani.medoids <- res[['ani.medoids']]
|
49
55
|
ani.types <- res[['ani.types']]
|
50
56
|
ani.d <- res[['ani.d']]
|
@@ -230,7 +236,7 @@ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
|
230
236
|
}
|
231
237
|
|
232
238
|
#= Main
|
233
|
-
options(warn=1)
|
234
|
-
subclades(ani_file=argv[1], out_base=argv[2],
|
235
|
-
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
239
|
+
options(warn = 1)
|
240
|
+
subclades(ani_file = argv[1], out_base = argv[2],
|
241
|
+
thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])), sel = argv[4])
|
236
242
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-11-
|
11
|
+
date: 2018-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -341,7 +341,6 @@ files:
|
|
341
341
|
- utils/enveomics/Scripts/gi2tax.rb
|
342
342
|
- utils/enveomics/Scripts/in_silico_GA_GI.pl
|
343
343
|
- utils/enveomics/Scripts/lib/data/essential.hmm.gz
|
344
|
-
- utils/enveomics/Scripts/lib/enveomics.R
|
345
344
|
- utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
|
346
345
|
- utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
|
347
346
|
- utils/enveomics/Scripts/lib/enveomics_rb/og.rb
|
@@ -454,6 +453,7 @@ files:
|
|
454
453
|
- utils/enveomics/enveomics.R/man/z$-methods.Rd
|
455
454
|
- utils/enveomics/globals.mk
|
456
455
|
- utils/enveomics/manifest.json
|
456
|
+
- utils/find-medoid.R
|
457
457
|
- utils/index_metadata.rb
|
458
458
|
- utils/mytaxa_scan.R
|
459
459
|
- utils/mytaxa_scan.rb
|
@@ -495,7 +495,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
495
495
|
version: '0'
|
496
496
|
requirements: []
|
497
497
|
rubyforge_project:
|
498
|
-
rubygems_version: 2.
|
498
|
+
rubygems_version: 2.2.2
|
499
499
|
signing_key:
|
500
500
|
specification_version: 4
|
501
501
|
summary: MiGA
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
|
@@ -1 +0,0 @@
|
|
1
|
-
utils/enveomics/Scripts/lib/../../enveomics.R
|