miga-base 0.2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +351 -0
- data/actions/add_result +61 -0
- data/actions/add_taxonomy +86 -0
- data/actions/create_dataset +62 -0
- data/actions/create_project +70 -0
- data/actions/daemon +69 -0
- data/actions/download_dataset +77 -0
- data/actions/find_datasets +63 -0
- data/actions/import_datasets +86 -0
- data/actions/index_taxonomy +71 -0
- data/actions/list_datasets +83 -0
- data/actions/list_files +67 -0
- data/actions/unlink_dataset +52 -0
- data/bin/miga +48 -0
- data/lib/miga/daemon.rb +178 -0
- data/lib/miga/dataset.rb +286 -0
- data/lib/miga/gui.rb +289 -0
- data/lib/miga/metadata.rb +74 -0
- data/lib/miga/project.rb +268 -0
- data/lib/miga/remote_dataset.rb +154 -0
- data/lib/miga/result.rb +102 -0
- data/lib/miga/tax_index.rb +70 -0
- data/lib/miga/taxonomy.rb +107 -0
- data/lib/miga.rb +83 -0
- data/scripts/_distances_noref_nomulti.bash +86 -0
- data/scripts/_distances_ref_nomulti.bash +105 -0
- data/scripts/aai_distances.bash +40 -0
- data/scripts/ani_distances.bash +39 -0
- data/scripts/assembly.bash +38 -0
- data/scripts/cds.bash +45 -0
- data/scripts/clade_finding.bash +27 -0
- data/scripts/distances.bash +30 -0
- data/scripts/essential_genes.bash +29 -0
- data/scripts/haai_distances.bash +39 -0
- data/scripts/init.bash +211 -0
- data/scripts/miga.bash +12 -0
- data/scripts/mytaxa.bash +93 -0
- data/scripts/mytaxa_scan.bash +85 -0
- data/scripts/ogs.bash +36 -0
- data/scripts/read_quality.bash +37 -0
- data/scripts/ssu.bash +35 -0
- data/scripts/subclades.bash +26 -0
- data/scripts/trimmed_fasta.bash +47 -0
- data/scripts/trimmed_reads.bash +57 -0
- data/utils/adapters.fa +302 -0
- data/utils/mytaxa_scan.R +89 -0
- data/utils/mytaxa_scan.rb +58 -0
- data/utils/requirements.txt +19 -0
- data/utils/subclades-compile.rb +48 -0
- data/utils/subclades.R +171 -0
- metadata +185 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# @package MiGA
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Dec-07-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require "restclient"
|
9
|
+
|
10
|
+
module MiGA
  # Reference to one or more entries of a remote sequence database, which can
  # be downloaded and registered as a dataset of a MiGA project.
  class RemoteDataset
    # Class

    # Known remote sources ("universes"), keyed by name. Each entry holds:
    # +dbs+::    Databases served by the universe. Each one declares the
    #            download +format+ and either the result +stage+ that the
    #            download seeds or the database it maps to (+map_to+).
    # +url+::    +sprintf+ template receiving, in order: database, IDs joined
    #            by commas, format, and the +map_to+ database (or nil).
    # +method+:: Download method; only +:rest+ is implemented.
    #
    # NOTE(review): the +ncbi_map+ template fills +db=+ with the third
    # argument (the format) and ends in " - - - - -"; it looks like the fourth
    # argument (+map_to+) was intended. Left untouched -- confirm upstream.
    @@UNIVERSE = {
      ebi:{
        dbs: { embl:{stage: :assembly, format: :fasta} },
        url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
        method: :rest
      },
      ncbi:{
        dbs: { nuccore:{stage: :assembly, format: :fasta} },
        url: "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +
          "efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
        method: :rest
      },
      ncbi_map:{
        dbs: { assembly:{map_to: :nuccore, format: :text} },
        url: "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +
          "elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
        method: :rest,
        map_to_universe: :ncbi
      }
    }

    # Hash of supported universes.
    def self.UNIVERSE ; @@UNIVERSE ; end

    # Downloads the entries +ids+ (a single ID or an Array of IDs) of the
    # database +db+ in +universe+, requesting the given +format+. If +file+ is
    # not nil, the document is also written to that path. Returns the
    # downloaded document as a String.
    #
    # Raises a RuntimeError if the universe is unknown, if its download method
    # is not supported, or if the response code is not 200.
    def self.download(universe, db, ids, format, file=nil)
      ids = [ids] unless ids.is_a? Array
      source = @@UNIVERSE[universe]
      raise "Unknown Universe: #{universe}. Try one of: " +
        "#{@@UNIVERSE.keys}" if source.nil?
      case source[:method]
      when :rest
        # +db+ may be absent from :dbs (e.g., EBI's taxonomy); no mapping then
        db_conf = source[:dbs][db]
        map_to = db_conf.nil? ? nil : db_conf[:map_to]
        url = sprintf source[:url], db, ids.join(","), format, map_to
        response = RestClient::Request.execute(:method=>:get, :url=>url,
          :timeout=>600)
        raise "Unable to reach #{universe} client, error code " +
          "#{response.code}." unless response.code == 200
        doc = response.to_s
      else
        raise "Unexpected error: Unsupported download method for Universe " +
          "#{universe}."
      end
      # Block form guarantees the handle is closed even if the write fails
      File.open(file, "w") { |ofh| ofh.print doc } unless file.nil?
      doc
    end

    # Instance
    attr_reader :universe, :db, :ids

    # Registers the entries +ids+ (one ID or an Array) of database +db+ in
    # +universe+. Both +db+ and +universe+ are converted to Symbol.
    #
    # Raises a RuntimeError if the universe or the database are unknown, or if
    # the database requires ID mapping (+map_to+), which is not implemented.
    def initialize(ids, db, universe)
      @ids = ids.is_a?(Array) ? ids : [ids]
      @db = db.to_sym
      @universe = universe.to_sym
      raise "Unknown Universe: #{@universe}. Try one of: " +
        "#{@@UNIVERSE.keys}" unless @@UNIVERSE.key? @universe
      dbs = @@UNIVERSE[@universe][:dbs]
      raise "Unknown Database: #{@db}. Try one of: " +
        "#{dbs.keys}" unless dbs.key? @db
      # The mapping step was never written (it was a bare, argument-less call
      # to RemoteDataset.download), so fail with an explicit message instead.
      map_to = dbs[@db][:map_to]
      raise "Unsupported Database: #{@db} requires mapping to #{map_to}, " +
        "which is not implemented." unless map_to.nil?
    end

    # Downloads the remote entries and registers them as dataset +name+ in
    # +project+ (a Project, or a String passed to Project.new). When +name+ is
    # nil it is derived from the IDs. +is_ref+ and +metadata+ are passed to
    # Dataset.new, after adding remote metadata to the latter. Returns the new
    # Dataset.
    #
    # Raises a RuntimeError if the dataset already exists, if the database
    # does not seed a supported result, or if the seed result cannot be added.
    def save_to(project, name=nil, is_ref=true, metadata={})
      name = ids.join("_").miga_name if name.nil?
      project = Project.new(project) if project.is_a? String
      raise "Dataset #{name} exists in the project, aborting..." if
        Dataset.exist?(project, name)
      metadata = get_metadata(metadata)
      stage = @@UNIVERSE[universe][:dbs][db][:stage]
      case stage
      when :assembly
        base = project.path + "/data/" + Dataset.RESULT_DIRS[:assembly] +
          "/" + name
        File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
        download("#{base}.LargeContigs.fna")
        # The downloaded sequences stand for both contig sets
        File.symlink("#{base}.LargeContigs.fna", "#{base}.AllContigs.fna")
        File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
      else
        raise "Unexpected error: Unsupported result for database #{db}."
      end
      dataset = Dataset.new(project, name, is_ref, metadata)
      project.add_dataset(dataset.name)
      result = dataset.add_result stage
      raise "Empty dataset created: seed result was not added due to " +
        "incomplete files." if result.nil?
      dataset
    end

    # Adds the metadata available from the remote source to the +metadata+
    # Hash (modified in place) and returns it. Currently this is only the
    # taxonomy (+:tax+), for the +:ebi+ and +:ncbi+ universes.
    def get_metadata(metadata={})
      case universe
      when :ebi, :ncbi
        # Get taxonomy
        metadata[:tax] = get_ncbi_taxonomy
      end
      metadata
    end

    # Downloads the remote entries into +file+, in the format configured for
    # the database. Returns the downloaded document.
    def download(file)
      RemoteDataset.download(universe, db, ids,
        @@UNIVERSE[universe][:dbs][db][:format], file)
    end

    # Returns the NCBI taxonomy ID found in the annotation of the remote
    # entries as a String of digits, or nil if none is found. Raises a
    # RuntimeError for universes other than +:ebi+ and +:ncbi+.
    def get_ncbi_taxid
      case universe
      when :ebi
        doc = RemoteDataset.download(universe, db, ids, :annot).split(/\n/)
        ln = doc.grep(/^FT\s+\/db_xref="taxon:/).first
        ln = doc.grep(/^OX\s+NCBI_TaxID=/).first if ln.nil?
        taxid_from_line(ln, /.*(?:"taxon:|NCBI_TaxID=)(\d+)["; ].*/)
      when :ncbi
        doc = RemoteDataset.download(universe, db, ids, :gb).split(/\n/)
        ln = doc.grep(/^\s+\/db_xref="taxon:/).first
        taxid_from_line(ln, /.*(?:"taxon:)(\d+)["; ].*/)
      else
        raise "I don't know how to extract ncbi_taxids from #{universe}."
      end
    end

    # Builds the Taxonomy of the remote entries by walking the EBI taxonomy
    # records from the entries' taxon towards the root, following each
    # record's parent ID until no ID is available or ID 0 or 1 is reached.
    def get_ncbi_taxonomy
      lineage = {}
      tax_id = get_ncbi_taxid
      loop do
        break if tax_id.nil? || %w{0 1}.include?(tax_id)
        doc = RemoteDataset.download(:ebi, :taxonomy, tax_id, "")
        name = (doc.scan(/SCIENTIFIC NAME\s+:\s+(.+)/).first||[]).first
        rank = (doc.scan(/RANK\s+:\s+(.+)/).first||[]).first
        # An unranked first (lowest) node is registered as the dataset itself
        rank = "dataset" if lineage.empty? && rank=="no rank"
        lineage[rank] = name unless rank.nil?
        tax_id = (doc.scan(/PARENT ID\s+:\s+(.+)/).first||[]).first
      end
      Taxonomy.new(lineage)
    end

    private

    # Replaces the match of +pattern+ in +line+ by its first capture group and
    # returns the result if it consists only of digits, or nil otherwise
    # (including when +line+ is nil).
    def taxid_from_line(line, pattern)
      return nil if line.nil?
      taxid = line.sub(pattern, "\\1")
      taxid =~ /\A\d+\z/ ? taxid : nil
    end
  end
end
|
154
|
+
|
data/lib/miga/result.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#
|
2
|
+
# @package MiGA
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Dec-19-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
module MiGA
|
9
|
+
class Result
|
10
|
+
# Class
|
11
|
+
def self.exist? path
|
12
|
+
!!(File.size? path)
|
13
|
+
end
|
14
|
+
def self.load path
|
15
|
+
return nil unless Result.exist? path
|
16
|
+
Result.new path
|
17
|
+
end
|
18
|
+
# Instance
|
19
|
+
attr_reader :path, :data, :results
|
20
|
+
def initialize(path)
|
21
|
+
@path = path
|
22
|
+
if Result.exist? path
|
23
|
+
self.load
|
24
|
+
else
|
25
|
+
self.create
|
26
|
+
end
|
27
|
+
end
|
28
|
+
def dir
|
29
|
+
File.dirname(path)
|
30
|
+
end
|
31
|
+
def file_path(k)
|
32
|
+
k = k.to_sym
|
33
|
+
return nil if self[:files].nil? or self[:files][k].nil?
|
34
|
+
return File.expand_path(self[:files][k], dir) unless
|
35
|
+
self[:files][k].is_a? Array
|
36
|
+
self[:files][k].map{ |f| File.expand_path(f, dir) }
|
37
|
+
end
|
38
|
+
def [](k) data[k.to_sym] ; end
|
39
|
+
def add_file(k, file)
|
40
|
+
k = k.to_sym
|
41
|
+
self.data[:files] ||= {}
|
42
|
+
self.data[:files][k] = file if
|
43
|
+
File.exist? File.expand_path(file, dir)
|
44
|
+
self.data[:files][k] = file + ".gz" if
|
45
|
+
File.exist? File.expand_path(file + ".gz", dir)
|
46
|
+
end
|
47
|
+
def create
|
48
|
+
@data = {:created=>Time.now.to_s, :results=>[], :stats=>{}, :files=>{}}
|
49
|
+
self.save
|
50
|
+
end
|
51
|
+
def save
|
52
|
+
self.data[:updated] = Time.now.to_s
|
53
|
+
json = JSON.pretty_generate data
|
54
|
+
ofh = File.open(path, "w")
|
55
|
+
ofh.puts json
|
56
|
+
ofh.close
|
57
|
+
self.load
|
58
|
+
end
|
59
|
+
def load
|
60
|
+
json = File.read path
|
61
|
+
@data = JSON.parse(json, {:symbolize_names=>true})
|
62
|
+
@results = self[:results].map{ |rs| Result.new rs }
|
63
|
+
end
|
64
|
+
def remove!
|
65
|
+
each_file do |file|
|
66
|
+
f = File.expand_path(file, dir)
|
67
|
+
File.unlink_r(f) if File.exist? f
|
68
|
+
end
|
69
|
+
%w(.start .done).each do |ext|
|
70
|
+
f = path.sub(/\.json$/, ext)
|
71
|
+
File.unlink f if File.exist? f
|
72
|
+
end
|
73
|
+
File.unlink path
|
74
|
+
end
|
75
|
+
def each_file(&blk)
|
76
|
+
self.data[:files] = {} if self.data[:files].nil?
|
77
|
+
self.data[:files].each do |k,files|
|
78
|
+
files = [files] unless files.kind_of? Array
|
79
|
+
files.each do |file|
|
80
|
+
if blk.arity==1
|
81
|
+
blk.call file
|
82
|
+
elsif blk.arity==2
|
83
|
+
blk.call k, file
|
84
|
+
else
|
85
|
+
raise "Wrong number of arguments: #{blk.arity} for one or two"
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
def add_result(result)
|
91
|
+
self.data[:results] << result.path
|
92
|
+
self.save
|
93
|
+
end
|
94
|
+
def file_path(file)
|
95
|
+
f = self.data[:files][file.to_sym]
|
96
|
+
return nil if f.nil?
|
97
|
+
return File.dirname(self.path) + "/" + f unless f.is_a?(Array)
|
98
|
+
f.map{ |i| File.dirname(self.path) + "/" + i }
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
|
2
|
+
# @package MiGA
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Jul-09-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'miga/taxonomy'
|
9
|
+
|
10
|
+
module MiGA
  # Index of datasets arranged as a tree of taxa, with one level per known
  # taxonomic rank.
  class TaxIndex
    # Instance
    attr_reader :datasets, :root

    # Creates an empty index, rooted at the taxon "root:biota".
    def initialize
      @root = TaxIndexTaxon.new :root, "biota"
      @datasets = []
    end

    # Indexes +dataset+ under the taxon given by its +:tax+ metadata,
    # creating one node per rank in Taxonomy.KNOWN_RANKS (nodes of ranks
    # without a name in the taxonomy are unnamed). Returns nil, without
    # indexing, if the dataset has no taxonomy.
    def <<(dataset)
      tax = dataset.metadata[:tax]
      return nil if tax.nil?
      taxon = Taxonomy.KNOWN_RANKS.reduce(@root) do |node, rank|
        node.add_child(rank, tax[rank])
      end
      taxon.add_dataset dataset
      @datasets << dataset
    end

    # Pretty-printed JSON of the tree and the names of the indexed datasets.
    # Arguments are accepted (and ignored) so that the method can also be
    # called by the JSON generator.
    def to_json(*)
      JSON.pretty_generate(
        { root: root.to_hash, datasets: datasets.map { |d| d.name } })
    end

    # Indented text representation of the tree. Unnamed taxa without datasets
    # of their own are only listed if +unknown+ is true.
    def to_tab(unknown=false) ; root.to_tab(unknown) ; end
  end

  # Node of a TaxIndex: a taxon with its child taxa and the datasets assigned
  # directly to it.
  class TaxIndexTaxon
    # Instance
    attr_reader :rank, :name, :children, :datasets

    # Creates the taxon +name+ (may be nil if unknown) of the given +rank+.
    # The name is stored in its MiGA-name form.
    def initialize(rank, name)
      @rank = rank.to_sym
      @name = (name.nil? ? nil : name.miga_name)
      @children = []
      @datasets = []
    end

    # String "rank:name", with "?" standing for an unknown name.
    def tax_str ; "#{rank}:#{name.nil? ? "?" : name}" ; end

    # Returns the child taxon with the given +rank+ and +name+, creating it
    # if it does not exist yet.
    def add_child(rank, name)
      rank = rank.to_sym
      name = name.miga_name unless name.nil?
      child = children.find { |it| it.rank == rank && it.name == name }
      if child.nil?
        child = TaxIndexTaxon.new(rank, name)
        @children << child
      end
      child
    end

    # Assigns +dataset+ directly to this taxon.
    def add_dataset(dataset) @datasets << dataset ; end

    # Number of datasets in this taxon and all its descendants.
    def datasets_count
      datasets.size + children.map { |it| it.datasets_count }.reduce(0, :+)
    end

    # JSON representation of the taxon and its descendants. The generator
    # arguments are forwarded as received (they used to be passed wrapped in
    # an Array, which the generator does not expect).
    def to_json(*a)
      { str: tax_str, datasets: datasets.map { |d| d.name },
        children: children }.to_json(*a)
    end

    # Hash representation of the taxon and its descendants.
    def to_hash
      { str: tax_str, datasets: datasets.map { |d| d.name },
        children: children.map { |it| it.to_hash } }
    end

    # Text representation of the taxon and its descendants: one line per
    # taxon (with its dataset count) and one "# name" line per dataset,
    # starting at column +indent+ and adding two columns per level. The line
    # of a taxon is skipped if it is unnamed and has no datasets of its own,
    # unless +unknown+ is true.
    def to_tab(unknown, indent=0)
      o = ""
      if unknown || !datasets.empty? || !name.nil?
        o = (" " * indent) + tax_str + ": " + datasets_count.to_s + "\n"
      end
      indent += 2
      datasets.each { |ds| o += (" " * indent) + "# " + ds.name + "\n" }
      children.each { |it| o += it.to_tab(unknown, indent) }
      o
    end
  end
end
|
70
|
+
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#
|
2
|
+
# @package MiGA
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
# @update Oct-05-2015
|
6
|
+
#
|
7
|
+
|
8
|
+
module MiGA
  # Taxonomic classification: a set of names indexed by canonical rank.
  class Taxonomy
    # Class

    # Canonical ranks, in the order used to sort them
    @@KNOWN_RANKS = %w{ns d k p c o f g s ssp str ds}.map{|r| r.to_sym}
    # Synonyms for canonical ranks
    @@RANK_SYNONYMS = {
      "namespace"=>"ns",
      "domain"=>"d","superkingdom"=>"d",
      "kingdom"=>"k",
      "phylum"=>"p",
      "class"=>"c",
      "order"=>"o",
      "family"=>"f",
      "genus"=>"g",
      "species"=>"s","sp"=>"s",
      "subspecies"=>"ssp",
      "strain"=>"str","isolate"=>"str","culture"=>"str",
      "dataset"=>"ds","organism"=>"ds","genome"=>"ds","specimen"=>"ds"
    }

    # Array of canonical ranks (as Symbols).
    def self.KNOWN_RANKS() @@KNOWN_RANKS ; end

    # Builds a Taxonomy from the Hash +o+ produced by #to_json.
    def self.json_create(o) new(o["str"]) ; end

    # Returns the canonical rank (Symbol) for +rank+, resolving synonyms and
    # ignoring case, or nil if the rank is "no rank" or is not known.
    def self.normalize_rank(rank)
      rank = rank.to_s.downcase
      return nil if rank=="no rank"
      rank = @@RANK_SYNONYMS[rank] unless @@RANK_SYNONYMS[rank].nil?
      rank = rank.to_sym
      return nil unless @@KNOWN_RANKS.include? rank
      rank
    end

    # Instance
    attr_reader :ranks

    # Creates a taxonomy. Without +ranks+, +str+ can be anything supported by
    # #<< (Array or Hash) or a String of space-separated "rank:name" entries.
    # With +ranks+, +str+ holds the names and +ranks+ the matching ranks, each
    # as an Array or a whitespace-separated String; a RuntimeError is raised
    # if they differ in size.
    def initialize(str, ranks=nil)
      @ranks = {}
      if ranks.nil?
        if str.is_a?(Array) || str.is_a?(Hash)
          self << str
        else
          # The trailing space lets every entry, including the last, be
          # delimited by the space that follows it
          (str + " ").scan(/([A-Za-z]+):([^:]*)( )/) do |r, n, _|
            self << {r=>n}
          end
        end
      else
        ranks = ranks.split(/\s+/) unless ranks.is_a? Array
        str = str.split(/\s/) unless str.is_a? Array
        raise "Unequal number of ranks (#{ranks.size}) " +
          "and names (#{str.size}):#{ranks} => #{str}" unless
          ranks.size==str.size
        # Pairwise iteration (the previous inclusive index range ran one
        # position past the end)
        ranks.zip(str) { |r, n| self << "#{r}:#{n}" }
      end
    end

    # Adds +value+ to the taxonomy. It can be a "rank:name" String, a Hash of
    # rank => name, or an Array of any of these. Underscores in names are
    # turned into spaces. Entries with empty names or with ranks that
    # ::normalize_rank does not recognize are ignored. Any other class raises
    # a RuntimeError.
    def <<(value)
      if value.is_a? Array
        value.each { |v| self << v }
      elsif value.is_a? String
        (rank, name) = value.split(/:/)
        self << { rank => name }
      elsif value.is_a? Hash
        value.each_pair do |rank, name|
          next if name.nil? || name == ""
          key = Taxonomy.normalize_rank rank
          # Unknown ranks used to be stored under a nil key, which breaks
          # rank lookups such as the one in #is_in?
          next if key.nil?
          @ranks[key] = name.gsub(/_/, " ")
        end
      else
        raise "Unsupported class '#{value.class.name}'."
      end
    end

    # Name registered for +rank+, or nil.
    def [](rank) @ranks[ rank.to_sym ] ; end

    ### Evaluates if the loaded taxonomy includes `taxon`. It assumes that
    ### `taxon` only has one informative rank. The evaluation is
    ### case-insensitive. Returns false if `taxon` has no ranks.
    def is_in? taxon
      r = taxon.ranks.keys.first
      return false if r.nil? || self[ r ].nil?
      self[ r ].downcase == taxon[ r ].downcase
    end

    ### Sorted list of ranks, as two-entry arrays
    def sorted_ranks
      @@KNOWN_RANKS.map do |r|
        ranks[r].nil? ? nil : [r, ranks[r]]
      end.compact
    end

    # First entry of #sorted_ranks, or nil if the taxonomy is empty.
    def highest; sorted_ranks.first ; end

    # Last entry of #sorted_ranks, or nil if the taxonomy is empty.
    def lowest; sorted_ranks.last ; end

    # Space-separated "rank:name" entries in rank order, with whitespace in
    # names replaced by underscores.
    def to_s
      sorted_ranks.map{ |r| "#{r[0].to_s}:#{r[1].gsub(/\s/,"_")}" }.join(" ")
    end

    # JSON object holding the class name (under JSON.create_id) and #to_s.
    def to_json(*a)
      { JSON.create_id => self.class.name, "str" => self.to_s }.to_json(*a)
    end
  end
end
|
107
|
+
|
data/lib/miga.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#
|
2
|
+
# @package MiGA
|
3
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @license artistic license 2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
require "date"
|
8
|
+
require "json"
|
9
|
+
require "fileutils"
|
10
|
+
require "miga/project"
|
11
|
+
require "miga/taxonomy"
|
12
|
+
|
13
|
+
module MiGA
  # Release information. VERSION lists the components that are joined by "."
  # to build the full version string.
  VERSION = [0.2, 0, 6]
  VERSION_NAME = "pochoir"
  VERSION_DATE = Date.new(2015, 12, 7)
  CITATION = "Rodriguez-R et al, in preparation. Microbial Genomes Atlas: " \
    "Standardizing genomic and metagenomic analyses for Archaea and Bacteria."

  # Class-level access to the release information and to the debugging
  # switches of the package.
  class MiGA
    @@DEBUG = false
    @@DEBUG_TRACE = false

    # Turns debugging messages on.
    def self.DEBUG_ON
      @@DEBUG = true
    end

    # Turns debugging messages off.
    def self.DEBUG_OFF
      @@DEBUG = false
    end

    # Turns call traces on, together with debugging messages.
    def self.DEBUG_TRACE_ON
      @@DEBUG_TRACE = true
      self.DEBUG_ON
    end

    # Turns call traces off, together with debugging messages.
    def self.DEBUG_TRACE_OFF
      @@DEBUG_TRACE = false
      self.DEBUG_OFF
    end

    # Prints +args+ to the standard error if debugging is on, followed by the
    # call stack (one frame per line, each prefixed by a space) if traces are
    # on.
    def self.DEBUG(*args)
      $stderr.puts(*args) if @@DEBUG
      return unless @@DEBUG_TRACE
      $stderr.puts caller.map { |frame| frame.gsub(/^/, " ") }.join("\n")
    end

    # First component of the version.
    def self.VERSION
      VERSION[0]
    end

    # All the components of the version, joined by ".".
    def self.FULL_VERSION
      VERSION.join(".")
    end

    # Package name, full version, version name, and release date.
    def self.LONG_VERSION
      "MiGA #{VERSION.join(".")} - #{VERSION_NAME} - #{VERSION_DATE}"
    end

    # Release date.
    def self.VERSION_DATE
      VERSION_DATE
    end

    # Reference to cite the package.
    def self.CITATION
      CITATION
    end
  end
end
|
47
|
+
|
48
|
+
class File
  # Removes +path+ recursively: regular files and symbolic links are
  # unlinked, and directories are emptied and removed. Symbolic links are
  # never followed, so the target of a link is left untouched (links to
  # directories and dangling links used to make this method fail). Raises a
  # RuntimeError if +path+ does not exist.
  def self.unlink_r(path)
    if File.symlink? path
      File.unlink path
    elsif File.directory? path
      Dir.entries(path).reject { |f| f == "." || f == ".." }.each do |f|
        File.unlink_r path + "/" + f
      end
      Dir.unlink path
    elsif File.exist? path
      File.unlink path
    else
      raise "Cannot find file: #{path}"
    end
  end

  # Makes +old_name+ available as +new_name+ using +method+: +:symlink+,
  # +:hardlink+, or +:copy+ (recursive). Returns nil, doing nothing, if
  # +new_name+ already exists. Raises a RuntimeError for any other method.
  def self.generic_transfer(old_name, new_name, method)
    return nil if exist? new_name
    case method
    when :symlink
      File.symlink(old_name, new_name)
    when :hardlink
      File.link(old_name, new_name)
    when :copy
      FileUtils.cp_r(old_name, new_name)
    else
      raise "Unknown transfer method: #{method}."
    end
  end
end
|
77
|
+
|
78
|
+
class String
  # Copy of the string with every character other than ASCII letters,
  # digits, and underscores replaced by an underscore.
  def miga_name ; gsub(/[^A-Za-z0-9_]/, "_") ; end

  # Is the string composed exclusively of one or more ASCII letters, digits,
  # or underscores? The whole string is tested (\A...\z); the per-line
  # anchors used before accepted multi-line strings with one valid line.
  def miga_name? ; !!(self =~ /\A[A-Za-z0-9_]+\z/) ; end

  # Copy of the string with underscores replaced by spaces.
  def unmiga_name ; gsub(/_/, " ") ; end
end
|
83
|
+
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES, $TMPDIR,
# $NOMULTI, $REF
#
# Compares the dataset against the reference genomes listed for the project's
# ANI95 clades (AAI first, then ANI for AAI hits), classifies it within the
# clade hierarchy by following medoids, and computes the ANI against all the
# members of the sub-clade where it ends up.
# NOTE(review): `exists`, `aai.rb` and `ani.rb` are not defined here; they are
# presumably provided by the environment that runs this script -- confirm.

# Deal with previous runs (if any)
exists "$DATASET".a[an]i.db && cp "$DATASET".a[an]i.db "$TMPDIR"
exists "$DATASET".a[an]i.9[05] && rm "$DATASET".a[an]i.9[05]

# Every 10 calls, copy the temporary databases back to the working directory
# so that partial progress is kept. Exits if a database cannot be queried.
N=0
function checkpoint_n {
  N=$((N + 1))
  if [[ $N -ge 10 ]] ; then
    for metric in aai ani ; do
      if [[ -s "$TMPDIR/$DATASET.$metric.db" ]] ; then
        echo "select count(*) from $metric;" \
          | sqlite3 "$TMPDIR/$DATASET.$metric.db" \
          || exit 1
        cp "$TMPDIR/$DATASET.$metric.db" .
      fi
    done
    N=0
  fi
}

# Find 95%ANI clade(s) with AAI <= 90% / ANI <= 95%
REFGENOMES=$(cat ../10.clades/01.find/miga-project.ani95-clades \
  | tail -n +2 | cut -d , -f 1)
for i in $REFGENOMES ; do
  AAI=$(aai.rb -1 "../06.cds/$DATASET.faa" \
    -2 "../06.cds/$i.faa" -t "$CORES" -a --lookup-first \
    -S "$TMPDIR/$DATASET.aai.db" --name1 "$DATASET" --name2 "$i" || echo "")
  checkpoint_n
  # A failed comparison yields an empty value, which is treated as 0
  if [[ $(perl -MPOSIX -e "print ceil ${AAI:-0}") -ge 90 ]] ; then
    echo "$i" >> "$DATASET.aai90"
    [[ -e "../05.assembly/$DATASET.LargeContigs.fna" ]] || continue
    [[ -e "../05.assembly/$i.LargeContigs.fna" ]] || continue
    ANI=$(ani.rb -1 "../05.assembly/$DATASET.LargeContigs.fna" \
      -2 "../05.assembly/$i.LargeContigs.fna" -t "$CORES" -a \
      --no-save-regions --no-save-rbm --lookup-first \
      -S "$TMPDIR/$DATASET.ani.db" --name1 "$DATASET" --name2 "$i" || echo "")
    checkpoint_n
    if [[ $(perl -MPOSIX -e "print ceil ${ANI:-0}") -ge 95 ]] ; then
      echo "$i" >> "$DATASET.ani95"
    fi
  fi
done

# Classify in-clade (if project type is clade)
CLADES="../10.clades/02.ani"
CLASSIF="."
[[ -e "$DATASET.medoids" ]] && rm "$DATASET.medoids"
while [[ -e "$CLADES/$CLASSIF/miga-project.1.medoids" ]] ; do
  # The best medoid is selected independently at each level
  MAX_ANI=0
  ANI_MED=""
  for i in $(cat "$CLADES/$CLASSIF/miga-project.1.medoids") ; do
    ANI=$(ani.rb -1 "../05.assembly/$DATASET.LargeContigs.fna" \
      -2 "../05.assembly/$i.LargeContigs.fna" -t "$CORES" -a \
      --no-save-regions --no-save-rbm --lookup-first \
      -S "$TMPDIR/$DATASET.ani.db" --name1 "$DATASET" --name2 "$i" || echo "")
    checkpoint_n
    if [[ $(perl -e "print 1 if ${ANI:-0} > $MAX_ANI") == "1" ]] ; then
      MAX_ANI=$ANI
      ANI_MED=$i
    fi
  done
  # No medoid with ANI above zero: the dataset cannot be classified further
  [[ -n "$ANI_MED" ]] || break
  # Record and descend through the medoid with the highest ANI (the loop
  # variable, i.e. the last medoid listed, was used here before)
  echo "$ANI_MED" >> "$DATASET.medoids"
  CLASSIF="$CLASSIF/miga-project.1.subcl-$ANI_MED"
done
echo "$CLASSIF" > "$DATASET.class"

# Calculate all the ANIs against the lowest subclade (if classified in-clade)
if [[ "$CLASSIF" != "." ]] ; then
  if [[ -s "$CLADES/$CLASSIF/miga-project.all" ]] ; then
    for i in $(cat "$CLADES/$CLASSIF/miga-project.all") ; do
      # Only run for its effect on the ANI database; the value is not used
      ANI=$(ani.rb -1 "../05.assembly/$DATASET.LargeContigs.fna" \
        -2 "../05.assembly/$i.LargeContigs.fna" -t "$CORES" -a \
        --no-save-regions --no-save-rbm --lookup-first \
        -S "$TMPDIR/$DATASET.ani.db" --name1 "$DATASET" --name2 "$i" || echo "")
      checkpoint_n
    done
  fi
fi

# Finalize
mv "$TMPDIR/$DATASET.aai.db" "02.aai/$DATASET.db"
mv "$TMPDIR/$DATASET.ani.db" "03.ani/$DATASET.db"
|
86
|
+
|