miga-base 0.3.1.5 → 0.3.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/doctor.rb +139 -34
- data/actions/get.rb +1 -3
- data/actions/ncbi_get.rb +151 -0
- data/bin/miga +2 -2
- data/lib/miga/common.rb +45 -39
- data/lib/miga/daemon.rb +1 -1
- data/lib/miga/dataset/result.rb +12 -8
- data/lib/miga/result.rb +16 -4
- data/lib/miga/version.rb +1 -1
- data/scripts/essential_genes.bash +4 -1
- data/scripts/mytaxa_scan.bash +8 -5
- metadata +3 -5
- data/utils/distances/functions.rb +0 -58
- data/utils/distances/ref-nomulti.rb +0 -2
- data/utils/distances.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ba91c88b5e9a25633e5633e344fc013b2ab0d6e
|
4
|
+
data.tar.gz: 24e7c2426c9ad4c86da246c3b4c76a94ed378aa9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bba625b0f7777a8aed26b0e3e55f16ada263c01e371f29ba7c89aa80f23afd5db9ae58c41abd1d0e37c6aaa17b924d00d487c6a09e222cd8d27b9e1394cd59d9
|
7
|
+
data.tar.gz: e906281a4ccf5b8f8505d6e7d5a342d44f16505c64602f1af4d65e3e09f456b21487c9c8273f84ec2c9cb2a009de13efb3b1fe20d65d879a2510abbbac7825b8
|
data/actions/doctor.rb
CHANGED
@@ -3,12 +3,27 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
|
6
|
+
require "sqlite3"
|
7
|
+
|
8
|
+
o = {q:true, ld:false,
|
9
|
+
db: true, dist: true, files: true, ess: true, mts: true, tax: true}
|
7
10
|
OptionParser.new do |opt|
|
8
11
|
opt_banner(opt)
|
9
12
|
opt_object(opt, o, [:project])
|
10
|
-
opt.on("-
|
11
|
-
"
|
13
|
+
opt.on("-l", "--list-datasets",
|
14
|
+
"List all fixed datasets on advance."){ |v| o[:ld]=v }
|
15
|
+
opt.on("--ignore-databases",
|
16
|
+
"Do not check database files integrity."){ |v| o[:db]=!v }
|
17
|
+
opt.on("--ignore-distances",
|
18
|
+
"Do not check distance tables."){ |v| o[:dist]=!v }
|
19
|
+
opt.on("--ignore-files",
|
20
|
+
"Do not check for outdated files."){ |v| o[:files]=!v }
|
21
|
+
opt.on("--ignore-essential-genes",
|
22
|
+
"Do not check unarchived essential genes."){ |v| o[:ess]=!v }
|
23
|
+
opt.on("--ignore-mytaxa-scan",
|
24
|
+
"Do not check unarchived MyTaxa scan."){ |v| o[:mts]=!v }
|
25
|
+
opt.on("--ignore-taxonomy",
|
26
|
+
"Do not check taxonomy consistency."){ |v| o[:tax]=!v }
|
12
27
|
opt_common(opt, o)
|
13
28
|
end.parse!
|
14
29
|
|
@@ -19,60 +34,150 @@ $stderr.puts "Loading project" unless o[:q]
|
|
19
34
|
p = MiGA::Project.load(o[:project])
|
20
35
|
raise "Impossible to load project: #{o[:project]}" if p.nil?
|
21
36
|
|
37
|
+
# Checks that the SQLite3 database at +db_file+ can be opened and that the
# table named after +metric+ can be queried (a simple row count).
#
# Yields to the caller's block (with no arguments) when the check fails, so
# the caller decides how to repair or remove the offending files. Nothing is
# yielded when the query succeeds.
#
# +metric+ is interpolated directly into the SQL statement, so it must be a
# trusted table name (callers in this script pass :aai or :ani).
def check_sqlite3_database(db_file, metric)
  SQLite3::Database.new(db_file) do |conn|
    conn.execute("select count(*) from #{metric}").first
  end
rescue SQLite3::SQLException, SQLite3::CorruptException,
       SQLite3::NotADatabaseException
  # Damaged or non-SQLite files are reported as Corrupt/NotADatabase rather
  # than SQLException; they are failed integrity checks as well.
  yield
end
|
46
|
+
|
47
|
+
if o[:db]
|
48
|
+
$stderr.puts "o Checking databases integrity" unless o[:q]
|
49
|
+
p.each_dataset do |d|
|
50
|
+
[:distances, :taxonomy].each do |r_key|
|
51
|
+
r = d.result(r_key) or next
|
52
|
+
{haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
|
53
|
+
db_file = r.file_path(db_key) or next
|
54
|
+
check_sqlite3_database(db_file, metric) do
|
55
|
+
$stderr.puts(
|
56
|
+
" > Removing #{db_key} #{r_key} table for #{d.name}.") if o[:ld]
|
57
|
+
[db_file, r.path(:done), r.path].each do |f|
|
58
|
+
File.unlink f if File.exist? f
|
59
|
+
end # each |f|
|
60
|
+
end # check_sqlite3_database
|
61
|
+
end # each |db_key, metric|
|
62
|
+
end # each |r_key|
|
63
|
+
end # each |d|
|
64
|
+
end
|
65
|
+
|
22
66
|
[:ani, :aai].each do |dist|
|
23
|
-
|
24
|
-
next if
|
67
|
+
res = p.result("#{dist}_distances")
|
68
|
+
next if res.nil?
|
25
69
|
$stderr.puts "o Checking #{dist} table for consistent datasets" unless o[:q]
|
26
|
-
|
70
|
+
notok = {}
|
27
71
|
fix = {}
|
28
|
-
Zlib::GzipReader.open(
|
72
|
+
Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
|
73
|
+
lineno = 0
|
29
74
|
fh.each_line do |ln|
|
30
|
-
next if
|
75
|
+
next if (lineno+=1)==1
|
31
76
|
r = ln.split("\t")
|
32
|
-
if
|
33
|
-
|
34
|
-
|
35
|
-
|
77
|
+
if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
|
78
|
+
[1,2].each do |i|
|
79
|
+
if p.dataset(r[i]).nil?
|
80
|
+
notok[r[i]] = true
|
81
|
+
else
|
82
|
+
fix[r[i]] = true
|
83
|
+
end
|
84
|
+
end
|
36
85
|
end
|
37
86
|
end
|
38
87
|
end
|
39
88
|
|
40
89
|
$stderr.puts " - Fixing #{fix.size} datasets" unless fix.empty? or o[:q]
|
41
90
|
fix.keys.each do |d_n|
|
42
|
-
$stderr.puts " > Fixing #{d_n}." if o[:
|
91
|
+
$stderr.puts " > Fixing #{d_n}." if o[:ld]
|
43
92
|
p.dataset(d_n).cleanup_distances!
|
44
93
|
end
|
45
94
|
|
46
|
-
unless
|
47
|
-
|
48
|
-
|
95
|
+
unless notok.empty?
|
96
|
+
unless o[:q]
|
97
|
+
$stderr.puts " - Unregistered datasets detected: "
|
98
|
+
if notok.size < 3
|
99
|
+
$stderr.puts " - #{notok.keys.join(", ")}"
|
100
|
+
else
|
101
|
+
$stderr.puts " - #{notok.keys.first} and other #{notok.size-1}"
|
102
|
+
end
|
103
|
+
$stderr.puts " - Removing tables, recompute"
|
104
|
+
end
|
105
|
+
res.remove!
|
49
106
|
end
|
50
|
-
end
|
107
|
+
end if o[:dist]
|
51
108
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
109
|
+
if o[:files]
|
110
|
+
$stderr.puts "o Looking for outdated files in results" unless o[:q]
|
111
|
+
p.each_dataset do |d|
|
112
|
+
d.each_result do |r_k, r|
|
113
|
+
ok = true
|
114
|
+
r.each_file do |_f_sym, _f_rel, f_abs|
|
115
|
+
unless File.exist? f_abs
|
116
|
+
ok = false
|
117
|
+
break
|
118
|
+
end
|
119
|
+
end
|
120
|
+
unless ok
|
121
|
+
$stderr.puts " > Registering again #{d.name}:#{r_k}" if o[:ld]
|
122
|
+
d.add_result(r_k, true, force:true)
|
60
123
|
end
|
61
124
|
end
|
62
|
-
|
63
|
-
|
64
|
-
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
if o[:ess]
|
129
|
+
$stderr.puts "o Looking for unarchived essential genes." unless o[:q]
|
130
|
+
p.each_dataset do |d|
|
131
|
+
res = d.result(:essential_genes)
|
132
|
+
next if res.nil?
|
133
|
+
dir = res.file_path(:collection)
|
134
|
+
if dir.nil?
|
135
|
+
$stderr.puts " > Removing #{d.name}:essential_genes" if o[:ld]
|
136
|
+
res.remove!
|
137
|
+
next
|
138
|
+
end
|
139
|
+
unless Dir["#{dir}/*.faa"].empty?
|
140
|
+
$stderr.puts " > Fixing #{d.name}." if o[:ld]
|
141
|
+
cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
|
142
|
+
warn cmdo unless cmdo.empty?
|
65
143
|
end
|
66
144
|
end
|
67
145
|
end
|
68
146
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
147
|
+
if o[:mts]
|
148
|
+
$stderr.puts "o Looking for unarchived MyTaxa Scan runs." unless o[:q]
|
149
|
+
p.each_dataset do |d|
|
150
|
+
res = d.result(:mytaxa_scan)
|
151
|
+
next if res.nil?
|
152
|
+
dir = res.file_path(:regions)
|
153
|
+
fix = false
|
154
|
+
unless dir.nil?
|
155
|
+
if Dir.exist? dir
|
156
|
+
cmdo = `cd '#{dir}/..' \
|
157
|
+
&& tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
|
158
|
+
&& rm -r '#{d.name}.reg'`.chomp
|
159
|
+
warn cmdo unless cmdo.empty?
|
160
|
+
end
|
161
|
+
fix = true
|
162
|
+
end
|
163
|
+
%w[blast mytaxain wintax gene_ids region_ids].each do |ext|
|
164
|
+
file = res.file_path(ext.to_sym)
|
165
|
+
unless file.nil?
|
166
|
+
FileUtils.rm(file) if File.exist? file
|
167
|
+
fix = true
|
168
|
+
end
|
169
|
+
end
|
170
|
+
if fix
|
171
|
+
$stderr.puts " > Fixing #{d.name}." if o[:ld]
|
172
|
+
d.add_result(:mytaxa_scan, true, force: true)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
73
176
|
|
74
|
-
|
75
|
-
|
177
|
+
if o[:tax]
|
178
|
+
#$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
|
179
|
+
# TODO: Find 95%ANI clusters with entries from different species
|
180
|
+
end
|
76
181
|
|
77
182
|
$stderr.puts "Done" unless o[:q]
|
78
183
|
|
data/actions/get.rb
CHANGED
@@ -30,8 +30,6 @@ OptionParser.new do |opt|
|
|
30
30
|
"If set, ignores datasets that already exist."){ |v| o[:ignore_dup]=v }
|
31
31
|
opt.on("-d", "--description STRING",
|
32
32
|
"Description of the dataset."){ |v| o[:description]=v }
|
33
|
-
opt.on("-u", "--user STRING",
|
34
|
-
"Owner of the dataset."){ |v| o[:user]=v }
|
35
33
|
opt.on("-c", "--comments STRING",
|
36
34
|
"Comments on the dataset."){ |v| o[:comments]=v }
|
37
35
|
opt_common(opt, o)
|
@@ -68,7 +66,7 @@ glob.each do |o_i|
|
|
68
66
|
raise "Impossible to load project: #{o_i[:project]}" if p.nil?
|
69
67
|
|
70
68
|
next if o_i[:ignore_dup] and not p.dataset(o_i[:dataset]).nil?
|
71
|
-
|
69
|
+
|
72
70
|
$stderr.puts "Locating remote dataset." unless o_i[:q]
|
73
71
|
rd = MiGA::RemoteDataset.new(o_i[:ids], o_i[:db], o_i[:universe])
|
74
72
|
|
data/actions/ncbi_get.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
require 'miga/remote_dataset'
|
7
|
+
|
8
|
+
o = {q:true, query:false, unlink:false,
|
9
|
+
reference: false, ignore_plasmids: false,
|
10
|
+
complete:false, chromosome:false,
|
11
|
+
scaffold:false, contig:false,}
|
12
|
+
OptionParser.new do |opt|
|
13
|
+
opt_banner(opt)
|
14
|
+
opt_object(opt, o, [:project])
|
15
|
+
opt.on("-T", "--taxon STRING",
|
16
|
+
"(Mandatory unless --reference) Name of the taxon (e.g., a species binomial)."
|
17
|
+
){ |v| o[:taxon]=v }
|
18
|
+
opt.on("--reference",
|
19
|
+
"Download all reference genomes (ignores -T)."){ |v| o[:reference]=v }
|
20
|
+
opt.on("--ref-no-plasmids",
|
21
|
+
"If passed, ignores plasmids (only for --reference)."
|
22
|
+
){ |v| o[:ignore_plasmids]=v }
|
23
|
+
opt.on("--complete", "Download complete genomes."){ |v| o[:complete]=v }
|
24
|
+
opt.on("--chromosome", "Download complete chromosomes."){ |v| o[:chromosome]=v }
|
25
|
+
opt.on("--scaffold", "Download genomes in scaffolds."){ |v| o[:scaffold]=v }
|
26
|
+
opt.on("--contig", "Download genomes in contigs."){ |v| o[:contig]=v }
|
27
|
+
opt.on("--all", "Download all genomes (in any status).") do
|
28
|
+
o[:complete] = true
|
29
|
+
o[:chromosome] = true
|
30
|
+
o[:scaffold] = true
|
31
|
+
o[:contig] = true
|
32
|
+
end
|
33
|
+
opt.on("-q", "--query",
|
34
|
+
"If set, the datasets are registered as queries, not reference datasets."
|
35
|
+
){ |v| o[:query]=v }
|
36
|
+
opt.on("-u", "--unlink",
|
37
|
+
"If set, unlinks all datasets in the project missing from the download list."
|
38
|
+
){ |v| o[:unlink]=v }
|
39
|
+
opt.on("-R", "--remote-list PATH",
|
40
|
+
"Path to an output file with the list of all datasets listed remotely."
|
41
|
+
){ |v| o[:remote_list]=v }
|
42
|
+
opt_common(opt, o)
|
43
|
+
end.parse!
|
44
|
+
|
45
|
+
opt_require(o, project: "-P")
|
46
|
+
opt_require(o, taxon: "-T") unless o[:reference]
|
47
|
+
unless %w[reference complete chromosome scaffold contig].any?{ |i| o[i.to_sym] }
|
48
|
+
raise "No action requested. Pick at least one type of genome"
|
49
|
+
end
|
50
|
+
|
51
|
+
##=> Main <=
|
52
|
+
$stderr.puts "Loading project." unless o[:q]
|
53
|
+
p = MiGA::Project.load(o[:project])
|
54
|
+
raise "Impossible to load project: #{o[:project]}" if p.nil?
|
55
|
+
d = []
|
56
|
+
ds = {}
|
57
|
+
downloaded = 0
|
58
|
+
|
59
|
+
def get_list(taxon, status)
|
60
|
+
url_base = "https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?"
|
61
|
+
url_param = if status==:reference
|
62
|
+
{ action: "refgenomes", download: "on" }
|
63
|
+
else
|
64
|
+
{ action: "download", report: "proks", group: "-- All Prokaryotes --",
|
65
|
+
subgroup: "-- All Prokaryotes --", orgn: "#{taxon}[orgn]",
|
66
|
+
status: status }
|
67
|
+
end
|
68
|
+
url = url_base + URI.encode_www_form(url_param)
|
69
|
+
response = RestClient::Request.execute(method: :get, url:url, timeout:600)
|
70
|
+
unless response.code == 200
|
71
|
+
raise "Unable to reach NCBI, error code #{response.code}."
|
72
|
+
end
|
73
|
+
response.to_s
|
74
|
+
end
|
75
|
+
|
76
|
+
# Download IDs with reference status
|
77
|
+
if o[:reference]
|
78
|
+
$stderr.puts "Downloading reference genomes" unless o[:q]
|
79
|
+
lineno = 0
|
80
|
+
get_list(nil, :reference).each_line do |ln|
|
81
|
+
next if (lineno+=1)==1
|
82
|
+
r = ln.chomp.split("\t")
|
83
|
+
next if r[3].nil? or r[3].empty?
|
84
|
+
ids = r[3].split(",")
|
85
|
+
# r[5] is nil when the row has fewer than six tab-separated fields
# (String#split drops trailing empty fields), so guard against nil as well.
ids += r[5].split(",") unless o[:ignore_plasmids] or r[5].nil? or r[5].empty?
|
86
|
+
n = r[2].miga_name
|
87
|
+
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Download IDs with complete or chromosome status
|
92
|
+
if o[:complete] or o[:chromosome]
|
93
|
+
# Status filter sent to NCBI: "50" for --complete, "40" for --chromosome,
# "50|40" when both are requested.
# Uses && instead of `and`: `and` binds looser than ?:, so the previous
# expression evaluated to false when only --chromosome was passed.
status = (o[:complete] && o[:chromosome]) ? "50|40" : (o[:complete] ? "50" : "40")
|
94
|
+
$stderr.puts "Downloading complete/chromosome genomes" unless o[:q]
|
95
|
+
lineno = 0
|
96
|
+
get_list(o[:taxon], status).each_line do |ln|
|
97
|
+
next if (lineno+=1)==1
|
98
|
+
r = ln.chomp.split("\t")
|
99
|
+
next if r[10].nil? or r[10].empty?
|
100
|
+
ids = r[10].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").split(";")
|
101
|
+
n = (r[0] + "_" + ids[0]).miga_name
|
102
|
+
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# Download IDs with scaffold or contig status
|
107
|
+
if o[:scaffold] or o[:contig]
|
108
|
+
# Status filter sent to NCBI: "30" for --scaffold, "20" for --contig,
# "30|20" when both are requested.
# Uses && instead of `and`: `and` binds looser than ?:, so the previous
# expression evaluated to false when only --contig was passed.
status = (o[:scaffold] && o[:contig]) ? "30|20" : (o[:scaffold] ? "30" : "20")
|
109
|
+
$stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
|
110
|
+
lineno = 0
|
111
|
+
get_list(o[:taxon], status).each_line do |ln|
|
112
|
+
next if (lineno+=1)==1
|
113
|
+
r = ln.chomp.split("\t")
|
114
|
+
next if r[7].nil? or r[7].empty?
|
115
|
+
next if r[19].nil? or r[19].empty?
|
116
|
+
asm = r[7].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").gsub(/\s/,"")
|
117
|
+
ids = r[19].gsub(/\s/, "").split(";").map{ |i| i + "/" + File.basename(i) + "_genomic.fna.gz" }
|
118
|
+
n = (r[0] + "_" + asm).miga_name
|
119
|
+
comm = "Assembly: #{asm}"
|
120
|
+
ds[n] = {ids: ids, md: {type: :genome, comments: comm}, db: :assembly_gz, universe: :web}
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# Download entries
|
125
|
+
$stderr.puts "Downloading #{ds.size} #{ds.size>1 ? "entries" : "entry"}." unless o[:q]
|
126
|
+
ds.each do |name,body|
|
127
|
+
d << name
|
128
|
+
puts name
|
129
|
+
next unless p.dataset(name).nil?
|
130
|
+
$stderr.puts " Locating remote dataset." unless o[:q]
|
131
|
+
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
132
|
+
$stderr.puts " Creating dataset." unless o[:q]
|
133
|
+
rd.save_to(p, name, !o[:query], body[:md])
|
134
|
+
p.add_dataset(name)
|
135
|
+
downloaded += 1
|
136
|
+
end
|
137
|
+
|
138
|
+
# Finalize
|
139
|
+
$stderr.puts "Datasets listed: #{d.size}" unless o[:q]
|
140
|
+
$stderr.puts "Datasets downloaded: #{downloaded}" unless o[:q]
|
141
|
+
unless o[:remote_list].nil?
|
142
|
+
File.open(o[:remote_list], 'w') do |fh|
|
143
|
+
d.each { |i| fh.puts i }
|
144
|
+
end
|
145
|
+
end
|
146
|
+
if o[:unlink]
|
147
|
+
unlink = p.dataset_names - d
|
148
|
+
unlink.each { |i| p.unlink_dataset(i).remove! }
|
149
|
+
$stderr.puts "Datasets unlinked: #{unlink.size}" unless o[:q]
|
150
|
+
end
|
151
|
+
|
data/bin/miga
CHANGED
@@ -19,6 +19,7 @@ $task_desc = {
|
|
19
19
|
# Datasets
|
20
20
|
add: "Creates an empty dataset in a pre-existing MiGA project.",
|
21
21
|
get: "Downloads a dataset from public databases into a MiGA project.",
|
22
|
+
ncbi_get: "Downloads all genomes in a taxon or RefSeq from NCBI into a MiGA project.",
|
22
23
|
rm: "Removes a dataset from an MiGA project.",
|
23
24
|
find: "Finds unregistered datasets based on result files.",
|
24
25
|
ln: "Link datasets (including results) from one project to another.",
|
@@ -102,7 +103,7 @@ def opt_object(opt, o, what=[:project, :dataset])
|
|
102
103
|
what.include? :dataset_type_req
|
103
104
|
opt.on("-t", "--type STRING",
|
104
105
|
(what.include?(:project_type_req) ? "(Mandatory) " : "") +
|
105
|
-
"Type of project. Recognized types include:",
|
106
|
+
"Type of project. Recognized types include:",
|
106
107
|
*MiGA::Project.KNOWN_TYPES.map{ |k,v| "~ #{k}: #{v[:description]}"}
|
107
108
|
){ |v| o[:type]=v.to_sym } if what.include? :project_type or
|
108
109
|
what.include? :project_type_req
|
@@ -228,4 +229,3 @@ generic options:
|
|
228
229
|
|
229
230
|
HELP
|
230
231
|
end
|
231
|
-
|
data/lib/miga/common.rb
CHANGED
@@ -1,21 +1,23 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
4
|
+
require 'miga/version'
|
5
|
+
require 'json'
|
6
|
+
require 'tempfile'
|
7
|
+
require 'zlib'
|
8
8
|
|
9
9
|
##
|
10
10
|
# Generic class used to handle system-wide information and methods, and parent
|
11
11
|
# of all other MiGA::* classes.
|
12
12
|
class MiGA::MiGA
|
13
13
|
|
14
|
-
ENV[
|
14
|
+
ENV['MIGA_HOME'] ||= ENV['HOME']
|
15
15
|
|
16
16
|
##
|
17
17
|
# Root path to MiGA (as estimated from the location of the current file).
|
18
|
-
def self.root_path
|
18
|
+
def self.root_path
|
19
|
+
File.expand_path('../../..', __FILE__)
|
20
|
+
end
|
19
21
|
|
20
22
|
##
|
21
23
|
# Should debugging information be reported?
|
@@ -48,32 +50,32 @@ class MiGA::MiGA
|
|
48
50
|
|
49
51
|
##
|
50
52
|
# Send debug message.
|
51
|
-
def self.DEBUG
|
53
|
+
def self.DEBUG(*args)
|
52
54
|
$stderr.puts(*args) if @@DEBUG
|
53
|
-
|
54
|
-
|
55
|
+
if @@DEBUG_TRACE
|
56
|
+
$stderr.puts caller.map{ |v| v.gsub(/^/,' ') }.join("\n")
|
57
|
+
end
|
55
58
|
end
|
56
59
|
|
57
60
|
##
|
58
61
|
# Has MiGA been initialized?
|
59
62
|
def self.initialized?
|
60
|
-
File.exist?(File.expand_path(
|
61
|
-
File.exist?(File.expand_path(
|
63
|
+
File.exist?(File.expand_path('.miga_rc', ENV['MIGA_HOME'])) and
|
64
|
+
File.exist?(File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
|
62
65
|
end
|
63
66
|
|
64
67
|
##
|
65
68
|
# Tabulates +values+, an Array of Arrays, all with the same number of
|
66
69
|
# entries as +header+. Returns an Array of String, one per line.
|
67
70
|
def self.tabulate(header, values)
|
68
|
-
fields = [header.map
|
69
|
-
fields << fields.first.map{ |h| h.gsub(/\S/,
|
70
|
-
fields += values.map{ |row| row.map{ |cell| cell.nil? ?
|
71
|
-
clen = fields.map{ |row|
|
72
|
-
row.map{ |cell| cell.length } }.transpose.map{ |col| col.max }
|
71
|
+
fields = [header.map(&:to_s)]
|
72
|
+
fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
|
73
|
+
fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
|
74
|
+
clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
|
73
75
|
fields.map do |row|
|
74
76
|
(0 .. clen.size-1).map do |col_n|
|
75
77
|
col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
|
76
|
-
end.join(
|
78
|
+
end.join(' ')
|
77
79
|
end
|
78
80
|
end
|
79
81
|
|
@@ -82,25 +84,25 @@ class MiGA::MiGA
|
|
82
84
|
def self.clean_fasta_file(file)
|
83
85
|
tmp_fh = nil
|
84
86
|
begin
|
85
|
-
if
|
86
|
-
tmp_path = Tempfile.new(
|
87
|
+
if file =~ /\.gz/
|
88
|
+
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
87
89
|
tmp_fh = Zlib::GzipWriter.open(tmp_path)
|
88
90
|
fh = Zlib::GzipReader.open(file)
|
89
91
|
else
|
90
|
-
tmp_fh = Tempfile.new(
|
92
|
+
tmp_fh = Tempfile.new('MiGA')
|
91
93
|
tmp_path = tmp_fh.path
|
92
|
-
fh = File.open(file,
|
94
|
+
fh = File.open(file, 'r')
|
93
95
|
end
|
94
|
-
buffer =
|
96
|
+
buffer = ''
|
95
97
|
fh.each_line do |ln|
|
96
98
|
ln.chomp!
|
97
99
|
if ln =~ /^>\s*(\S+)(.*)/
|
98
100
|
(id, df) = [$1, $2]
|
99
101
|
tmp_fh.print buffer.wrap_width(80)
|
100
|
-
buffer =
|
102
|
+
buffer = ''
|
101
103
|
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
|
102
104
|
else
|
103
|
-
buffer << ln.gsub(/[^A-Za-z\.\-]/,
|
105
|
+
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
|
104
106
|
end
|
105
107
|
end
|
106
108
|
tmp_fh.print buffer.wrap_width(80)
|
@@ -124,7 +126,7 @@ class MiGA::MiGA
|
|
124
126
|
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
125
127
|
# - +gc+: If true, it also returns the G+C content (in %).
|
126
128
|
def self.seqs_length(file, format, opts={})
|
127
|
-
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file,
|
129
|
+
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
128
130
|
l = []
|
129
131
|
gc = 0
|
130
132
|
i = 0 # <- Zlib::GzipReader doesn't set $.
|
@@ -154,12 +156,11 @@ class MiGA::MiGA
|
|
154
156
|
break if pos >= thr
|
155
157
|
end
|
156
158
|
o[:med] = o[:n].even? ?
|
157
|
-
0.5*
|
159
|
+
0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
|
158
160
|
end
|
159
161
|
o
|
160
162
|
end
|
161
|
-
|
162
|
-
|
163
|
+
|
163
164
|
##
|
164
165
|
# Path to a script to be executed for +task+. Supported +opts+ are:
|
165
166
|
# - +:miga+ Path to the MiGA home to use. If not passed, the home of the
|
@@ -178,17 +179,15 @@ class MiGA::MiGA
|
|
178
179
|
File.expand_path("scripts/#{task}.bash", opts[:miga])
|
179
180
|
end
|
180
181
|
|
181
|
-
|
182
182
|
##
|
183
183
|
# Check if the result files exist with +base+ name (String) followed by the
|
184
184
|
# +ext+ values (Array of String).
|
185
185
|
def result_files_exist?(base, ext)
|
186
|
-
ext = [ext] unless ext.
|
186
|
+
ext = [ext] unless ext.is_a? Array
|
187
187
|
ext.all? do |f|
|
188
188
|
File.exist?(base + f) or File.exist?("#{base}#{f}.gz")
|
189
189
|
end
|
190
190
|
end
|
191
|
-
|
192
191
|
end
|
193
192
|
|
194
193
|
##
|
@@ -212,29 +211,36 @@ class File
|
|
212
211
|
raise "Unknown transfer method: #{method}."
|
213
212
|
end
|
214
213
|
end
|
215
|
-
|
216
214
|
end
|
217
215
|
|
218
216
|
##
|
219
217
|
# MiGA extensions to the String class.
|
220
218
|
class String
|
221
|
-
|
219
|
+
|
222
220
|
##
|
223
221
|
# Replace any character not allowed in a MiGA name for underscore (_). This
|
224
222
|
# results in a MiGA-compliant name EXCEPT for empty strings, that results in
|
225
223
|
# empty strings.
|
226
|
-
def miga_name
|
224
|
+
def miga_name
|
225
|
+
gsub(/[^A-Za-z0-9_]/, '_')
|
226
|
+
end
|
227
227
|
|
228
228
|
##
|
229
229
|
# Is the string a MiGA-compliant name?
|
230
|
-
def miga_name?
|
230
|
+
def miga_name?
  # Returns true only if the whole string is one or more of A-Z, a-z, 0-9
  # or underscore; the empty string is not a valid name.
  # \A and \z anchor the entire string. The previous ^ and $ anchors match
  # per line, so strings such as "abc\n" or "abc\n!!" were wrongly accepted.
  !(self !~ /\A[A-Za-z0-9_]+\z/)
end
|
231
233
|
|
232
234
|
##
|
233
235
|
# Replace underscores by spaces or dots (depending on context).
|
234
|
-
def unmiga_name
|
235
|
-
|
236
|
+
def unmiga_name
|
237
|
+
gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
|
238
|
+
end
|
239
|
+
|
236
240
|
##
|
237
241
|
# Wraps the string with fixed Integer +width+.
|
238
|
-
def wrap_width(width)
|
239
|
-
|
242
|
+
def wrap_width(width)
|
243
|
+
gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
|
244
|
+
end
|
240
245
|
end
|
246
|
+
|
data/lib/miga/daemon.rb
CHANGED
@@ -191,7 +191,7 @@ class MiGA::Daemon < MiGA::MiGA
|
|
191
191
|
log_dir = File.expand_path("daemon/#{job}", project.path)
|
192
192
|
Dir.mkdir(log_dir) unless Dir.exist? log_dir
|
193
193
|
task_name = "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}"
|
194
|
-
to_run = {ds: ds, job: job, task_name: task_name,
|
194
|
+
to_run = {ds: ds, ds_name: ds_name, job: job, task_name: task_name,
|
195
195
|
cmd: sprintf(runopts(:cmd),
|
196
196
|
# 1: script
|
197
197
|
MiGA::MiGA.script_path(job, miga:vars['MIGA'], project:project),
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -42,7 +42,9 @@ module MiGA::Dataset::Result
|
|
42
42
|
dir = @@RESULT_DIRS[result_type]
|
43
43
|
return nil if dir.nil?
|
44
44
|
base = File.expand_path("data/#{dir}/#{name}", project.path)
|
45
|
-
|
45
|
+
if opts[:force]
|
46
|
+
FileUtils.rm("#{base}.json") if File.exist?("#{base}.json")
|
47
|
+
else
|
46
48
|
r_pre = MiGA::Result.load("#{base}.json")
|
47
49
|
return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
|
48
50
|
end
|
@@ -204,7 +206,7 @@ module MiGA::Dataset::Result
|
|
204
206
|
##
|
205
207
|
# Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
|
206
208
|
def add_result_cds(base, opts)
|
207
|
-
return nil unless result_files_exist?(base, %w[.faa
|
209
|
+
return nil unless result_files_exist?(base, %w[.faa])
|
208
210
|
r = MiGA::Result.new("#{base}.json")
|
209
211
|
r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
|
210
212
|
gff2:".gff2", gff3:".gff3", tab:".tab")
|
@@ -266,13 +268,15 @@ module MiGA::Dataset::Result
|
|
266
268
|
def add_result_mytaxa_scan(base, _opts)
|
267
269
|
if is_nonmulti?
|
268
270
|
return nil unless
|
269
|
-
result_files_exist?(base, %w[.pdf .
|
270
|
-
result_files_exist?(base,
|
271
|
+
result_files_exist?(base, %w[.pdf .mytaxa]) or
|
272
|
+
result_files_exist?(base, '.nomytaxa.txt')
|
271
273
|
r = MiGA::Result.new("#{base}.json")
|
272
|
-
add_files_to_ds_result(r, name,
|
273
|
-
|
274
|
-
|
275
|
-
|
274
|
+
add_files_to_ds_result(r, name, nomytaxa: '.nomytaxa.txt',
|
275
|
+
mytaxa: '.mytaxa', report: '.pdf', regions_archive: '.reg.tar',
|
276
|
+
# Intermediate / Deprecated
|
277
|
+
blast: '.blast', mytaxain: '.mytaxain', wintax: '.wintax',
|
278
|
+
gene_ids: '.wintax.genes', region_ids: '.wintax.regions',
|
279
|
+
regions: '.reg')
|
276
280
|
else
|
277
281
|
MiGA::Result.new("#{base}.json")
|
278
282
|
end
|
data/lib/miga/result.rb
CHANGED
@@ -21,10 +21,6 @@ class MiGA::Result < MiGA::MiGA
|
|
21
21
|
|
22
22
|
# Instance-level
|
23
23
|
|
24
|
-
##
|
25
|
-
# Path to the JSON file describing the result.
|
26
|
-
attr_reader :path
|
27
|
-
|
28
24
|
##
|
29
25
|
# Hash with the result metadata.
|
30
26
|
attr_reader :data
|
@@ -48,6 +44,22 @@ class MiGA::Result < MiGA::MiGA
|
|
48
44
|
# Register the result as cleaned.
|
49
45
|
def clean! ; self[:clean] = true ; end
|
50
46
|
|
47
|
+
##
|
48
|
+
# Path to the standard files of the result. +which+ must be one of:
|
49
|
+
# - :json (default) : JSON file describing the result.
|
50
|
+
# - :start : File with the date when the processing started.
|
51
|
+
# - :done : File with the date when the processing ended.
|
52
|
+
def path(which=:json)
|
53
|
+
case which.to_sym
|
54
|
+
when :json
|
55
|
+
@path
|
56
|
+
when :start
|
57
|
+
@path.sub(/\.json$/, ".start")
|
58
|
+
when :done
|
59
|
+
@path.sub(/\.json$/, ".done")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
51
63
|
##
|
52
64
|
# Directory containing the result.
|
53
65
|
def dir
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 1,
|
13
|
+
VERSION = [0.3, 1, 6]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -37,7 +37,10 @@ else
|
|
37
37
|
fi
|
38
38
|
|
39
39
|
# Reduce files
|
40
|
-
( cd "${DATASET}.ess"
|
40
|
+
( cd "${DATASET}.ess" \
|
41
|
+
&& exists *.faa \
|
42
|
+
&& tar -zcf proteins.tar.gz *.faa \
|
43
|
+
&& rm *.faa )
|
41
44
|
|
42
45
|
# Finalize
|
43
46
|
miga date > "$DATASET.done"
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -75,14 +75,17 @@ else
|
|
75
75
|
FastA.filter.pl -q "$DATASET.reg/$i.ids" \
|
76
76
|
"../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
|
77
77
|
done
|
78
|
+
# Archive regions
|
79
|
+
tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
|
80
|
+
rm -r "$DATASET.reg"
|
78
81
|
fi
|
79
82
|
|
80
83
|
# Clean
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
[[ -s "$DATASET.
|
85
|
-
&& gzip -9 -f "$DATASET.
|
84
|
+
for x in daa blast mytaxain wintax wintax.genes wintax.regions ; do
|
85
|
+
[[ -e "$DATASET.$x" ]] && rm "$DATASET.$x"
|
86
|
+
done
|
87
|
+
[[ -s "$DATASET.mytaxa" && ! -s "$DATASET.mytaxa.gz" ]] \
|
88
|
+
&& gzip -9 -f "$DATASET.mytaxa"
|
86
89
|
fi
|
87
90
|
|
88
91
|
fi
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1.
|
4
|
+
version: 0.3.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
@@ -130,6 +130,7 @@ files:
|
|
130
130
|
- actions/init.rb
|
131
131
|
- actions/ln.rb
|
132
132
|
- actions/ls.rb
|
133
|
+
- actions/ncbi_get.rb
|
133
134
|
- actions/new.rb
|
134
135
|
- actions/plugins.rb
|
135
136
|
- actions/rm.rb
|
@@ -196,9 +197,6 @@ files:
|
|
196
197
|
- utils/adapters.fa
|
197
198
|
- utils/arch-ess-genes.rb
|
198
199
|
- utils/core-pan-plot.R
|
199
|
-
- utils/distances.rb
|
200
|
-
- utils/distances/functions.rb
|
201
|
-
- utils/distances/ref-nomulti.rb
|
202
200
|
- utils/enveomics/Examples/aai-matrix.bash
|
203
201
|
- utils/enveomics/Examples/ani-matrix.bash
|
204
202
|
- utils/enveomics/Examples/essential-phylogeny.bash
|
@@ -1,58 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'sqlite3'
|
3
|
-
|
4
|
-
$opts = {}
|
5
|
-
if ENV["MIGA_AAI_SAVE_RBM"].nil?
|
6
|
-
$opts[:aai_save_rbm] = $project.is_clade? ? "save-rbm" : "no-save-rbm"
|
7
|
-
else
|
8
|
-
$opts[:aai_save_rbm] = ENV["MIGA_AAI_SAVE_RBM"]
|
9
|
-
end
|
10
|
-
$opts[:thr] = ENV["CORES"].nil? ? 2 : ENV["CORES"].to_i
|
11
|
-
|
12
|
-
def ani(f1, f2, db, opts={})
|
13
|
-
opts = $opts.merge(opts)
|
14
|
-
v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
|
15
|
-
-t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm --lookup-first`
|
16
|
-
v.nil? or v.empty? ? 0 : v.to_f
|
17
|
-
end
|
18
|
-
|
19
|
-
def make_empty_aai_db(db)
|
20
|
-
SQLite3::Database.new(db) do |conn|
|
21
|
-
conn.execute "create table if not exists aai(" +
|
22
|
-
"seq1 varchar(256), seq2 varchar(256), " +
|
23
|
-
"aai float, sd float, n int omega int" +
|
24
|
-
")"
|
25
|
-
end unless File.size?(db)
|
26
|
-
end
|
27
|
-
|
28
|
-
def aai(f1, f2, db, opts={})
|
29
|
-
opts = $opts.merge(opts)
|
30
|
-
v = `aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
|
31
|
-
-t "#{opts[:thr]}" -a --lookup-first "--#{opts[:aai_save_rbm]}"`.chomp
|
32
|
-
v.nil? or v.empty? ? 0 : v.to_f
|
33
|
-
end
|
34
|
-
|
35
|
-
def haai(f1, f2, db, aai_db, opts={})
|
36
|
-
opts = $opts.merge(opts)
|
37
|
-
haai = aai(f1, f2, db, aai_save_rbm: "no-save-rbm")
|
38
|
-
return 0 if haai.nil? or haai == 0 or haai > 90.0
|
39
|
-
aai = 100.0 - Math.exp(2.435076 + 0.4275193*Math.log(100.0-haai))
|
40
|
-
make_empty_aai_db(aai_db)
|
41
|
-
SQLite3::Database.new(db) do |conn|
|
42
|
-
conn.execute "insert into aai values(?, ?, ?, 0, 0, 0)",
|
43
|
-
[ds_name(f1), ds_name(f2), aai]
|
44
|
-
end
|
45
|
-
aai
|
46
|
-
end
|
47
|
-
|
48
|
-
def haai_or_aai(f1_h, f2_h, db_h, f1, f2, db, opts={})
|
49
|
-
haai=haai(f1_h, f2_h, db_h, db, opts)
|
50
|
-
aai = aai(f1, f2, db, opts) if aai.nil? or aai.zero?
|
51
|
-
aai
|
52
|
-
end
|
53
|
-
|
54
|
-
def val_from_db(n1, n2, db, metric)
|
55
|
-
SQLite3::Database.new(db) do |conn|
|
56
|
-
return conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first.first
|
57
|
-
end if File.size? db
|
58
|
-
end
|
data/utils/distances.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'miga'
|
4
|
-
|
5
|
-
$project = MiGA::Project.load(ARGV.shift)
|
6
|
-
$dataset = $project.dataset(ARGV.shift)
|
7
|
-
opts = Hash[ARGV]
|
8
|
-
|
9
|
-
exit(0) if dataset.is_multi?
|
10
|
-
|
11
|
-
if dataset.is_ref?
|
12
|
-
require_relative 'distances/ref-nomulti.rb'
|
13
|
-
else
|
14
|
-
require_relative 'distances/noref-nomulti.rb'
|
15
|
-
end
|
16
|
-
# TODO run_distances!!!
|