miga-base 0.3.1.5 → 0.3.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/doctor.rb +139 -34
- data/actions/get.rb +1 -3
- data/actions/ncbi_get.rb +151 -0
- data/bin/miga +2 -2
- data/lib/miga/common.rb +45 -39
- data/lib/miga/daemon.rb +1 -1
- data/lib/miga/dataset/result.rb +12 -8
- data/lib/miga/result.rb +16 -4
- data/lib/miga/version.rb +1 -1
- data/scripts/essential_genes.bash +4 -1
- data/scripts/mytaxa_scan.bash +8 -5
- metadata +3 -5
- data/utils/distances/functions.rb +0 -58
- data/utils/distances/ref-nomulti.rb +0 -2
- data/utils/distances.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ba91c88b5e9a25633e5633e344fc013b2ab0d6e
|
4
|
+
data.tar.gz: 24e7c2426c9ad4c86da246c3b4c76a94ed378aa9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bba625b0f7777a8aed26b0e3e55f16ada263c01e371f29ba7c89aa80f23afd5db9ae58c41abd1d0e37c6aaa17b924d00d487c6a09e222cd8d27b9e1394cd59d9
|
7
|
+
data.tar.gz: e906281a4ccf5b8f8505d6e7d5a342d44f16505c64602f1af4d65e3e09f456b21487c9c8273f84ec2c9cb2a009de13efb3b1fe20d65d879a2510abbbac7825b8
|
data/actions/doctor.rb
CHANGED
@@ -3,12 +3,27 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
|
6
|
+
require "sqlite3"
|
7
|
+
|
8
|
+
o = {q:true, ld:false,
|
9
|
+
db: true, dist: true, files: true, ess: true, mts: true, tax: true}
|
7
10
|
OptionParser.new do |opt|
|
8
11
|
opt_banner(opt)
|
9
12
|
opt_object(opt, o, [:project])
|
10
|
-
opt.on("-
|
11
|
-
"
|
13
|
+
opt.on("-l", "--list-datasets",
|
14
|
+
"List all fixed datasets on advance."){ |v| o[:ld]=v }
|
15
|
+
opt.on("--ignore-databases",
|
16
|
+
"Do not check database files integrity."){ |v| o[:db]=!v }
|
17
|
+
opt.on("--ignore-distances",
|
18
|
+
"Do not check distance tables."){ |v| o[:dist]=!v }
|
19
|
+
opt.on("--ignore-files",
|
20
|
+
"Do not check for outdated files."){ |v| o[:files]=!v }
|
21
|
+
opt.on("--ignore-essential-genes",
|
22
|
+
"Do not check unarchived essential genes."){ |v| o[:ess]=!v }
|
23
|
+
opt.on("--ignore-mytaxa-scan",
|
24
|
+
"Do not check unarchived MyTaxa scan."){ |v| o[:mts]=!v }
|
25
|
+
opt.on("--ignore-taxonomy",
|
26
|
+
"Do not check taxonomy consistency."){ |v| o[:tax]=!v }
|
12
27
|
opt_common(opt, o)
|
13
28
|
end.parse!
|
14
29
|
|
@@ -19,60 +34,150 @@ $stderr.puts "Loading project" unless o[:q]
|
|
19
34
|
p = MiGA::Project.load(o[:project])
|
20
35
|
raise "Impossible to load project: #{o[:project]}" if p.nil?
|
21
36
|
|
37
|
+
##
# Probes the SQLite3 file +db_file+ by counting the rows of the table named
# +metric+. If the probe fails because the table cannot be queried or the file
# is corrupt / not a database at all, control is passed to the given block so
# the caller can repair or remove the offending files.
#
# Returns the first row of the count query when the database is healthy, or
# the value of the block when it is not.
def check_sqlite3_database(db_file, metric)
  SQLite3::Database.new(db_file) do |conn|
    conn.execute("select count(*) from #{metric}").first
  end
rescue SQLite3::SQLException, SQLite3::CorruptException,
       SQLite3::NotADatabaseException
  # SQLException covers a missing table or bad SQL only; damaged or garbage
  # files surface as sibling exception classes, so they are listed explicitly.
  # Transient conditions (e.g. a busy database) are deliberately NOT rescued,
  # since they do not indicate a broken file.
  yield if block_given?
end
|
46
|
+
|
47
|
+
if o[:db]
|
48
|
+
$stderr.puts "o Checking databases integrity" unless o[:q]
|
49
|
+
p.each_dataset do |d|
|
50
|
+
[:distances, :taxonomy].each do |r_key|
|
51
|
+
r = d.result(r_key) or next
|
52
|
+
{haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
|
53
|
+
db_file = r.file_path(db_key) or next
|
54
|
+
check_sqlite3_database(db_file, metric) do
|
55
|
+
$stderr.puts(
|
56
|
+
" > Removing #{db_key} #{r_key} table for #{d.name}.") if o[:ld]
|
57
|
+
[db_file, r.path(:done), r.path].each do |f|
|
58
|
+
File.unlink f if File.exist? f
|
59
|
+
end # each |f|
|
60
|
+
end # check_sqlite3_database
|
61
|
+
end # each |db_key, metric|
|
62
|
+
end # each |r_key|
|
63
|
+
end # each |d|
|
64
|
+
end
|
65
|
+
|
22
66
|
[:ani, :aai].each do |dist|
|
23
|
-
|
24
|
-
next if
|
67
|
+
res = p.result("#{dist}_distances")
|
68
|
+
next if res.nil?
|
25
69
|
$stderr.puts "o Checking #{dist} table for consistent datasets" unless o[:q]
|
26
|
-
|
70
|
+
notok = {}
|
27
71
|
fix = {}
|
28
|
-
Zlib::GzipReader.open(
|
72
|
+
Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
|
73
|
+
lineno = 0
|
29
74
|
fh.each_line do |ln|
|
30
|
-
next if
|
75
|
+
next if (lineno+=1)==1
|
31
76
|
r = ln.split("\t")
|
32
|
-
if
|
33
|
-
|
34
|
-
|
35
|
-
|
77
|
+
if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
|
78
|
+
[1,2].each do |i|
|
79
|
+
if p.dataset(r[i]).nil?
|
80
|
+
notok[r[i]] = true
|
81
|
+
else
|
82
|
+
fix[r[i]] = true
|
83
|
+
end
|
84
|
+
end
|
36
85
|
end
|
37
86
|
end
|
38
87
|
end
|
39
88
|
|
40
89
|
$stderr.puts " - Fixing #{fix.size} datasets" unless fix.empty? or o[:q]
|
41
90
|
fix.keys.each do |d_n|
|
42
|
-
$stderr.puts " > Fixing #{d_n}." if o[:
|
91
|
+
$stderr.puts " > Fixing #{d_n}." if o[:ld]
|
43
92
|
p.dataset(d_n).cleanup_distances!
|
44
93
|
end
|
45
94
|
|
46
|
-
unless
|
47
|
-
|
48
|
-
|
95
|
+
unless notok.empty?
|
96
|
+
unless o[:q]
|
97
|
+
$stderr.puts " - Unregistered datasets detected: "
|
98
|
+
if notok.size < 3
|
99
|
+
$stderr.puts " - #{notok.keys.join(", ")}"
|
100
|
+
else
|
101
|
+
$stderr.puts " - #{notok.keys.first} and other #{notok.size-1}"
|
102
|
+
end
|
103
|
+
$stderr.puts " - Removing tables, recompute"
|
104
|
+
end
|
105
|
+
res.remove!
|
49
106
|
end
|
50
|
-
end
|
107
|
+
end if o[:dist]
|
51
108
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
109
|
+
if o[:files]
|
110
|
+
$stderr.puts "o Looking for outdated files in results" unless o[:q]
|
111
|
+
p.each_dataset do |d|
|
112
|
+
d.each_result do |r_k, r|
|
113
|
+
ok = true
|
114
|
+
r.each_file do |_f_sym, _f_rel, f_abs|
|
115
|
+
unless File.exist? f_abs
|
116
|
+
ok = false
|
117
|
+
break
|
118
|
+
end
|
119
|
+
end
|
120
|
+
unless ok
|
121
|
+
$stderr.puts " > Registering again #{d.name}:#{r_k}" if o[:ld]
|
122
|
+
d.add_result(r_k, true, force:true)
|
60
123
|
end
|
61
124
|
end
|
62
|
-
|
63
|
-
|
64
|
-
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
if o[:ess]
|
129
|
+
$stderr.puts "o Looking for unarchived essential genes." unless o[:q]
|
130
|
+
p.each_dataset do |d|
|
131
|
+
res = d.result(:essential_genes)
|
132
|
+
next if res.nil?
|
133
|
+
dir = res.file_path(:collection)
|
134
|
+
if dir.nil?
|
135
|
+
$stderr.puts " > Removing #{d.name}:essential_genes" if o[:ld]
|
136
|
+
res.remove!
|
137
|
+
next
|
138
|
+
end
|
139
|
+
unless Dir["#{dir}/*.faa"].empty?
|
140
|
+
$stderr.puts " > Fixing #{d.name}." if o[:ld]
|
141
|
+
cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
|
142
|
+
warn cmdo unless cmdo.empty?
|
65
143
|
end
|
66
144
|
end
|
67
145
|
end
|
68
146
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
147
|
+
if o[:mts]
|
148
|
+
$stderr.puts "o Looking for unarchived MyTaxa Scan runs." unless o[:q]
|
149
|
+
p.each_dataset do |d|
|
150
|
+
res = d.result(:mytaxa_scan)
|
151
|
+
next if res.nil?
|
152
|
+
dir = res.file_path(:regions)
|
153
|
+
fix = false
|
154
|
+
unless dir.nil?
|
155
|
+
if Dir.exist? dir
|
156
|
+
cmdo = `cd '#{dir}/..' \
|
157
|
+
&& tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
|
158
|
+
&& rm -r '#{d.name}.reg'`.chomp
|
159
|
+
warn cmdo unless cmdo.empty?
|
160
|
+
end
|
161
|
+
fix = true
|
162
|
+
end
|
163
|
+
%w[blast mytaxain wintax gene_ids region_ids].each do |ext|
|
164
|
+
file = res.file_path(ext.to_sym)
|
165
|
+
unless file.nil?
|
166
|
+
FileUtils.rm(file) if File.exist? file
|
167
|
+
fix = true
|
168
|
+
end
|
169
|
+
end
|
170
|
+
if fix
|
171
|
+
$stderr.puts " > Fixing #{d.name}." if o[:ld]
|
172
|
+
d.add_result(:mytaxa_scan, true, force: true)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
73
176
|
|
74
|
-
|
75
|
-
|
177
|
+
if o[:tax]
|
178
|
+
#$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
|
179
|
+
# TODO: Find 95%ANI clusters with entries from different species
|
180
|
+
end
|
76
181
|
|
77
182
|
$stderr.puts "Done" unless o[:q]
|
78
183
|
|
data/actions/get.rb
CHANGED
@@ -30,8 +30,6 @@ OptionParser.new do |opt|
|
|
30
30
|
"If set, ignores datasets that already exist."){ |v| o[:ignore_dup]=v }
|
31
31
|
opt.on("-d", "--description STRING",
|
32
32
|
"Description of the dataset."){ |v| o[:description]=v }
|
33
|
-
opt.on("-u", "--user STRING",
|
34
|
-
"Owner of the dataset."){ |v| o[:user]=v }
|
35
33
|
opt.on("-c", "--comments STRING",
|
36
34
|
"Comments on the dataset."){ |v| o[:comments]=v }
|
37
35
|
opt_common(opt, o)
|
@@ -68,7 +66,7 @@ glob.each do |o_i|
|
|
68
66
|
raise "Impossible to load project: #{o_i[:project]}" if p.nil?
|
69
67
|
|
70
68
|
next if o_i[:ignore_dup] and not p.dataset(o_i[:dataset]).nil?
|
71
|
-
|
69
|
+
|
72
70
|
$stderr.puts "Locating remote dataset." unless o_i[:q]
|
73
71
|
rd = MiGA::RemoteDataset.new(o_i[:ids], o_i[:db], o_i[:universe])
|
74
72
|
|
data/actions/ncbi_get.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
require 'miga/remote_dataset'
|
7
|
+
|
8
|
+
# Defaults: quiet, register as reference datasets, nothing selected.
o = {
  q: true, query: false, unlink: false,
  reference: false, ignore_plasmids: false,
  complete: false, chromosome: false, scaffold: false, contig: false
}
parser = OptionParser.new do |opt|
  opt_banner(opt)
  opt_object(opt, o, [:project])
  opt.on("-T", "--taxon STRING",
    "(Mandatory unless --reference) Name of the taxon (e.g., a species binomial)."
  ) { |v| o[:taxon] = v }
  opt.on("--reference",
    "Download all reference genomes (ignores -T).") { |v| o[:reference] = v }
  opt.on("--ref-no-plasmids",
    "If passed, ignores plasmids (only for --reference)."
  ) { |v| o[:ignore_plasmids] = v }
  # One switch per assembly status, registered in the same order they are
  # listed in the help output.
  {
    complete: "Download complete genomes.",
    chromosome: "Download complete chromosomes.",
    scaffold: "Download genomes in scaffolds.",
    contig: "Download genomes in contigs."
  }.each do |flag, help|
    opt.on("--#{flag}", help) { |v| o[flag] = v }
  end
  opt.on("--all", "Download all genomes (in any status).") do
    [:complete, :chromosome, :scaffold, :contig].each { |flag| o[flag] = true }
  end
  opt.on("-q", "--query",
    "If set, the datasets are registered as queries, not reference datasets."
  ) { |v| o[:query] = v }
  opt.on("-u", "--unlink",
    "If set, unlinks all datasets in the project missing from the download list."
  ) { |v| o[:unlink] = v }
  opt.on("-R", "--remote-list PATH",
    "Path to an output file with the list of all datasets listed remotely."
  ) { |v| o[:remote_list] = v }
  opt_common(opt, o)
end
parser.parse!

# Validate the request: a project is always needed, a taxon is needed unless
# only reference genomes are requested, and at least one genome type must be
# selected.
opt_require(o, project: "-P")
opt_require(o, taxon: "-T") unless o[:reference]
if [:reference, :complete, :chromosome, :scaffold, :contig].none? { |k| o[k] }
  raise "No action requested. Pick at least one type of genome"
end

##=> Main <=
$stderr.puts "Loading project." unless o[:q]
p = MiGA::Project.load(o[:project])
raise "Impossible to load project: #{o[:project]}" if p.nil?
d = []          # names of every dataset listed remotely
ds = {}         # dataset name => download description
downloaded = 0  # number of datasets actually created in this run
|
58
|
+
|
59
|
+
##
# Fetches a genome listing from the NCBI genome2srv CGI and returns the
# response body as a String. When +status+ is +:reference+ the reference
# genomes listing is requested and +taxon+ is not used; otherwise the
# prokaryotes report is requested for +taxon+ with the given +status+ value.
# Raises a RuntimeError if the response code is not 200.
def get_list(taxon, status)
  query =
    if status == :reference
      { action: "refgenomes", download: "on" }
    else
      { action: "download", report: "proks", group: "-- All Prokaryotes --",
        subgroup: "-- All Prokaryotes --", orgn: "#{taxon}[orgn]",
        status: status }
    end
  endpoint = "https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?"
  reply = RestClient::Request.execute(
    method: :get, url: endpoint + URI.encode_www_form(query), timeout: 600)
  raise "Unable to reach NCBI, error code #{reply.code}." unless reply.code == 200
  reply.to_s
end
|
75
|
+
|
76
|
+
# Download IDs with reference status
|
77
|
+
if o[:reference]
  $stderr.puts "Downloading reference genomes" unless o[:q]
  lineno = 0
  get_list(nil, :reference).each_line do |ln|
    next if (lineno+=1)==1 # skip the header row
    r = ln.chomp.split("\t")
    # Column 3 carries the comma-separated main IDs; rows without it are
    # skipped.
    next if r[3].nil? or r[3].empty?
    ids = r[3].split(",")
    # Column 5 carries the plasmid IDs. String#split drops trailing empty
    # fields, so the column may be absent (nil) rather than an empty String.
    plasmids = r[5].to_s
    ids += plasmids.split(",") unless o[:ignore_plasmids] or plasmids.empty?
    n = r[2].miga_name
    ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
  end
end
|
90
|
+
|
91
|
+
# Download IDs with complete or chromosome status
|
92
|
+
if o[:complete] or o[:chromosome]
  # Status value sent to NCBI: "50" for --complete, "40" for --chromosome,
  # both joined by "|" when both are requested. Explicit branches are used
  # because `a and b ? x : y` parses as `a and (b ? x : y)`, which evaluates
  # to `false` when only --chromosome is set.
  status =
    if o[:complete] && o[:chromosome]
      "50|40"
    elsif o[:complete]
      "50"
    else
      "40"
    end
  $stderr.puts "Downloading complete/chromosome genomes" unless o[:q]
  lineno = 0
  get_list(o[:taxon], status).each_line do |ln|
    next if (lineno+=1)==1 # skip the header row
    r = ln.chomp.split("\t")
    next if r[10].nil? or r[10].empty?
    # Strip the "label:" prefixes and "/suffix" parts from each ";"-separated
    # entry of column 10, keeping only the identifiers.
    ids = r[10].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").split(";")
    n = (r[0] + "_" + ids[0]).miga_name
    ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
  end
end
|
105
|
+
|
106
|
+
# Download IDs with scaffold or contig status
|
107
|
+
if o[:scaffold] or o[:contig]
  # Status value sent to NCBI: "30" for --scaffold, "20" for --contig, both
  # joined by "|" when both are requested. Explicit branches are used because
  # `a and b ? x : y` parses as `a and (b ? x : y)`, which evaluates to
  # `false` when only --contig is set.
  status =
    if o[:scaffold] && o[:contig]
      "30|20"
    elsif o[:scaffold]
      "30"
    else
      "20"
    end
  $stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
  lineno = 0
  get_list(o[:taxon], status).each_line do |ln|
    next if (lineno+=1)==1 # skip the header row
    r = ln.chomp.split("\t")
    next if r[7].nil? or r[7].empty?
    next if r[19].nil? or r[19].empty?
    # Column 7 is reduced to the bare assembly identifier(s); column 19 holds
    # ";"-separated base locations, each expanded to its *_genomic.fna.gz file.
    asm = r[7].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").gsub(/\s/,"")
    ids = r[19].gsub(/\s/, "").split(";").map do |i|
      i + "/" + File.basename(i) + "_genomic.fna.gz"
    end
    n = (r[0] + "_" + asm).miga_name
    comm = "Assembly: #{asm}"
    ds[n] = {ids: ids, md: {type: :genome, comments: comm},
      db: :assembly_gz, universe: :web}
  end
end
|
123
|
+
|
124
|
+
# Download entries
|
125
|
+
noun = ds.size > 1 ? "entries" : "entry"
$stderr.puts "Downloading #{ds.size} #{noun}." unless o[:q]
ds.each_pair do |name, body|
  # Every listed dataset is recorded and echoed to stdout, even when it is
  # already part of the project and therefore not downloaded again.
  d << name
  puts name
  if p.dataset(name).nil?
    $stderr.puts " Locating remote dataset." unless o[:q]
    remote = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
    $stderr.puts " Creating dataset." unless o[:q]
    remote.save_to(p, name, !o[:query], body[:md])
    p.add_dataset(name)
    downloaded += 1
  end
end
|
137
|
+
|
138
|
+
# Finalize
|
139
|
+
# Summary of the run.
unless o[:q]
  $stderr.puts "Datasets listed: #{d.size}"
  $stderr.puts "Datasets downloaded: #{downloaded}"
end

# Optionally save the names of all remotely listed datasets, one per line.
if !o[:remote_list].nil?
  File.open(o[:remote_list], 'w') do |fh|
    d.each do |name|
      fh.puts name
    end
  end
end

# Optionally unlink and remove every project dataset absent from the list.
if o[:unlink]
  stale = p.dataset_names - d
  stale.each do |name|
    p.unlink_dataset(name).remove!
  end
  $stderr.puts "Datasets unlinked: #{stale.size}" unless o[:q]
end
|
151
|
+
|
data/bin/miga
CHANGED
@@ -19,6 +19,7 @@ $task_desc = {
|
|
19
19
|
# Datasets
|
20
20
|
add: "Creates an empty dataset in a pre-existing MiGA project.",
|
21
21
|
get: "Downloads a dataset from public databases into a MiGA project.",
|
22
|
+
ncbi_get: "Downloads all genomes in a taxon or RefSeq from NCBI into a MiGA project.",
|
22
23
|
rm: "Removes a dataset from an MiGA project.",
|
23
24
|
find: "Finds unregistered datasets based on result files.",
|
24
25
|
ln: "Link datasets (including results) from one project to another.",
|
@@ -102,7 +103,7 @@ def opt_object(opt, o, what=[:project, :dataset])
|
|
102
103
|
what.include? :dataset_type_req
|
103
104
|
opt.on("-t", "--type STRING",
|
104
105
|
(what.include?(:project_type_req) ? "(Mandatory) " : "") +
|
105
|
-
"Type of project. Recognized types include:",
|
106
|
+
"Type of project. Recognized types include:",
|
106
107
|
*MiGA::Project.KNOWN_TYPES.map{ |k,v| "~ #{k}: #{v[:description]}"}
|
107
108
|
){ |v| o[:type]=v.to_sym } if what.include? :project_type or
|
108
109
|
what.include? :project_type_req
|
@@ -228,4 +229,3 @@ generic options:
|
|
228
229
|
|
229
230
|
HELP
|
230
231
|
end
|
231
|
-
|
data/lib/miga/common.rb
CHANGED
@@ -1,21 +1,23 @@
|
|
1
1
|
# @package MiGA
|
2
2
|
# @license Artistic-2.0
|
3
3
|
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
4
|
+
require 'miga/version'
|
5
|
+
require 'json'
|
6
|
+
require 'tempfile'
|
7
|
+
require 'zlib'
|
8
8
|
|
9
9
|
##
|
10
10
|
# Generic class used to handle system-wide information and methods, and parent
|
11
11
|
# of all other MiGA::* classes.
|
12
12
|
class MiGA::MiGA
|
13
13
|
|
14
|
-
ENV[
|
14
|
+
ENV['MIGA_HOME'] ||= ENV['HOME']
|
15
15
|
|
16
16
|
##
|
17
17
|
# Root path to MiGA (as estimated from the location of the current file).
|
18
|
-
def self.root_path
|
18
|
+
def self.root_path
  # Three directory levels up, resolved relative to this source file.
  File.expand_path(File.join('..', '..', '..'), __FILE__)
end
|
19
21
|
|
20
22
|
##
|
21
23
|
# Should debugging information be reported?
|
@@ -48,32 +50,32 @@ class MiGA::MiGA
|
|
48
50
|
|
49
51
|
##
|
50
52
|
# Send debug message.
|
51
|
-
def self.DEBUG
|
53
|
+
def self.DEBUG(*args)
  $stderr.puts(*args) if @@DEBUG
  # The call stack is printed only when tracing is enabled, whether or not
  # the message itself was printed.
  return unless @@DEBUG_TRACE
  trace = caller.map { |frame| frame.gsub(/^/, ' ') }
  $stderr.puts trace.join("\n")
end
|
56
59
|
|
57
60
|
##
|
58
61
|
# Has MiGA been initialized?
|
59
62
|
def self.initialized?
  # Both configuration files must be present under $MIGA_HOME.
  %w[.miga_rc .miga_daemon.json].all? do |rc_file|
    File.exist?(File.expand_path(rc_file, ENV['MIGA_HOME']))
  end
end
|
63
66
|
|
64
67
|
##
|
65
68
|
# Tabulates +values+, an Array of Arrays, all with the same number of
|
66
69
|
# entries as +header+. Returns an Array of String, one per line.
|
67
70
|
def self.tabulate(header, values)
  titles = header.map(&:to_s)
  # Separator row: every non-blank character of a title becomes a dash.
  rule = titles.map { |t| t.gsub(/\S/, '-') }
  body = values.map do |row|
    row.map { |cell| cell.nil? ? '?' : cell.to_s }
  end
  table = [titles, rule] + body
  # Width of each column is that of its longest cell.
  widths = table.transpose.map { |column| column.map(&:length).max }
  table.map do |row|
    padded = row.each_with_index.map do |cell, idx|
      # First column is right-aligned, all others are left-aligned.
      idx.zero? ? cell.rjust(widths[idx]) : cell.ljust(widths[idx])
    end
    padded.join(' ')
  end
end
|
79
81
|
|
@@ -82,25 +84,25 @@ class MiGA::MiGA
|
|
82
84
|
def self.clean_fasta_file(file)
|
83
85
|
tmp_fh = nil
|
84
86
|
begin
|
85
|
-
if
|
86
|
-
tmp_path = Tempfile.new(
|
87
|
+
if file =~ /\.gz/
|
88
|
+
tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
|
87
89
|
tmp_fh = Zlib::GzipWriter.open(tmp_path)
|
88
90
|
fh = Zlib::GzipReader.open(file)
|
89
91
|
else
|
90
|
-
tmp_fh = Tempfile.new(
|
92
|
+
tmp_fh = Tempfile.new('MiGA')
|
91
93
|
tmp_path = tmp_fh.path
|
92
|
-
fh = File.open(file,
|
94
|
+
fh = File.open(file, 'r')
|
93
95
|
end
|
94
|
-
buffer =
|
96
|
+
buffer = ''
|
95
97
|
fh.each_line do |ln|
|
96
98
|
ln.chomp!
|
97
99
|
if ln =~ /^>\s*(\S+)(.*)/
|
98
100
|
(id, df) = [$1, $2]
|
99
101
|
tmp_fh.print buffer.wrap_width(80)
|
100
|
-
buffer =
|
102
|
+
buffer = ''
|
101
103
|
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
|
102
104
|
else
|
103
|
-
buffer << ln.gsub(/[^A-Za-z\.\-]/,
|
105
|
+
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
|
104
106
|
end
|
105
107
|
end
|
106
108
|
tmp_fh.print buffer.wrap_width(80)
|
@@ -124,7 +126,7 @@ class MiGA::MiGA
|
|
124
126
|
# - +:n50+: If true, it also returns the N50 and the median (in bp).
|
125
127
|
# - +gc+: If true, it also returns the G+C content (in %).
|
126
128
|
def self.seqs_length(file, format, opts={})
|
127
|
-
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file,
|
129
|
+
fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
|
128
130
|
l = []
|
129
131
|
gc = 0
|
130
132
|
i = 0 # <- Zlib::GzipReader doesn't set $.
|
@@ -154,12 +156,11 @@ class MiGA::MiGA
|
|
154
156
|
break if pos >= thr
|
155
157
|
end
|
156
158
|
o[:med] = o[:n].even? ?
|
157
|
-
0.5*
|
159
|
+
0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
|
158
160
|
end
|
159
161
|
o
|
160
162
|
end
|
161
|
-
|
162
|
-
|
163
|
+
|
163
164
|
##
|
164
165
|
# Path to a script to be executed for +task+. Supported +opts+ are:
|
165
166
|
# - +:miga+ Path to the MiGA home to use. If not passed, the home of the
|
@@ -178,17 +179,15 @@ class MiGA::MiGA
|
|
178
179
|
File.expand_path("scripts/#{task}.bash", opts[:miga])
|
179
180
|
end
|
180
181
|
|
181
|
-
|
182
182
|
##
|
183
183
|
# Check if the result files exist with +base+ name (String) followed by the
|
184
184
|
# +ext+ values (Array of String).
|
185
185
|
def result_files_exist?(base, ext)
  suffixes = ext.is_a?(Array) ? ext : [ext]
  # A suffix counts as present if the file exists plain or gzipped; the check
  # fails as soon as one suffix has neither.
  suffixes.none? do |suffix|
    !File.exist?(base + suffix) && !File.exist?("#{base}#{suffix}.gz")
  end
end
|
191
|
-
|
192
191
|
end
|
193
192
|
|
194
193
|
##
|
@@ -212,29 +211,36 @@ class File
|
|
212
211
|
raise "Unknown transfer method: #{method}."
|
213
212
|
end
|
214
213
|
end
|
215
|
-
|
216
214
|
end
|
217
215
|
|
218
216
|
##
|
219
217
|
# MiGA extensions to the String class.
|
220
218
|
class String
|
221
|
-
|
219
|
+
|
222
220
|
##
|
223
221
|
# Replace any character not allowed in a MiGA name for underscore (_). This
|
224
222
|
# results in a MiGA-compliant name EXCEPT for empty strings, that results in
|
225
223
|
# empty strings.
|
226
|
-
def miga_name
|
224
|
+
def miga_name
  # \W is the complement of [A-Za-z0-9_] here: Ruby's \w is ASCII-only
  # unless Unicode matching is explicitly requested.
  gsub(/\W/, '_')
end
|
227
227
|
|
228
228
|
##
|
229
229
|
# Is the string a MiGA-compliant name?
|
230
|
-
def miga_name?
|
230
|
+
def miga_name?
  # \A and \z anchor the whole string. The line anchors ^ and $ would accept
  # any string containing at least one compliant line (e.g. "ok\nnot ok") as
  # well as names with a trailing newline.
  !(self !~ /\A[A-Za-z0-9_]+\z/)
end
|
231
233
|
|
232
234
|
##
|
233
235
|
# Replace underscores by spaces or dots (depending on context).
|
234
|
-
def unmiga_name
|
235
|
-
|
236
|
+
def unmiga_name
  # Restore the dot of the abbreviations "str", "sp", "subsp" and "pv" when
  # they are followed by a double underscore, then turn every remaining
  # underscore into a space.
  dotted = gsub(/_(str|sp|subsp|pv)__/) { "_#{Regexp.last_match(1)}._" }
  dotted.tr('_', ' ')
end
|
239
|
+
|
236
240
|
##
|
237
241
|
# Wraps the string with fixed Integer +width+.
|
238
|
-
def wrap_width(width)
|
239
|
-
|
242
|
+
def wrap_width(width)
  # Every run of up to +width+ non-newline characters gets a line break
  # appended; line breaks already present are left in place.
  gsub(/[^\n\r]{1,#{width}}/) { |chunk| "#{chunk}\n" }
end
|
240
245
|
end
|
246
|
+
|
data/lib/miga/daemon.rb
CHANGED
@@ -191,7 +191,7 @@ class MiGA::Daemon < MiGA::MiGA
|
|
191
191
|
log_dir = File.expand_path("daemon/#{job}", project.path)
|
192
192
|
Dir.mkdir(log_dir) unless Dir.exist? log_dir
|
193
193
|
task_name = "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}"
|
194
|
-
to_run = {ds: ds, job: job, task_name: task_name,
|
194
|
+
to_run = {ds: ds, ds_name: ds_name, job: job, task_name: task_name,
|
195
195
|
cmd: sprintf(runopts(:cmd),
|
196
196
|
# 1: script
|
197
197
|
MiGA::MiGA.script_path(job, miga:vars['MIGA'], project:project),
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -42,7 +42,9 @@ module MiGA::Dataset::Result
|
|
42
42
|
dir = @@RESULT_DIRS[result_type]
|
43
43
|
return nil if dir.nil?
|
44
44
|
base = File.expand_path("data/#{dir}/#{name}", project.path)
|
45
|
-
|
45
|
+
if opts[:force]
|
46
|
+
FileUtils.rm("#{base}.json") if File.exist?("#{base}.json")
|
47
|
+
else
|
46
48
|
r_pre = MiGA::Result.load("#{base}.json")
|
47
49
|
return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
|
48
50
|
end
|
@@ -204,7 +206,7 @@ module MiGA::Dataset::Result
|
|
204
206
|
##
|
205
207
|
# Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
|
206
208
|
def add_result_cds(base, opts)
|
207
|
-
return nil unless result_files_exist?(base, %w[.faa
|
209
|
+
return nil unless result_files_exist?(base, %w[.faa])
|
208
210
|
r = MiGA::Result.new("#{base}.json")
|
209
211
|
r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
|
210
212
|
gff2:".gff2", gff3:".gff3", tab:".tab")
|
@@ -266,13 +268,15 @@ module MiGA::Dataset::Result
|
|
266
268
|
def add_result_mytaxa_scan(base, _opts)
|
267
269
|
if is_nonmulti?
|
268
270
|
return nil unless
|
269
|
-
result_files_exist?(base, %w[.pdf .
|
270
|
-
result_files_exist?(base,
|
271
|
+
result_files_exist?(base, %w[.pdf .mytaxa]) or
|
272
|
+
result_files_exist?(base, '.nomytaxa.txt')
|
271
273
|
r = MiGA::Result.new("#{base}.json")
|
272
|
-
add_files_to_ds_result(r, name,
|
273
|
-
|
274
|
-
|
275
|
-
|
274
|
+
add_files_to_ds_result(r, name, nomytaxa: '.nomytaxa.txt',
|
275
|
+
mytaxa: '.mytaxa', report: '.pdf', regions_archive: '.reg.tar',
|
276
|
+
# Intermediate / Deprecated
|
277
|
+
blast: '.blast', mytaxain: '.mytaxain', wintax: '.wintax',
|
278
|
+
gene_ids: '.wintax.genes', region_ids: '.wintax.regions',
|
279
|
+
regions: '.reg')
|
276
280
|
else
|
277
281
|
MiGA::Result.new("#{base}.json")
|
278
282
|
end
|
data/lib/miga/result.rb
CHANGED
@@ -21,10 +21,6 @@ class MiGA::Result < MiGA::MiGA
|
|
21
21
|
|
22
22
|
# Instance-level
|
23
23
|
|
24
|
-
##
|
25
|
-
# Path to the JSON file describing the result.
|
26
|
-
attr_reader :path
|
27
|
-
|
28
24
|
##
|
29
25
|
# Hash with the result metadata.
|
30
26
|
attr_reader :data
|
@@ -48,6 +44,22 @@ class MiGA::Result < MiGA::MiGA
|
|
48
44
|
# Register the result as cleaned.
|
49
45
|
def clean! ; self[:clean] = true ; end
|
50
46
|
|
47
|
+
##
|
48
|
+
# Path to the standard files of the result. +which+ must be one of:
|
49
|
+
# - :json (default) : JSON file describing the result.
|
50
|
+
# - :start : File with the date when the processing started.
|
51
|
+
# - :done : File with the date when the processing ended.
|
52
|
+
def path(which=:json)
  kind = which.to_sym
  return @path if kind == :json
  # The :start and :done files sit next to the JSON file and share its base
  # name; any other value of +which+ yields nil.
  @path.sub(/\.json$/, ".#{kind}") if [:start, :done].include?(kind)
end
|
62
|
+
|
51
63
|
##
|
52
64
|
# Directory containing the result.
|
53
65
|
def dir
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3, 1,
|
13
|
+
VERSION = [0.3, 1, 6]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -37,7 +37,10 @@ else
|
|
37
37
|
fi
|
38
38
|
|
39
39
|
# Reduce files
|
40
|
-
( cd "${DATASET}.ess"
|
40
|
+
( cd "${DATASET}.ess" \
|
41
|
+
&& exists *.faa \
|
42
|
+
&& tar -zcf proteins.tar.gz *.faa \
|
43
|
+
&& rm *.faa )
|
41
44
|
|
42
45
|
# Finalize
|
43
46
|
miga date > "$DATASET.done"
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -75,14 +75,17 @@ else
|
|
75
75
|
FastA.filter.pl -q "$DATASET.reg/$i.ids" \
|
76
76
|
"../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
|
77
77
|
done
|
78
|
+
# Archive regions
|
79
|
+
tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
|
80
|
+
rm -r "$DATASET.reg"
|
78
81
|
fi
|
79
82
|
|
80
83
|
# Clean
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
[[ -s "$DATASET.
|
85
|
-
&& gzip -9 -f "$DATASET.
|
84
|
+
for x in daa blast mytaxain wintax wintax.genes wintax.regions ; do
|
85
|
+
[[ -e "$DATASET.$x" ]] && rm "$DATASET.$x"
|
86
|
+
done
|
87
|
+
[[ -s "$DATASET.mytaxa" && ! -s "$DATASET.mytaxa.gz" ]] \
|
88
|
+
&& gzip -9 -f "$DATASET.mytaxa"
|
86
89
|
fi
|
87
90
|
|
88
91
|
fi
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1.
|
4
|
+
version: 0.3.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
@@ -130,6 +130,7 @@ files:
|
|
130
130
|
- actions/init.rb
|
131
131
|
- actions/ln.rb
|
132
132
|
- actions/ls.rb
|
133
|
+
- actions/ncbi_get.rb
|
133
134
|
- actions/new.rb
|
134
135
|
- actions/plugins.rb
|
135
136
|
- actions/rm.rb
|
@@ -196,9 +197,6 @@ files:
|
|
196
197
|
- utils/adapters.fa
|
197
198
|
- utils/arch-ess-genes.rb
|
198
199
|
- utils/core-pan-plot.R
|
199
|
-
- utils/distances.rb
|
200
|
-
- utils/distances/functions.rb
|
201
|
-
- utils/distances/ref-nomulti.rb
|
202
200
|
- utils/enveomics/Examples/aai-matrix.bash
|
203
201
|
- utils/enveomics/Examples/ani-matrix.bash
|
204
202
|
- utils/enveomics/Examples/essential-phylogeny.bash
|
@@ -1,58 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'sqlite3'
|
3
|
-
|
4
|
-
$opts = {}
|
5
|
-
if ENV["MIGA_AAI_SAVE_RBM"].nil?
|
6
|
-
$opts[:aai_save_rbm] = $project.is_clade? ? "save-rbm" : "no-save-rbm"
|
7
|
-
else
|
8
|
-
$opts[:aai_save_rbm] = ENV["MIGA_AAI_SAVE_RBM"]
|
9
|
-
end
|
10
|
-
$opts[:thr] = ENV["CORES"].nil? ? 2 : ENV["CORES"].to_i
|
11
|
-
|
12
|
-
def ani(f1, f2, db, opts={})
|
13
|
-
opts = $opts.merge(opts)
|
14
|
-
v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
|
15
|
-
-t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm --lookup-first`
|
16
|
-
v.nil? or v.empty? ? 0 : v.to_f
|
17
|
-
end
|
18
|
-
|
19
|
-
def make_empty_aai_db(db)
|
20
|
-
SQLite3::Database.new(db) do |conn|
|
21
|
-
conn.execute "create table if not exists aai(" +
|
22
|
-
"seq1 varchar(256), seq2 varchar(256), " +
|
23
|
-
"aai float, sd float, n int omega int" +
|
24
|
-
")"
|
25
|
-
end unless File.size?(db)
|
26
|
-
end
|
27
|
-
|
28
|
-
def aai(f1, f2, db, opts={})
|
29
|
-
opts = $opts.merge(opts)
|
30
|
-
v = `aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
|
31
|
-
-t "#{opts[:thr]}" -a --lookup-first "--#{opts[:aai_save_rbm]}"`.chomp
|
32
|
-
v.nil? or v.empty? ? 0 : v.to_f
|
33
|
-
end
|
34
|
-
|
35
|
-
def haai(f1, f2, db, aai_db, opts={})
|
36
|
-
opts = $opts.merge(opts)
|
37
|
-
haai = aai(f1, f2, db, aai_save_rbm: "no-save-rbm")
|
38
|
-
return 0 if haai.nil? or haai == 0 or haai > 90.0
|
39
|
-
aai = 100.0 - Math.exp(2.435076 + 0.4275193*Math.log(100.0-haai))
|
40
|
-
make_empty_aai_db(aai_db)
|
41
|
-
SQLite3::Database.new(db) do |conn|
|
42
|
-
conn.execute "insert into aai values(?, ?, ?, 0, 0, 0)",
|
43
|
-
[ds_name(f1), ds_name(f2), aai]
|
44
|
-
end
|
45
|
-
aai
|
46
|
-
end
|
47
|
-
|
48
|
-
def haai_or_aai(f1_h, f2_h, db_h, f1, f2, db, opts={})
|
49
|
-
haai=haai(f1_h, f2_h, db_h, db, opts)
|
50
|
-
aai = aai(f1, f2, db, opts) if aai.nil? or aai.zero?
|
51
|
-
aai
|
52
|
-
end
|
53
|
-
|
54
|
-
def val_from_db(n1, n2, db, metric)
|
55
|
-
SQLite3::Database.new(db) do |conn|
|
56
|
-
return conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first.first
|
57
|
-
end if File.size? db
|
58
|
-
end
|
data/utils/distances.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'miga'
|
4
|
-
|
5
|
-
$project = MiGA::Project.load(ARGV.shift)
|
6
|
-
$dataset = $project.dataset(ARGV.shift)
|
7
|
-
opts = Hash[ARGV]
|
8
|
-
|
9
|
-
exit(0) if dataset.is_multi?
|
10
|
-
|
11
|
-
if dataset.is_ref?
|
12
|
-
require_relative 'distances/ref-nomulti.rb'
|
13
|
-
else
|
14
|
-
require_relative 'distances/noref-nomulti.rb'
|
15
|
-
end
|
16
|
-
# TODO run_distances!!!
|