miga-base 0.3.1.5 → 0.3.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 36bdee987135a432485bcb9eb49895ad22c19870
4
- data.tar.gz: 7409617014eb5adfb0adba8cdaa53c80db185db4
3
+ metadata.gz: 4ba91c88b5e9a25633e5633e344fc013b2ab0d6e
4
+ data.tar.gz: 24e7c2426c9ad4c86da246c3b4c76a94ed378aa9
5
5
  SHA512:
6
- metadata.gz: f9000a92772143755f48e3c27e69cd9c42533cbd8946682239c3d3f9e8596342d4b5214d86d84363fa77dc8f8947915b764e3bd861431151a0dd86299db5fce3
7
- data.tar.gz: 35b4eb28a73c23f601bfcac9777c25cf7116a69654f3b99a95e46118256041f2d0e2ec89dca379d97e46c6bae87fd5b64e481895e9f305573722905da6c3f22e
6
+ metadata.gz: bba625b0f7777a8aed26b0e3e55f16ada263c01e371f29ba7c89aa80f23afd5db9ae58c41abd1d0e37c6aaa17b924d00d487c6a09e222cd8d27b9e1394cd59d9
7
+ data.tar.gz: e906281a4ccf5b8f8505d6e7d5a342d44f16505c64602f1af4d65e3e09f456b21487c9c8273f84ec2c9cb2a009de13efb3b1fe20d65d879a2510abbbac7825b8
data/actions/doctor.rb CHANGED
@@ -3,12 +3,27 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- o = {q:true, v:false}
6
+ require "sqlite3"
7
+
8
+ o = {q:true, ld:false,
9
+ db: true, dist: true, files: true, ess: true, mts: true, tax: true}
7
10
  OptionParser.new do |opt|
8
11
  opt_banner(opt)
9
12
  opt_object(opt, o, [:project])
10
- opt.on("-v", "--verbose",
11
- "Print additional information on advance."){ |v| o[:v]=v }
13
+ opt.on("-l", "--list-datasets",
14
+ "List all fixed datasets on advance."){ |v| o[:ld]=v }
15
+ opt.on("--ignore-databases",
16
+ "Do not check database files integrity."){ |v| o[:db]=!v }
17
+ opt.on("--ignore-distances",
18
+ "Do not check distance tables."){ |v| o[:dist]=!v }
19
+ opt.on("--ignore-files",
20
+ "Do not check for outdated files."){ |v| o[:files]=!v }
21
+ opt.on("--ignore-essential-genes",
22
+ "Do not check unarchived essential genes."){ |v| o[:ess]=!v }
23
+ opt.on("--ignore-mytaxa-scan",
24
+ "Do not check unarchived MyTaxa scan."){ |v| o[:mts]=!v }
25
+ opt.on("--ignore-taxonomy",
26
+ "Do not check taxonomy consistency."){ |v| o[:tax]=!v }
12
27
  opt_common(opt, o)
13
28
  end.parse!
14
29
 
@@ -19,60 +34,150 @@ $stderr.puts "Loading project" unless o[:q]
19
34
  p = MiGA::Project.load(o[:project])
20
35
  raise "Impossible to load project: #{o[:project]}" if p.nil?
21
36
 
37
+ def check_sqlite3_database(db_file, metric)
38
+ begin
39
+ SQLite3::Database.new(db_file) do |conn|
40
+ conn.execute("select count(*) from #{metric}").first
41
+ end
42
+ rescue SQLite3::SQLException
43
+ yield
44
+ end
45
+ end
46
+
47
+ if o[:db]
48
+ $stderr.puts "o Checking databases integrity" unless o[:q]
49
+ p.each_dataset do |d|
50
+ [:distances, :taxonomy].each do |r_key|
51
+ r = d.result(r_key) or next
52
+ {haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
53
+ db_file = r.file_path(db_key) or next
54
+ check_sqlite3_database(db_file, metric) do
55
+ $stderr.puts(
56
+ " > Removing #{db_key} #{r_key} table for #{d.name}.") if o[:ld]
57
+ [db_file, r.path(:done), r.path].each do |f|
58
+ File.unlink f if File.exist? f
59
+ end # each |f|
60
+ end # check_sqlite3_database
61
+ end # each |db_key, metric|
62
+ end # each |r_key|
63
+ end # each |d|
64
+ end
65
+
22
66
  [:ani, :aai].each do |dist|
23
- r = p.result("#{dist}_distances")
24
- next if r.nil?
67
+ res = p.result("#{dist}_distances")
68
+ next if res.nil?
25
69
  $stderr.puts "o Checking #{dist} table for consistent datasets" unless o[:q]
26
- ok = true
70
+ notok = {}
27
71
  fix = {}
28
- Zlib::GzipReader.open(r.file_path(:matrix)) do |fh|
72
+ Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
73
+ lineno = 0
29
74
  fh.each_line do |ln|
30
- next if $.==1
75
+ next if (lineno+=1)==1
31
76
  r = ln.split("\t")
32
- if p.dataset(r[1]).nil? or p.dataset(r[2]).nil?
33
- fix[r[2]] = true unless p.dataset(r[2]).nil?
34
- fix[r[1]] = true unless p.dataset(r[1]).nil?
35
- ok = false
77
+ if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
78
+ [1,2].each do |i|
79
+ if p.dataset(r[i]).nil?
80
+ notok[r[i]] = true
81
+ else
82
+ fix[r[i]] = true
83
+ end
84
+ end
36
85
  end
37
86
  end
38
87
  end
39
88
 
40
89
  $stderr.puts " - Fixing #{fix.size} datasets" unless fix.empty? or o[:q]
41
90
  fix.keys.each do |d_n|
42
- $stderr.puts " > Fixing #{d_n}." if o[:v]
91
+ $stderr.puts " > Fixing #{d_n}." if o[:ld]
43
92
  p.dataset(d_n).cleanup_distances!
44
93
  end
45
94
 
46
- unless ok
47
- $stderr.puts " - Removing tables, recompute" unless o[:q]
48
- r.remove!
95
+ unless notok.empty?
96
+ unless o[:q]
97
+ $stderr.puts " - Unregistered datasets detected: "
98
+ if notok.size < 3
99
+ $stderr.puts " - #{notok.keys.join(", ")}"
100
+ else
101
+ $stderr.puts " - #{notok.keys.first} and other #{notok.size-1}"
102
+ end
103
+ $stderr.puts " - Removing tables, recompute"
104
+ end
105
+ res.remove!
49
106
  end
50
- end
107
+ end if o[:dist]
51
108
 
52
- $stderr.puts "o Looking for outdated files in results" unless o[:q]
53
- p.each_dataset do |d|
54
- d.each_result do |r_k, r|
55
- ok = true
56
- r.each_file do |_f_sym, _f_rel, f_abs|
57
- unless File.exist? f_abs
58
- ok = false
59
- break
109
+ if o[:files]
110
+ $stderr.puts "o Looking for outdated files in results" unless o[:q]
111
+ p.each_dataset do |d|
112
+ d.each_result do |r_k, r|
113
+ ok = true
114
+ r.each_file do |_f_sym, _f_rel, f_abs|
115
+ unless File.exist? f_abs
116
+ ok = false
117
+ break
118
+ end
119
+ end
120
+ unless ok
121
+ $stderr.puts " > Registering again #{d.name}:#{r_k}" if o[:ld]
122
+ d.add_result(r_k, true, force:true)
60
123
  end
61
124
  end
62
- unless ok
63
- $stderr.puts " - Registering again #{d.name}:#{r_k}" if o[:v]
64
- d.add_result(r_k, true, force:true)
125
+ end
126
+ end
127
+
128
+ if o[:ess]
129
+ $stderr.puts "o Looking for unarchived essential genes." unless o[:q]
130
+ p.each_dataset do |d|
131
+ res = d.result(:essential_genes)
132
+ next if res.nil?
133
+ dir = res.file_path(:collection)
134
+ if dir.nil?
135
+ $stderr.puts " > Removing #{d.name}:essential_genes" if o[:ld]
136
+ res.remove!
137
+ next
138
+ end
139
+ unless Dir["#{dir}/*.faa"].empty?
140
+ $stderr.puts " > Fixing #{d.name}." if o[:ld]
141
+ cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
142
+ warn cmdo unless cmdo.empty?
65
143
  end
66
144
  end
67
145
  end
68
146
 
69
- #$stderr.puts "o Looking for unarchived essential genes." unless o[:q]
70
- #p.each_dataset do |d|
71
- # TODO: Check unarchived protein files
72
- #end
147
+ if o[:mts]
148
+ $stderr.puts "o Looking for unarchived MyTaxa Scan runs." unless o[:q]
149
+ p.each_dataset do |d|
150
+ res = d.result(:mytaxa_scan)
151
+ next if res.nil?
152
+ dir = res.file_path(:regions)
153
+ fix = false
154
+ unless dir.nil?
155
+ if Dir.exist? dir
156
+ cmdo = `cd '#{dir}/..' \
157
+ && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
158
+ && rm -r '#{d.name}.reg'`.chomp
159
+ warn cmdo unless cmdo.empty?
160
+ end
161
+ fix = true
162
+ end
163
+ %w[blast mytaxain wintax gene_ids region_ids].each do |ext|
164
+ file = res.file_path(ext.to_sym)
165
+ unless file.nil?
166
+ FileUtils.rm(file) if File.exist? file
167
+ fix = true
168
+ end
169
+ end
170
+ if fix
171
+ $stderr.puts " > Fixing #{d.name}." if o[:ld]
172
+ d.add_result(:mytaxa_scan, true, force: true)
173
+ end
174
+ end
175
+ end
73
176
 
74
- #$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
75
- # TODO: Find 95%ANI clusters with entries from different species
177
+ if o[:tax]
178
+ #$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
179
+ # TODO: Find 95%ANI clusters with entries from different species
180
+ end
76
181
 
77
182
  $stderr.puts "Done" unless o[:q]
78
183
 
data/actions/get.rb CHANGED
@@ -30,8 +30,6 @@ OptionParser.new do |opt|
30
30
  "If set, ignores datasets that already exist."){ |v| o[:ignore_dup]=v }
31
31
  opt.on("-d", "--description STRING",
32
32
  "Description of the dataset."){ |v| o[:description]=v }
33
- opt.on("-u", "--user STRING",
34
- "Owner of the dataset."){ |v| o[:user]=v }
35
33
  opt.on("-c", "--comments STRING",
36
34
  "Comments on the dataset."){ |v| o[:comments]=v }
37
35
  opt_common(opt, o)
@@ -68,7 +66,7 @@ glob.each do |o_i|
68
66
  raise "Impossible to load project: #{o_i[:project]}" if p.nil?
69
67
 
70
68
  next if o_i[:ignore_dup] and not p.dataset(o_i[:dataset]).nil?
71
-
69
+
72
70
  $stderr.puts "Locating remote dataset." unless o_i[:q]
73
71
  rd = MiGA::RemoteDataset.new(o_i[:ids], o_i[:db], o_i[:universe])
74
72
 
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ require 'miga/remote_dataset'
7
+
8
+ o = {q:true, query:false, unlink:false,
9
+ reference: false, ignore_plasmids: false,
10
+ complete:false, chromosome:false,
11
+ scaffold:false, contig:false,}
12
+ OptionParser.new do |opt|
13
+ opt_banner(opt)
14
+ opt_object(opt, o, [:project])
15
+ opt.on("-T", "--taxon STRING",
16
+ "(Mandatory unless --reference) Name of the taxon (e.g., a species binomial)."
17
+ ){ |v| o[:taxon]=v }
18
+ opt.on("--reference",
19
+ "Download all reference genomes (ignores -T)."){ |v| o[:reference]=v }
20
+ opt.on("--ref-no-plasmids",
21
+ "If passed, ignores plasmids (only for --reference)."
22
+ ){ |v| o[:ignore_plasmids]=v }
23
+ opt.on("--complete", "Download complete genomes."){ |v| o[:complete]=v }
24
+ opt.on("--chromosome", "Download complete chromosomes."){ |v| o[:chromosome]=v }
25
+ opt.on("--scaffold", "Download genomes in scaffolds."){ |v| o[:scaffold]=v }
26
+ opt.on("--contig", "Download genomes in contigs."){ |v| o[:contig]=v }
27
+ opt.on("--all", "Download all genomes (in any status).") do
28
+ o[:complete] = true
29
+ o[:chromosome] = true
30
+ o[:scaffold] = true
31
+ o[:contig] = true
32
+ end
33
+ opt.on("-q", "--query",
34
+ "If set, the datasets are registered as queries, not reference datasets."
35
+ ){ |v| o[:query]=v }
36
+ opt.on("-u", "--unlink",
37
+ "If set, unlinks all datasets in the project missing from the download list."
38
+ ){ |v| o[:unlink]=v }
39
+ opt.on("-R", "--remote-list PATH",
40
+ "Path to an output file with the list of all datasets listed remotely."
41
+ ){ |v| o[:remote_list]=v }
42
+ opt_common(opt, o)
43
+ end.parse!
44
+
45
+ opt_require(o, project: "-P")
46
+ opt_require(o, taxon: "-T") unless o[:reference]
47
+ unless %w[reference complete chromosome scaffold contig].any?{ |i| o[i.to_sym] }
48
+ raise "No action requested. Pick at least one type of genome"
49
+ end
50
+
51
+ ##=> Main <=
52
+ $stderr.puts "Loading project." unless o[:q]
53
+ p = MiGA::Project.load(o[:project])
54
+ raise "Impossible to load project: #{o[:project]}" if p.nil?
55
+ d = []
56
+ ds = {}
57
+ downloaded = 0
58
+
59
+ def get_list(taxon, status)
60
+ url_base = "https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?"
61
+ url_param = if status==:reference
62
+ { action: "refgenomes", download: "on" }
63
+ else
64
+ { action: "download", report: "proks", group: "-- All Prokaryotes --",
65
+ subgroup: "-- All Prokaryotes --", orgn: "#{taxon}[orgn]",
66
+ status: status }
67
+ end
68
+ url = url_base + URI.encode_www_form(url_param)
69
+ response = RestClient::Request.execute(method: :get, url:url, timeout:600)
70
+ unless response.code == 200
71
+ raise "Unable to reach NCBI, error code #{response.code}."
72
+ end
73
+ response.to_s
74
+ end
75
+
76
+ # Download IDs with reference status
77
+ if o[:reference]
78
+ $stderr.puts "Downloading reference genomes" unless o[:q]
79
+ lineno = 0
80
+ get_list(nil, :reference).each_line do |ln|
81
+ next if (lineno+=1)==1
82
+ r = ln.chomp.split("\t")
83
+ next if r[3].nil? or r[3].empty?
84
+ ids = r[3].split(",")
85
+ ids += r[5].split(",") unless o[:ignore_plasmids] or r[5].empty?
86
+ n = r[2].miga_name
87
+ ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
88
+ end
89
+ end
90
+
91
+ # Download IDs with complete or chromosome status
92
+ if o[:complete] or o[:chromosome]
93
+ status = (o[:complete] and o[:chromosome] ? "50|40" : o[:complete] ? "50" : "40")
94
+ $stderr.puts "Downloading complete/chromosome genomes" unless o[:q]
95
+ lineno = 0
96
+ get_list(o[:taxon], status).each_line do |ln|
97
+ next if (lineno+=1)==1
98
+ r = ln.chomp.split("\t")
99
+ next if r[10].nil? or r[10].empty?
100
+ ids = r[10].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").split(";")
101
+ n = (r[0] + "_" + ids[0]).miga_name
102
+ ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
103
+ end
104
+ end
105
+
106
+ # Download IDs with scaffold or contig status
107
+ if o[:scaffold] or o[:contig]
108
+ status = (o[:scaffold] and o[:contig] ? "30|20" : o[:scaffold] ? "30" : "20")
109
+ $stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
110
+ lineno = 0
111
+ get_list(o[:taxon], status).each_line do |ln|
112
+ next if (lineno+=1)==1
113
+ r = ln.chomp.split("\t")
114
+ next if r[7].nil? or r[7].empty?
115
+ next if r[19].nil? or r[19].empty?
116
+ asm = r[7].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").gsub(/\s/,"")
117
+ ids = r[19].gsub(/\s/, "").split(";").map{ |i| i + "/" + File.basename(i) + "_genomic.fna.gz" }
118
+ n = (r[0] + "_" + asm).miga_name
119
+ comm = "Assembly: #{asm}"
120
+ ds[n] = {ids: ids, md: {type: :genome, comments: comm}, db: :assembly_gz, universe: :web}
121
+ end
122
+ end
123
+
124
+ # Download entries
125
+ $stderr.puts "Downloading #{ds.size} #{ds.size>1 ? "entries" : "entry"}." unless o[:q]
126
+ ds.each do |name,body|
127
+ d << name
128
+ puts name
129
+ next unless p.dataset(name).nil?
130
+ $stderr.puts " Locating remote dataset." unless o[:q]
131
+ rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
132
+ $stderr.puts " Creating dataset." unless o[:q]
133
+ rd.save_to(p, name, !o[:query], body[:md])
134
+ p.add_dataset(name)
135
+ downloaded += 1
136
+ end
137
+
138
+ # Finalize
139
+ $stderr.puts "Datasets listed: #{d.size}" unless o[:q]
140
+ $stderr.puts "Datasets downloaded: #{downloaded}" unless o[:q]
141
+ unless o[:remote_list].nil?
142
+ File.open(o[:remote_list], 'w') do |fh|
143
+ d.each { |i| fh.puts i }
144
+ end
145
+ end
146
+ if o[:unlink]
147
+ unlink = p.dataset_names - d
148
+ unlink.each { |i| p.unlink_dataset(i).remove! }
149
+ $stderr.puts "Datasets unlinked: #{unlink.size}" unless o[:q]
150
+ end
151
+
data/bin/miga CHANGED
@@ -19,6 +19,7 @@ $task_desc = {
19
19
  # Datasets
20
20
  add: "Creates an empty dataset in a pre-existing MiGA project.",
21
21
  get: "Downloads a dataset from public databases into a MiGA project.",
22
+ ncbi_get: "Downloads all genomes in a taxon or RefSeq from NCBI into a MiGA project.",
22
23
  rm: "Removes a dataset from an MiGA project.",
23
24
  find: "Finds unregistered datasets based on result files.",
24
25
  ln: "Link datasets (including results) from one project to another.",
@@ -102,7 +103,7 @@ def opt_object(opt, o, what=[:project, :dataset])
102
103
  what.include? :dataset_type_req
103
104
  opt.on("-t", "--type STRING",
104
105
  (what.include?(:project_type_req) ? "(Mandatory) " : "") +
105
- "Type of project. Recognized types include:",
106
+ "Type of project. Recognized types include:",
106
107
  *MiGA::Project.KNOWN_TYPES.map{ |k,v| "~ #{k}: #{v[:description]}"}
107
108
  ){ |v| o[:type]=v.to_sym } if what.include? :project_type or
108
109
  what.include? :project_type_req
@@ -228,4 +229,3 @@ generic options:
228
229
 
229
230
  HELP
230
231
  end
231
-
data/lib/miga/common.rb CHANGED
@@ -1,21 +1,23 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require "miga/version"
5
- require "json"
6
- require "tempfile"
7
- require "zlib"
4
+ require 'miga/version'
5
+ require 'json'
6
+ require 'tempfile'
7
+ require 'zlib'
8
8
 
9
9
  ##
10
10
  # Generic class used to handle system-wide information and methods, and parent
11
11
  # of all other MiGA::* classes.
12
12
  class MiGA::MiGA
13
13
 
14
- ENV["MIGA_HOME"] ||= ENV["HOME"]
14
+ ENV['MIGA_HOME'] ||= ENV['HOME']
15
15
 
16
16
  ##
17
17
  # Root path to MiGA (as estimated from the location of the current file).
18
- def self.root_path ; File.expand_path("../../..", __FILE__) ; end
18
+ def self.root_path
19
+ File.expand_path('../../..', __FILE__)
20
+ end
19
21
 
20
22
  ##
21
23
  # Should debugging information be reported?
@@ -48,32 +50,32 @@ class MiGA::MiGA
48
50
 
49
51
  ##
50
52
  # Send debug message.
51
- def self.DEBUG *args
53
+ def self.DEBUG(*args)
52
54
  $stderr.puts(*args) if @@DEBUG
53
- $stderr.puts caller.map{|v| v.gsub(/^/," ")}.join("\n") if
54
- @@DEBUG_TRACE
55
+ if @@DEBUG_TRACE
56
+ $stderr.puts caller.map{ |v| v.gsub(/^/,' ') }.join("\n")
57
+ end
55
58
  end
56
59
 
57
60
  ##
58
61
  # Has MiGA been initialized?
59
62
  def self.initialized?
60
- File.exist?(File.expand_path(".miga_rc", ENV["MIGA_HOME"])) and
61
- File.exist?(File.expand_path(".miga_daemon.json", ENV["MIGA_HOME"]))
63
+ File.exist?(File.expand_path('.miga_rc', ENV['MIGA_HOME'])) and
64
+ File.exist?(File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
62
65
  end
63
66
 
64
67
  ##
65
68
  # Tabulates an +values+, and Array of Arrays, all with the same number of
66
69
  # entries as +header+. Returns an Array of String, one per line.
67
70
  def self.tabulate(header, values)
68
- fields = [header.map{ |h| h.to_s }]
69
- fields << fields.first.map{ |h| h.gsub(/\S/, "-") }
70
- fields += values.map{ |row| row.map{ |cell| cell.nil? ? "?" : cell.to_s } }
71
- clen = fields.map{ |row|
72
- row.map{ |cell| cell.length } }.transpose.map{ |col| col.max }
71
+ fields = [header.map(&:to_s)]
72
+ fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
73
+ fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
74
+ clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
73
75
  fields.map do |row|
74
76
  (0 .. clen.size-1).map do |col_n|
75
77
  col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
76
- end.join(" ")
78
+ end.join(' ')
77
79
  end
78
80
  end
79
81
 
@@ -82,25 +84,25 @@ class MiGA::MiGA
82
84
  def self.clean_fasta_file(file)
83
85
  tmp_fh = nil
84
86
  begin
85
- if (file =~ /\.gz/)
86
- tmp_path = Tempfile.new("MiGA.gz").tap{ |i| i.close }.path
87
+ if file =~ /\.gz/
88
+ tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
87
89
  tmp_fh = Zlib::GzipWriter.open(tmp_path)
88
90
  fh = Zlib::GzipReader.open(file)
89
91
  else
90
- tmp_fh = Tempfile.new("MiGA")
92
+ tmp_fh = Tempfile.new('MiGA')
91
93
  tmp_path = tmp_fh.path
92
- fh = File.open(file, "r")
94
+ fh = File.open(file, 'r')
93
95
  end
94
- buffer = ""
96
+ buffer = ''
95
97
  fh.each_line do |ln|
96
98
  ln.chomp!
97
99
  if ln =~ /^>\s*(\S+)(.*)/
98
100
  (id, df) = [$1, $2]
99
101
  tmp_fh.print buffer.wrap_width(80)
100
- buffer = ""
102
+ buffer = ''
101
103
  tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
102
104
  else
103
- buffer << ln.gsub(/[^A-Za-z\.\-]/, "")
105
+ buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
104
106
  end
105
107
  end
106
108
  tmp_fh.print buffer.wrap_width(80)
@@ -124,7 +126,7 @@ class MiGA::MiGA
124
126
  # - +:n50+: If true, it also returns the N50 and the median (in bp).
125
127
  # - +gc+: If true, it also returns the G+C content (in %).
126
128
  def self.seqs_length(file, format, opts={})
127
- fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, "r")
129
+ fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
128
130
  l = []
129
131
  gc = 0
130
132
  i = 0 # <- Zlib::GzipReader doesn't set $.
@@ -154,12 +156,11 @@ class MiGA::MiGA
154
156
  break if pos >= thr
155
157
  end
156
158
  o[:med] = o[:n].even? ?
157
- 0.5*( l[o[:n]/2-1,2].inject(:+) ) : l[(o[:n]-1)/2]
159
+ 0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
158
160
  end
159
161
  o
160
162
  end
161
-
162
-
163
+
163
164
  ##
164
165
  # Path to a script to be executed for +task+. Supported +opts+ are:
165
166
  # - +:miga+ Path to the MiGA home to use. If not passed, the home of the
@@ -178,17 +179,15 @@ class MiGA::MiGA
178
179
  File.expand_path("scripts/#{task}.bash", opts[:miga])
179
180
  end
180
181
 
181
-
182
182
  ##
183
183
  # Check if the result files exist with +base+ name (String) followed by the
184
184
  # +ext+ values (Array of String).
185
185
  def result_files_exist?(base, ext)
186
- ext = [ext] unless ext.kind_of? Array
186
+ ext = [ext] unless ext.is_a? Array
187
187
  ext.all? do |f|
188
188
  File.exist?(base + f) or File.exist?("#{base}#{f}.gz")
189
189
  end
190
190
  end
191
-
192
191
  end
193
192
 
194
193
  ##
@@ -212,29 +211,36 @@ class File
212
211
  raise "Unknown transfer method: #{method}."
213
212
  end
214
213
  end
215
-
216
214
  end
217
215
 
218
216
  ##
219
217
  # MiGA extensions to the String class.
220
218
  class String
221
-
219
+
222
220
  ##
223
221
  # Replace any character not allowed in a MiGA name for underscore (_). This
224
222
  # results in a MiGA-compliant name EXCEPT for empty strings, that results in
225
223
  # empty strings.
226
- def miga_name ; gsub(/[^A-Za-z0-9_]/, "_") ; end
224
+ def miga_name
225
+ gsub(/[^A-Za-z0-9_]/, '_')
226
+ end
227
227
 
228
228
  ##
229
229
  # Is the string a MiGA-compliant name?
230
- def miga_name? ; not(self !~ /^[A-Za-z0-9_]+$/) ; end
230
+ def miga_name?
231
+ !(self !~ /^[A-Za-z0-9_]+$/)
232
+ end
231
233
 
232
234
  ##
233
235
  # Replace underscores by spaces or dots (depending on context).
234
- def unmiga_name ; gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr("_", " ") ; end
235
-
236
+ def unmiga_name
237
+ gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
238
+ end
239
+
236
240
  ##
237
241
  # Wraps the string with fixed Integer +width+.
238
- def wrap_width(width) ; gsub(/([^\n\r]{1,#{width}})/,"\\1\n") ; end
239
-
242
+ def wrap_width(width)
243
+ gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
244
+ end
240
245
  end
246
+
data/lib/miga/daemon.rb CHANGED
@@ -191,7 +191,7 @@ class MiGA::Daemon < MiGA::MiGA
191
191
  log_dir = File.expand_path("daemon/#{job}", project.path)
192
192
  Dir.mkdir(log_dir) unless Dir.exist? log_dir
193
193
  task_name = "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}"
194
- to_run = {ds: ds, job: job, task_name: task_name,
194
+ to_run = {ds: ds, ds_name: ds_name, job: job, task_name: task_name,
195
195
  cmd: sprintf(runopts(:cmd),
196
196
  # 1: script
197
197
  MiGA::MiGA.script_path(job, miga:vars['MIGA'], project:project),
@@ -42,7 +42,9 @@ module MiGA::Dataset::Result
42
42
  dir = @@RESULT_DIRS[result_type]
43
43
  return nil if dir.nil?
44
44
  base = File.expand_path("data/#{dir}/#{name}", project.path)
45
- unless opts[:force]
45
+ if opts[:force]
46
+ FileUtils.rm("#{base}.json") if File.exist?("#{base}.json")
47
+ else
46
48
  r_pre = MiGA::Result.load("#{base}.json")
47
49
  return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
48
50
  end
@@ -204,7 +206,7 @@ module MiGA::Dataset::Result
204
206
  ##
205
207
  # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
206
208
  def add_result_cds(base, opts)
207
- return nil unless result_files_exist?(base, %w[.faa .fna])
209
+ return nil unless result_files_exist?(base, %w[.faa])
208
210
  r = MiGA::Result.new("#{base}.json")
209
211
  r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
210
212
  gff2:".gff2", gff3:".gff3", tab:".tab")
@@ -266,13 +268,15 @@ module MiGA::Dataset::Result
266
268
  def add_result_mytaxa_scan(base, _opts)
267
269
  if is_nonmulti?
268
270
  return nil unless
269
- result_files_exist?(base, %w[.pdf .wintax .mytaxa .reg]) or
270
- result_files_exist?(base, ".nomytaxa.txt")
271
+ result_files_exist?(base, %w[.pdf .mytaxa]) or
272
+ result_files_exist?(base, '.nomytaxa.txt')
271
273
  r = MiGA::Result.new("#{base}.json")
272
- add_files_to_ds_result(r, name, mytaxa:".mytaxa", wintax:".wintax",
273
- blast:".blast", mytaxain:".mytaxain", report:".pdf", regions:".reg",
274
- gene_ids:".wintax.genes", region_ids:".wintax.regions",
275
- nomytaxa:".nomytaxa.txt")
274
+ add_files_to_ds_result(r, name, nomytaxa: '.nomytaxa.txt',
275
+ mytaxa: '.mytaxa', report: '.pdf', regions_archive: '.reg.tar',
276
+ # Intermediate / Deprecated
277
+ blast: '.blast', mytaxain: '.mytaxain', wintax: '.wintax',
278
+ gene_ids: '.wintax.genes', region_ids: '.wintax.regions',
279
+ regions: '.reg')
276
280
  else
277
281
  MiGA::Result.new("#{base}.json")
278
282
  end
data/lib/miga/result.rb CHANGED
@@ -21,10 +21,6 @@ class MiGA::Result < MiGA::MiGA
21
21
 
22
22
  # Instance-level
23
23
 
24
- ##
25
- # Path to the JSON file describing the result.
26
- attr_reader :path
27
-
28
24
  ##
29
25
  # Hash with the result metadata.
30
26
  attr_reader :data
@@ -48,6 +44,22 @@ class MiGA::Result < MiGA::MiGA
48
44
  # Register the result as cleaned.
49
45
  def clean! ; self[:clean] = true ; end
50
46
 
47
+ ##
48
+ # Path to the standard files of the result. +which+ must be one of:
49
+ # - :json (default) : JSON file describing the result.
50
+ # - :start : File with the date when the processing started.
51
+ # - :done : File with the date when the processing ended.
52
+ def path(which=:json)
53
+ case which.to_sym
54
+ when :json
55
+ @path
56
+ when :start
57
+ @path.sub(/\.json$/, ".start")
58
+ when :done
59
+ @path.sub(/\.json$/, ".done")
60
+ end
61
+ end
62
+
51
63
  ##
52
64
  # Directory containing the result.
53
65
  def dir
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 1, 5]
13
+ VERSION = [0.3, 1, 6]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -37,7 +37,10 @@ else
37
37
  fi
38
38
 
39
39
  # Reduce files
40
- ( cd "${DATASET}.ess" && tar -zcf proteins.tar.gz *.faa && rm *.faa )
40
+ ( cd "${DATASET}.ess" \
41
+ && exists *.faa \
42
+ && tar -zcf proteins.tar.gz *.faa \
43
+ && rm *.faa )
41
44
 
42
45
  # Finalize
43
46
  miga date > "$DATASET.done"
@@ -75,14 +75,17 @@ else
75
75
  FastA.filter.pl -q "$DATASET.reg/$i.ids" \
76
76
  "../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
77
77
  done
78
+ # Archive regions
79
+ tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
80
+ rm -r "$DATASET.reg"
78
81
  fi
79
82
 
80
83
  # Clean
81
- [[ -e "$DATASET.daa" ]] && rm "$DATASET.daa"
82
- [[ -s "$DATASET.blast" && ! -s "$DATASET.blast.gz" ]] \
83
- && gzip -9 -f "$DATASET.blast"
84
- [[ -s "$DATASET.mytaxain" && ! -s "$DATASET.mytaxain.gz" ]] \
85
- && gzip -9 -f "$DATASET.mytaxain"
84
+ for x in daa blast mytaxain wintax wintax.genes wintax.regions ; do
85
+ [[ -e "$DATASET.$x" ]] && rm "$DATASET.$x"
86
+ done
87
+ [[ -s "$DATASET.mytaxa" && ! -s "$DATASET.mytaxa.gz" ]] \
88
+ && gzip -9 -f "$DATASET.mytaxa"
86
89
  fi
87
90
 
88
91
  fi
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1.5
4
+ version: 0.3.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-31 00:00:00.000000000 Z
11
+ date: 2018-01-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rest-client
@@ -130,6 +130,7 @@ files:
130
130
  - actions/init.rb
131
131
  - actions/ln.rb
132
132
  - actions/ls.rb
133
+ - actions/ncbi_get.rb
133
134
  - actions/new.rb
134
135
  - actions/plugins.rb
135
136
  - actions/rm.rb
@@ -196,9 +197,6 @@ files:
196
197
  - utils/adapters.fa
197
198
  - utils/arch-ess-genes.rb
198
199
  - utils/core-pan-plot.R
199
- - utils/distances.rb
200
- - utils/distances/functions.rb
201
- - utils/distances/ref-nomulti.rb
202
200
  - utils/enveomics/Examples/aai-matrix.bash
203
201
  - utils/enveomics/Examples/ani-matrix.bash
204
202
  - utils/enveomics/Examples/essential-phylogeny.bash
@@ -1,58 +0,0 @@
1
-
2
- require 'sqlite3'
3
-
4
- $opts = {}
5
- if ENV["MIGA_AAI_SAVE_RBM"].nil?
6
- $opts[:aai_save_rbm] = $project.is_clade? ? "save-rbm" : "no-save-rbm"
7
- else
8
- $opts[:aai_save_rbm] = ENV["MIGA_AAI_SAVE_RBM"]
9
- end
10
- $opts[:thr] = ENV["CORES"].nil? ? 2 : ENV["CORES"].to_i
11
-
12
- def ani(f1, f2, db, opts={})
13
- opts = $opts.merge(opts)
14
- v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
15
- -t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm --lookup-first`
16
- v.nil? or v.empty? ? 0 : v.to_f
17
- end
18
-
19
- def make_empty_aai_db(db)
20
- SQLite3::Database.new(db) do |conn|
21
- conn.execute "create table if not exists aai(" +
22
- "seq1 varchar(256), seq2 varchar(256), " +
23
- "aai float, sd float, n int omega int" +
24
- ")"
25
- end unless File.size?(db)
26
- end
27
-
28
- def aai(f1, f2, db, opts={})
29
- opts = $opts.merge(opts)
30
- v = `aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
31
- -t "#{opts[:thr]}" -a --lookup-first "--#{opts[:aai_save_rbm]}"`.chomp
32
- v.nil? or v.empty? ? 0 : v.to_f
33
- end
34
-
35
- def haai(f1, f2, db, aai_db, opts={})
36
- opts = $opts.merge(opts)
37
- haai = aai(f1, f2, db, aai_save_rbm: "no-save-rbm")
38
- return 0 if haai.nil? or haai == 0 or haai > 90.0
39
- aai = 100.0 - Math.exp(2.435076 + 0.4275193*Math.log(100.0-haai))
40
- make_empty_aai_db(aai_db)
41
- SQLite3::Database.new(db) do |conn|
42
- conn.execute "insert into aai values(?, ?, ?, 0, 0, 0)",
43
- [ds_name(f1), ds_name(f2), aai]
44
- end
45
- aai
46
- end
47
-
48
- def haai_or_aai(f1_h, f2_h, db_h, f1, f2, db, opts={})
49
- haai=haai(f1_h, f2_h, db_h, db, opts)
50
- aai = aai(f1, f2, db, opts) if aai.nil? or aai.zero?
51
- aai
52
- end
53
-
54
- def val_from_db(n1, n2, db, metric)
55
- SQLite3::Database.new(db) do |conn|
56
- return conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first.first
57
- end if File.size? db
58
- end
@@ -1,2 +0,0 @@
1
-
2
- require_relative 'functions.rb'
data/utils/distances.rb DELETED
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'miga'
4
-
5
- $project = MiGA::Project.load(ARGV.shift)
6
- $dataset = $project.dataset(ARGV.shift)
7
- opts = Hash[ARGV]
8
-
9
- exit(0) if dataset.is_multi?
10
-
11
- if dataset.is_ref?
12
- require_relative 'distances/ref-nomulti.rb'
13
- else
14
- require_relative 'distances/noref-nomulti.rb'
15
- end
16
- # TODO run_distances!!!