miga-base 0.3.1.5 → 0.3.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 36bdee987135a432485bcb9eb49895ad22c19870
4
- data.tar.gz: 7409617014eb5adfb0adba8cdaa53c80db185db4
3
+ metadata.gz: 4ba91c88b5e9a25633e5633e344fc013b2ab0d6e
4
+ data.tar.gz: 24e7c2426c9ad4c86da246c3b4c76a94ed378aa9
5
5
  SHA512:
6
- metadata.gz: f9000a92772143755f48e3c27e69cd9c42533cbd8946682239c3d3f9e8596342d4b5214d86d84363fa77dc8f8947915b764e3bd861431151a0dd86299db5fce3
7
- data.tar.gz: 35b4eb28a73c23f601bfcac9777c25cf7116a69654f3b99a95e46118256041f2d0e2ec89dca379d97e46c6bae87fd5b64e481895e9f305573722905da6c3f22e
6
+ metadata.gz: bba625b0f7777a8aed26b0e3e55f16ada263c01e371f29ba7c89aa80f23afd5db9ae58c41abd1d0e37c6aaa17b924d00d487c6a09e222cd8d27b9e1394cd59d9
7
+ data.tar.gz: e906281a4ccf5b8f8505d6e7d5a342d44f16505c64602f1af4d65e3e09f456b21487c9c8273f84ec2c9cb2a009de13efb3b1fe20d65d879a2510abbbac7825b8
data/actions/doctor.rb CHANGED
@@ -3,12 +3,27 @@
3
3
  # @package MiGA
4
4
  # @license Artistic-2.0
5
5
 
6
- o = {q:true, v:false}
6
+ require "sqlite3"
7
+
8
+ o = {q:true, ld:false,
9
+ db: true, dist: true, files: true, ess: true, mts: true, tax: true}
7
10
  OptionParser.new do |opt|
8
11
  opt_banner(opt)
9
12
  opt_object(opt, o, [:project])
10
- opt.on("-v", "--verbose",
11
- "Print additional information on advance."){ |v| o[:v]=v }
13
+ opt.on("-l", "--list-datasets",
14
+ "List all fixed datasets on advance."){ |v| o[:ld]=v }
15
+ opt.on("--ignore-databases",
16
+ "Do not check database files integrity."){ |v| o[:db]=!v }
17
+ opt.on("--ignore-distances",
18
+ "Do not check distance tables."){ |v| o[:dist]=!v }
19
+ opt.on("--ignore-files",
20
+ "Do not check for outdated files."){ |v| o[:files]=!v }
21
+ opt.on("--ignore-essential-genes",
22
+ "Do not check unarchived essential genes."){ |v| o[:ess]=!v }
23
+ opt.on("--ignore-mytaxa-scan",
24
+ "Do not check unarchived MyTaxa scan."){ |v| o[:mts]=!v }
25
+ opt.on("--ignore-taxonomy",
26
+ "Do not check taxonomy consistency."){ |v| o[:tax]=!v }
12
27
  opt_common(opt, o)
13
28
  end.parse!
14
29
 
@@ -19,60 +34,150 @@ $stderr.puts "Loading project" unless o[:q]
19
34
  p = MiGA::Project.load(o[:project])
20
35
  raise "Impossible to load project: #{o[:project]}" if p.nil?
21
36
 
37
+ def check_sqlite3_database(db_file, metric)
38
+ begin
39
+ SQLite3::Database.new(db_file) do |conn|
40
+ conn.execute("select count(*) from #{metric}").first
41
+ end
42
+ rescue SQLite3::SQLException
43
+ yield
44
+ end
45
+ end
46
+
47
+ if o[:db]
48
+ $stderr.puts "o Checking databases integrity" unless o[:q]
49
+ p.each_dataset do |d|
50
+ [:distances, :taxonomy].each do |r_key|
51
+ r = d.result(r_key) or next
52
+ {haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
53
+ db_file = r.file_path(db_key) or next
54
+ check_sqlite3_database(db_file, metric) do
55
+ $stderr.puts(
56
+ " > Removing #{db_key} #{r_key} table for #{d.name}.") if o[:ld]
57
+ [db_file, r.path(:done), r.path].each do |f|
58
+ File.unlink f if File.exist? f
59
+ end # each |f|
60
+ end # check_sqlite3_database
61
+ end # each |db_key, metric|
62
+ end # each |r_key|
63
+ end # each |d|
64
+ end
65
+
22
66
  [:ani, :aai].each do |dist|
23
- r = p.result("#{dist}_distances")
24
- next if r.nil?
67
+ res = p.result("#{dist}_distances")
68
+ next if res.nil?
25
69
  $stderr.puts "o Checking #{dist} table for consistent datasets" unless o[:q]
26
- ok = true
70
+ notok = {}
27
71
  fix = {}
28
- Zlib::GzipReader.open(r.file_path(:matrix)) do |fh|
72
+ Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
73
+ lineno = 0
29
74
  fh.each_line do |ln|
30
- next if $.==1
75
+ next if (lineno+=1)==1
31
76
  r = ln.split("\t")
32
- if p.dataset(r[1]).nil? or p.dataset(r[2]).nil?
33
- fix[r[2]] = true unless p.dataset(r[2]).nil?
34
- fix[r[1]] = true unless p.dataset(r[1]).nil?
35
- ok = false
77
+ if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
78
+ [1,2].each do |i|
79
+ if p.dataset(r[i]).nil?
80
+ notok[r[i]] = true
81
+ else
82
+ fix[r[i]] = true
83
+ end
84
+ end
36
85
  end
37
86
  end
38
87
  end
39
88
 
40
89
  $stderr.puts " - Fixing #{fix.size} datasets" unless fix.empty? or o[:q]
41
90
  fix.keys.each do |d_n|
42
- $stderr.puts " > Fixing #{d_n}." if o[:v]
91
+ $stderr.puts " > Fixing #{d_n}." if o[:ld]
43
92
  p.dataset(d_n).cleanup_distances!
44
93
  end
45
94
 
46
- unless ok
47
- $stderr.puts " - Removing tables, recompute" unless o[:q]
48
- r.remove!
95
+ unless notok.empty?
96
+ unless o[:q]
97
+ $stderr.puts " - Unregistered datasets detected: "
98
+ if notok.size < 3
99
+ $stderr.puts " - #{notok.keys.join(", ")}"
100
+ else
101
+ $stderr.puts " - #{notok.keys.first} and other #{notok.size-1}"
102
+ end
103
+ $stderr.puts " - Removing tables, recompute"
104
+ end
105
+ res.remove!
49
106
  end
50
- end
107
+ end if o[:dist]
51
108
 
52
- $stderr.puts "o Looking for outdated files in results" unless o[:q]
53
- p.each_dataset do |d|
54
- d.each_result do |r_k, r|
55
- ok = true
56
- r.each_file do |_f_sym, _f_rel, f_abs|
57
- unless File.exist? f_abs
58
- ok = false
59
- break
109
+ if o[:files]
110
+ $stderr.puts "o Looking for outdated files in results" unless o[:q]
111
+ p.each_dataset do |d|
112
+ d.each_result do |r_k, r|
113
+ ok = true
114
+ r.each_file do |_f_sym, _f_rel, f_abs|
115
+ unless File.exist? f_abs
116
+ ok = false
117
+ break
118
+ end
119
+ end
120
+ unless ok
121
+ $stderr.puts " > Registering again #{d.name}:#{r_k}" if o[:ld]
122
+ d.add_result(r_k, true, force:true)
60
123
  end
61
124
  end
62
- unless ok
63
- $stderr.puts " - Registering again #{d.name}:#{r_k}" if o[:v]
64
- d.add_result(r_k, true, force:true)
125
+ end
126
+ end
127
+
128
+ if o[:ess]
129
+ $stderr.puts "o Looking for unarchived essential genes." unless o[:q]
130
+ p.each_dataset do |d|
131
+ res = d.result(:essential_genes)
132
+ next if res.nil?
133
+ dir = res.file_path(:collection)
134
+ if dir.nil?
135
+ $stderr.puts " > Removing #{d.name}:essential_genes" if o[:ld]
136
+ res.remove!
137
+ next
138
+ end
139
+ unless Dir["#{dir}/*.faa"].empty?
140
+ $stderr.puts " > Fixing #{d.name}." if o[:ld]
141
+ cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
142
+ warn cmdo unless cmdo.empty?
65
143
  end
66
144
  end
67
145
  end
68
146
 
69
- #$stderr.puts "o Looking for unarchived essential genes." unless o[:q]
70
- #p.each_dataset do |d|
71
- # TODO: Check unarchived protein files
72
- #end
147
+ if o[:mts]
148
+ $stderr.puts "o Looking for unarchived MyTaxa Scan runs." unless o[:q]
149
+ p.each_dataset do |d|
150
+ res = d.result(:mytaxa_scan)
151
+ next if res.nil?
152
+ dir = res.file_path(:regions)
153
+ fix = false
154
+ unless dir.nil?
155
+ if Dir.exist? dir
156
+ cmdo = `cd '#{dir}/..' \
157
+ && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
158
+ && rm -r '#{d.name}.reg'`.chomp
159
+ warn cmdo unless cmdo.empty?
160
+ end
161
+ fix = true
162
+ end
163
+ %w[blast mytaxain wintax gene_ids region_ids].each do |ext|
164
+ file = res.file_path(ext.to_sym)
165
+ unless file.nil?
166
+ FileUtils.rm(file) if File.exist? file
167
+ fix = true
168
+ end
169
+ end
170
+ if fix
171
+ $stderr.puts " > Fixing #{d.name}." if o[:ld]
172
+ d.add_result(:mytaxa_scan, true, force: true)
173
+ end
174
+ end
175
+ end
73
176
 
74
- #$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
75
- # TODO: Find 95%ANI clusters with entries from different species
177
+ if o[:tax]
178
+ #$stderr.puts "o Checking for taxonomy/distances consistency" unless o[:q]
179
+ # TODO: Find 95%ANI clusters with entries from different species
180
+ end
76
181
 
77
182
  $stderr.puts "Done" unless o[:q]
78
183
 
data/actions/get.rb CHANGED
@@ -30,8 +30,6 @@ OptionParser.new do |opt|
30
30
  "If set, ignores datasets that already exist."){ |v| o[:ignore_dup]=v }
31
31
  opt.on("-d", "--description STRING",
32
32
  "Description of the dataset."){ |v| o[:description]=v }
33
- opt.on("-u", "--user STRING",
34
- "Owner of the dataset."){ |v| o[:user]=v }
35
33
  opt.on("-c", "--comments STRING",
36
34
  "Comments on the dataset."){ |v| o[:comments]=v }
37
35
  opt_common(opt, o)
@@ -68,7 +66,7 @@ glob.each do |o_i|
68
66
  raise "Impossible to load project: #{o_i[:project]}" if p.nil?
69
67
 
70
68
  next if o_i[:ignore_dup] and not p.dataset(o_i[:dataset]).nil?
71
-
69
+
72
70
  $stderr.puts "Locating remote dataset." unless o_i[:q]
73
71
  rd = MiGA::RemoteDataset.new(o_i[:ids], o_i[:db], o_i[:universe])
74
72
 
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ require 'miga/remote_dataset'
7
+
8
+ o = {q:true, query:false, unlink:false,
9
+ reference: false, ignore_plasmids: false,
10
+ complete:false, chromosome:false,
11
+ scaffold:false, contig:false,}
12
+ OptionParser.new do |opt|
13
+ opt_banner(opt)
14
+ opt_object(opt, o, [:project])
15
+ opt.on("-T", "--taxon STRING",
16
+ "(Mandatory unless --reference) Name of the taxon (e.g., a species binomial)."
17
+ ){ |v| o[:taxon]=v }
18
+ opt.on("--reference",
19
+ "Download all reference genomes (ignores -T)."){ |v| o[:reference]=v }
20
+ opt.on("--ref-no-plasmids",
21
+ "If passed, ignores plasmids (only for --reference)."
22
+ ){ |v| o[:ignore_plasmids]=v }
23
+ opt.on("--complete", "Download complete genomes."){ |v| o[:complete]=v }
24
+ opt.on("--chromosome", "Download complete chromosomes."){ |v| o[:chromosome]=v }
25
+ opt.on("--scaffold", "Download genomes in scaffolds."){ |v| o[:scaffold]=v }
26
+ opt.on("--contig", "Download genomes in contigs."){ |v| o[:contig]=v }
27
+ opt.on("--all", "Download all genomes (in any status).") do
28
+ o[:complete] = true
29
+ o[:chromosome] = true
30
+ o[:scaffold] = true
31
+ o[:contig] = true
32
+ end
33
+ opt.on("-q", "--query",
34
+ "If set, the datasets are registered as queries, not reference datasets."
35
+ ){ |v| o[:query]=v }
36
+ opt.on("-u", "--unlink",
37
+ "If set, unlinks all datasets in the project missing from the download list."
38
+ ){ |v| o[:unlink]=v }
39
+ opt.on("-R", "--remote-list PATH",
40
+ "Path to an output file with the list of all datasets listed remotely."
41
+ ){ |v| o[:remote_list]=v }
42
+ opt_common(opt, o)
43
+ end.parse!
44
+
45
+ opt_require(o, project: "-P")
46
+ opt_require(o, taxon: "-T") unless o[:reference]
47
+ unless %w[reference complete chromosome scaffold contig].any?{ |i| o[i.to_sym] }
48
+ raise "No action requested. Pick at least one type of genome"
49
+ end
50
+
51
+ ##=> Main <=
52
+ $stderr.puts "Loading project." unless o[:q]
53
+ p = MiGA::Project.load(o[:project])
54
+ raise "Impossible to load project: #{o[:project]}" if p.nil?
55
+ d = []
56
+ ds = {}
57
+ downloaded = 0
58
+
59
+ def get_list(taxon, status)
60
+ url_base = "https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?"
61
+ url_param = if status==:reference
62
+ { action: "refgenomes", download: "on" }
63
+ else
64
+ { action: "download", report: "proks", group: "-- All Prokaryotes --",
65
+ subgroup: "-- All Prokaryotes --", orgn: "#{taxon}[orgn]",
66
+ status: status }
67
+ end
68
+ url = url_base + URI.encode_www_form(url_param)
69
+ response = RestClient::Request.execute(method: :get, url:url, timeout:600)
70
+ unless response.code == 200
71
+ raise "Unable to reach NCBI, error code #{response.code}."
72
+ end
73
+ response.to_s
74
+ end
75
+
76
+ # Download IDs with reference status
77
+ if o[:reference]
78
+ $stderr.puts "Downloading reference genomes" unless o[:q]
79
+ lineno = 0
80
+ get_list(nil, :reference).each_line do |ln|
81
+ next if (lineno+=1)==1
82
+ r = ln.chomp.split("\t")
83
+ next if r[3].nil? or r[3].empty?
84
+ ids = r[3].split(",")
85
+ ids += r[5].split(",") unless o[:ignore_plasmids] or r[5].empty?
86
+ n = r[2].miga_name
87
+ ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
88
+ end
89
+ end
90
+
91
+ # Download IDs with complete or chromosome status
92
+ if o[:complete] or o[:chromosome]
93
+ status = (o[:complete] and o[:chromosome] ? "50|40" : o[:complete] ? "50" : "40")
94
+ $stderr.puts "Downloading complete/chromosome genomes" unless o[:q]
95
+ lineno = 0
96
+ get_list(o[:taxon], status).each_line do |ln|
97
+ next if (lineno+=1)==1
98
+ r = ln.chomp.split("\t")
99
+ next if r[10].nil? or r[10].empty?
100
+ ids = r[10].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").split(";")
101
+ n = (r[0] + "_" + ids[0]).miga_name
102
+ ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
103
+ end
104
+ end
105
+
106
+ # Download IDs with scaffold or contig status
107
+ if o[:scaffold] or o[:contig]
108
+ status = (o[:scaffold] and o[:contig] ? "30|20" : o[:scaffold] ? "30" : "20")
109
+ $stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
110
+ lineno = 0
111
+ get_list(o[:taxon], status).each_line do |ln|
112
+ next if (lineno+=1)==1
113
+ r = ln.chomp.split("\t")
114
+ next if r[7].nil? or r[7].empty?
115
+ next if r[19].nil? or r[19].empty?
116
+ asm = r[7].gsub(/[^:;]*:/,"").gsub(/\/[^\/;]*/,"").gsub(/\s/,"")
117
+ ids = r[19].gsub(/\s/, "").split(";").map{ |i| i + "/" + File.basename(i) + "_genomic.fna.gz" }
118
+ n = (r[0] + "_" + asm).miga_name
119
+ comm = "Assembly: #{asm}"
120
+ ds[n] = {ids: ids, md: {type: :genome, comments: comm}, db: :assembly_gz, universe: :web}
121
+ end
122
+ end
123
+
124
+ # Download entries
125
+ $stderr.puts "Downloading #{ds.size} #{ds.size>1 ? "entries" : "entry"}." unless o[:q]
126
+ ds.each do |name,body|
127
+ d << name
128
+ puts name
129
+ next unless p.dataset(name).nil?
130
+ $stderr.puts " Locating remote dataset." unless o[:q]
131
+ rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
132
+ $stderr.puts " Creating dataset." unless o[:q]
133
+ rd.save_to(p, name, !o[:query], body[:md])
134
+ p.add_dataset(name)
135
+ downloaded += 1
136
+ end
137
+
138
+ # Finalize
139
+ $stderr.puts "Datasets listed: #{d.size}" unless o[:q]
140
+ $stderr.puts "Datasets downloaded: #{downloaded}" unless o[:q]
141
+ unless o[:remote_list].nil?
142
+ File.open(o[:remote_list], 'w') do |fh|
143
+ d.each { |i| fh.puts i }
144
+ end
145
+ end
146
+ if o[:unlink]
147
+ unlink = p.dataset_names - d
148
+ unlink.each { |i| p.unlink_dataset(i).remove! }
149
+ $stderr.puts "Datasets unlinked: #{unlink.size}" unless o[:q]
150
+ end
151
+
data/bin/miga CHANGED
@@ -19,6 +19,7 @@ $task_desc = {
19
19
  # Datasets
20
20
  add: "Creates an empty dataset in a pre-existing MiGA project.",
21
21
  get: "Downloads a dataset from public databases into a MiGA project.",
22
+ ncbi_get: "Downloads all genomes in a taxon or RefSeq from NCBI into a MiGA project.",
22
23
  rm: "Removes a dataset from an MiGA project.",
23
24
  find: "Finds unregistered datasets based on result files.",
24
25
  ln: "Link datasets (including results) from one project to another.",
@@ -102,7 +103,7 @@ def opt_object(opt, o, what=[:project, :dataset])
102
103
  what.include? :dataset_type_req
103
104
  opt.on("-t", "--type STRING",
104
105
  (what.include?(:project_type_req) ? "(Mandatory) " : "") +
105
- "Type of project. Recognized types include:",
106
+ "Type of project. Recognized types include:",
106
107
  *MiGA::Project.KNOWN_TYPES.map{ |k,v| "~ #{k}: #{v[:description]}"}
107
108
  ){ |v| o[:type]=v.to_sym } if what.include? :project_type or
108
109
  what.include? :project_type_req
@@ -228,4 +229,3 @@ generic options:
228
229
 
229
230
  HELP
230
231
  end
231
-
data/lib/miga/common.rb CHANGED
@@ -1,21 +1,23 @@
1
1
  # @package MiGA
2
2
  # @license Artistic-2.0
3
3
 
4
- require "miga/version"
5
- require "json"
6
- require "tempfile"
7
- require "zlib"
4
+ require 'miga/version'
5
+ require 'json'
6
+ require 'tempfile'
7
+ require 'zlib'
8
8
 
9
9
  ##
10
10
  # Generic class used to handle system-wide information and methods, and parent
11
11
  # of all other MiGA::* classes.
12
12
  class MiGA::MiGA
13
13
 
14
- ENV["MIGA_HOME"] ||= ENV["HOME"]
14
+ ENV['MIGA_HOME'] ||= ENV['HOME']
15
15
 
16
16
  ##
17
17
  # Root path to MiGA (as estimated from the location of the current file).
18
- def self.root_path ; File.expand_path("../../..", __FILE__) ; end
18
+ def self.root_path
19
+ File.expand_path('../../..', __FILE__)
20
+ end
19
21
 
20
22
  ##
21
23
  # Should debugging information be reported?
@@ -48,32 +50,32 @@ class MiGA::MiGA
48
50
 
49
51
  ##
50
52
  # Send debug message.
51
- def self.DEBUG *args
53
+ def self.DEBUG(*args)
52
54
  $stderr.puts(*args) if @@DEBUG
53
- $stderr.puts caller.map{|v| v.gsub(/^/," ")}.join("\n") if
54
- @@DEBUG_TRACE
55
+ if @@DEBUG_TRACE
56
+ $stderr.puts caller.map{ |v| v.gsub(/^/,' ') }.join("\n")
57
+ end
55
58
  end
56
59
 
57
60
  ##
58
61
  # Has MiGA been initialized?
59
62
  def self.initialized?
60
- File.exist?(File.expand_path(".miga_rc", ENV["MIGA_HOME"])) and
61
- File.exist?(File.expand_path(".miga_daemon.json", ENV["MIGA_HOME"]))
63
+ File.exist?(File.expand_path('.miga_rc', ENV['MIGA_HOME'])) and
64
+ File.exist?(File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
62
65
  end
63
66
 
64
67
  ##
65
68
  # Tabulates an +values+, and Array of Arrays, all with the same number of
66
69
  # entries as +header+. Returns an Array of String, one per line.
67
70
  def self.tabulate(header, values)
68
- fields = [header.map{ |h| h.to_s }]
69
- fields << fields.first.map{ |h| h.gsub(/\S/, "-") }
70
- fields += values.map{ |row| row.map{ |cell| cell.nil? ? "?" : cell.to_s } }
71
- clen = fields.map{ |row|
72
- row.map{ |cell| cell.length } }.transpose.map{ |col| col.max }
71
+ fields = [header.map(&:to_s)]
72
+ fields << fields.first.map{ |h| h.gsub(/\S/, '-') }
73
+ fields += values.map{ |row| row.map{ |cell| cell.nil? ? '?' : cell.to_s } }
74
+ clen = fields.map{ |row| row.map(&:length) }.transpose.map(&:max)
73
75
  fields.map do |row|
74
76
  (0 .. clen.size-1).map do |col_n|
75
77
  col_n==0 ? row[col_n].rjust(clen[col_n]) : row[col_n].ljust(clen[col_n])
76
- end.join(" ")
78
+ end.join(' ')
77
79
  end
78
80
  end
79
81
 
@@ -82,25 +84,25 @@ class MiGA::MiGA
82
84
  def self.clean_fasta_file(file)
83
85
  tmp_fh = nil
84
86
  begin
85
- if (file =~ /\.gz/)
86
- tmp_path = Tempfile.new("MiGA.gz").tap{ |i| i.close }.path
87
+ if file =~ /\.gz/
88
+ tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
87
89
  tmp_fh = Zlib::GzipWriter.open(tmp_path)
88
90
  fh = Zlib::GzipReader.open(file)
89
91
  else
90
- tmp_fh = Tempfile.new("MiGA")
92
+ tmp_fh = Tempfile.new('MiGA')
91
93
  tmp_path = tmp_fh.path
92
- fh = File.open(file, "r")
94
+ fh = File.open(file, 'r')
93
95
  end
94
- buffer = ""
96
+ buffer = ''
95
97
  fh.each_line do |ln|
96
98
  ln.chomp!
97
99
  if ln =~ /^>\s*(\S+)(.*)/
98
100
  (id, df) = [$1, $2]
99
101
  tmp_fh.print buffer.wrap_width(80)
100
- buffer = ""
102
+ buffer = ''
101
103
  tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, "_")}#{df}"
102
104
  else
103
- buffer << ln.gsub(/[^A-Za-z\.\-]/, "")
105
+ buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
104
106
  end
105
107
  end
106
108
  tmp_fh.print buffer.wrap_width(80)
@@ -124,7 +126,7 @@ class MiGA::MiGA
124
126
  # - +:n50+: If true, it also returns the N50 and the median (in bp).
125
127
  # - +gc+: If true, it also returns the G+C content (in %).
126
128
  def self.seqs_length(file, format, opts={})
127
- fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, "r")
129
+ fh = (file =~ /\.gz/) ? Zlib::GzipReader.open(file) : File.open(file, 'r')
128
130
  l = []
129
131
  gc = 0
130
132
  i = 0 # <- Zlib::GzipReader doesn't set $.
@@ -154,12 +156,11 @@ class MiGA::MiGA
154
156
  break if pos >= thr
155
157
  end
156
158
  o[:med] = o[:n].even? ?
157
- 0.5*( l[o[:n]/2-1,2].inject(:+) ) : l[(o[:n]-1)/2]
159
+ 0.5*l[o[:n]/2-1,2].inject(:+) : l[(o[:n]-1)/2]
158
160
  end
159
161
  o
160
162
  end
161
-
162
-
163
+
163
164
  ##
164
165
  # Path to a script to be executed for +task+. Supported +opts+ are:
165
166
  # - +:miga+ Path to the MiGA home to use. If not passed, the home of the
@@ -178,17 +179,15 @@ class MiGA::MiGA
178
179
  File.expand_path("scripts/#{task}.bash", opts[:miga])
179
180
  end
180
181
 
181
-
182
182
  ##
183
183
  # Check if the result files exist with +base+ name (String) followed by the
184
184
  # +ext+ values (Array of String).
185
185
  def result_files_exist?(base, ext)
186
- ext = [ext] unless ext.kind_of? Array
186
+ ext = [ext] unless ext.is_a? Array
187
187
  ext.all? do |f|
188
188
  File.exist?(base + f) or File.exist?("#{base}#{f}.gz")
189
189
  end
190
190
  end
191
-
192
191
  end
193
192
 
194
193
  ##
@@ -212,29 +211,36 @@ class File
212
211
  raise "Unknown transfer method: #{method}."
213
212
  end
214
213
  end
215
-
216
214
  end
217
215
 
218
216
  ##
219
217
  # MiGA extensions to the String class.
220
218
  class String
221
-
219
+
222
220
  ##
223
221
  # Replace any character not allowed in a MiGA name for underscore (_). This
224
222
  # results in a MiGA-compliant name EXCEPT for empty strings, that results in
225
223
  # empty strings.
226
- def miga_name ; gsub(/[^A-Za-z0-9_]/, "_") ; end
224
+ def miga_name
225
+ gsub(/[^A-Za-z0-9_]/, '_')
226
+ end
227
227
 
228
228
  ##
229
229
  # Is the string a MiGA-compliant name?
230
- def miga_name? ; not(self !~ /^[A-Za-z0-9_]+$/) ; end
230
+ def miga_name?
231
+ !(self !~ /^[A-Za-z0-9_]+$/)
232
+ end
231
233
 
232
234
  ##
233
235
  # Replace underscores by spaces or dots (depending on context).
234
- def unmiga_name ; gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr("_", " ") ; end
235
-
236
+ def unmiga_name
237
+ gsub(/_(str|sp|subsp|pv)__/,"_\\1._").tr('_', ' ')
238
+ end
239
+
236
240
  ##
237
241
  # Wraps the string with fixed Integer +width+.
238
- def wrap_width(width) ; gsub(/([^\n\r]{1,#{width}})/,"\\1\n") ; end
239
-
242
+ def wrap_width(width)
243
+ gsub(/([^\n\r]{1,#{width}})/,"\\1\n")
244
+ end
240
245
  end
246
+
data/lib/miga/daemon.rb CHANGED
@@ -191,7 +191,7 @@ class MiGA::Daemon < MiGA::MiGA
191
191
  log_dir = File.expand_path("daemon/#{job}", project.path)
192
192
  Dir.mkdir(log_dir) unless Dir.exist? log_dir
193
193
  task_name = "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}"
194
- to_run = {ds: ds, job: job, task_name: task_name,
194
+ to_run = {ds: ds, ds_name: ds_name, job: job, task_name: task_name,
195
195
  cmd: sprintf(runopts(:cmd),
196
196
  # 1: script
197
197
  MiGA::MiGA.script_path(job, miga:vars['MIGA'], project:project),
@@ -42,7 +42,9 @@ module MiGA::Dataset::Result
42
42
  dir = @@RESULT_DIRS[result_type]
43
43
  return nil if dir.nil?
44
44
  base = File.expand_path("data/#{dir}/#{name}", project.path)
45
- unless opts[:force]
45
+ if opts[:force]
46
+ FileUtils.rm("#{base}.json") if File.exist?("#{base}.json")
47
+ else
46
48
  r_pre = MiGA::Result.load("#{base}.json")
47
49
  return r_pre if (r_pre.nil? and not save) or not r_pre.nil?
48
50
  end
@@ -204,7 +206,7 @@ module MiGA::Dataset::Result
204
206
  ##
205
207
  # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
206
208
  def add_result_cds(base, opts)
207
- return nil unless result_files_exist?(base, %w[.faa .fna])
209
+ return nil unless result_files_exist?(base, %w[.faa])
208
210
  r = MiGA::Result.new("#{base}.json")
209
211
  r = add_files_to_ds_result(r, name, proteins:".faa", genes:".fna",
210
212
  gff2:".gff2", gff3:".gff3", tab:".tab")
@@ -266,13 +268,15 @@ module MiGA::Dataset::Result
266
268
  def add_result_mytaxa_scan(base, _opts)
267
269
  if is_nonmulti?
268
270
  return nil unless
269
- result_files_exist?(base, %w[.pdf .wintax .mytaxa .reg]) or
270
- result_files_exist?(base, ".nomytaxa.txt")
271
+ result_files_exist?(base, %w[.pdf .mytaxa]) or
272
+ result_files_exist?(base, '.nomytaxa.txt')
271
273
  r = MiGA::Result.new("#{base}.json")
272
- add_files_to_ds_result(r, name, mytaxa:".mytaxa", wintax:".wintax",
273
- blast:".blast", mytaxain:".mytaxain", report:".pdf", regions:".reg",
274
- gene_ids:".wintax.genes", region_ids:".wintax.regions",
275
- nomytaxa:".nomytaxa.txt")
274
+ add_files_to_ds_result(r, name, nomytaxa: '.nomytaxa.txt',
275
+ mytaxa: '.mytaxa', report: '.pdf', regions_archive: '.reg.tar',
276
+ # Intermediate / Deprecated
277
+ blast: '.blast', mytaxain: '.mytaxain', wintax: '.wintax',
278
+ gene_ids: '.wintax.genes', region_ids: '.wintax.regions',
279
+ regions: '.reg')
276
280
  else
277
281
  MiGA::Result.new("#{base}.json")
278
282
  end
data/lib/miga/result.rb CHANGED
@@ -21,10 +21,6 @@ class MiGA::Result < MiGA::MiGA
21
21
 
22
22
  # Instance-level
23
23
 
24
- ##
25
- # Path to the JSON file describing the result.
26
- attr_reader :path
27
-
28
24
  ##
29
25
  # Hash with the result metadata.
30
26
  attr_reader :data
@@ -48,6 +44,22 @@ class MiGA::Result < MiGA::MiGA
48
44
  # Register the result as cleaned.
49
45
  def clean! ; self[:clean] = true ; end
50
46
 
47
+ ##
48
+ # Path to the standard files of the result. +which+ must be one of:
49
+ # - :json (default) : JSON file describing the result.
50
+ # - :start : File with the date when the processing started.
51
+ # - :done : File with the date when the processing ended.
52
+ def path(which=:json)
53
+ case which.to_sym
54
+ when :json
55
+ @path
56
+ when :start
57
+ @path.sub(/\.json$/, ".start")
58
+ when :done
59
+ @path.sub(/\.json$/, ".done")
60
+ end
61
+ end
62
+
51
63
  ##
52
64
  # Directory containing the result.
53
65
  def dir
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 1, 5]
13
+ VERSION = [0.3, 1, 6]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -37,7 +37,10 @@ else
37
37
  fi
38
38
 
39
39
  # Reduce files
40
- ( cd "${DATASET}.ess" && tar -zcf proteins.tar.gz *.faa && rm *.faa )
40
+ ( cd "${DATASET}.ess" \
41
+ && exists *.faa \
42
+ && tar -zcf proteins.tar.gz *.faa \
43
+ && rm *.faa )
41
44
 
42
45
  # Finalize
43
46
  miga date > "$DATASET.done"
@@ -75,14 +75,17 @@ else
75
75
  FastA.filter.pl -q "$DATASET.reg/$i.ids" \
76
76
  "../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
77
77
  done
78
+ # Archive regions
79
+ tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
80
+ rm -r "$DATASET.reg"
78
81
  fi
79
82
 
80
83
  # Clean
81
- [[ -e "$DATASET.daa" ]] && rm "$DATASET.daa"
82
- [[ -s "$DATASET.blast" && ! -s "$DATASET.blast.gz" ]] \
83
- && gzip -9 -f "$DATASET.blast"
84
- [[ -s "$DATASET.mytaxain" && ! -s "$DATASET.mytaxain.gz" ]] \
85
- && gzip -9 -f "$DATASET.mytaxain"
84
+ for x in daa blast mytaxain wintax wintax.genes wintax.regions ; do
85
+ [[ -e "$DATASET.$x" ]] && rm "$DATASET.$x"
86
+ done
87
+ [[ -s "$DATASET.mytaxa" && ! -s "$DATASET.mytaxa.gz" ]] \
88
+ && gzip -9 -f "$DATASET.mytaxa"
86
89
  fi
87
90
 
88
91
  fi
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1.5
4
+ version: 0.3.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-31 00:00:00.000000000 Z
11
+ date: 2018-01-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rest-client
@@ -130,6 +130,7 @@ files:
130
130
  - actions/init.rb
131
131
  - actions/ln.rb
132
132
  - actions/ls.rb
133
+ - actions/ncbi_get.rb
133
134
  - actions/new.rb
134
135
  - actions/plugins.rb
135
136
  - actions/rm.rb
@@ -196,9 +197,6 @@ files:
196
197
  - utils/adapters.fa
197
198
  - utils/arch-ess-genes.rb
198
199
  - utils/core-pan-plot.R
199
- - utils/distances.rb
200
- - utils/distances/functions.rb
201
- - utils/distances/ref-nomulti.rb
202
200
  - utils/enveomics/Examples/aai-matrix.bash
203
201
  - utils/enveomics/Examples/ani-matrix.bash
204
202
  - utils/enveomics/Examples/essential-phylogeny.bash
@@ -1,58 +0,0 @@
1
-
2
- require 'sqlite3'
3
-
4
- $opts = {}
5
- if ENV["MIGA_AAI_SAVE_RBM"].nil?
6
- $opts[:aai_save_rbm] = $project.is_clade? ? "save-rbm" : "no-save-rbm"
7
- else
8
- $opts[:aai_save_rbm] = ENV["MIGA_AAI_SAVE_RBM"]
9
- end
10
- $opts[:thr] = ENV["CORES"].nil? ? 2 : ENV["CORES"].to_i
11
-
12
- def ani(f1, f2, db, opts={})
13
- opts = $opts.merge(opts)
14
- v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
15
- -t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm --lookup-first`
16
- v.nil? or v.empty? ? 0 : v.to_f
17
- end
18
-
19
- def make_empty_aai_db(db)
20
- SQLite3::Database.new(db) do |conn|
21
- conn.execute "create table if not exists aai(" +
22
- "seq1 varchar(256), seq2 varchar(256), " +
23
- "aai float, sd float, n int omega int" +
24
- ")"
25
- end unless File.size?(db)
26
- end
27
-
28
- def aai(f1, f2, db, opts={})
29
- opts = $opts.merge(opts)
30
- v = `aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" --name1 "#{ds_name f1}" --name2 "#{ds_name f2}" \
31
- -t "#{opts[:thr]}" -a --lookup-first "--#{opts[:aai_save_rbm]}"`.chomp
32
- v.nil? or v.empty? ? 0 : v.to_f
33
- end
34
-
35
- def haai(f1, f2, db, aai_db, opts={})
36
- opts = $opts.merge(opts)
37
- haai = aai(f1, f2, db, aai_save_rbm: "no-save-rbm")
38
- return 0 if haai.nil? or haai == 0 or haai > 90.0
39
- aai = 100.0 - Math.exp(2.435076 + 0.4275193*Math.log(100.0-haai))
40
- make_empty_aai_db(aai_db)
41
- SQLite3::Database.new(db) do |conn|
42
- conn.execute "insert into aai values(?, ?, ?, 0, 0, 0)",
43
- [ds_name(f1), ds_name(f2), aai]
44
- end
45
- aai
46
- end
47
-
48
- def haai_or_aai(f1_h, f2_h, db_h, f1, f2, db, opts={})
49
- haai=haai(f1_h, f2_h, db_h, db, opts)
50
- aai = aai(f1, f2, db, opts) if aai.nil? or aai.zero?
51
- aai
52
- end
53
-
54
- def val_from_db(n1, n2, db, metric)
55
- SQLite3::Database.new(db) do |conn|
56
- return conn.execute("select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]).first.first
57
- end if File.size? db
58
- end
@@ -1,2 +0,0 @@
1
-
2
- require_relative 'functions.rb'
data/utils/distances.rb DELETED
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'miga'
4
-
5
- $project = MiGA::Project.load(ARGV.shift)
6
- $dataset = $project.dataset(ARGV.shift)
7
- opts = Hash[ARGV]
8
-
9
- exit(0) if dataset.is_multi?
10
-
11
- if dataset.is_ref?
12
- require_relative 'distances/ref-nomulti.rb'
13
- else
14
- require_relative 'distances/noref-nomulti.rb'
15
- end
16
- # TODO run_distances!!!