miga-base 0.2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
@@ -0,0 +1,178 @@
1
+ #
2
+ # @package MiGA
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license artistic license 2.0
5
+ # @update Nov-12-2015
6
+ #
7
+
8
+ require "miga/project"
9
+ require "daemons"
10
+ require "date"
11
+
12
+ module MiGA
13
+ class Daemon
14
+ def self.last_alive(p)
15
+ f = File.expand_path("daemon/alive", p.path)
16
+ return nil unless File.size? f
17
+ DateTime.parse(File.read(f))
18
+ end
19
+
20
+ attr_reader :project, :options, :jobs_to_run, :jobs_running
21
+ def initialize(p)
22
+ @project = p
23
+ @runopts = JSON.parse(
24
+ File.read(File.expand_path("daemon/daemon.json", project.path)),
25
+ {:symbolize_names=>true})
26
+ @jobs_to_run = []
27
+ @jobs_running = []
28
+ end
29
+ def last_alive
30
+ Daemon.last_alive project
31
+ end
32
+ def default_options
33
+ { dir_mode: :normal, dir: File.expand_path("daemon", project.path),
34
+ multiple: false, log_output: true }
35
+ end
36
+ def runopts(k, v=nil)
37
+ k = k.to_sym
38
+ unless v.nil?
39
+ v = v.to_i if [:latency, :maxjobs, :ppn].include? k
40
+ raise "Daemon's #{k} cannot be set to zero." if
41
+ v.is_a? Integer and v==0
42
+ @runopts[k] = v
43
+ end
44
+ @runopts[k]
45
+ end
46
+ def latency() runopts(:latency) ; end
47
+ def maxjobs() runopts(:maxjobs) ; end
48
+ def ppn() runopts(:ppn) ; end
49
+ def start() daemon("start") ; end
50
+ def stop() daemon("stop") ; end
51
+ def restart() daemon("restart") ; end
52
+ def status() daemon("status") ; end
53
+ def daemon(task, opts=[])
54
+ options = default_options
55
+ opts.unshift(task)
56
+ options[:ARGV] = opts
57
+ Daemons.run_proc("MiGA:#{project.metadata[:name]}", options) do
58
+ p = project
59
+ say "-----------------------------------"
60
+ say "MiGA:#{p.metadata[:name]} launched."
61
+ say "-----------------------------------"
62
+ loop_i = 0
63
+ loop do
64
+ # Tell the world you're alive
65
+ f = File.open(File.expand_path("daemon/alive", project.path),"w")
66
+ f.print Time.now.to_s
67
+ f.close
68
+ loop_i += 1
69
+ # Traverse datasets
70
+ p.datasets.each do |ds|
71
+ # Inspect preprocessing
72
+ to_run = ds.next_preprocessing
73
+ # Launch task
74
+ queue_job(to_run, ds) unless to_run.nil?
75
+ end
76
+
77
+ # Check if all the reference datasets are pre-processed.
78
+ # If yes, check the project-level tasks
79
+ if p.done_preprocessing?
80
+ to_run = p.next_distances
81
+ to_run = p.next_inclade if to_run.nil?
82
+ # Launch task
83
+ queue_job(to_run) unless to_run.nil?
84
+ end
85
+
86
+ # Run jobs
87
+ flush!
88
+
89
+ # Every 12 loops:
90
+ if loop_i==12
91
+ say "Housekeeping for sanity"
92
+ loop_i = 0
93
+ # Check if running jobs are alive
94
+ purge!
95
+ # Reload project metadata (to add newly created datasets)
96
+ project.load
97
+ end
98
+ sleep(latency)
99
+ end
100
+ end
101
+ end
102
+ def queue_job(job, ds=nil)
103
+ return nil unless get_job(job, ds).nil?
104
+ ds_name = (ds.nil? ? "miga-project" : ds.name)
105
+ say "Queueing ", ds_name, ":#{job}"
106
+ type = runopts(:type)
107
+ vars = {
108
+ "PROJECT"=>project.path, "RUNTYPE"=>runopts(:type), "CORES"=>ppn,
109
+ "MIGA"=>File.expand_path("../..", File.dirname(__FILE__)) }
110
+ vars["DATASET"] = ds.name unless ds.nil?
111
+ log_dir = File.expand_path("daemon/#{job}", project.path)
112
+ Dir.mkdir log_dir unless Dir.exist? log_dir
113
+ to_run = {ds: ds, job: job, cmd: sprintf(runopts(:cmd),
114
+ # 1: script
115
+ vars["MIGA"] + "/scripts/#{job.to_s}.bash",
116
+ # 2: vars
117
+ vars.keys.map{|k| sprintf(runopts(:var),k,vars[k])
118
+ }.join(runopts(:varsep)),
119
+ # 3: CPUs
120
+ ppn,
121
+ # 4: log file
122
+ File.expand_path("#{ds_name}.log", log_dir),
123
+ # 5: task name
124
+ "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}")}
125
+ @jobs_to_run << to_run
126
+ end
127
+ def get_job(job, ds=nil)
128
+ if ds==nil
129
+ (@jobs_to_run + @jobs_running).select do |j|
130
+ (j[:ds].nil?) and (j[:job]==job)
131
+ end.first
132
+ else
133
+ (@jobs_to_run + @jobs_running).select do |j|
134
+ (not j[:ds].nil?) and (j[:ds].name==ds.name) and (j[:job]==job)
135
+ end.first
136
+ end
137
+ end
138
+ def flush!
139
+ # Check for finished jobs
140
+ self.jobs_running.select! do |job|
141
+ r = job[:ds].nil? ?
142
+ self.project.add_result(job[:job]) :
143
+ job[:ds].add_result(job[:job])
144
+ say "Completed pid:#{job[:pid]} for " +
145
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}" unless
146
+ r.nil?
147
+ r.nil?
148
+ end
149
+
150
+ # Avoid single datasets hogging resources
151
+ @jobs_to_run.rotate! rand(@jobs_to_run.size)
152
+
153
+ # Launch as many @jobs_to_run as possible
154
+ while jobs_running.size < maxjobs
155
+ break if jobs_to_run.empty?
156
+ job = self.jobs_to_run.shift
157
+ if runopts(:type) == "bash"
158
+ job[:pid] = spawn job[:cmd]
159
+ Process.detach job[:pid]
160
+ else
161
+ job[:pid] = `#{job[:cmd]}`.gsub(/[\n\r]/,"")
162
+ end
163
+ @jobs_running << job
164
+ say "Spawned pid:#{job[:pid]} for " +
165
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}"
166
+ end
167
+ end
168
+ def purge!
169
+ self.jobs_running.select! do |job|
170
+ `#{sprintf(runopts(:alive), job[:pid])}`.chomp.to_i == 1
171
+ end
172
+ end
173
+ def say(*opts)
174
+ print "[#{Time.new.inspect}] ", *opts, "\n"
175
+ end
176
+ end
177
+ end
178
+
@@ -0,0 +1,286 @@
#
# @package MiGA
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
# @license artistic license 2.0
# @update Jan-18-2016
#

require "miga/metadata"
require "miga/project"
require "miga/result"

module MiGA
  # A dataset inside a MiGA project. Wraps the dataset's metadata and the
  # per-task results stored under the project's data directory.
  class Dataset
    # Class
    # Result folder (relative to <project>/data) for each supported task.
    @@RESULT_DIRS = {
      # Preprocessing
      raw_reads: "01.raw_reads", trimmed_reads: "02.trimmed_reads",
      read_quality: "03.read_quality", trimmed_fasta: "04.trimmed_fasta",
      assembly: "05.assembly", cds: "06.cds",
      # Annotation
      essential_genes: "07.annotation/01.function/01.essential",
      ssu: "07.annotation/01.function/02.ssu",
      mytaxa: "07.annotation/02.taxonomy/01.mytaxa",
      mytaxa_scan: "07.annotation/03.qa/02.mytaxa_scan",
      # Mapping
      mapping_on_contigs: "08.mapping/01.read-ctg",
      mapping_on_genes: "08.mapping/02.read-gene",
      # Distances (for single-species datasets)
      distances: "09.distances"
    }
    # Supported dataset types; :multi marks multi-organism datasets.
    @@KNOWN_TYPES = {
      genome: {description: "The genome from an isolate.", multi: false},
      metagenome: {description: "A metagenome (excluding viromes).",
        multi: true},
      virome: {description: "A viral metagenome.", multi: true},
      scgenome: {description: "A genome from a single cell.", multi: false},
      popgenome: {description: "The genome of a population (including " +
        "microdiversity).", multi: false}
    }
    # Preprocessing tasks, in execution order.
    @@PREPROCESSING_TASKS = [:raw_reads, :trimmed_reads, :read_quality,
      :trimmed_fasta, :assembly, :cds, :essential_genes, :ssu, :mytaxa,
      :mytaxa_scan, :distances]
    # Tasks skipped for non-reference datasets.
    @@EXCLUDE_NOREF_TASKS = [:essential_genes, :mytaxa_scan]
    # Tasks executed only for single-organism (non-multi) datasets.
    @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances]
    # Tasks executed only for multi-organism datasets.
    @@ONLY_MULTI_TASKS = [:mytaxa]

    def self.PREPROCESSING_TASKS ; @@PREPROCESSING_TASKS ; end
    def self.RESULT_DIRS ; @@RESULT_DIRS end
    def self.KNOWN_TYPES ; @@KNOWN_TYPES end

    # True if a dataset named +name+ already exists in +project+.
    def self.exist?(project, name)
      File.exist? project.path + "/metadata/" + name + ".json"
    end

    # Metadata fields reported by #info, in order.
    def self.INFO_FIELDS
      %w(name created updated type ref user description comments)
    end

    # Instance
    attr_reader :project, :name, :metadata

    # Loads (or registers) dataset +name+ in +project+. +is_ref+ marks it
    # as a reference dataset; extra +metadata+ is stored as given.
    # NOTE(review): aborts the whole process on an invalid name.
    def initialize(project, name, is_ref=true, metadata={})
      abort "Invalid name '#{name}', please use only alphanumerics and " +
        "underscores." unless name.miga_name?
      @project = project
      @name = name
      metadata[:ref] = is_ref
      @metadata = Metadata.new(project.path + "/metadata/" + name + ".json",
        metadata)
    end

    # Persists the dataset's metadata. Datasets whose taxonomy namespace is
    # COMMUNITY are forced to type :metagenome before saving.
    def save
      self.metadata[:type] = :metagenome if !metadata[:tax].nil? and
        !metadata[:tax][:ns].nil? and
        metadata[:tax][:ns]=="COMMUNITY"
      self.metadata.save
      self.load
    end

    # Reloads the dataset (currently a no-op placeholder).
    def load
      # Nothing here...
    end

    # Removes all stored results and the metadata file from disk.
    def remove!
      self.results.each{ |r| r.remove! }
      self.metadata.remove!
    end

    # Values of Dataset.INFO_FIELDS for this dataset, in the same order.
    def info
      Dataset.INFO_FIELDS.map do |k|
        (k=="name") ? self.name : self.metadata[k.to_sym]
      end
    end

    # True for reference datasets.
    def is_ref? ; !!self.metadata[:ref] ; end

    # True when the (known) type is multi-organism; false when type is
    # unset.
    def is_multi?
      return false if self.metadata[:type].nil?
      @@KNOWN_TYPES[self.metadata[:type]][:multi]
    end

    # True when the (known) type is single-organism; false when type is
    # unset.
    def is_nonmulti?
      return false if self.metadata[:type].nil?
      !@@KNOWN_TYPES[self.metadata[:type]][:multi]
    end

    # Loads the stored result for task +k+, or nil when the task is unknown
    # or the result JSON doesn't exist.
    def result(k)
      return nil if @@RESULT_DIRS[k.to_sym].nil?
      Result.load(project.path + "/data/" + @@RESULT_DIRS[k.to_sym] +
        "/" + name + ".json")
    end

    # All stored results.
    def results ; @@RESULT_DIRS.keys.map{ |k| self.result k }.compact ; end

    # Yields each stored (task, result) pair to +blk+.
    def each_result(&blk)
      @@RESULT_DIRS.keys.each do |k|
        v = self.result k
        blk.call(k,v) unless v.nil?
      end
    end

    # Registers the result of +result_type+ if its output files exist on
    # disk. Returns the saved Result, or nil when the task is unknown, not
    # flagged .done, or its expected files are missing.
    def add_result(result_type)
      return nil if @@RESULT_DIRS[result_type].nil?
      base = project.path + "/data/" + @@RESULT_DIRS[result_type] +
        "/" + name
      return nil unless File.exist? base + ".done"
      r = nil
      case result_type
      when :raw_reads
        return nil unless
          File.exist? base + ".1.fastq" or
          File.exist? base + ".1.fastq.gz"
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.fastq.gz")
        if File.exist? base + ".2.fastq" + (r.data[:gz] ? ".gz" : "")
          r.add_file :pair1, name + ".1.fastq"
          r.add_file :pair2, name + ".2.fastq"
        else
          r.add_file :single, name + ".1.fastq"
        end
      when :trimmed_reads
        return nil unless
          File.exist?(base + ".1.clipped.fastq") or
          File.exist?(base + ".1.clipped.fastq.gz")
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.clipped.fastq.gz")
        if File.exist? base + ".2.clipped.fastq" + (r.data[:gz] ? ".gz":"")
          r.add_file :pair1, name + ".1.clipped.fastq"
          r.add_file :pair2, name + ".2.clipped.fastq"
        end
        r.add_file :single, name + ".1.clipped.single.fastq"
        add_result :raw_reads #-> Post gunzip (if any)
      when :read_quality
        return nil unless
          Dir.exist?(base + ".solexaqa") and
          Dir.exist?(base + ".fastqc")
        r = Result.new base + ".json"
        r.add_file :solexaqa, self.name + ".solexaqa"
        r.add_file :fastqc, self.name + ".fastqc"
        add_result :trimmed_reads #-> Post cleaning
      when :trimmed_fasta
        return nil unless
          File.exist?(base + ".CoupledReads.fa") or
          File.exist?(base + ".SingleReads.fa")
        r = Result.new base + ".json"
        if File.exist?(base + ".CoupledReads.fa")
          r.add_file :coupled, name + ".CoupledReads.fa"
          r.add_file :pair1, name + ".1.fa"
          r.add_file :pair2, name + ".2.fa"
        end
        r.add_file :single, name + ".SingleReads.fa"
        add_result :raw_reads #-> Post gzip
      when :assembly
        return nil unless
          File.exist?(base + ".LargeContigs.fna")
        r = Result.new base + ".json"
        r.add_file :largecontigs, name + ".LargeContigs.fna"
        r.add_file :allcontigs, name + ".AllContigs.fna"
      when :cds
        return nil unless
          File.exist?(base + ".faa") and
          File.exist?(base + ".fna")
        r = Result.new base + ".json"
        r.add_file :proteins, name + ".faa"
        r.add_file :genes, name + ".fna"
        %w(gff2 gff3 tab).each do |ext|
          r.add_file ext, "#{name}.#{ext}"
        end
      when :essential_genes
        return nil unless
          File.exist?(base + ".ess.faa") and
          Dir.exist?(base + ".ess") and
          File.exist?(base + ".ess/log")
        r = Result.new base + ".json"
        r.add_file :ess_genes, name + ".ess.faa"
        r.add_file :collection, name + ".ess"
        r.add_file :report, name + ".ess/log"
      when :ssu
        # Without an assembly the task is vacuously complete (empty result).
        if result(:assembly).nil?
          r = Result.new base + ".json"
        else
          return nil unless
            File.exist?(base + ".ssu.fa") or
            File.exist?(base + ".ssu.fa.gz")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".ssu.fa.gz")
          r.add_file :longest_ssu_gene, name + ".ssu.fa"
          r.add_file :gff, name + ".ssu.gff"
          r.add_file :all_ssu_genes, name + ".ssu.all.fa"
        end
      when :mytaxa
        # Only meaningful for multi-organism datasets; otherwise an empty
        # result marks the task as done.
        if is_multi?
          return nil unless File.exist?(base + ".mytaxa")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".mytaxain.gz")
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :mytaxa_scan
        # Only meaningful for single-organism datasets; otherwise an empty
        # result marks the task as done.
        if is_nonmulti?
          # File.exist? (not the deprecated File.exists?, removed in
          # Ruby 3.2), consistent with the rest of this method.
          return nil unless
            File.exist?(base + ".pdf") and
            File.exist?(base + ".wintax") and
            File.exist?(base + ".mytaxa") and
            Dir.exist?(base + ".reg")
          r = Result.new base + ".json"
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :wintax, name + ".wintax"
          r.add_file :report, name + ".pdf"
          r.add_file :regions, name + ".reg"
          r.add_file :gene_ids, name + ".wintax.genes"
          r.add_file :region_ids, name + ".wintax.regions"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :distances
        if is_nonmulti?
          pref = project.path + "/data/" + @@RESULT_DIRS[result_type]
          # Reference datasets are indexed by hAAI, non-reference by AAI.
          if is_ref?
            return nil unless
              File.exist?(pref + "/01.haai/" + name + ".db")
          else
            return nil unless
              File.exist?(pref + "/02.aai/" + name + ".db")
          end
          r = Result.new base + ".json"
          r.add_file :haai_db, "01.haai/" + name + ".db"
          r.add_file :aai_db, "02.aai/" + name + ".db"
          r.add_file :ani_db, "03.ani/" + name + ".db"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      end
      r.save
      r
    end # def add_result

    # First task whose result can be registered, or nil when none can.
    def first_preprocessing
      @@PREPROCESSING_TASKS.find{ |t| not self.add_result(t).nil? }
    end

    # Next applicable task (after the first completed one) that still lacks
    # a result, or nil when preprocessing hasn't started or is complete.
    def next_preprocessing
      after_first = false
      first = self.first_preprocessing
      return nil if first.nil?
      @@PREPROCESSING_TASKS.each do |t|
        next if @@EXCLUDE_NOREF_TASKS.include?(t) and not is_ref?
        next if @@ONLY_MULTI_TASKS.include?(t) and not is_multi?
        next if @@ONLY_NONMULTI_TASKS.include?(t) and not is_nonmulti?
        return t if after_first and add_result(t).nil?
        after_first = (after_first or (t==first))
      end
      nil
    end

    # True when preprocessing has started and no applicable task is
    # pending.
    def done_preprocessing?
      !first_preprocessing.nil? and next_preprocessing.nil?
    end

    # Per-task progress profile: one entry per preprocessing task, where
    # 0 = not started, 1 = done, 2 = pending (from the next task onwards).
    def profile_advance
      if first_preprocessing.nil?
        adv = Array.new(@@PREPROCESSING_TASKS.size, 0)
      else
        adv = []
        state = 0
        first_task = first_preprocessing
        next_task = next_preprocessing
        @@PREPROCESSING_TASKS.each do |task|
          state = 1 if first_task==task
          state = 2 if !next_task.nil? and next_task==task
          adv << state
        end
      end
      adv
    end
  end # class Dataset
end # module MiGA
+