miga-base 0.2.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52)
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
@@ -0,0 +1,178 @@
1
+ #
2
+ # @package MiGA
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license artistic license 2.0
5
+ # @update Nov-12-2015
6
+ #
7
+
8
+ require "miga/project"
9
+ require "daemons"
10
+ require "date"
11
+
12
+ module MiGA
13
+ class Daemon
14
+ def self.last_alive(p)
15
+ f = File.expand_path("daemon/alive", p.path)
16
+ return nil unless File.size? f
17
+ DateTime.parse(File.read(f))
18
+ end
19
+
20
+ attr_reader :project, :options, :jobs_to_run, :jobs_running
21
+ def initialize(p)
22
+ @project = p
23
+ @runopts = JSON.parse(
24
+ File.read(File.expand_path("daemon/daemon.json", project.path)),
25
+ {:symbolize_names=>true})
26
+ @jobs_to_run = []
27
+ @jobs_running = []
28
+ end
29
+ def last_alive
30
+ Daemon.last_alive project
31
+ end
32
+ def default_options
33
+ { dir_mode: :normal, dir: File.expand_path("daemon", project.path),
34
+ multiple: false, log_output: true }
35
+ end
36
+ def runopts(k, v=nil)
37
+ k = k.to_sym
38
+ unless v.nil?
39
+ v = v.to_i if [:latency, :maxjobs, :ppn].include? k
40
+ raise "Daemon's #{k} cannot be set to zero." if
41
+ v.is_a? Integer and v==0
42
+ @runopts[k] = v
43
+ end
44
+ @runopts[k]
45
+ end
46
+ def latency() runopts(:latency) ; end
47
+ def maxjobs() runopts(:maxjobs) ; end
48
+ def ppn() runopts(:ppn) ; end
49
+ def start() daemon("start") ; end
50
+ def stop() daemon("stop") ; end
51
+ def restart() daemon("restart") ; end
52
+ def status() daemon("status") ; end
53
+ def daemon(task, opts=[])
54
+ options = default_options
55
+ opts.unshift(task)
56
+ options[:ARGV] = opts
57
+ Daemons.run_proc("MiGA:#{project.metadata[:name]}", options) do
58
+ p = project
59
+ say "-----------------------------------"
60
+ say "MiGA:#{p.metadata[:name]} launched."
61
+ say "-----------------------------------"
62
+ loop_i = 0
63
+ loop do
64
+ # Tell the world you're alive
65
+ f = File.open(File.expand_path("daemon/alive", project.path),"w")
66
+ f.print Time.now.to_s
67
+ f.close
68
+ loop_i += 1
69
+ # Traverse datasets
70
+ p.datasets.each do |ds|
71
+ # Inspect preprocessing
72
+ to_run = ds.next_preprocessing
73
+ # Launch task
74
+ queue_job(to_run, ds) unless to_run.nil?
75
+ end
76
+
77
+ # Check if all the reference datasets are pre-processed.
78
+ # If yes, check the project-level tasks
79
+ if p.done_preprocessing?
80
+ to_run = p.next_distances
81
+ to_run = p.next_inclade if to_run.nil?
82
+ # Launch task
83
+ queue_job(to_run) unless to_run.nil?
84
+ end
85
+
86
+ # Run jobs
87
+ flush!
88
+
89
+ # Every 12 loops:
90
+ if loop_i==12
91
+ say "Housekeeping for sanity"
92
+ loop_i = 0
93
+ # Check if running jobs are alive
94
+ purge!
95
+ # Reload project metadata (to add newly created datasets)
96
+ project.load
97
+ end
98
+ sleep(latency)
99
+ end
100
+ end
101
+ end
102
+ def queue_job(job, ds=nil)
103
+ return nil unless get_job(job, ds).nil?
104
+ ds_name = (ds.nil? ? "miga-project" : ds.name)
105
+ say "Queueing ", ds_name, ":#{job}"
106
+ type = runopts(:type)
107
+ vars = {
108
+ "PROJECT"=>project.path, "RUNTYPE"=>runopts(:type), "CORES"=>ppn,
109
+ "MIGA"=>File.expand_path("../..", File.dirname(__FILE__)) }
110
+ vars["DATASET"] = ds.name unless ds.nil?
111
+ log_dir = File.expand_path("daemon/#{job}", project.path)
112
+ Dir.mkdir log_dir unless Dir.exist? log_dir
113
+ to_run = {ds: ds, job: job, cmd: sprintf(runopts(:cmd),
114
+ # 1: script
115
+ vars["MIGA"] + "/scripts/#{job.to_s}.bash",
116
+ # 2: vars
117
+ vars.keys.map{|k| sprintf(runopts(:var),k,vars[k])
118
+ }.join(runopts(:varsep)),
119
+ # 3: CPUs
120
+ ppn,
121
+ # 4: log file
122
+ File.expand_path("#{ds_name}.log", log_dir),
123
+ # 5: task name
124
+ "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}")}
125
+ @jobs_to_run << to_run
126
+ end
127
+ def get_job(job, ds=nil)
128
+ if ds==nil
129
+ (@jobs_to_run + @jobs_running).select do |j|
130
+ (j[:ds].nil?) and (j[:job]==job)
131
+ end.first
132
+ else
133
+ (@jobs_to_run + @jobs_running).select do |j|
134
+ (not j[:ds].nil?) and (j[:ds].name==ds.name) and (j[:job]==job)
135
+ end.first
136
+ end
137
+ end
138
+ def flush!
139
+ # Check for finished jobs
140
+ self.jobs_running.select! do |job|
141
+ r = job[:ds].nil? ?
142
+ self.project.add_result(job[:job]) :
143
+ job[:ds].add_result(job[:job])
144
+ say "Completed pid:#{job[:pid]} for " +
145
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}" unless
146
+ r.nil?
147
+ r.nil?
148
+ end
149
+
150
+ # Avoid single datasets hogging resources
151
+ @jobs_to_run.rotate! rand(@jobs_to_run.size)
152
+
153
+ # Launch as many @jobs_to_run as possible
154
+ while jobs_running.size < maxjobs
155
+ break if jobs_to_run.empty?
156
+ job = self.jobs_to_run.shift
157
+ if runopts(:type) == "bash"
158
+ job[:pid] = spawn job[:cmd]
159
+ Process.detach job[:pid]
160
+ else
161
+ job[:pid] = `#{job[:cmd]}`.gsub(/[\n\r]/,"")
162
+ end
163
+ @jobs_running << job
164
+ say "Spawned pid:#{job[:pid]} for " +
165
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}"
166
+ end
167
+ end
168
+ def purge!
169
+ self.jobs_running.select! do |job|
170
+ `#{sprintf(runopts(:alive), job[:pid])}`.chomp.to_i == 1
171
+ end
172
+ end
173
+ def say(*opts)
174
+ print "[#{Time.new.inspect}] ", *opts, "\n"
175
+ end
176
+ end
177
+ end
178
+
@@ -0,0 +1,286 @@
1
+ #
2
+ # @package MiGA
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license artistic license 2.0
5
+ # @update Jan-18-2016
6
+ #
7
+
8
+ require "miga/metadata"
9
+ require "miga/project"
10
+ require "miga/result"
11
+
12
module MiGA
  # A dataset inside a MiGA project: wraps the dataset metadata and the
  # results of each (pre-)processing step.
  class Dataset
    # Class-level configuration.
    # Relative paths (under <project>/data) where each result type lives.
    @@RESULT_DIRS = {
      # Preprocessing
      raw_reads: "01.raw_reads", trimmed_reads: "02.trimmed_reads",
      read_quality: "03.read_quality", trimmed_fasta: "04.trimmed_fasta",
      assembly: "05.assembly", cds: "06.cds",
      # Annotation
      essential_genes: "07.annotation/01.function/01.essential",
      ssu: "07.annotation/01.function/02.ssu",
      mytaxa: "07.annotation/02.taxonomy/01.mytaxa",
      mytaxa_scan: "07.annotation/03.qa/02.mytaxa_scan",
      # Mapping
      mapping_on_contigs: "08.mapping/01.read-ctg",
      mapping_on_genes: "08.mapping/02.read-gene",
      # Distances (for single-species datasets)
      distances: "09.distances"
    }
    # Supported dataset types; :multi marks multi-organism datasets.
    @@KNOWN_TYPES = {
      genome: {description: "The genome from an isolate.", multi: false},
      metagenome: {description: "A metagenome (excluding viromes).",
        multi: true},
      virome: {description: "A viral metagenome.", multi: true},
      scgenome: {description: "A genome from a single cell.", multi: false},
      popgenome: {description: "The genome of a population (including " +
        "microdiversity).", :multi=>false}
    }
    # Ordered pipeline of preprocessing tasks.
    @@PREPROCESSING_TASKS = [:raw_reads, :trimmed_reads, :read_quality,
      :trimmed_fasta, :assembly, :cds, :essential_genes, :ssu, :mytaxa,
      :mytaxa_scan, :distances]
    # Tasks skipped for non-reference datasets.
    @@EXCLUDE_NOREF_TASKS = [:essential_genes, :mytaxa_scan]
    # Tasks restricted to non-multi (single-organism) datasets.
    @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances]
    # Tasks restricted to multi-organism datasets.
    @@ONLY_MULTI_TASKS = [:mytaxa]
    def self.PREPROCESSING_TASKS ; @@PREPROCESSING_TASKS ; end
    def self.RESULT_DIRS ; @@RESULT_DIRS end
    def self.KNOWN_TYPES ; @@KNOWN_TYPES end
    # True when a metadata JSON file for dataset +name+ exists in
    # +project+.
    def self.exist?(project, name)
      File.exist? project.path + "/metadata/" + name + ".json"
    end
    # Metadata fields reported by #info, in order.
    def self.INFO_FIELDS
      %w(name created updated type ref user description comments)
    end
    # Instance
    attr_reader :project, :name, :metadata
    # Binds the dataset +name+ of +project+, creating/loading its
    # metadata file. NOTE(review): aborts the whole process on an
    # invalid name instead of raising; kept for compatibility.
    def initialize(project, name, is_ref=true, metadata={})
      abort "Invalid name '#{name}', please use only alphanumerics and " +
        "underscores." unless name.miga_name?
      @project = project
      @name = name
      metadata[:ref] = is_ref
      @metadata = Metadata.new(project.path + "/metadata/" + name + ".json",
        metadata)
    end
    # Persists the metadata; community taxonomy forces :metagenome type.
    def save
      self.metadata[:type] = :metagenome if !metadata[:tax].nil? and
        !metadata[:tax][:ns].nil? and
        metadata[:tax][:ns]=="COMMUNITY"
      self.metadata.save
      self.load
    end
    # Reloads the dataset (currently a no-op placeholder).
    def load
      # Nothing here...
    end
    # Removes all results and the metadata of the dataset.
    def remove!
      self.results.each{ |r| r.remove! }
      self.metadata.remove!
    end
    # Values of Dataset.INFO_FIELDS for this dataset.
    def info()
      Dataset.INFO_FIELDS.map do |k|
        (k=="name") ? self.name : self.metadata[k.to_sym]
      end
    end
    # True when the dataset is a reference dataset.
    def is_ref?() !!self.metadata[:ref] end
    # True when the dataset type is multi-organism (false if untyped).
    def is_multi?
      return false if self.metadata[:type].nil?
      return @@KNOWN_TYPES[self.metadata[:type]][:multi]
    end
    # True when the dataset type is single-organism (false if untyped).
    def is_nonmulti?
      return false if self.metadata[:type].nil?
      return !@@KNOWN_TYPES[self.metadata[:type]][:multi]
    end
    # Loads the stored result of type +k+, or nil if unknown/missing.
    def result(k)
      return nil if @@RESULT_DIRS[k.to_sym].nil?
      Result.load(project.path + "/data/" + @@RESULT_DIRS[k.to_sym] +
        "/" + name + ".json")
    end
    # All stored results of the dataset.
    def results() @@RESULT_DIRS.keys.map{ |k| self.result k }.compact end
    # Yields each (result_type, result) pair with a stored result.
    def each_result(&blk)
      @@RESULT_DIRS.keys.each do |k|
        v = self.result k
        blk.call(k,v) unless v.nil?
      end
    end
    # Registers the result of type +result_type+ if its expected output
    # files exist (a ".done" marker plus type-specific files). Returns
    # the saved Result, or nil when the result is not (yet) complete.
    def add_result result_type
      return nil if @@RESULT_DIRS[result_type].nil?
      base = project.path + "/data/" + @@RESULT_DIRS[result_type] +
        "/" + name
      return nil unless File.exist? base + ".done"
      r = nil
      case result_type
      when :raw_reads
        return nil unless
          File.exist? base + ".1.fastq" or
          File.exist? base + ".1.fastq.gz"
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.fastq.gz")
        if File.exist? base + ".2.fastq" + (r.data[:gz] ? ".gz" : "")
          r.add_file :pair1, name + ".1.fastq"
          r.add_file :pair2, name + ".2.fastq"
        else
          r.add_file :single, name + ".1.fastq"
        end
      when :trimmed_reads
        return nil unless
          File.exist?(base + ".1.clipped.fastq") or
          File.exist?(base + ".1.clipped.fastq.gz")
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.clipped.fastq.gz")
        if File.exist? base + ".2.clipped.fastq" + (r.data[:gz] ? ".gz":"")
          r.add_file :pair1, name + ".1.clipped.fastq"
          r.add_file :pair2, name + ".2.clipped.fastq"
        end
        r.add_file :single, name + ".1.clipped.single.fastq"
        add_result :raw_reads #-> Post gunzip (if any)
      when :read_quality
        return nil unless
          Dir.exist?(base + ".solexaqa") and
          Dir.exist?(base + ".fastqc")
        r = Result.new base + ".json"
        r.add_file :solexaqa, self.name + ".solexaqa"
        r.add_file :fastqc, self.name + ".fastqc"
        add_result :trimmed_reads #-> Post cleaning
      when :trimmed_fasta
        return nil unless
          File.exist?(base + ".CoupledReads.fa") or
          File.exist?(base + ".SingleReads.fa")
        r = Result.new base + ".json"
        if File.exist?(base + ".CoupledReads.fa")
          r.add_file :coupled, name + ".CoupledReads.fa"
          r.add_file :pair1, name + ".1.fa"
          r.add_file :pair2, name + ".2.fa"
        end
        r.add_file :single, name + ".SingleReads.fa"
        add_result :raw_reads #-> Post gzip
      when :assembly
        return nil unless
          File.exist?(base + ".LargeContigs.fna")
        r = Result.new base + ".json"
        r.add_file :largecontigs, name + ".LargeContigs.fna"
        r.add_file :allcontigs, name + ".AllContigs.fna"
      when :cds
        return nil unless
          File.exist?(base + ".faa") and
          File.exist?(base + ".fna")
        r = Result.new base + ".json"
        r.add_file :proteins, name + ".faa"
        r.add_file :genes, name + ".fna"
        %w(gff2 gff3 tab).each do |ext|
          r.add_file ext, "#{name}.#{ext}"
        end
      when :essential_genes
        return nil unless
          File.exist?(base + ".ess.faa") and
          Dir.exist?(base + ".ess") and
          File.exist?(base + ".ess/log")
        r = Result.new base + ".json"
        r.add_file :ess_genes, name + ".ess.faa"
        r.add_file :collection, name + ".ess"
        r.add_file :report, name + ".ess/log"
      when :ssu
        # Without an assembly an (empty) result is still registered
        if result(:assembly).nil?
          r = Result.new base + ".json"
        else
          return nil unless
            File.exist?(base + ".ssu.fa") or
            File.exist?(base + ".ssu.fa.gz")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".ssu.fa.gz")
          r.add_file :longest_ssu_gene, name + ".ssu.fa"
          r.add_file :gff, name + ".ssu.gff"
          r.add_file :all_ssu_genes, name + ".ssu.all.fa"
        end
      when :mytaxa
        # Only meaningful for multi-organism datasets; otherwise an
        # empty placeholder result is registered
        if is_multi?
          return nil unless File.exist?(base + ".mytaxa")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".mytaxain.gz")
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :mytaxa_scan
        # Only meaningful for single-organism datasets; otherwise an
        # empty placeholder result is registered
        if is_nonmulti?
          return nil unless
            File.exist?(base + ".pdf") and
            File.exist?(base + ".wintax") and
            File.exist?(base + ".mytaxa") and
            Dir.exist?(base + ".reg")
          r = Result.new base + ".json"
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :wintax, name + ".wintax"
          r.add_file :report, name + ".pdf"
          r.add_file :regions, name + ".reg"
          r.add_file :gene_ids, name + ".wintax.genes"
          r.add_file :region_ids, name + ".wintax.regions"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :distances
        if is_nonmulti?
          pref = project.path + "/data/" + @@RESULT_DIRS[result_type]
          # Reference datasets require the hAAI DB; others the AAI DB
          if is_ref?
            return nil unless
              File.exist?(pref + "/01.haai/" + name + ".db")
          else
            return nil unless
              File.exist?(pref + "/02.aai/" + name + ".db")
          end
          r = Result.new base + ".json"
          r.add_file :haai_db, "01.haai/" + name + ".db"
          r.add_file :aai_db, "02.aai/" + name + ".db"
          r.add_file :ani_db, "03.ani/" + name + ".db"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      end
      r.save
      r
    end # def add_result
    # First task of the pipeline with a registered result, or nil.
    def first_preprocessing
      @@PREPROCESSING_TASKS.find{ |t| not self.add_result(t).nil? }
    end
    # Next applicable task without a result (after the first completed
    # one), honoring the ref/multi/nonmulti task restrictions; nil when
    # nothing is pending.
    def next_preprocessing
      after_first = false
      first = self.first_preprocessing
      return nil if first.nil?
      @@PREPROCESSING_TASKS.each do |t|
        next if @@EXCLUDE_NOREF_TASKS.include?(t) and not is_ref?
        next if @@ONLY_MULTI_TASKS.include?(t) and not is_multi?
        next if @@ONLY_NONMULTI_TASKS.include?(t) and not is_nonmulti?
        return t if after_first and add_result(t).nil?
        after_first = (after_first or (t==first))
      end
      nil
    end
    # True when preprocessing has started and nothing is pending.
    def done_preprocessing?
      !first_preprocessing.nil? and next_preprocessing.nil?
    end
    # Per-task progress profile: 0 = before start, 1 = done,
    # 2 = current/next task onwards.
    def profile_advance
      if first_preprocessing.nil?
        adv = Array.new(@@PREPROCESSING_TASKS.size, 0)
      else
        adv = []
        state = 0
        first_task = first_preprocessing
        next_task = next_preprocessing
        @@PREPROCESSING_TASKS.each do |task|
          state = 1 if first_task==task
          state = 2 if !next_task.nil? and next_task==task
          adv << state
        end
      end
      adv
    end
  end # class Dataset
end # module MiGA