miga-base 0.2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
@@ -0,0 +1,178 @@
1
+ #
2
+ # @package MiGA
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license artistic license 2.0
5
+ # @update Nov-12-2015
6
+ #
7
+
8
+ require "miga/project"
9
+ require "daemons"
10
+ require "date"
11
+
12
+ module MiGA
13
+ class Daemon
14
+ def self.last_alive(p)
15
+ f = File.expand_path("daemon/alive", p.path)
16
+ return nil unless File.size? f
17
+ DateTime.parse(File.read(f))
18
+ end
19
+
20
+ attr_reader :project, :options, :jobs_to_run, :jobs_running
21
+ def initialize(p)
22
+ @project = p
23
+ @runopts = JSON.parse(
24
+ File.read(File.expand_path("daemon/daemon.json", project.path)),
25
+ {:symbolize_names=>true})
26
+ @jobs_to_run = []
27
+ @jobs_running = []
28
+ end
29
+ def last_alive
30
+ Daemon.last_alive project
31
+ end
32
+ def default_options
33
+ { dir_mode: :normal, dir: File.expand_path("daemon", project.path),
34
+ multiple: false, log_output: true }
35
+ end
36
+ def runopts(k, v=nil)
37
+ k = k.to_sym
38
+ unless v.nil?
39
+ v = v.to_i if [:latency, :maxjobs, :ppn].include? k
40
+ raise "Daemon's #{k} cannot be set to zero." if
41
+ v.is_a? Integer and v==0
42
+ @runopts[k] = v
43
+ end
44
+ @runopts[k]
45
+ end
46
+ def latency() runopts(:latency) ; end
47
+ def maxjobs() runopts(:maxjobs) ; end
48
+ def ppn() runopts(:ppn) ; end
49
+ def start() daemon("start") ; end
50
+ def stop() daemon("stop") ; end
51
+ def restart() daemon("restart") ; end
52
+ def status() daemon("status") ; end
53
+ def daemon(task, opts=[])
54
+ options = default_options
55
+ opts.unshift(task)
56
+ options[:ARGV] = opts
57
+ Daemons.run_proc("MiGA:#{project.metadata[:name]}", options) do
58
+ p = project
59
+ say "-----------------------------------"
60
+ say "MiGA:#{p.metadata[:name]} launched."
61
+ say "-----------------------------------"
62
+ loop_i = 0
63
+ loop do
64
+ # Tell the world you're alive
65
+ f = File.open(File.expand_path("daemon/alive", project.path),"w")
66
+ f.print Time.now.to_s
67
+ f.close
68
+ loop_i += 1
69
+ # Traverse datasets
70
+ p.datasets.each do |ds|
71
+ # Inspect preprocessing
72
+ to_run = ds.next_preprocessing
73
+ # Launch task
74
+ queue_job(to_run, ds) unless to_run.nil?
75
+ end
76
+
77
+ # Check if all the reference datasets are pre-processed.
78
+ # If yes, check the project-level tasks
79
+ if p.done_preprocessing?
80
+ to_run = p.next_distances
81
+ to_run = p.next_inclade if to_run.nil?
82
+ # Launch task
83
+ queue_job(to_run) unless to_run.nil?
84
+ end
85
+
86
+ # Run jobs
87
+ flush!
88
+
89
+ # Every 12 loops:
90
+ if loop_i==12
91
+ say "Housekeeping for sanity"
92
+ loop_i = 0
93
+ # Check if running jobs are alive
94
+ purge!
95
+ # Reload project metadata (to add newly created datasets)
96
+ project.load
97
+ end
98
+ sleep(latency)
99
+ end
100
+ end
101
+ end
102
+ def queue_job(job, ds=nil)
103
+ return nil unless get_job(job, ds).nil?
104
+ ds_name = (ds.nil? ? "miga-project" : ds.name)
105
+ say "Queueing ", ds_name, ":#{job}"
106
+ type = runopts(:type)
107
+ vars = {
108
+ "PROJECT"=>project.path, "RUNTYPE"=>runopts(:type), "CORES"=>ppn,
109
+ "MIGA"=>File.expand_path("../..", File.dirname(__FILE__)) }
110
+ vars["DATASET"] = ds.name unless ds.nil?
111
+ log_dir = File.expand_path("daemon/#{job}", project.path)
112
+ Dir.mkdir log_dir unless Dir.exist? log_dir
113
+ to_run = {ds: ds, job: job, cmd: sprintf(runopts(:cmd),
114
+ # 1: script
115
+ vars["MIGA"] + "/scripts/#{job.to_s}.bash",
116
+ # 2: vars
117
+ vars.keys.map{|k| sprintf(runopts(:var),k,vars[k])
118
+ }.join(runopts(:varsep)),
119
+ # 3: CPUs
120
+ ppn,
121
+ # 4: log file
122
+ File.expand_path("#{ds_name}.log", log_dir),
123
+ # 5: task name
124
+ "#{project.metadata[:name][0..9]}:#{job}:#{ds_name}")}
125
+ @jobs_to_run << to_run
126
+ end
127
+ def get_job(job, ds=nil)
128
+ if ds==nil
129
+ (@jobs_to_run + @jobs_running).select do |j|
130
+ (j[:ds].nil?) and (j[:job]==job)
131
+ end.first
132
+ else
133
+ (@jobs_to_run + @jobs_running).select do |j|
134
+ (not j[:ds].nil?) and (j[:ds].name==ds.name) and (j[:job]==job)
135
+ end.first
136
+ end
137
+ end
138
+ def flush!
139
+ # Check for finished jobs
140
+ self.jobs_running.select! do |job|
141
+ r = job[:ds].nil? ?
142
+ self.project.add_result(job[:job]) :
143
+ job[:ds].add_result(job[:job])
144
+ say "Completed pid:#{job[:pid]} for " +
145
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}" unless
146
+ r.nil?
147
+ r.nil?
148
+ end
149
+
150
+ # Avoid single datasets hogging resources
151
+ @jobs_to_run.rotate! rand(@jobs_to_run.size)
152
+
153
+ # Launch as many @jobs_to_run as possible
154
+ while jobs_running.size < maxjobs
155
+ break if jobs_to_run.empty?
156
+ job = self.jobs_to_run.shift
157
+ if runopts(:type) == "bash"
158
+ job[:pid] = spawn job[:cmd]
159
+ Process.detach job[:pid]
160
+ else
161
+ job[:pid] = `#{job[:cmd]}`.gsub(/[\n\r]/,"")
162
+ end
163
+ @jobs_running << job
164
+ say "Spawned pid:#{job[:pid]} for " +
165
+ "#{job[:ds].nil? ? "" : "#{job[:ds].name}:"}#{job[:job]}"
166
+ end
167
+ end
168
+ def purge!
169
+ self.jobs_running.select! do |job|
170
+ `#{sprintf(runopts(:alive), job[:pid])}`.chomp.to_i == 1
171
+ end
172
+ end
173
+ def say(*opts)
174
+ print "[#{Time.new.inspect}] ", *opts, "\n"
175
+ end
176
+ end
177
+ end
178
+
@@ -0,0 +1,286 @@
#
# @package MiGA
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
# @license artistic license 2.0
# @update Jan-18-2016
#

require "miga/metadata"
require "miga/project"
require "miga/result"

module MiGA
  # A dataset inside a MiGA project. Wraps the dataset's metadata and the
  # per-task results stored under the project's data directory.
  class Dataset
    # Class
    # Result folder (relative to <project>/data) for each supported task.
    @@RESULT_DIRS = {
      # Preprocessing
      raw_reads: "01.raw_reads", trimmed_reads: "02.trimmed_reads",
      read_quality: "03.read_quality", trimmed_fasta: "04.trimmed_fasta",
      assembly: "05.assembly", cds: "06.cds",
      # Annotation
      essential_genes: "07.annotation/01.function/01.essential",
      ssu: "07.annotation/01.function/02.ssu",
      mytaxa: "07.annotation/02.taxonomy/01.mytaxa",
      mytaxa_scan: "07.annotation/03.qa/02.mytaxa_scan",
      # Mapping
      mapping_on_contigs: "08.mapping/01.read-ctg",
      mapping_on_genes: "08.mapping/02.read-gene",
      # Distances (for single-species datasets)
      distances: "09.distances"
    }
    # Supported dataset types; :multi marks multi-organism datasets.
    @@KNOWN_TYPES = {
      genome: {description: "The genome from an isolate.", multi: false},
      metagenome: {description: "A metagenome (excluding viromes).",
        multi: true},
      virome: {description: "A viral metagenome.", multi: true},
      scgenome: {description: "A genome from a single cell.", multi: false},
      popgenome: {description: "The genome of a population (including " +
        "microdiversity).", multi: false}
    }
    # Preprocessing tasks, in execution order.
    @@PREPROCESSING_TASKS = [:raw_reads, :trimmed_reads, :read_quality,
      :trimmed_fasta, :assembly, :cds, :essential_genes, :ssu, :mytaxa,
      :mytaxa_scan, :distances]
    # Tasks skipped for non-reference datasets.
    @@EXCLUDE_NOREF_TASKS = [:essential_genes, :mytaxa_scan]
    # Tasks executed only for single-organism (non-multi) datasets.
    @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :distances]
    # Tasks executed only for multi-organism datasets.
    @@ONLY_MULTI_TASKS = [:mytaxa]

    def self.PREPROCESSING_TASKS ; @@PREPROCESSING_TASKS ; end
    def self.RESULT_DIRS ; @@RESULT_DIRS end
    def self.KNOWN_TYPES ; @@KNOWN_TYPES end

    # True if a dataset named +name+ already exists in +project+.
    def self.exist?(project, name)
      File.exist? project.path + "/metadata/" + name + ".json"
    end

    # Metadata fields reported by #info, in order.
    def self.INFO_FIELDS
      %w(name created updated type ref user description comments)
    end

    # Instance
    attr_reader :project, :name, :metadata

    # Loads (or registers) dataset +name+ in +project+. +is_ref+ marks it
    # as a reference dataset; extra +metadata+ is stored as given.
    # NOTE(review): aborts the whole process on an invalid name.
    def initialize(project, name, is_ref=true, metadata={})
      abort "Invalid name '#{name}', please use only alphanumerics and " +
        "underscores." unless name.miga_name?
      @project = project
      @name = name
      metadata[:ref] = is_ref
      @metadata = Metadata.new(project.path + "/metadata/" + name + ".json",
        metadata)
    end

    # Persists the dataset's metadata. Datasets whose taxonomy namespace is
    # COMMUNITY are forced to type :metagenome before saving.
    def save
      self.metadata[:type] = :metagenome if !metadata[:tax].nil? and
        !metadata[:tax][:ns].nil? and
        metadata[:tax][:ns]=="COMMUNITY"
      self.metadata.save
      self.load
    end

    # Reloads the dataset (currently a no-op placeholder).
    def load
      # Nothing here...
    end

    # Removes all stored results and the metadata file from disk.
    def remove!
      self.results.each{ |r| r.remove! }
      self.metadata.remove!
    end

    # Values of Dataset.INFO_FIELDS for this dataset, in the same order.
    def info
      Dataset.INFO_FIELDS.map do |k|
        (k=="name") ? self.name : self.metadata[k.to_sym]
      end
    end

    # True for reference datasets.
    def is_ref? ; !!self.metadata[:ref] ; end

    # True when the (known) type is multi-organism; false when type is
    # unset.
    def is_multi?
      return false if self.metadata[:type].nil?
      @@KNOWN_TYPES[self.metadata[:type]][:multi]
    end

    # True when the (known) type is single-organism; false when type is
    # unset.
    def is_nonmulti?
      return false if self.metadata[:type].nil?
      !@@KNOWN_TYPES[self.metadata[:type]][:multi]
    end

    # Loads the stored result for task +k+, or nil when the task is unknown
    # or the result JSON doesn't exist.
    def result(k)
      return nil if @@RESULT_DIRS[k.to_sym].nil?
      Result.load(project.path + "/data/" + @@RESULT_DIRS[k.to_sym] +
        "/" + name + ".json")
    end

    # All stored results.
    def results ; @@RESULT_DIRS.keys.map{ |k| self.result k }.compact ; end

    # Yields each stored (task, result) pair to +blk+.
    def each_result(&blk)
      @@RESULT_DIRS.keys.each do |k|
        v = self.result k
        blk.call(k,v) unless v.nil?
      end
    end

    # Registers the result of +result_type+ if its output files exist on
    # disk. Returns the saved Result, or nil when the task is unknown, not
    # flagged .done, or its expected files are missing.
    def add_result(result_type)
      return nil if @@RESULT_DIRS[result_type].nil?
      base = project.path + "/data/" + @@RESULT_DIRS[result_type] +
        "/" + name
      return nil unless File.exist? base + ".done"
      r = nil
      case result_type
      when :raw_reads
        return nil unless
          File.exist? base + ".1.fastq" or
          File.exist? base + ".1.fastq.gz"
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.fastq.gz")
        if File.exist? base + ".2.fastq" + (r.data[:gz] ? ".gz" : "")
          r.add_file :pair1, name + ".1.fastq"
          r.add_file :pair2, name + ".2.fastq"
        else
          r.add_file :single, name + ".1.fastq"
        end
      when :trimmed_reads
        return nil unless
          File.exist?(base + ".1.clipped.fastq") or
          File.exist?(base + ".1.clipped.fastq.gz")
        r = Result.new base + ".json"
        r.data[:gz] = File.exist?(base + ".1.clipped.fastq.gz")
        if File.exist? base + ".2.clipped.fastq" + (r.data[:gz] ? ".gz":"")
          r.add_file :pair1, name + ".1.clipped.fastq"
          r.add_file :pair2, name + ".2.clipped.fastq"
        end
        r.add_file :single, name + ".1.clipped.single.fastq"
        add_result :raw_reads #-> Post gunzip (if any)
      when :read_quality
        return nil unless
          Dir.exist?(base + ".solexaqa") and
          Dir.exist?(base + ".fastqc")
        r = Result.new base + ".json"
        r.add_file :solexaqa, self.name + ".solexaqa"
        r.add_file :fastqc, self.name + ".fastqc"
        add_result :trimmed_reads #-> Post cleaning
      when :trimmed_fasta
        return nil unless
          File.exist?(base + ".CoupledReads.fa") or
          File.exist?(base + ".SingleReads.fa")
        r = Result.new base + ".json"
        if File.exist?(base + ".CoupledReads.fa")
          r.add_file :coupled, name + ".CoupledReads.fa"
          r.add_file :pair1, name + ".1.fa"
          r.add_file :pair2, name + ".2.fa"
        end
        r.add_file :single, name + ".SingleReads.fa"
        add_result :raw_reads #-> Post gzip
      when :assembly
        return nil unless
          File.exist?(base + ".LargeContigs.fna")
        r = Result.new base + ".json"
        r.add_file :largecontigs, name + ".LargeContigs.fna"
        r.add_file :allcontigs, name + ".AllContigs.fna"
      when :cds
        return nil unless
          File.exist?(base + ".faa") and
          File.exist?(base + ".fna")
        r = Result.new base + ".json"
        r.add_file :proteins, name + ".faa"
        r.add_file :genes, name + ".fna"
        %w(gff2 gff3 tab).each do |ext|
          r.add_file ext, "#{name}.#{ext}"
        end
      when :essential_genes
        return nil unless
          File.exist?(base + ".ess.faa") and
          Dir.exist?(base + ".ess") and
          File.exist?(base + ".ess/log")
        r = Result.new base + ".json"
        r.add_file :ess_genes, name + ".ess.faa"
        r.add_file :collection, name + ".ess"
        r.add_file :report, name + ".ess/log"
      when :ssu
        # Without an assembly the task is vacuously complete (empty result).
        if result(:assembly).nil?
          r = Result.new base + ".json"
        else
          return nil unless
            File.exist?(base + ".ssu.fa") or
            File.exist?(base + ".ssu.fa.gz")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".ssu.fa.gz")
          r.add_file :longest_ssu_gene, name + ".ssu.fa"
          r.add_file :gff, name + ".ssu.gff"
          r.add_file :all_ssu_genes, name + ".ssu.all.fa"
        end
      when :mytaxa
        # Only meaningful for multi-organism datasets; otherwise an empty
        # result marks the task as done.
        if is_multi?
          return nil unless File.exist?(base + ".mytaxa")
          r = Result.new base + ".json"
          r.data[:gz] = File.exist?(base + ".mytaxain.gz")
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :mytaxa_scan
        # Only meaningful for single-organism datasets; otherwise an empty
        # result marks the task as done.
        if is_nonmulti?
          # File.exist? (not the deprecated File.exists?, removed in
          # Ruby 3.2), consistent with the rest of this method.
          return nil unless
            File.exist?(base + ".pdf") and
            File.exist?(base + ".wintax") and
            File.exist?(base + ".mytaxa") and
            Dir.exist?(base + ".reg")
          r = Result.new base + ".json"
          r.add_file :mytaxa, name + ".mytaxa"
          r.add_file :wintax, name + ".wintax"
          r.add_file :report, name + ".pdf"
          r.add_file :regions, name + ".reg"
          r.add_file :gene_ids, name + ".wintax.genes"
          r.add_file :region_ids, name + ".wintax.regions"
          r.add_file :blast, name + ".blast"
          r.add_file :mytaxain, name + ".mytaxain"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      when :distances
        if is_nonmulti?
          pref = project.path + "/data/" + @@RESULT_DIRS[result_type]
          # Reference datasets are indexed by hAAI, non-reference by AAI.
          if is_ref?
            return nil unless
              File.exist?(pref + "/01.haai/" + name + ".db")
          else
            return nil unless
              File.exist?(pref + "/02.aai/" + name + ".db")
          end
          r = Result.new base + ".json"
          r.add_file :haai_db, "01.haai/" + name + ".db"
          r.add_file :aai_db, "02.aai/" + name + ".db"
          r.add_file :ani_db, "03.ani/" + name + ".db"
        else
          r = Result.new base + ".json"
          r.data[:files] = {}
        end
      end
      r.save
      r
    end # def add_result

    # First task whose result can be registered, or nil when none can.
    def first_preprocessing
      @@PREPROCESSING_TASKS.find{ |t| not self.add_result(t).nil? }
    end

    # Next applicable task (after the first completed one) that still lacks
    # a result, or nil when preprocessing hasn't started or is complete.
    def next_preprocessing
      after_first = false
      first = self.first_preprocessing
      return nil if first.nil?
      @@PREPROCESSING_TASKS.each do |t|
        next if @@EXCLUDE_NOREF_TASKS.include?(t) and not is_ref?
        next if @@ONLY_MULTI_TASKS.include?(t) and not is_multi?
        next if @@ONLY_NONMULTI_TASKS.include?(t) and not is_nonmulti?
        return t if after_first and add_result(t).nil?
        after_first = (after_first or (t==first))
      end
      nil
    end

    # True when preprocessing has started and no applicable task is
    # pending.
    def done_preprocessing?
      !first_preprocessing.nil? and next_preprocessing.nil?
    end

    # Per-task progress profile: one entry per preprocessing task, where
    # 0 = not started, 1 = done, 2 = pending (from the next task onwards).
    def profile_advance
      if first_preprocessing.nil?
        adv = Array.new(@@PREPROCESSING_TASKS.size, 0)
      else
        adv = []
        state = 0
        first_task = first_preprocessing
        next_task = next_preprocessing
        @@PREPROCESSING_TASKS.each do |task|
          state = 1 if first_task==task
          state = 2 if !next_task.nil? and next_task==task
          adv << state
        end
      end
      adv
    end
  end # class Dataset
end # module MiGA
+