RubyGems - miga-base - Versions diffs - 1.2.15.0 → 1.2.15.1 - Mend

miga-base 1.2.15.0 → 1.2.15.1

Files changed (17) hide show

checksums.yaml +4 -4
data/lib/miga/cli/action/doctor/base.rb +16 -40
data/lib/miga/cli/action/doctor/databases.rb +39 -0
data/lib/miga/cli/action/doctor/distances.rb +144 -0
data/lib/miga/cli/action/doctor/operations.rb +159 -0
data/lib/miga/cli/action/doctor.rb +7 -287
data/lib/miga/cli/action/download/base.rb +48 -1
data/lib/miga/cli/action/gtdb_get.rb +2 -31
data/lib/miga/cli/action/ncbi_get.rb +6 -31
data/lib/miga/cli/opt_helper.rb +1 -1
data/lib/miga/dataset/base.rb +34 -5
data/lib/miga/dataset/result/add.rb +286 -0
data/lib/miga/dataset/result/ignore.rb +93 -0
data/lib/miga/dataset/result.rb +31 -342
data/lib/miga/version.rb +2 -2
data/test/remote_dataset_test.rb +6 -0
metadata +7 -2

data/lib/miga/dataset/result/add.rb ADDED Viewed

@@ -0,0 +1,286 @@
+# frozen_string_literal: true
+module MiGA::Dataset::Result::Add
+  ##
+  # Add result type +:raw_reads+ at +base+ (no +_opts+ supported)
+  def add_result_raw_reads(base, _opts)
+    return nil unless result_files_exist?(base, '.1.fastq')
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      if result_files_exist?(base, '.2.fastq')
+        { pair1: '.1.fastq', pair2: '.2.fastq' }
+      else
+        { single: '.1.fastq' }
+      end
+    )
+  end
+  ##
+  # Add result type +:trimmed_reads+ at +base+ (no +_opts+ supported)
+  def add_result_trimmed_reads(base, _opts)
+    return nil unless result_files_exist?(base, '.1.clipped.fastq')
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      if result_files_exist?(base, '.2.clipped.fastq')
+        { pair1: '.1.clipped.fastq', pair2: '.2.clipped.fastq' }
+      else
+        { single: '.1.clipped.fastq' }
+      end
+    ).tap do |r|
+      # Legacy files
+      r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
+      r.add_file(:single, "#{name}.1.clipped.single.fastq")
+    end
+  end
+  ##
+  # Add result type +:read_quality+ at +base+ (no +_opts+ supported)
+  def add_result_read_quality(base, _opts)
+    return nil unless
+      result_files_exist?(base, %w[.post.1.html]) ||
+      result_files_exist?(base, %w[.solexaqa .fastqc])
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      pre_qc_1: '.pre.1.html', pre_qc_2: '.pre.2.html',
+      post_qc_1: '.post.1.html', post_qc_2: '.post.2.html',
+      adapter_detection: '.adapters.txt',
+      # Legacy files
+      solexaqa: '.solexaqa', fastqc: '.fastqc'
+    )
+  end
+  ##
+  # Add result type +:trimmed_fasta+ at +base+ (no +_opts+ supported)
+  def add_result_trimmed_fasta(base, _opts)
+    return nil unless
+      result_files_exist?(base, '.CoupledReads.fa') ||
+      result_files_exist?(base, '.SingleReads.fa')  ||
+      result_files_exist?(base, %w[.1.fasta .2.fasta])
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      coupled: '.CoupledReads.fa',
+      single: '.SingleReads.fa',
+      pair1: '.1.fasta',
+      pair2: '.2.fasta'
+    )
+  end
+  ##
+  # Add result type +:assembly+ at +base+. Hash +opts+ supports
+  # +is_clean: Boolean+.
+  def add_result_assembly(base, opts)
+    return nil unless result_files_exist?(base, '.LargeContigs.fna')
+    r = add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      largecontigs: '.LargeContigs.fna',
+      allcontigs: '.AllContigs.fna',
+      assembly_data: ''
+    )
+    opts[:is_clean] ||= false
+    r.clean! if opts[:is_clean]
+    unless r.clean?
+      MiGA::MiGA.clean_fasta_file(r.file_path(:largecontigs))
+      r.clean!
+    end
+    r
+  end
+  ##
+  # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
+  def add_result_cds(base, opts)
+    return nil unless result_files_exist?(base, %w[.faa])
+    r = add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      proteins: '.faa',
+      genes: '.fna',
+      gff2: '.gff2',
+      gff3: '.gff3',
+      tab: '.tab'
+    )
+    opts[:is_clean] ||= false
+    r.clean! if opts[:is_clean]
+    unless r.clean?
+      MiGA::MiGA.clean_fasta_file(r.file_path(:proteins))
+      MiGA::MiGA.clean_fasta_file(r.file_path(:genes)) if r.file_path(:genes)
+      r.clean!
+    end
+    r
+  end
+  ##
+  # Add result type +:essential_genes+ at +base+ (no +_opts+ supported).
+  def add_result_essential_genes(base, _opts)
+    return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      ess_genes: '.ess.faa',
+      collection: '.ess',
+      report: '.ess/log',
+      alignments: '.ess/proteins.aln',
+      fastaai_index: '.faix.db.gz',
+      fastaai_index_2: '.faix'
+    )
+  end
+  ##
+  # Add result type +:ssu+ at +base+. Hash +opts+ supports +is_clean: Boolean+
+  def add_result_ssu(base, opts)
+    return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
+    return nil unless result_files_exist?(base, '.ssu.fa')
+    r = add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      longest_ssu_gene: '.ssu.fa',
+      ssu_gff: '.ssu.gff', # DEPRECATED
+      gff: '.gff',
+      all_ssu_genes: '.ssu.all.fa',
+      classification: '.rdp.tsv',
+      trna_list: '.trna.txt'
+    )
+    opts[:is_clean] ||= false
+    r.clean! if opts[:is_clean]
+    unless r.clean?
+      MiGA::MiGA.clean_fasta_file(r.file_path(:longest_ssu_gene))
+      r.clean!
+    end
+    r
+  end
+  ##
+  # Add result type +:mytaxa+ at +base+ (no +_opts+ supported)
+  def add_result_mytaxa(base, _opts)
+    if multi?
+      return nil unless
+        result_files_exist?(base, '.mytaxa') ||
+        result_files_exist?(base, '.nomytaxa.txt')
+      add_files_to_ds_result(
+        MiGA::Result.new("#{base}.json"), name,
+        mytaxa: '.mytaxa',
+        blast: '.blast',
+        mytaxain: '.mytaxain',
+        nomytaxa: '.nomytaxa.txt',
+        species: '.mytaxa.Species.txt',
+        genus: '.mytaxa.Genus.txt',
+        phylum: '.mytaxa.Phylum.txt',
+        innominate: '.mytaxa.innominate',
+        kronain: '.mytaxa.krona',
+        krona: '.html'
+      )
+    else
+      MiGA::Result.new("#{base}.json")
+    end
+  end
+  ##
+  # Add result type +:mytaxa_scan+ at +base+ (no +_opts+ supported)
+  def add_result_mytaxa_scan(base, _opts)
+    if nonmulti?
+      return nil unless
+        result_files_exist?(base, %w[.pdf .mytaxa]) ||
+        result_files_exist?(base, '.nomytaxa.txt')
+      add_files_to_ds_result(
+        MiGA::Result.new("#{base}.json"), name,
+        nomytaxa: '.nomytaxa.txt',
+        mytaxa: '.mytaxa',
+        report: '.pdf',
+        regions_archive: '.reg.tar',
+        # Intermediate / Deprecated:
+        blast: '.blast',
+        mytaxain: '.mytaxain',
+        wintax: '.wintax',
+        gene_ids: '.wintax.genes',
+        region_ids: '.wintax.regions',
+        regions: '.reg'
+      )
+    else
+      MiGA::Result.new("#{base}.json")
+    end
+  end
+  ##
+  # Add result type +:distances+ at +base+ (no +_opts+ supported)
+  def add_result_distances(base, _opts)
+    if nonmulti?
+      if ref?
+        add_result_distances_ref(base)
+      else
+        add_result_distances_nonref(base)
+      end
+    else
+      add_result_distances_multi(base)
+    end
+  end
+  ##
+  # Add result type +:taxonomy+ at +base+ (no +_opts+ supported)
+  def add_result_taxonomy(base, _opts)
+    add_result_distances_nonref(base)
+  end
+  ##
+  # Add result type +:stats+ at +base+ (no +_opts+ supported)
+  def add_result_stats(base, _opts)
+    MiGA::Result.new("#{base}.json")
+  end
+  private
+  ##
+  # Add result type +:distances+ for _multi_ datasets at +base+
+  def add_result_distances_multi(base)
+    MiGA::Result.new("#{base}.json")
+  end
+  ##
+  # Add result type +:distances+ for _nonmulti_ reference datasets at +base+
+  def add_result_distances_ref(base)
+    pref = File.dirname(base)
+    return nil unless File.exist?("#{pref}/01.haai/#{name}.db")
+    MiGA::Result.new("#{base}.json").tap do |r|
+      r.add_files(
+        haai_db: "01.haai/#{name}.db",
+        aai_db: "02.aai/#{name}.db",
+        ani_db: "03.ani/#{name}.db"
+      )
+    end
+  end
+  ##
+  # Add result type +:distances+ for _nonmulti_ query datasets at +base+
+  def add_result_distances_nonref(base)
+    return nil unless
+      result_files_exist?(base, %w[.aai-medoids.tsv .aai.db]) ||
+      result_files_exist?(base, %w[.ani-medoids.tsv .ani.db])
+    add_files_to_ds_result(
+      MiGA::Result.new("#{base}.json"), name,
+      aai_medoids: '.aai-medoids.tsv',
+      haai_db: '.haai.db',
+      aai_db: '.aai.db',
+      ani_medoids: '.ani-medoids.tsv',
+      ani_db: '.ani.db',
+      ref_tree: '.nwk',
+      ref_tree_pdf: '.nwk.pdf',
+      intax_test: '.intax.txt'
+    )
+  end
+  ##
+  # Add files in +rel_files+ Hash to the result +r+ with dataset name +name+
+  def add_files_to_ds_result(r, name, rel_files)
+    files = {}
+    rel_files.each { |k, v| files[k] = name + v }
+    r.add_files(files)
+    r
+  end
+end

data/lib/miga/dataset/result/ignore.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+module MiGA::Dataset::Result::Ignore
+  ##
+  # Should I ignore +task+ for this dataset?
+  def ignore_task?(task)
+    why_ignore(task) != :execute
+  end
+  ##
+  # Returns an array of symbols indicating all the possible reasons why a
+  # given task migh be ignored:
+  # - empty: the dataset has no data
+  # - inactive: the dataset is inactive
+  # - upstream: the task is upstream from dataset's input
+  # - force: forced to ignore by metadata
+  # - project: incompatible project
+  # - noref: incompatible dataset, only for reference
+  # - multi: incompatible dataset, only for multi
+  # - nonmulti: incompatible dataset, only for nonmulti
+  # - complete: the task is already complete
+  def ignore_reasons
+    %i[empty inactive upstream force project noref multi nonmulti complete]
+  end
+  ##
+  # Return a code explaining why a task is ignored (see +ignore_reasons+) or
+  # the symbol +:execute+ (do not ignore, execute the task)
+  def why_ignore(task)
+    # Find a reason to ignore it
+    ignore_reasons.each do |i|
+      return i if send(:"ignore_#{i}?", task)
+    end
+    # Otherwise, execute
+    return :execute
+  end
+  ##
+  # Ignore +task+ because it's already done
+  def ignore_complete?(task)
+    !get_result(task).nil?
+  end
+  ##
+  # Ignore any task because the dataset is inactive (+_task+ is ignored)
+  def ignore_inactive?(_task)
+    !active?
+  end
+  ##
+  # Ignore any task because the dataset is empty (+_task+ is ignored)
+  def ignore_empty?(_task)
+    first_preprocessing.nil?
+  end
+  ##
+  # Ignore +task+ because it's upstream from the entry point
+  def ignore_upstream?(task)
+    self.class.PREPROCESSING_TASKS.index(task) <
+      self.class.PREPROCESSING_TASKS.index(first_preprocessing)
+  end
+  ##
+  # Ignore +task+ because the metadata says so
+  def ignore_force?(task)
+    !(metadata["run_#{task}"].nil? || metadata["run_#{task}"])
+  end
+  ##
+  # Ignore +task+ because the project is not compatible
+  def ignore_project?(task)
+    task == :taxonomy && project.option(:ref_project).nil?
+  end
+  ##
+  # Ignore +task+ because it's not a reference dataset
+  def ignore_noref?(task)
+    self.class.EXCLUDE_NOREF_TASKS.include?(task) && !ref?
+  end
+  ##
+  # Ignore +task+ because it's not a multi dataset
+  def ignore_multi?(task)
+    self.class.ONLY_MULTI_TASKS.include?(task) && !multi?
+  end
+  ##
+  # Ignore +task+ because it's not a nonmulti dataset
+  def ignore_nonmulti?(task)
+    self.class.ONLY_NONMULTI_TASKS.include?(task) && !nonmulti?
+  end
+end