miga-base 1.2.14.2 → 1.2.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
data/lib/miga/dataset/result/add.rb
@@ -0,0 +1,286 @@

# frozen_string_literal: true

module MiGA::Dataset::Result::Add
  ##
  # Add result type +:raw_reads+ at +base+ (no +_opts+ supported)
  def add_result_raw_reads(base, _opts)
    return nil unless result_files_exist?(base, '.1.fastq')

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      if result_files_exist?(base, '.2.fastq')
        { pair1: '.1.fastq', pair2: '.2.fastq' }
      else
        { single: '.1.fastq' }
      end
    )
  end

  ##
  # Add result type +:trimmed_reads+ at +base+ (no +_opts+ supported)
  def add_result_trimmed_reads(base, _opts)
    return nil unless result_files_exist?(base, '.1.clipped.fastq')

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      if result_files_exist?(base, '.2.clipped.fastq')
        { pair1: '.1.clipped.fastq', pair2: '.2.clipped.fastq' }
      else
        { single: '.1.clipped.fastq' }
      end
    ).tap do |r|
      # Legacy files
      r.add_file(:trimming_sumary, "#{name}.1.fastq.trimmed.summary.txt")
      r.add_file(:single, "#{name}.1.clipped.single.fastq")
    end
  end

  ##
  # Add result type +:read_quality+ at +base+ (no +_opts+ supported)
  def add_result_read_quality(base, _opts)
    return nil unless
      result_files_exist?(base, %w[.post.1.html]) ||
      result_files_exist?(base, %w[.solexaqa .fastqc])

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      pre_qc_1: '.pre.1.html', pre_qc_2: '.pre.2.html',
      post_qc_1: '.post.1.html', post_qc_2: '.post.2.html',
      adapter_detection: '.adapters.txt',
      # Legacy files
      solexaqa: '.solexaqa', fastqc: '.fastqc'
    )
  end

  ##
  # Add result type +:trimmed_fasta+ at +base+ (no +_opts+ supported)
  def add_result_trimmed_fasta(base, _opts)
    return nil unless
      result_files_exist?(base, '.CoupledReads.fa') ||
      result_files_exist?(base, '.SingleReads.fa') ||
      result_files_exist?(base, %w[.1.fasta .2.fasta])

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      coupled: '.CoupledReads.fa',
      single: '.SingleReads.fa',
      pair1: '.1.fasta',
      pair2: '.2.fasta'
    )
  end

  ##
  # Add result type +:assembly+ at +base+. Hash +opts+ supports
  # +is_clean: Boolean+.
  def add_result_assembly(base, opts)
    return nil unless result_files_exist?(base, '.LargeContigs.fna')

    r = add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      largecontigs: '.LargeContigs.fna',
      allcontigs: '.AllContigs.fna',
      assembly_data: ''
    )
    opts[:is_clean] ||= false
    r.clean! if opts[:is_clean]
    unless r.clean?
      MiGA::MiGA.clean_fasta_file(r.file_path(:largecontigs))
      r.clean!
    end
    r
  end

  ##
  # Add result type +:cds+ at +base+. Hash +opts+ supports +is_clean: Boolean+
  def add_result_cds(base, opts)
    return nil unless result_files_exist?(base, %w[.faa])

    r = add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      proteins: '.faa',
      genes: '.fna',
      gff2: '.gff2',
      gff3: '.gff3',
      tab: '.tab'
    )
    opts[:is_clean] ||= false
    r.clean! if opts[:is_clean]
    unless r.clean?
      MiGA::MiGA.clean_fasta_file(r.file_path(:proteins))
      MiGA::MiGA.clean_fasta_file(r.file_path(:genes)) if r.file_path(:genes)
      r.clean!
    end
    r
  end

  ##
  # Add result type +:essential_genes+ at +base+ (no +_opts+ supported).
  def add_result_essential_genes(base, _opts)
    return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      ess_genes: '.ess.faa',
      collection: '.ess',
      report: '.ess/log',
      alignments: '.ess/proteins.aln',
      fastaai_index: '.faix.db.gz',
      fastaai_index_2: '.faix'
    )
  end

  ##
  # Add result type +:ssu+ at +base+. Hash +opts+ supports +is_clean: Boolean+
  def add_result_ssu(base, opts)
    return MiGA::Result.new("#{base}.json") if result(:assembly).nil?
    return nil unless result_files_exist?(base, '.ssu.fa')

    r = add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      longest_ssu_gene: '.ssu.fa',
      ssu_gff: '.ssu.gff', # DEPRECATED
      gff: '.gff',
      all_ssu_genes: '.ssu.all.fa',
      classification: '.rdp.tsv',
      trna_list: '.trna.txt'
    )
    opts[:is_clean] ||= false
    r.clean! if opts[:is_clean]
    unless r.clean?
      MiGA::MiGA.clean_fasta_file(r.file_path(:longest_ssu_gene))
      r.clean!
    end
    r
  end

  ##
  # Add result type +:mytaxa+ at +base+ (no +_opts+ supported)
  def add_result_mytaxa(base, _opts)
    if multi?
      return nil unless
        result_files_exist?(base, '.mytaxa') ||
        result_files_exist?(base, '.nomytaxa.txt')

      add_files_to_ds_result(
        MiGA::Result.new("#{base}.json"), name,
        mytaxa: '.mytaxa',
        blast: '.blast',
        mytaxain: '.mytaxain',
        nomytaxa: '.nomytaxa.txt',
        species: '.mytaxa.Species.txt',
        genus: '.mytaxa.Genus.txt',
        phylum: '.mytaxa.Phylum.txt',
        innominate: '.mytaxa.innominate',
        kronain: '.mytaxa.krona',
        krona: '.html'
      )
    else
      MiGA::Result.new("#{base}.json")
    end
  end

  ##
  # Add result type +:mytaxa_scan+ at +base+ (no +_opts+ supported)
  def add_result_mytaxa_scan(base, _opts)
    if nonmulti?
      return nil unless
        result_files_exist?(base, %w[.pdf .mytaxa]) ||
        result_files_exist?(base, '.nomytaxa.txt')

      add_files_to_ds_result(
        MiGA::Result.new("#{base}.json"), name,
        nomytaxa: '.nomytaxa.txt',
        mytaxa: '.mytaxa',
        report: '.pdf',
        regions_archive: '.reg.tar',
        # Intermediate / Deprecated:
        blast: '.blast',
        mytaxain: '.mytaxain',
        wintax: '.wintax',
        gene_ids: '.wintax.genes',
        region_ids: '.wintax.regions',
        regions: '.reg'
      )
    else
      MiGA::Result.new("#{base}.json")
    end
  end

  ##
  # Add result type +:distances+ at +base+ (no +_opts+ supported)
  def add_result_distances(base, _opts)
    if nonmulti?
      if ref?
        add_result_distances_ref(base)
      else
        add_result_distances_nonref(base)
      end
    else
      add_result_distances_multi(base)
    end
  end

  ##
  # Add result type +:taxonomy+ at +base+ (no +_opts+ supported)
  def add_result_taxonomy(base, _opts)
    add_result_distances_nonref(base)
  end

  ##
  # Add result type +:stats+ at +base+ (no +_opts+ supported)
  def add_result_stats(base, _opts)
    MiGA::Result.new("#{base}.json")
  end

  private

  ##
  # Add result type +:distances+ for _multi_ datasets at +base+
  def add_result_distances_multi(base)
    MiGA::Result.new("#{base}.json")
  end

  ##
  # Add result type +:distances+ for _nonmulti_ reference datasets at +base+
  def add_result_distances_ref(base)
    pref = File.dirname(base)
    return nil unless File.exist?("#{pref}/01.haai/#{name}.db")

    MiGA::Result.new("#{base}.json").tap do |r|
      r.add_files(
        haai_db: "01.haai/#{name}.db",
        aai_db: "02.aai/#{name}.db",
        ani_db: "03.ani/#{name}.db"
      )
    end
  end

  ##
  # Add result type +:distances+ for _nonmulti_ query datasets at +base+
  def add_result_distances_nonref(base)
    return nil unless
      result_files_exist?(base, %w[.aai-medoids.tsv .aai.db]) ||
      result_files_exist?(base, %w[.ani-medoids.tsv .ani.db])

    add_files_to_ds_result(
      MiGA::Result.new("#{base}.json"), name,
      aai_medoids: '.aai-medoids.tsv',
      haai_db: '.haai.db',
      aai_db: '.aai.db',
      ani_medoids: '.ani-medoids.tsv',
      ani_db: '.ani.db',
      ref_tree: '.nwk',
      ref_tree_pdf: '.nwk.pdf',
      intax_test: '.intax.txt'
    )
  end

  ##
  # Add files in +rel_files+ Hash to the result +r+ with dataset name +name+
  def add_files_to_ds_result(r, name, rel_files)
    files = {}
    rel_files.each { |k, v| files[k] = name + v }
    r.add_files(files)
    r
  end
end
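Every +add_result_*+ method above funnels through +add_files_to_ds_result+, which expands each relative suffix into a dataset-prefixed file name before registering it on the result. The standalone Ruby sketch below reproduces only that expansion pattern under stated assumptions: +FakeResult+ and the sample dataset name are invented stand-ins, not MiGA's actual +MiGA::Result+ API.

# Minimal sketch of the suffix-expansion pattern used by
# add_files_to_ds_result. FakeResult is a hypothetical stand-in
# for illustration only; it is not part of miga-base.
class FakeResult
  attr_reader :files

  def initialize
    @files = {}
  end

  # Record a hash of file-key => file-name pairs
  def add_files(hash)
    @files.merge!(hash)
  end
end

# Expand relative suffixes into dataset-prefixed file names
def add_files_to_ds_result(r, name, rel_files)
  files = {}
  rel_files.each { |k, v| files[k] = name + v }
  r.add_files(files)
  r
end

r = add_files_to_ds_result(
  FakeResult.new, 'dataset1',
  pair1: '.1.fastq', pair2: '.2.fastq'
)
p r.files # prints the expanded map, e.g. :pair1 => "dataset1.1.fastq"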
data/lib/miga/dataset/result/ignore.rb
@@ -0,0 +1,93 @@

# frozen_string_literal: true

module MiGA::Dataset::Result::Ignore
  ##
  # Should I ignore +task+ for this dataset?
  def ignore_task?(task)
    why_ignore(task) != :execute
  end

  ##
  # Returns an array of symbols indicating all the possible reasons why a
  # given task might be ignored:
  # - empty: the dataset has no data
  # - inactive: the dataset is inactive
  # - upstream: the task is upstream from dataset's input
  # - force: forced to ignore by metadata
  # - project: incompatible project
  # - noref: incompatible dataset, only for reference
  # - multi: incompatible dataset, only for multi
  # - nonmulti: incompatible dataset, only for nonmulti
  # - complete: the task is already complete
  def ignore_reasons
    %i[empty inactive upstream force project noref multi nonmulti complete]
  end

  ##
  # Return a code explaining why a task is ignored (see +ignore_reasons+) or
  # the symbol +:execute+ (do not ignore, execute the task)
  def why_ignore(task)
    # Find a reason to ignore it
    ignore_reasons.each do |i|
      return i if send(:"ignore_#{i}?", task)
    end

    # Otherwise, execute
    return :execute
  end

  ##
  # Ignore +task+ because it's already done
  def ignore_complete?(task)
    !get_result(task).nil?
  end

  ##
  # Ignore any task because the dataset is inactive (+_task+ is ignored)
  def ignore_inactive?(_task)
    !active?
  end

  ##
  # Ignore any task because the dataset is empty (+_task+ is ignored)
  def ignore_empty?(_task)
    first_preprocessing.nil?
  end

  ##
  # Ignore +task+ because it's upstream from the entry point
  def ignore_upstream?(task)
    self.class.PREPROCESSING_TASKS.index(task) <
      self.class.PREPROCESSING_TASKS.index(first_preprocessing)
  end

  ##
  # Ignore +task+ because the metadata says so
  def ignore_force?(task)
    !(metadata["run_#{task}"].nil? || metadata["run_#{task}"])
  end

  ##
  # Ignore +task+ because the project is not compatible
  def ignore_project?(task)
    task == :taxonomy && project.option(:ref_project).nil?
  end

  ##
  # Ignore +task+ because it's not a reference dataset
  def ignore_noref?(task)
    self.class.EXCLUDE_NOREF_TASKS.include?(task) && !ref?
  end

  ##
  # Ignore +task+ because it's not a multi dataset
  def ignore_multi?(task)
    self.class.ONLY_MULTI_TASKS.include?(task) && !multi?
  end

  ##
  # Ignore +task+ because it's not a nonmulti dataset
  def ignore_nonmulti?(task)
    self.class.ONLY_NONMULTI_TASKS.include?(task) && !nonmulti?
  end
end
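In the module above, +why_ignore+ resolves each reason symbol to an +ignore_<reason>?+ predicate via +send+ and returns the first reason that applies, falling back to +:execute+. The standalone sketch below reproduces only that dispatch pattern; +TinyDataset+, its predicates, and the sample task names are invented for illustration and are not part of miga-base.

# Minimal sketch of the reason-dispatch pattern behind why_ignore.
# TinyDataset is a hypothetical stand-in with two reasons only.
class TinyDataset
  def initialize(active:, done:)
    @active = active # whether the dataset is active
    @done = done     # tasks that already have results
  end

  # Reasons checked in order of precedence
  def ignore_reasons
    %i[inactive complete]
  end

  def ignore_inactive?(_task)
    !@active
  end

  def ignore_complete?(task)
    @done.include?(task)
  end

  # First matching ignore_<reason>? wins; otherwise execute
  def why_ignore(task)
    ignore_reasons.each { |i| return i if send(:"ignore_#{i}?", task) }
    :execute
  end

  def ignore_task?(task)
    why_ignore(task) != :execute
  end
end

ds = TinyDataset.new(active: true, done: [:trimmed_reads])
puts ds.why_ignore(:trimmed_reads) # => complete
puts ds.why_ignore(:assembly)      # => execute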