RubyGems - bioroebe - Versions diffs - 0.10.80 → 0.11.32 - Mend

bioroebe 0.10.80 → 0.11.32

Potentially problematic release.

This version of bioroebe might be problematic. Click here for more details.

Files changed (210) hide show

data/lib/bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb CHANGED Viewed

@@ -2,15 +2,1468 @@
 # Encoding: UTF-8
 # frozen_string_literal: true
 # =========================================================================== #
+# === Bioroebe::ParseFasta
+#
+# This class will parse through a local FASTA file and find the
+# proper entries.
+#
+# A FASTA file may have nucleotides or an aminoacid-sequence, so
+# we have to keep this in mind when parsing it.
+#
+# Usage examples:
+#
+#   Bioroebe::ParseFasta.new(ARGV)
+#   Bioroebe.parse_fasta(ARGV)
+#
+# =========================================================================== #
 # require 'bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb'
-# Bioroebe::ParseFasta.new(ARGV)
+# Bioroebe.parse_fasta
+# Bioroebe.sizeseq
 # =========================================================================== #
 require 'bioroebe/base/commandline_application/commandline_application.rb'
-require 'bioroebe/fasta_and_fastq/parse_fasta/constants.rb'
-require 'bioroebe/fasta_and_fastq/parse_fasta/initialize.rb'
-require 'bioroebe/fasta_and_fastq/parse_fasta/misc.rb'
-require 'bioroebe/fasta_and_fastq/parse_fasta/reset.rb'
-require 'bioroebe/fasta_and_fastq/parse_fasta/run.rb'
+module Bioroebe
+class ParseFasta < ::Bioroebe::CommandlineApplication # === Bioroebe::ParseFasta
+  require 'bioroebe/sequence/dna.rb'
+  # ========================================================================= #
+  # === REGEX_NON_NUCLEOTIDES
+  #
+  # All non-nucleotides will be handled here via this regex.
+  #
+  # N is excluded because it may stand for "any" nucleotide too, at
+  # the least for a purine.
+  # ========================================================================= #
+  # NOTE: the letters have to live inside a character class ([...]). Without
+  # the brackets the regex only matches the literal 20-letter String
+  # "BDEFHIJKLMOPQRSVWXYZ", rather than any single non-nucleotide letter.
+  REGEX_NON_NUCLEOTIDES =
+    /[BDEFHIJKLMOPQRSVWXYZ]/
+  # ========================================================================= #
+  # === DEFAULT_FASTA
+  #
+  # This String can be used to quickly test code depending on FASTA
+  # entries.
+  # ========================================================================= #
+  DEFAULT_FASTA = '>Rosalind_6404
+CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
+TCCCACTAATAATTCTGAGG
+>Rosalind_5959
+CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
+ATATCCATTTGTCAGCAGACACGC
+>Rosalind_0808
+CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
+TGGGAACCTGCGGGCAGTAGGTGGAAT'
+  # ========================================================================= #
+  # === DEFAULT_ROUND_TO
+  # ========================================================================= #
+  DEFAULT_ROUND_TO = 2
+  # ========================================================================= #
+  # === initialize
+  # ========================================================================= #
+  def initialize(
+      i           = DEFAULT_FASTA,
+      run_already = true,
+      &block
+    )
+    reset
+    # ======================================================================= #
+    # === Handle blocks next
+    # ======================================================================= #
+    if block_given?
+      yielded = yield
+      # ===================================================================== #
+      # First handle Symbols.
+      # ===================================================================== #
+      case yielded
+      # ===================================================================== #
+      # === :be_verbose
+      # ===================================================================== #
+      when :be_verbose,
+           :verbose
+        set_be_verbose_and_report_the_sequence
+      # ===================================================================== #
+      # === :be_quiet
+      # ===================================================================== #
+      when :be_quiet,
+           :be_silent
+        be_quiet
+      # ===================================================================== #
+      # === :sizeseq
+      # ===================================================================== #
+      when :sizeseq
+        @sort_by_size = true
+      end
+      # ===================================================================== #
+      # === Handle Hashes next
+      # ===================================================================== #
+      if yielded.is_a? Hash
+        # =================================================================== #
+        # === :be_verbose
+        # =================================================================== #
+        if yielded.has_key? :be_verbose
+          set_be_verbose(yielded.delete(:be_verbose))
+          @internal_hash[:report_the_sequence] = true
+        end
+        # =================================================================== #
+        # === :use_colours
+        # =================================================================== #
+        if yielded.has_key? :use_colours
+          set_use_colours(
+            yielded.delete(:use_colours)
+          )
+        end
+        # =================================================================== #
+        # === :sizeseq
+        # =================================================================== #
+        if yielded.has_key? :sizeseq
+          @sort_by_size = true
+        end
+      end
+    end
+    set_commandline_arguments(i)
+    case run_already
+    # ======================================================================= #
+    # === :dont_run_yet
+    # ======================================================================= #
+    when :dont_run_yet,
+         :do_not_run_yet
+      run_already = false
+    end
+    run if run_already
+  end
+  # ========================================================================= #
+  # === reset                                                     (reset tag)
+  # ========================================================================= #
+  def reset
+    super()
+    infer_the_namespace
+    # ======================================================================= #
+    # === @is_a_genbank_file
+    # ======================================================================= #
+    @is_a_genbank_file = false
+    # ======================================================================= #
+    # === @input_file
+    #
+    # This variable denotes which input file is used to read data from.
+    #
+    # It is nil initially because we may skip reading from an existing
+    # file and e. g. only read from a String or some other non-file
+    # entity.
+    # ======================================================================= #
+    @input_file = nil
+    # ======================================================================= #
+    # === @hash
+    #
+    # This is the main variable for the class. It will keep entries such
+    # as this one here:
+    #
+    #   {
+    #     "ENSMUSG00000020122|ENSMUST08" => "CCCTCC"
+    #   }
+    #
+    # ======================================================================= #
+    @hash = {}
+    # ======================================================================= #
+    # === @internal_hash
+    #
+    # This Hash exists for internal configuration of the class.
+    # ======================================================================= #
+    @internal_hash = {}
+    # ======================================================================= #
+    # === :report_the_sequence
+    # ======================================================================= #
+    @internal_hash[:report_the_sequence] = false
+    # ======================================================================= #
+    # === :overwrite_the_original_file
+    # ======================================================================= #
+    @internal_hash[:overwrite_the_original_file] = false
+    # ======================================================================= #
+    # === :save_the_file
+    # ======================================================================= #
+    @internal_hash[:save_the_file] = false
+    # ======================================================================= #
+    # === :remove_numbers_from_input
+    # ======================================================================= #
+    @internal_hash[:remove_numbers_from_input] = false
+    # ======================================================================= #
+    # === :show_the_translated_protein_sequence
+    #
+    # This setting is false initially. If set to true via the commandline
+    # then report() will show the translated protein sequence as well.
+    # ======================================================================= #
+    @internal_hash[:show_the_translated_protein_sequence] = false
+    # ======================================================================= #
+    # === :condense_the_sequence_onto_a_single_line
+    #
+    # By default the output of this class will include newlines for the
+    # sequence. If this is not wanted by the user then the following
+    # variable keeps track of that behaviour. You can use the flag
+    # called --one-line to enable a condensed output, with newlines
+    # being removed.
+    # ======================================================================= #
+    @internal_hash[:condense_the_sequence_onto_a_single_line] = false
+    # ======================================================================= #
+    # === :limit_the_display_to_n_nucleotides
+    #
+    # If this variable is a number rather than nil, then it will be used
+    # to display only a limited number of nucleotides, e. g. "1000" if
+    # the user passes in 1000.
+    # ======================================================================= #
+    @internal_hash[:limit_the_display_to_n_nucleotides] = nil
+    # ======================================================================= #
+    # === @may_we_exit
+    # ======================================================================= #
+    @may_we_exit = false
+    # ======================================================================= #
+    # === @current_key
+    # ======================================================================= #
+    @current_key = nil
+    # ======================================================================= #
+    # === @use_opn
+    # ======================================================================= #
+    @use_opn = ::Bioroebe.use_opn?
+    # ======================================================================= #
+    # === @colourize_sequence
+    # ======================================================================= #
+    @colourize_sequence = false
+    # ======================================================================= #
+    # === @sort_by_size
+    #
+    # If the following variable is set to true, then this class will
+    # run a sizeseq-comparison, that is, it will compare all sequences
+    # and output them in a size-sorted manner, similar to the EMBOSS
+    # sizeseq action.
+    # ======================================================================= #
+    @sort_by_size = false
+    # ======================================================================= #
+    # === @show_the_header
+    #
+    # If this variable is true then the header will be shown.
+    # ======================================================================= #
+    @show_the_header = false
+    set_round_to :default
+    set_be_verbose
+  end
+  # ========================================================================= #
+  # === menu                                                       (menu tag)
+  # ========================================================================= #
+  def menu(
+      i = return_commandline_arguments_that_are_not_files
+    )
+    # Evaluate the commandline flags. An Array is processed entry by entry
+    # (recursively); any other input is matched against the regexes below
+    # and the first matching "when" clause wins. Input that matches none
+    # of the clauses is silently ignored.
+    if i.is_a? Array
+      i.each {|entry| menu(entry) }
+    else
+      case i # case tag
+      # ===================================================================== #
+      # === --to-protein
+      #
+      # Invocation example:
+      #
+      #   pfasta *.fasta --toprotein
+      #
+      # ===================================================================== #
+      when /^-?-?to(-|_)?protein/i
+        @internal_hash[:show_the_translated_protein_sequence] = true
+      # ===================================================================== #
+      # === --one-line
+      #
+      # Invocation example:
+      #
+      #   pfasta rpoS_NC_000913.3.fasta --one-line
+      #
+      # ===================================================================== #
+      when /^-?-?one(-|_)?liner?/i
+        @internal_hash[:condense_the_sequence_onto_a_single_line] = true
+      # ===================================================================== #
+      # === --limit=1000
+      #
+      # Invocation example:
+      #
+      #   pfasta --limit=1000
+      #
+      # ===================================================================== #
+      when /^-?-?limit=(\d+)$/i
+        # $1 holds the digits captured by the regex above.
+        @internal_hash[:limit_the_display_to_n_nucleotides] = $1.to_s.dup.to_i
+      # ===================================================================== #
+      # === --overwrite
+      # ===================================================================== #
+      when /^-?-?overwrite/i
+        @internal_hash[:overwrite_the_original_file] = true
+      # ===================================================================== #
+      # === --help
+      #
+      # Usage example:
+      #
+      #   parse_fasta --help
+      #
+      # ===================================================================== #
+      when /^-?-?help/i
+        show_help
+        exit
+      # ===================================================================== #
+      # === --save-file
+      # ===================================================================== #
+      when /^-?-?save(-|_)?file/i
+        @internal_hash[:save_the_file] = true
+      # ===================================================================== #
+      # === --also-show-the-sequence
+      #
+      # To invoke this method try:
+      #
+      #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --show
+      #
+      # ===================================================================== #
+      when /^-?-?also(-|_)?show(-|_)?the(-|_)?sequence$/i,
+           /^-?-?report$/i,
+           /^-?-?show$/i
+        @internal_hash[:report_the_sequence] = true
+      # ===================================================================== #
+      # === --header
+      # ===================================================================== #
+      when /^-?-?header/i
+        do_show_the_header
+      # ===================================================================== #
+      # === --short
+      #
+      # This entry point can be used to show 300 nucleotides and not
+      # more, by simply using the --short commandline flag.
+      # ===================================================================== #
+      when /^-?-?short/i
+        @internal_hash[:limit_the_display_to_n_nucleotides] = 300
+      # ===================================================================== #
+      # === --size
+      #
+      # This will simply tell us how many nucleotides the given sequence
+      # has, then exit.
+      #
+      # To invoke this method try:
+      #
+      #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --size
+      #
+      # ===================================================================== #
+      when /^-?-?size$/i
+        # Be quiet first, so that processing the files does not emit the
+        # regular verbose report before the size is shown.
+        set_be_quiet
+        do_process_the_commandline_arguments_that_are_files
+        erev size? # Report the size here.
+        exit
+      end
+    end
+  end
+  require 'bioroebe/calculate/calculate_gc_content.rb'
+  # ========================================================================= #
+  # === show_help                                                  (help tag)
+  #
+  # This method will inform the user how this class may be used from the
+  # commandline.
+  #
+  # Invocation example:
+  #
+  #   pfasta --help
+  #
+  # ========================================================================= #
+  def show_help
+    e
+    eparse '  --size'
+    eparse '  --also-show-the-sequence'
+    eparse '  --header     # show the header as well (normally the '\
+           'header is not shown)'
+    eparse '  --limit=1000 # limit to show only the first 1000 '\
+           'nucleotides; use'
+    eparse '               # any number that you need here'
+    eparse '  --one-line   # show the sequence on one line only, '\
+           'e. g. all newlines'
+    eparse '               # were removed'
+    eparse '  --toprotein  # show the protein sequence as well '\
+           '(assumes DNA or RNA'
+    eparse '               # .fasta file)'
+    e
+  end
+  # ========================================================================= #
+  # === show_the_translated_protein_sequence?
+  # ========================================================================= #
+  def show_the_translated_protein_sequence?
+    @internal_hash[:show_the_translated_protein_sequence]
+  end
+  # ========================================================================= #
+  # === set_round_to
+  #
+  # This will set to how many decimal numbers we will round to. This is
+  # mostly done for display-purposes, hence why the default is a fairly
+  # low value.
+  # ========================================================================= #
+  def set_round_to(
+      i = :default
+    )
+    case i
+    # ======================================================================= #
+    # === :default
+    #
+    # Since as of April 2021, the new default is 2, for rounding.
+    # ======================================================================= #
+    when :default
+      i = DEFAULT_ROUND_TO
+    end
+    @internal_hash[:round_to] = i.to_i
+  end
+  # ========================================================================= #
+  # === do_process_the_commandline_arguments_that_are_files
+  # ========================================================================= #
+  def do_process_the_commandline_arguments_that_are_files(
+      these_files = commandline_arguments_that_are_files?
+    )
+    # Process every given file in turn: register it as the input file,
+    # load and split its data, then either run the sizeseq comparison or
+    # report on the sequence (only if we are verbose).
+    #
+    # these_files - an Array of files, or a single entry which is then
+    #               wrapped into an Array (nil entries are dropped).
+    unless these_files.is_a? Array
+      these_files = [these_files].flatten.compact
+    end
+    these_files.each {|this_file|
+      set_input_file(this_file)
+      set_data # This will use the default file.
+      split_into_proper_sections
+      report_the_FASTA_header if @show_the_header
+      if @sort_by_size
+        run_sizeseq_comparison
+      else
+        # =================================================================== #
+        # === Handle cases where the input is a protein
+        # =================================================================== #
+        if is_the_sequence_a_polypeptide?
+          if be_verbose?
+            erev "This sequence is assumed to be a #{royalblue('protein')}#{rev}."
+            report_how_many_elements_we_have_found
+          end
+        else # Not a polypeptide, so it is treated as a nucleotide sequence.
+          # ================================================================= #
+          # === Else it must be RNA or DNA
+          # ================================================================= #
+          if be_verbose?
+            erev "This sequence is assumed to "\
+                 "be #{royalblue('DNA')}#{rev} or #{royalblue('RNA')}#{rev}."
+          end
+          calculate_gc_content # GC content makes only sense for nucleotides.
+          report_how_many_elements_we_have_found if be_verbose?
+        end
+        if be_verbose?
+          report_the_nucleotide_composition
+          report_on_how_many_entries_we_did_work
+          if report_the_sequence?
+            do_report_the_sequence
+          end
+        end
+      end
+    }
+  end
+  # ========================================================================= #
+  # === sanitize_the_description
+  #
+  # This method will iterate over the description entry and sanitize
+  # it. In this context sanitizing means to add the "length" entry,
+  # and the "type" entry, such as in:
+  #
+  #   " # length=231; type=dna"
+  #
+  # ========================================================================= #
+  def sanitize_the_description
+    @data.map! {|line|
+      if line.start_with?('>') and !line.include?('length=')
+        length = 0
+        if @hash.has_key? line.delete('>')
+          length = @hash[line.delete('>')].size
+        end
+        line << " # length=#{length}; type=dna" # Currently hardcoded to DNA.
+      end
+      line
+    }
+  end
+  # ========================================================================= #
+  # === entries?
+  # ========================================================================= #
+  def entries?
+    @data
+  end
+  # ========================================================================= #
+  # === we_may_exit
+  # ========================================================================= #
+  def we_may_exit
+    @may_we_exit = true
+  end
+  # ========================================================================= #
+  # === output_results
+  # ========================================================================= #
+  def output_results
+    pp @hash
+  end
+  # ========================================================================= #
+  # === do_report_the_sequence                                   (report tag)
+  #
+  # This method is used to display the main sequence at hand.
+  # ========================================================================= #
+  def do_report_the_sequence
+    _ = main_sequence?
+    # ======================================================================= #
+    # Honour the --limit commandline flag next.
+    # ======================================================================= #
+    if @internal_hash[:limit_the_display_to_n_nucleotides]
+      _ = _[0 .. (@internal_hash[:limit_the_display_to_n_nucleotides] - 1)]
+    end
+    if @colourize_sequence
+      if is_polynucleotide?
+        # =================================================================== #
+        # Else assume this is DNA/RNA input.
+        # =================================================================== #
+        _.gsub!(/A/, teal('A')+rev)
+        _.gsub!(/C/, slateblue('C')+rev)
+        _.gsub!(/G/, royalblue('G')+rev)
+        _.gsub!(/T/, steelblue('T')+rev)
+        _.gsub!(/U/, steelblue('U')+rev)
+      #else
+      end
+    end
+    if condense_the_sequence_onto_a_single_line?
+      _ = _.delete("\n")
+    end
+    erev colourize_this_nucleotide_sequence(_)
+    e if condense_the_sequence_onto_a_single_line?
+    if show_the_translated_protein_sequence?
+      # ===================================================================== #
+      # Do show the translated protein sequence next:
+      # ===================================================================== #
+      translated_into_aa = Bioroebe.to_aa(_)
+      translated_into_aa_and_colourized = translated_into_aa.dup
+      if translated_into_aa.include? '*'
+        translated_into_aa_and_colourized = translated_into_aa.gsub(/\*/,tomato('*'))
+      end
+      erev 'The translated aminoacid sequence of '+
+           sfancy(translated_into_aa.size.to_s)+rev+
+           ' aminoacids is:'
+      e
+      erev steelblue("  #{translated_into_aa_and_colourized}")
+      e
+    end
+  end; alias display do_report_the_sequence # === display
+       alias report  do_report_the_sequence # === report
+  # ========================================================================= #
+  # === report_the_nucleotide_composition
+  # ========================================================================= #
+  def report_the_nucleotide_composition
+    if is_this_sequence_a_polynucleotide_sequence?
+      first = @hash.values.first.upcase
+      total_size   = first.size
+      n_adenines   = first.count('A')
+      n_thymidines = first.count('T')
+      n_cytodines  = first.count('C')
+      n_guanines   = first.count('G')
+      erev "The nucleotide composition is as follows:"
+      e "  "\
+        "#{steelblue(n_adenines)}#{rev}x A (#{(n_adenines * 100.0 / total_size).round(2)}%), "\
+        "#{steelblue(n_thymidines)}#{rev}x T (#{(n_thymidines * 100.0 / total_size).round(2)}%), "\
+        "#{steelblue(n_cytodines)}#{rev}x C (#{(n_cytodines * 100.0 / total_size).round(2)}%), "\
+        "#{steelblue(n_guanines)}#{rev}x G (#{(n_guanines * 100.0 / total_size).round(2)}%)"
+    elsif is_a_protein?
+      # ===================================================================== #
+      # Report the composition of the protein:
+      # ===================================================================== #
+      sequence = @hash.values.first.delete("\n")
+      erev "The protein composition (aminoacids) is as follows:"
+      # e colourize_this_aminoacid_sequence_for_the_commandline("  #{sequence}")
+      e orchid("  #{sequence}")
+    end
+  end; alias report_the_protein_composition report_the_nucleotide_composition # === report_the_protein_composition
+  # ========================================================================= #
+  # === report_how_many_elements_we_have_found
+  # ========================================================================= #
+  def report_how_many_elements_we_have_found
+    if @hash
+      first = @hash.values.first.delete("\n")
+      size = first.size.to_s
+      if be_verbose?
+        n_start_codons = first.count('ATG')
+        # =================================================================== #
+        # We upcase it since as of October 2021, as some FASTA files may
+        # include the sequence in lowercased characters.
+        # =================================================================== #
+        n_start_codons += first.reverse.upcase.count('ATG')
+        result = "This sequence contains #{simp(size.to_s)}#{rev}"\
+                 " #{nucleotides_or_aminoacids?}".dup
+        if is_a_nucleotide?
+          result << " and #{n_start_codons} "\
+                    "ATG codons (on both strands) in total"
+        end
+        result << '.'
+        if size.to_i > 1_000_000
+          # ================================================================= #
+          # Format the number with '_' characters.
+          # ================================================================= #
+          formatted = size.to_i.to_s.reverse.split(/(.{3})/).reject(&:empty?).join('_').reverse
+          result = result.dup if result.frozen?
+          result << ' ('+simp(formatted+' bp')+rev+')'
+        end
+        erev result
+      end
+    end
+  end
+  # ========================================================================= #
+  # === report_on_how_many_entries_we_did_work
+  # ========================================================================= #
+  def report_on_how_many_entries_we_did_work
+    if be_verbose?
+      entry_or_entries = 'entry'
+      if @hash.keys.size > 1
+        entry_or_entries = 'entries'
+      end
+      erev "We have identified a total of #{orange(@hash.keys.size)}"\
+           "#{rev} #{entry_or_entries} in this fasta dataset."
+      e
+    end
+  end
+  # ========================================================================= #
+  # === report_the_FASTA_header
+  # ========================================================================= #
+  def report_the_FASTA_header
+    e "#{rev}The header is: #{steelblue(header?)}"
+  end
+  # ========================================================================= #
+  # === report_the_sequence?
+  # ========================================================================= #
+  def report_the_sequence?
+    @internal_hash[:report_the_sequence]
+  end
+  # ========================================================================= #
+  # === run                                                         (run tag)
+  # ========================================================================= #
+  def run
+    menu
+    do_process_the_commandline_arguments_that_are_files
+    do_save_the_file if save_the_file?
+  end
+  # ========================================================================= #
+  # === sanitize_data
+  # ========================================================================= #
+  def sanitize_data(i)
+    if i.is_a? Array
+      i.flatten!
+      i.reject! {|entry| entry.start_with? '#' }
+      i.reject! {|entry| entry.strip.empty? }
+      if i.first and i.first.include? "\r"
+        # =================================================================== #
+        # Some FASTA files include "\r" line endings. We will check first
+        # for the first entry to contain a \r, and if so, we assume the
+        # whole FASTA file may have \r, which then will be removed.
+        # =================================================================== #
+        i.map! {|entry| entry.delete("\r") }
+      end
+    end
+    # ========================================================================= #
+    # === Run through SanitizeNucleotideSequence
+    # ========================================================================= #
+    if @internal_hash[:remove_numbers_from_input]
+      i = Bioroebe::SanitizeNucleotideSequence[i]
+    end
+    i
+  end
+  # ========================================================================= #
+  # === current_key?
+  # ========================================================================= #
+  def current_key?
+    @current_key
+  end; alias id?          current_key? # === id?
+       alias sequence_id? current_key? # === sequence_id?
+       alias title        current_key? # === title
+       alias title?       current_key? # === title?
+  # ========================================================================= #
+  # === round_to?
+  # ========================================================================= #
+  def round_to?
+    @internal_hash[:round_to]
+  end
+  # ========================================================================= #
+  # === opnn
+  # ========================================================================= #
+  def opnn
+    super(namespace?) if use_opn?
+  end
+  # ========================================================================= #
+  # === use_opn?
+  # ========================================================================= #
+  def use_opn?
+    @use_opn
+  end
+  # ========================================================================= #
+  # === calculate_gc_content
+  #
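+  # Calculate the GC content of every entry stored in @hash. This method
+  # is called from do_process_the_commandline_arguments_that_are_files()
+  # when the input is not assumed to be a polypeptide.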
+  # ========================================================================= #
+  def calculate_gc_content
+    _ = @hash.values.join.delete(N)
+    if is_polynucleotide? _
+      @hash.each_pair {|key, content|
+        # =================================================================== #
+        # Delegate towards the method Bioroebe.gc_content next, including
+        # to round towards 5 positions:
+        # =================================================================== #
+        gc_content = ::Bioroebe.gc_content(content.upcase, round_to?)
+        gc_content = gc_content.first if gc_content.is_a? Array
+        gc_content = gc_content.to_s
+        minimal_key = key.to_s
+        if minimal_key.include? '|'
+          minimal_key = minimal_key.split('|').last.strip
+        end
+        if be_verbose?
+          _ = minimal_key.strip
+          if _.size > 40 # Shorten the content a bit if it is too long.
+            _ = _[0 .. 40]+' [...]'
+          end
+          erev 'GC content of "'+simp(_)+rev+'" is: '+
+               "#{sfancy(gc_content)}#{rev} %"
+        end
+      }
+    else
+      erev '`'+simp(_)+rev+'` is not a polynucleotide.' if be_verbose?
+    end
+  end
+  # ========================================================================= #
+  # === first_value
+  #
+  # This will return the first entry of the Fasta files.
+  # ========================================================================= #
+  def first_value
+    sequences?.first
+  end
+  # ========================================================================= #
+  # === nucleotides_or_aminoacids?
+  # ========================================================================= #
+  def nucleotides_or_aminoacids?
+    if is_polynucleotide?
+      'nucleotides'
+    else
+      'aminoacids'
+    end
+  end
+  # ========================================================================= #
+  # === is_polynucleotide?
+  # ========================================================================= #
+  def is_polynucleotide?(i = main_sequence?)
+    !is_protein?(i)
+  end; alias is_a_nucleotide? is_polynucleotide? # === is_a_nucleotide?
+  # ========================================================================= #
+  # === is_this_sequence_a_polynucleotide_sequence?
+  # ========================================================================= #
+  def is_this_sequence_a_polynucleotide_sequence?
+    !is_protein?
+  end
+  # ========================================================================= #
+  # === data?
+  #
+  # This will contain the full content of the (whole) .fasta file, including
+  # the header.
+  # ========================================================================= #
+  def data?
+    @data
+  end; alias input?   data? # === input?
+       alias dataset? data? # === dataset?
+  # ========================================================================= #
+  # === hash?
+  # ========================================================================= #
+  def hash?
+    @hash
+  end
+  # ========================================================================= #
+  # === sequences?
+  #
+  # This method will obtain all found sequences.
+  # ========================================================================= #
+  def sequences?
+    @hash.values
+  end; alias sequences sequences? # === sequences
+       alias values    sequences? # === values
+  # ========================================================================= #
+  # === short_headers?
+  #
+  # The short-headers are like the headers, but if a ' ' token is found
+  # then the line will be truncated towards that first ' '.
+  #
+  # An example is:
+  #
+  #   sp|Q91FT8|234R_IIV6 Uncharacterized protein 234R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-234R PE=4 SV=1
+  #
+  # This will be truncated towards
+  #
+  #   sp|Q91FT8|234R_IIV6
+  #
+  # This could then be used to automatically rename FASTA files, for
+  # instance.
+  # ========================================================================= #
+  def short_headers?
+    headers?.map {|entry|
+      if entry.include? ' '
+        entry = entry.split(' ').first
+      end
+      entry
+    }
+  end
+  # ========================================================================= #
+  # === set_data
+  #
+  # This is the setter-method towards @data. Since 12.06.2020 this method
+  # no longer invokes set_input_file() on its own, so set_input_file() has
+  # to be called before this method if the data shall come from a file.
+  # ========================================================================= #
+  def set_data(i = @input_file)
+    # ======================================================================= #
+    # The next line attempts to ensure that even an Array can be used
+    # as input to that method.
+    # ======================================================================= #
+    i = [i].flatten.compact.first.to_s.dup
+    if File.exist? i.to_s # First try to read in from a file.
+      if be_verbose?
+        opnn; erev "Will read from the file `#{sfile(i)}#{rev}`."
+      end
+      i = File.readlines(i)
+      if @is_a_genbank_file
+        selected = i.select {|line|
+          line.start_with?('       ') and # such as: "       61 atggggcctg caatggggcc tgcaatgggg cctgca\n"
+          (line.strip =~ /\d+/)
+        }.map {|inner_line|
+          inner_line.strip.delete(' 0123456789').strip.upcase
+        }
+        i = ["> genbank file"]+selected
+      end
+    end
+    if i.nil? or i.empty?
+      i = DEFAULT_FASTA
+      opnn; erev 'No input was provided. Thus a default FASTA '\
+                 'sequence will be used instead.'
+    end
+    i = sanitize_data(i)
+    i = i.split(N) if i.is_a? String
+    @data = i
+  end; alias set_sequence set_data # === set_Sequence
+  # ========================================================================= #
+  # === set_be_verbose_and_report_the_sequence
+  # ========================================================================= #
+  def set_be_verbose_and_report_the_sequence
+    # Calls set_be_verbose and additionally sets the :report_the_sequence
+    # flag in @internal_hash to true.
+    set_be_verbose
+    @internal_hash[:report_the_sequence] = true
+  end
+  # ========================================================================= #
+  # === condense_the_sequence_onto_a_single_line?
+  # ========================================================================= #
+  def condense_the_sequence_onto_a_single_line?
+    @internal_hash[:condense_the_sequence_onto_a_single_line]
+  end
+  # ========================================================================= #
+  # === return_size_sorted_hash
+  # ========================================================================= #
+  def return_size_sorted_hash(i = @hash)
+    _ = i.sort_by {|key, value| value.size }
+    i = Hash[_]
+    return i
+  end
+  # ========================================================================= #
+  # === do_sort_by_size
+  #
+  # This method will sort the hash by size of the sequence. It has been
+  # inspired by the EMBOSS sizeseq functionality.
+  #
+  # The output that should be generated might look like this:
+  #
+  #   https://www.bioinformatics.nl/cgi-bin/emboss/help/sizeseq#input.1
+  #
+  # Invocation example:
+  #
+  #   x = Bioroebe::ParseFasta.new('/Depot/j/globins.fasta'); x.do_sort_by_size
+  #
+  # ========================================================================= #
+  def do_sort_by_size
+    # ======================================================================= #
+    # Sort it here first, by the size of the "value", aka the sequence body.
+    # ======================================================================= #
+    @hash = return_size_sorted_hash(@hash)
+    _ = ''.dup
+    @hash.each_pair {|key, sequence|
+      _ << '> ID '+sequence.size.to_s+' AA.; DE: '+key.to_s+
+           ' SQ '+sequence.size.to_s+' AA'+N # ; unknown MW as of yet; '\
+           #'unknown CRC64 as of yet'+N
+      _ << sequence+N+N
+    }
+    e _
+  end; alias run_sizeseq_comparison do_sort_by_size # === run_sizeseq_comparison
+  # ========================================================================= #
+  # === n_nucleotides?
+  # ========================================================================= #
+  def n_nucleotides?
+    @hash.values.first.delete("\n").size
+  end; alias return_n_aminoacids n_nucleotides? # === return_n_aminoacids
+       alias size?               n_nucleotides? # === size?
+       alias sequence_size?      n_nucleotides? # === sequence_size?
+  # ========================================================================= #
+  # === headers?
+  # ========================================================================= #
+  def headers?
+    @hash.keys
+  end
+  # ========================================================================= #
+  # === first_key?
+  #
+  # Obtain the very first entry.
+  # ========================================================================= #
+  def first_key?
+    headers?.first
+  end
+  # ========================================================================= #
+  # === header?
+  #
+  # This variant will always return the first entry.
+  # ========================================================================= #
+  def header?
+    headers?.first.to_s
+  end
+  # ========================================================================= #
+  # === raw_body?
+  # ========================================================================= #
+  def raw_body?
+    @hash.values.first
+  end
+  # ========================================================================= #
+  # === do_show_the_header
+  # ========================================================================= #
+  def do_show_the_header
+    # Switches the @show_the_header flag on.
+    @show_the_header = true
+  end
+  # ========================================================================= #
+  # === set_input_file
+  #
+  # This method will be used to keep track of the input-file, from
+  # which we will read the dataset.
+  # ========================================================================= #
+  def set_input_file(i = nil)
+    if i.nil?
+      # ===================================================================== #
+      # First, we try to find a .fasta or .fa file in the current
+      # directory. If we can find it, we will use that instead.
+      # ===================================================================== #
+      unless Dir['*.{fa,fasta}'].empty?
+        file = Dir['*.{fa,fasta}'].first
+        if be_verbose?
+          result = 'A '
+          if file.end_with? '.fasta'
+            result < 'FASTA '
+          end
+          result << 'file was found in this directory ('+sfile(file)+').'
+          opnn; erev result
+          opnn; erev 'We will use it.'
+        end
+        i = file
+      end
+      unless Dir['*.{fa,fasta}'].empty?
+        file = Dir['*.{fa,fasta}'].first
+        if be_verbose?
+          opnn; erev "We have found a file in this "\
+                     "directory (#{sfile(file)}#{rev})."
+          opnn; erev 'We will use it.'
+        end
+        i = file
+      end
+    end
+    if i and File.exist?(i)
+      dataset = File.read(i)
+      if dataset[0 .. ('LOCUS'.size - 1)] == 'LOCUS'
+        @is_a_genbank_file = true
+      end
+    end
+    @input_file = i
+  end; alias set_input_files set_input_file # === set_input_files
+  # ========================================================================= #
+  # === save_the_file?
+  # ========================================================================= #
+  def save_the_file?
+    @internal_hash[:save_the_file]
+  end
+  # ========================================================================= #
+  # === overwrite_the_original_file?
+  # ========================================================================= #
+  def overwrite_the_original_file?
+    @internal_hash[:overwrite_the_original_file]
+  end
+  # ========================================================================= #
+  # === split_into_proper_sections
+  #
+  # Split up into the fasta identifier, and the content.
+  # ========================================================================= #
+  def split_into_proper_sections
+    unless @data.to_s.include? '>'
+      erev 'No ">" character was found in this dataset.'
+      erev 'It is recommended to always have a > identifier '\
+           'for the'
+      erev 'FASTA format (such as in a .fasta or a .fa file).'
+    end if be_verbose? # Ok, the input data includes >. We can proceed.
+    @data.each { |line|
+      # ===================================================================== #
+      # === Handle the leading > FASTA identifier first
+      # ===================================================================== #
+      if line.start_with? '>' # leading identifier.
+        @current_key = line[1..-1].chomp # Select all but the first character.
+        @hash[@current_key] = ''.dup
+      else
+        line.delete!('_')
+        unless @current_key
+          @current_key = 'standard'
+          @hash[@current_key] = ''.dup
+        end
+        # =================================================================== #
+        # === Retain the newlines
+        #
+        # Here we may decide to get rid of newlines, but it is better to
+        # NOT remove the newlines - that way we can simply save the
+        # dataset again.
+        # @hash[@current_key] << no_newlines(line)
+        # =================================================================== #
+        @hash[@current_key] << line
+      end
+    }
+  end
+  # ========================================================================= #
+  # ===   save_into_a_fasta_file
+  # ========================================================================= #
+  def save_into_a_fasta_file(
+      be_verbose = be_verbose?
+    )
+    case be_verbose
+    when :be_verbose
+      be_verbose = true
+    end
+    if @data
+      what = @data.join("\n")
+      into = 'standard.fasta'
+      erev 'Saving into '+sfile(into)+rev+'.' if be_verbose
+      write_what_into(what, into)
+      return File.absolute_path(into) # And return the file we saved into.
+    else
+      opnn; erev 'No @data variable exists.'
+    end
+  end; alias do_save_the_file save_into_a_fasta_file # === do_save_the_file
+  # ========================================================================= #
+  # === add_length_information_to_the_header
+  # ========================================================================= #
+  def add_length_information_to_the_header
+    _ = header?.strip
+    _ << ' length='+sequence_size?.to_s+';'
+    # ======================================================================= #
+    # Next, designate where to store this file.
+    # ======================================================================= #
+    into = 'new_fasta_file.fasta'
+    if overwrite_the_original_file?
+      into = @input_file
+    end
+    what = ''.dup
+    what << "> "+_+"\n"
+    what << raw_body?
+    if what and into
+      erev 'Storing into `'+sfile(into)+rev+'`.'
+      write_what_into(what, into)
+    end
+  end
+  # ========================================================================= #
+  # === simplify_header
+  #
+  # This method can be called to simplify the header. It will save into
+  # a .fasta file at once.
+  # ========================================================================= #
+  def simplify_header
+    _ = header?
+    # ======================================================================= #
+    # Next, simplify the header. We must start with checking for [] first,
+    # because if there are any [] in the FASTA header then we can simplify
+    # stuff at once.
+    # ======================================================================= #
+    if _.include?('[') and _.include?(']')
+      _ = '> '+_.strip.scan(/\[.+\]/).flatten.first.delete('[]')+"\n"
+    elsif _.include? ','
+      _ = _[0 .. (_.index(',') - 1) ].strip
+    end
+    what = nil
+    # ======================================================================= #
+    # Next, designate where to store this file.
+    # ======================================================================= #
+    into = 'new_fasta_file.fasta'
+    if overwrite_the_original_file?
+      into = @input_file
+    end
+    if _.start_with? '>'
+      what = _
+    elsif _.include?('[') and _.include?(']') # For example: [Pan troglodytes]
+      # ===================================================================== #
+      # See rubular at:
+      #
+      #   https://rubular.com/r/aDjI0JwMOUlZzP
+      #
+      # ===================================================================== #
+      what = "> "+_.scan(/\[(.+)\]/).flatten.first.to_s+"\n".dup
+    elsif _.include? 'Human'
+      _scanned_result = _.scan(/(Human)/)
+      what = "> "+$1.to_s.dup+"\n".dup
+    else
+      erev "Unsure what to do: #{steelblue(_)}"
+    end
+    if what and into
+      what << raw_body?
+      erev 'Storing into `'+sfile(into)+rev+'`.'
+      write_what_into(what, into)
+    end
+  end
+  # ========================================================================= #
+  # === sequence
+  #
+  # This method will return the sequence, without any newlines. It is also
+  # called the "body" of a FASTA file.
+  # ========================================================================= #
+  def sequence
+    _ = @hash.values.first
+    _.chomp! if _ and _.end_with?(N)
+    return no_newlines(_)
+  end; alias fasta_sequence      sequence # === fasta_sequence
+       alias sequence?           sequence # === sequence?
+       alias body?               sequence # === body?
+       alias body                sequence # === body?
+       alias naseq               sequence # === naseq
+       alias nucleotide_sequence sequence # === nucleotide_sequence
+       alias return_sequence     sequence # === return_sequence
+       alias content?            sequence # === content?
+  # ========================================================================= #
+  # === save
+  #
+  # This method will save our FASTA file.
+  # ========================================================================= #
+  def save
+    if @input_file.nil?
+      erev "The generic file #{sfile('foobar.fasta')}#{rev} "\
+           "will be used."
+      set_input_file('foobar.fasta')
+    end
+    into = @input_file
+    what = @data.join("\n")
+    erev 'Storing into '+sfile(into)+rev+'.'
+    write_what_into(what, into)
+    return into
+  end
+  # ========================================================================= #
+  # === []
+  #
+  # This is a simpler query-interface for obtaining the DNA/RNA sequence
+  # of the FASTA file (or aminoacid sequence, if we have a protein at
+  # hand here).
+  #
+  # Using the method sequences? here, which in turn works on @hash, is
+  # ok because a Hash in ruby preserves the insertion order of its
+  # entries (since ruby 1.9).
+  # ========================================================================= #
+  def [](i)
+    sequences?[i]
+  end
+  # ========================================================================= #
+  # === Bioroebe::ParseFasta[]
+  # ========================================================================= #
+  def self.[](i)
+    _ = new(i)
+    _.sequences?
+  end
+  # ========================================================================= #
+  # === type?
+  # ========================================================================= #
+  def type?
+    if is_the_sequence_a_polypeptide?
+      :protein
+    elsif is_this_sequence_a_polynucleotide_sequence?
+      :dna_or_rna
+    else
+      :unknown
+    end
+  end
+  # ========================================================================= #
+  # === is_the_sequence_a_polypeptide?
+  #
+  # This method can be used to determine whether a given input sequence
+  # is a polypeptide (aka a protein) or whether it is not.
+  #
+  # If this sequence is a polypeptide then this method will return true.
+  # Otherwise false will be returned.
+  # ========================================================================= #
+  def is_the_sequence_a_polypeptide?(
+      i = main_sequence?
+    )
+    return_value = false # Set the default return value here.
+    # ======================================================================= #
+    # Look at the first 120 positions to determine whether this is a protein
+    # or a nucleotide sequence.
+    # ======================================================================= #
+    subsequence = i[0 .. 119] # Must deduct 1 at the end since Arrays in ruby start at 0.
+    # ======================================================================= #
+    # Build a frequency of the characters there.
+    # ======================================================================= #
+    hash = {}
+    hash.default = 0
+    subsequence.chars.each {|character|
+      hash[character] += 1
+    }
+    keys_to_check_for = %w(
+      B D E F H I J K L M O P Q R S V W X Y Z
+    )
+    values = hash.select {|key, value|
+      if keys_to_check_for.include? key
+        true
+      else
+        false
+      end
+    }.values.sum
+    if values > 0
+      return_value = true
+    end
+    return return_value
+  end; alias is_protein?   is_the_sequence_a_polypeptide? # === is_protein?
+       alias is_a_protein? is_the_sequence_a_polypeptide? # === is_a_protein?
+  # ========================================================================= #
+  # === main_sequence?
+  #
+  # This will always return the first entry.
+  # ========================================================================= #
+  def main_sequence?
+    @hash.values.first
+  end
+  # ========================================================================= #
+  # === gc_content?
+  # ========================================================================= #
+  def gc_content?
+    return ::Bioroebe.gc_content(main_sequence?).to_f # Must be a float.
+  end; alias gc_content gc_content? # === gc_content
+  # ========================================================================= #
+  # === sequence_object
+  #
+  # This method will return a Sequence object.
+  #
+  # Usage example:
+  #
+  #   x = Bioroebe.parse_fasta 'ls_orchid.fasta'
+  #   y = x.sequence_object # y is now an instance of Bioroebe::Sequence
+  #
+  # ========================================================================= #
+  def sequence_object
+    ::Bioroebe::Sequence.new(main_sequence?)
+  end
+end
+Fasta = ParseFasta # Fasta is a second constant that refers to the very same class as ParseFasta.
+# =========================================================================== #
+# === Bioroebe.parse_fasta_quietly
+#
+# As Bioroebe.parse_fasta(), but will work quietly.
+# =========================================================================== #
+def self.parse_fasta_quietly(
+    i, use_colours = true
+  )
+  ::Bioroebe.parse_fasta(i, use_colours) { :be_quiet }
+end
+# =========================================================================== #
+# === Bioroebe.return_fasta_entry_with_the_highest_gc_content
+#
+# The first argument should be a locally existing FASTA file that
+# contains different sequences.
+#
+# Usage example:
+#
+#   x = Bioroebe.return_fasta_entry_with_the_highest_gc_content('/rosalind_gc.txt')
+#
+# =========================================================================== #
+def self.return_fasta_entry_with_the_highest_gc_content(this_fasta_file)
+  if File.exist? this_fasta_file
+    dataset = File.read(this_fasta_file)
+    dataset = parse_fasta(dataset) { :be_quiet }
+    hash = dataset.hash?
+    hash.transform_values! {|this_value|
+      ::Bioroebe.gc_content(this_value).to_f
+    }
+    return hash.max_by {|key, value| value }
+  else
+    erev "No file exists at #{sfile(this_fasta_file)}#{rev}."
+  end
+end
+# =========================================================================== #
+# === Bioroebe.sizeseq
+#
+# This method will "size-sequence compare", typically on a .fasta file.
+# =========================================================================== #
+def self.sizeseq(i)
+  if i.is_a? Array
+    i = i.first
+  end
+  _ = Bioroebe.parse_fasta(i) { :be_quiet }
+  _.do_sort_by_size
+end
+# =========================================================================== #
+# === Bioroebe.return_sizeseq
+#
+# This is as Bioroebe.sizeseq(), but it will just return the result,
+# rather than output it.
+# =========================================================================== #
+def self.return_sizeseq(i)
+  if i.is_a? Array
+    i = i.first
+  end
+  _ = Bioroebe.parse_fasta(i) { :be_quiet }
+  hash = _.return_size_sorted_hash
+  result = ''.dup
+  hash.each_pair {|key, sequence|
+    result << '> ID '+sequence.size.to_s+' AA.; DE: '+key.to_s+
+         ' SQ '+sequence.size.to_s+' AA'+N
+    result << sequence+N+N
+  }
+  return result
+end
+# =========================================================================== #
+# === Bioroebe.genbank_to_fasta
+#
+# This method will convert from a genbank file, to a .fasta file.
+#
+# Invocation example:
+#
+#   Bioroebe.genbank_to_fasta('/home/x/DATA/PROGRAMMING_LANGUAGES/RUBY/src/bioroebe/lib/bioroebe/data/genbank/sample_file.genbank')
+#
+# =========================================================================== #
+def self.genbank_to_fasta(
+    this_file,
+    be_verbose = :be_verbose
+  )
+  case be_verbose
+  when :be_quiet
+    be_verbose = false
+  end
+  if this_file.is_a? Array
+    this_file = this_file.first
+  end
+  if File.exist? this_file
+    _ = Bioroebe::ParseFasta.new(this_file) { :be_quiet }
+  else
+    _ = Bioroebe::ParseFasta.new(:do_not_run_yet) { :be_quiet }
+    _.set_data # This will use the default file.
+    _.split_into_proper_sections
+  end
+  file_path = _.save_into_a_fasta_file(be_verbose)
+  return file_path
+end
+# =========================================================================== #
+# === Bioroebe.parse_fasta_file
+# =========================================================================== #
+def self.parse_fasta_file(
+    i           = ARGV,
+    use_colours = true
+  )
+  use_this_hash = {
+    use_colours: use_colours,
+    be_verbose:  false
+  }
+  ParseFasta.new(i) { use_this_hash }
+end; self.instance_eval { alias fasta_file parse_fasta_file } # === Bioroebe.fasta_file
+# =========================================================================== #
+# === Bioroebe.parse_fasta
+#
+# Easier reader-method for .fasta files.
+#
+# The second argument determines whether we will use colours or whether
+# we will not. The default is to use colours (use_colours = true) when
+# this particular class method is called.
+#
+# Invocation examples:
+#
+#   x = Bioroebe.parse_fasta('/rosalind_gc.txt')
+#   hash = Bioroebe.parse_fasta('/rosalind_gc.txt').hash?
+#
+# =========================================================================== #
+def self.parse_fasta(
+    i,
+    use_colours = true
+  )
+  use_this_hash = {
+    use_colours: use_colours
+  }
+  if block_given?
+    use_this_hash = {
+      use_colours: use_colours,
+      be_verbose:  yield
+    }
+  end
+  ::Bioroebe::ParseFasta.new(i) { use_this_hash }
+end; self.instance_eval { alias fasta parse_fasta } # === Bioroebe.fasta
+end
 if __FILE__ == $PROGRAM_NAME
   Bioroebe::ParseFasta.new(ARGV) { :sizeseq }
@@ -24,4 +1477,4 @@ end # corefasta globins.fasta
     # pfasta /GC.txt
     # pfasta 013521.3_289_aa.fasta --also-show-the-sequence
     # pfasta  $RSRC/bioroebe/lib/bioroebe/data/GFP_mutant_3_coding_sequence.fasta  --also-show-the-sequence
-    # corefasta $J/globins.fasta
+    # corefasta $J/globins.fasta