RubyGems - parse_fasta - Versions diffs - 1.9.2 → 2.0.0 - Mend

parse_fasta 1.9.2 → 2.0.0

Files changed (48) hide show

checksums.yaml +8 -8
data/.gitignore +1 -0
data/.rspec +2 -0
data/CHANGELOG.md +178 -0
data/README.md +42 -215
data/Rakefile +2 -4
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/parse_fasta/error.rb +39 -0
data/lib/parse_fasta/record.rb +88 -0
data/lib/parse_fasta/seq_file.rb +221 -114
data/lib/parse_fasta/version.rb +2 -2
data/lib/parse_fasta.rb +5 -20
data/spec/parse_fasta/record_spec.rb +115 -0
data/spec/parse_fasta/seq_file_spec.rb +238 -0
data/spec/parse_fasta_spec.rb +25 -0
data/spec/spec_helper.rb +2 -44
data/spec/test_files/cr.fa +1 -0
data/spec/test_files/cr.fa.gz +0 -0
data/spec/test_files/cr.fq +3 -0
data/spec/test_files/cr.fq.gz +0 -0
data/spec/test_files/cr_nl.fa +4 -0
data/spec/test_files/cr_nl.fa.gz +0 -0
data/spec/test_files/cr_nl.fq +8 -0
data/spec/test_files/cr_nl.fq.gz +0 -0
data/spec/test_files/multi_blob.fa.gz +0 -0
data/spec/test_files/multi_blob.fq.gz +0 -0
data/spec/test_files/not_a_seq_file.txt +1 -0
data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
data/spec/test_files/seqs.fa.gz +0 -0
data/spec/test_files/seqs.fq +8 -0
data/spec/test_files/seqs.fq.gz +0 -0
metadata +49 -24
data/lib/parse_fasta/fasta_file.rb +0 -232
data/lib/parse_fasta/fastq_file.rb +0 -160
data/lib/parse_fasta/quality.rb +0 -54
data/lib/parse_fasta/sequence.rb +0 -174
data/spec/lib/fasta_file_spec.rb +0 -212
data/spec/lib/fastq_file_spec.rb +0 -143
data/spec/lib/quality_spec.rb +0 -51
data/spec/lib/seq_file_spec.rb +0 -357
data/spec/lib/sequence_spec.rb +0 -188
data/test_files/benchmark.rb +0 -99
data/test_files/bogus.txt +0 -2
data/test_files/test.fa.gz +0 -0
data/test_files/test.fq +0 -8
data/test_files/test.fq.gz +0 -0

data/lib/parse_fasta/fasta_file.rb DELETED Viewed

@@ -1,232 +0,0 @@
-# Copyright 2014, 2015 Ryan Moore
-# Contact: moorer@udel.edu
-#
-# This file is part of parse_fasta.
-#
-# parse_fasta is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# parse_fasta is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
-require 'zlib'
-# Provides simple interface for parsing fasta format files. Gzipped
-# files are no problem.
-class FastaFile < File
-  # Use it like IO::open
-  #
-  # @param fname [String] the name of the file to open
-  #
-  # @return [FastaFile] a FastaFile
-  def self.open(fname, *args)
-    begin
-      handle = Zlib::GzipReader.open(fname)
-    rescue Zlib::GzipFile::Error => e
-      handle = File.open(fname)
-    end
-    unless handle.each_char.peek[0] == '>'
-      raise ParseFasta::DataFormatError
-    end
-    handle.close
-    super
-  end
-  # Returns the records in the fasta file as a hash map with the
-  # headers as keys and the Sequences as values.
-  #
-  # @example Read a fastA into a hash table.
-  #   seqs = FastaFile.open('reads.fa').to_hash
-  #
-  # @return [Hash] A hash with headers as keys, sequences as the
-  #   values (Sequence objects)
-  #
-  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
-  def to_hash
-    hash = {}
-    self.each_record do |head, seq|
-      hash[head] = seq
-    end
-    hash
-  end
-  # Analogous to IO#each_line, #each_record is used to go through a
-  # fasta file record by record. It will accept gzipped files as well.
-  #
-  # @param separate_lines [Object] If truthy, separate lines of record
-  #   into an array of Sequences, but if falsy, yield a Sequence
-  #   object for the sequence instead.
-  #
-  # @example Parsing a fasta file (default behavior, gzip files are fine)
-  #   FastaFile.open('reads.fna.gz').each_record do |header, sequence|
-  #     puts [header, sequence.gc].join("\t")
-  #   end
-  #
-  # @example Parsing a fasta file (with truthy value param)
-  #   FastaFile.open('reads.fna').each_record(1) do |header, sequence|
-  #     # header => 'sequence_1'
-  #     # sequence => ['AACTG', 'AGTCGT', ... ]
-  #   end
-  #
-  # @yield The header and sequence for each record in the fasta
-  #   file to the block
-  #
-  # @yieldparam header [String] The header of the fasta record without
-  #   the leading '>'
-  #
-  # @yieldparam sequence [Sequence, Array<Sequence>] The sequence of the
-  #   fasta record. If `separate_lines` is falsy (the default
-  #   behavior), will be Sequence, but if truthy will be
-  #   Array<Sequence>.
-  #
-  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
-  def each_record(separate_lines=nil)
-    begin
-      f = Zlib::GzipReader.open(self)
-    rescue Zlib::GzipFile::Error => e
-      f = self
-    end
-    if separate_lines
-      f.each("\n>") do |line|
-        header, sequence = parse_line_separately(line)
-        yield(header.strip, sequence)
-      end
-      # f.each_with_index(">") do |line, idx|
-      #   if idx.zero?
-      #     if line != ">"
-      #       raise ParseFasta::DataFormatError
-      #     end
-      #   else
-      #     header, sequence = parse_line_separately(line)
-      #     yield(header.strip, sequence)
-      #   end
-      # end
-    else
-      header = ""
-      sequence = ""
-      f.each_line do |line|
-        line.chomp!
-        len = line.length
-        if header.empty? && line.start_with?(">")
-          header = line[1, len]
-        elsif line.start_with?(">")
-          yield(header.strip, Sequence.new(sequence || ""))
-          header = line[1, len]
-          sequence = ""
-        else
-          raise ParseFasta::SequenceFormatError if sequence.include? ">"
-          sequence << line
-        end
-      end
-      yield(header, Sequence.new(sequence || ""))
-      # f.each("\n>") do |line|
-      #     header, sequence = parse_line(line)
-      #     yield(header.strip, Sequence.new(sequence || ""))
-      #   end
-      # f.each_with_index(sep=/^>/) do |line, idx|
-      #   if idx.zero?
-      #     if line != ">"
-      #       raise ParseFasta::DataFormatError
-      #     end
-      #   else
-      #     header, sequence = parse_line(line)
-      #     yield(header.strip, Sequence.new(sequence || ""))
-      #   end
-      # end
-    end
-    f.close if f.instance_of?(Zlib::GzipReader)
-    return f
-  end
-  # Fast version of #each_record
-  #
-  # Yields the sequence as a String, not Sequence. No separate lines
-  # option.
-  #
-  # @note If the fastA file has spaces in the sequence, they will be
-  #   retained. If this is a problem, use #each_record instead.
-  #
-  # @yield The header and sequence for each record in the fasta
-  #   file to the block
-  #
-  # @yieldparam header [String] The header of the fasta record without
-  #   the leading '>'
-  #
-  # @yieldparam sequence [String] The sequence of the fasta record
-  #
-  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
-  def each_record_fast
-    begin
-      f = Zlib::GzipReader.open(self)
-    rescue Zlib::GzipFile::Error => e
-      f = self
-    end
-    header = ""
-    sequence = ""
-    f.each_line do |line|
-      line.chomp!
-      len = line.length
-      if header.empty? && line.start_with?(">")
-        header = line[1, len]
-      elsif line.start_with?(">")
-        yield(header.strip, sequence)
-        header = line[1, len]
-        sequence = ""
-      else
-        raise ParseFasta::SequenceFormatError if sequence.include? ">"
-        sequence << line
-      end
-    end
-    yield(header, sequence)
-    # f.each("\n>") do |line|
-    #   header, sequence = parse_line(line)
-    #   raise ParseFasta::SequenceFormatError if sequence.include? ">"
-    #   yield(header.strip, sequence)
-    # end
-    f.close if f.instance_of?(Zlib::GzipReader)
-    return f
-  end
-  private
-  def parse_line(line)
-    line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
-  end
-  def parse_line_separately(line)
-    header, sequence =
-      line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
-    if sequence.nil?
-      sequences = []
-    else
-      sequences = sequence.split("\n")
-        .reject { |s| s.empty? }
-        .map { |s| Sequence.new(s) }
-    end
-    [header, sequences]
-  end
-end

data/lib/parse_fasta/fastq_file.rb DELETED Viewed

@@ -1,160 +0,0 @@
-# Copyright 2014, 2015 Ryan Moore
-# Contact: moorer@udel.edu
-#
-# This file is part of parse_fasta.
-#
-# parse_fasta is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# parse_fasta is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
-require 'zlib'
-# Provides simple interface for parsing four-line-per-record fastq
-# format files. Gzipped files are no problem.
-class FastqFile < File
-  # Returns the records in the fastq file as a hash map with the
-  # headers as keys pointing to a hash map like so
-  # { "seq1" => { head: "seq1", seq: "ACTG", desc: "", qual: "II3*"} }
-  #
-  # @example Read a fastQ into a hash table.
-  #   seqs = FastqFile.open('reads.fq.gz').to_hash
-  #
-  # @return [Hash] A hash with headers as keys, and a hash map as the
-  #   value with keys :head, :seq, :desc, :qual, for header, sequence,
-  #   description, and quality.
-  def to_hash
-    hash = {}
-    self.each_record do |head, seq, desc, qual|
-      hash[head] = { head: head, seq: seq, desc: desc, qual: qual }
-    end
-    hash
-  end
-  # Analogous to IO#each_line, #each_record is used to go through a
-  # fastq file record by record. It will accept gzipped files as well.
-  #
-  # @example Parsing a fastq file
-  #   FastqFile.open('reads.fq').each_record do |head, seq, desc, qual|
-  #     # do some fun stuff here!
-  #   end
-  # @example Use the same syntax for gzipped files!
-  #   FastqFile.open('reads.fq.gz').each_record do |head, seq, desc, qual|
-  #     # do some fun stuff here!
-  #   end
-  #
-  # @yield The header, sequence, description and quality string for
-  #   each record in the fastq file to the block
-  # @yieldparam header [String] The header of the fastq record without
-  #   the leading '@'
-  # @yieldparam sequence [Sequence] The sequence of the fastq record
-  # @yieldparam description [String] The description line of the fastq
-  #   record without the leading '+'
-  # @yieldparam quality_string [Quality] The quality string of the
-  #   fastq record
-  def each_record
-    count = 0
-    header = ''
-    sequence = ''
-    description = ''
-    quality = ''
-    begin
-      f = Zlib::GzipReader.open(self)
-    rescue Zlib::GzipFile::Error => e
-      f = self
-    end
-    f.each_line do |line|
-      line.chomp!
-      case count
-      when 0
-        header = line[1..-1]
-      when 1
-        sequence = Sequence.new(line)
-      when 2
-        description = line[1..-1]
-      when 3
-        count = -1
-        quality = Quality.new(line)
-        yield(header, sequence, description, quality)
-      end
-      count += 1
-    end
-    f.close if f.instance_of?(Zlib::GzipReader)
-    return f
-  end
-  # Fast version of #each_record
-  #
-  # @note If the fastQ file has spaces in the sequence, they will be
-  #   retained. If this is a problem, use #each_record instead.
-  #
-  # @example Parsing a fastq file
-  #   FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
-  #     # do some fun stuff here!
-  #   end
-  # @example Use the same syntax for gzipped files!
-  #   FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
-  #     # do some fun stuff here!
-  #   end
-  #
-  # @yield The header, sequence, description and quality string for
-  #   each record in the fastq file to the block
-  #
-  # @yieldparam header [String] The header of the fastq record without
-  #   the leading '@'
-  # @yieldparam sequence [String] The sequence of the fastq record
-  # @yieldparam description [String] The description line of the fastq
-  #   record without the leading '+'
-  # @yieldparam quality_string [String] The quality string of the
-  #   fastq record
-  def each_record_fast
-    count = 0
-    header = ''
-    sequence = ''
-    description = ''
-    quality = ''
-    begin
-      f = Zlib::GzipReader.open(self)
-    rescue Zlib::GzipFile::Error => e
-      f = self
-    end
-    f.each_line do |line|
-      line.chomp!
-      case count
-      when 0
-        header = line[1..-1]
-      when 1
-        sequence = line
-      when 2
-        description = line[1..-1]
-      when 3
-        count = -1
-        quality = line
-        yield(header, sequence, description, quality)
-      end
-      count += 1
-    end
-    f.close if f.instance_of?(Zlib::GzipReader)
-    return f
-  end
-end

data/lib/parse_fasta/quality.rb DELETED Viewed

@@ -1,54 +0,0 @@
-# Copyright 2014, 2015 Ryan Moore
-# Contact: moorer@udel.edu
-#
-# This file is part of parse_fasta.
-#
-# parse_fasta is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# parse_fasta is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
-# Provide some methods for dealing with common tasks regarding
-# quality strings.
-class Quality < String
-  # Strips whitespace from the str argument before calling super
-  #
-  # @return [Quality] A Quality string
-  #
-  # @example Removes whitespace
-  #   Quality.new "I I 2 ! " #=> "II2!"
-  def initialize(str)
-    super(str.gsub(/ +/, ""))
-  end
-  # Returns the mean quality for the record. This will be a good deal
-  # faster than getting the average with `qual_scores` and reduce.
-  #
-  # @example Get mean quality score for a record
-  #   Quality.new("!+5?I").mean_qual #=> 20.0
-  #
-  # @return [Float] Mean quality score for record
-  def mean_qual
-    (self.sum - (self.length * 33)) / self.length.to_f
-  end
-  # Returns an array of illumina style quality scores. The quality
-  # scores generated will be Phred+33 (i.e., new Illumina).
-  #
-  # @example Get quality score array of a Quality
-  #   Quality.new("!+5?I").qual_scores #=> [0, 10, 20, 30, 40]
-  #
-  # @return [Array<Fixnum>] the quality scores
-  def qual_scores
-    self.each_byte.map { |b| b - 33 }
-  end
-end

data/lib/parse_fasta/sequence.rb DELETED Viewed

@@ -1,174 +0,0 @@
-# Copyright 2014, 2015 Ryan Moore
-# Contact: moorer@udel.edu
-#
-# This file is part of parse_fasta.
-#
-# parse_fasta is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# parse_fasta is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
-# Provide some methods for dealing with common tasks regarding
-# nucleotide sequences.
-class Sequence < String
-  # # Error raised if both T and U are present
-  # #
-  # # @note This is NOT checked on every call to Sequence.new
-  # class AmbiguousSequenceError < StandardError
-  #   def message
-  #     "Sequence is ambiguous -- both T and U present"
-  #   end
-  # end
-  # Strips whitespace from the str argument before calling super
-  #
-  # @return [Sequence] A Sequence string
-  #
-  # @example Removes whitespace
-  #   Sequence.new "AA CC TT" #=> "AACCTT"
-  #
-  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
-  def initialize(str)
-    if str.match(/>/)
-      raise ParseFasta::SequenceFormatError
-    end
-    super(str.gsub(/ +/, ""))
-  end
-  # Calculates GC content
-  #
-  # Calculates GC content by dividing the count of G + C by the count
-  # of G + C + T + A + U. If there are both T's and U's in the
-  # Sequence, things will get weird, but then again, that wouldn't
-  # happen, now would it! Ambiguous bases are ignored similar to
-  # BioRuby.
-  #
-  # @example Get GC of a Sequence
-  #   Sequence.new('ACTg').gc #=> 0.5
-  # @example Using with FastaFile#each_record
-  #   FastaFile.open('reads.fna', 'r').each_record do |header, sequence|
-  #     puts [header, sequence.gc].join("\t")
-  #   end
-  #
-  # @return [0] if the Sequence is empty or there are no A, C, T, G or U
-  #   present
-  # @return [Float] if the GC content is defined for the Sequence
-  def gc
-    s = self.downcase
-    c = s.count('c')
-    g = s.count('g')
-    t = s.count('t')
-    a = s.count('a')
-    u = s.count('u')
-    return 0 if c + g + t + a + u == 0
-    return (c + g) / (c + g + t + a + u).to_f
-  end
-  # Returns a map of base counts
-  #
-  # This method will check if the sequence is DNA or RNA and return a
-  # count map appropriate for each. If a truthy argument is given, the
-  # count of ambiguous bases will be returned as well.
-  #
-  # If a sequence has both T and U present, will warn the user and
-  # keep going. Will return a map with counts of both, however.
-  #
-  # @example Get base counts of DNA sequence without ambiguous bases
-  #   Sequence.new('AcTGn').base_counts
-  #   #=> { a: 1, c: 1, t: 1, g: 1 }
-  # @example Get base counts of DNA sequence with ambiguous bases
-  #   Sequence.new('AcTGn').base_counts(true)
-  #   #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
-  # @example Get base counts of RNA sequence without ambiguous bases
-  #   Sequence.new('AcUGn').base_counts
-  #   #=> { a: 1, c: 1, u: 1, g: 1 }
-  # @example Get base counts of RNA sequence with ambiguous bases
-  #   Sequence.new('AcUGn').base_counts(true)
-  #   #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
-  #
-  # @return [Hash] A hash with base as key, count as value
-  def base_counts(count_ambiguous_bases=nil)
-    s = self.downcase
-    t = s.count('t')
-    u = s.count('u')
-    counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
-    if t > 0 && u == 0
-      counts[:t] = t
-    elsif t == 0 && u > 0
-      counts[:u] = u
-    elsif t > 0 && u > 0
-      warn('ERROR: A sequence contains both T and U')
-      counts[:t], counts[:u] = t, u
-    end
-    counts[:n] = s.count('n') if count_ambiguous_bases
-    counts
-  end
-  # Returns a map of base frequencies
-  #
-  # Counts bases with the `base_counts` method, then divides each
-  # count by the total bases counted to give frequency for each
-  # base. If a truthy argument is given, ambiguous bases will be
-  # included in the total and their frequency reported. Can discern
-  # between DNA and RNA.
-  #
-  # If default or falsy argument is given, ambiguous bases will not be
-  # counted in the total base count and their frequency will not be
-  # given.
-  #
-  # @example Get base frequencies of DNA sequence without ambiguous bases
-  #   Sequence.new('AcTGn').base_frequencies
-  #   #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
-  # @example Get base frequencies of DNA sequence with ambiguous bases
-  #   Sequence.new('AcTGn').base_frequencies(true)
-  #   #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
-  #
-  # @return [Hash] A hash with base as key, frequency as value
-  def base_frequencies(count_ambiguous_bases=nil)
-    base_counts = self.base_counts(count_ambiguous_bases)
-    total_bases = base_counts.values.reduce(:+).to_f
-    base_freqs =
-      base_counts.map { |base, count| [base, count/total_bases] }.flatten
-    Hash[*base_freqs]
-  end
-  # Returns a reverse complement of self
-  #
-  # @return [Sequence] a Sequence that is the reverse complement of
-  #   self
-  #
-  # @example Handles any IUPAC character and capitalization properly
-  #   Sequence.new("gARKbdctymvhu").rev_comp #=> "adbkraghvMYTc"
-  #
-  # @example Leaves non IUPAC characters
-  #   Sequence.new("cccc--CCCcccga").rev_comp #=> "tcgggGGG--gggg"
-  #
-  # @note If Sequence contains non-IUPAC characters, these are not
-  #   complemented
-  def rev_comp
-    # if self.match(/T/i) && self.match(/U/i)
-    #   raise Sequence::AmbiguousSequenceError
-    # end
-    # if self.match(/[^ATUGCYRSWKMBDHVN]/i)
-    #   warn "WARNING: Sequence contains non IUPAC characters"
-    # end
-    self.reverse.tr("ATUGCYRSWKMBDHVNatugcyrswkmbdhvn",
-                    "TAACGRYSWMKVHDBNtaacgryswmkvhdbn")
-  end
-end