RubyGems - demultiplexer - Versions diffs - 0.0.1 - Mend

demultiplexer 0.0.1

Files changed (22) hide show

checksums.yaml +7 -0
data/.gitignore +5 -0
data/LICENSE +340 -0
data/README.md +2 -0
data/Rakefile +17 -0
data/bin/demultiplexer +194 -0
data/demultiplexer.gemspec +26 -0
data/lib/data_io.rb +207 -0
data/lib/demultiplexer.rb +263 -0
data/lib/demultiplexer/version.rb +26 -0
data/lib/index_builder.rb +181 -0
data/lib/sample_reader.rb +198 -0
data/lib/screen.rb +39 -0
data/lib/status.rb +101 -0
data/test/helper.rb +51 -0
data/test/test_data_io.rb +7 -0
data/test/test_demultiplexer.rb +7 -0
data/test/test_index_builder.rb +7 -0
data/test/test_sample_reader.rb +7 -0
data/test/test_screen.rb +7 -0
data/test/test_status.rb +7 -0
metadata +127 -0

data/demultiplexer.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
# Gem specification for the demultiplexer gem.
#
# Put the local lib directory first on the load path so the version constant
# is read from this checkout rather than from an installed copy of the gem.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'demultiplexer/version'

Gem::Specification.new do |s|
  s.name          = 'demultiplexer'
  s.version       = Demultiplexer::VERSION
  s.platform      = Gem::Platform::RUBY
  s.summary       = 'Demultiplexer'
  s.description   = 'Demultiplex sequences from the Illumina platform.'
  s.authors       = ['Martin A. Hansen']
  s.email         = 'mail@maasha.dk'
  s.homepage      = 'http://github.com/maasha/demultiplexer'
  # SPDX identifier matching the "version 2 ... or any later version" notice
  # carried in the source file headers.
  s.license       = 'GPL-2.0-or-later'
  s.files         = `git ls-files`.split("\n")
  # Derived from s.files instead of a second shell call: the former
  # "{test,spec,features}/*" pattern depends on brace expansion, which a
  # POSIX sh does not perform.
  s.test_files    = s.files.grep(%r{\A(test|spec|features)/})
  s.require_paths = ['lib']

  # NOTE: date and rubygems_version are filled in by RubyGems when the gem is
  # built, and rubyforge_project is deprecated, so none of them is set here.
  s.add_dependency('biopieces',             '>= 0.4.1')
  s.add_dependency('google_hash',           '>= 0.8.4')
  s.add_development_dependency('bundler',   '>= 1.7.4')
  s.add_development_dependency('simplecov', '>= 0.9.2')
end

data/lib/data_io.rb ADDED Viewed

@@ -0,0 +1,207 @@
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+#                                                                              #
+# Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk).                #
+#                                                                              #
+# This program is free software; you can redistribute it and/or                #
+# modify it under the terms of the GNU General Public License                  #
+# as published by the Free Software Foundation; either version 2               #
+# of the License, or (at your option) any later version.                       #
+#                                                                              #
+# This program is distributed in the hope that it will be useful,              #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                #
+# GNU General Public License for more details.                                 #
+#                                                                              #
+# You should have received a copy of the GNU General Public License            #
+# along with this program; if not, write to the Free Software                  #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,    #
+# USA.                                                                         #
+#                                                                              #
+# http://www.gnu.org/copyleft/gpl.html                                         #
+#                                                                              #
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
# Class containing methods for reading and writing FASTQ data files.
class DataIO
  # Captures the Sample, Lane, Region (SLR) part of a FASTQ file name, e.g.
  # "_S12_L001_R1_001". The sample number may have one or more digits.
  SLR_REGEX = /.+(_S\d+_L\d{3}_R[12]_\d{3}).+$/

  # Patterns identifying the input files, in the order they are read:
  # forward index, reverse index, forward read, reverse read.
  INPUT_PATTERNS = [/_I1_/, /_I2_/, /_R1_/, /_R2_/].freeze

  # Constructor method for DataIO object.
  #
  # samples     - Array of sample objects responding to #id.
  # fastq_files - Array with FASTQ file names (Strings).
  # compress    - Output compression; matched against /gzip/ and /bzip2/,
  #               anything else means uncompressed output.
  # output_dir  - String with path to the output directory.
  #
  # Raises RuntimeError if the _R1_ or _R2_ file name cannot be parsed.
  def initialize(samples, fastq_files, compress, output_dir)
    @samples      = samples
    @compress     = compress
    @output_dir   = output_dir
    @suffix1      = extract_suffix(fastq_files.grep(/_R1_/).first)
    @suffix2      = extract_suffix(fastq_files.grep(/_R2_/).first)
    @input_files  = identify_input_files(fastq_files)
    # Sample file pairs are keyed 0...samples.size in @file_hash; the
    # Undetermined pair is stored under this key.
    @undetermined = @samples.size + 1
    @file_hash    = nil
  end

  # Method that extracts the Sample, Lane, Region information from a given
  # file name and appends the output file suffix.
  #
  # file - String with file name.
  #
  # Examples
  #
  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz")
  #   # => "_S1_L001_R1_001.fastq.gz"   (with gzip compression)
  #
  # Returns String with SLR info and file suffix.
  # Raises RuntimeError if the SLR info cannot be parsed.
  def extract_suffix(file)
    match = SLR_REGEX.match(file.to_s)
    raise "Unable to parse file SLR from: #{file}" unless match

    append_suffix(match[1])
  end

  # Method that appends a file suffix to a given Sample, Lane, Region
  # information String based on the compress option. The file suffix can be
  # either ".fastq.gz", ".fastq.bz2", or ".fastq".
  #
  # slr - String Sample, Lane, Region information (left unmodified).
  #
  # Examples
  #
  #   append_suffix("_S1_L001_R1_001")
  #   # => "_S1_L001_R1_001.fastq.gz"
  #
  # Returns a new String with SLR info and file suffix.
  def append_suffix(slr)
    extension =
      case @compress
      when /gzip/  then '.fastq.gz'
      when /bzip2/ then '.fastq.bz2'
      else              '.fastq'
      end

    "#{slr}#{extension}"
  end

  # Method to identify the different input files from a given Array of FASTQ
  # files. The forward index file contains a _I1_, the reverse index file
  # contains a _I2_, the forward read file contains a _R1_ and finally, the
  # reverse read file contains a _R2_.
  #
  # fastq_files - Array with FASTQ files (Strings).
  #
  # Returns an Array with the four input files (Strings) in the order
  # I1, I2, R1, R2; an entry is nil when no file name matches.
  def identify_input_files(fastq_files)
    INPUT_PATTERNS.map { |pattern| fastq_files.grep(pattern).first }
  end

  # Method that opens the @input_files for reading, yields self and closes
  # the files again when the block returns or raises.
  #
  # Yields the DataIO object itself.
  def open_input_files
    @file_ios = []
    # Append one handle at a time so that handles opened before a failure
    # are still closed by the ensure clause.
    @input_files.each do |input_file|
      @file_ios << BioPieces::Fastq.open(input_file)
    end
    yield self
  ensure
    close_input_files
  end

  # Method that closes open input files. Does nothing if no input files have
  # been opened.
  #
  # Returns nothing.
  def close_input_files
    Array(@file_ios).each(&:close)
    nil
  end

  # Method that reads a Seq entry from each of the file handles in the
  # @file_ios Array. Iteration stops as soon as one of the handles has no
  # more entries.
  #
  # Yields an Array with 4 Seq objects.
  #
  # Returns nothing, or an Enumerator if no block is given.
  def each
    return to_enum(:each) unless block_given?

    loop do
      entries = @file_ios.map(&:next_entry)
      break if entries.compact.size != 4

      yield entries
    end
  end

  # Method that opens the output files for writing, yields self and closes
  # the files again when the block returns or raises.
  #
  # Yields the DataIO object itself; the file handle tuples are available
  # through #[].
  def open_output_files
    @file_hash = {}
    @file_hash.merge!(open_output_files_samples(@compress))
    @file_hash.merge!(open_output_files_undet(@compress))
    yield self
  ensure
    close_output_files
  end

  # Method that closes all open output files. Does nothing if no output
  # files have been opened.
  #
  # Returns nothing.
  def close_output_files
    return unless @file_hash

    @file_hash.each_value { |pair| pair.each(&:close) }
    nil
  end

  # Getter method that returns a tuple of file handles from @file_hash when
  # given a key.
  #
  # key - Key used to lookup
  #
  # Returns Array with a tuple of IO objects.
  def [](key)
    @file_hash[key]
  end

  # Method that opens the sample output files for writing.
  #
  # comp - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns a Hash with an incrementing index as keys, and a tuple of file
  # handles as values.
  def open_output_files_samples(comp)
    file_hash = {}
    @samples.each_with_index do |sample, i|
      file_hash[i] = open_output_pair(sample.id, comp)
    end
    file_hash
  rescue StandardError
    # These handles are not yet registered in @file_hash, so nobody else
    # would close them if opening a later file fails.
    file_hash.each_value { |pair| pair.each(&:close) }
    raise
  end

  # Method that opens the undetermined output files for writing.
  #
  # comp - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns a Hash with the @undetermined key and a tuple of file handles as
  # value.
  def open_output_files_undet(comp)
    { @undetermined => open_output_pair('Undetermined', comp) }
  end

  private

  # Method that opens a forward and a reverse output file for a given name.
  #
  # name - String used as file name prefix (sample id or "Undetermined").
  # comp - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns Array with a tuple of file handles [forward, reverse].
  def open_output_pair(name, comp)
    file_forward = File.join(@output_dir, "#{name}#{@suffix1}")
    file_reverse = File.join(@output_dir, "#{name}#{@suffix2}")
    io_forward   = BioPieces::Fastq.open(file_forward, 'w', compress: comp)
    io_reverse   = BioPieces::Fastq.open(file_reverse, 'w', compress: comp)
    [io_forward, io_reverse]
  rescue StandardError
    # The reverse file failed to open: release the forward handle.
    io_forward.close if io_forward
    raise
  end
end

data/lib/demultiplexer.rb ADDED Viewed

@@ -0,0 +1,263 @@
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+#                                                                              #
+# Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk).                #
+#                                                                              #
+# This program is free software; you can redistribute it and/or                #
+# modify it under the terms of the GNU General Public License                  #
+# as published by the Free Software Foundation; either version 2               #
+# of the License, or (at your option) any later version.                       #
+#                                                                              #
+# This program is distributed in the hope that it will be useful,              #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                #
+# GNU General Public License for more details.                                 #
+#                                                                              #
+# You should have received a copy of the GNU General Public License            #
+# along with this program; if not, write to the Free Software                  #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,    #
+# USA.                                                                         #
+#                                                                              #
+# http://www.gnu.org/copyleft/gpl.html                                         #
+#                                                                              #
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
# Class containing methods for demultiplexing MiSeq sequences.
class Demultiplexer
  # Documented default values, applied to every option the caller omits.
  DEFAULT_OPTIONS = {
    verbose:        false,
    mismatches_max: 0,
    revcomp_index1: false,
    revcomp_index2: false,
    scores_min:     16,
    scores_mean:    16
  }.freeze

  attr_reader :status

  # Public: Class method to run demultiplexing of MiSeq sequences. The final
  # status is saved to "Demultiplex.log" in the output directory.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash.
  #               :verbose        - Verbose flag (default: false).
  #               :mismatches_max - Integer value indicating max mismatches
  #                                 (default: 0).
  #               :samples_file   - String with path to samples file.
  #               :revcomp_index1 - Flag indicating that index1 should be
  #                                 reverse-complemented (default: false).
  #               :revcomp_index2 - Flag indicating that index2 should be
  #                                 reverse-complemented (default: false).
  #               :output_dir     - String with output directory (optional,
  #                                 falls back to the current working
  #                                 directory).
  #               :compress       - Output compression, passed on to DataIO.
  #               :scores_min     - An Integer representing the Phred score
  #                                 minimum, such that a read is dropped if a
  #                                 single position in the index contains a
  #                                 score below this value (default: 16).
  #               :scores_mean    - An Integer representing the mean Phred
  #                                 score, such that a read is dropped if the
  #                                 mean quality score is below this value
  #                                 (default: 16).
  #
  # Examples
  #
  #   Demultiplexer.run(['I1.fq', 'I2.fq', 'R1.fq', 'R2.fq'], \
  #     samples_file: 'samples.txt')
  #
  # Returns whatever Status#save returns for the log file.
  def self.run(fastq_files, options)
    output_dir    = options[:output_dir] || Dir.pwd
    log_file      = File.join(output_dir, 'Demultiplex.log')
    demultiplexer = new(fastq_files, options)
    Screen.clear if options[:verbose]
    demultiplexer.demultiplex
    puts demultiplexer.status if options[:verbose]
    demultiplexer.status.save(log_file)
  end

  # Constructor method for Demultiplexer object.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash, see Demultiplexer.run for the recognized
  #               keys and their defaults. The given Hash is not modified.
  #
  # Returns Demultiplexer object
  def initialize(fastq_files, options)
    @options                = DEFAULT_OPTIONS.merge(options)
    @options[:output_dir] ||= Dir.pwd
    @samples      = SampleReader.read(@options[:samples_file],
                                      @options[:revcomp_index1],
                                      @options[:revcomp_index2])
    # Must use the same formula as DataIO, which stores the Undetermined
    # file handles under this key.
    @undetermined = @samples.size + 1
    @index_hash   = IndexBuilder.build(@samples, @options[:mismatches_max])
    @data_io      = DataIO.new(@samples, fastq_files, @options[:compress],
                               @options[:output_dir])
    @status       = Status.new
  end

  # Method to demultiplex reads according to the index. This is done by
  # simultaneously read-opening all input files (forward and reverse index
  # files and forward and reverse read files) and reading one entry from
  # each. Such four entries we call a set of entries. If the quality scores
  # from either index1 or index2 fail the criteria for mean and min required
  # quality the set is skipped. If the combined indexes are found in the
  # search index, then the reads are written to files according to the sample
  # information in the search index. If the combined indexes are not found,
  # then the reads have their names appended with the index sequences and the
  # reads are written to the Undetermined files.
  #
  # Returns nothing.
  def demultiplex
    @data_io.open_input_files do |ios_in|
      @data_io.open_output_files do |ios_out|
        ios_in.each do |index1, index2, read1, read2|
          # Each set of entries holds two reads (forward and reverse).
          @status.count += 2
          report_progress
          next unless index_qual_ok?(index1, index2)

          match_index(ios_out, index1, index2, read1, read2)
        end
      end
    end
  end

  private

  # Method that prints the current status for every 1000 reads counted when
  # the verbose option is set.
  #
  # Returns nothing.
  def report_progress
    return unless @options[:verbose] && (@status.count % 1_000).zero?

    puts(@status)
  end

  # Method that matches the combined index1 and index2 sequences against the
  # search index. In case of a match the reads are written to file according
  # to the information in the search index, otherwise the reads will have
  # their names appended with the index sequences and they will be written to
  # the Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def match_index(ios_out, index1, index2, read1, read2)
    # The search index is keyed on the hash value of the joined sequences.
    sample_id = @index_hash["#{index1.seq}#{index2.seq}".hash]

    if sample_id
      write_match(ios_out, sample_id, read1, read2)
    else
      write_undetermined(ios_out, index1, index2, read1, read2)
    end
  end

  # Method that writes an index match to file according to the information
  # in the search index.
  #
  # ios_out   - DataIO object with an accessor method for file output handles.
  # sample_id - Key of the sample's file handle tuple in ios_out.
  # read1     - Seq object with read1.
  # read2     - Seq object with read2.
  #
  # Returns nothing.
  def write_match(ios_out, sample_id, read1, read2)
    @status.match += 2
    io_forward, io_reverse = ios_out[sample_id]
    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method that appends the read names with the index sequences and writes
  # the reads to the Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def write_undetermined(ios_out, index1, index2, read1, read2)
    @status.undetermined += 2
    read1.seq_name = "#{read1.seq_name} #{index1.seq}"
    read2.seq_name = "#{read2.seq_name} #{index2.seq}"
    io_forward, io_reverse = ios_out[@undetermined]
    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method to check the quality scores of the given indexes. The indexes are
  # OK only if both the mean score check and the min score check pass. The
  # min score check is skipped when the mean score check fails.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality OK, else false.
  def index_qual_ok?(index1, index2)
    index_qual_mean_ok?(index1, index2) &&
      index_qual_min_ok?(index1, index2)
  end

  # Method to check the mean quality scores of the given indexes. The
  # indexes are OK if neither mean score is below @options[:scores_mean].
  # Only the first failing index is counted in the status.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality mean OK, else false.
  def index_qual_mean_ok?(index1, index2)
    if index1.scores_mean < @options[:scores_mean]
      @status.index1_bad_mean += 2
      false
    elsif index2.scores_mean < @options[:scores_mean]
      @status.index2_bad_mean += 2
      false
    else
      true
    end
  end

  # Method to check the min quality scores of the given indexes. The indexes
  # are OK if neither min score is below @options[:scores_min]. Only the
  # first failing index is counted in the status.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality min OK, else false.
  def index_qual_min_ok?(index1, index2)
    if index1.scores_min < @options[:scores_min]
      @status.index1_bad_min += 2
      false
    elsif index2.scores_min < @options[:scores_min]
      @status.index2_bad_min += 2
      false
    else
      true
    end
  end

  # Method that iterates over @samples, compiles a sorted Array with all
  # unique index1 sequences and stores it in the status.
  #
  # Returns Array with uniq index1 sequences.
  def uniq_index1
    @status.index1 = @samples.map(&:index1).uniq.sort
  end

  # Method that iterates over @samples, compiles a sorted Array with all
  # unique index2 sequences and stores it in the status.
  #
  # Returns Array with uniq index2 sequences.
  def uniq_index2
    @status.index2 = @samples.map(&:index2).uniq.sort
  end
end