RubyGems - demultiplexer - Versions diffs - 0.0.1 → 0.1.0 - Mend

demultiplexer 0.0.1 → 0.1.0

Files changed (18) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 68481bbde8b232b733c63ee1e34f2751a4df922a
-  data.tar.gz: 192a74882e331612656283c2886765098cc6a2d6
+  metadata.gz: 9fd3a1561408ab33687de0b73f424873850cb55e
+  data.tar.gz: f57280b5897b453d43b0de72ecd72414ed4ac428
 SHA512:
-  metadata.gz: 39b6727cebb26c01b30c7ce4cc5c0907c25a462093e18172c24aa6d592f6efbcacb78cdca07f14f9ba73c776eb5aaeba6349dde53c08f68d3b87a64a9d4c68b6
-  data.tar.gz: 75fb912a9facc062e348b80afec84a3aed886178b6279e99dd4edf0dc2d5624944f60e4c01eba840d2b5ec46a837033146d83e46e53d497be4eedff7d2e0d830
+  metadata.gz: 4db2e0c15013c7a03ae483d0595b87feb147ff7284f2ae3541f0fe21c7b3e57e86d25cb4c1edd71e12042f5cce58f6c8773968d756f4d9812a20886205d71475
+  data.tar.gz: 1cfea327fdda5aa1d7f263d9b6f2e43890f4fcdcdac5c1380964c62cc39918e84b9121bfa0724625601a71f65202933066229b2abd267203b01398a231b36118

data/Rakefile CHANGED Viewed

@@ -4,14 +4,17 @@ require 'rake/testtask'
 Bundler::GemHelper.install_tasks
-task :default => "test"
+task default: 'test'
 Rake::TestTask.new do |t|
-  t.test_files = Dir['test/*'].select { |f| File.basename(f).match(/^test_.+\.rb$/) }
+  t.test_files = Dir['test/*'].select do |file|
+    File.basename(file).match(/^test_.+\.rb$/)
+  end
   t.warning    = true
 end
-desc "Add or update rdoc"
+desc 'Add or update rdoc'
 task :doc do
   `rdoc lib/`
 end

data/bin/demultiplexer CHANGED Viewed

@@ -1,5 +1,8 @@
 #!/usr/bin/env ruby
+require 'optparse'
+require 'demultiplexer'
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
 #                                                                              #
 # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk).                #
@@ -23,56 +26,56 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
-USAGE = <<USAGE
-  This program demultiplexes Illumina Paired data given a samples file and four
-  FASTQ files containing forward and reverse index data and forward and reverse
-  read data.
-  The samples file consists of three tab-separated columns: sample_id, forward
-  index, reverse index).
-  The FASTQ files are generated by the Illumina MiSeq instrument by adding the
-  following key:
-    <add key="CreateFastqForIndexReads" value="1">
-  To the `MiSeq Reporter.exe.config` file located in the `MiSeq Reporter`
-  installation folder, `C:\\Illumina\\MiSeqReporter` and restarting the
-  `MiSeq Reporter` service. See the MiSeq Reporter User Guide page 29:
-  http://support.illumina.com/downloads/miseq_reporter_user_guide_15042295.html
-  Thus Basecalling using a SampleSheet.csv containing a single entry `Data` with
-  no index information will generate the following files:
-    Data_S1_L001_I1_001.fastq.gz
-    Data_S1_L001_I2_001.fastq.gz
-    Data_S1_L001_R1_001.fastq.gz
-    Data_S1_L001_R2_001.fastq.gz
-    Undetermined_S0_L001_I1_001.fastq.gz
-    Undetermined_S0_L001_I2_001.fastq.gz
-    Undetermined_S0_L001_R1_001.fastq.gz
-    Undetermined_S0_L001_R2_001.fastq.gz
-  Demultiplexing will generate file pairs according to the sample information
-  in the samples file and input file suffix, one pair per sample, and these
-  will be output to the output directory. Also a file pair with undetermined
-  reads are created where the index sequence is appended to the sequence name.
-  It is possible to allow up to three mismatches per index. Also, read pairs are
-  filtered if either of the indexes have a mean quality score below a given
-  threshold or any single position in the index have a quality score below a
-  given theshold.
-  Finally, a log file `Demultiplex.log` is output containing the status of the
-  demultiplexing process along with a list of the samples ids and unique index1
-  and index2 sequences.
-  Usage: #{File.basename(__FILE__)} [options] <FASTQ files>
-  Example: #{File.basename(__FILE__)} -m samples.tsv Data*.fastq.gz
-  Options:
+USAGE = <<USAGE.gsub(/^\s+\|/, '')
+  |This program demultiplexes Illumina Paired data given a samples file and four
+  |FASTQ files containing forward and reverse index data and forward and reverse
+  |read data.
+  |The samples file consists of three tab-separated columns: sample_id, forward
+  |index, reverse index).
+  |The FASTQ files are generated by the Illumina MiSeq instrument by adding the
+  |following key:
+  |  <add key="CreateFastqForIndexReads" value="1">
+  |To the `MiSeq Reporter.exe.config` file located in the `MiSeq Reporter`
+  |installation folder, `C:\\Illumina\\MiSeqReporter` and restarting the `MiSeq
+  |Reporter` service. See the MiSeq Reporter User Guide page 29:
+  |http://support.illumina.com/downloads/miseq_reporter_user_guide_15042295.html
+  |Thus Basecalling using a SampleSheet.csv containing a single entry `Data`
+  |with no index information will generate the following files:
+  |  Data_S1_L001_I1_001.fastq.gz
+  |  Data_S1_L001_I2_001.fastq.gz
+  |  Data_S1_L001_R1_001.fastq.gz
+  |  Data_S1_L001_R2_001.fastq.gz
+  |  Undetermined_S0_L001_I1_001.fastq.gz
+  |  Undetermined_S0_L001_I2_001.fastq.gz
+  |  Undetermined_S0_L001_R1_001.fastq.gz
+  |  Undetermined_S0_L001_R2_001.fastq.gz
+  |
+  |Demultiplexing will generate file pairs according to the sample information
+  |in the samples file and input file suffix, one pair per sample, and these
+  |will be output to the output directory. Also a file pair with undetermined
+  |reads are created where the index sequence is appended to the sequence name.
+  |
+  |It is possible to allow up to three mismatches per index. Also, read pairs
+  |are filtered if either of the indexes have a mean quality score below a given
+  |threshold or any single position in the index have a quality score below a
+  |given theshold.
+  |
+  |Finally, a log file `Demultiplex.log` is output containing the status of the
+  |demultiplexing process along with a list of the samples ids and unique index1
+  |and index2 sequences.
+  |
+  |Usage: #{File.basename(__FILE__)} [options] <FASTQ files>
+  |
+  |Example: #{File.basename(__FILE__)} -m samples.tsv Data*.fastq.gz
+  |
+  |Options:
 USAGE
 DEFAULT_SCORE_MIN  = 16
@@ -95,8 +98,8 @@ OptionParser.new do |opts|
     options[:samples_file] = o
   end
-  opts.on('-m', '--mismatches_max <uint>', Integer, "Maximum mismatches_max \
-    allowed (default=#{DEFAULT_MISMATCHES})") do |o|
+  opts.on('-m', '--mismatches_max <uint>', Integer, 'Maximum mismatches_max ',
+          "allowed (default=#{DEFAULT_MISMATCHES})") do |o|
     options[:mismatches_max] = o
   end
@@ -108,14 +111,17 @@ OptionParser.new do |opts|
     options[:revcomp_index2] = o
   end
-  opts.on('--scores_min <uint>', Integer, "Drop reads if a single position in \
-    the index have a quality score below scores_min \
-    (default=#{DEFAULT_SCORE_MIN})") do |o|
+  opts.on('--scores_min <uint>', Integer, 'Drop reads if a single position in ',
+          'the index have a quality score ',
+          'below scores_min (default= ' \
+          "#{DEFAULT_SCORE_MIN})") do |o|
     options[:scores_min] = o
   end
-  opts.on('--scores_mean <uint>', Integer, "Drop reads if the mean index \
-    quality score is below scores_mean (default=#{DEFAULT_SCORE_MEAN})") do |o|
+  opts.on('--scores_mean <uint>', Integer, 'Drop reads if the mean index',
+          'quality score is below ',
+          'scores_mean (default= ' \
+          "#{DEFAULT_SCORE_MEAN})") do |o|
     options[:scores_mean] = o
   end
@@ -123,8 +129,10 @@ OptionParser.new do |opts|
     options[:output_dir] = o
   end
-  opts.on('-c', '--compress <gzip|bzip2>', String, 'Compress output using \
-    gzip or bzip2 (default=<no compression>)') do |o|
+  opts.on('-c', '--compress <gzip|bzip2>', String, 'Compress output using ' \
+          'gzip or bzip2 ',
+          '(default=' \
+          '<no compression>)') do |o|
     options[:compress] = o.to_sym
   end

data/demultiplexer.gemspec CHANGED Viewed

@@ -1,4 +1,4 @@
-$:.push File.expand_path("../lib", __FILE__)
+$LOAD_PATH.push File.expand_path('../lib', __FILE__)
 require 'demultiplexer/version'
@@ -6,21 +6,21 @@ Gem::Specification.new do |s|
   s.name              = 'demultiplexer'
   s.version           = Demultiplexer::VERSION
   s.platform          = Gem::Platform::RUBY
-  s.date              = Time.now.strftime("%F")
-  s.summary           = "Demultiplexer"
-  s.description       = "Demultiplex sequences from the Illumina platform."
-  s.authors           = ["Martin A. Hansen"]
+  s.date              = Time.now.strftime('%F')
+  s.summary           = 'Demultiplexer'
+  s.description       = 'Demultiplex sequences from the Illumina platform.'
+  s.authors           = ['Martin A. Hansen']
   s.email             = 'mail@maasha.dk'
-  s.rubyforge_project = "demultiplexer"
+  s.rubyforge_project = 'demultiplexer'
   s.homepage          = 'http://github.com/maasha/demultiplexer'
   s.license           = 'GPL2'
-  s.rubygems_version  = "2.0.0"
+  s.rubygems_version  = '2.0.0'
   s.files             = `git ls-files`.split("\n")
   s.test_files        = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.require_paths     = ["lib"]
+  s.require_paths     = ['lib']
-  s.add_dependency("biopieces",             ">= 0.4.1")
-  s.add_dependency("google_hash",           ">= 0.8.4")
-  s.add_development_dependency("bundler",   ">= 1.7.4")
-  s.add_development_dependency("simplecov", ">= 0.9.2")
+  s.add_dependency('biopieces',             '>= 0.4.1')
+  s.add_dependency('google_hash',           '>= 0.8.4')
+  s.add_development_dependency('bundler',   '>= 1.7.4')
+  s.add_development_dependency('simplecov', '>= 0.9.2')
 end

data/lib/data_io.rb CHANGED Viewed

@@ -21,40 +21,126 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+# Error class for all errors to do with DataIO.
+DataIOError = Class.new(StandardError)
 # Class containing methods for reading and writing FASTQ data files.
 class DataIO
+  # Internal: Constructor method for DataIO objects.
+  #
+  # samples     - Array with Sample objects consisting of id, index1 and index2
+  # fastq_files - Array of Strings with FASTQ file names of multiplexed data.
+  # compress    - Symbol indicating if output data should be compressed with
+  #               either gzip or bzip2.
+  # output_dir  - String with path of output directory.
+  #
+  # Returns DataIO object.
   def initialize(samples, fastq_files, compress, output_dir)
-    @samples      = samples
-    @compress     = compress
-    @output_dir   = output_dir
-    @suffix1      = extract_suffix(fastq_files.grep(/_R1_/).first)
-    @suffix2      = extract_suffix(fastq_files.grep(/_R2_/).first)
-    @input_files  = identify_input_files(fastq_files)
-    @undetermined = @samples.size + 1
-    @file_hash    = nil
+    @samples         = samples
+    @compress        = compress
+    @output_dir      = output_dir
+    @suffix1         = extract_suffix(fastq_files, '_R1_')
+    @suffix2         = extract_suffix(fastq_files, '_R2_')
+    @input_files     = identify_input_files(fastq_files)
+    @undetermined    = @samples.size
+    @output_file_ios = nil
+  end
+  # Internal: Method that opens the @input_files for reading and stores the
+  # file handles in @input_file_ios.
+  #
+  # Yields the DataIO object itself. The input files are closed when the
+  # block returns.
+  def open_input_files
+    @input_file_ios = []
+    @input_files.each do |input_file|
+      @input_file_ios << BioPieces::Fastq.open(input_file)
+    end
+    yield self
+  ensure
+    close_input_files
+  end
+  # Internal: Method that opens the output files for writing.
+  #
+  # Yields the DataIO object itself, whose [] method looks up a tuple of
+  # output file handles by sample index.
+  def open_output_files
+    @output_file_ios = {}
+    comp             = @compress
+    @output_file_ios.merge!(open_output_files_samples(comp))
+    @output_file_ios.merge!(open_output_files_undet(comp))
+    yield self
+  ensure
+    close_output_files
+  end
+  # Internal: Method that reads a Seq entry from each of the file handles in
+  # the @input_file_ios Array. Iteration stops when no more Seq entries are
+  # found.
+  #
+  # Yields an Array with 4 Seq objects.
+  #
+  # Returns nothing
+  def each
+    loop do
+      entries = @input_file_ios.each_with_object([]) do |e, a|
+        a << e.next_entry
+      end
+      break if entries.compact.size != 4
+      yield entries
+    end
+  end
+  # Internal: Getter method that returns a tuple of file handles from
+  # @output_file_ios when given a sample index key.
+  #
+  # key - Sample index Integer key used for lookup.
+  #
+  # Returns Array with a tuple of IO objects.
+  def [](key)
+    @output_file_ios[key]
   end
-  # Method that extracts the Sample, Lane, Region information from a given file.
+  private
+  # Internal: Method that extracts the Sample, Lane, Region information from
+  # given files.
   #
-  # file - String with file name.
+  # files   - Array with FASTQ file names as Strings.
+  # pattern - String with pattern to use for matching file names.
   #
   # Examples
   #
-  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz")
+  #   extract_suffix(["Sample1_S1_L001_R1_001.fastq.gz"], "_R1_")
   #   # => "_S1_L001_R1_001"
   #
   # Returns String with SLR info.
-  def extract_suffix(file)
-    if file =~ /.+(_S\d_L\d{3}_R[12]_\d{3}).+$/
+  # Raises unless pattern matches exactly 1 file.
+  # Raises unless SLR info can be parsed.
+  def extract_suffix(files, pattern)
+    hits = files.grep(Regexp.new(pattern))
+    unless hits.size == 1
+      fail DataIOError, "Expecting exactly 1 hit but got: #{hits.size}"
+    end
+    if hits.first =~ /.+(_S\d_L\d{3}_R[12]_\d{3}).+$/
       slr = Regexp.last_match(1)
     else
-      fail "Unable to parse file SLR from: #{file}"
+      fail DataIOError, "Unable to parse file SLR from: #{hits.first}"
     end
     append_suffix(slr)
   end
-  # Method that appends a file suffix to a given Sample, Lane, Region
+  # Internal: Method that appends a file suffix to a given Sample, Lane, Region
   # information String based on the @options[:compress] option. The
   # file suffix can be either ".fastq.gz", ".fastq.bz2", or ".fastq".
   #
@@ -79,14 +165,15 @@ class DataIO
     slr
   end
-  # Method identify the different input files from a given Array of FASTQ files.
-  # The forward index file contains a _I1_, the reverse index file contains a
-  # _I2_, the forward read file contains a _R1_ and finally, the reverse read
-  # file contain a _R2_.
+  # Internal: Method that identifies the different input files from a given
+  # Array of FASTQ files. The forward index file contains a _I1_, the reverse
+  # index file contains a _I2_, the forward read file contains a _R1_ and
+  # finally, the reverse read file contains a _R2_.
   #
   # fastq_files - Array with FASTQ files (Strings).
   #
   # Returns an Array with input files (Strings).
+  # Raises unless 4 input_files are found.
   def identify_input_files(fastq_files)
     input_files = []
@@ -95,113 +182,62 @@ class DataIO
     input_files << fastq_files.grep(/_R1_/).first
     input_files << fastq_files.grep(/_R2_/).first
-    input_files
-  end
-  # Method that opens the @input_files for reading.
-  #
-  # input_files - Array with input file paths.
-  #
-  # Returns an Array with IO objects (file handles).
-  def open_input_files
-    @file_ios = []
-    @input_files.each do |input_file|
-      @file_ios << BioPieces::Fastq.open(input_file)
-    end
-    yield self
-  ensure
-    close_input_files
-  end
-  # Method that closes open input files.
-  #
-  # Returns nothing.
-  def close_input_files
-    @file_ios.map(&:close)
-  end
-  # Method that reads a Seq entry from each of the file handles in the
-  # @file_ios Array. Iteration stops when no more Seq entries are found.
-  #
-  # Yields an Array with 4 Seq objects.
-  #
-  # Returns nothing
-  def each
-    loop do
-      entries = @file_ios.each_with_object([]) { |e, a| a << e.next_entry }
-      break if entries.compact.size != 4
-      yield entries
+    unless input_files.compact.size == 4
+      fail DataIOError, 'Expecting exactly 4 input_files but got: ' \
+                        "#{input_files.compact.size}"
     end
-  end
-  # Method that opens the output files for writing.
-  #
-  # Yeilds a Hash with an incrementing index as keys, and a tuple of file
-  # handles as values.
-  def open_output_files
-    @file_hash = {}
-    comp       = @compress
-    @file_hash.merge!(open_output_files_samples(comp))
-    @file_hash.merge!(open_output_files_undet(comp))
-    yield self
-  ensure
-    close_output_files
-  end
-  def close_output_files
-    @file_hash.each_value { |value| value.map(&:close) }
-  end
-  # Getter method that returns a tuple of file handles from @file_hash when
-  # given a key.
-  #
-  # key - Key used to lookup
-  #
-  # Returns Array with a tuple of IO objects.
-  def [](key)
-    @file_hash[key]
+    input_files
   end
-  # Method that opens the sample output files for writing.
+  # Internal: Method that opens the sample output files for writing.
   #
   # comp - Symbol with type of output compression.
   #
   # Returns a Hash with an incrementing index as keys, and a tuple of file
   # handles as values.
   def open_output_files_samples(comp)
-    file_hash = {}
+    output_file_ios = {}
     @samples.each_with_index do |sample, i|
       file_forward = File.join(@output_dir, "#{sample.id}#{@suffix1}")
       file_reverse = File.join(@output_dir, "#{sample.id}#{@suffix2}")
       io_forward   = BioPieces::Fastq.open(file_forward, 'w', compress: comp)
       io_reverse   = BioPieces::Fastq.open(file_reverse, 'w', compress: comp)
-      file_hash[i] = [io_forward, io_reverse]
+      output_file_ios[i] = [io_forward, io_reverse]
     end
-    file_hash
+    output_file_ios
   end
-  # Method that opens the undertermined output files for writing.
+  # Internal: Method that opens the undetermined output files for writing.
   #
   # comp - Symbol with type of output compression.
   #
   # Returns a Hash with an incrementing index as keys, and a tuple of file
   # handles as values.
   def open_output_files_undet(comp)
-    file_hash    = {}
+    output_file_ios    = {}
     file_forward = File.join(@output_dir, "Undetermined#{@suffix1}")
     file_reverse = File.join(@output_dir, "Undetermined#{@suffix2}")
     io_forward   = BioPieces::Fastq.open(file_forward, 'w', compress: comp)
     io_reverse   = BioPieces::Fastq.open(file_reverse, 'w', compress: comp)
-    file_hash[@undetermined] = [io_forward, io_reverse]
+    output_file_ios[@undetermined] = [io_forward, io_reverse]
-    file_hash
+    output_file_ios
+  end
+  # Internal: Method that closes open input files.
+  #
+  # Returns nothing.
+  def close_input_files
+    @input_file_ios.map(&:close)
+  end
+  # Internal: Method that closes the file handles stored in @output_file_ios.
+  #
+  # Returns nothing.
+  def close_output_files
+    @output_file_ios.each_value { |value| value.map(&:close) }
   end
 end

data/lib/demultiplexer/version.rb CHANGED Viewed

@@ -21,6 +21,7 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+# Adding VERSION constant to class.
 class Demultiplexer
-  VERSION = "0.0.1"
+  VERSION = '0.1.0'
 end