RubyGems - demultiplexer - Versions diffs - 0.0.1 → 0.1.0 - Mend

demultiplexer 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 68481bbde8b232b733c63ee1e34f2751a4df922a
-  data.tar.gz: 192a74882e331612656283c2886765098cc6a2d6
+  metadata.gz: 9fd3a1561408ab33687de0b73f424873850cb55e
+  data.tar.gz: f57280b5897b453d43b0de72ecd72414ed4ac428
 SHA512:
-  metadata.gz: 39b6727cebb26c01b30c7ce4cc5c0907c25a462093e18172c24aa6d592f6efbcacb78cdca07f14f9ba73c776eb5aaeba6349dde53c08f68d3b87a64a9d4c68b6
-  data.tar.gz: 75fb912a9facc062e348b80afec84a3aed886178b6279e99dd4edf0dc2d5624944f60e4c01eba840d2b5ec46a837033146d83e46e53d497be4eedff7d2e0d830
+  metadata.gz: 4db2e0c15013c7a03ae483d0595b87feb147ff7284f2ae3541f0fe21c7b3e57e86d25cb4c1edd71e12042f5cce58f6c8773968d756f4d9812a20886205d71475
+  data.tar.gz: 1cfea327fdda5aa1d7f263d9b6f2e43890f4fcdcdac5c1380964c62cc39918e84b9121bfa0724625601a71f65202933066229b2abd267203b01398a231b36118

data/Rakefile CHANGED Viewed

@@ -4,14 +4,17 @@ require 'rake/testtask'
 Bundler::GemHelper.install_tasks
-task :default => "test"
+task default: 'test'
 Rake::TestTask.new do |t|
-  t.test_files = Dir['test/*'].select { |f| File.basename(f).match(/^test_.+\.rb$/) }
+  t.test_files = Dir['test/*'].select do |file|
+    File.basename(file).match(/^test_.+\.rb$/)
+  end
   t.warning    = true
 end
-desc "Add or update rdoc"
+desc 'Add or update rdoc'
 task :doc do
   `rdoc lib/`
 end

data/bin/demultiplexer CHANGED Viewed

@@ -1,5 +1,8 @@
 #!/usr/bin/env ruby
+require 'optparse'
+require 'demultiplexer'
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
 #                                                                              #
 # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk).                #
@@ -23,56 +26,56 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
-USAGE = <<USAGE
-  This program demultiplexes Illumina Paired data given a samples file and four
-  FASTQ files containing forward and reverse index data and forward and reverse
-  read data.
-  The samples file consists of three tab-separated columns: sample_id, forward
-  index, reverse index).
-  The FASTQ files are generated by the Illumina MiSeq instrument by adding the
-  following key:
-    <add key="CreateFastqForIndexReads" value="1">
-  To the `MiSeq Reporter.exe.config` file located in the `MiSeq Reporter`
-  installation folder, `C:\\Illumina\\MiSeqReporter` and restarting the
-  `MiSeq Reporter` service. See the MiSeq Reporter User Guide page 29:
-  http://support.illumina.com/downloads/miseq_reporter_user_guide_15042295.html
-  Thus Basecalling using a SampleSheet.csv containing a single entry `Data` with
-  no index information will generate the following files:
-    Data_S1_L001_I1_001.fastq.gz
-    Data_S1_L001_I2_001.fastq.gz
-    Data_S1_L001_R1_001.fastq.gz
-    Data_S1_L001_R2_001.fastq.gz
-    Undetermined_S0_L001_I1_001.fastq.gz
-    Undetermined_S0_L001_I2_001.fastq.gz
-    Undetermined_S0_L001_R1_001.fastq.gz
-    Undetermined_S0_L001_R2_001.fastq.gz
-  Demultiplexing will generate file pairs according to the sample information
-  in the samples file and input file suffix, one pair per sample, and these
-  will be output to the output directory. Also a file pair with undetermined
-  reads are created where the index sequence is appended to the sequence name.
-  It is possible to allow up to three mismatches per index. Also, read pairs are
-  filtered if either of the indexes have a mean quality score below a given
-  threshold or any single position in the index have a quality score below a
-  given theshold.
-  Finally, a log file `Demultiplex.log` is output containing the status of the
-  demultiplexing process along with a list of the samples ids and unique index1
-  and index2 sequences.
-  Usage: #{File.basename(__FILE__)} [options] <FASTQ files>
-  Example: #{File.basename(__FILE__)} -m samples.tsv Data*.fastq.gz
-  Options:
+USAGE = <<USAGE.gsub(/^\s+\|/, '')
+  |This program demultiplexes Illumina Paired data given a samples file and four
+  |FASTQ files containing forward and reverse index data and forward and reverse
+  |read data.
+  |The samples file consists of three tab-separated columns: sample_id, forward
+  |index, reverse index).
+  |The FASTQ files are generated by the Illumina MiSeq instrument by adding the
+  |following key:
+  |  <add key="CreateFastqForIndexReads" value="1">
+  |To the `MiSeq Reporter.exe.config` file located in the `MiSeq Reporter`
+  |installation folder, `C:\\Illumina\\MiSeqReporter` and restarting the `MiSeq
+  |Reporter` service. See the MiSeq Reporter User Guide page 29:
+  |http://support.illumina.com/downloads/miseq_reporter_user_guide_15042295.html
+  |Thus Basecalling using a SampleSheet.csv containing a single entry `Data`
+  |with no index information will generate the following files:
+  |  Data_S1_L001_I1_001.fastq.gz
+  |  Data_S1_L001_I2_001.fastq.gz
+  |  Data_S1_L001_R1_001.fastq.gz
+  |  Data_S1_L001_R2_001.fastq.gz
+  |  Undetermined_S0_L001_I1_001.fastq.gz
+  |  Undetermined_S0_L001_I2_001.fastq.gz
+  |  Undetermined_S0_L001_R1_001.fastq.gz
+  |  Undetermined_S0_L001_R2_001.fastq.gz
+  |
+  |Demultiplexing will generate file pairs according to the sample information
+  |in the samples file and input file suffix, one pair per sample, and these
+  |will be output to the output directory. Also a file pair with undetermined
+  |reads are created where the index sequence is appended to the sequence name.
+  |
+  |It is possible to allow up to three mismatches per index. Also, read pairs
+  |are filtered if either of the indexes have a mean quality score below a given
+  |threshold or any single position in the index have a quality score below a
+  |given theshold.
+  |
+  |Finally, a log file `Demultiplex.log` is output containing the status of the
+  |demultiplexing process along with a list of the samples ids and unique index1
+  |and index2 sequences.
+  |
+  |Usage: #{File.basename(__FILE__)} [options] <FASTQ files>
+  |
+  |Example: #{File.basename(__FILE__)} -m samples.tsv Data*.fastq.gz
+  |
+  |Options:
 USAGE
 DEFAULT_SCORE_MIN  = 16
@@ -95,8 +98,8 @@ OptionParser.new do |opts|
     options[:samples_file] = o
   end
-  opts.on('-m', '--mismatches_max <uint>', Integer, "Maximum mismatches_max \
-    allowed (default=#{DEFAULT_MISMATCHES})") do |o|
+  opts.on('-m', '--mismatches_max <uint>', Integer, 'Maximum mismatches_max ',
+          "allowed (default=#{DEFAULT_MISMATCHES})") do |o|
     options[:mismatches_max] = o
   end
@@ -108,14 +111,17 @@ OptionParser.new do |opts|
     options[:revcomp_index2] = o
   end
-  opts.on('--scores_min <uint>', Integer, "Drop reads if a single position in \
-    the index have a quality score below scores_min \
-    (default=#{DEFAULT_SCORE_MIN})") do |o|
+  opts.on('--scores_min <uint>', Integer, 'Drop reads if a single position in ',
+          'the index have a quality score ',
+          'below scores_min (default= ' \
+          "#{DEFAULT_SCORE_MIN})") do |o|
     options[:scores_min] = o
   end
-  opts.on('--scores_mean <uint>', Integer, "Drop reads if the mean index \
-    quality score is below scores_mean (default=#{DEFAULT_SCORE_MEAN})") do |o|
+  opts.on('--scores_mean <uint>', Integer, 'Drop reads if the mean index',
+          'quality score is below ',
+          'scores_mean (default= ' \
+          "#{DEFAULT_SCORE_MEAN})") do |o|
     options[:scores_mean] = o
   end
@@ -123,8 +129,10 @@ OptionParser.new do |opts|
     options[:output_dir] = o
   end
-  opts.on('-c', '--compress <gzip|bzip2>', String, 'Compress output using \
-    gzip or bzip2 (default=<no compression>)') do |o|
+  opts.on('-c', '--compress <gzip|bzip2>', String, 'Compress output using ' \
+          'gzip or bzip2 ',
+          '(default=' \
+          '<no compression>)') do |o|
     options[:compress] = o.to_sym
   end

data/demultiplexer.gemspec CHANGED Viewed

@@ -1,4 +1,4 @@
-$:.push File.expand_path("../lib", __FILE__)
+$LOAD_PATH.push File.expand_path('../lib', __FILE__)
 require 'demultiplexer/version'
@@ -6,21 +6,21 @@ Gem::Specification.new do |s|
   s.name              = 'demultiplexer'
   s.version           = Demultiplexer::VERSION
   s.platform          = Gem::Platform::RUBY
-  s.date              = Time.now.strftime("%F")
-  s.summary           = "Demultiplexer"
-  s.description       = "Demultiplex sequences from the Illumina platform."
-  s.authors           = ["Martin A. Hansen"]
+  s.date              = Time.now.strftime('%F')
+  s.summary           = 'Demultiplexer'
+  s.description       = 'Demultiplex sequences from the Illumina platform.'
+  s.authors           = ['Martin A. Hansen']
   s.email             = 'mail@maasha.dk'
-  s.rubyforge_project = "demultiplexer"
+  s.rubyforge_project = 'demultiplexer'
   s.homepage          = 'http://github.com/maasha/demultiplexer'
   s.license           = 'GPL2'
-  s.rubygems_version  = "2.0.0"
+  s.rubygems_version  = '2.0.0'
   s.files             = `git ls-files`.split("\n")
   s.test_files        = `git ls-files -- {test,spec,features}/*`.split("\n")
-  s.require_paths     = ["lib"]
+  s.require_paths     = ['lib']
-  s.add_dependency("biopieces",             ">= 0.4.1")
-  s.add_dependency("google_hash",           ">= 0.8.4")
-  s.add_development_dependency("bundler",   ">= 1.7.4")
-  s.add_development_dependency("simplecov", ">= 0.9.2")
+  s.add_dependency('biopieces',             '>= 0.4.1')
+  s.add_dependency('google_hash',           '>= 0.8.4')
+  s.add_development_dependency('bundler',   '>= 1.7.4')
+  s.add_development_dependency('simplecov', '>= 0.9.2')
 end

data/lib/data_io.rb CHANGED Viewed

@@ -21,40 +21,126 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+# Error class for all errors to do with DataIO.
+DataIOError = Class.new(StandardError)
 # Class containing methods for reading and write FASTQ data files.
 class DataIO
+  # Internal: Constructor method for DataIO objects.
+  #
+  # samples     - Array with Sample objects consisting id, index1 and index2
+  # fastq_files - Array of Strings with FASTQ file names of multiplexed data.
+  # compress    - Symbol indicating if output data should be compressed with
+  #               either gzip or bzip2.
+  # output_dir  - String with path of output directory.
+  #
+  # Returns DataIO object.
   def initialize(samples, fastq_files, compress, output_dir)
-    @samples      = samples
-    @compress     = compress
-    @output_dir   = output_dir
-    @suffix1      = extract_suffix(fastq_files.grep(/_R1_/).first)
-    @suffix2      = extract_suffix(fastq_files.grep(/_R2_/).first)
-    @input_files  = identify_input_files(fastq_files)
-    @undetermined = @samples.size + 1
-    @file_hash    = nil
+    @samples         = samples
+    @compress        = compress
+    @output_dir      = output_dir
+    @suffix1         = extract_suffix(fastq_files, '_R1_')
+    @suffix2         = extract_suffix(fastq_files, '_R2_')
+    @input_files     = identify_input_files(fastq_files)
+    @undetermined    = @samples.size
+    @output_file_ios = nil
+  end
+  # Internal: Method that opens the @input_files for reading.
+  #
+  # input_files - Array with input file paths.
+  #
+  # Returns an Array with IO objects (file handles).
+  def open_input_files
+    @input_file_ios = []
+    @input_files.each do |input_file|
+      @input_file_ios << BioPieces::Fastq.open(input_file)
+    end
+    yield self
+  ensure
+    close_input_files
+  end
+  # Internal: Method that opens the output files for writing.
+  #
+  # Yields a Hash with an incrementing index as keys, and a tuple of file
+  # handles as values.
+  def open_output_files
+    @output_file_ios = {}
+    comp             = @compress
+    @output_file_ios.merge!(open_output_files_samples(comp))
+    @output_file_ios.merge!(open_output_files_undet(comp))
+    yield self
+  ensure
+    close_output_files
+  end
+  # Internal: Method that reads a Seq entry from each of the file handles in
+  # the @input_file_ios Array. Iteration stops when no more Seq entries are
+  # found.
+  #
+  # Yields an Array with 4 Seq objects.
+  #
+  # Returns nothing
+  def each
+    loop do
+      entries = @input_file_ios.each_with_object([]) do |e, a|
+        a << e.next_entry
+      end
+      break if entries.compact.size != 4
+      yield entries
+    end
+  end
+  # Internal: Getter method that returns a tuple of file handles from
+  # @output_file_ios when given a sample index key.
+  #
+  # key - Sample index Integer key used for lookup.
+  #
+  # Returns Array with a tuple of IO objects.
+  def [](key)
+    @output_file_ios[key]
   end
-  # Method that extracts the Sample, Lane, Region information from a given file.
+  private
+  # Internal: Method that extracts the Sample, Lane, Region information from
+  # given files.
   #
-  # file - String with file name.
+  # files   - Array with FASTQ file names as Strings.
+  # pattern - String with pattern to use for matching file names.
   #
   # Examples
   #
-  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz")
+  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz", "_R1_")
   #   # => "_S1_L001_R1_001"
   #
   # Returns String with SLR info.
-  def extract_suffix(file)
-    if file =~ /.+(_S\d_L\d{3}_R[12]_\d{3}).+$/
+  # Raises unless pattern match exactly 1 file.
+  # Raises unless SLR info can be parsed.
+  def extract_suffix(files, pattern)
+    hits = files.grep(Regexp.new(pattern))
+    unless hits.size == 1
+      fail DataIOError, "Expecting exactly 1 hit but got: #{hits.size}"
+    end
+    if hits.first =~ /.+(_S\d_L\d{3}_R[12]_\d{3}).+$/
       slr = Regexp.last_match(1)
     else
-      fail "Unable to parse file SLR from: #{file}"
+      fail DataIOError, "Unable to parse file SLR from: #{hits.first}"
     end
     append_suffix(slr)
   end
-  # Method that appends a file suffix to a given Sample, Lane, Region
+  # Internal: Method that appends a file suffix to a given Sample, Lane, Region
   # information String based on the @options[:compress] option. The
   # file suffix can be either ".fastq.gz", ".fastq.bz2", or ".fastq".
   #
@@ -79,14 +165,15 @@ class DataIO
     slr
   end
-  # Method identify the different input files from a given Array of FASTQ files.
-  # The forward index file contains a _I1_, the reverse index file contains a
-  # _I2_, the forward read file contains a _R1_ and finally, the reverse read
-  # file contain a _R2_.
+  # Internal: Method identify the different input files from a given Array of
+  # FASTQ files. The forward index file contains a _I1_, the reverse index file
+  # contains a _I2_, the forward read file contains a _R1_ and finally, the
+  # reverse read file contain a _R2_.
   #
   # fastq_files - Array with FASTQ files (Strings).
   #
   # Returns an Array with input files (Strings).
+  # Raises unless 4 input_files are found.
   def identify_input_files(fastq_files)
     input_files = []
@@ -95,113 +182,62 @@ class DataIO
     input_files << fastq_files.grep(/_R1_/).first
     input_files << fastq_files.grep(/_R2_/).first
-    input_files
-  end
-  # Method that opens the @input_files for reading.
-  #
-  # input_files - Array with input file paths.
-  #
-  # Returns an Array with IO objects (file handles).
-  def open_input_files
-    @file_ios = []
-    @input_files.each do |input_file|
-      @file_ios << BioPieces::Fastq.open(input_file)
-    end
-    yield self
-  ensure
-    close_input_files
-  end
-  # Method that closes open input files.
-  #
-  # Returns nothing.
-  def close_input_files
-    @file_ios.map(&:close)
-  end
-  # Method that reads a Seq entry from each of the file handles in the
-  # @file_ios Array. Iteration stops when no more Seq entries are found.
-  #
-  # Yields an Array with 4 Seq objects.
-  #
-  # Returns nothing
-  def each
-    loop do
-      entries = @file_ios.each_with_object([]) { |e, a| a << e.next_entry }
-      break if entries.compact.size != 4
-      yield entries
+    unless input_files.compact.size == 4
+      fail DataIOError, 'Expecting exactly 4 input_files but got: ' \
+                        "#{input_files.compact.size}"
     end
-  end
-  # Method that opens the output files for writing.
-  #
-  # Yeilds a Hash with an incrementing index as keys, and a tuple of file
-  # handles as values.
-  def open_output_files
-    @file_hash = {}
-    comp       = @compress
-    @file_hash.merge!(open_output_files_samples(comp))
-    @file_hash.merge!(open_output_files_undet(comp))
-    yield self
-  ensure
-    close_output_files
-  end
-  def close_output_files
-    @file_hash.each_value { |value| value.map(&:close) }
-  end
-  # Getter method that returns a tuple of file handles from @file_hash when
-  # given a key.
-  #
-  # key - Key used to lookup
-  #
-  # Returns Array with a tuple of IO objects.
-  def [](key)
-    @file_hash[key]
+    input_files
   end
-  # Method that opens the sample output files for writing.
+  # Internal: Method that opens the sample output files for writing.
   #
   # comp - Symbol with type of output compression.
   #
   # Returns a Hash with an incrementing index as keys, and a tuple of file
   # handles as values.
   def open_output_files_samples(comp)
-    file_hash = {}
+    output_file_ios = {}
     @samples.each_with_index do |sample, i|
       file_forward = File.join(@output_dir, "#{sample.id}#{@suffix1}")
       file_reverse = File.join(@output_dir, "#{sample.id}#{@suffix2}")
       io_forward   = BioPieces::Fastq.open(file_forward, 'w', compress: comp)
       io_reverse   = BioPieces::Fastq.open(file_reverse, 'w', compress: comp)
-      file_hash[i] = [io_forward, io_reverse]
+      output_file_ios[i] = [io_forward, io_reverse]
     end
-    file_hash
+    output_file_ios
   end
-  # Method that opens the undertermined output files for writing.
+  # Internal: Method that opens the undertermined output files for writing.
   #
   # comp - Symbol with type of output compression.
   #
   # Returns a Hash with an incrementing index as keys, and a tuple of file
   # handles as values.
   def open_output_files_undet(comp)
-    file_hash    = {}
+    output_file_ios    = {}
     file_forward = File.join(@output_dir, "Undetermined#{@suffix1}")
     file_reverse = File.join(@output_dir, "Undetermined#{@suffix2}")
     io_forward   = BioPieces::Fastq.open(file_forward, 'w', compress: comp)
     io_reverse   = BioPieces::Fastq.open(file_reverse, 'w', compress: comp)
-    file_hash[@undetermined] = [io_forward, io_reverse]
+    output_file_ios[@undetermined] = [io_forward, io_reverse]
-    file_hash
+    output_file_ios
+  end
+  # Internal: Method that closes open input files.
+  #
+  # Returns nothing.
+  def close_input_files
+    @input_file_ios.map(&:close)
+  end
+  # Internal: Method that closes the file handles stored in @output_file_ios.
+  #
+  # Returns nothing.
+  def close_output_files
+    @output_file_ios.each_value { |value| value.map(&:close) }
   end
 end

data/lib/demultiplexer/version.rb CHANGED Viewed

@@ -21,6 +21,7 @@
 #                                                                              #
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
+# Adding VERSION constant to class.
 class Demultiplexer
-  VERSION = "0.0.1"
+  VERSION = '0.1.0'
 end