demultiplexer 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/demultiplexer.rb CHANGED
@@ -21,8 +21,22 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'google_hash'
25
+ require 'status'
26
+ require 'sample_reader'
27
+ require 'index_builder'
28
+ require 'data_io'
29
+
24
30
  # Class containing methods for demultiplexing MiSeq sequences.
25
31
  class Demultiplexer
32
+ DEFAULT = { verbose: false,
33
+ mismatches_max: 0,
34
+ revcomp_index1: false,
35
+ revcomp_index2: false,
36
+ scores_min: 16,
37
+ scores_mean: 16
38
+ }
39
+
26
40
  attr_reader :status
27
41
 
28
42
  # Public: Class method to run demultiplexing of MiSeq sequences.
@@ -55,6 +69,7 @@ class Demultiplexer
55
69
  #
56
70
  # Returns Demultiplexer object
57
71
  def self.run(fastq_files, options)
72
+ options = DEFAULT.merge(options)
58
73
  log_file = File.join(options[:output_dir], 'Demultiplex.log')
59
74
  demultiplexer = new(fastq_files, options)
60
75
  Screen.clear if options[:verbose]
@@ -63,7 +78,7 @@ class Demultiplexer
63
78
  demultiplexer.status.save(log_file)
64
79
  end
65
80
 
66
- # Constructor method for Demultiplexer object.
81
+ # Internal: Constructor method for Demultiplexer object.
67
82
  #
68
83
  # fastq_files - Array with paths to FASTQ files.
69
84
  # options - Options Hash.
@@ -91,14 +106,14 @@ class Demultiplexer
91
106
  @samples = SampleReader.read(options[:samples_file],
92
107
  options[:revcomp_index1],
93
108
  options[:revcomp_index2])
94
- @undetermined = @samples.size + 1
109
+ @undetermined = @samples.size
95
110
  @index_hash = IndexBuilder.build(@samples, options[:mismatches_max])
96
111
  @data_io = DataIO.new(@samples, fastq_files, options[:compress],
97
112
  options[:output_dir])
98
- @status = Status.new
113
+ @status = Status.new(@samples)
99
114
  end
100
115
 
101
- # Method to demultiplex reads according the index. This is done by
116
+ # Internal: Method to demultiplex reads according to the index. This is done by
102
117
  # simultaneously read-opening all input files (forward and reverse index
103
118
  # files and forward and reverse read files) and read one entry from each.
104
119
  # Such four entries we call a set of entries. If the quality scores from
@@ -130,11 +145,11 @@ class Demultiplexer
130
145
 
131
146
  private
132
147
 
133
- # Method that matches the combined index1 and index2 sequences against the
134
- # search index. In case of a match the reads are written to file according to
135
- # the information in the search index, otherwise the reads will have thier
136
- # names appended with the index sequences and they will be written to the
137
- # Undetermined files.
148
+ # Internal: Method that matches the combined index1 and index2 sequences
149
+ # against the search index. In case of a match the reads are written to file
150
+ # according to the information in the search index, otherwise the reads will
151
+ # have their names appended with the index sequences and they will be written
152
+ # to the Undetermined files.
138
153
  #
139
154
  # ios_out - DataIO object with an accessor method for file output handles.
140
155
  # index1 - Seq object with index1.
@@ -144,15 +159,17 @@ class Demultiplexer
144
159
  #
145
160
  # Returns nothing.
146
161
  def match_index(ios_out, index1, index2, read1, read2)
147
- if (sample_id = @index_hash["#{index1.seq}#{index2.seq}".hash])
162
+ key = "#{index1.seq.upcase}#{index2.seq.upcase}".hash
163
+
164
+ if (sample_id = @index_hash[key])
148
165
  write_match(ios_out, sample_id, read1, read2)
149
166
  else
150
167
  write_undetermined(ios_out, index1, index2, read1, read2)
151
168
  end
152
169
  end
153
170
 
154
- # Method that writes a index match to file according to the information in
155
- # the search index.
171
+ # Internal: Method that writes an index match to file according to the
172
+ # information in the search index.
156
173
  #
157
174
  # ios_out - DataIO object with an accessor method for file output handles.
158
175
  # read1 - Seq object with read1.
@@ -167,8 +184,8 @@ class Demultiplexer
167
184
  io_reverse.puts read2.to_fastq
168
185
  end
169
186
 
170
- # Method that appends the read names with the index sequences and writes
171
- # the reads to the Undetermined files.
187
+ # Internal: Method that appends the read names with the index sequences and
188
+ # writes the reads to the Undetermined files.
172
189
  #
173
190
  # ios_out - DataIO object with an accessor method for file output handles.
174
191
  # index1 - Seq object with index1.
@@ -187,7 +204,7 @@ class Demultiplexer
187
204
  io_reverse.puts read2.to_fastq
188
205
  end
189
206
 
190
- # Method to check the quality scores of the given indexes.
207
+ # Internal: Method to check the quality scores of the given indexes.
191
208
  # If the mean score is higher than @options[:scores_mean] or
192
209
  # if the min score is higher than @options[:scores_min] then
193
210
  # the indexes are OK.
@@ -201,7 +218,7 @@ class Demultiplexer
201
218
  index_qual_min_ok?(index1, index2)
202
219
  end
203
220
 
204
- # Method to check the mean quality scores of the given indexes.
221
+ # Internal: Method to check the mean quality scores of the given indexes.
205
222
  # If the mean score is higher than @options[:scores_mean] the
206
223
  # indexes are OK.
207
224
  #
@@ -221,7 +238,7 @@ class Demultiplexer
221
238
  true
222
239
  end
223
240
 
224
- # Method to check the min quality scores of the given indexes.
241
+ # Internal: Method to check the min quality scores of the given indexes.
225
242
  # If the min score is higher than @options[:scores_min] the
226
243
  # indexes are OK.
227
244
  #
@@ -240,24 +257,4 @@ class Demultiplexer
240
257
 
241
258
  true
242
259
  end
243
-
244
- # Method that iterates over @samples and compiles a sorted Array with all
245
- # unique index1 sequences.
246
- #
247
- # Returns Array with uniq index1 sequences.
248
- def uniq_index1
249
- @status.index1 = @samples.each_with_object(SortedSet.new) do |a, e|
250
- a << e.index1
251
- end.to_a
252
- end
253
-
254
- # Method that iterates over @samples and compiles a sorted Array with all
255
- # unique index2 sequences.
256
- #
257
- # Returns Array with uniq index2 sequences.
258
- def uniq_index2
259
- @status.index2 = @samples.each_with_object(SortedSet.new) do |a, e|
260
- a << e.index2
261
- end.to_a
262
- end
263
260
  end
data/lib/index_builder.rb CHANGED
@@ -21,9 +21,17 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ # Class for IndexBuilder errors.
25
+ IndexBuilderError = Class.new(StandardError)
26
+
24
27
  # Class containing methods for building a search index.
25
28
  class IndexBuilder
26
- # Class method that build a search index from a given Array of samples.
29
+ # Internal: Class method that builds a search index from a given Array of
30
+ # samples. The index consists of a Google Hash, which doesn't have Ruby's
31
+ # garbage collection and therefore is much more efficient. The Hash keys
32
+ # consist of index1 and index2 concatenated, and furthermore, if
33
+ # mismatches_max is given index1, and index2 are permutated accordingly.
34
+ # The Hash values are the sample number.
27
35
  #
28
36
  # samples - Array of samples (Sample objects with id, index1 and index2).
29
37
  #
@@ -32,15 +40,16 @@ class IndexBuilder
32
40
  # IndexBuilder.build(samples)
33
41
  # # => <Google Hash>
34
42
  #
35
- # Returns a Google Hash where the key is the index and the value is the TODO
43
+ # Returns a Google Hash where the key is the index and the value is sample
44
+ # number.
36
45
  def self.build(samples, mismatches_max)
37
46
  index_builder = new(samples, mismatches_max)
38
47
  index_hash = index_builder.index_init
39
48
  index_builder.index_populate(index_hash)
40
49
  end
41
50
 
42
- # Constructor method for IndexBuilder object. The given Array of samples and
43
- # mismatches_max are saved as an instance variable.
51
+ # Internal: Constructor method for IndexBuilder object. The given Array of
52
+ # samples and mismatches_max are saved as an instance variable.
44
53
  #
45
54
  # samples - Array of Sample objects.
46
55
  # mismatches_max - Integer denoting the maximum number of misses allowed in
@@ -57,7 +66,7 @@ class IndexBuilder
57
66
  @mismatches_max = mismatches_max
58
67
  end
59
68
 
60
- # Method to initialize the index. If @mismatches_max is <= then
69
+ # Internal: Method to initialize the index. If @mismatches_max is <= then
61
70
  # GoogleHashSparseLongToInt is used else GoogleHashDenseLongToInt due to
62
71
  # memory and performance.
63
72
  #
@@ -72,7 +81,7 @@ class IndexBuilder
72
81
  index_hash
73
82
  end
74
83
 
75
- # Method to populate the index.
84
+ # Internal: Method to populate the index.
76
85
  #
77
86
  # index_hash - Google Hash with initialized index.
78
87
  #
@@ -82,12 +91,10 @@ class IndexBuilder
82
91
  index_list1 = permutate([sample.index1], @mismatches_max)
83
92
  index_list2 = permutate([sample.index2], @mismatches_max)
84
93
 
85
- # index_check_list_sizes(index_list1, index_list2)
86
-
87
94
  index_list1.product(index_list2).each do |index1, index2|
88
95
  key = "#{index1}#{index2}".hash
89
96
 
90
- index_check_existing(index_hash, key)
97
+ index_check_existing(index_hash, key, sample, index1, index2)
91
98
 
92
99
  index_hash[key] = i
93
100
  end
@@ -98,37 +105,26 @@ class IndexBuilder
98
105
 
99
106
  private
100
107
 
101
- # Method to check if two index lists differ in size, if so an exception is
102
- # raised.
103
- #
104
- # index_list1 - Array with index1
105
- # index_list2 - Array with index2
106
- #
107
- # Returns nothing.
108
- def index_check_list_sizes(index_list1, index_list2)
109
- return if index_list1.size == index_list2.size
110
-
111
- fail "Permutated list sizes differ: \
112
- #{index_list1.size} != #{index_list2.size}"
113
- end
114
-
115
- # Method to check if a index key already exists in the index, and if so an
116
- # exception is raised.
108
+ # Internal: Method to check if an index key already exists in the index, and
109
+ # if so an exception is raised.
117
110
  #
118
111
  # index_hash - Google Hash with index
119
112
  # key - Integer from Google Hash's #hash method
113
+ # sample - Sample object whose index to check.
114
+ # index1 - String with index1 sequence.
115
+ # index2 - String with index2 sequence.
120
116
  #
121
117
  # Returns nothing.
122
- def index_check_existing(index_hash, key)
118
+ def index_check_existing(index_hash, key, sample, index1, index2)
123
119
  return unless index_hash[key]
124
120
 
125
- fail "Index combo of #{index1} and #{index2} already exists for \
126
- sample id: #{@samples[index_hash[key]].id} and #{sample.id}"
121
+ fail IndexBuilderError, "Index combo of #{index1} and #{index2} already \
122
+ exists for sample id: #{@samples[index_hash[key]].id} and #{sample.id}"
127
123
  end
128
124
 
129
- # Method that for each word in a given Array of word permutates each word a
130
- # given number (permuate) of times using a given alphabet, such that an Array
131
- # of words with all possible combinations is returned.
125
+ # Internal: Method that for each word in a given Array of word permutates
126
+ # each word a given number (permuate) of times using a given alphabet, such
127
+ # that an Array of words with all possible combinations is returned.
132
128
  #
133
129
  # list - Array of words (Strings) to permutate.
134
130
  # permuate - Number of permutations (Integer).
@@ -155,8 +151,8 @@ class IndexBuilder
155
151
  list
156
152
  end
157
153
 
158
- # Method that permutates a given word using a given alphabet, such that an
159
- # Array of words with all possible combinations is returned.
154
+ # Internal: Method that permutates a given word using a given alphabet, such
155
+ # that an Array of words with all possible combinations is returned.
160
156
  #
161
157
  # word - String with word to permutate.
162
158
  # alphabet - String with alphabet used for permutation.
data/lib/sample_reader.rb CHANGED
@@ -21,11 +21,19 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'csv'
25
+ require 'set'
26
+ require 'biopieces'
27
+
28
+ # Class for all SampleReader errors.
29
+ SampleReaderError = Class.new(StandardError)
30
+
24
31
  # Class containing methods for reading and checking sample information.
25
32
  class SampleReader
26
- # Class method that reads sample information from a samples file, which
27
- # consists of ASCII text in three tab separated columns: The first column is
28
- # the sample_id, the second column is index1 and the third column is index2.
33
+ # Internal: Class method that reads sample information from a samples file,
34
+ # which consists of ASCII text in three tab separated columns: The first
35
+ # column is the sample_id, the second column is index1 and the third column is
36
+ # index2.
29
37
  #
30
38
  # If revcomp1 or revcomp2 is set then index1 and index2 are
31
39
  # reverse-complemented accordingly.
@@ -45,8 +53,8 @@ class SampleReader
45
53
  sample_reader.samples_parse(file)
46
54
  end
47
55
 
48
- # Constructor method for SampleReader object. The given revcomp1 and revcomp2
49
- # flags are stored as instance variables.
56
+ # Internal: Constructor method for SampleReader object. The given revcomp1 and
57
+ # revcomp2 flags are stored as instance variables.
50
58
  #
51
59
  # revcomp1 - Flag indicating that index1 should be reverse-complemented.
52
60
  # revcomp2 - Flag indicating that index2 should be reverse-complemented.
@@ -62,9 +70,9 @@ class SampleReader
62
70
  @revcomp2 = revcomp2
63
71
  end
64
72
 
65
- # Method that reads sample information from a samples file, which consists
66
- # of ASCII text in three tab separated columns: The first column is the
67
- # sample_id, the second column is index1 and the third column is index2.
73
+ # Internal: Method that reads sample information from a samples file, which
74
+ # consists of ASCII text in three tab separated columns: The first column is
75
+ # the sample_id, the second column is index1 and the third column is index2.
68
76
  #
69
77
  # file - String with path to sample file.
70
78
  #
@@ -82,8 +90,8 @@ class SampleReader
82
90
  errors.push(*samples_check_uniq_id(samples))
83
91
 
84
92
  unless errors.empty?
85
- pp errors
86
- fail 'errors found in sample file.'
93
+ warn errors
94
+ fail SampleReaderError, 'errors found in sample file.'
87
95
  end
88
96
 
89
97
  samples
@@ -91,9 +99,9 @@ class SampleReader
91
99
 
92
100
  private
93
101
 
94
- # Method that reads sample information form a samples file, which consists
95
- # of ASCII text in three tab separated columns: The first column is the
96
- # sample_id, the second column is index1 and the third column is index2.
102
+ # Internal: Method that reads sample information from a samples file, which
103
+ # consists of ASCII text in three tab separated columns: The first column is
104
+ # the sample_id, the second column is index1 and the third column is index2.
97
105
  #
98
106
  # If @options[:revcomp_index1] or @options[:revcomp_index2] is set then
99
107
  # index1 and index2 are reverse-complemented accordingly.
@@ -110,14 +118,16 @@ class SampleReader
110
118
  samples = []
111
119
 
112
120
  CSV.read(file, col_sep: "\t").each do |id, index1, index2|
113
- samples << Sample.new(id, index1, index2)
121
+ next if id[0] == '#'
122
+
123
+ samples << Sample.new(id, index1.upcase, index2.upcase)
114
124
  end
115
125
 
116
126
  samples
117
127
  end
118
128
 
119
- # Method that iterates over the a given Array of sample Objects, and if
120
- # @options[:revcomp_index1] or @options[:revcomp_index2] is set then
129
+ # Internal: Method that iterates over a given Array of sample Objects,
130
+ # and if @options[:revcomp_index1] or @options[:revcomp_index2] is set then
121
131
  # index1 and index2 are reverse-complemented accordingly.
122
132
  #
123
133
  # samples - Array of Sample objects.
@@ -139,9 +149,9 @@ class SampleReader
139
149
  BioPieces::Seq.new(seq: index, type: :dna).reverse.complement.seq
140
150
  end
141
151
 
142
- # Method that iterates over the a given Array of sample Objects, and if
143
- # the combination of index1 and index2 is non-unique an error is pushed
144
- # on an error Array.
152
+ # Internal: Method that iterates over a given Array of sample Objects,
153
+ # and if the combination of index1 and index2 is non-unique an error is
154
+ # pushed on an error Array.
145
155
  #
146
156
  # samples - Array of Sample objects.
147
157
  #
@@ -161,8 +171,8 @@ class SampleReader
161
171
  errors
162
172
  end
163
173
 
164
- # Method that iterates over the a given Array of sample Objects, and if
165
- # a sample id is non-unique an error is pushed on an error Array.
174
+ # Internal: Method that iterates over a given Array of sample Objects,
175
+ # and if a sample id is non-unique an error is pushed on an error Array.
166
176
  #
167
177
  # samples - Array of Sample objects.
168
178
  #
@@ -182,7 +192,7 @@ class SampleReader
182
192
  errors
183
193
  end
184
194
 
185
- # Struct for holding sample information.
195
+ # Internal: Struct for holding sample information.
186
196
  #
187
197
  # id - Sample id.
188
198
  # index1 - Index1 sequence.
@@ -194,5 +204,17 @@ class SampleReader
194
204
  # # => <Sample>
195
205
  #
196
206
  # Returns Sample object.
197
- Sample = Struct.new(:id, :index1, :index2)
207
+ Sample = Struct.new(:id, :index1, :index2) do
208
+ # Internal: Method that returns a String representation of a Sample object.
209
+ #
210
+ # Examples
211
+ #
212
+ # Sample.to_s
213
+ # # => "test\tATCG\tTCGA"
214
+ #
215
+ # Returns a String with the values joined by "\t".
216
+ def to_s
217
+ [id, index1, index2].join("\t")
218
+ end
219
+ end
198
220
  end
data/lib/status.rb CHANGED
@@ -25,8 +25,8 @@
25
25
  class Status
26
26
  attr_accessor :count, :match, :undetermined, :index1_bad_mean,
27
27
  :index2_bad_mean, :index1_bad_min, :index2_bad_min
28
- # Method to initialize a Status object, which contains the following instance
29
- # variables initialized to 0:
28
+ # Internal: Constructor method to initialize a Status object, which contains
29
+ # the following instance variables initialized to 0:
30
30
  #
31
31
  # @count - Number of reads.
32
32
  # @match - Number of reads found in index.
@@ -36,13 +36,16 @@ class Status
36
36
  # @index1_bad_min - Number of reads dropped due to bad min in index1.
37
37
  # @index2_bad_min - Number of reads dropped due to bad min in index2.
38
38
  #
39
+ # samples - Array of Sample objects.
40
+ #
39
41
  # Examples
40
42
  #
41
- # Status.new
43
+ # Status.new(samples)
42
44
  # # => <Status>
43
45
  #
44
46
  # Returns a Status object.
45
- def initialize
47
+ def initialize(samples)
48
+ @samples = samples
46
49
  @count = 0
47
50
  @match = 0
48
51
  @undetermined = 0
@@ -53,8 +56,9 @@ class Status
53
56
  @time_start = Time.now
54
57
  end
55
58
 
56
- # Method to format a String from a Status object. This is done by adding the
57
- # relevant instance variables to a Hash and return this as an YAML String.
59
+ # Internal: Method to format a String from a Status object. This is done by
60
+ # adding the relevant instance variables to a Hash and return this as a YAML
61
+ # String.
58
62
  #
59
63
  # Returns a YAML String.
60
64
  def to_s
@@ -66,36 +70,59 @@ class Status
66
70
  index2_bad_mean: @index2_bad_mean,
67
71
  index1_bad_min: @index1_bad_min,
68
72
  index2_bad_min: @index2_bad_min,
69
- time: time }.to_yaml
73
+ sample_ids: @samples.map(&:id),
74
+ index1: uniq_index1,
75
+ index2: uniq_index2,
76
+ time_elapsed: time_elapsed }.to_yaml
70
77
  end
71
78
 
72
- # Method that calculate the percentage of undetermined reads.
79
+ # Internal: Method to save stats to the log file 'Demultiplex.log' in the
80
+ # output directory.
81
+ #
82
+ # Returns nothing.
83
+ def save(file)
84
+ File.open(file, 'w') do |ios|
85
+ ios.puts self
86
+ end
87
+ end
88
+
89
+ private
90
+
91
+ # Internal: Method that calculates the percentage of undetermined reads.
73
92
  #
74
93
  # Returns a Float with the percentage of undetermined reads.
75
94
  def undetermined_percent
95
+ return 0.0 if @count == 0
96
+
76
97
  (100 * @undetermined / @count.to_f).round(1)
77
98
  end
78
99
 
79
- # Method that calculates the elapsed time and formats a nice Time String.
100
+ # Internal: Method that calculates the elapsed time and formats a nice Time
101
+ # String.
80
102
  #
81
103
  # Returns String with elapsed time.
82
- def time
104
+ def time_elapsed
83
105
  time_elapsed = Time.now - @time_start
84
106
  (Time.mktime(0) + time_elapsed).strftime('%H:%M:%S')
85
107
  end
86
108
 
87
- # Method to save stats to the log file 'Demultiplex.log' in the output
88
- # directory.
109
+ # Internal: Method that iterates over @samples and compiles a sorted Array
110
+ # with all unique index1 sequences.
89
111
  #
90
- # Returns nothing.
91
- def save(file)
92
- @stats[:sample_id] = @samples.map(&:id)
93
-
94
- @stats[:index1] = uniq_index1
95
- @stats[:index2] = uniq_index2
112
+ # Returns Array with uniq index1 sequences.
113
+ def uniq_index1
114
+ @samples.each_with_object(SortedSet.new) do |e, a|
115
+ a << e.index1
116
+ end.to_a
117
+ end
96
118
 
97
- File.open(file, 'w') do |ios|
98
- ios.puts @status
99
- end
119
+ # Internal: Method that iterates over @samples and compiles a sorted Array
120
+ # with all unique index2 sequences.
121
+ #
122
+ # Returns Array with uniq index2 sequences.
123
+ def uniq_index2
124
+ @samples.each_with_object(SortedSet.new) do |e, a|
125
+ a << e.index2
126
+ end.to_a
100
127
  end
101
128
  end
data/test/helper.rb CHANGED
@@ -21,9 +21,13 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'pp'
25
+ require 'fileutils'
26
+ require 'tempfile'
24
27
  require 'demultiplexer'
25
28
  require 'test/unit'
26
29
 
30
+ # Adding stream capture methods.
27
31
  module Kernel
28
32
  def capture_stdout
29
33
  out = StringIO.new
@@ -44,6 +48,7 @@ module Kernel
44
48
  end
45
49
  end
46
50
 
51
+ # Adding custom test class method to TestCase.
47
52
  class Test::Unit::TestCase
48
53
  def self.test(desc, &impl)
49
54
  define_method("test #{desc}", &impl)