demultiplexer 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/demultiplexer.rb CHANGED
@@ -21,8 +21,22 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'google_hash'
25
+ require 'status'
26
+ require 'sample_reader'
27
+ require 'index_builder'
28
+ require 'data_io'
29
+
24
30
  # Class containing methods for demultiplexing MiSeq sequences.
25
31
  class Demultiplexer
32
+ DEFAULT = { verbose: false,
33
+ mismatches_max: 0,
34
+ revcomp_index1: false,
35
+ revcomp_index2: false,
36
+ scores_min: 16,
37
+ scores_mean: 16
38
+ }
39
+
26
40
  attr_reader :status
27
41
 
28
42
  # Public: Class method to run demultiplexing of MiSeq sequences.
@@ -55,6 +69,7 @@ class Demultiplexer
55
69
  #
56
70
  # Returns Demultiplexer object
57
71
  def self.run(fastq_files, options)
72
+ options = DEFAULT.merge(options)
58
73
  log_file = File.join(options[:output_dir], 'Demultiplex.log')
59
74
  demultiplexer = new(fastq_files, options)
60
75
  Screen.clear if options[:verbose]
@@ -63,7 +78,7 @@ class Demultiplexer
63
78
  demultiplexer.status.save(log_file)
64
79
  end
65
80
 
66
- # Constructor method for Demultiplexer object.
81
+ # Internal: Constructor method for Demultiplexer object.
67
82
  #
68
83
  # fastq_files - Array with paths to FASTQ files.
69
84
  # options - Options Hash.
@@ -91,14 +106,14 @@ class Demultiplexer
91
106
  @samples = SampleReader.read(options[:samples_file],
92
107
  options[:revcomp_index1],
93
108
  options[:revcomp_index2])
94
- @undetermined = @samples.size + 1
109
+ @undetermined = @samples.size
95
110
  @index_hash = IndexBuilder.build(@samples, options[:mismatches_max])
96
111
  @data_io = DataIO.new(@samples, fastq_files, options[:compress],
97
112
  options[:output_dir])
98
- @status = Status.new
113
+ @status = Status.new(@samples)
99
114
  end
100
115
 
101
- # Method to demultiplex reads according the index. This is done by
116
+ # Internal: Method to demultiplex reads according to the index. This is done by
102
117
  # simultaneously read-opening all input files (forward and reverse index
103
118
  # files and forward and reverse read files) and read one entry from each.
104
119
  # Such four entries we call a set of entries. If the quality scores from
@@ -130,11 +145,11 @@ class Demultiplexer
130
145
 
131
146
  private
132
147
 
133
- # Method that matches the combined index1 and index2 sequences against the
134
- # search index. In case of a match the reads are written to file according to
135
- # the information in the search index, otherwise the reads will have thier
136
- # names appended with the index sequences and they will be written to the
137
- # Undetermined files.
148
+ # Internal: Method that matches the combined index1 and index2 sequences
149
+ # against the search index. In case of a match the reads are written to file
150
+ # according to the information in the search index, otherwise the reads will
151
+ # have their names appended with the index sequences and they will be written
152
+ # to the Undetermined files.
138
153
  #
139
154
  # ios_out - DataIO object with an accessor method for file output handles.
140
155
  # index1 - Seq object with index1.
@@ -144,15 +159,17 @@ class Demultiplexer
144
159
  #
145
160
  # Returns nothing.
146
161
  def match_index(ios_out, index1, index2, read1, read2)
147
- if (sample_id = @index_hash["#{index1.seq}#{index2.seq}".hash])
162
+ key = "#{index1.seq.upcase}#{index2.seq.upcase}".hash
163
+
164
+ if (sample_id = @index_hash[key])
148
165
  write_match(ios_out, sample_id, read1, read2)
149
166
  else
150
167
  write_undetermined(ios_out, index1, index2, read1, read2)
151
168
  end
152
169
  end
153
170
 
154
- # Method that writes a index match to file according to the information in
155
- # the search index.
171
+ # Internal: Method that writes an index match to file according to the
172
+ # information in the search index.
156
173
  #
157
174
  # ios_out - DataIO object with an accessor method for file output handles.
158
175
  # read1 - Seq object with read1.
@@ -167,8 +184,8 @@ class Demultiplexer
167
184
  io_reverse.puts read2.to_fastq
168
185
  end
169
186
 
170
- # Method that appends the read names with the index sequences and writes
171
- # the reads to the Undetermined files.
187
+ # Internal: Method that appends the read names with the index sequences and
188
+ # writes the reads to the Undetermined files.
172
189
  #
173
190
  # ios_out - DataIO object with an accessor method for file output handles.
174
191
  # index1 - Seq object with index1.
@@ -187,7 +204,7 @@ class Demultiplexer
187
204
  io_reverse.puts read2.to_fastq
188
205
  end
189
206
 
190
- # Method to check the quality scores of the given indexes.
207
+ # Internal: Method to check the quality scores of the given indexes.
191
208
  # If the mean score is higher than @options[:scores_mean] or
192
209
  # if the min score is higher than @options[:scores_min] then
193
210
  # the indexes are OK.
@@ -201,7 +218,7 @@ class Demultiplexer
201
218
  index_qual_min_ok?(index1, index2)
202
219
  end
203
220
 
204
- # Method to check the mean quality scores of the given indexes.
221
+ # Internal: Method to check the mean quality scores of the given indexes.
205
222
  # If the mean score is higher than @options[:scores_mean] the
206
223
  # indexes are OK.
207
224
  #
@@ -221,7 +238,7 @@ class Demultiplexer
221
238
  true
222
239
  end
223
240
 
224
- # Method to check the min quality scores of the given indexes.
241
+ # Internal: Method to check the min quality scores of the given indexes.
225
242
  # If the min score is higher than @options[:scores_min] the
226
243
  # indexes are OK.
227
244
  #
@@ -240,24 +257,4 @@ class Demultiplexer
240
257
 
241
258
  true
242
259
  end
243
-
244
- # Method that iterates over @samples and compiles a sorted Array with all
245
- # unique index1 sequences.
246
- #
247
- # Returns Array with uniq index1 sequences.
248
- def uniq_index1
249
- @status.index1 = @samples.each_with_object(SortedSet.new) do |a, e|
250
- a << e.index1
251
- end.to_a
252
- end
253
-
254
- # Method that iterates over @samples and compiles a sorted Array with all
255
- # unique index2 sequences.
256
- #
257
- # Returns Array with uniq index2 sequences.
258
- def uniq_index2
259
- @status.index2 = @samples.each_with_object(SortedSet.new) do |a, e|
260
- a << e.index2
261
- end.to_a
262
- end
263
260
  end
data/lib/index_builder.rb CHANGED
@@ -21,9 +21,17 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ # Class for IndexBuilder errors.
25
+ IndexBuilderError = Class.new(StandardError)
26
+
24
27
  # Class containing methods for building a search index.
25
28
  class IndexBuilder
26
- # Class method that build a search index from a given Array of samples.
29
+ # Internal: Class method that builds a search index from a given Array of
30
+ # samples. The index consists of a Google Hash, which doesn't have Ruby's
31
+ # garbage collection and therefore is much more efficient. The Hash keys
32
+ # consist of index1 and index2 concatenated, and furthermore, if
33
+ # mismatches_max is given, index1 and index2 are permutated accordingly.
34
+ # The Hash values are the sample number.
27
35
  #
28
36
  # samples - Array of samples (Sample objects with id, index1 and index2).
29
37
  #
@@ -32,15 +40,16 @@ class IndexBuilder
32
40
  # IndexBuilder.build(samples)
33
41
  # # => <Google Hash>
34
42
  #
35
- # Returns a Google Hash where the key is the index and the value is the TODO
43
+ # Returns a Google Hash where the key is the index and the value is sample
44
+ # number.
36
45
  def self.build(samples, mismatches_max)
37
46
  index_builder = new(samples, mismatches_max)
38
47
  index_hash = index_builder.index_init
39
48
  index_builder.index_populate(index_hash)
40
49
  end
41
50
 
42
- # Constructor method for IndexBuilder object. The given Array of samples and
43
- # mismatches_max are saved as an instance variable.
51
+ # Internal: Constructor method for IndexBuilder object. The given Array of
52
+ # samples and mismatches_max are saved as an instance variable.
44
53
  #
45
54
  # samples - Array of Sample objects.
46
55
  # mismatches_max - Integer denoting the maximum number of misses allowed in
@@ -57,7 +66,7 @@ class IndexBuilder
57
66
  @mismatches_max = mismatches_max
58
67
  end
59
68
 
60
- # Method to initialize the index. If @mismatches_max is <= then
69
+ # Internal: Method to initialize the index. If @mismatches_max is <= then
61
70
  # GoogleHashSparseLongToInt is used else GoogleHashDenseLongToInt due to
62
71
  # memory and performance.
63
72
  #
@@ -72,7 +81,7 @@ class IndexBuilder
72
81
  index_hash
73
82
  end
74
83
 
75
- # Method to populate the index.
84
+ # Internal: Method to populate the index.
76
85
  #
77
86
  # index_hash - Google Hash with initialized index.
78
87
  #
@@ -82,12 +91,10 @@ class IndexBuilder
82
91
  index_list1 = permutate([sample.index1], @mismatches_max)
83
92
  index_list2 = permutate([sample.index2], @mismatches_max)
84
93
 
85
- # index_check_list_sizes(index_list1, index_list2)
86
-
87
94
  index_list1.product(index_list2).each do |index1, index2|
88
95
  key = "#{index1}#{index2}".hash
89
96
 
90
- index_check_existing(index_hash, key)
97
+ index_check_existing(index_hash, key, sample, index1, index2)
91
98
 
92
99
  index_hash[key] = i
93
100
  end
@@ -98,37 +105,26 @@ class IndexBuilder
98
105
 
99
106
  private
100
107
 
101
- # Method to check if two index lists differ in size, if so an exception is
102
- # raised.
103
- #
104
- # index_list1 - Array with index1
105
- # index_list2 - Array with index2
106
- #
107
- # Returns nothing.
108
- def index_check_list_sizes(index_list1, index_list2)
109
- return if index_list1.size == index_list2.size
110
-
111
- fail "Permutated list sizes differ: \
112
- #{index_list1.size} != #{index_list2.size}"
113
- end
114
-
115
- # Method to check if a index key already exists in the index, and if so an
116
- # exception is raised.
108
+ # Internal: Method to check if an index key already exists in the index, and
109
+ # if so an exception is raised.
117
110
  #
118
111
  # index_hash - Google Hash with index
119
112
  # key - Integer from Google Hash's #hash method
113
+ # sample - Sample object whose index to check.
114
+ # index1 - String with index1 sequence.
115
+ # index2 - String with index2 sequence.
120
116
  #
121
117
  # Returns nothing.
122
- def index_check_existing(index_hash, key)
118
+ def index_check_existing(index_hash, key, sample, index1, index2)
123
119
  return unless index_hash[key]
124
120
 
125
- fail "Index combo of #{index1} and #{index2} already exists for \
126
- sample id: #{@samples[index_hash[key]].id} and #{sample.id}"
121
+ fail IndexBuilderError, "Index combo of #{index1} and #{index2} already \
122
+ exists for sample id: #{@samples[index_hash[key]].id} and #{sample.id}"
127
123
  end
128
124
 
129
- # Method that for each word in a given Array of word permutates each word a
130
- # given number (permuate) of times using a given alphabet, such that an Array
131
- # of words with all possible combinations is returned.
125
+ # Internal: Method that for each word in a given Array of word permutates
126
+ # each word a given number (permuate) of times using a given alphabet, such
127
+ # that an Array of words with all possible combinations is returned.
132
128
  #
133
129
  # list - Array of words (Strings) to permutate.
134
130
  # permuate - Number of permutations (Integer).
@@ -155,8 +151,8 @@ class IndexBuilder
155
151
  list
156
152
  end
157
153
 
158
- # Method that permutates a given word using a given alphabet, such that an
159
- # Array of words with all possible combinations is returned.
154
+ # Internal: Method that permutates a given word using a given alphabet, such
155
+ # that an Array of words with all possible combinations is returned.
160
156
  #
161
157
  # word - String with word to permutate.
162
158
  # alphabet - String with alphabet used for permutation.
data/lib/sample_reader.rb CHANGED
@@ -21,11 +21,19 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'csv'
25
+ require 'set'
26
+ require 'biopieces'
27
+
28
+ # Class for all SampleReader errors.
29
+ SampleReaderError = Class.new(StandardError)
30
+
24
31
  # Class containing methods for reading and checking sample information.
25
32
  class SampleReader
26
- # Class method that reads sample information from a samples file, which
27
- # consists of ASCII text in three tab separated columns: The first column is
28
- # the sample_id, the second column is index1 and the third column is index2.
33
+ # Internal: Class method that reads sample information from a samples file,
34
+ # which consists of ASCII text in three tab separated columns: The first
35
+ # column is the sample_id, the second column is index1 and the third column is
36
+ # index2.
29
37
  #
30
38
  # If revcomp1 or revcomp2 is set then index1 and index2 are
31
39
  # reverse-complemented accordingly.
@@ -45,8 +53,8 @@ class SampleReader
45
53
  sample_reader.samples_parse(file)
46
54
  end
47
55
 
48
- # Constructor method for SampleReader object. The given revcomp1 and revcomp2
49
- # flags are stored as instance variables.
56
+ # Internal: Constructor method for SampleReader object. The given revcomp1 and
57
+ # revcomp2 flags are stored as instance variables.
50
58
  #
51
59
  # revcomp1 - Flag indicating that index1 should be reverse-complemented.
52
60
  # revcomp2 - Flag indicating that index2 should be reverse-complemented.
@@ -62,9 +70,9 @@ class SampleReader
62
70
  @revcomp2 = revcomp2
63
71
  end
64
72
 
65
- # Method that reads sample information from a samples file, which consists
66
- # of ASCII text in three tab separated columns: The first column is the
67
- # sample_id, the second column is index1 and the third column is index2.
73
+ # Internal: Method that reads sample information from a samples file, which
74
+ # consists of ASCII text in three tab separated columns: The first column is
75
+ # the sample_id, the second column is index1 and the third column is index2.
68
76
  #
69
77
  # file - String with path to sample file.
70
78
  #
@@ -82,8 +90,8 @@ class SampleReader
82
90
  errors.push(*samples_check_uniq_id(samples))
83
91
 
84
92
  unless errors.empty?
85
- pp errors
86
- fail 'errors found in sample file.'
93
+ warn errors
94
+ fail SampleReaderError, 'errors found in sample file.'
87
95
  end
88
96
 
89
97
  samples
@@ -91,9 +99,9 @@ class SampleReader
91
99
 
92
100
  private
93
101
 
94
- # Method that reads sample information form a samples file, which consists
95
- # of ASCII text in three tab separated columns: The first column is the
96
- # sample_id, the second column is index1 and the third column is index2.
102
+ # Internal: Method that reads sample information from a samples file, which
103
+ # consists of ASCII text in three tab separated columns: The first column is
104
+ # the sample_id, the second column is index1 and the third column is index2.
97
105
  #
98
106
  # If @options[:revcomp_index1] or @options[:revcomp_index2] is set then
99
107
  # index1 and index2 are reverse-complemented accordingly.
@@ -110,14 +118,16 @@ class SampleReader
110
118
  samples = []
111
119
 
112
120
  CSV.read(file, col_sep: "\t").each do |id, index1, index2|
113
- samples << Sample.new(id, index1, index2)
121
+ next if id[0] == '#'
122
+
123
+ samples << Sample.new(id, index1.upcase, index2.upcase)
114
124
  end
115
125
 
116
126
  samples
117
127
  end
118
128
 
119
- # Method that iterates over the a given Array of sample Objects, and if
120
- # @options[:revcomp_index1] or @options[:revcomp_index2] is set then
129
+ # Internal: Method that iterates over a given Array of sample Objects,
130
+ # and if @options[:revcomp_index1] or @options[:revcomp_index2] is set then
121
131
  # index1 and index2 are reverse-complemented accordingly.
122
132
  #
123
133
  # samples - Array of Sample objects.
@@ -139,9 +149,9 @@ class SampleReader
139
149
  BioPieces::Seq.new(seq: index, type: :dna).reverse.complement.seq
140
150
  end
141
151
 
142
- # Method that iterates over the a given Array of sample Objects, and if
143
- # the combination of index1 and index2 is non-unique an error is pushed
144
- # on an error Array.
152
+ # Internal: Method that iterates over a given Array of sample Objects,
153
+ # and if the combination of index1 and index2 is non-unique an error is
154
+ # pushed on an error Array.
145
155
  #
146
156
  # samples - Array of Sample objects.
147
157
  #
@@ -161,8 +171,8 @@ class SampleReader
161
171
  errors
162
172
  end
163
173
 
164
- # Method that iterates over the a given Array of sample Objects, and if
165
- # a sample id is non-unique an error is pushed on an error Array.
174
+ # Internal: Method that iterates over a given Array of sample Objects,
175
+ # and if a sample id is non-unique an error is pushed on an error Array.
166
176
  #
167
177
  # samples - Array of Sample objects.
168
178
  #
@@ -182,7 +192,7 @@ class SampleReader
182
192
  errors
183
193
  end
184
194
 
185
- # Struct for holding sample information.
195
+ # Internal: Struct for holding sample information.
186
196
  #
187
197
  # id - Sample id.
188
198
  # index1 - Index1 sequence.
@@ -194,5 +204,17 @@ class SampleReader
194
204
  # # => <Sample>
195
205
  #
196
206
  # Returns Sample object.
197
- Sample = Struct.new(:id, :index1, :index2)
207
+ Sample = Struct.new(:id, :index1, :index2) do
208
+ # Internal: Method that returns a String representation of a Sample object.
209
+ #
210
+ # Examples
211
+ #
212
+ # Sample.to_s
213
+ # # => "test\tATCG\tTCGA"
214
+ #
215
+ # Returns a String with the values joined by "\t".
216
+ def to_s
217
+ [id, index1, index2].join("\t")
218
+ end
219
+ end
198
220
  end
data/lib/status.rb CHANGED
@@ -25,8 +25,8 @@
25
25
  class Status
26
26
  attr_accessor :count, :match, :undetermined, :index1_bad_mean,
27
27
  :index2_bad_mean, :index1_bad_min, :index2_bad_min
28
- # Method to initialize a Status object, which contains the following instance
29
- # variables initialized to 0:
28
+ # Internal: Constructor method to initialize a Status object, which contains
29
+ # the following instance variables initialized to 0:
30
30
  #
31
31
  # @count - Number of reads.
32
32
  # @match - Number of reads found in index.
@@ -36,13 +36,16 @@ class Status
36
36
  # @index1_bad_min - Number of reads dropped due to bad min in index1.
37
37
  # @index2_bad_min - Number of reads dropped due to bad min in index2.
38
38
  #
39
+ # samples - Array of Sample objects.
40
+ #
39
41
  # Examples
40
42
  #
41
- # Status.new
43
+ # Status.new(samples)
42
44
  # # => <Status>
43
45
  #
44
46
  # Returns a Status object.
45
- def initialize
47
+ def initialize(samples)
48
+ @samples = samples
46
49
  @count = 0
47
50
  @match = 0
48
51
  @undetermined = 0
@@ -53,8 +56,9 @@ class Status
53
56
  @time_start = Time.now
54
57
  end
55
58
 
56
- # Method to format a String from a Status object. This is done by adding the
57
- # relevant instance variables to a Hash and return this as an YAML String.
59
+ # Internal: Method to format a String from a Status object. This is done by
60
+ # adding the relevant instance variables to a Hash and return this as a YAML
61
+ # String.
58
62
  #
59
63
  # Returns a YAML String.
60
64
  def to_s
@@ -66,36 +70,59 @@ class Status
66
70
  index2_bad_mean: @index2_bad_mean,
67
71
  index1_bad_min: @index1_bad_min,
68
72
  index2_bad_min: @index2_bad_min,
69
- time: time }.to_yaml
73
+ sample_ids: @samples.map(&:id),
74
+ index1: uniq_index1,
75
+ index2: uniq_index2,
76
+ time_elapsed: time_elapsed }.to_yaml
70
77
  end
71
78
 
72
- # Method that calculate the percentage of undetermined reads.
79
+ # Internal: Method to save stats to the log file 'Demultiplex.log' in the
80
+ # output directory.
81
+ #
82
+ # Returns nothing.
83
+ def save(file)
84
+ File.open(file, 'w') do |ios|
85
+ ios.puts self
86
+ end
87
+ end
88
+
89
+ private
90
+
91
+ # Internal: Method that calculates the percentage of undetermined reads.
73
92
  #
74
93
  # Returns a Float with the percentage of undetermined reads.
75
94
  def undetermined_percent
95
+ return 0.0 if @count == 0
96
+
76
97
  (100 * @undetermined / @count.to_f).round(1)
77
98
  end
78
99
 
79
- # Method that calculates the elapsed time and formats a nice Time String.
100
+ # Internal: Method that calculates the elapsed time and formats a nice Time
101
+ # String.
80
102
  #
81
103
  # Returns String with elapsed time.
82
- def time
104
+ def time_elapsed
83
105
  time_elapsed = Time.now - @time_start
84
106
  (Time.mktime(0) + time_elapsed).strftime('%H:%M:%S')
85
107
  end
86
108
 
87
- # Method to save stats to the log file 'Demultiplex.log' in the output
88
- # directory.
109
+ # Internal: Method that iterates over @samples and compiles a sorted Array
110
+ # with all unique index1 sequences.
89
111
  #
90
- # Returns nothing.
91
- def save(file)
92
- @stats[:sample_id] = @samples.map(&:id)
93
-
94
- @stats[:index1] = uniq_index1
95
- @stats[:index2] = uniq_index2
112
+ # Returns Array with uniq index1 sequences.
113
+ def uniq_index1
114
+ @samples.each_with_object(SortedSet.new) do |e, a|
115
+ a << e.index1
116
+ end.to_a
117
+ end
96
118
 
97
- File.open(file, 'w') do |ios|
98
- ios.puts @status
99
- end
119
+ # Internal: Method that iterates over @samples and compiles a sorted Array
120
+ # with all unique index2 sequences.
121
+ #
122
+ # Returns Array with uniq index2 sequences.
123
+ def uniq_index2
124
+ @samples.each_with_object(SortedSet.new) do |e, a|
125
+ a << e.index2
126
+ end.to_a
100
127
  end
101
128
  end
data/test/helper.rb CHANGED
@@ -21,9 +21,13 @@
21
21
  # #
22
22
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
23
 
24
+ require 'pp'
25
+ require 'fileutils'
26
+ require 'tempfile'
24
27
  require 'demultiplexer'
25
28
  require 'test/unit'
26
29
 
30
+ # Adding stream capture methods.
27
31
  module Kernel
28
32
  def capture_stdout
29
33
  out = StringIO.new
@@ -44,6 +48,7 @@ module Kernel
44
48
  end
45
49
  end
46
50
 
51
+ # Adding custom test class method to TestCase.
47
52
  class Test::Unit::TestCase
48
53
  def self.test(desc, &impl)
49
54
  define_method("test #{desc}", &impl)