demultiplexer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
# Gem specification for the demultiplexer gem.

# Make lib/ loadable so the version constant can be required below.
$LOAD_PATH.push File.expand_path('../lib', __FILE__)

require 'demultiplexer/version'

Gem::Specification.new do |s|
  s.name              = 'demultiplexer'
  s.version           = Demultiplexer::VERSION
  s.platform          = Gem::Platform::RUBY
  s.date              = Time.now.strftime('%F')
  s.summary           = 'Demultiplexer'
  s.description       = 'Demultiplex sequences from the Illumina platform.'
  s.authors           = ['Martin A. Hansen']
  s.email             = 'mail@maasha.dk'
  s.homepage          = 'http://github.com/maasha/demultiplexer'
  # SPDX identifier; the source file headers grant "version 2 of the License,
  # or (at your option) any later version". The previous value 'GPL2' is not
  # a recognised SPDX license identifier.
  s.license           = 'GPL-2.0-or-later'
  s.rubygems_version  = '2.0.0'
  # Package everything tracked by git; test files are the tracked files below
  # test/, spec/ and features/.
  s.files             = `git ls-files`.split("\n")
  s.test_files        = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.require_paths     = ['lib']

  # NOTE: the deprecated `rubyforge_project` attribute was dropped on purpose.

  s.add_dependency('biopieces',   '>= 0.4.1')
  s.add_dependency('google_hash', '>= 0.8.4')
  s.add_development_dependency('bundler',   '>= 1.7.4')
  s.add_development_dependency('simplecov', '>= 0.9.2')
end
data/lib/data_io.rb ADDED
@@ -0,0 +1,207 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
# Class containing methods for reading and writing FASTQ data files.
class DataIO
  # Pattern capturing the Sample, Lane, Region (SLR) part of a FASTQ file name,
  # e.g. "_S1_L001_R1_001". The sample number may have any number of digits.
  SLR_PATTERN = /.+(_S\d+_L\d{3}_R[12]_\d{3}).+$/

  # Constructor method for DataIO object.
  #
  # samples     - Collection of sample objects responding to #id; must respond
  #               to #size and #each_with_index.
  # fastq_files - Array with FASTQ file names (Strings). The names are matched
  #               against _I1_, _I2_, _R1_ and _R2_.
  # compress    - Type of output compression; matched against /gzip/ and
  #               /bzip2/, anything else means uncompressed output.
  # output_dir  - String with path to the output directory.
  #
  # Raises RuntimeError if the SLR information cannot be parsed from the
  # _R1_ or the _R2_ file name.
  def initialize(samples, fastq_files, compress, output_dir)
    @samples      = samples
    @compress     = compress
    @output_dir   = output_dir
    @suffix1      = extract_suffix(fastq_files.grep(/_R1_/).first)
    @suffix2      = extract_suffix(fastq_files.grep(/_R2_/).first)
    @input_files  = identify_input_files(fastq_files)
    # Key under which the Undetermined file handles are stored in @file_hash.
    # Sample keys are 0...@samples.size, so this key never collides.
    @undetermined = @samples.size + 1
    @file_ios     = nil
    @file_hash    = nil
  end

  # Method that extracts the Sample, Lane, Region information from a given file
  # name and appends the output file suffix to it.
  #
  # file - String with file name.
  #
  # Examples
  #
  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz")
  #   # => "_S1_L001_R1_001.fastq.gz" (with gzip compression)
  #
  # Returns String with SLR info and file suffix.
  # Raises RuntimeError if the file name cannot be parsed.
  def extract_suffix(file)
    # file.to_s lets a missing (nil) file fall through to the error below.
    match = SLR_PATTERN.match(file.to_s)

    raise "Unable to parse file SLR from: #{file}" unless match

    append_suffix(match[1])
  end

  # Method that appends a file suffix to a given Sample, Lane, Region
  # information String based on the compress value given to the constructor.
  # The file suffix can be either ".fastq.gz", ".fastq.bz2", or ".fastq".
  # The given String is left untouched.
  #
  # slr - String Sample, Lane, Region information.
  #
  # Examples
  #
  #   append_suffix("_S1_L001_R1_001")
  #   # => "_S1_L001_R1_001.fastq.gz"
  #
  # Returns a new String with SLR info and file suffix.
  def append_suffix(slr)
    extension =
      case @compress
      when /gzip/  then '.fastq.gz'
      when /bzip2/ then '.fastq.bz2'
      else              '.fastq'
      end

    "#{slr}#{extension}"
  end

  # Method that identifies the different input files from a given Array of
  # FASTQ files. The forward index file contains a _I1_, the reverse index file
  # contains a _I2_, the forward read file contains a _R1_ and finally, the
  # reverse read file contains a _R2_.
  #
  # fastq_files - Array with FASTQ files (Strings).
  #
  # Returns an Array with the input files (Strings) in the order I1, I2, R1,
  # R2. An entry is nil if no file name matches.
  def identify_input_files(fastq_files)
    [/_I1_/, /_I2_/, /_R1_/, /_R2_/].map do |pattern|
      fastq_files.grep(pattern).first
    end
  end

  # Method that opens the @input_files for reading, yields self and closes
  # the files again when the block is done, also if the block raises.
  #
  # Yields the DataIO object itself.
  #
  # Returns the value of the given block.
  def open_input_files
    @file_ios = []

    @input_files.each do |input_file|
      @file_ios << BioPieces::Fastq.open(input_file)
    end

    yield self
  ensure
    close_input_files
  end

  # Method that closes open input files. Does nothing if no input files have
  # been opened.
  #
  # Returns nothing.
  def close_input_files
    return unless @file_ios

    @file_ios.each(&:close)
  end

  # Method that reads a Seq entry from each of the file handles in the
  # @file_ios Array. Iteration stops as soon as fewer than 4 entries could be
  # read, i.e. when any of the files is exhausted.
  #
  # Yields an Array with 4 Seq objects.
  #
  # Returns nothing
  def each
    loop do
      entries = @file_ios.map(&:next_entry)

      break unless entries.compact.size == 4

      yield entries
    end
  end

  # Method that opens the output files for writing, yields self and closes
  # the files again when the block is done, also if the block raises.
  # The file handles are kept in a Hash with an incrementing index as keys,
  # and a tuple of file handles as values; use #[] to look them up.
  #
  # Yields the DataIO object itself.
  #
  # Returns the value of the given block.
  def open_output_files
    @file_hash = {}
    comp       = @compress

    @file_hash.merge!(open_output_files_samples(comp))
    @file_hash.merge!(open_output_files_undet(comp))

    yield self
  ensure
    close_output_files
  end

  # Method that closes open output files. Does nothing if no output files
  # have been opened.
  #
  # Returns nothing.
  def close_output_files
    return unless @file_hash

    @file_hash.each_value { |ios| ios.each(&:close) }
  end

  # Getter method that returns a tuple of file handles from @file_hash when
  # given a key.
  #
  # key - Key used to lookup
  #
  # Returns Array with a tuple of IO objects.
  def [](key)
    @file_hash[key]
  end

  # Method that opens the sample output files for writing.
  #
  # comp - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns a Hash with an incrementing index as keys, and a tuple of file
  # handles as values.
  def open_output_files_samples(comp)
    file_hash = {}

    @samples.each_with_index do |sample, i|
      file_hash[i] = open_output_pair(sample.id, comp)
    end

    file_hash
  end

  # Method that opens the undetermined output files for writing.
  #
  # comp - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns a Hash with the @undetermined index as key, and a tuple of file
  # handles as value.
  def open_output_files_undet(comp)
    { @undetermined => open_output_pair('Undetermined', comp) }
  end

  private

  # Method that opens a forward and a reverse output file in @output_dir.
  # The file names are the given basename followed by @suffix1 and @suffix2,
  # respectively.
  #
  # basename - First part of the file names (sample id or "Undetermined").
  # comp     - Type of output compression, passed on to BioPieces::Fastq.open.
  #
  # Returns Array with a tuple of IO objects [forward, reverse].
  def open_output_pair(basename, comp)
    [@suffix1, @suffix2].map do |suffix|
      path = File.join(@output_dir, "#{basename}#{suffix}")
      BioPieces::Fastq.open(path, 'w', compress: comp)
    end
  end
end
@@ -0,0 +1,263 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
# Class containing methods for demultiplexing MiSeq sequences.
class Demultiplexer
  attr_reader :status

  # Public: Class method to run demultiplexing of MiSeq sequences. The status
  # is saved to the file Demultiplex.log in the output directory.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash.
  #               :verbose        - Verbose flag (default: false).
  #               :mismatches_max - Integer value indicating max mismatches
  #                                 (default: 0).
  #               :samples_file   - String with path to samples file.
  #               :revcomp_index1 - Flag indicating that index1 should be
  #                                 reverse-complemented (default: false).
  #               :revcomp_index2 - Flag indicating that index2 should be
  #                                 reverse-complemented (default: false).
  #               :output_dir     - String with output directory. This method
  #                                 passes the value straight to File.join.
  #               :scores_min     - An Integer representing the Phred score
  #                                 minimum, such that a read is dropped if a
  #                                 single position in the index contains a
  #                                 score below this value (default: 16).
  #               :scores_mean    - An Integer representing the mean Phred
  #                                 score, such that a read is dropped if the
  #                                 mean quality score is below this value
  #                                 (default: 16).
  #
  # Examples
  #
  #   Demultiplexer.run(['I1.fq', 'I2.fq', 'R1.fq', 'R2.fq'], \
  #     samples_file: 'samples.txt', output_dir: 'out')
  #
  # Returns the value of the call that saves the status to the log file.
  def self.run(fastq_files, options)
    log_file      = File.join(options[:output_dir], 'Demultiplex.log')
    demultiplexer = new(fastq_files, options)
    Screen.clear if options[:verbose]
    demultiplexer.demultiplex
    puts demultiplexer.status if options[:verbose]
    demultiplexer.status.save(log_file)
  end

  # Constructor method for Demultiplexer object.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash.
  #               :verbose        - Verbose flag (default: false).
  #               :mismatches_max - Integer value indicating max mismatches
  #                                 (default: 0).
  #               :samples_file   - String with path to samples file.
  #               :revcomp_index1 - Flag indicating that index1 should be
  #                                 reverse-complemented (default: false).
  #               :revcomp_index2 - Flag indicating that index2 should be
  #                                 reverse-complemented (default: false).
  #               :output_dir     - String with output directory.
  #               :compress       - Type of output compression, passed on to
  #                                 DataIO.
  #               :scores_min     - An Integer representing the Phred score
  #                                 minimum, such that a read is dropped if a
  #                                 single position in the index contains a
  #                                 score below this value (default: 16).
  #               :scores_mean    - An Integer representing the mean Phred
  #                                 score, such that a read is dropped if the
  #                                 mean quality score is below this value
  #                                 (default: 16).
  #
  # Returns Demultiplexer object
  def initialize(fastq_files, options)
    @options      = options
    @samples      = SampleReader.read(options[:samples_file],
                                      options[:revcomp_index1],
                                      options[:revcomp_index2])
    # Same key as DataIO uses for the Undetermined file handles.
    @undetermined = @samples.size + 1
    @index_hash   = IndexBuilder.build(@samples, options[:mismatches_max])
    @data_io      = DataIO.new(@samples, fastq_files, options[:compress],
                               options[:output_dir])
    @status       = Status.new
  end

  # Method to demultiplex reads according to the index. This is done by
  # simultaneously read-opening all input files (forward and reverse index
  # files and forward and reverse read files) and reading one entry from each.
  # Such four entries we call a set of entries. If the quality scores from
  # either index1 or index2 fail the criteria for mean and min required
  # quality the set is skipped. If the combined indexes are found in the
  # search index, then the reads are written to files according to the sample
  # information in the search index. If the combined indexes are not found,
  # then the reads have their names appended with the index sequences and the
  # reads are written to the Undetermined files.
  #
  # Returns nothing.
  def demultiplex
    @data_io.open_input_files do |ios_in|
      @data_io.open_output_files do |ios_out|
        ios_in.each do |index1, index2, read1, read2|
          # Each set holds two reads (forward and reverse).
          @status.count += 2
          print_progress

          next unless index_qual_ok?(index1, index2)

          match_index(ios_out, index1, index2, read1, read2)
        end
      end
    end
  end

  private

  # Method that prints the status for every 1000 reads counted if the
  # :verbose option is set.
  #
  # Returns nothing.
  def print_progress
    return unless @options[:verbose]

    puts(@status) if (@status.count % 1_000).zero?
  end

  # Method that matches the combined index1 and index2 sequences against the
  # search index. In case of a match the reads are written to file according to
  # the information in the search index, otherwise the reads will have their
  # names appended with the index sequences and they will be written to the
  # Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def match_index(ios_out, index1, index2, read1, read2)
    # The search index is keyed on the hash value of the concatenated indexes.
    sample_id = @index_hash["#{index1.seq}#{index2.seq}".hash]

    if sample_id
      write_match(ios_out, sample_id, read1, read2)
    else
      write_undetermined(ios_out, index1, index2, read1, read2)
    end
  end

  # Method that writes an index match to file according to the information in
  # the search index.
  #
  # ios_out   - DataIO object with an accessor method for file output handles.
  # sample_id - Key used to look up the sample's file handles in ios_out.
  # read1     - Seq object with read1.
  # read2     - Seq object with read2.
  #
  # Returns nothing.
  def write_match(ios_out, sample_id, read1, read2)
    @status.match += 2
    io_forward, io_reverse = ios_out[sample_id]

    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method that appends the read names with the index sequences (index1 to
  # read1 and index2 to read2) and writes the reads to the Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def write_undetermined(ios_out, index1, index2, read1, read2)
    @status.undetermined += 2
    read1.seq_name = "#{read1.seq_name} #{index1.seq}"
    read2.seq_name = "#{read2.seq_name} #{index2.seq}"

    io_forward, io_reverse = ios_out[@undetermined]
    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method to check the quality scores of the given indexes.
  # The indexes are OK only if both the mean scores are at least
  # @options[:scores_mean] and the min scores are at least
  # @options[:scores_min]. The min check is skipped if the mean check fails.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality OK, else false.
  def index_qual_ok?(index1, index2)
    index_qual_mean_ok?(index1, index2) &&
      index_qual_min_ok?(index1, index2)
  end

  # Method to check the mean quality scores of the given indexes.
  # If both mean scores are at least @options[:scores_mean] the indexes are
  # OK. Otherwise the bad mean counter of the first failing index is
  # incremented in the status.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality mean OK, else false.
  def index_qual_mean_ok?(index1, index2)
    threshold = @options[:scores_mean]

    if index1.scores_mean < threshold
      @status.index1_bad_mean += 2
      false
    elsif index2.scores_mean < threshold
      @status.index2_bad_mean += 2
      false
    else
      true
    end
  end

  # Method to check the min quality scores of the given indexes.
  # If both min scores are at least @options[:scores_min] the indexes are OK.
  # Otherwise the bad min counter of the first failing index is incremented
  # in the status.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality min OK, else false.
  def index_qual_min_ok?(index1, index2)
    threshold = @options[:scores_min]

    if index1.scores_min < threshold
      @status.index1_bad_min += 2
      false
    elsif index2.scores_min < threshold
      @status.index2_bad_min += 2
      false
    else
      true
    end
  end

  # Method that iterates over @samples and compiles a sorted Array with all
  # unique index1 sequences. The Array is also stored in @status.index1.
  #
  # Returns Array with uniq index1 sequences.
  def uniq_index1
    @status.index1 = @samples.map(&:index1).uniq.sort
  end

  # Method that iterates over @samples and compiles a sorted Array with all
  # unique index2 sequences. The Array is also stored in @status.index2.
  #
  # Returns Array with uniq index2 sequences.
  def uniq_index2
    @status.index2 = @samples.map(&:index2).uniq.sort
  end
end