demultiplexer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+
3
+ require 'demultiplexer/version'
4
+
5
# Gem specification for the demultiplexer gem.
#
# Deliberately NOT set here:
#   date             - RubyGems fills in the build date itself; hard-coding
#                      Time.now makes builds non-reproducible.
#   rubygems_version - set automatically by RubyGems when the gem is packaged.
#   rubyforge_project - deprecated by RubyGems (RubyForge no longer exists).
Gem::Specification.new do |s|
  s.name          = 'demultiplexer'
  s.version       = Demultiplexer::VERSION
  s.platform      = Gem::Platform::RUBY
  s.summary       = 'Demultiplexer'
  s.description   = 'Demultiplex sequences from the Illumina platform.'
  s.authors       = ['Martin A. Hansen']
  s.email         = 'mail@maasha.dk'
  s.homepage      = 'http://github.com/maasha/demultiplexer'
  # RubyGems expects an SPDX license identifier; 'GPL2' is not one.
  s.license       = 'GPL-2.0'
  # The file lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.require_paths = ['lib']

  s.add_dependency('biopieces',   '>= 0.4.1')
  s.add_dependency('google_hash', '>= 0.8.4')
  s.add_development_dependency('bundler',   '>= 1.7.4')
  s.add_development_dependency('simplecov', '>= 0.9.2')
end
data/lib/data_io.rb ADDED
@@ -0,0 +1,207 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
# Class containing methods for reading and writing FASTQ data files.
class DataIO
  # Constructor method for DataIO object.
  #
  # samples     - Collection of sample objects. It must respond to #size and
  #               #each_with_index, and each sample must respond to #id.
  # fastq_files - Array with paths to FASTQ files (Strings). The forward and
  #               reverse read files (_R1_ and _R2_) must be present, since
  #               the output file suffixes are derived from their names.
  # compress    - Type of output compression. It is matched against /gzip/
  #               and /bzip2/; anything else means no compression.
  # output_dir  - String with path to the output directory.
  #
  # Raises RuntimeError if the _R1_ or _R2_ file name cannot be parsed.
  def initialize(samples, fastq_files, compress, output_dir)
    @samples      = samples
    @compress     = compress
    @output_dir   = output_dir
    @suffix1      = extract_suffix(fastq_files.grep(/_R1_/).first)
    @suffix2      = extract_suffix(fastq_files.grep(/_R2_/).first)
    @input_files  = identify_input_files(fastq_files)
    # Key of the Undetermined file handles in @file_hash. Sample keys are
    # 0...@samples.size, so this key never collides with a sample key.
    @undetermined = @samples.size + 1
    @file_hash    = nil
  end

  # Method that extracts the Sample, Lane, Region information from a given
  # file name and appends the output file suffix to it. The sample number may
  # have any number of digits (_S1_, _S10_, _S384_, ...).
  #
  # file - String with file name.
  #
  # Examples
  #
  #   extract_suffix("Sample1_S1_L001_R1_001.fastq.gz")
  #   # => "_S1_L001_R1_001.fastq.gz" (with gzip compression)
  #
  # Returns String with SLR info and file suffix.
  # Raises RuntimeError if the file name cannot be parsed.
  def extract_suffix(file)
    unless file =~ /.+(_S\d+_L\d{3}_R[12]_\d{3}).+$/
      raise "Unable to parse file SLR from: #{file}"
    end

    append_suffix(Regexp.last_match(1))
  end

  # Method that appends a file suffix to a given Sample, Lane, Region
  # information String based on the compress argument given to the
  # constructor. The file suffix can be either ".fastq.gz", ".fastq.bz2", or
  # ".fastq". The given String is not modified.
  #
  # slr - String Sample, Lane, Region information.
  #
  # Examples
  #
  #   append_suffix("_S1_L001_R1_001")
  #   # => "_S1_L001_R1_001.fastq.gz"
  #
  # Returns a new String with SLR info and file suffix.
  def append_suffix(slr)
    extension =
      case @compress
      when /gzip/  then '.fastq.gz'
      when /bzip2/ then '.fastq.bz2'
      else '.fastq'
      end

    # Build a new String instead of appending in place, so frozen Strings
    # are accepted and the caller's String is left untouched.
    "#{slr}#{extension}"
  end

  # Method that identifies the different input files from a given Array of
  # FASTQ files. The forward index file contains a _I1_, the reverse index
  # file contains a _I2_, the forward read file contains a _R1_ and finally,
  # the reverse read file contains a _R2_.
  #
  # fastq_files - Array with FASTQ files (Strings).
  #
  # Returns an Array with the input files (Strings) in the order I1, I2, R1,
  # R2. An entry is nil if no file name matched.
  def identify_input_files(fastq_files)
    [/_I1_/, /_I2_/, /_R1_/, /_R2_/].map do |pattern|
      fastq_files.grep(pattern).first
    end
  end

  # Method that opens the @input_files for reading, yields self, and closes
  # the files again when the block is done (also if the block raises).
  #
  # Yields the DataIO object itself.
  #
  # Returns the value of the given block.
  def open_input_files
    @file_ios = []

    # Handles are appended one at a time, so that the ones already opened
    # are still closed by the ensure clause if a later open fails.
    @input_files.each do |input_file|
      @file_ios << BioPieces::Fastq.open(input_file)
    end

    yield self
  ensure
    close_input_files
  end

  # Method that closes open input files. Safe to call when no input files
  # have been opened.
  #
  # Returns nothing.
  def close_input_files
    return unless @file_ios

    @file_ios.each(&:close)
  end

  # Method that reads a Seq entry from each of the file handles in the
  # @file_ios Array. Iteration stops as soon as fewer than 4 entries could be
  # read, i.e. when any of the files has no more entries.
  #
  # Yields an Array with 4 Seq objects.
  #
  # Returns nothing
  def each
    loop do
      entries = @file_ios.map(&:next_entry)

      break if entries.compact.size != 4

      yield entries
    end
  end

  # Method that opens the output files for writing, yields self, and closes
  # the files again when the block is done (also if the block raises). The
  # file handles are available through the #[] method.
  #
  # Yields the DataIO object itself.
  #
  # Returns the value of the given block.
  def open_output_files
    @file_hash = {}

    @file_hash.merge!(open_output_files_samples(@compress))
    @file_hash.merge!(open_output_files_undet(@compress))

    yield self
  ensure
    close_output_files
  end

  # Method that closes open output files. Safe to call when no output files
  # have been opened.
  #
  # Returns nothing.
  def close_output_files
    return unless @file_hash

    @file_hash.each_value { |ios| ios.each(&:close) }
  end

  # Getter method that returns a tuple of file handles from @file_hash when
  # given a key.
  #
  # key - Key used to lookup (sample index, or the Undetermined key).
  #
  # Returns Array with a tuple of IO objects.
  def [](key)
    @file_hash[key]
  end

  # Method that opens the sample output files for writing.
  #
  # comp - Type of output compression.
  #
  # Returns a Hash with an incrementing index as keys, and a tuple of file
  # handles as values.
  def open_output_files_samples(comp)
    file_hash = {}

    @samples.each_with_index do |sample, i|
      file_hash[i] = open_output_pair(sample.id, comp)
    end

    file_hash
  end

  # Method that opens the undetermined output files for writing.
  #
  # comp - Type of output compression.
  #
  # Returns a Hash with the Undetermined key as key, and a tuple of file
  # handles as value.
  def open_output_files_undet(comp)
    { @undetermined => open_output_pair('Undetermined', comp) }
  end

  private

  # Method that opens the forward and reverse output file for a given name.
  #
  # name - String with the first part of the file name (before the suffix).
  # comp - Type of output compression.
  #
  # Returns Array with a tuple of file handles (forward, reverse).
  def open_output_pair(name, comp)
    [@suffix1, @suffix2].map do |suffix|
      path = File.join(@output_dir, "#{name}#{suffix}")
      BioPieces::Fastq.open(path, 'w', compress: comp)
    end
  end
end
@@ -0,0 +1,263 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
# Class containing methods for demultiplexing MiSeq sequences.
class Demultiplexer
  attr_reader :status

  # Public: Class method to run demultiplexing of MiSeq sequences. The status
  # is saved to the file Demultiplex.log in the output directory.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash. No defaults are applied here, so the caller
  #               must supply the values.
  #               :verbose        - Verbose flag.
  #               :mismatches_max - Integer value indicating max mismatches.
  #               :samples_file   - String with path to samples file.
  #               :revcomp_index1 - Flag indicating that index1 should be
  #                                 reverse-complemented.
  #               :revcomp_index2 - Flag indicating that index2 should be
  #                                 reverse-complemented.
  #               :output_dir     - String with output directory. It is
  #                                 passed to File.join, so it must be set.
  #               :compress       - Type of output compression.
  #               :scores_min     - An Integer representing the Phred score
  #                                 minimum, such that a read is dropped if a
  #                                 single position in the index contains a
  #                                 score below this value.
  #               :scores_mean    - An Integer representing the mean Phred
  #                                 score, such that a read is dropped if the
  #                                 mean quality score is below this value.
  #
  # Examples
  #
  #   Demultiplexer.run(['I1.fq', 'I2.fq', 'R1.fq', 'R2.fq'], \
  #     samples_file: 'samples.txt', output_dir: 'out')
  #   # => <Demultiplexer>
  #
  # Returns Demultiplexer object
  def self.run(fastq_files, options)
    log_file      = File.join(options[:output_dir], 'Demultiplex.log')
    demultiplexer = new(fastq_files, options)
    Screen.clear if options[:verbose]
    demultiplexer.demultiplex
    puts demultiplexer.status if options[:verbose]
    demultiplexer.status.save(log_file)

    # Return the object as documented, not the result of saving the log.
    demultiplexer
  end

  # Constructor method for Demultiplexer object.
  #
  # fastq_files - Array with paths to FASTQ files.
  # options     - Options Hash with the same keys as described for
  #               Demultiplexer.run.
  #
  # Returns Demultiplexer object
  def initialize(fastq_files, options)
    @options      = options
    @samples      = SampleReader.read(options[:samples_file],
                                      options[:revcomp_index1],
                                      options[:revcomp_index2])
    # Same key as DataIO uses for the Undetermined file handles.
    @undetermined = @samples.size + 1
    @index_hash   = IndexBuilder.build(@samples, options[:mismatches_max])
    @data_io      = DataIO.new(@samples, fastq_files, options[:compress],
                               options[:output_dir])
    @status       = Status.new
  end

  # Method to demultiplex reads according to the index. This is done by
  # simultaneously read-opening all input files (forward and reverse index
  # files and forward and reverse read files) and reading one entry from
  # each. Such four entries we call a set of entries. If the quality scores
  # from either index1 or index2 fail the criteria for mean and min required
  # quality the set is skipped. If the combined indexes are found in the
  # search index, then the reads are written to files according to the sample
  # information in the search index. If the combined indexes are not found,
  # then the reads have their names appended with the index sequences and the
  # reads are written to the Undetermined files.
  #
  # Returns nothing.
  def demultiplex
    @data_io.open_input_files do |ios_in|
      @data_io.open_output_files do |ios_out|
        ios_in.each do |index1, index2, read1, read2|
          # Each set of entries holds two reads, hence the count of 2.
          @status.count += 2
          puts(@status) if @options[:verbose] && (@status.count % 1_000).zero?

          next unless index_qual_ok?(index1, index2)

          match_index(ios_out, index1, index2, read1, read2)
        end
      end
    end
  end

  private

  # Method that matches the combined index1 and index2 sequences against the
  # search index. In case of a match the reads are written to file according
  # to the information in the search index, otherwise the reads will have
  # their names appended with the index sequences and they will be written to
  # the Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def match_index(ios_out, index1, index2, read1, read2)
    # The search index is keyed on the hash value of the combined sequences.
    sample_id = @index_hash["#{index1.seq}#{index2.seq}".hash]

    if sample_id
      write_match(ios_out, sample_id, read1, read2)
    else
      write_undetermined(ios_out, index1, index2, read1, read2)
    end
  end

  # Method that writes an index match to file according to the information in
  # the search index.
  #
  # ios_out   - DataIO object with an accessor method for file output handles.
  # sample_id - Key from the search index used to look up the output handles.
  # read1     - Seq object with read1.
  # read2     - Seq object with read2.
  #
  # Returns nothing.
  def write_match(ios_out, sample_id, read1, read2)
    @status.match += 2
    io_forward, io_reverse = ios_out[sample_id]

    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method that appends the read names with the index sequences and writes
  # the reads to the Undetermined files.
  #
  # ios_out - DataIO object with an accessor method for file output handles.
  # index1  - Seq object with index1.
  # index2  - Seq object with index2.
  # read1   - Seq object with read1.
  # read2   - Seq object with read2.
  #
  # Returns nothing.
  def write_undetermined(ios_out, index1, index2, read1, read2)
    @status.undetermined += 2
    read1.seq_name = "#{read1.seq_name} #{index1.seq}"
    read2.seq_name = "#{read2.seq_name} #{index2.seq}"

    io_forward, io_reverse = ios_out[@undetermined]
    io_forward.puts read1.to_fastq
    io_reverse.puts read2.to_fastq
  end

  # Method to check the quality scores of the given indexes.
  # The indexes are OK only if both the mean scores are at least
  # @options[:scores_mean] and the min scores are at least
  # @options[:scores_min]. The min check is skipped when the mean check
  # fails, so at most one status counter is incremented per call.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality OK, else false.
  def index_qual_ok?(index1, index2)
    index_qual_mean_ok?(index1, index2) &&
      index_qual_min_ok?(index1, index2)
  end

  # Method to check the mean quality scores of the given indexes.
  # If both mean scores are at least @options[:scores_mean] the indexes are
  # OK. Otherwise the status counter of the first failing index is
  # incremented.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality mean OK, else false.
  def index_qual_mean_ok?(index1, index2)
    threshold = @options[:scores_mean]

    if index1.scores_mean < threshold
      @status.index1_bad_mean += 2
      return false
    elsif index2.scores_mean < threshold
      @status.index2_bad_mean += 2
      return false
    end

    true
  end

  # Method to check the min quality scores of the given indexes.
  # If both min scores are at least @options[:scores_min] the indexes are
  # OK. Otherwise the status counter of the first failing index is
  # incremented.
  #
  # index1 - Index1 Seq object.
  # index2 - Index2 Seq object.
  #
  # Returns true if quality min OK, else false.
  def index_qual_min_ok?(index1, index2)
    threshold = @options[:scores_min]

    if index1.scores_min < threshold
      @status.index1_bad_min += 2
      return false
    elsif index2.scores_min < threshold
      @status.index2_bad_min += 2
      return false
    end

    true
  end

  # Method that iterates over @samples and compiles a sorted Array with all
  # unique index1 sequences. The Array is also stored in @status.index1.
  #
  # Returns Array with uniq index1 sequences.
  def uniq_index1
    @status.index1 = @samples.map(&:index1).uniq.sort
  end

  # Method that iterates over @samples and compiles a sorted Array with all
  # unique index2 sequences. The Array is also stored in @status.index2.
  #
  # Returns Array with uniq index2 sequences.
  def uniq_index2
    @status.index2 = @samples.map(&:index2).uniq.sort
  end
end