demultiplexer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
# Namespace class for the demultiplexer gem; this file only carries the
# version number.
class Demultiplexer
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,181 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
+ # Class containing methods for building a search index.
25
class IndexBuilder
  # Class method that builds a search index from a given Array of samples.
  #
  # samples        - Array of samples (objects responding to id, index1 and
  #                  index2).
  # mismatches_max - Integer denoting the maximum number of mismatches allowed
  #                  in an index sequence.
  #
  # Examples
  #
  #   IndexBuilder.build(samples, 1)
  #   # => <Google Hash>
  #
  # Returns a Google Hash where the key is the hash value of the concatenated
  # index1 and index2 sequences and the value is the position of the matching
  # sample in the samples Array.
  def self.build(samples, mismatches_max)
    index_builder = new(samples, mismatches_max)
    index_hash    = index_builder.index_init
    index_builder.index_populate(index_hash)
  end

  # Constructor method for IndexBuilder object. The given Array of samples and
  # mismatches_max are saved as instance variables.
  #
  # samples        - Array of Sample objects.
  # mismatches_max - Integer denoting the maximum number of misses allowed in
  #                  an index sequence.
  #
  # Examples
  #
  #   IndexBuilder.new(samples, 2)
  #   # => <IndexBuilder>
  #
  # Returns an IndexBuilder object.
  def initialize(samples, mismatches_max)
    @samples        = samples
    @mismatches_max = mismatches_max
  end

  # Method to initialize the index. If @mismatches_max is <= 1 then
  # GoogleHashSparseLongToInt is used else GoogleHashDenseLongToInt due to
  # memory and performance.
  #
  # Returns a Google Hash.
  def index_init
    if @mismatches_max <= 1
      GoogleHashSparseLongToInt.new
    else
      GoogleHashDenseLongToInt.new
    end
  end

  # Method to populate the index. For every sample all permutations of index1
  # and index2 (up to @mismatches_max substitutions each) are combined, and
  # the hash value of each concatenated pair is mapped to the position of the
  # sample in @samples.
  #
  # index_hash - Google Hash with initialized index (any object supporting
  #              #[] and #[]= where a missing key reads as nil/false works).
  #
  # Returns the populated index.
  # Raises RuntimeError if an index combination is already in the index.
  def index_populate(index_hash)
    @samples.each_with_index do |sample, i|
      index_list1 = permutate([sample.index1], @mismatches_max)
      index_list2 = permutate([sample.index2], @mismatches_max)

      # index_check_list_sizes(index_list1, index_list2)

      index_list1.product(index_list2).each do |index1, index2|
        key = "#{index1}#{index2}".hash

        index_check_existing(index_hash, key, sample, index1, index2)

        index_hash[key] = i
      end
    end

    index_hash
  end

  private

  # Method to check if two index lists differ in size, if so an exception is
  # raised.
  #
  # index_list1 - Array with index1
  # index_list2 - Array with index2
  #
  # Returns nothing.
  def index_check_list_sizes(index_list1, index_list2)
    return if index_list1.size == index_list2.size

    fail 'Permutated list sizes differ: ' \
         "#{index_list1.size} != #{index_list2.size}"
  end

  # Method to check if an index key already exists in the index, and if so an
  # exception is raised naming both conflicting samples.
  #
  # index_hash - Google Hash with index
  # key        - Integer from String#hash of the concatenated index pair
  # sample     - Sample currently being added to the index
  # index1     - Index1 sequence (String) that produced the key
  # index2     - Index2 sequence (String) that produced the key
  #
  # Returns nothing.
  def index_check_existing(index_hash, key, sample, index1, index2)
    existing = index_hash[key]

    # A stored position of 0 is truthy in Ruby, so only a missing key passes.
    return unless existing

    fail "Index combo of #{index1} and #{index2} already exists for " \
         "sample id: #{@samples[existing].id} and #{sample.id}"
  end

  # Method that for each word in a given Array of words substitutes every
  # position with every character of a given alphabet, repeating the process
  # a given number of times, such that an Array of unique words with all
  # possible combinations is returned. The original words come first.
  #
  # list         - Array of words (Strings) to permutate.
  # permutations - Number of permutation rounds (Integer).
  # alphabet     - String with alphabet used for permutation.
  #
  # Examples
  #
  #   permutate(["AA"], 1, "ATCG")
  #   # => ["AA", "TA", "CA", "GA", "AT", "AC", "AG"]
  #
  # Returns an Array with permutated words (Strings).
  def permutate(list, permutations = 2, alphabet = 'ATCG')
    permutations.times do
      variants = list.flat_map { |word| permutate_word(word, alphabet) }

      # uniq keeps first occurrences, so existing words keep their position.
      list = (list + variants).uniq
    end

    list
  end

  # Method that permutates a given word using a given alphabet, such that an
  # Array of words with every single-position substitution is returned
  # (duplicates included).
  #
  # word     - String with word to permutate.
  # alphabet - String with alphabet used for permutation.
  #
  # Examples
  #
  #   permutate_word("AA", "ATCG")
  #   # => ["AA", "TA", "CA", "GA", "AA", "AT", "AC", "AG"]
  #
  # Returns an Array with permutated words (Strings).
  def permutate_word(word, alphabet)
    (0...word.size).flat_map do |pos|
      alphabet.each_char.map do |char|
        "#{word[0...pos]}#{char}#{word[pos + 1..-1]}"
      end
    end
  end
end
@@ -0,0 +1,198 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
+ # Class containing methods for reading and checking sample information.
25
class SampleReader
  # Class method that reads sample information from a samples file of ASCII
  # text in three tab separated columns: sample_id, index1 and index2.
  #
  # If revcomp1 or revcomp2 is set then index1 and index2 are
  # reverse-complemented accordingly.
  #
  # file     - String with path to sample file.
  # revcomp1 - Flag indicating that index1 should be reverse-complemented.
  # revcomp2 - Flag indicating that index2 should be reverse-complemented.
  #
  # Examples
  #
  #   SampleReader.read("samples.txt", false, false)
  #   # => [<Sample>, <Sample>, <Sample> ...]
  #
  # Returns an Array of Sample objects.
  def self.read(file, revcomp1, revcomp2)
    new(revcomp1, revcomp2).samples_parse(file)
  end

  # Constructor method for SampleReader object. The given flags are stored as
  # the instance variables @revcomp1 and @revcomp2.
  #
  # revcomp1 - Flag indicating that index1 should be reverse-complemented.
  # revcomp2 - Flag indicating that index2 should be reverse-complemented.
  #
  # Examples
  #
  #   SampleReader.new(false, false)
  #   # => <SampleReader>
  #
  # Returns SampleReader object.
  def initialize(revcomp1, revcomp2)
    @revcomp1 = revcomp1
    @revcomp2 = revcomp2
  end

  # Method that reads the samples file, applies reverse-complementing as
  # requested, and validates that index combinations and sample ids are
  # unique. Found errors are pretty-printed before an exception is raised.
  #
  # file - String with path to sample file.
  #
  # Examples
  #
  #   samples_parse("samples.txt")
  #   # => [<Sample>, <Sample>, <Sample> ...]
  #
  # Returns an Array of Sample objects.
  # Raises RuntimeError if any validation error was found.
  def samples_parse(file)
    samples = samples_read(file)
    samples_reverse_complement(samples)

    errors = samples_check_index_combo(samples) +
             samples_check_uniq_id(samples)

    return samples if errors.empty?

    pp errors
    fail 'errors found in sample file.'
  end

  private

  # Method that reads the tab separated samples file and turns each row into
  # a Sample from its first three columns (sample_id, index1, index2).
  #
  # file - String with path to sample file.
  #
  # Examples
  #
  #   samples_read("samples.txt")
  #   # => [<Sample>, <Sample>, <Sample> ...]
  #
  # Returns an Array of Sample objects.
  def samples_read(file)
    CSV.read(file, col_sep: "\t").map do |id, index1, index2|
      Sample.new(id, index1, index2)
    end
  end

  # Method that iterates over a given Array of Sample objects and
  # reverse-complements index1 if @revcomp1 is set and index2 if @revcomp2 is
  # set. The samples are modified in place.
  #
  # samples - Array of Sample objects.
  #
  # Returns the given Array (callers ignore it).
  def samples_reverse_complement(samples)
    samples.each do |entry|
      entry.index1 = index_reverse_complement(entry.index1) if @revcomp1
      entry.index2 = index_reverse_complement(entry.index2) if @revcomp2
    end
  end

  # Method that reverse-complements a given index sequence.
  #
  # index - Index String.
  #
  # Returns reverse-complemented index String.
  def index_reverse_complement(index)
    sequence = BioPieces::Seq.new(seq: index, type: :dna)
    sequence.reverse.complement.seq
  end

  # Method that iterates over a given Array of Sample objects and reports
  # every sample whose concatenated index1 and index2 was already seen on an
  # earlier sample.
  #
  # samples - Array of Sample objects.
  #
  # Returns an Array of found errors (tab separated Strings).
  def samples_check_index_combo(samples)
    seen = {}

    samples.each_with_object([]) do |entry, found|
      combo = "#{entry.index1}#{entry.index2}"

      if (earlier_id = seen[combo])
        found << ['Samples with same index combo', entry.id, earlier_id].join("\t")
      else
        seen[combo] = entry.id
      end
    end
  end

  # Method that iterates over a given Array of Sample objects and reports
  # every sample id that was already seen on an earlier sample.
  #
  # samples - Array of Sample objects.
  #
  # Returns an Array of found errors (tab separated Strings).
  def samples_check_uniq_id(samples)
    seen = Set.new

    samples.each_with_object([]) do |entry, found|
      # Set#add? returns nil when the id is already present.
      next if seen.add?(entry.id)

      found << ['Non-unique sample id', entry.id].join("\t")
    end
  end

  # Struct for holding sample information.
  #
  # id     - Sample id.
  # index1 - Index1 sequence.
  # index2 - Index2 sequence.
  #
  # Examples
  #
  #   Sample.new("test1", "atcg", "gcta")
  #   # => <Sample>
  #
  # Returns Sample object.
  Sample = Struct.new(:id, :index1, :index2)
end
data/lib/screen.rb ADDED
@@ -0,0 +1,39 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
+ # Module containing class methods for clearing and resetting a terminal screen.
25
module Screen
  class << self
    # Method that emits the console codes for homing the cursor and erasing
    # the display, which clears the screen.
    #
    # Returns nothing.
    def clear
      $stdout.print "\e[H\e[2J"
    end

    # Method that emits the console code for moving the cursor to the 1,1
    # coordinate.
    #
    # Returns nothing.
    def reset
      $stdout.print "\e[1;1H"
    end
  end
end
data/lib/status.rb ADDED
@@ -0,0 +1,101 @@
1
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
2
+ # #
3
+ # Copyright (C) 2014-2015 Martin Asser Hansen (mail@maasha.dk). #
4
+ # #
5
+ # This program is free software; you can redistribute it and/or #
6
+ # modify it under the terms of the GNU General Public License #
7
+ # as published by the Free Software Foundation; either version 2 #
8
+ # of the License, or (at your option) any later version. #
9
+ # #
10
+ # This program is distributed in the hope that it will be useful, #
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
13
+ # GNU General Public License for more details. #
14
+ # #
15
+ # You should have received a copy of the GNU General Public License #
16
+ # along with this program; if not, write to the Free Software #
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, #
18
+ # USA. #
19
+ # #
20
+ # http://www.gnu.org/copyleft/gpl.html #
21
+ # #
22
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
23
+
24
+ # Class containing methods to record demultiplexing status.
25
class Status
  attr_accessor :count, :match, :undetermined, :index1_bad_mean,
                :index2_bad_mean, :index1_bad_min, :index2_bad_min

  # Method to initialize a Status object, which contains the following instance
  # variables initialized to 0:
  #
  # @count           - Number of reads.
  # @match           - Number of reads found in index.
  # @undetermined    - Number of reads not found in index.
  # @index1_bad_mean - Number of reads dropped due to bad mean in index1.
  # @index2_bad_mean - Number of reads dropped due to bad mean in index2.
  # @index1_bad_min  - Number of reads dropped due to bad min in index1.
  # @index2_bad_min  - Number of reads dropped due to bad min in index2.
  #
  # The creation time is also recorded in @time_start for elapsed time
  # reporting.
  #
  # Examples
  #
  #   Status.new
  #   # => <Status>
  #
  # Returns a Status object.
  def initialize
    @count           = 0
    @match           = 0
    @undetermined    = 0
    @index1_bad_mean = 0
    @index2_bad_mean = 0
    @index1_bad_min  = 0
    @index2_bad_min  = 0
    @time_start      = Time.now
  end

  # Method to format a String from a Status object. This is done by adding the
  # relevant instance variables to a Hash and returning this as a YAML String.
  #
  # Returns a YAML String.
  def to_s
    { count:                @count,
      match:                @match,
      undetermined:         @undetermined,
      undetermined_percent: undetermined_percent,
      index1_bad_mean:      @index1_bad_mean,
      index2_bad_mean:      @index2_bad_mean,
      index1_bad_min:       @index1_bad_min,
      index2_bad_min:       @index2_bad_min,
      time:                 time }.to_yaml
  end

  # Method that calculates the percentage of undetermined reads.
  #
  # Returns a Float with the percentage of undetermined reads rounded to one
  # decimal, or 0.0 when no reads have been counted yet.
  def undetermined_percent
    # Guard against 0 / 0.0, which would yield NaN in the report.
    return 0.0 if @count.zero?

    (100 * @undetermined / @count.to_f).round(1)
  end

  # Method that calculates the elapsed time since the Status object was
  # created and formats it as HH:MM:SS. Hours are not wrapped, so runs longer
  # than a day report e.g. "25:00:00".
  #
  # Returns String with elapsed time.
  def time
    # Clamp at zero in case the wall clock was adjusted backwards.
    elapsed = [(Time.now - @time_start).to_i, 0].max

    hours, remainder = elapsed.divmod(3600)
    minutes, seconds = remainder.divmod(60)

    format('%02d:%02d:%02d', hours, minutes, seconds)
  end

  # Method to save the status, formatted as by #to_s, to a given file, which
  # is overwritten if it exists.
  #
  # file - String with path to the log file.
  #
  # Returns nothing.
  def save(file)
    File.open(file, 'w') do |ios|
      ios.puts to_s
    end
  end
end