viral_seq 1.0.6 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ module ViralSeq
7
7
  # @example join the paired-end sequences with an overlap of 100 bp
8
8
  # my_seqhashpair.join1(100)
9
9
  # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
- # my_seqhashpair.join1(:indiv)
10
+ # my_seqhashpair.join2(model: :indiv)
11
11
 
12
12
  class SeqHashPair
13
13
 
@@ -80,6 +80,12 @@ module ViralSeq
80
80
  alias_method :fa, :new_from_fasta
81
81
  end
82
82
 
83
+ # the size of nt sequence hash of the SeqHashPair object
84
+ # @return [Integer] size of nt sequence hash of the SeqHashPair object
85
# Number of entries in this object's nt sequence hash.
# @return [Integer] result of calling `size` on `dna_hash`
def size
  dna_hash.size
end
88
+
83
89
  # Pair-end join function for KNOWN overlap size.
84
90
  # @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
85
91
  # @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
@@ -104,17 +110,21 @@ module ViralSeq
104
110
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
105
111
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
106
112
  joined_seq = {}
107
- seq_pair_hash.each do |seq_name, seq_pair|
113
+ seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
108
114
  r1_seq = seq_pair[0]
109
115
  r2_seq = seq_pair[1]
110
116
  if overlap.zero?
111
- joined_seq[seq_name] = r1_seq + r2_seq
117
+ joined_sequence = r1_seq + r2_seq
112
118
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
113
- joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
119
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
114
120
  else
115
121
  next
116
122
  end
123
+ seq_names.each do |seq_name|
124
+ joined_seq[seq_name] = joined_sequence
125
+ end
117
126
  end
127
+
118
128
  joined_seq_hash = ViralSeq::SeqHash.new
119
129
  joined_seq_hash.dna_hash = joined_seq
120
130
  joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +149,7 @@ module ViralSeq
139
149
  # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
140
150
  # my_seqhashpair.join2.dna_hash
141
151
  # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
142
- # my_seqhashpair.join2(model :indiv).dna_hash
152
+ # my_seqhashpair.join2(model: :indiv).dna_hash
143
153
  # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
144
154
 
145
155
  def join2(model: :con, diff: 0.0)
@@ -207,7 +217,7 @@ module ViralSeq
207
217
  # {minimal overlap set to 4. }
208
218
  def overlap_matrix(sequence1, sequence2)
209
219
  min_overlap = 4
210
- max_overlap = [sequence1.size, sequence2.size].max
220
+ max_overlap = [sequence1.size, sequence2.size].min
211
221
  matrix_hash = {}
212
222
  (min_overlap..max_overlap).each do |overlap|
213
223
  matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
@@ -0,0 +1,305 @@
1
+ module ViralSeq
2
+
3
+ # Core functions for `tcs` pipeline
4
+
5
+ class TcsCore
6
+ class << self
7
+
8
+ # methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
9
+
10
# Calculate the TCS consensus cut-off from a fitted polynomial.
#
# The polynomial is chosen by `error_rate`:
#   * 0.005 <= error_rate < 0.015  -> first model
#   * 0     <= error_rate < 0.005  -> second model
#   * anything else                -> third model (linear above m = 8500)
#
# @param m [Numeric] value the cut-off is computed from
#   (presumably the maximum PID abundance -- confirm against callers)
# @param error_rate [Numeric] platform error rate, defaults to 0.02
# @return [Integer] rounded cut-off; any result below 3 is reported as 2
def calculate_cut_off(m, error_rate = 0.02)
  # Every model uses a fixed cut-off of 2 for m <= 10.
  return 2 if m <= 10

  estimate =
    case error_rate
    when 0.005...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0...0.005
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    else
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        0.0079 * m + 9.4869
      end
    end

  cut_off = estimate.round
  cut_off < 3 ? 2 : cut_off
end
41
+
42
+ # identify which file in the directory is R1 file, and which is R2 file based on file names
43
+ # input as directory (Dir object or a string of path)
44
+ # by default, .gz files will be unzipped.
45
+ # return as a hash of {r1_file: file1, r2_file: file2}
46
# Locate the R1 and R2 files in a directory from the tags in their names.
#
# A file counts as R1 (or R2) when the tag list returned by
# `parser_file_name` includes "R1" (or "R2"). If several files match,
# the last one listed wins.
#
# @param directory [String] directory to scan (non-hidden entries only)
# @param unzip [Boolean] when true each matching file is passed through
#   `unzip_r`; otherwise its path is returned as-is
# @return [Hash] `{ r1_file: path, r2_file: path }`; a value stays ""
#   when no matching file was found
def r1r2(directory, unzip = true)
  entries = Dir.chdir(directory) { Dir.glob "*" }
  resolve = ->(entry) { unzip ? unzip_r(directory, entry) : File.join(directory, entry) }

  found = { r1_file: "", r2_file: "" }
  entries.each do |entry|
    tags = parser_file_name(entry)[:tag]
    if tags.include? "R1"
      found[:r1_file] = resolve.call(entry)
    elsif tags.include? "R2"
      found[:r2_file] = resolve.call(entry)
    end
  end
  found
end # end of ViralSeq:TcsCore.r1r2
62
+
63
+ # sort directories containing multiple r1 and r2 files.
64
+ # use the library name (first string before "_") to separate libraries
65
+ # out_dir is the Dir object or string of the output directory, by default named as directory + "_sorted"
66
+ # return a hash as { with_both_r1_r2: [lib1, lib2, ...], missing_r1: [lib1, lib2, ...], missing_r2: [lib1, lib2, ...], error: [lib1, lib2, ...]}
67
+
68
# Move every file of `directory` into a per-library sub-directory of
# `out_dir`, then report which libraries have R1 and R2 files.
#
# The library name is the part of the file name before the first "_".
# Files are moved with `File.rename`, so `directory` is emptied.
#
# @param directory [String] directory holding the raw files
# @param out_dir [String] destination root, created when missing
# @return [Hash] `{ with_both_r1_r2: [...], missing_r1: [...],
#   missing_r2: [...], error: [...] }` listing library names
def sort_by_lib(directory, out_dir = directory + "_sorted")
  Dir.mkdir(out_dir) unless File.directory?(out_dir)

  Dir.chdir(directory) { Dir.glob("*") }.each do |file_name|
    source_path = File.join(directory, file_name)
    lib_dir = File.join(out_dir, file_name.split("_")[0])
    Dir.mkdir(lib_dir) unless File.directory?(lib_dir)
    File.rename(source_path, File.join(lib_dir, file_name))
  end

  summary = { with_both_r1_r2: [], missing_r1: [], missing_r2: [], error: [] }

  Dir.chdir(out_dir) { Dir.glob('*') }.each do |lib|
    pair = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
    has_r1 = !pair[:r1_file].empty?
    has_r2 = !pair[:r2_file].empty?

    bucket =
      if has_r1 && has_r2
        :with_both_r1_r2
      elsif has_r2
        :missing_r1
      elsif has_r1
        :missing_r2
      else
        :error # neither an R1 nor an R2 file was recognised
      end
    summary[bucket] << lib
  end

  summary
end
103
+
104
+ # sort array of file names to determine if there is potential errors
105
+ # input name_array array of file names
106
+ # output hash { }
107
+ # need to change for each file name have an error code. and a bool to show if all pass
108
# Check a list of file names for problems before running the pipeline.
#
# Each name is first screened on its own (extension, R1/R2 tags), then
# the surviving names are grouped by library name and each library must
# hold exactly one R1 file and one R2 file.
#
# @param name_array [Array<String>] file names to check
# @return [Hash] with keys
#   * `:errors`       - Hash of error category => offending names
#   * `:all_pass`     - true when every input name passed
#   * `:passed_names` - names belonging to valid libraries
#   * `:passed_libs`  - library name => its file names
def validate_file_name(name_array)
  errors = {
    file_type_error: [],
    missing_r1_file: [],
    missing_r2_file: [],
    extra_r1_r2_file: [],
    no_region_tag: [],
    multiple_region_tag: []
  }

  # Per-name screening.
  tagged_names = []
  name_array.each do |name|
    tags = parser_file_name(name)[:tag]
    r1_hits = tags.count("R1")
    r2_hits = tags.count("R2")

    if name !~ /\.fastq\Z|\.fastq\.gz\Z/
      errors[:file_type_error] << name
    elsif r1_hits == 0 && r2_hits == 0
      errors[:no_region_tag] << name
    elsif (r1_hits > 0 && r2_hits > 0) || r1_hits > 1 || r2_hits > 1
      errors[:multiple_region_tag] << name
    else
      tagged_names << name
    end
  end

  # Per-library screening.
  libs = tagged_names.group_by { |name| parser_file_name(name)[:libname] }

  passed_libs = {}
  libs.each do |libname, files|
    tag_lists = files.map { |name| parser_file_name(name)[:tag] }
    r1_total = tag_lists.count { |tags| tags.include?("R1") }
    r2_total = tag_lists.count { |tags| !tags.include?("R1") && tags.include?("R2") }

    if r1_total > 1 || r2_total > 1
      errors[:extra_r1_r2_file].concat(files)
    elsif r1_total.zero?
      errors[:missing_r1_file].concat(files)
    elsif r2_total.zero?
      errors[:missing_r2_file].concat(files)
    else
      passed_libs[libname] = files
    end
  end

  passed_names = passed_libs.values.reduce([], :+)
  all_pass = passed_names.size >= name_array.size

  { errors: errors, all_pass: all_pass, passed_names: passed_names, passed_libs: passed_libs }
end
180
+
181
+ # filter r1 raw sequences for non-specific primers.
182
+ # input r1_sh, SeqHash obj.
183
+ # return a Hash { r1_passed_seq: filtered Hash of sequence name and sequence, forward_starting_number: total length of the forward primer }
184
+
185
# Keep only the R1 reads that pass `general_filter` and whose primer
# region matches the forward primer.
#
# The forward primer may start with a run of N followed by the primer
# bases; without such a run the whole string is treated as the primer.
#
# @param r1_sh [#dna_hash] object whose `dna_hash` maps name => sequence
# @param forward_primer [String] forward primer sequence
# @return [Hash] `{ r1_passed_seq: Hash, forward_starting_number: Integer }`
#   where the passed hash is keyed by `remove_tag(name)` and the number
#   is the length of the N run plus the length of the primer bases
def filter_r1(r1_sh, forward_primer)
  primer_parts = forward_primer.match(/(N+)(\w+)$/)
  if primer_parts
    forward_n = primer_parts[1].size
    forward_bio_primer = primer_parts[2]
  else
    forward_n = 0
    forward_bio_primer = forward_primer
  end
  forward_bio_primer_size = forward_bio_primer.size
  forward_primer_ref = forward_bio_primer.nt_parser

  r1_passed_seq = {}
  r1_sh.dna_hash.each do |name, seq|
    next unless general_filter seq
    # the primer is expected right after the leading N positions
    next unless seq[forward_n, forward_bio_primer_size] =~ forward_primer_ref

    r1_passed_seq[remove_tag(name)] = seq
  end

  { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_n + forward_bio_primer_size }
end # end of filter_r1
216
+
217
+ # filter r2 raw sequences for non-specific primers.
218
+ # input r2_sh, SeqHash obj.
219
+ # return filtered Hash of sequence name and seq pair, as well as the length of PID.
220
# Keep only the R2 reads that pass `general_filter` and whose primer
# region matches the cDNA primer.
#
# The cDNA primer must contain a run of N (its length is reported as the
# PID length) followed by the primer bases.
#
# @param r2_sh [#dna_hash] object whose `dna_hash` maps name => sequence
# @param cdna_primer [String] cDNA primer sequence including the N run
# @return [Hash] `{ r2_passed_seq: Hash, pid_length: Integer,
#   reverse_starting_number: Integer }` where the passed hash is keyed by
#   `remove_tag(name)` and the starting number is PID length plus primer
#   length
# @raise [ArgumentError] when `cdna_primer` has no N run followed by
#   primer bases (this used to surface as NoMethodError on nil)
def filter_r2(r2_sh, cdna_primer)
  primer_parts = cdna_primer.match(/(N+)(\w+)$/)
  unless primer_parts
    raise ArgumentError,
          "cDNA primer #{cdna_primer.inspect} must contain a run of N (PID) followed by the primer sequence"
  end

  r2_raw = r2_sh.dna_hash
  pid_length = primer_parts[1].size
  cdna_bio_primer = primer_parts[2]
  cdna_bio_primer_size = cdna_bio_primer.size
  reverse_starting_number = pid_length + cdna_bio_primer_size
  cdna_primer_ref = cdna_bio_primer.nt_parser

  r2_passed_seq = {}
  r2_raw.each do |name, seq|
    next unless general_filter seq
    # the primer is expected right after the PID positions
    next unless seq[pid_length, cdna_bio_primer_size] =~ cdna_primer_ref

    r2_passed_seq[remove_tag(name)] = seq
  end

  { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
end # end of filter_r2
245
+
246
+
247
+
248
+ # puts error message in the log file handler, and abort with the same infor
249
+
250
# Write a time-stamped message to the log, close the log, then abort the
# process with the same message.
#
# @param log [#puts, #close] open log handle; it is closed here
# @param infor [String] message to log and to abort with
# @return [void] never returns, `abort` terminates the process
def log_and_abort(log, infor)
  stamped_line = Time.now.to_s + "\t" + infor
  log.puts stamped_line
  log.close
  abort infor.red.bold
end
255
+
256
+ private
257
+
258
# Decompress a gzipped read file in place and return the resulting path.
#
# Only names ending in ".gz" are decompressed. The previous pattern
# `/.gz/` was unescaped and unanchored, so it also matched names such as
# "logz_R1.fastq". `gzip` is invoked with an argument array, so the path
# is never interpreted by a shell (spaces and metacharacters are safe).
#
# @param indir [String] directory containing the file
# @param f [String] file name inside `indir`
# @return [String] path of the file, without the ".gz" suffix when it
#   was decompressed
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  return r_file unless f.end_with?(".gz")

  # `gzip -d` replaces the archive with the decompressed file.
  IO.popen(["gzip", "-d", r_file]) { |io| io.read }
  File.join(indir, f.sub(/\.gz\z/, ""))
end
267
+
268
# Split a file name into a library name and upper-cased tags.
#
# Only the part before the first "." is used; it is split on "_".
#   "lib1_R1.fastq.gz" -> { libname: "lib1", tag: ["R1"] }
#   "r1.fastq"         -> { libname: "lib",  tag: ["R1"] }
# Names whose stem is empty ("", ".DS_Store", "_.fastq") yield
# `{ libname: "lib", tag: [] }`; they used to raise NoMethodError on nil.
#
# @param file_name [String] file name (not a path)
# @return [Hash] `{ libname: String, tag: Array<String> }`
def parser_file_name(file_name)
  # `first.to_s` covers "" whose split result is an empty array.
  parts = file_name.split(".").first.to_s.split("_")

  if parts.size <= 1
    { libname: "lib", tag: parts.map(&:upcase) }
  else
    { libname: parts.first, tag: parts.drop(1).map(&:upcase) }
  end
end
279
+
280
# Decide whether a raw read is clean enough to keep.
#
# A read is rejected when it has an "N" anywhere except the first and
# last position, or when it contains 11 consecutive A or 11 consecutive
# T (treated as adaptor sequence).
#
# @param seq [String] nucleotide sequence
# @return [Boolean] true when the read passes, false otherwise
def general_filter(seq)
  interior = seq[1..-2] # the first and last base are allowed to be ambiguous
  return false if interior =~ /N/

  [/A{11}/, /T{11}/].none? { |homopolymer| seq =~ homopolymer }
end
291
+
292
+ # remove region info tags from the raw MiSeq sequences.
293
# Strip the trailing tag from a raw sequence name.
#
# When the name contains whitespace, everything from the first
# whitespace character on is dropped; otherwise the last two characters
# are dropped.
#
# @param seq_name [String] raw sequence name
# @return [String] the shortened name
def remove_tag(seq_name)
  first_space = seq_name.index(/\s/)
  first_space ? seq_name[0, first_space] : seq_name[0..-3]
end
300
+
301
+ end # end of class << self
302
+
303
+ end # end of TcsCore module
304
+
305
+ end # end of main module
@@ -0,0 +1,178 @@
1
+ module ViralSeq
2
+ class TcsJson
3
+ class << self
4
+
5
# Interactively build the parameter hash for the `tcs` pipeline.
#
# Prompts on stdout and reads answers with `gets`. For each sequenced
# region it collects the primers, the majority cut-off and, when
# end-join is requested, the end-join, QC and trimming options. The
# resulting hash is printed as JSON and optionally written to a file.
#
# @return [Hash] the collected parameters, only when the user chooses to
#   run the pipeline now; otherwise the process is aborted with a hint
def generate
  puts '-'*58
  puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
  puts '-'*58 + "\n"

  param = {}

  puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
  print '> '
  param[:raw_sequence_dir] = gets.chomp.rstrip

  puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
  print '> '
  input_error = gets.chomp.rstrip.to_f
  # blank or non-numeric input converts to 0.0 and selects the default
  if input_error == 0.0
    param[:platform_error_rate] = 0.02
  else
    param[:platform_error_rate] = input_error
  end

  param[:primer_pairs] = []

  loop do
    data = {}
    puts "Enter the name for the sequenced region: "
    print '> '
    data[:region] = gets.chomp.rstrip

    puts "Enter the #{"cDNA".red.bold} primer sequence: "
    print '> '
    data[:cdna] = gets.chomp.rstrip

    puts "Enter the #{"forward".blue.bold} primer sequence: "
    print '> '
    data[:forward] = gets.chomp.rstrip

    puts "Enter supermajority cut-off (0.5 - 1.0). Default Simple Majority"
    print '> '
    mj = gets.chomp.rstrip.to_f
    # 0 stands for "simple majority"
    if (0.5..1.0).include?(mj)
      data[:majority] = mj
    else
      data[:majority] = 0
    end

    print "Need end-join? Y/N \n> "
    ej = gets.chomp.rstrip
    if ej =~ /y|yes/i
      data[:end_join] = true

      # The literal below spans several lines on purpose; its line breaks
      # are part of the prompt text.
      print "End-join option? Choose from (1-4):\n
1: simple join, no overlap
2: known overlap \n
3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
> "
      # The answer stays a String so it can be colourised in the retry
      # message; converting it to Integer here made a second invalid
      # answer fail with NoMethodError (Integer has no #red).
      ej_option = gets.chomp.rstrip
      until [1,2,3,4].include?(ej_option.to_i)
        puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
        ej_option = gets.chomp.rstrip
      end

      case ej_option.to_i
      when 1
        data[:end_join_option] = 1
        data[:overlap] = 0
      when 2
        # NOTE(review): option 2 is stored as end_join_option 1 plus an
        # explicit overlap -- kept as in the original; confirm with the consumer.
        data[:end_join_option] = 1
        print "overlap bases: \n> "
        ol = gets.chomp.rstrip.to_i
        data[:overlap] = ol
      when 3
        data[:end_join_option] = 3
      when 4
        data[:end_join_option] = 4
      end

      print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
      qc = gets.chomp.rstrip
      if qc =~ /y|yes/i
        data[:TCS_QC] = true

        data[:ref_genome] = get_ref

        print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
        data[:ref_start] = gets.chomp.rstrip.to_i

        print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
        data[:ref_end] = gets.chomp.rstrip.to_i

        print "allow indels? (default as yes) Y/N \n> "
        indel = gets.chomp.rstrip
        if indel =~ /n|no/i
          data[:indel] = false
        else
          data[:indel] = true
        end
      else
        data[:TCS_QC] = false
      end

      print "Need trimming to a reference genome? Y/N \n> "
      trim_option = gets.chomp.rstrip
      if trim_option =~ /y|yes/i
        data[:trim] = true
        data[:trim_ref] = get_ref

        print "reference 5'end ref position \n> "
        data[:trim_ref_start] = gets.chomp.rstrip.to_i

        print "reference 3'end ref position \n> "
        data[:trim_ref_end] = gets.chomp.rstrip.to_i
      else
        data[:trim] = false
      end

    else
      data[:end_join] = false
    end

    param[:primer_pairs] << data
    print "Do you wish to conintue? Y/N \n> "
    continue_sig = gets.chomp.rstrip
    break unless continue_sig =~ /y|yes/i
  end

  puts "\nYour JSON string is:"
  puts JSON.pretty_generate(param)

  print "\nDo you wish to save it as a file? Y/N \n> "
  save_option = gets.chomp.rstrip

  if save_option =~ /y|yes/i
    print "Path to save JSON file:\n> "
    path = gets.chomp.rstrip
    File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
  end

  print "\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "

  rsp = gets.chomp.rstrip
  if rsp =~ /y/i
    return param
  else
    abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
  end
end
154
+
155
+ private
156
# Ask the user to pick a reference genome and keep asking until the
# answer is 1, 2 or 3.
#
# The retry message now names the reference genome; it previously
# repeated the end-join wording from another prompt.
#
# @return [Symbol] `:HXB2`, `:NL43` or `:MAC239`
def get_ref
  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "

  # kept as a String so it can be colourised in the retry message
  ref_option = gets.chomp.rstrip
  until [1,2,3].include?(ref_option.to_i)
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end

  case ref_option.to_i
  when 1 then :HXB2
  when 2 then :NL43
  when 3 then :MAC239
  end
end
176
+ end
177
+ end # end TcsJson
178
+ end # end main module