viral_seq 1.0.7 → 1.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
4
- data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
3
+ metadata.gz: 8d79f0676fb23cdc25fb3b0161b5665ecfe082e2401f40a1de3a782d9fb3d52a
4
+ data.tar.gz: 01a09f4cfca1274bfb1b870cdad62614def01fdaded727ce9100eec377962401
5
5
  SHA512:
6
- metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
7
- data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85
6
+ metadata.gz: 042f11da57209003bc84b0f7c764a9953f0ca6c1fcd00a5e943be531162bc06c9d54e3c4ceb1305c91fe5795894e3da394a196899a4f1df83d97b826c5582411
7
+ data.tar.gz: b2b2bfb9a8e6d023f610b19311a1a1ea331fbaa804cf20aebc3a34f6b049240ec43fe10e92b9f00feef3fd78e922fe0ed39281146693358998020036b9553504
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.0.7)
4
+ viral_seq (1.0.8)
5
5
  colorize (~> 0.1)
6
6
  muscle_bio (~> 0.4)
7
7
 
data/README.md CHANGED
@@ -51,6 +51,17 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
51
51
 
52
52
  ## Updates
53
53
 
54
+ Version 1.0.8-02282020:
55
+
56
+ 1. TCS pipeline added as executable.
57
+ tcs - main TCS pipeline script.
58
+ tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
59
+
60
+ 2. Methods added:
61
+ ViralSeq::SeqHash#trim
62
+
63
+ 3. Bug fix for several methods.
64
+
54
65
  Version 1.0.7-01282020:
55
66
 
56
67
  1. Several methods added, including
@@ -1,5 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
3
23
  require 'viral_seq'
4
24
  require 'csv'
5
25
  require 'optparse'
data/bin/tcs ADDED
@@ -0,0 +1,528 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # TCS pipeline for Primer ID sequencing data analysis.
4
+
5
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in
15
+ # all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ # THE SOFTWARE.
24
+
25
+ # Use JSON file as the run param
26
+ # run tcs_json_generator.rb to generate param json file.
27
+
28
+ require 'viral_seq'
29
+ require 'json'
30
+ require 'colorize'
31
+
32
+ # updated the ViralSeq module. Push with the new version.
33
+
34
module ViralSeq
  class SeqHash
    # Build a SeqHash from a FASTQ file.
    #
    # Records are read four lines at a time (header, sequence, separator,
    # quality). The leading "@" of each header is replaced with ">", and the
    # resulting name keys both the sequence and the quality string.
    # Parsing is positional, so a quality line that happens to start with "@"
    # is never treated as a header. A trailing blank line or a truncated final
    # record is ignored instead of producing an entry with a nil value.
    #
    # @param fastq_file [String] path to the FASTQ file
    # @return [ViralSeq::SeqHash] object with dna_hash, qc_hash, title and file set
    def self.new_from_fastq(fastq_file)
      sequence_hash = {}
      quality_hash = {}

      # Stream the file rather than loading every line into memory at once.
      File.foreach(fastq_file).each_slice(4) do |header, sequence, _separator, quality|
        next if quality.nil? || header.strip.empty?

        # Only the first character marks a header; a later "@" belongs to the name.
        name = header.chomp.sub(/\A@/, '>')
        sequence_hash[name] = sequence.chomp
        quality_hash[name] = quality.chomp
      end

      seq_hash = ViralSeq::SeqHash.new
      seq_hash.dna_hash = sequence_hash
      seq_hash.qc_hash = quality_hash
      seq_hash.title = File.basename(fastq_file, ".*")
      seq_hash.file = fastq_file
      seq_hash
    end # end of ::new_from_fastq

    class << self
      alias_method :fq, :new_from_fastq
    end
  end
end
74
+
75
module ViralSeq
  class SeqHash
    # Clip every DNA sequence to the given reference coordinates.
    #
    # Identical sequences are clipped once: the result of `dna_hash.uniq_hash`
    # is walked as (sequence, names) pairs and the clipped sequence is stored
    # under each of those names.
    #
    # @param start_nt start position(s), forwarded to Sequence#sequence_clip
    # @param end_nt end position(s), forwarded to Sequence#sequence_clip
    # @param ref_option [Symbol] reference genome name, forwarded unchanged
    # @param path_to_muscle forwarded unchanged to Sequence#sequence_clip
    # @return [ViralSeq::SeqHash] a duplicate of self whose dna_hash holds the clipped sequences
    def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
      clipped_by_name = dna_hash.dup.uniq_hash.each_with_object({}) do |(sequence, names), clipped|
        clip = ViralSeq::Sequence.new('', sequence)
                                 .sequence_clip(start_nt, end_nt, ref_option, path_to_muscle)
                                 .dna
        names.each { |name| clipped[name] = clip }
      end

      # The receiver is left untouched; only the duplicate gets the new hash.
      dup.tap { |copy| copy.dna_hash = clipped_by_name }
    end
  end
end
93
+
94
+ # end of additional methods. Delete before publishing.
95
+
96
+ # calculate consensus cutoff
97
+
98
# Consensus cut-off for a given abundance value and platform error rate.
#
# One of three fitted curves is chosen by error rate:
#   0.005 <= rate < 0.015, 0 <= rate < 0.005, and everything else
# (which includes the default 0.02). The curve value is rounded to the
# nearest integer, and any result below 3 is reported as 2.
#
# @param m [Numeric] abundance value fed to the curve
# @param error_rate [Numeric] platform error rate, 0.02 by default
# @return [Integer] the cut-off; 2 whenever m <= 10
def calculate_cut_off(m, error_rate = 0.02)
  # All three models share the same floor for small abundances.
  return 2 if m <= 10

  estimate =
    case error_rate
    when 0.005...0.015
      1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    when 0...0.005
      -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    else
      if m <= 8500
        -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
      else
        # Above 8500 the polynomial is replaced by a straight line.
        0.0079 * m + 9.4869
      end
    end

  cutoff = estimate.round
  cutoff < 3 ? 2 : cutoff
end
129
+
130
+
131
+ TCS_VERSION = "2.0.0"
132
+
133
+ puts "\n" + '-'*58
134
+ puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
135
+ puts '-'*58 + "\n"
136
+
137
+ unless ARGV[0]
138
+ raise "No JSON param file found. Script terminated."
139
+ end
140
+
141
+ params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
142
+
143
+ indir = params[:raw_sequence_dir]
144
+
145
+ unless File.exist?(indir)
146
+ raise "No input sequence directory found. Script terminated."
147
+ end
148
+
149
+ libname = File.basename(indir)
150
+
151
+ # obtain R1 and R2 file path
152
+ files = []
153
+ Dir.chdir(indir) do
154
+ files = Dir.glob("*")
155
+ end
156
+
157
+ if files.empty?
158
+ raise "Input dir does not contain files. Script terminated."
159
+ end
160
+
161
+ r1_f = ""
162
+ r2_f = ""
163
+
164
# Resolve a raw read file inside +indir+, decompressing it first when its
# name ends in ".gz".
#
# Only a real ".gz" suffix triggers decompression; names that merely contain
# "gz" somewhere (e.g. "my.gzip_R1.fastq") are returned untouched.
# The return value does not depend on whether gzip succeeded.
#
# @param indir [String] directory that holds the file
# @param f [String] file name inside indir
# @return [String] path to the file, without the ".gz" suffix if it had one
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  return r_file unless f.end_with?(".gz")

  # Argument-list form: no shell is involved, so spaces or metacharacters in
  # the path cannot break or alter the command.
  system("gzip", "-d", r_file)
  File.join(indir, f.sub(/\.gz\z/, ""))
end
174
+ runtime_log_file = File.join(indir,"runtime.log")
175
+ log = File.open(runtime_log_file, "w")
176
+ log.puts "TSC pipeline Version " + TCS_VERSION.to_s
177
+ log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
178
+ log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
179
+
180
+
181
+ files.each do |f|
182
+ t = f.split("_")
183
+ if t.size == 1
184
+ tag = f
185
+ else
186
+ tag = f.split("_")[1..-1].join("_")
187
+ end
188
+
189
+ if tag =~ /r1/i
190
+ r1_f = unzip_r(indir, f)
191
+ elsif tag =~ /r2/i
192
+ r2_f = unzip_r(indir, f)
193
+ end
194
+ end
195
+
196
+
197
+ unless File.exist?(r1_f)
198
+ log.puts "R1 file not found. Script terminated."
199
+ raise "R1 file not found. Script terminated."
200
+ end
201
+
202
+ unless File.exist?(r2_f)
203
+ log.puts "R2 file not found. Script terminated."
204
+ raise "R2 file not found. Script terminated."
205
+ end
206
+
207
+ r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
208
+ r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
209
+
210
+ raw_sequence_number = r1_fastq_sh.size
211
+ log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
212
+
213
+ if params[:platform_error_rate]
214
+ error_rate = params[:platform_error_rate]
215
+ else
216
+ error_rate = 0.02
217
+ end
218
+
219
+ primers = params[:primer_pairs]
220
+ if primers.empty?
221
+ log.puts "No primer information. Script terminated."
222
+ raise "No primer information. Script terminated."
223
+ end
224
+
225
+ primers.each do |primer|
226
+ summary_json = {}
227
+ summary_json[:tcs_version] = TCS_VERSION
228
+ summary_json[:viralseq_version] = ViralSeq::VERSION
229
+ summary_json[:runtime] = Time.now.to_s
230
+
231
+ primer[:region] ? region = primer[:region] : region = "region"
232
+ summary_json[:primer_set_name] = region
233
+
234
+ cdna_primer = primer[:cdna]
235
+ forward_primer = primer[:forward]
236
+ unless cdna_primer
237
+ log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
238
+ end
239
+ unless forward_primer
240
+ log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
241
+ end
242
+ summary_json[:cdan_primer] = cdna_primer
243
+ summary_json[:forward_primer] = forward_primer
244
+
245
+ primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0.5
246
+ summary_json[:majority_cut_off] = majority_cut_off
247
+
248
+ summary_json[:total_raw_sequence] = raw_sequence_number
249
+
250
+ log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
251
+
252
+ r1_raw = r1_fastq_sh.dna_hash
253
+ r2_raw = r2_fastq_sh.dna_hash
254
+
255
+ log.puts Time.now.to_s + "\t" + "filtering R1..."
256
+ # obtain biological forward primer sequence
257
+ if forward_primer.match(/(N+)(\w+)$/)
258
+ forward_n = $1.size
259
+ forward_bio_primer = $2
260
+ else
261
+ forward_n = 0
262
+ forward_bio_primer = forward_primer
263
+ end
264
+ forward_bio_primer_size = forward_bio_primer.size
265
+ forward_starting_number = forward_n + forward_bio_primer_size
266
+
267
+ # filter R1 sequences with forward primers.
268
+ forward_primer_ref = forward_bio_primer.nt_parser
269
+ r1_passed_seq = {}
270
+ r1_raw.each do |name,seq|
271
+ next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
272
+ next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
273
+ next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
274
+
275
+ primer_region_seq = seq[forward_n, forward_bio_primer_size]
276
+ if primer_region_seq =~ forward_primer_ref
277
+ r1_passed_seq[name.split("\s")[0]] = seq
278
+ end
279
+ end
280
+ log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
281
+
282
+ summary_json[:r1_filtered_raw] = r1_passed_seq.size
283
+
284
+ log.puts Time.now.to_s + "\t" + "filtering R2..."
285
+ # obtain biological reverse primer sequence
286
+ cdna_primer.match(/(N+)(\w+)$/)
287
+ pid_length = $1.size
288
+ cdna_bio_primer = $2
289
+ cdna_bio_primer_size = cdna_bio_primer.size
290
+ reverse_starting_number = pid_length + cdna_bio_primer_size
291
+
292
+ # filter R2 sequences with cDNA primers.
293
+ cdna_primer_ref = cdna_bio_primer.nt_parser
294
+ r2_passed_seq = {}
295
+ r2_raw.each do |name, seq|
296
+ next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
297
+ next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
298
+ next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
299
+
300
+ primer_region_seq = seq[pid_length, cdna_bio_primer_size]
301
+ if primer_region_seq =~ cdna_primer_ref
302
+ r2_passed_seq[name.split("\s")[0]] = seq
303
+ end
304
+ end
305
+ log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
306
+ summary_json[:r2_filtered_raw] = r2_passed_seq.size
307
+
308
+ # pair-end
309
+ log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
310
+ id = {} # hash for :sequence_tag => primer_id
311
+ bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
312
+ bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
313
+ common_keys = r1_passed_seq.keys & r2_passed_seq.keys
314
+ paired_seq_number = common_keys.size
315
+ log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
316
+ summary_json[:paired_raw_sequence] = paired_seq_number
317
+
318
+ common_keys.each do |seqtag|
319
+ r1_seq = r1_passed_seq[seqtag]
320
+ r2_seq = r2_passed_seq[seqtag]
321
+ pid = r2_seq[0, pid_length]
322
+ id[seqtag] = pid
323
+ bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
324
+ bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
325
+ end
326
+
327
+ # TCS cut-off
328
+ log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
329
+
330
+ primer_id_list = id.values
331
+ primer_id_count = primer_id_list.count_freq
332
+ primer_id_dis = primer_id_count.values.count_freq
333
+
334
+ # calculate distinct_to_raw
335
+ distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
336
+ summary_json[:distinct_to_raw] = distinct_to_raw
337
+
338
+ if primer_id_dis.keys.size < 5
339
+ log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
340
+ next
341
+ end
342
+
343
+ max_id = primer_id_dis.keys.sort[-5..-1].mean
344
+ consensus_cutoff = calculate_cut_off(max_id,error_rate)
345
+ log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
346
+ summary_json[:consensus_cutoff] = consensus_cutoff
347
+ summary_json[:length_of_pid] = pid_length
348
+
349
+ log.puts Time.now.to_s + "\t" + "Creating consensus..."
350
+
351
+ # Primer ID over the cut-off
352
+ primer_id_count_over_n = []
353
+ primer_id_count.each do |primer_id,count|
354
+ primer_id_count_over_n << primer_id if count > consensus_cutoff
355
+ end
356
+ pid_to_process = primer_id_count_over_n.size
357
+ log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
358
+ summary_json[:total_tcs_with_ambiguities] = pid_to_process
359
+
360
+ # setup output path
361
+ out_dir_set = File.join(indir, region)
362
+ Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
363
+ out_dir_consensus = File.join(out_dir_set, "consensus")
364
+ Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
365
+
366
+ outfile_r1 = File.join(out_dir_consensus, 'r1.txt')
367
+ outfile_r2 = File.join(out_dir_consensus, 'r2.txt')
368
+ outfile_log = File.join(out_dir_set, 'log.json')
369
+
370
+ # create TCS
371
+
372
+ pid_seqtag_hash = {}
373
+ id.each do |name, pid|
374
+ if pid_seqtag_hash[pid]
375
+ pid_seqtag_hash[pid] << name
376
+ else
377
+ pid_seqtag_hash[pid] = []
378
+ pid_seqtag_hash[pid] << name
379
+ end
380
+ end
381
+
382
+ consensus = {}
383
+ r1_temp = {}
384
+ r2_temp = {}
385
+ m = 0
386
+ primer_id_count_over_n.each do |primer_id|
387
+ m += 1
388
+ log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
389
+ seq_with_same_primer_id = pid_seqtag_hash[primer_id]
390
+ r1_sub_seq = []
391
+ r2_sub_seq = []
392
+ seq_with_same_primer_id.each do |seq_name|
393
+ r1_sub_seq << bio_r1[seq_name]
394
+ r2_sub_seq << bio_r2[seq_name]
395
+ end
396
+
397
+ #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
398
+ consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
399
+ r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
400
+ r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
401
+ next if r1_consensus =~ /[^ATCG]/
402
+ next if r2_consensus =~ /[^ATCG]/
403
+
404
+ # reverse complement sequence of the R2 region
405
+ r2_consensus = r2_consensus.rc
406
+ consensus[consensus_name] = [r1_consensus, r2_consensus]
407
+ r1_temp[consensus_name] = r1_consensus
408
+ r2_temp[consensus_name] = r2_consensus
409
+ end
410
+ r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
411
+ r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
412
+
413
+ # filter consensus sequences for residual offspring PIDs
414
+ consensus_filtered = {}
415
+ consensus_number_temp = consensus.size
416
+ max_pid_comb = 4**pid_length
417
+ if consensus_number_temp < 0.003*max_pid_comb
418
+ log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
419
+ r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
420
+ r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
421
+ common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
422
+ common_pid.each do |pid|
423
+ consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
424
+ end
425
+ else
426
+ consensus_filtered = consensus
427
+ end
428
+ n_con = consensus_filtered.size
429
+ log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
430
+ summary_json[:total_tcs] = n_con
431
+ summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
432
+
433
+ log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
434
+ # r1_file output
435
+ f1 = File.open(outfile_r1, 'w')
436
+ f2 = File.open(outfile_r2, 'w')
437
+ primer_id_in_use = {}
438
+ r1_seq_length = consensus_filtered.values[0][0].size
439
+ r2_seq_length = consensus_filtered.values[0][1].size
440
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
441
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
442
+ consensus_filtered.each do |seq_name,seq|
443
+ f1.print seq_name + "_r1\n" + seq[0] + "\n"
444
+ f2.print seq_name + "_r2\n" + seq[1] + "\n"
445
+ primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
446
+ end
447
+ f1.close
448
+ f2.close
449
+
450
+ out_pid_json = File.join(out_dir_set, 'primer_id.json')
451
+ pid_json = {}
452
+ pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
453
+ pid_json[:primer_id_distribution] = Hash[*(primer_id_dis.sort_by{|k,v| k}.flatten)]
454
+ pid_json[:primer_id_frequency] = Hash[*(primer_id_count.sort_by {|k, v| [-v,k]}.flatten)]
455
+ File.open(out_pid_json, 'w') do |f|
456
+ f.puts JSON.pretty_generate(pid_json)
457
+ end
458
+
459
+ if primer[:end_join]
460
+ log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
461
+ shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
462
+ case primer[:end_join_option]
463
+ when 1
464
+ joined_sh = shp.join1(primer[:overlap])
465
+ when 3
466
+ joined_sh = shp.join2
467
+ when 4
468
+ joined_sh = shp.join2(model: :indiv)
469
+ end
470
+ log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
471
+ summary_json[:combined_tcs] = joined_sh.size
472
+ else
473
+ File.open(outfile_log, "w") do |f|
474
+ f.puts JSON.pretty_generate(summary_json)
475
+ end
476
+ next
477
+ end
478
+
479
+ if primer[:TCS_QC]
480
+ ref_start = primer[:ref_start]
481
+ ref_end = primer[:ref_end]
482
+ ref_genome = primer[:ref_genome].to_sym
483
+ indel = primer[:indel]
484
+ if ref_start == 0
485
+ ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
486
+ end
487
+ if ref_end == 0
488
+ ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
489
+ end
490
+ if primer[:end_join_option] == 1 and primer[:overlap] == 0
491
+ r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
492
+ r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
493
+ r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
494
+ r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
495
+ new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
496
+ new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
497
+ joined_seq = {}
498
+ new_r1_seq.each do |seq_name, seq|
499
+ next unless seq
500
+ next unless new_r2_seq[seq_name]
501
+ joined_seq[seq_name] = seq + new_r2_seq[seq_name]
502
+ end
503
+ joined_sh = ViralSeq::SeqHash.new(joined_seq)
504
+ else
505
+ joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
506
+ end
507
+ log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
508
+ summary_json[:combined_tcs_after_qc] = joined_sh.size
509
+ if primer[:trim]
510
+ trim_start = primer[:trim_ref_start]
511
+ trim_end = primer[:trim_ref_end]
512
+ trim_ref = primer[:trim_ref].to_sym
513
+ joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
514
+ end
515
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.txt"))
516
+ end
517
+
518
+ File.open(outfile_log, "w") do |f|
519
+ f.puts JSON.pretty_generate(summary_json)
520
+ end
521
+ end
522
+
523
+ log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
524
+ File.unlink(r1_f)
525
+ File.unlink(r2_f)
526
+ log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly exercuted."
527
+ log.close
528
+ puts "DONE!"
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # TCS pipeline JSON params generator.
4
+
5
+ require 'colorize'
6
+ require 'json'
7
+
8
# Ask on the console which reference genome to use.
#
# Prints the three choices, then keeps prompting until the reply converts
# (via to_i) to 1, 2 or 3.
#
# @return [Symbol] :HXB2, :NL43 or :MAC239
def get_ref
  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "

  references = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }
  ref_option = gets.chomp.rstrip
  until references.key?(ref_option.to_i)
    # Keep the reply as typed so the message echoes it back verbatim
    # instead of showing 0 for non-numeric input.
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end
  references[ref_option.to_i]
end
28
+
29
+ TCS_VERSION = "2.0.0"
30
+
31
+ puts "\n" + '-'*58
32
+ puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
33
+ puts '-'*58 + "\n"
34
+
35
+ param = {}
36
+
37
+ puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
38
+ print '> '
39
+ param[:raw_sequence_dir] = gets.chomp.rstrip
40
+
41
+ puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
42
+ print '> '
43
+ input_error = gets.chomp.rstrip.to_f
44
+ if input_error == 0.0
45
+ param[:platform_error_rate] = 0.02
46
+ else
47
+ param[:platform_error_rate] = input_error
48
+ end
49
+
50
+ param[:primer_pairs] = []
51
+ continue = true
52
+ while continue
53
+ data = {}
54
+ puts "Enter the name for the sequenced region: "
55
+ print '> '
56
+ data[:region] = gets.chomp.rstrip
57
+
58
+ puts "Enter the #{"cDNA".red.bold} primer sequence: "
59
+ print '> '
60
+ data[:cdna] = gets.chomp.rstrip
61
+
62
+ puts "Enter the #{"forward".blue.bold} primer sequence: "
63
+ print '> '
64
+ data[:forward] = gets.chomp.rstrip
65
+
66
+ puts "Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)"
67
+ print '> '
68
+ mj = gets.chomp.rstrip.to_f
69
+ if (0.5..0.9).include?(mj)
70
+ data[:majority] = mj
71
+ else
72
+ data[:majority] = 0.5
73
+ end
74
+
75
+ print "Need end-join? Y/N \n> "
76
+ ej = gets.chomp.rstrip
77
+ if ej =~ /y|yes/i
78
+ data[:end_join] = true
79
+
80
+ print "End-join option? Choose from (1-4):\n
81
+ 1: simple join, no overlap
82
+ 2: known overlap \n
83
+ 3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
84
+ 4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
85
+ > "
86
+ ej_option = gets.chomp.rstrip
87
+ while ![1,2,3,4].include?(ej_option.to_i)
88
+ puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
89
+ ej_option = gets.chomp.rstrip.to_i
90
+ end
91
+ case ej_option.to_i
92
+ when 1
93
+ data[:end_join_option] = 1
94
+ data[:overlap] = 0
95
+ when 2
96
+ data[:end_join_option] = 1
97
+ print "overlap bases: \n> "
98
+ ol = gets.chomp.rstrip.to_i
99
+ data[:overlap] = ol
100
+ when 3
101
+ data[:end_join_option] = 3
102
+ when 4
103
+ data[:end_join_option] = 4
104
+ end
105
+
106
+ print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
107
+ qc = gets.chomp.rstrip
108
+ if qc =~ /y|yes/i
109
+ data[:TCS_QC] = true
110
+
111
+ data[:ref_genome] = get_ref
112
+
113
+ print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
114
+ data[:ref_start] = gets.chomp.rstrip.to_i
115
+
116
+ print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
117
+ data[:ref_end] = gets.chomp.rstrip.to_i
118
+
119
+ print "allow indels? (default as yes) Y/N \n> "
120
+ indel = gets.chomp.rstrip
121
+ if indel =~ /n|no/i
122
+ data[:indel] = false
123
+ else
124
+ data[:indel] = true
125
+ end
126
+ else
127
+ data[:TCS_QC] = false
128
+ end
129
+
130
+ print "Need trimming to a reference genome? Y/N \n> "
131
+ trim_option = gets.chomp.rstrip
132
+ if trim_option =~ /y|yes/i
133
+ data[:trim] = true
134
+ data[:trim_ref] = get_ref
135
+
136
+ print "reference 5'end ref position \n> "
137
+ data[:trim_ref_start] = gets.chomp.rstrip.to_i
138
+
139
+ print "reference 3'end ref position \n> "
140
+ data[:trim_ref_end] = gets.chomp.rstrip.to_i
141
+
142
+ else
143
+ data[:trim] = false
144
+ end
145
+
146
+ else
147
+ data[:end_join] = false
148
+ end
149
+
150
+ print "Do you wish to conintue? Y/N \n> "
151
+ continue_sig = gets.chomp.rstrip
152
+ if continue_sig =~ /y|yes/i
153
+ continue = true
154
+ else
155
+ continue = false
156
+ end
157
+ param[:primer_pairs] << data
158
+ end
159
+
160
+ puts "\nYour JSON string is:"
161
+ puts JSON.pretty_generate(param)
162
+
163
+ print "\nDo you wish to save it as a file? Y/N \n> "
164
+ save_option = gets.chomp.rstrip
165
+
166
+ if save_option =~ /y|yes/i
167
+ print "Path to save JSON file:\n> "
168
+ path = gets.chomp.rstrip
169
+ File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
170
+ end
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
1
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
2
2
  #
3
3
  # Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  # of this software and associated documentation files (the "Software"), to deal
@@ -1,4 +1,4 @@
1
- # addition methods for Class::Hash required for ViralSeq
1
+ # additional methods for Class::Hash required for ViralSeq
2
2
 
3
3
  class Hash
4
4
 
@@ -130,8 +130,8 @@ module ViralSeq
130
130
  end
131
131
  end
132
132
  end
133
- sequence_hash = Hash[*sequence_a]
134
- quality_hash = Hash[*quality_a]
133
+ sequence_hash = Hash[sequence_a.each_slice(2).to_a]
134
+ quality_hash = Hash[quality_a.each_slice(2).to_a]
135
135
 
136
136
  seq_hash = ViralSeq::SeqHash.new
137
137
  seq_hash.dna_hash = sequence_hash
@@ -181,6 +181,7 @@ module ViralSeq
181
181
  new_seqhash = ViralSeq::SeqHash.new
182
182
  new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
183
183
  new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
184
+ new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
184
185
  new_seqhash.title = self.title + "_with_" + sh2.title
185
186
  new_seqhash.file = self.file + "," + sh2.file
186
187
  return new_seqhash
@@ -1144,6 +1145,27 @@ module ViralSeq
1144
1145
  return new_sh
1145
1146
  end
1146
1147
 
1148
+ # trim dna sequences based on the provided reference coordinates.
1149
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
1150
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
1151
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
1152
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
1153
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
1154
+
1155
+ def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
1156
+ seq_hash = self.dna_hash.dup
1157
+ seq_hash_unique = seq_hash.uniq_hash
1158
+ trimmed_seq_hash = {}
1159
+ seq_hash_unique.each do |seq, names|
1160
+ trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
1161
+ names.each do |name|
1162
+ trimmed_seq_hash[name] = trimmed_seq
1163
+ end
1164
+ end
1165
+ return_seq_hash = self.dup
1166
+ return_seq_hash.dna_hash = trimmed_seq_hash
1167
+ return return_seq_hash
1168
+ end
1147
1169
 
1148
1170
  # start of private functions
1149
1171
  private
@@ -211,7 +211,7 @@ module ViralSeq
211
211
  # {minimal overlap set to 4. }
212
212
  def overlap_matrix(sequence1, sequence2)
213
213
  min_overlap = 4
214
- max_overlap = [sequence1.size, sequence2.size].max
214
+ max_overlap = [sequence1.size, sequence2.size].min
215
215
  matrix_hash = {}
216
216
  (min_overlap..max_overlap).each do |overlap|
217
217
  matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
@@ -2,5 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.0.7"
5
+ VERSION = "1.0.8"
6
+ TCS_VERSION = "2.0.0"
6
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.7
4
+ version: 1.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-01-28 00:00:00.000000000 Z
12
+ date: 2020-02-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -89,6 +89,8 @@ email:
89
89
  - clarkmu@gmail.com
90
90
  executables:
91
91
  - locator
92
+ - tcs
93
+ - tcs_json_generator
92
94
  extensions: []
93
95
  extra_rdoc_files: []
94
96
  files:
@@ -102,6 +104,8 @@ files:
102
104
  - README.md
103
105
  - Rakefile
104
106
  - bin/locator
107
+ - bin/tcs
108
+ - bin/tcs_json_generator
105
109
  - lib/viral_seq.rb
106
110
  - lib/viral_seq/constant.rb
107
111
  - lib/viral_seq/enumerable.rb