viral_seq 1.0.7 → 1.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +11 -0
- data/bin/locator +20 -0
- data/bin/tcs +528 -0
- data/bin/tcs_json_generator +170 -0
- data/lib/viral_seq.rb +1 -1
- data/lib/viral_seq/hash.rb +1 -1
- data/lib/viral_seq/seq_hash.rb +24 -2
- data/lib/viral_seq/seq_hash_pair.rb +1 -1
- data/lib/viral_seq/version.rb +2 -1
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d79f0676fb23cdc25fb3b0161b5665ecfe082e2401f40a1de3a782d9fb3d52a
|
4
|
+
data.tar.gz: 01a09f4cfca1274bfb1b870cdad62614def01fdaded727ce9100eec377962401
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 042f11da57209003bc84b0f7c764a9953f0ca6c1fcd00a5e943be531162bc06c9d54e3c4ceb1305c91fe5795894e3da394a196899a4f1df83d97b826c5582411
|
7
|
+
data.tar.gz: b2b2bfb9a8e6d023f610b19311a1a1ea331fbaa804cf20aebc3a34f6b049240ec43fe10e92b9f00feef3fd78e922fe0ed39281146693358998020036b9553504
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -51,6 +51,17 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
51
51
|
|
52
52
|
## Updates
|
53
53
|
|
54
|
+
Version 1.0.8-02282020:
|
55
|
+
|
56
|
+
1. TCS pipeline added as executable.
|
57
|
+
tcs - main TCS pipeline script.
|
58
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
59
|
+
|
60
|
+
2. Methods added:
|
61
|
+
ViralSeq::SeqHash#trim
|
62
|
+
|
63
|
+
3. Bug fix for several methods.
|
64
|
+
|
54
65
|
Version 1.0.7-01282020:
|
55
66
|
|
56
67
|
1. Several methods added, including
|
data/bin/locator
CHANGED
@@ -1,5 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
10
|
+
# furnished to do so, subject to the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
13
|
+
# all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
# THE SOFTWARE.
|
22
|
+
|
3
23
|
require 'viral_seq'
|
4
24
|
require 'csv'
|
5
25
|
require 'optparse'
|
data/bin/tcs
ADDED
@@ -0,0 +1,528 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# TCS pipeline for Primer ID sequencing data analysis.
|
4
|
+
|
5
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
6
|
+
#
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
12
|
+
# furnished to do so, subject to the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be included in
|
15
|
+
# all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
23
|
+
# THE SOFTWARE.
|
24
|
+
|
25
|
+
# Use JSON file as the run param
|
26
|
+
# run the `tcs_json_generator` executable to generate the param JSON file.
|
27
|
+
|
28
|
+
require 'viral_seq'
|
29
|
+
require 'json'
|
30
|
+
require 'colorize'
|
31
|
+
|
32
|
+
# updated the ViralSeq module. Push with the new version.
|
33
|
+
|
34
|
+
module ViralSeq
  class SeqHash
    # Build a SeqHash from a FASTQ file.
    #
    # Records are read as groups of four lines: line 1 is the header,
    # line 2 the sequence, line 3 is ignored and line 4 the quality string.
    # Every '@' in the header line is replaced with '>' and the resulting
    # header is used as the key of both the sequence and the quality hash.
    #
    # @param fastq_file [String] path to the FASTQ file
    # @return [ViralSeq::SeqHash] object with dna_hash, qc_hash, title
    #   (file base name without extension) and file (the given path) set
    def self.new_from_fastq(fastq_file)
      sequence_a = []
      quality_a = []

      # Stream the file line by line instead of loading it whole with
      # readlines; FASTQ inputs can be very large.
      File.foreach(fastq_file).with_index(1) do |line, line_number|
        case line_number % 4
        when 1
          header = line.tr('@', '>').chomp
          sequence_a << header
          quality_a << header
        when 2
          sequence_a << line.chomp
        when 0
          quality_a << line.chomp
        end
      end

      # Both arrays alternate header, value; pair them up into hashes.
      seq_hash = ViralSeq::SeqHash.new
      seq_hash.dna_hash = sequence_a.each_slice(2).to_h
      seq_hash.qc_hash = quality_a.each_slice(2).to_h
      seq_hash.title = File.basename(fastq_file, ".*")
      seq_hash.file = fastq_file
      seq_hash
    end # end of ::new_from_fastq

    class << self
      alias_method :fq, :new_from_fastq
    end
  end
end
|
74
|
+
|
75
|
+
module ViralSeq
  class SeqHash
    # Trim every DNA sequence to the given reference coordinates.
    #
    # Identical sequences are clipped once (via Hash#uniq_hash) and the
    # clipped result is shared by all names carrying that sequence.
    #
    # @param start_nt start position(s) passed through to Sequence#sequence_clip
    # @param end_nt end position(s) passed through to Sequence#sequence_clip
    # @param ref_option [Symbol] reference genome name, defaults to :HXB2
    # @param path_to_muscle passed through to Sequence#sequence_clip
    # @return [ViralSeq::SeqHash] a copy of self whose dna_hash holds the trimmed sequences
    def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
      clipped_by_name = {}
      self.dna_hash.dup.uniq_hash.each do |nt_seq, seq_names|
        clipped = ViralSeq::Sequence.new('', nt_seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
        seq_names.each { |seq_name| clipped_by_name[seq_name] = clipped }
      end
      self.dup.tap { |copy| copy.dna_hash = clipped_by_name }
    end
  end
end
|
93
|
+
|
94
|
+
# end of additional methods. Delete before publish
|
95
|
+
|
96
|
+
# calculate consensus cutoff
|
97
|
+
|
98
|
+
# Calculate the consensus cut-off from an abundance value and the platform
# error rate.
#
# Three fitted curves are used, selected by error rate:
#   0.005 <= error_rate < 0.015, 0 <= error_rate < 0.005, and everything
#   else (which includes the default 0.02). The last model switches to a
#   linear fit above m = 8500.
#
# @param m [Numeric] abundance value the cut-off is derived from
#   (NOTE(review): the caller appears to pass the mean of the five largest
#   Primer ID family sizes -- confirm)
# @param error_rate [Float] estimated platform error rate
# @return [Integer] the cut-off, never lower than 2
def calculate_cut_off(m, error_rate = 0.02)
  # Every model returns the floor value for small m.
  return 2 if m <= 10

  n = case error_rate
      when 0.005...0.015
        1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
      when 0...0.005
        -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
      else
        if m <= 8500
          -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
        else
          0.0079 * m + 9.4869
        end
      end

  n = n.round
  # Anything that rounds below 3 is clamped to the floor of 2.
  n < 3 ? 2 : n
end
|
129
|
+
|
130
|
+
|
131
|
+
TCS_VERSION = "2.0.0"
|
132
|
+
|
133
|
+
puts "\n" + '-'*58
|
134
|
+
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
135
|
+
puts '-'*58 + "\n"
|
136
|
+
|
137
|
+
unless ARGV[0]
|
138
|
+
raise "No JSON param file found. Script terminated."
|
139
|
+
end
|
140
|
+
|
141
|
+
params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
|
142
|
+
|
143
|
+
indir = params[:raw_sequence_dir]
|
144
|
+
|
145
|
+
unless File.exist?(indir)
|
146
|
+
raise "No input sequence directory found. Script terminated."
|
147
|
+
end
|
148
|
+
|
149
|
+
libname = File.basename(indir)
|
150
|
+
|
151
|
+
# obtain R1 and R2 file path
|
152
|
+
files = []
|
153
|
+
Dir.chdir(indir) do
|
154
|
+
files = Dir.glob("*")
|
155
|
+
end
|
156
|
+
|
157
|
+
if files.empty?
|
158
|
+
raise "Input dir does not contain files. Script terminated."
|
159
|
+
end
|
160
|
+
|
161
|
+
r1_f = ""
|
162
|
+
r2_f = ""
|
163
|
+
|
164
|
+
# unzip .fastq.gz
|
165
|
+
# Resolve the path of a raw read file inside +indir+, decompressing it in
# place first when its name ends in ".gz".
#
# gzip is invoked without a shell, so paths containing spaces or shell
# metacharacters are passed through untouched. A failed decompression is
# only reported on stderr; the caller's later File.exist? check is what
# stops the pipeline.
#
# @param indir [String] directory holding the raw sequence files
# @param f [String] name of the read file within +indir+
# @return [String] path to the (decompressed) read file
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  # Only a real ".gz" suffix counts; a bare /.gz/ match would also hit
  # names such as "bigzag_R1.fastq".
  return r_file unless f.end_with?(".gz")

  warn "gzip could not decompress #{r_file}" unless system("gzip", "-d", r_file)
  File.join(indir, f.sub(/\.gz\z/, ""))
end
|
174
|
+
# Open the run log inside the input directory and record the versions in use.
runtime_log_file = File.join(indir,"runtime.log")
log = File.open(runtime_log_file, "w")
log.puts "TCS pipeline Version " + TCS_VERSION.to_s
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
179
|
+
|
180
|
+
|
181
|
+
files.each do |f|
|
182
|
+
t = f.split("_")
|
183
|
+
if t.size == 1
|
184
|
+
tag = f
|
185
|
+
else
|
186
|
+
tag = f.split("_")[1..-1].join("_")
|
187
|
+
end
|
188
|
+
|
189
|
+
if tag =~ /r1/i
|
190
|
+
r1_f = unzip_r(indir, f)
|
191
|
+
elsif tag =~ /r2/i
|
192
|
+
r2_f = unzip_r(indir, f)
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
196
|
+
|
197
|
+
unless File.exist?(r1_f)
|
198
|
+
log.puts "R1 file not found. Script terminated."
|
199
|
+
raise "R1 file not found. Script terminated."
|
200
|
+
end
|
201
|
+
|
202
|
+
unless File.exist?(r2_f)
|
203
|
+
log.puts "R2 file not found. Script terminated."
|
204
|
+
raise "R2 file not found. Script terminated."
|
205
|
+
end
|
206
|
+
|
207
|
+
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
208
|
+
r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
|
209
|
+
|
210
|
+
raw_sequence_number = r1_fastq_sh.size
|
211
|
+
log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
|
212
|
+
|
213
|
+
if params[:platform_error_rate]
|
214
|
+
error_rate = params[:platform_error_rate]
|
215
|
+
else
|
216
|
+
error_rate = 0.02
|
217
|
+
end
|
218
|
+
|
219
|
+
primers = params[:primer_pairs]
|
220
|
+
if primers.empty?
|
221
|
+
log.puts "No primer information. Script terminated."
|
222
|
+
raise "No primer information. Script terminated."
|
223
|
+
end
|
224
|
+
|
225
|
+
primers.each do |primer|
|
226
|
+
summary_json = {}
|
227
|
+
summary_json[:tcs_version] = TCS_VERSION
|
228
|
+
summary_json[:viralseq_version] = ViralSeq::VERSION
|
229
|
+
summary_json[:runtime] = Time.now.to_s
|
230
|
+
|
231
|
+
primer[:region] ? region = primer[:region] : region = "region"
|
232
|
+
summary_json[:primer_set_name] = region
|
233
|
+
|
234
|
+
cdna_primer = primer[:cdna]
|
235
|
+
forward_primer = primer[:forward]
|
236
|
+
unless cdna_primer
|
237
|
+
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
238
|
+
end
|
239
|
+
unless forward_primer
|
240
|
+
log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
|
241
|
+
end
|
242
|
+
summary_json[:cdan_primer] = cdna_primer
|
243
|
+
summary_json[:forward_primer] = forward_primer
|
244
|
+
|
245
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0.5
|
246
|
+
summary_json[:majority_cut_off] = majority_cut_off
|
247
|
+
|
248
|
+
summary_json[:total_raw_sequence] = raw_sequence_number
|
249
|
+
|
250
|
+
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
251
|
+
|
252
|
+
r1_raw = r1_fastq_sh.dna_hash
|
253
|
+
r2_raw = r2_fastq_sh.dna_hash
|
254
|
+
|
255
|
+
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
256
|
+
# obtain biological forward primer sequence
|
257
|
+
if forward_primer.match(/(N+)(\w+)$/)
|
258
|
+
forward_n = $1.size
|
259
|
+
forward_bio_primer = $2
|
260
|
+
else
|
261
|
+
forward_n = 0
|
262
|
+
forward_bio_primer = forward_primer
|
263
|
+
end
|
264
|
+
forward_bio_primer_size = forward_bio_primer.size
|
265
|
+
forward_starting_number = forward_n + forward_bio_primer_size
|
266
|
+
|
267
|
+
# filter R1 sequences with forward primers.
|
268
|
+
forward_primer_ref = forward_bio_primer.nt_parser
|
269
|
+
r1_passed_seq = {}
|
270
|
+
r1_raw.each do |name,seq|
|
271
|
+
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
272
|
+
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
273
|
+
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
274
|
+
|
275
|
+
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
276
|
+
if primer_region_seq =~ forward_primer_ref
|
277
|
+
r1_passed_seq[name.split("\s")[0]] = seq
|
278
|
+
end
|
279
|
+
end
|
280
|
+
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
281
|
+
|
282
|
+
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
283
|
+
|
284
|
+
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
285
|
+
# obtain biological reverse primer sequence
|
286
|
+
cdna_primer.match(/(N+)(\w+)$/)
|
287
|
+
pid_length = $1.size
|
288
|
+
cdna_bio_primer = $2
|
289
|
+
cdna_bio_primer_size = cdna_bio_primer.size
|
290
|
+
reverse_starting_number = pid_length + cdna_bio_primer_size
|
291
|
+
|
292
|
+
# filter R2 sequences with cDNA primers.
|
293
|
+
cdna_primer_ref = cdna_bio_primer.nt_parser
|
294
|
+
r2_passed_seq = {}
|
295
|
+
r2_raw.each do |name, seq|
|
296
|
+
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
297
|
+
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
298
|
+
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
299
|
+
|
300
|
+
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
301
|
+
if primer_region_seq =~ cdna_primer_ref
|
302
|
+
r2_passed_seq[name.split("\s")[0]] = seq
|
303
|
+
end
|
304
|
+
end
|
305
|
+
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
306
|
+
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
307
|
+
|
308
|
+
# pair-end
|
309
|
+
log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
|
310
|
+
id = {} # hash for :sequence_tag => primer_id
|
311
|
+
bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
|
312
|
+
bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
|
313
|
+
common_keys = r1_passed_seq.keys & r2_passed_seq.keys
|
314
|
+
paired_seq_number = common_keys.size
|
315
|
+
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
316
|
+
summary_json[:paired_raw_sequence] = paired_seq_number
|
317
|
+
|
318
|
+
common_keys.each do |seqtag|
|
319
|
+
r1_seq = r1_passed_seq[seqtag]
|
320
|
+
r2_seq = r2_passed_seq[seqtag]
|
321
|
+
pid = r2_seq[0, pid_length]
|
322
|
+
id[seqtag] = pid
|
323
|
+
bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
|
324
|
+
bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
|
325
|
+
end
|
326
|
+
|
327
|
+
# TCS cut-off
|
328
|
+
log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
|
329
|
+
|
330
|
+
primer_id_list = id.values
|
331
|
+
primer_id_count = primer_id_list.count_freq
|
332
|
+
primer_id_dis = primer_id_count.values.count_freq
|
333
|
+
|
334
|
+
# calculate distinct_to_raw
|
335
|
+
distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
|
336
|
+
summary_json[:distinct_to_raw] = distinct_to_raw
|
337
|
+
|
338
|
+
if primer_id_dis.keys.size < 5
|
339
|
+
log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
|
340
|
+
next
|
341
|
+
end
|
342
|
+
|
343
|
+
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
344
|
+
consensus_cutoff = calculate_cut_off(max_id,error_rate)
|
345
|
+
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
346
|
+
summary_json[:consensus_cutoff] = consensus_cutoff
|
347
|
+
summary_json[:length_of_pid] = pid_length
|
348
|
+
|
349
|
+
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
350
|
+
|
351
|
+
# Primer ID over the cut-off
|
352
|
+
primer_id_count_over_n = []
|
353
|
+
primer_id_count.each do |primer_id,count|
|
354
|
+
primer_id_count_over_n << primer_id if count > consensus_cutoff
|
355
|
+
end
|
356
|
+
pid_to_process = primer_id_count_over_n.size
|
357
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
|
358
|
+
summary_json[:total_tcs_with_ambiguities] = pid_to_process
|
359
|
+
|
360
|
+
# setup output path
|
361
|
+
out_dir_set = File.join(indir, region)
|
362
|
+
Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
|
363
|
+
out_dir_consensus = File.join(out_dir_set, "consensus")
|
364
|
+
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
365
|
+
|
366
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.txt')
|
367
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.txt')
|
368
|
+
outfile_log = File.join(out_dir_set, 'log.json')
|
369
|
+
|
370
|
+
# create TCS
|
371
|
+
|
372
|
+
pid_seqtag_hash = {}
|
373
|
+
id.each do |name, pid|
|
374
|
+
if pid_seqtag_hash[pid]
|
375
|
+
pid_seqtag_hash[pid] << name
|
376
|
+
else
|
377
|
+
pid_seqtag_hash[pid] = []
|
378
|
+
pid_seqtag_hash[pid] << name
|
379
|
+
end
|
380
|
+
end
|
381
|
+
|
382
|
+
consensus = {}
|
383
|
+
r1_temp = {}
|
384
|
+
r2_temp = {}
|
385
|
+
m = 0
|
386
|
+
primer_id_count_over_n.each do |primer_id|
|
387
|
+
m += 1
|
388
|
+
log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
|
389
|
+
seq_with_same_primer_id = pid_seqtag_hash[primer_id]
|
390
|
+
r1_sub_seq = []
|
391
|
+
r2_sub_seq = []
|
392
|
+
seq_with_same_primer_id.each do |seq_name|
|
393
|
+
r1_sub_seq << bio_r1[seq_name]
|
394
|
+
r2_sub_seq << bio_r2[seq_name]
|
395
|
+
end
|
396
|
+
|
397
|
+
#consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
|
398
|
+
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
399
|
+
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
400
|
+
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
401
|
+
next if r1_consensus =~ /[^ATCG]/
|
402
|
+
next if r2_consensus =~ /[^ATCG]/
|
403
|
+
|
404
|
+
# reverse complement sequence of the R2 region
|
405
|
+
r2_consensus = r2_consensus.rc
|
406
|
+
consensus[consensus_name] = [r1_consensus, r2_consensus]
|
407
|
+
r1_temp[consensus_name] = r1_consensus
|
408
|
+
r2_temp[consensus_name] = r2_consensus
|
409
|
+
end
|
410
|
+
r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
|
411
|
+
r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
|
412
|
+
|
413
|
+
# filter consensus sequences for residual offspring PIDs
|
414
|
+
consensus_filtered = {}
|
415
|
+
consensus_number_temp = consensus.size
|
416
|
+
max_pid_comb = 4**pid_length
|
417
|
+
if consensus_number_temp < 0.003*max_pid_comb
|
418
|
+
log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
|
419
|
+
r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
|
420
|
+
r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
|
421
|
+
common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
|
422
|
+
common_pid.each do |pid|
|
423
|
+
consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
|
424
|
+
end
|
425
|
+
else
|
426
|
+
consensus_filtered = consensus
|
427
|
+
end
|
428
|
+
n_con = consensus_filtered.size
|
429
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
|
430
|
+
summary_json[:total_tcs] = n_con
|
431
|
+
summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
|
432
|
+
|
433
|
+
log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
|
434
|
+
# r1_file output
|
435
|
+
f1 = File.open(outfile_r1, 'w')
|
436
|
+
f2 = File.open(outfile_r2, 'w')
|
437
|
+
primer_id_in_use = {}
|
438
|
+
r1_seq_length = consensus_filtered.values[0][0].size
|
439
|
+
r2_seq_length = consensus_filtered.values[0][1].size
|
440
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
441
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
442
|
+
consensus_filtered.each do |seq_name,seq|
|
443
|
+
f1.print seq_name + "_r1\n" + seq[0] + "\n"
|
444
|
+
f2.print seq_name + "_r2\n" + seq[1] + "\n"
|
445
|
+
primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
|
446
|
+
end
|
447
|
+
f1.close
|
448
|
+
f2.close
|
449
|
+
|
450
|
+
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
451
|
+
pid_json = {}
|
452
|
+
pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
|
453
|
+
pid_json[:primer_id_distribution] = Hash[*(primer_id_dis.sort_by{|k,v| k}.flatten)]
|
454
|
+
pid_json[:primer_id_frequency] = Hash[*(primer_id_count.sort_by {|k, v| [-v,k]}.flatten)]
|
455
|
+
File.open(out_pid_json, 'w') do |f|
|
456
|
+
f.puts JSON.pretty_generate(pid_json)
|
457
|
+
end
|
458
|
+
|
459
|
+
if primer[:end_join]
|
460
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
461
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
462
|
+
case primer[:end_join_option]
|
463
|
+
when 1
|
464
|
+
joined_sh = shp.join1(primer[:overlap])
|
465
|
+
when 3
|
466
|
+
joined_sh = shp.join2
|
467
|
+
when 4
|
468
|
+
joined_sh = shp.join2(model: :indiv)
|
469
|
+
end
|
470
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
471
|
+
summary_json[:combined_tcs] = joined_sh.size
|
472
|
+
else
|
473
|
+
File.open(outfile_log, "w") do |f|
|
474
|
+
f.puts JSON.pretty_generate(summary_json)
|
475
|
+
end
|
476
|
+
next
|
477
|
+
end
|
478
|
+
|
479
|
+
if primer[:TCS_QC]
|
480
|
+
ref_start = primer[:ref_start]
|
481
|
+
ref_end = primer[:ref_end]
|
482
|
+
ref_genome = primer[:ref_genome].to_sym
|
483
|
+
indel = primer[:indel]
|
484
|
+
if ref_start == 0
|
485
|
+
ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
486
|
+
end
|
487
|
+
if ref_end == 0
|
488
|
+
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
489
|
+
end
|
490
|
+
if primer[:end_join_option] == 1 and primer[:overlap] == 0
|
491
|
+
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
492
|
+
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
493
|
+
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
494
|
+
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
495
|
+
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
496
|
+
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
497
|
+
joined_seq = {}
|
498
|
+
new_r1_seq.each do |seq_name, seq|
|
499
|
+
next unless seq
|
500
|
+
next unless new_r2_seq[seq_name]
|
501
|
+
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
502
|
+
end
|
503
|
+
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
504
|
+
else
|
505
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
506
|
+
end
|
507
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
508
|
+
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
509
|
+
if primer[:trim]
|
510
|
+
trim_start = primer[:trim_ref_start]
|
511
|
+
trim_end = primer[:trim_ref_end]
|
512
|
+
trim_ref = primer[:trim_ref].to_sym
|
513
|
+
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
514
|
+
end
|
515
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.txt"))
|
516
|
+
end
|
517
|
+
|
518
|
+
File.open(outfile_log, "w") do |f|
|
519
|
+
f.puts JSON.pretty_generate(summary_json)
|
520
|
+
end
|
521
|
+
end
|
522
|
+
|
523
|
+
log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
|
524
|
+
File.unlink(r1_f)
|
525
|
+
File.unlink(r2_f)
|
526
|
+
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly exercuted."
|
527
|
+
log.close
|
528
|
+
puts "DONE!"
|
@@ -0,0 +1,170 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# TCS pipeline JSON params generator.
|
4
|
+
|
5
|
+
require 'colorize'
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
# Interactively ask the user to choose a reference genome.
#
# Prints the three choices, reads a line from standard input and keeps
# prompting until the entry converts (String#to_i) to 1, 2 or 3.
#
# @return [Symbol] :HXB2, :NL43 or :MAC239
def get_ref
  references = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }

  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "
  ref_option = gets.chomp.rstrip
  # Keep the raw entry as a String so the error message echoes what the
  # user actually typed rather than its integer conversion.
  until references.key?(ref_option.to_i)
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end
  references[ref_option.to_i]
end
|
28
|
+
|
29
|
+
TCS_VERSION = "2.0.0"
|
30
|
+
|
31
|
+
puts "\n" + '-'*58
|
32
|
+
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
33
|
+
puts '-'*58 + "\n"
|
34
|
+
|
35
|
+
param = {}
|
36
|
+
|
37
|
+
puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
|
38
|
+
print '> '
|
39
|
+
param[:raw_sequence_dir] = gets.chomp.rstrip
|
40
|
+
|
41
|
+
puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
|
42
|
+
print '> '
|
43
|
+
input_error = gets.chomp.rstrip.to_f
|
44
|
+
if input_error == 0.0
|
45
|
+
param[:platform_error_rate] = 0.02
|
46
|
+
else
|
47
|
+
param[:platform_error_rate] = input_error
|
48
|
+
end
|
49
|
+
|
50
|
+
param[:primer_pairs] = []
|
51
|
+
continue = true
|
52
|
+
while continue
|
53
|
+
data = {}
|
54
|
+
puts "Enter the name for the sequenced region: "
|
55
|
+
print '> '
|
56
|
+
data[:region] = gets.chomp.rstrip
|
57
|
+
|
58
|
+
puts "Enter the #{"cDNA".red.bold} primer sequence: "
|
59
|
+
print '> '
|
60
|
+
data[:cdna] = gets.chomp.rstrip
|
61
|
+
|
62
|
+
puts "Enter the #{"forward".blue.bold} primer sequence: "
|
63
|
+
print '> '
|
64
|
+
data[:forward] = gets.chomp.rstrip
|
65
|
+
|
66
|
+
puts "Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)"
|
67
|
+
print '> '
|
68
|
+
mj = gets.chomp.rstrip.to_f
|
69
|
+
if (0.5..0.9).include?(mj)
|
70
|
+
data[:majority] = mj
|
71
|
+
else
|
72
|
+
data[:majority] = 0.5
|
73
|
+
end
|
74
|
+
|
75
|
+
print "Need end-join? Y/N \n> "
|
76
|
+
ej = gets.chomp.rstrip
|
77
|
+
if ej =~ /y|yes/i
|
78
|
+
data[:end_join] = true
|
79
|
+
|
80
|
+
print "End-join option? Choose from (1-4):\n
|
81
|
+
1: simple join, no overlap
|
82
|
+
2: known overlap \n
|
83
|
+
3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
|
84
|
+
4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
|
85
|
+
> "
|
86
|
+
# Read the end-join option, re-prompting until it converts to 1-4.
# The entry is kept as a String: the colorize call in the error message
# needs a String, and the later `ej_option.to_i` works on it unchanged.
ej_option = gets.chomp.rstrip
until [1,2,3,4].include?(ej_option.to_i)
  puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
  ej_option = gets.chomp.rstrip
end
|
91
|
+
case ej_option.to_i
|
92
|
+
when 1
|
93
|
+
data[:end_join_option] = 1
|
94
|
+
data[:overlap] = 0
|
95
|
+
when 2
|
96
|
+
data[:end_join_option] = 1
|
97
|
+
print "overlap bases: \n> "
|
98
|
+
ol = gets.chomp.rstrip.to_i
|
99
|
+
data[:overlap] = ol
|
100
|
+
when 3
|
101
|
+
data[:end_join_option] = 3
|
102
|
+
when 4
|
103
|
+
data[:end_join_option] = 4
|
104
|
+
end
|
105
|
+
|
106
|
+
print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
|
107
|
+
qc = gets.chomp.rstrip
|
108
|
+
if qc =~ /y|yes/i
|
109
|
+
data[:TCS_QC] = true
|
110
|
+
|
111
|
+
data[:ref_genome] = get_ref
|
112
|
+
|
113
|
+
print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
|
114
|
+
data[:ref_start] = gets.chomp.rstrip.to_i
|
115
|
+
|
116
|
+
print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
|
117
|
+
data[:ref_end] = gets.chomp.rstrip.to_i
|
118
|
+
|
119
|
+
print "allow indels? (default as yes) Y/N \n> "
|
120
|
+
indel = gets.chomp.rstrip
|
121
|
+
if indel =~ /n|no/i
|
122
|
+
data[:indel] = false
|
123
|
+
else
|
124
|
+
data[:indel] = true
|
125
|
+
end
|
126
|
+
else
|
127
|
+
data[:TCS_QC] = false
|
128
|
+
end
|
129
|
+
|
130
|
+
print "Need trimming to a reference genome? Y/N \n> "
|
131
|
+
trim_option = gets.chomp.rstrip
|
132
|
+
if trim_option =~ /y|yes/i
|
133
|
+
data[:trim] = true
|
134
|
+
data[:trim_ref] = get_ref
|
135
|
+
|
136
|
+
print "reference 5'end ref position \n> "
|
137
|
+
data[:trim_ref_start] = gets.chomp.rstrip.to_i
|
138
|
+
|
139
|
+
print "reference 3'end ref position \n> "
|
140
|
+
data[:trim_ref_end] = gets.chomp.rstrip.to_i
|
141
|
+
|
142
|
+
else
|
143
|
+
data[:trim] = false
|
144
|
+
end
|
145
|
+
|
146
|
+
else
|
147
|
+
data[:end_join] = false
|
148
|
+
end
|
149
|
+
|
150
|
+
print "Do you wish to conintue? Y/N \n> "
|
151
|
+
continue_sig = gets.chomp.rstrip
|
152
|
+
if continue_sig =~ /y|yes/i
|
153
|
+
continue = true
|
154
|
+
else
|
155
|
+
continue = false
|
156
|
+
end
|
157
|
+
param[:primer_pairs] << data
|
158
|
+
end
|
159
|
+
|
160
|
+
puts "\nYour JSON string is:"
|
161
|
+
puts JSON.pretty_generate(param)
|
162
|
+
|
163
|
+
print "\nDo you wish to save it as a file? Y/N \n> "
|
164
|
+
save_option = gets.chomp.rstrip
|
165
|
+
|
166
|
+
if save_option =~ /y|yes/i
|
167
|
+
print "Path to save JSON file:\n> "
|
168
|
+
path = gets.chomp.rstrip
|
169
|
+
File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
|
170
|
+
end
|
data/lib/viral_seq.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (c)
|
1
|
+
# Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
|
2
2
|
#
|
3
3
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
4
|
# of this software and associated documentation files (the "Software"), to deal
|
data/lib/viral_seq/hash.rb
CHANGED
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -130,8 +130,8 @@ module ViralSeq
|
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
133
|
-
sequence_hash = Hash[
|
134
|
-
quality_hash = Hash[
|
133
|
+
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
134
|
+
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
135
135
|
|
136
136
|
seq_hash = ViralSeq::SeqHash.new
|
137
137
|
seq_hash.dna_hash = sequence_hash
|
@@ -181,6 +181,7 @@ module ViralSeq
|
|
181
181
|
new_seqhash = ViralSeq::SeqHash.new
|
182
182
|
new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
|
183
183
|
new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
|
184
|
+
new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
|
184
185
|
new_seqhash.title = self.title + "_with_" + sh2.title
|
185
186
|
new_seqhash.file = self.file + "," + sh2.file
|
186
187
|
return new_seqhash
|
@@ -1144,6 +1145,27 @@ module ViralSeq
|
|
1144
1145
|
return new_sh
|
1145
1146
|
end
|
1146
1147
|
|
1148
|
+
# trim dna sequences based on the provided reference coordinates.
|
1149
|
+
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1150
|
+
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1151
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1152
|
+
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1153
|
+
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1154
|
+
|
1155
|
+
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
  # Clip each distinct sequence once, then hand the result to every name
  # that carries that sequence.
  clipped_by_name = {}
  self.dna_hash.dup.uniq_hash.each do |nt_seq, seq_names|
    clipped = ViralSeq::Sequence.new('', nt_seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
    seq_names.each { |seq_name| clipped_by_name[seq_name] = clipped }
  end
  # Return a copy of self so the receiver's dna_hash is left untouched.
  self.dup.tap { |copy| copy.dna_hash = clipped_by_name }
end
|
1147
1169
|
|
1148
1170
|
# start of private functions
|
1149
1171
|
private
|
@@ -211,7 +211,7 @@ module ViralSeq
|
|
211
211
|
# {minimal overlap set to 4. }
|
212
212
|
def overlap_matrix(sequence1, sequence2)
|
213
213
|
min_overlap = 4
|
214
|
-
max_overlap = [sequence1.size, sequence2.size].
|
214
|
+
max_overlap = [sequence1.size, sequence2.size].min
|
215
215
|
matrix_hash = {}
|
216
216
|
(min_overlap..max_overlap).each do |overlap|
|
217
217
|
matrix_hash[overlap] = sequence1[-overlap..-1].compare_with(sequence2[0, overlap])
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-02-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -89,6 +89,8 @@ email:
|
|
89
89
|
- clarkmu@gmail.com
|
90
90
|
executables:
|
91
91
|
- locator
|
92
|
+
- tcs
|
93
|
+
- tcs_json_generator
|
92
94
|
extensions: []
|
93
95
|
extra_rdoc_files: []
|
94
96
|
files:
|
@@ -102,6 +104,8 @@ files:
|
|
102
104
|
- README.md
|
103
105
|
- Rakefile
|
104
106
|
- bin/locator
|
107
|
+
- bin/tcs
|
108
|
+
- bin/tcs_json_generator
|
105
109
|
- lib/viral_seq.rb
|
106
110
|
- lib/viral_seq/constant.rb
|
107
111
|
- lib/viral_seq/enumerable.rb
|