viral_seq 1.2.2 → 1.2.3
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of viral_seq might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/README.md +5 -0
- data/bin/tcs +369 -347
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2698b52858a35454ffcb452cfaaf7a88099e184791447d9852913ac013703903
|
4
|
+
data.tar.gz: cf8f87ee4486491dc35f3fc5719ce8e84bf2e94928fa143e085c9463e47c6b6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7dcfc6786d8791421ef7294405ac39a69cd03bca0778550b360111a20cb745b09c6c1432a3fb148b9b8ca012dd5b2c3d4c71837aed35879542a0b7b4a2cb0de2
|
7
|
+
data.tar.gz: a3dc5e58af51fc13e6b9672a589f6f1bd6f01b7ca7b2cff0beb763f557469882546496c29c198fb3df06793e413d72c830913c8509ead406853d0921868d9cf6
|
data/README.md
CHANGED
@@ -175,6 +175,11 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
175
175
|
|
176
176
|
## Updates
|
177
177
|
|
178
|
+
### Version 1.2.3-06042021
|
179
|
+
|
180
|
+
1. Added error rescue and reporting to the `tcs` pipeline.
|
181
|
+
Error messages are stored in the `.tcs_error` file. `tcs` pipeline updated to v2.3.3.
|
182
|
+
|
178
183
|
### Version 1.2.2-05272021
|
179
184
|
|
180
185
|
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
data/bin/tcs
CHANGED
@@ -101,407 +101,429 @@ log = File.open(runtime_log_file, "w")
|
|
101
101
|
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
102
102
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
103
103
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
104
|
+
File.unlink(File.join(indir, ".tcs_error"))
|
105
|
+
|
106
|
+
begin
|
107
|
+
libname = File.basename indir
|
108
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
109
|
+
|
110
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
111
|
+
r1_f = seq_files[:r1_file]
|
112
|
+
r2_f = seq_files[:r2_file]
|
113
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
114
|
+
raise StandardError.new "Missing R2 file."
|
115
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
116
|
+
raise StandardError.new "Missing R1 file."
|
117
|
+
else
|
118
|
+
raise StandardError.new "Cannot determine R1 R2 file in #{indir}."
|
119
|
+
end
|
104
120
|
|
105
|
-
|
121
|
+
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
122
|
+
r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
|
106
123
|
|
107
|
-
|
124
|
+
raw_sequence_number = r1_fastq_sh.size
|
125
|
+
log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
|
108
126
|
|
109
|
-
if
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
115
|
-
exit_sig = "Missing R1 file. Aborted."
|
116
|
-
else
|
117
|
-
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
118
|
-
end
|
127
|
+
if params[:platform_error_rate]
|
128
|
+
error_rate = params[:platform_error_rate]
|
129
|
+
else
|
130
|
+
error_rate = 0.02
|
131
|
+
end
|
119
132
|
|
120
|
-
if
|
121
|
-
|
122
|
-
|
133
|
+
if params[:platform_format]
|
134
|
+
$platform_sequencing_length = params[:platform_format]
|
135
|
+
else
|
136
|
+
$platform_sequencing_length = 300
|
137
|
+
end
|
123
138
|
|
124
|
-
|
125
|
-
|
139
|
+
primers = params[:primer_pairs]
|
140
|
+
if primers.empty?
|
141
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
142
|
+
end
|
126
143
|
|
127
|
-
raw_sequence_number = r1_fastq_sh.size
|
128
|
-
log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
|
129
144
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
145
|
+
primers.each do |primer|
|
146
|
+
summary_json = {}
|
147
|
+
summary_json[:warnings] = []
|
148
|
+
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
149
|
+
summary_json[:viralseq_version] = ViralSeq::VERSION
|
150
|
+
summary_json[:runtime] = Time.now.to_s
|
135
151
|
|
136
|
-
|
137
|
-
|
138
|
-
else
|
139
|
-
$platform_sequencing_length = 300
|
140
|
-
end
|
152
|
+
primer[:region] ? region = primer[:region] : region = "region"
|
153
|
+
summary_json[:primer_set_name] = region
|
141
154
|
|
142
|
-
|
143
|
-
|
144
|
-
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
145
|
-
end
|
155
|
+
cdna_primer = primer[:cdna]
|
156
|
+
forward_primer = primer[:forward]
|
146
157
|
|
158
|
+
export_raw = primer[:export_raw]
|
159
|
+
limit_raw = primer[:limit_raw]
|
147
160
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
161
|
+
unless cdna_primer
|
162
|
+
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
163
|
+
end
|
164
|
+
unless forward_primer
|
165
|
+
log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
|
166
|
+
end
|
167
|
+
summary_json[:cdan_primer] = cdna_primer
|
168
|
+
summary_json[:forward_primer] = forward_primer
|
169
|
+
|
170
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
171
|
+
summary_json[:majority_cut_off] = majority_cut_off
|
172
|
+
|
173
|
+
summary_json[:total_raw_sequence] = raw_sequence_number
|
174
|
+
|
175
|
+
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
176
|
+
|
177
|
+
# filter R1
|
178
|
+
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
179
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
180
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
181
|
+
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
182
|
+
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
183
|
+
|
184
|
+
# filter R2
|
185
|
+
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
186
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
187
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
188
|
+
pid_length = filter_r2[:pid_length]
|
189
|
+
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
190
|
+
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
191
|
+
|
192
|
+
# pair-end
|
193
|
+
log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
|
194
|
+
id = {} # hash for :sequence_tag => primer_id
|
195
|
+
bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
|
196
|
+
bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
|
197
|
+
common_keys = r1_passed_seq.keys & r2_passed_seq.keys
|
198
|
+
paired_seq_number = common_keys.size
|
199
|
+
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
200
|
+
summary_json[:paired_raw_sequence] = paired_seq_number
|
201
|
+
if paired_seq_number < raw_sequence_number * 0.001
|
202
|
+
summary_json[:warnings] <<
|
203
|
+
"WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
|
204
|
+
end
|
154
205
|
|
155
|
-
|
156
|
-
|
206
|
+
common_keys.each do |seqtag|
|
207
|
+
r1_seq = r1_passed_seq[seqtag]
|
208
|
+
r2_seq = r2_passed_seq[seqtag]
|
209
|
+
pid = r2_seq[0, pid_length]
|
210
|
+
id[seqtag] = pid
|
211
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
212
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
213
|
+
end
|
157
214
|
|
158
|
-
|
159
|
-
|
215
|
+
# TCS cut-off
|
216
|
+
log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
|
160
217
|
|
161
|
-
|
162
|
-
|
218
|
+
primer_id_list = id.values
|
219
|
+
primer_id_count = primer_id_list.count_freq
|
220
|
+
primer_id_dis = primer_id_count.values.count_freq
|
163
221
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
unless forward_primer
|
168
|
-
log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
|
169
|
-
end
|
170
|
-
summary_json[:cdan_primer] = cdna_primer
|
171
|
-
summary_json[:forward_primer] = forward_primer
|
172
|
-
|
173
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
174
|
-
summary_json[:majority_cut_off] = majority_cut_off
|
175
|
-
|
176
|
-
summary_json[:total_raw_sequence] = raw_sequence_number
|
177
|
-
|
178
|
-
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
179
|
-
|
180
|
-
# filter R1
|
181
|
-
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
182
|
-
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
183
|
-
r1_passed_seq = filter_r1[:r1_passed_seq]
|
184
|
-
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
185
|
-
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
186
|
-
|
187
|
-
# filter R2
|
188
|
-
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
189
|
-
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
190
|
-
r2_passed_seq = filter_r2[:r2_passed_seq]
|
191
|
-
pid_length = filter_r2[:pid_length]
|
192
|
-
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
193
|
-
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
194
|
-
|
195
|
-
# pair-end
|
196
|
-
log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
|
197
|
-
id = {} # hash for :sequence_tag => primer_id
|
198
|
-
bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
|
199
|
-
bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
|
200
|
-
common_keys = r1_passed_seq.keys & r2_passed_seq.keys
|
201
|
-
paired_seq_number = common_keys.size
|
202
|
-
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
203
|
-
summary_json[:paired_raw_sequence] = paired_seq_number
|
204
|
-
if paired_seq_number < raw_sequence_number * 0.001
|
205
|
-
summary_json[:warnings] <<
|
206
|
-
"WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
|
207
|
-
end
|
222
|
+
# calculate distinct_to_raw
|
223
|
+
distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
|
224
|
+
summary_json[:distinct_to_raw] = distinct_to_raw
|
208
225
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
id[seqtag] = pid
|
214
|
-
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
215
|
-
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
216
|
-
end
|
226
|
+
if primer_id_dis.keys.size < 5
|
227
|
+
log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
|
228
|
+
next
|
229
|
+
end
|
217
230
|
|
218
|
-
|
219
|
-
|
231
|
+
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
232
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
233
|
+
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
234
|
+
summary_json[:consensus_cutoff] = consensus_cutoff
|
235
|
+
summary_json[:length_of_pid] = pid_length
|
236
|
+
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
237
|
+
|
238
|
+
# Primer ID over the cut-off
|
239
|
+
primer_id_count_over_n = []
|
240
|
+
primer_id_count.each do |primer_id,count|
|
241
|
+
primer_id_count_over_n << primer_id if count > consensus_cutoff
|
242
|
+
end
|
243
|
+
pid_to_process = primer_id_count_over_n.size
|
244
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
|
245
|
+
summary_json[:total_tcs_with_ambiguities] = pid_to_process
|
220
246
|
|
221
|
-
|
222
|
-
|
223
|
-
|
247
|
+
# setup output path
|
248
|
+
out_dir_set = File.join(indir, region)
|
249
|
+
Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
|
250
|
+
out_dir_consensus = File.join(out_dir_set, "consensus")
|
251
|
+
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
224
252
|
|
225
|
-
|
226
|
-
|
227
|
-
|
253
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
254
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
255
|
+
outfile_log = File.join(out_dir_set, 'log.json')
|
228
256
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
257
|
+
# if export_raw is true, create dir for raw sequence
|
258
|
+
if export_raw
|
259
|
+
out_dir_raw = File.join(out_dir_set, "raw")
|
260
|
+
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
261
|
+
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
262
|
+
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
263
|
+
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
264
|
+
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
265
|
+
|
266
|
+
if limit_raw
|
267
|
+
raw_keys = bio_r1.keys.sample(limit_raw.to_i)
|
268
|
+
else
|
269
|
+
raw_keys = bio_r1.keys
|
270
|
+
end
|
233
271
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
# Primer ID over the cut-off
|
242
|
-
primer_id_count_over_n = []
|
243
|
-
primer_id_count.each do |primer_id,count|
|
244
|
-
primer_id_count_over_n << primer_id if count > consensus_cutoff
|
245
|
-
end
|
246
|
-
pid_to_process = primer_id_count_over_n.size
|
247
|
-
log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
|
248
|
-
summary_json[:total_tcs_with_ambiguities] = pid_to_process
|
249
|
-
|
250
|
-
# setup output path
|
251
|
-
out_dir_set = File.join(indir, region)
|
252
|
-
Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
|
253
|
-
out_dir_consensus = File.join(out_dir_set, "consensus")
|
254
|
-
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
255
|
-
|
256
|
-
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
257
|
-
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
258
|
-
outfile_log = File.join(out_dir_set, 'log.json')
|
259
|
-
|
260
|
-
# if export_raw is true, create dir for raw sequence
|
261
|
-
if export_raw
|
262
|
-
out_dir_raw = File.join(out_dir_set, "raw")
|
263
|
-
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
264
|
-
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
265
|
-
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
266
|
-
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
267
|
-
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
268
|
-
|
269
|
-
if limit_raw
|
270
|
-
raw_keys = bio_r1.keys.sample(limit_raw.to_i)
|
271
|
-
else
|
272
|
-
raw_keys = bio_r1.keys
|
273
|
-
end
|
272
|
+
raw_keys.each do |k|
|
273
|
+
raw_r1_f.puts k + "_r1"
|
274
|
+
raw_r2_f.puts k + "_r2"
|
275
|
+
raw_r1_f.puts bio_r1[k]
|
276
|
+
raw_r2_f.puts bio_r2[k].rc
|
277
|
+
end
|
274
278
|
|
275
|
-
|
276
|
-
|
277
|
-
raw_r2_f.puts k + "_r2"
|
278
|
-
raw_r1_f.puts bio_r1[k]
|
279
|
-
raw_r2_f.puts bio_r2[k].rc
|
279
|
+
raw_r1_f.close
|
280
|
+
raw_r2_f.close
|
280
281
|
end
|
281
282
|
|
282
|
-
|
283
|
-
raw_r2_f.close
|
284
|
-
end
|
283
|
+
# create TCS
|
285
284
|
|
286
|
-
|
285
|
+
pid_seqtag_hash = {}
|
286
|
+
id.each do |name, pid|
|
287
|
+
if pid_seqtag_hash[pid]
|
288
|
+
pid_seqtag_hash[pid] << name
|
289
|
+
else
|
290
|
+
pid_seqtag_hash[pid] = []
|
291
|
+
pid_seqtag_hash[pid] << name
|
292
|
+
end
|
293
|
+
end
|
287
294
|
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
295
|
+
consensus = {}
|
296
|
+
r1_temp = {}
|
297
|
+
r2_temp = {}
|
298
|
+
m = 0
|
299
|
+
primer_id_count_over_n.each do |primer_id|
|
300
|
+
m += 1
|
301
|
+
log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
|
302
|
+
seq_with_same_primer_id = pid_seqtag_hash[primer_id]
|
303
|
+
r1_sub_seq = []
|
304
|
+
r2_sub_seq = []
|
305
|
+
seq_with_same_primer_id.each do |seq_name|
|
306
|
+
r1_sub_seq << bio_r1[seq_name]
|
307
|
+
r2_sub_seq << bio_r2[seq_name]
|
308
|
+
end
|
309
|
+
#consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
|
310
|
+
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
311
|
+
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
312
|
+
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
313
|
+
|
314
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
315
|
+
next if r1_consensus =~ /[^ATCG]/
|
316
|
+
next if r2_consensus =~ /[^ATCG]/
|
317
|
+
|
318
|
+
# reverse complement sequence of the R2 region
|
319
|
+
r2_consensus = r2_consensus.rc
|
320
|
+
consensus[consensus_name] = [r1_consensus, r2_consensus]
|
321
|
+
r1_temp[consensus_name] = r1_consensus
|
322
|
+
r2_temp[consensus_name] = r2_consensus
|
323
|
+
end
|
324
|
+
r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
|
325
|
+
r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
|
326
|
+
|
327
|
+
# filter consensus sequences for residual offspring PIDs
|
328
|
+
consensus_filtered = {}
|
329
|
+
consensus_number_temp = consensus.size
|
330
|
+
max_pid_comb = 4**pid_length
|
331
|
+
if consensus_number_temp < 0.003*max_pid_comb
|
332
|
+
log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
|
333
|
+
r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
|
334
|
+
r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
|
335
|
+
common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
|
336
|
+
common_pid.each do |pid|
|
337
|
+
consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
|
338
|
+
end
|
292
339
|
else
|
293
|
-
|
294
|
-
pid_seqtag_hash[pid] << name
|
340
|
+
consensus_filtered = consensus
|
295
341
|
end
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
342
|
+
n_con = consensus_filtered.size
|
343
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
|
344
|
+
summary_json[:total_tcs] = n_con
|
345
|
+
summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
|
346
|
+
|
347
|
+
log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
|
348
|
+
# r1_file output
|
349
|
+
f1 = File.open(outfile_r1, 'w')
|
350
|
+
f2 = File.open(outfile_r2, 'w')
|
351
|
+
primer_id_in_use = {}
|
352
|
+
if n_con > 0
|
353
|
+
r1_seq_length = consensus_filtered.values[0][0].size
|
354
|
+
r2_seq_length = consensus_filtered.values[0][1].size
|
355
|
+
else
|
356
|
+
next
|
311
357
|
end
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
|
329
|
-
|
330
|
-
# filter consensus sequences for residual offspring PIDs
|
331
|
-
consensus_filtered = {}
|
332
|
-
consensus_number_temp = consensus.size
|
333
|
-
max_pid_comb = 4**pid_length
|
334
|
-
if consensus_number_temp < 0.003*max_pid_comb
|
335
|
-
log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
|
336
|
-
r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
|
337
|
-
r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
|
338
|
-
common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
|
339
|
-
common_pid.each do |pid|
|
340
|
-
consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
|
358
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
359
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
360
|
+
consensus_filtered.each do |seq_name,seq|
|
361
|
+
f1.print seq_name + "_r1\n" + seq[0] + "\n"
|
362
|
+
f2.print seq_name + "_r2\n" + seq[1] + "\n"
|
363
|
+
primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
|
364
|
+
end
|
365
|
+
f1.close
|
366
|
+
f2.close
|
367
|
+
|
368
|
+
# Primer ID distribution in .json file
|
369
|
+
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
370
|
+
pid_json = {}
|
371
|
+
pid_json[:primer_id_in_use] = {}
|
372
|
+
primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
|
373
|
+
pid_json[:primer_id_in_use][k] = v
|
341
374
|
end
|
342
|
-
else
|
343
|
-
consensus_filtered = consensus
|
344
|
-
end
|
345
|
-
n_con = consensus_filtered.size
|
346
|
-
log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
|
347
|
-
summary_json[:total_tcs] = n_con
|
348
|
-
summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
|
349
|
-
|
350
|
-
log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
|
351
|
-
# r1_file output
|
352
|
-
f1 = File.open(outfile_r1, 'w')
|
353
|
-
f2 = File.open(outfile_r2, 'w')
|
354
|
-
primer_id_in_use = {}
|
355
|
-
if n_con > 0
|
356
|
-
r1_seq_length = consensus_filtered.values[0][0].size
|
357
|
-
r2_seq_length = consensus_filtered.values[0][1].size
|
358
|
-
else
|
359
|
-
next
|
360
|
-
end
|
361
|
-
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
362
|
-
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
363
|
-
consensus_filtered.each do |seq_name,seq|
|
364
|
-
f1.print seq_name + "_r1\n" + seq[0] + "\n"
|
365
|
-
f2.print seq_name + "_r2\n" + seq[1] + "\n"
|
366
|
-
primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
|
367
|
-
end
|
368
|
-
f1.close
|
369
|
-
f2.close
|
370
|
-
|
371
|
-
# Primer ID distribution in .json file
|
372
|
-
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
373
|
-
pid_json = {}
|
374
|
-
pid_json[:primer_id_in_use] = {}
|
375
|
-
primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
|
376
|
-
pid_json[:primer_id_in_use][k] = v
|
377
|
-
end
|
378
375
|
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
376
|
+
pid_json[:primer_id_distribution] = {}
|
377
|
+
primer_id_dis.sort_by{|k,v| k}.each do |k,v|
|
378
|
+
pid_json[:primer_id_distribution][k] = v
|
379
|
+
end
|
383
380
|
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
381
|
+
pid_json[:primer_id_frequency] = {}
|
382
|
+
primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
|
383
|
+
pid_json[:primer_id_frequency][k] = v
|
384
|
+
end
|
388
385
|
|
389
|
-
|
390
|
-
|
391
|
-
|
386
|
+
File.open(out_pid_json, 'w') do |f|
|
387
|
+
f.puts JSON.pretty_generate(pid_json)
|
388
|
+
end
|
392
389
|
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
390
|
+
# start end-join
|
391
|
+
def end_join(dir, option, overlap)
|
392
|
+
shp = ViralSeq::SeqHashPair.fa(dir)
|
393
|
+
case option
|
394
|
+
when 1
|
395
|
+
joined_sh = shp.join1()
|
396
|
+
when 2
|
397
|
+
joined_sh = shp.join1(overlap)
|
398
|
+
when 3
|
399
|
+
joined_sh = shp.join2
|
400
|
+
when 4
|
401
|
+
joined_sh = shp.join2(model: :indiv)
|
402
|
+
end
|
403
|
+
return joined_sh
|
405
404
|
end
|
406
|
-
return joined_sh
|
407
|
-
end
|
408
405
|
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
406
|
+
if primer[:end_join]
|
407
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
408
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
409
|
+
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
410
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
414
411
|
|
415
|
-
|
412
|
+
summary_json[:combined_tcs] = joined_sh.size
|
416
413
|
|
417
|
-
|
418
|
-
|
419
|
-
|
414
|
+
if export_raw
|
415
|
+
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
416
|
+
end
|
420
417
|
|
421
|
-
|
422
|
-
|
423
|
-
|
418
|
+
else
|
419
|
+
File.open(outfile_log, "w") do |f|
|
420
|
+
f.puts JSON.pretty_generate(summary_json)
|
421
|
+
end
|
422
|
+
next
|
424
423
|
end
|
425
|
-
next
|
426
|
-
end
|
427
424
|
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
end
|
436
|
-
if ref_end == 0
|
437
|
-
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
438
|
-
end
|
439
|
-
if primer[:end_join_option] == 1 and primer[:overlap] == 0
|
440
|
-
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
441
|
-
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
442
|
-
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
443
|
-
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
444
|
-
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
445
|
-
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
446
|
-
joined_seq = {}
|
447
|
-
new_r1_seq.each do |seq_name, seq|
|
448
|
-
next unless seq
|
449
|
-
next unless new_r2_seq[seq_name]
|
450
|
-
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
425
|
+
if primer[:TCS_QC]
|
426
|
+
ref_start = primer[:ref_start]
|
427
|
+
ref_end = primer[:ref_end]
|
428
|
+
ref_genome = primer[:ref_genome].to_sym
|
429
|
+
indel = primer[:indel]
|
430
|
+
if ref_start == 0
|
431
|
+
ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
451
432
|
end
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
433
|
+
if ref_end == 0
|
434
|
+
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
435
|
+
end
|
436
|
+
if primer[:end_join_option] == 1 and primer[:overlap] == 0
|
437
|
+
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
438
|
+
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
439
|
+
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
440
|
+
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
441
|
+
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
442
|
+
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
443
|
+
joined_seq = {}
|
444
|
+
new_r1_seq.each do |seq_name, seq|
|
463
445
|
next unless seq
|
464
|
-
next unless
|
465
|
-
|
446
|
+
next unless new_r2_seq[seq_name]
|
447
|
+
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
448
|
+
end
|
449
|
+
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
450
|
+
|
451
|
+
if export_raw
|
452
|
+
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
453
|
+
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
454
|
+
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
455
|
+
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
456
|
+
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
457
|
+
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
458
|
+
joined_seq_raw = {}
|
459
|
+
new_r1_seq_raw.each do |seq_name, seq|
|
460
|
+
next unless seq
|
461
|
+
next unless new_r2_seq_raw[seq_name]
|
462
|
+
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
463
|
+
end
|
464
|
+
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
465
|
+
end
|
466
|
+
else
|
467
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
468
|
+
|
469
|
+
if export_raw
|
470
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
466
471
|
end
|
467
|
-
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
468
472
|
end
|
469
|
-
else
|
470
|
-
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
471
473
|
|
472
|
-
|
473
|
-
|
474
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
475
|
+
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
476
|
+
if primer[:trim]
|
477
|
+
trim_start = primer[:trim_ref_start]
|
478
|
+
trim_end = primer[:trim_ref_end]
|
479
|
+
trim_ref = primer[:trim_ref].to_sym
|
480
|
+
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
481
|
+
if export_raw
|
482
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
483
|
+
end
|
474
484
|
end
|
475
|
-
end
|
476
485
|
|
477
|
-
|
478
|
-
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
479
|
-
if primer[:trim]
|
480
|
-
trim_start = primer[:trim_ref_start]
|
481
|
-
trim_end = primer[:trim_ref_end]
|
482
|
-
trim_ref = primer[:trim_ref].to_sym
|
483
|
-
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
486
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
484
487
|
if export_raw
|
485
|
-
joined_sh_raw
|
488
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
486
489
|
end
|
487
490
|
end
|
488
491
|
|
489
|
-
|
490
|
-
|
491
|
-
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
492
|
+
File.open(outfile_log, "w") do |f|
|
493
|
+
f.puts JSON.pretty_generate(summary_json)
|
492
494
|
end
|
493
495
|
end
|
494
496
|
|
495
|
-
|
496
|
-
|
497
|
+
unless options[:keep]
|
498
|
+
log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
|
499
|
+
File.unlink(r1_f)
|
500
|
+
File.unlink(r2_f)
|
501
|
+
end
|
502
|
+
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
|
503
|
+
log.close
|
504
|
+
puts "DONE!"
|
505
|
+
rescue => e
|
506
|
+
puts "`tcs` pipeline run with errors: " + e.message.red
|
507
|
+
puts "`tcs` pipeline aborted.".red.bold
|
508
|
+
log.puts Time.now.to_s + "\t" + e.full_message
|
509
|
+
log.puts Time.now.to_s + "\tAborted."
|
510
|
+
log.close
|
511
|
+
error_hash = {}
|
512
|
+
error_hash[:directory] = indir
|
513
|
+
error_hash[:tcs_version] = ViralSeq::TCS_VERSION
|
514
|
+
error_hash[:viralSeq_version] = ViralSeq::VERSION
|
515
|
+
error_hash[:time] = Time.now
|
516
|
+
error_hash[:error] = e.full_message
|
517
|
+
File.open(File.join(indir, ".tcs_error"), 'w') do |f|
|
518
|
+
f.puts JSON.pretty_generate([error_hash])
|
519
|
+
end
|
520
|
+
master_error_file = File.join(File.dirname(indir), ".tcs_error")
|
521
|
+
master_errors = []
|
522
|
+
if File.exist? master_error_file
|
523
|
+
master_errors << JSON.parse(File.read(master_error_file), symbolize_names: true)
|
524
|
+
end
|
525
|
+
master_errors << error_hash
|
526
|
+
File.open(master_error_file, 'w') do |f|
|
527
|
+
f.puts JSON.pretty_generate(master_errors)
|
497
528
|
end
|
498
529
|
end
|
499
|
-
|
500
|
-
unless options[:keep]
|
501
|
-
log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
|
502
|
-
File.unlink(r1_f)
|
503
|
-
File.unlink(r2_f)
|
504
|
-
end
|
505
|
-
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
|
506
|
-
log.close
|
507
|
-
puts "DONE!"
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|