viral_seq 1.2.2 → 1.2.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of viral_seq might be problematic. Click here for more details.

Files changed (5) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +5 -0
  3. data/bin/tcs +369 -347
  4. data/lib/viral_seq/version.rb +2 -2
  5. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a235cae95121a8522a47620eb9f8c05a3e2e416084743cd23df43aff7870a2c4
4
- data.tar.gz: f0ce3a9412774eed703b0b0b663e7bb2dccf340f3f558cffdca85e920291794d
3
+ metadata.gz: 2698b52858a35454ffcb452cfaaf7a88099e184791447d9852913ac013703903
4
+ data.tar.gz: cf8f87ee4486491dc35f3fc5719ce8e84bf2e94928fa143e085c9463e47c6b6f
5
5
  SHA512:
6
- metadata.gz: b97f98e40b8257281bd29cee40942d16084cf175933fc8357838ebb2a9eede1ab93ba323dbf315afb300f0a7852b2c6d939235831124710fc6f16f109e3eafc5
7
- data.tar.gz: 4d660da22c69ce1ff929ed7f67d2b03aad662bb0237e9a93d9a8ea6bd1866d8544ad108db9ab8a11eee2df992395e41b68ffc43a8d1dbb132cc1f83a897676ef
6
+ metadata.gz: 7dcfc6786d8791421ef7294405ac39a69cd03bca0778550b360111a20cb745b09c6c1432a3fb148b9b8ca012dd5b2c3d4c71837aed35879542a0b7b4a2cb0de2
7
+ data.tar.gz: a3dc5e58af51fc13e6b9672a589f6f1bd6f01b7ca7b2cff0beb763f557469882546496c29c198fb3df06793e413d72c830913c8509ead406853d0921868d9cf6
data/README.md CHANGED
@@ -175,6 +175,11 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
175
175
 
176
176
  ## Updates
177
177
 
178
+ ### Version 1.2.3-06042021
179
+
180
+ 1. Added error rescue and reporting to the `tcs` pipeline.
181
+ Error messages are stored in the `.tcs_error` file. `tcs` pipeline updated to v2.3.3.
182
+
178
183
  ### Version 1.2.2-05272021
179
184
 
180
185
  1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
data/bin/tcs CHANGED
@@ -101,407 +101,429 @@ log = File.open(runtime_log_file, "w")
101
101
  log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
102
102
  log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
103
103
  log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
104
+ File.unlink(File.join(indir, ".tcs_error"))
105
+
106
+ begin
107
+ libname = File.basename indir
108
+ seq_files = ViralSeq::TcsCore.r1r2 indir
109
+
110
+ if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
111
+ r1_f = seq_files[:r1_file]
112
+ r2_f = seq_files[:r2_file]
113
+ elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
114
+ raise StandardError.new "Missing R2 file."
115
+ elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
116
+ raise StandardError.new "Missing R1 file."
117
+ else
118
+ raise StandardError.new "Cannot determine R1 R2 file in #{indir}."
119
+ end
104
120
 
105
- libname = File.basename indir
121
+ r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
122
+ r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
106
123
 
107
- seq_files = ViralSeq::TcsCore.r1r2 indir
124
+ raw_sequence_number = r1_fastq_sh.size
125
+ log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
108
126
 
109
- if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
110
- r1_f = seq_files[:r1_file]
111
- r2_f = seq_files[:r2_file]
112
- elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
113
- exit_sig = "Missing R2 file. Aborted."
114
- elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
115
- exit_sig = "Missing R1 file. Aborted."
116
- else
117
- exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
118
- end
127
+ if params[:platform_error_rate]
128
+ error_rate = params[:platform_error_rate]
129
+ else
130
+ error_rate = 0.02
131
+ end
119
132
 
120
- if exit_sig
121
- ViralSeq::TcsCore.log_and_abort log, exit_sig
122
- end
133
+ if params[:platform_format]
134
+ $platform_sequencing_length = params[:platform_format]
135
+ else
136
+ $platform_sequencing_length = 300
137
+ end
123
138
 
124
- r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
125
- r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
139
+ primers = params[:primer_pairs]
140
+ if primers.empty?
141
+ ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
142
+ end
126
143
 
127
- raw_sequence_number = r1_fastq_sh.size
128
- log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
129
144
 
130
- if params[:platform_error_rate]
131
- error_rate = params[:platform_error_rate]
132
- else
133
- error_rate = 0.02
134
- end
145
+ primers.each do |primer|
146
+ summary_json = {}
147
+ summary_json[:warnings] = []
148
+ summary_json[:tcs_version] = ViralSeq::TCS_VERSION
149
+ summary_json[:viralseq_version] = ViralSeq::VERSION
150
+ summary_json[:runtime] = Time.now.to_s
135
151
 
136
- if params[:platform_format]
137
- $platform_sequencing_length = params[:platform_format]
138
- else
139
- $platform_sequencing_length = 300
140
- end
152
+ primer[:region] ? region = primer[:region] : region = "region"
153
+ summary_json[:primer_set_name] = region
141
154
 
142
- primers = params[:primer_pairs]
143
- if primers.empty?
144
- ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
145
- end
155
+ cdna_primer = primer[:cdna]
156
+ forward_primer = primer[:forward]
146
157
 
158
+ export_raw = primer[:export_raw]
159
+ limit_raw = primer[:limit_raw]
147
160
 
148
- primers.each do |primer|
149
- summary_json = {}
150
- summary_json[:warnings] = []
151
- summary_json[:tcs_version] = ViralSeq::TCS_VERSION
152
- summary_json[:viralseq_version] = ViralSeq::VERSION
153
- summary_json[:runtime] = Time.now.to_s
161
+ unless cdna_primer
162
+ log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
163
+ end
164
+ unless forward_primer
165
+ log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
166
+ end
167
+ summary_json[:cdan_primer] = cdna_primer
168
+ summary_json[:forward_primer] = forward_primer
169
+
170
+ primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
171
+ summary_json[:majority_cut_off] = majority_cut_off
172
+
173
+ summary_json[:total_raw_sequence] = raw_sequence_number
174
+
175
+ log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
176
+
177
+ # filter R1
178
+ log.puts Time.now.to_s + "\t" + "filtering R1..."
179
+ filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
180
+ r1_passed_seq = filter_r1[:r1_passed_seq]
181
+ log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
182
+ summary_json[:r1_filtered_raw] = r1_passed_seq.size
183
+
184
+ # filter R2
185
+ log.puts Time.now.to_s + "\t" + "filtering R2..."
186
+ filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
187
+ r2_passed_seq = filter_r2[:r2_passed_seq]
188
+ pid_length = filter_r2[:pid_length]
189
+ log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
190
+ summary_json[:r2_filtered_raw] = r2_passed_seq.size
191
+
192
+ # pair-end
193
+ log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
194
+ id = {} # hash for :sequence_tag => primer_id
195
+ bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
196
+ bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
197
+ common_keys = r1_passed_seq.keys & r2_passed_seq.keys
198
+ paired_seq_number = common_keys.size
199
+ log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
200
+ summary_json[:paired_raw_sequence] = paired_seq_number
201
+ if paired_seq_number < raw_sequence_number * 0.001
202
+ summary_json[:warnings] <<
203
+ "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
204
+ end
154
205
 
155
- primer[:region] ? region = primer[:region] : region = "region"
156
- summary_json[:primer_set_name] = region
206
+ common_keys.each do |seqtag|
207
+ r1_seq = r1_passed_seq[seqtag]
208
+ r2_seq = r2_passed_seq[seqtag]
209
+ pid = r2_seq[0, pid_length]
210
+ id[seqtag] = pid
211
+ bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
212
+ bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
213
+ end
157
214
 
158
- cdna_primer = primer[:cdna]
159
- forward_primer = primer[:forward]
215
+ # TCS cut-off
216
+ log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
160
217
 
161
- export_raw = primer[:export_raw]
162
- limit_raw = primer[:limit_raw]
218
+ primer_id_list = id.values
219
+ primer_id_count = primer_id_list.count_freq
220
+ primer_id_dis = primer_id_count.values.count_freq
163
221
 
164
- unless cdna_primer
165
- log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
166
- end
167
- unless forward_primer
168
- log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
169
- end
170
- summary_json[:cdan_primer] = cdna_primer
171
- summary_json[:forward_primer] = forward_primer
172
-
173
- primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
174
- summary_json[:majority_cut_off] = majority_cut_off
175
-
176
- summary_json[:total_raw_sequence] = raw_sequence_number
177
-
178
- log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
179
-
180
- # filter R1
181
- log.puts Time.now.to_s + "\t" + "filtering R1..."
182
- filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
183
- r1_passed_seq = filter_r1[:r1_passed_seq]
184
- log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
185
- summary_json[:r1_filtered_raw] = r1_passed_seq.size
186
-
187
- # filter R2
188
- log.puts Time.now.to_s + "\t" + "filtering R2..."
189
- filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
190
- r2_passed_seq = filter_r2[:r2_passed_seq]
191
- pid_length = filter_r2[:pid_length]
192
- log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
193
- summary_json[:r2_filtered_raw] = r2_passed_seq.size
194
-
195
- # pair-end
196
- log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
197
- id = {} # hash for :sequence_tag => primer_id
198
- bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
199
- bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
200
- common_keys = r1_passed_seq.keys & r2_passed_seq.keys
201
- paired_seq_number = common_keys.size
202
- log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
203
- summary_json[:paired_raw_sequence] = paired_seq_number
204
- if paired_seq_number < raw_sequence_number * 0.001
205
- summary_json[:warnings] <<
206
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
207
- end
222
+ # calculate distinct_to_raw
223
+ distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
224
+ summary_json[:distinct_to_raw] = distinct_to_raw
208
225
 
209
- common_keys.each do |seqtag|
210
- r1_seq = r1_passed_seq[seqtag]
211
- r2_seq = r2_passed_seq[seqtag]
212
- pid = r2_seq[0, pid_length]
213
- id[seqtag] = pid
214
- bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
215
- bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
216
- end
226
+ if primer_id_dis.keys.size < 5
227
+ log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
228
+ next
229
+ end
217
230
 
218
- # TCS cut-off
219
- log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
231
+ max_id = primer_id_dis.keys.sort[-5..-1].mean
232
+ consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
233
+ log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
234
+ summary_json[:consensus_cutoff] = consensus_cutoff
235
+ summary_json[:length_of_pid] = pid_length
236
+ log.puts Time.now.to_s + "\t" + "Creating consensus..."
237
+
238
+ # Primer ID over the cut-off
239
+ primer_id_count_over_n = []
240
+ primer_id_count.each do |primer_id,count|
241
+ primer_id_count_over_n << primer_id if count > consensus_cutoff
242
+ end
243
+ pid_to_process = primer_id_count_over_n.size
244
+ log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
245
+ summary_json[:total_tcs_with_ambiguities] = pid_to_process
220
246
 
221
- primer_id_list = id.values
222
- primer_id_count = primer_id_list.count_freq
223
- primer_id_dis = primer_id_count.values.count_freq
247
+ # setup output path
248
+ out_dir_set = File.join(indir, region)
249
+ Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
250
+ out_dir_consensus = File.join(out_dir_set, "consensus")
251
+ Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
224
252
 
225
- # calculate distinct_to_raw
226
- distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
227
- summary_json[:distinct_to_raw] = distinct_to_raw
253
+ outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
254
+ outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
255
+ outfile_log = File.join(out_dir_set, 'log.json')
228
256
 
229
- if primer_id_dis.keys.size < 5
230
- log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
231
- next
232
- end
257
+ # if export_raw is true, create dir for raw sequence
258
+ if export_raw
259
+ out_dir_raw = File.join(out_dir_set, "raw")
260
+ Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
261
+ outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
262
+ outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
263
+ raw_r1_f = File.open(outfile_raw_r1, 'w')
264
+ raw_r2_f = File.open(outfile_raw_r2, 'w')
265
+
266
+ if limit_raw
267
+ raw_keys = bio_r1.keys.sample(limit_raw.to_i)
268
+ else
269
+ raw_keys = bio_r1.keys
270
+ end
233
271
 
234
- max_id = primer_id_dis.keys.sort[-5..-1].mean
235
- consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
236
- log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
237
- summary_json[:consensus_cutoff] = consensus_cutoff
238
- summary_json[:length_of_pid] = pid_length
239
- log.puts Time.now.to_s + "\t" + "Creating consensus..."
240
-
241
- # Primer ID over the cut-off
242
- primer_id_count_over_n = []
243
- primer_id_count.each do |primer_id,count|
244
- primer_id_count_over_n << primer_id if count > consensus_cutoff
245
- end
246
- pid_to_process = primer_id_count_over_n.size
247
- log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
248
- summary_json[:total_tcs_with_ambiguities] = pid_to_process
249
-
250
- # setup output path
251
- out_dir_set = File.join(indir, region)
252
- Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
253
- out_dir_consensus = File.join(out_dir_set, "consensus")
254
- Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
255
-
256
- outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
257
- outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
258
- outfile_log = File.join(out_dir_set, 'log.json')
259
-
260
- # if export_raw is true, create dir for raw sequence
261
- if export_raw
262
- out_dir_raw = File.join(out_dir_set, "raw")
263
- Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
264
- outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
265
- outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
266
- raw_r1_f = File.open(outfile_raw_r1, 'w')
267
- raw_r2_f = File.open(outfile_raw_r2, 'w')
268
-
269
- if limit_raw
270
- raw_keys = bio_r1.keys.sample(limit_raw.to_i)
271
- else
272
- raw_keys = bio_r1.keys
273
- end
272
+ raw_keys.each do |k|
273
+ raw_r1_f.puts k + "_r1"
274
+ raw_r2_f.puts k + "_r2"
275
+ raw_r1_f.puts bio_r1[k]
276
+ raw_r2_f.puts bio_r2[k].rc
277
+ end
274
278
 
275
- raw_keys.each do |k|
276
- raw_r1_f.puts k + "_r1"
277
- raw_r2_f.puts k + "_r2"
278
- raw_r1_f.puts bio_r1[k]
279
- raw_r2_f.puts bio_r2[k].rc
279
+ raw_r1_f.close
280
+ raw_r2_f.close
280
281
  end
281
282
 
282
- raw_r1_f.close
283
- raw_r2_f.close
284
- end
283
+ # create TCS
285
284
 
286
- # create TCS
285
+ pid_seqtag_hash = {}
286
+ id.each do |name, pid|
287
+ if pid_seqtag_hash[pid]
288
+ pid_seqtag_hash[pid] << name
289
+ else
290
+ pid_seqtag_hash[pid] = []
291
+ pid_seqtag_hash[pid] << name
292
+ end
293
+ end
287
294
 
288
- pid_seqtag_hash = {}
289
- id.each do |name, pid|
290
- if pid_seqtag_hash[pid]
291
- pid_seqtag_hash[pid] << name
295
+ consensus = {}
296
+ r1_temp = {}
297
+ r2_temp = {}
298
+ m = 0
299
+ primer_id_count_over_n.each do |primer_id|
300
+ m += 1
301
+ log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
302
+ seq_with_same_primer_id = pid_seqtag_hash[primer_id]
303
+ r1_sub_seq = []
304
+ r2_sub_seq = []
305
+ seq_with_same_primer_id.each do |seq_name|
306
+ r1_sub_seq << bio_r1[seq_name]
307
+ r2_sub_seq << bio_r2[seq_name]
308
+ end
309
+ #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
310
+ consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
311
+ r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
312
+ r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
313
+
314
+ # hide the following two lines if allowing sequence to have ambiguities.
315
+ next if r1_consensus =~ /[^ATCG]/
316
+ next if r2_consensus =~ /[^ATCG]/
317
+
318
+ # reverse complement sequence of the R2 region
319
+ r2_consensus = r2_consensus.rc
320
+ consensus[consensus_name] = [r1_consensus, r2_consensus]
321
+ r1_temp[consensus_name] = r1_consensus
322
+ r2_temp[consensus_name] = r2_consensus
323
+ end
324
+ r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
325
+ r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
326
+
327
+ # filter consensus sequences for residual offspring PIDs
328
+ consensus_filtered = {}
329
+ consensus_number_temp = consensus.size
330
+ max_pid_comb = 4**pid_length
331
+ if consensus_number_temp < 0.003*max_pid_comb
332
+ log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
333
+ r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
334
+ r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
335
+ common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
336
+ common_pid.each do |pid|
337
+ consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
338
+ end
292
339
  else
293
- pid_seqtag_hash[pid] = []
294
- pid_seqtag_hash[pid] << name
340
+ consensus_filtered = consensus
295
341
  end
296
- end
297
-
298
- consensus = {}
299
- r1_temp = {}
300
- r2_temp = {}
301
- m = 0
302
- primer_id_count_over_n.each do |primer_id|
303
- m += 1
304
- log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
305
- seq_with_same_primer_id = pid_seqtag_hash[primer_id]
306
- r1_sub_seq = []
307
- r2_sub_seq = []
308
- seq_with_same_primer_id.each do |seq_name|
309
- r1_sub_seq << bio_r1[seq_name]
310
- r2_sub_seq << bio_r2[seq_name]
342
+ n_con = consensus_filtered.size
343
+ log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
344
+ summary_json[:total_tcs] = n_con
345
+ summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
346
+
347
+ log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
348
+ # r1_file output
349
+ f1 = File.open(outfile_r1, 'w')
350
+ f2 = File.open(outfile_r2, 'w')
351
+ primer_id_in_use = {}
352
+ if n_con > 0
353
+ r1_seq_length = consensus_filtered.values[0][0].size
354
+ r2_seq_length = consensus_filtered.values[0][1].size
355
+ else
356
+ next
311
357
  end
312
- #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
313
- consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
314
- r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
315
- r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
316
-
317
- # hide the following two lines if allowing sequence to have ambiguities.
318
- next if r1_consensus =~ /[^ATCG]/
319
- next if r2_consensus =~ /[^ATCG]/
320
-
321
- # reverse complement sequence of the R2 region
322
- r2_consensus = r2_consensus.rc
323
- consensus[consensus_name] = [r1_consensus, r2_consensus]
324
- r1_temp[consensus_name] = r1_consensus
325
- r2_temp[consensus_name] = r2_consensus
326
- end
327
- r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
328
- r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
329
-
330
- # filter consensus sequences for residual offspring PIDs
331
- consensus_filtered = {}
332
- consensus_number_temp = consensus.size
333
- max_pid_comb = 4**pid_length
334
- if consensus_number_temp < 0.003*max_pid_comb
335
- log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
336
- r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
337
- r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
338
- common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
339
- common_pid.each do |pid|
340
- consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
358
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
359
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
360
+ consensus_filtered.each do |seq_name,seq|
361
+ f1.print seq_name + "_r1\n" + seq[0] + "\n"
362
+ f2.print seq_name + "_r2\n" + seq[1] + "\n"
363
+ primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
364
+ end
365
+ f1.close
366
+ f2.close
367
+
368
+ # Primer ID distribution in .json file
369
+ out_pid_json = File.join(out_dir_set, 'primer_id.json')
370
+ pid_json = {}
371
+ pid_json[:primer_id_in_use] = {}
372
+ primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
373
+ pid_json[:primer_id_in_use][k] = v
341
374
  end
342
- else
343
- consensus_filtered = consensus
344
- end
345
- n_con = consensus_filtered.size
346
- log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
347
- summary_json[:total_tcs] = n_con
348
- summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
349
-
350
- log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
351
- # r1_file output
352
- f1 = File.open(outfile_r1, 'w')
353
- f2 = File.open(outfile_r2, 'w')
354
- primer_id_in_use = {}
355
- if n_con > 0
356
- r1_seq_length = consensus_filtered.values[0][0].size
357
- r2_seq_length = consensus_filtered.values[0][1].size
358
- else
359
- next
360
- end
361
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
362
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
363
- consensus_filtered.each do |seq_name,seq|
364
- f1.print seq_name + "_r1\n" + seq[0] + "\n"
365
- f2.print seq_name + "_r2\n" + seq[1] + "\n"
366
- primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
367
- end
368
- f1.close
369
- f2.close
370
-
371
- # Primer ID distribution in .json file
372
- out_pid_json = File.join(out_dir_set, 'primer_id.json')
373
- pid_json = {}
374
- pid_json[:primer_id_in_use] = {}
375
- primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
376
- pid_json[:primer_id_in_use][k] = v
377
- end
378
375
 
379
- pid_json[:primer_id_distribution] = {}
380
- primer_id_dis.sort_by{|k,v| k}.each do |k,v|
381
- pid_json[:primer_id_distribution][k] = v
382
- end
376
+ pid_json[:primer_id_distribution] = {}
377
+ primer_id_dis.sort_by{|k,v| k}.each do |k,v|
378
+ pid_json[:primer_id_distribution][k] = v
379
+ end
383
380
 
384
- pid_json[:primer_id_frequency] = {}
385
- primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
386
- pid_json[:primer_id_frequency][k] = v
387
- end
381
+ pid_json[:primer_id_frequency] = {}
382
+ primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
383
+ pid_json[:primer_id_frequency][k] = v
384
+ end
388
385
 
389
- File.open(out_pid_json, 'w') do |f|
390
- f.puts JSON.pretty_generate(pid_json)
391
- end
386
+ File.open(out_pid_json, 'w') do |f|
387
+ f.puts JSON.pretty_generate(pid_json)
388
+ end
392
389
 
393
- # start end-join
394
- def end_join(dir, option, overlap)
395
- shp = ViralSeq::SeqHashPair.fa(dir)
396
- case option
397
- when 1
398
- joined_sh = shp.join1()
399
- when 2
400
- joined_sh = shp.join1(overlap)
401
- when 3
402
- joined_sh = shp.join2
403
- when 4
404
- joined_sh = shp.join2(model: :indiv)
390
+ # start end-join
391
+ def end_join(dir, option, overlap)
392
+ shp = ViralSeq::SeqHashPair.fa(dir)
393
+ case option
394
+ when 1
395
+ joined_sh = shp.join1()
396
+ when 2
397
+ joined_sh = shp.join1(overlap)
398
+ when 3
399
+ joined_sh = shp.join2
400
+ when 4
401
+ joined_sh = shp.join2(model: :indiv)
402
+ end
403
+ return joined_sh
405
404
  end
406
- return joined_sh
407
- end
408
405
 
409
- if primer[:end_join]
410
- log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
411
- shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
412
- joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
413
- log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
406
+ if primer[:end_join]
407
+ log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
408
+ shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
409
+ joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
410
+ log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
414
411
 
415
- summary_json[:combined_tcs] = joined_sh.size
412
+ summary_json[:combined_tcs] = joined_sh.size
416
413
 
417
- if export_raw
418
- joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
419
- end
414
+ if export_raw
415
+ joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
416
+ end
420
417
 
421
- else
422
- File.open(outfile_log, "w") do |f|
423
- f.puts JSON.pretty_generate(summary_json)
418
+ else
419
+ File.open(outfile_log, "w") do |f|
420
+ f.puts JSON.pretty_generate(summary_json)
421
+ end
422
+ next
424
423
  end
425
- next
426
- end
427
424
 
428
- if primer[:TCS_QC]
429
- ref_start = primer[:ref_start]
430
- ref_end = primer[:ref_end]
431
- ref_genome = primer[:ref_genome].to_sym
432
- indel = primer[:indel]
433
- if ref_start == 0
434
- ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
435
- end
436
- if ref_end == 0
437
- ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
438
- end
439
- if primer[:end_join_option] == 1 and primer[:overlap] == 0
440
- r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
441
- r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
442
- r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
443
- r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
444
- new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
445
- new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
446
- joined_seq = {}
447
- new_r1_seq.each do |seq_name, seq|
448
- next unless seq
449
- next unless new_r2_seq[seq_name]
450
- joined_seq[seq_name] = seq + new_r2_seq[seq_name]
425
+ if primer[:TCS_QC]
426
+ ref_start = primer[:ref_start]
427
+ ref_end = primer[:ref_end]
428
+ ref_genome = primer[:ref_genome].to_sym
429
+ indel = primer[:indel]
430
+ if ref_start == 0
431
+ ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
451
432
  end
452
- joined_sh = ViralSeq::SeqHash.new(joined_seq)
453
-
454
- if export_raw
455
- r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
456
- r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
457
- r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
458
- r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
459
- new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
460
- new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
461
- joined_seq_raw = {}
462
- new_r1_seq_raw.each do |seq_name, seq|
433
+ if ref_end == 0
434
+ ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
435
+ end
436
+ if primer[:end_join_option] == 1 and primer[:overlap] == 0
437
+ r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
438
+ r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
439
+ r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
440
+ r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
441
+ new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
442
+ new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
443
+ joined_seq = {}
444
+ new_r1_seq.each do |seq_name, seq|
463
445
  next unless seq
464
- next unless new_r2_seq_raw[seq_name]
465
- joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
446
+ next unless new_r2_seq[seq_name]
447
+ joined_seq[seq_name] = seq + new_r2_seq[seq_name]
448
+ end
449
+ joined_sh = ViralSeq::SeqHash.new(joined_seq)
450
+
451
+ if export_raw
452
+ r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
453
+ r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
454
+ r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
455
+ r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
456
+ new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
457
+ new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
458
+ joined_seq_raw = {}
459
+ new_r1_seq_raw.each do |seq_name, seq|
460
+ next unless seq
461
+ next unless new_r2_seq_raw[seq_name]
462
+ joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
463
+ end
464
+ joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
465
+ end
466
+ else
467
+ joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
468
+
469
+ if export_raw
470
+ joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
466
471
  end
467
- joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
468
472
  end
469
- else
470
- joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
471
473
 
472
- if export_raw
473
- joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
474
+ log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
475
+ summary_json[:combined_tcs_after_qc] = joined_sh.size
476
+ if primer[:trim]
477
+ trim_start = primer[:trim_ref_start]
478
+ trim_end = primer[:trim_ref_end]
479
+ trim_ref = primer[:trim_ref].to_sym
480
+ joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
481
+ if export_raw
482
+ joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
483
+ end
474
484
  end
475
- end
476
485
 
477
- log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
478
- summary_json[:combined_tcs_after_qc] = joined_sh.size
479
- if primer[:trim]
480
- trim_start = primer[:trim_ref_start]
481
- trim_end = primer[:trim_ref_end]
482
- trim_ref = primer[:trim_ref].to_sym
483
- joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
486
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
484
487
  if export_raw
485
- joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
488
+ joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
486
489
  end
487
490
  end
488
491
 
489
- joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
490
- if export_raw
491
- joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
492
+ File.open(outfile_log, "w") do |f|
493
+ f.puts JSON.pretty_generate(summary_json)
492
494
  end
493
495
  end
494
496
 
495
- File.open(outfile_log, "w") do |f|
496
- f.puts JSON.pretty_generate(summary_json)
497
+ unless options[:keep]
498
+ log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
499
+ File.unlink(r1_f)
500
+ File.unlink(r2_f)
501
+ end
502
+ log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
503
+ log.close
504
+ puts "DONE!"
505
+ rescue => e
506
+ puts "`tcs` pipeline run with errors: " + e.message.red
507
+ puts "`tcs` pipeline aborted.".red.bold
508
+ log.puts Time.now.to_s + "\t" + e.full_message
509
+ log.puts Time.now.to_s + "\tAborted."
510
+ log.close
511
+ error_hash = {}
512
+ error_hash[:directory] = indir
513
+ error_hash[:tcs_version] = ViralSeq::TCS_VERSION
514
+ error_hash[:viralSeq_version] = ViralSeq::VERSION
515
+ error_hash[:time] = Time.now
516
+ error_hash[:error] = e.full_message
517
+ File.open(File.join(indir, ".tcs_error"), 'w') do |f|
518
+ f.puts JSON.pretty_generate([error_hash])
519
+ end
520
+ master_error_file = File.join(File.dirname(indir), ".tcs_error")
521
+ master_errors = []
522
+ if File.exist? master_error_file
523
+ master_errors << JSON.parse(File.read(master_error_file), symbolize_names: true)
524
+ end
525
+ master_errors << error_hash
526
+ File.open(master_error_file, 'w') do |f|
527
+ f.puts JSON.pretty_generate(master_errors)
497
528
  end
498
529
  end
499
-
500
- unless options[:keep]
501
- log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
502
- File.unlink(r1_f)
503
- File.unlink(r2_f)
504
- end
505
- log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
506
- log.close
507
- puts "DONE!"
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.2"
6
- TCS_VERSION = "2.3.2"
5
+ VERSION = "1.2.3"
6
+ TCS_VERSION = "2.3.3"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-05-27 00:00:00.000000000 Z
12
+ date: 2021-06-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler