viral_seq 1.2.2 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a235cae95121a8522a47620eb9f8c05a3e2e416084743cd23df43aff7870a2c4
4
- data.tar.gz: f0ce3a9412774eed703b0b0b663e7bb2dccf340f3f558cffdca85e920291794d
3
+ metadata.gz: 5fef5f765c741aa14885673a2f980726956ae341a39926929d10ac0c7b4a6ece
4
+ data.tar.gz: d1e0cb4b691d4aff657209f5f87f02653421dc1bf9ba6dd4f87e43320ad143d5
5
5
  SHA512:
6
- metadata.gz: b97f98e40b8257281bd29cee40942d16084cf175933fc8357838ebb2a9eede1ab93ba323dbf315afb300f0a7852b2c6d939235831124710fc6f16f109e3eafc5
7
- data.tar.gz: 4d660da22c69ce1ff929ed7f67d2b03aad662bb0237e9a93d9a8ea6bd1866d8544ad108db9ab8a11eee2df992395e41b68ffc43a8d1dbb132cc1f83a897676ef
6
+ metadata.gz: 98b18297e15a5fb0eff8706029dcfdecdc5c39df28b6ae3ea8fe5b7611f63f91276f7ed5e459a478b1af25ec25662804290decc923b1c52f946a184d197807be
7
+ data.tar.gz: 7e359a05ff783971beced635cb2d913c4a9b418391425eba5ce9344b0137b989ea007d1ed5dc07a677a1775d9c73055aad552e971a89ca2207156db5233b243f
data/README.md CHANGED
@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
10
10
 
11
11
  Specifically for Primer ID sequencing and HIV drug resistance analysis.
12
12
 
13
+ #### tcs web app - https://primer-id.org/
14
+
13
15
  ## Illustration for the Primer ID Sequencing
14
16
 
15
17
 
@@ -31,7 +33,9 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
31
33
  ### Excutables
32
34
 
33
35
  ### `tcs`
34
- Use executable `tcs` pipeline (v2.3.2) to process **Primer ID MiSeq sequencing** data.
36
+ Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
37
+
38
+ Web-based `tcs` analysis can be accessed at https://primer-id.org/
35
39
 
36
40
  Example commands:
37
41
  ```bash
@@ -175,6 +179,12 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
175
179
 
176
180
  ## Updates
177
181
 
182
+ ### Version 1.2.5-06232021
183
+
184
+ 1. Add error rescue and report in the `tcs` pipeline.
185
+ error messages are stored in the .tcs_error file. `tcs` pipeline updated to v2.3.4.
186
+ 2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
187
+
178
188
  ### Version 1.2.2-05272021
179
189
 
180
190
  1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
data/bin/tcs CHANGED
@@ -101,407 +101,429 @@ log = File.open(runtime_log_file, "w")
101
101
  log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
102
102
  log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
103
103
  log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
104
+ File.unlink(File.join(indir, ".tcs_error")) if File.exist?(File.join(indir, ".tcs_error"))
105
+
106
+ begin
107
+ libname = File.basename indir
108
+ seq_files = ViralSeq::TcsCore.r1r2 indir
109
+
110
+ if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
111
+ r1_f = seq_files[:r1_file]
112
+ r2_f = seq_files[:r2_file]
113
+ elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
114
+ raise StandardError.new "Missing R2 file."
115
+ elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
116
+ raise StandardError.new "Missing R1 file."
117
+ else
118
+ raise StandardError.new "Cannot determine R1 R2 file in #{indir}."
119
+ end
104
120
 
105
- libname = File.basename indir
121
+ r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
122
+ r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
106
123
 
107
- seq_files = ViralSeq::TcsCore.r1r2 indir
124
+ raw_sequence_number = r1_fastq_sh.size
125
+ log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
108
126
 
109
- if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
110
- r1_f = seq_files[:r1_file]
111
- r2_f = seq_files[:r2_file]
112
- elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
113
- exit_sig = "Missing R2 file. Aborted."
114
- elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
115
- exit_sig = "Missing R1 file. Aborted."
116
- else
117
- exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
118
- end
127
+ if params[:platform_error_rate]
128
+ error_rate = params[:platform_error_rate]
129
+ else
130
+ error_rate = 0.02
131
+ end
119
132
 
120
- if exit_sig
121
- ViralSeq::TcsCore.log_and_abort log, exit_sig
122
- end
133
+ if params[:platform_format]
134
+ $platform_sequencing_length = params[:platform_format]
135
+ else
136
+ $platform_sequencing_length = 300
137
+ end
123
138
 
124
- r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
125
- r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
139
+ primers = params[:primer_pairs]
140
+ if primers.empty?
141
+ ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
142
+ end
126
143
 
127
- raw_sequence_number = r1_fastq_sh.size
128
- log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
129
144
 
130
- if params[:platform_error_rate]
131
- error_rate = params[:platform_error_rate]
132
- else
133
- error_rate = 0.02
134
- end
145
+ primers.each do |primer|
146
+ summary_json = {}
147
+ summary_json[:warnings] = []
148
+ summary_json[:tcs_version] = ViralSeq::TCS_VERSION
149
+ summary_json[:viralseq_version] = ViralSeq::VERSION
150
+ summary_json[:runtime] = Time.now.to_s
135
151
 
136
- if params[:platform_format]
137
- $platform_sequencing_length = params[:platform_format]
138
- else
139
- $platform_sequencing_length = 300
140
- end
152
+ primer[:region] ? region = primer[:region] : region = "region"
153
+ summary_json[:primer_set_name] = region
141
154
 
142
- primers = params[:primer_pairs]
143
- if primers.empty?
144
- ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
145
- end
155
+ cdna_primer = primer[:cdna]
156
+ forward_primer = primer[:forward]
146
157
 
158
+ export_raw = primer[:export_raw]
159
+ limit_raw = primer[:limit_raw]
147
160
 
148
- primers.each do |primer|
149
- summary_json = {}
150
- summary_json[:warnings] = []
151
- summary_json[:tcs_version] = ViralSeq::TCS_VERSION
152
- summary_json[:viralseq_version] = ViralSeq::VERSION
153
- summary_json[:runtime] = Time.now.to_s
161
+ unless cdna_primer
162
+ log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
163
+ end
164
+ unless forward_primer
165
+ log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
166
+ end
167
+ summary_json[:cdan_primer] = cdna_primer
168
+ summary_json[:forward_primer] = forward_primer
169
+
170
+ primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
171
+ summary_json[:majority_cut_off] = majority_cut_off
172
+
173
+ summary_json[:total_raw_sequence] = raw_sequence_number
174
+
175
+ log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
176
+
177
+ # filter R1
178
+ log.puts Time.now.to_s + "\t" + "filtering R1..."
179
+ filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
180
+ r1_passed_seq = filter_r1[:r1_passed_seq]
181
+ log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
182
+ summary_json[:r1_filtered_raw] = r1_passed_seq.size
183
+
184
+ # filter R2
185
+ log.puts Time.now.to_s + "\t" + "filtering R2..."
186
+ filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
187
+ r2_passed_seq = filter_r2[:r2_passed_seq]
188
+ pid_length = filter_r2[:pid_length]
189
+ log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
190
+ summary_json[:r2_filtered_raw] = r2_passed_seq.size
191
+
192
+ # pair-end
193
+ log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
194
+ id = {} # hash for :sequence_tag => primer_id
195
+ bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
196
+ bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
197
+ common_keys = r1_passed_seq.keys & r2_passed_seq.keys
198
+ paired_seq_number = common_keys.size
199
+ log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
200
+ summary_json[:paired_raw_sequence] = paired_seq_number
201
+ if paired_seq_number < raw_sequence_number * 0.001
202
+ summary_json[:warnings] <<
203
+ "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
204
+ end
154
205
 
155
- primer[:region] ? region = primer[:region] : region = "region"
156
- summary_json[:primer_set_name] = region
206
+ common_keys.each do |seqtag|
207
+ r1_seq = r1_passed_seq[seqtag]
208
+ r2_seq = r2_passed_seq[seqtag]
209
+ pid = r2_seq[0, pid_length]
210
+ id[seqtag] = pid
211
+ bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
212
+ bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
213
+ end
157
214
 
158
- cdna_primer = primer[:cdna]
159
- forward_primer = primer[:forward]
215
+ # TCS cut-off
216
+ log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
160
217
 
161
- export_raw = primer[:export_raw]
162
- limit_raw = primer[:limit_raw]
218
+ primer_id_list = id.values
219
+ primer_id_count = primer_id_list.count_freq
220
+ primer_id_dis = primer_id_count.values.count_freq
163
221
 
164
- unless cdna_primer
165
- log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
166
- end
167
- unless forward_primer
168
- log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
169
- end
170
- summary_json[:cdan_primer] = cdna_primer
171
- summary_json[:forward_primer] = forward_primer
172
-
173
- primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
174
- summary_json[:majority_cut_off] = majority_cut_off
175
-
176
- summary_json[:total_raw_sequence] = raw_sequence_number
177
-
178
- log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
179
-
180
- # filter R1
181
- log.puts Time.now.to_s + "\t" + "filtering R1..."
182
- filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
183
- r1_passed_seq = filter_r1[:r1_passed_seq]
184
- log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
185
- summary_json[:r1_filtered_raw] = r1_passed_seq.size
186
-
187
- # filter R2
188
- log.puts Time.now.to_s + "\t" + "filtering R2..."
189
- filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
190
- r2_passed_seq = filter_r2[:r2_passed_seq]
191
- pid_length = filter_r2[:pid_length]
192
- log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
193
- summary_json[:r2_filtered_raw] = r2_passed_seq.size
194
-
195
- # pair-end
196
- log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
197
- id = {} # hash for :sequence_tag => primer_id
198
- bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
199
- bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
200
- common_keys = r1_passed_seq.keys & r2_passed_seq.keys
201
- paired_seq_number = common_keys.size
202
- log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
203
- summary_json[:paired_raw_sequence] = paired_seq_number
204
- if paired_seq_number < raw_sequence_number * 0.001
205
- summary_json[:warnings] <<
206
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
207
- end
222
+ # calculate distinct_to_raw
223
+ distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
224
+ summary_json[:distinct_to_raw] = distinct_to_raw
208
225
 
209
- common_keys.each do |seqtag|
210
- r1_seq = r1_passed_seq[seqtag]
211
- r2_seq = r2_passed_seq[seqtag]
212
- pid = r2_seq[0, pid_length]
213
- id[seqtag] = pid
214
- bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
215
- bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
216
- end
226
+ if primer_id_dis.keys.size < 5
227
+ log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
228
+ next
229
+ end
217
230
 
218
- # TCS cut-off
219
- log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
231
+ max_id = primer_id_dis.keys.sort[-5..-1].mean
232
+ consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
233
+ log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
234
+ summary_json[:consensus_cutoff] = consensus_cutoff
235
+ summary_json[:length_of_pid] = pid_length
236
+ log.puts Time.now.to_s + "\t" + "Creating consensus..."
237
+
238
+ # Primer ID over the cut-off
239
+ primer_id_count_over_n = []
240
+ primer_id_count.each do |primer_id,count|
241
+ primer_id_count_over_n << primer_id if count > consensus_cutoff
242
+ end
243
+ pid_to_process = primer_id_count_over_n.size
244
+ log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
245
+ summary_json[:total_tcs_with_ambiguities] = pid_to_process
220
246
 
221
- primer_id_list = id.values
222
- primer_id_count = primer_id_list.count_freq
223
- primer_id_dis = primer_id_count.values.count_freq
247
+ # setup output path
248
+ out_dir_set = File.join(indir, region)
249
+ Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
250
+ out_dir_consensus = File.join(out_dir_set, "consensus")
251
+ Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
224
252
 
225
- # calculate distinct_to_raw
226
- distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
227
- summary_json[:distinct_to_raw] = distinct_to_raw
253
+ outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
254
+ outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
255
+ outfile_log = File.join(out_dir_set, 'log.json')
228
256
 
229
- if primer_id_dis.keys.size < 5
230
- log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
231
- next
232
- end
257
+ # if export_raw is true, create dir for raw sequence
258
+ if export_raw
259
+ out_dir_raw = File.join(out_dir_set, "raw")
260
+ Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
261
+ outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
262
+ outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
263
+ raw_r1_f = File.open(outfile_raw_r1, 'w')
264
+ raw_r2_f = File.open(outfile_raw_r2, 'w')
265
+
266
+ if limit_raw
267
+ raw_keys = bio_r1.keys.sample(limit_raw.to_i)
268
+ else
269
+ raw_keys = bio_r1.keys
270
+ end
233
271
 
234
- max_id = primer_id_dis.keys.sort[-5..-1].mean
235
- consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
236
- log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
237
- summary_json[:consensus_cutoff] = consensus_cutoff
238
- summary_json[:length_of_pid] = pid_length
239
- log.puts Time.now.to_s + "\t" + "Creating consensus..."
240
-
241
- # Primer ID over the cut-off
242
- primer_id_count_over_n = []
243
- primer_id_count.each do |primer_id,count|
244
- primer_id_count_over_n << primer_id if count > consensus_cutoff
245
- end
246
- pid_to_process = primer_id_count_over_n.size
247
- log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
248
- summary_json[:total_tcs_with_ambiguities] = pid_to_process
249
-
250
- # setup output path
251
- out_dir_set = File.join(indir, region)
252
- Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
253
- out_dir_consensus = File.join(out_dir_set, "consensus")
254
- Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
255
-
256
- outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
257
- outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
258
- outfile_log = File.join(out_dir_set, 'log.json')
259
-
260
- # if export_raw is true, create dir for raw sequence
261
- if export_raw
262
- out_dir_raw = File.join(out_dir_set, "raw")
263
- Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
264
- outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
265
- outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
266
- raw_r1_f = File.open(outfile_raw_r1, 'w')
267
- raw_r2_f = File.open(outfile_raw_r2, 'w')
268
-
269
- if limit_raw
270
- raw_keys = bio_r1.keys.sample(limit_raw.to_i)
271
- else
272
- raw_keys = bio_r1.keys
273
- end
272
+ raw_keys.each do |k|
273
+ raw_r1_f.puts k + "_r1"
274
+ raw_r2_f.puts k + "_r2"
275
+ raw_r1_f.puts bio_r1[k]
276
+ raw_r2_f.puts bio_r2[k].rc
277
+ end
274
278
 
275
- raw_keys.each do |k|
276
- raw_r1_f.puts k + "_r1"
277
- raw_r2_f.puts k + "_r2"
278
- raw_r1_f.puts bio_r1[k]
279
- raw_r2_f.puts bio_r2[k].rc
279
+ raw_r1_f.close
280
+ raw_r2_f.close
280
281
  end
281
282
 
282
- raw_r1_f.close
283
- raw_r2_f.close
284
- end
283
+ # create TCS
285
284
 
286
- # create TCS
285
+ pid_seqtag_hash = {}
286
+ id.each do |name, pid|
287
+ if pid_seqtag_hash[pid]
288
+ pid_seqtag_hash[pid] << name
289
+ else
290
+ pid_seqtag_hash[pid] = []
291
+ pid_seqtag_hash[pid] << name
292
+ end
293
+ end
287
294
 
288
- pid_seqtag_hash = {}
289
- id.each do |name, pid|
290
- if pid_seqtag_hash[pid]
291
- pid_seqtag_hash[pid] << name
295
+ consensus = {}
296
+ r1_temp = {}
297
+ r2_temp = {}
298
+ m = 0
299
+ primer_id_count_over_n.each do |primer_id|
300
+ m += 1
301
+ log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
302
+ seq_with_same_primer_id = pid_seqtag_hash[primer_id]
303
+ r1_sub_seq = []
304
+ r2_sub_seq = []
305
+ seq_with_same_primer_id.each do |seq_name|
306
+ r1_sub_seq << bio_r1[seq_name]
307
+ r2_sub_seq << bio_r2[seq_name]
308
+ end
309
+ #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
310
+ consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
311
+ r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
312
+ r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
313
+
314
+ # hide the following two lines if allowing sequence to have ambiguities.
315
+ next if r1_consensus =~ /[^ATCG]/
316
+ next if r2_consensus =~ /[^ATCG]/
317
+
318
+ # reverse complement sequence of the R2 region
319
+ r2_consensus = r2_consensus.rc
320
+ consensus[consensus_name] = [r1_consensus, r2_consensus]
321
+ r1_temp[consensus_name] = r1_consensus
322
+ r2_temp[consensus_name] = r2_consensus
323
+ end
324
+ r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
325
+ r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
326
+
327
+ # filter consensus sequences for residual offspring PIDs
328
+ consensus_filtered = {}
329
+ consensus_number_temp = consensus.size
330
+ max_pid_comb = 4**pid_length
331
+ if consensus_number_temp < 0.003*max_pid_comb
332
+ log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
333
+ r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
334
+ r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
335
+ common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
336
+ common_pid.each do |pid|
337
+ consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
338
+ end
292
339
  else
293
- pid_seqtag_hash[pid] = []
294
- pid_seqtag_hash[pid] << name
340
+ consensus_filtered = consensus
295
341
  end
296
- end
297
-
298
- consensus = {}
299
- r1_temp = {}
300
- r2_temp = {}
301
- m = 0
302
- primer_id_count_over_n.each do |primer_id|
303
- m += 1
304
- log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
305
- seq_with_same_primer_id = pid_seqtag_hash[primer_id]
306
- r1_sub_seq = []
307
- r2_sub_seq = []
308
- seq_with_same_primer_id.each do |seq_name|
309
- r1_sub_seq << bio_r1[seq_name]
310
- r2_sub_seq << bio_r2[seq_name]
342
+ n_con = consensus_filtered.size
343
+ log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
344
+ summary_json[:total_tcs] = n_con
345
+ summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
346
+
347
+ log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
348
+ # r1_file output
349
+ f1 = File.open(outfile_r1, 'w')
350
+ f2 = File.open(outfile_r2, 'w')
351
+ primer_id_in_use = {}
352
+ if n_con > 0
353
+ r1_seq_length = consensus_filtered.values[0][0].size
354
+ r2_seq_length = consensus_filtered.values[0][1].size
355
+ else
356
+ next
311
357
  end
312
- #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
313
- consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
314
- r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
315
- r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
316
-
317
- # hide the following two lines if allowing sequence to have ambiguities.
318
- next if r1_consensus =~ /[^ATCG]/
319
- next if r2_consensus =~ /[^ATCG]/
320
-
321
- # reverse complement sequence of the R2 region
322
- r2_consensus = r2_consensus.rc
323
- consensus[consensus_name] = [r1_consensus, r2_consensus]
324
- r1_temp[consensus_name] = r1_consensus
325
- r2_temp[consensus_name] = r2_consensus
326
- end
327
- r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
328
- r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
329
-
330
- # filter consensus sequences for residual offspring PIDs
331
- consensus_filtered = {}
332
- consensus_number_temp = consensus.size
333
- max_pid_comb = 4**pid_length
334
- if consensus_number_temp < 0.003*max_pid_comb
335
- log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
336
- r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
337
- r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
338
- common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
339
- common_pid.each do |pid|
340
- consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
358
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
359
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
360
+ consensus_filtered.each do |seq_name,seq|
361
+ f1.print seq_name + "_r1\n" + seq[0] + "\n"
362
+ f2.print seq_name + "_r2\n" + seq[1] + "\n"
363
+ primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
364
+ end
365
+ f1.close
366
+ f2.close
367
+
368
+ # Primer ID distribution in .json file
369
+ out_pid_json = File.join(out_dir_set, 'primer_id.json')
370
+ pid_json = {}
371
+ pid_json[:primer_id_in_use] = {}
372
+ primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
373
+ pid_json[:primer_id_in_use][k] = v
341
374
  end
342
- else
343
- consensus_filtered = consensus
344
- end
345
- n_con = consensus_filtered.size
346
- log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
347
- summary_json[:total_tcs] = n_con
348
- summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
349
-
350
- log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
351
- # r1_file output
352
- f1 = File.open(outfile_r1, 'w')
353
- f2 = File.open(outfile_r2, 'w')
354
- primer_id_in_use = {}
355
- if n_con > 0
356
- r1_seq_length = consensus_filtered.values[0][0].size
357
- r2_seq_length = consensus_filtered.values[0][1].size
358
- else
359
- next
360
- end
361
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
362
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
363
- consensus_filtered.each do |seq_name,seq|
364
- f1.print seq_name + "_r1\n" + seq[0] + "\n"
365
- f2.print seq_name + "_r2\n" + seq[1] + "\n"
366
- primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
367
- end
368
- f1.close
369
- f2.close
370
-
371
- # Primer ID distribution in .json file
372
- out_pid_json = File.join(out_dir_set, 'primer_id.json')
373
- pid_json = {}
374
- pid_json[:primer_id_in_use] = {}
375
- primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
376
- pid_json[:primer_id_in_use][k] = v
377
- end
378
375
 
379
- pid_json[:primer_id_distribution] = {}
380
- primer_id_dis.sort_by{|k,v| k}.each do |k,v|
381
- pid_json[:primer_id_distribution][k] = v
382
- end
376
+ pid_json[:primer_id_distribution] = {}
377
+ primer_id_dis.sort_by{|k,v| k}.each do |k,v|
378
+ pid_json[:primer_id_distribution][k] = v
379
+ end
383
380
 
384
- pid_json[:primer_id_frequency] = {}
385
- primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
386
- pid_json[:primer_id_frequency][k] = v
387
- end
381
+ pid_json[:primer_id_frequency] = {}
382
+ primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
383
+ pid_json[:primer_id_frequency][k] = v
384
+ end
388
385
 
389
- File.open(out_pid_json, 'w') do |f|
390
- f.puts JSON.pretty_generate(pid_json)
391
- end
386
+ File.open(out_pid_json, 'w') do |f|
387
+ f.puts JSON.pretty_generate(pid_json)
388
+ end
392
389
 
393
- # start end-join
394
- def end_join(dir, option, overlap)
395
- shp = ViralSeq::SeqHashPair.fa(dir)
396
- case option
397
- when 1
398
- joined_sh = shp.join1()
399
- when 2
400
- joined_sh = shp.join1(overlap)
401
- when 3
402
- joined_sh = shp.join2
403
- when 4
404
- joined_sh = shp.join2(model: :indiv)
390
+ # start end-join
391
+ def end_join(dir, option, overlap)
392
+ shp = ViralSeq::SeqHashPair.fa(dir)
393
+ case option
394
+ when 1
395
+ joined_sh = shp.join1()
396
+ when 2
397
+ joined_sh = shp.join1(overlap)
398
+ when 3
399
+ joined_sh = shp.join2
400
+ when 4
401
+ joined_sh = shp.join2(model: :indiv)
402
+ end
403
+ return joined_sh
405
404
  end
406
- return joined_sh
407
- end
408
405
 
409
- if primer[:end_join]
410
- log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
411
- shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
412
- joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
413
- log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
406
+ if primer[:end_join]
407
+ log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
408
+ shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
409
+ joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
410
+ log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
414
411
 
415
- summary_json[:combined_tcs] = joined_sh.size
412
+ summary_json[:combined_tcs] = joined_sh.size
416
413
 
417
- if export_raw
418
- joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
419
- end
414
+ if export_raw
415
+ joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
416
+ end
420
417
 
421
- else
422
- File.open(outfile_log, "w") do |f|
423
- f.puts JSON.pretty_generate(summary_json)
418
+ else
419
+ File.open(outfile_log, "w") do |f|
420
+ f.puts JSON.pretty_generate(summary_json)
421
+ end
422
+ next
424
423
  end
425
- next
426
- end
427
424
 
428
- if primer[:TCS_QC]
429
- ref_start = primer[:ref_start]
430
- ref_end = primer[:ref_end]
431
- ref_genome = primer[:ref_genome].to_sym
432
- indel = primer[:indel]
433
- if ref_start == 0
434
- ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
435
- end
436
- if ref_end == 0
437
- ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
438
- end
439
- if primer[:end_join_option] == 1 and primer[:overlap] == 0
440
- r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
441
- r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
442
- r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
443
- r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
444
- new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
445
- new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
446
- joined_seq = {}
447
- new_r1_seq.each do |seq_name, seq|
448
- next unless seq
449
- next unless new_r2_seq[seq_name]
450
- joined_seq[seq_name] = seq + new_r2_seq[seq_name]
425
+ if primer[:TCS_QC]
426
+ ref_start = primer[:ref_start]
427
+ ref_end = primer[:ref_end]
428
+ ref_genome = primer[:ref_genome].to_sym
429
+ indel = primer[:indel]
430
+ if ref_start == 0
431
+ ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
451
432
  end
452
- joined_sh = ViralSeq::SeqHash.new(joined_seq)
453
-
454
- if export_raw
455
- r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
456
- r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
457
- r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
458
- r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
459
- new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
460
- new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
461
- joined_seq_raw = {}
462
- new_r1_seq_raw.each do |seq_name, seq|
433
+ if ref_end == 0
434
+ ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
435
+ end
436
+ if primer[:end_join_option] == 1 and primer[:overlap] == 0
437
+ r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
438
+ r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
439
+ r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
440
+ r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
441
+ new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
442
+ new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
443
+ joined_seq = {}
444
+ new_r1_seq.each do |seq_name, seq|
463
445
  next unless seq
464
- next unless new_r2_seq_raw[seq_name]
465
- joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
446
+ next unless new_r2_seq[seq_name]
447
+ joined_seq[seq_name] = seq + new_r2_seq[seq_name]
448
+ end
449
+ joined_sh = ViralSeq::SeqHash.new(joined_seq)
450
+
451
+ if export_raw
452
+ r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
453
+ r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
454
+ r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
455
+ r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
456
+ new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
457
+ new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
458
+ joined_seq_raw = {}
459
+ new_r1_seq_raw.each do |seq_name, seq|
460
+ next unless seq
461
+ next unless new_r2_seq_raw[seq_name]
462
+ joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
463
+ end
464
+ joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
465
+ end
466
+ else
467
+ joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
468
+
469
+ if export_raw
470
+ joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
466
471
  end
467
- joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
468
472
  end
469
- else
470
- joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
471
473
 
472
- if export_raw
473
- joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
474
+ log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
475
+ summary_json[:combined_tcs_after_qc] = joined_sh.size
476
+ if primer[:trim]
477
+ trim_start = primer[:trim_ref_start]
478
+ trim_end = primer[:trim_ref_end]
479
+ trim_ref = primer[:trim_ref].to_sym
480
+ joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
481
+ if export_raw
482
+ joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
483
+ end
474
484
  end
475
- end
476
485
 
477
- log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
478
- summary_json[:combined_tcs_after_qc] = joined_sh.size
479
- if primer[:trim]
480
- trim_start = primer[:trim_ref_start]
481
- trim_end = primer[:trim_ref_end]
482
- trim_ref = primer[:trim_ref].to_sym
483
- joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
486
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
484
487
  if export_raw
485
- joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
488
+ joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
486
489
  end
487
490
  end
488
491
 
489
- joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
490
- if export_raw
491
- joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
492
+ File.open(outfile_log, "w") do |f|
493
+ f.puts JSON.pretty_generate(summary_json)
492
494
  end
493
495
  end
494
496
 
495
- File.open(outfile_log, "w") do |f|
496
- f.puts JSON.pretty_generate(summary_json)
497
+ unless options[:keep]
498
+ log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
499
+ File.unlink(r1_f)
500
+ File.unlink(r2_f)
501
+ end
502
+ log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
503
+ log.close
504
+ puts "DONE!"
505
+ rescue => e
506
+ puts "`tcs` pipeline run with errors: " + e.message.red
507
+ puts "`tcs` pipeline aborted.".red.bold
508
+ log.puts Time.now.to_s + "\t" + e.full_message
509
+ log.puts Time.now.to_s + "\tAborted."
510
+ log.close
511
+ error_hash = {}
512
+ error_hash[:directory] = indir
513
+ error_hash[:tcs_version] = ViralSeq::TCS_VERSION
514
+ error_hash[:viralSeq_version] = ViralSeq::VERSION
515
+ error_hash[:time] = Time.now
516
+ error_hash[:error] = e.full_message
517
+ File.open(File.join(indir, ".tcs_error"), 'w') do |f|
518
+ f.puts JSON.pretty_generate([error_hash])
519
+ end
520
+ master_error_file = File.join(File.dirname(indir), ".tcs_error")
521
+ master_errors = []
522
+ if File.exist? master_error_file
523
+ master_errors << JSON.parse(File.read(master_error_file), symbolize_names: true)
524
+ end
525
+ master_errors << error_hash
526
+ File.open(master_error_file, 'w') do |f|
527
+ f.puts JSON.pretty_generate(master_errors)
497
528
  end
498
529
  end
499
-
500
- unless options[:keep]
501
- log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
502
- File.unlink(r1_f)
503
- File.unlink(r2_f)
504
- end
505
- log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
506
- log.close
507
- puts "DONE!"
@@ -8,7 +8,7 @@ module ViralSeq
8
8
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
9
9
  :forward=>
10
10
  "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
11
- :majority=>0.5,
11
+ :majority=>0,
12
12
  :end_join=>true,
13
13
  :end_join_option=>1,
14
14
  :overlap=>0,
@@ -23,7 +23,7 @@ module ViralSeq
23
23
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
24
24
  :forward=>
25
25
  "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
26
- :majority=>0.5,
26
+ :majority=>0,
27
27
  :end_join=>true,
28
28
  :end_join_option=>3,
29
29
  :TCS_QC=>true,
@@ -39,7 +39,7 @@ module ViralSeq
39
39
  :cdna=>
40
40
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
41
41
  :forward=>"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
42
- :majority=>0.5,
42
+ :majority=>0,
43
43
  :end_join=>true,
44
44
  :end_join_option=>3,
45
45
  :overlap=>171,
@@ -54,7 +54,7 @@ module ViralSeq
54
54
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
55
55
  :forward=>
56
56
  "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
57
- :majority=>0.5,
57
+ :majority=>0,
58
58
  :end_join=>true,
59
59
  :end_join_option=>1,
60
60
  :overlap=>0,
@@ -2,6 +2,6 @@
2
2
  # version info and histroy
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.2"
6
- TCS_VERSION = "2.3.2"
5
+ VERSION = "1.2.5"
6
+ TCS_VERSION = "2.3.4"
7
7
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.2
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
8
8
  - Michael Clark
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-05-27 00:00:00.000000000 Z
12
+ date: 2021-06-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
214
  requirements:
215
215
  - R required for some functions
216
216
  rubygems_version: 3.2.2
217
- signing_key:
217
+ signing_key:
218
218
  specification_version: 4
219
219
  summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
220
220
  test_files: []