viral_seq 1.1.2 → 1.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '093a8d1d70e251b0748b7035c829eb512991437ffa78fd67387318412e54acf5'
4
- data.tar.gz: 1b9d6f6b2cb2ffa8d9cc588b8df096e7ac3840c694bfb241fcf970b738899328
3
+ metadata.gz: aee2536198f5579951f7c5f5f1b97ed27fb0bf5459e6dbfe79137a030951d443
4
+ data.tar.gz: 9ab9b69adf228d3429e02e5d47674ce0a0d137fa6231d940ef8d6cc9abfa1cdf
5
5
  SHA512:
6
- metadata.gz: 3853dbfa3f6604d907ec3d77b8c86ec8d885fedcc854c40ca6822ec72e8b2cfe9413bc188aa722a14e4e4f6c9503eca1b36d7f8e0963a5a997c9f0ca8b54fc86
7
- data.tar.gz: e5b056cddcf7b87cc30e52c878879cea82d865ea7fc867535767918c30c699d58d6f426518aad02be49916c49f38d9603b0ab27ca6f3625f7a5102ae86863023
6
+ metadata.gz: d9b5def3e29f819b6d83902676e80fa2d81531cb44132942244125f6b63110d66389e7f9c61d6e9ea3d80ece57d22d95d5c3948c3f7f71d48835b2714794ffe0
7
+ data.tar.gz: 44b996d9caf9029f2ef522d9058410cdc3b4ec253ccc7bacca5d9ecf66f487624b0b6d18c960b6c7a0af7bfcd2bc72c8c02df59d3c4c7834ab80d4a9eed2a424
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # ViralSeq
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/viral_seq.svg)](https://rubygems.org/gems/viral_seq)
3
+ [![Gem Version](https://img.shields.io/gem/v/viral_seq?color=%2300e673&style=flat-square)](https://rubygems.org/gems/viral_seq)
4
4
  ![GitHub](https://img.shields.io/github/license/viralseq/viral_seq)
5
5
  ![Gem](https://img.shields.io/gem/dt/viral_seq?color=%23E9967A)
6
6
  ![GitHub last commit](https://img.shields.io/github/last-commit/viralseq/viral_seq?color=%2300BFFF)
@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
10
10
 
11
11
  Specifically for Primer ID sequencing and HIV drug resistance analysis.
12
12
 
13
+ #### tcs web app - https://primer-id.org/
14
+
13
15
  ## Illustration for the Primer ID Sequencing
14
16
 
15
17
 
@@ -33,6 +35,8 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
33
35
  ### `tcs`
34
36
  Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
35
37
 
38
+ Web-based `tcs` analysis can be accessed at https://primer-id.org/
39
+
36
40
  Example commands:
37
41
  ```bash
38
42
  $ tcs -p params.json # run TCS pipeline with params.json
@@ -69,6 +73,44 @@ Example command:
69
73
  $ tcs_log batch_tcs_jobs
70
74
  ```
71
75
 
76
+ ---
77
+ ### `tcs_sdrm`
78
+
79
+ Use the `tcs_sdrm` pipeline for HIV-1 drug resistance mutation and recency analysis.
80
+
81
+ Example command:
82
+ ```bash
83
+ $ tcs_sdrm libs_dir
84
+ ```
85
+
86
+ libs_dir file structure:
87
+ ```
88
+ libs_dir/
89
+ ├── lib1
90
+ ├── lib1_RT
91
+ ├── lib1_PR
92
+ ├── lib1_IN
93
+ ├── lib1_V1V3
94
+ ├── lib2
95
+ ├── lib2_RT
96
+ ├── lib2_PR
97
+ ├── lib2_IN
98
+ ├── lib2_V1V3
99
+ ├── ...
100
+ ```
101
+
102
+ Output data are written to a new directory named 'libs_dir_SDRM'.
103
+
104
+
105
+ **Note: [R](https://www.r-project.org/) and the following R libraries are required:**
106
+ - phangorn
107
+ - ape
108
+ - scales
109
+ - ggforce
110
+ - cowplot
111
+ - magrittr
112
+ - gridExtra
113
+
72
114
  ---
73
115
 
74
116
  ### `locator`
@@ -109,7 +151,7 @@ qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
109
151
  Further filter out sequences with Apobec3g/f hypermutations
110
152
 
111
153
  ```ruby
112
- qc_seqhash = qc_seqhash.a3g
154
+ qc_seqhash = qc_seqhash.a3g[:filtered_seq]
113
155
  ```
114
156
 
115
157
  Calculate nucleotide diversity π
@@ -137,11 +179,43 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
137
179
 
138
180
  ## Updates
139
181
 
182
+ ### Version 1.2.6-07122021
183
+
184
+ 1. Optimized the workflow of the `tcs` pipeline in the "end-join/QC/Trimming" section.
185
+ `tcs` version updated to v2.3.5.
186
+
187
+
188
+ ### Version 1.2.5-06232021
189
+
190
+ 1. Added error rescue and reporting to the `tcs` pipeline.
191
+ Error messages are stored in the `.tcs_error` file. `tcs` pipeline updated to v2.3.4.
192
+ 2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
193
+
194
+ ### Version 1.2.2-05272021
195
+
196
+ 1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
197
+ `tcs` pipeline upgraded to v2.3.2
198
+
199
+ ### Version 1.2.1-05172021
200
+
201
+ 1. Added a function in R to check and install missing R packages for `tcs_sdrm` pipeline.
202
+
203
+ ### Version 1.2.0-05102021
204
+
205
+ 1. Added `tcs_sdrm` pipeline as an executable.
206
+ `tcs_sdrm` processes `tcs`-processed HIV MPID-NGS data for drug resistance mutations, recency and phylogenetic analysis.
207
+
208
+ 2. Added function ViralSeq::SeqHash#sample.
209
+
210
+ 3. Added recency determining function `ViralSeq::Recency::define`
211
+
212
+ 4. Fixed a few bugs related to `tcs_sdrm`.
213
+
140
214
  ### Version 1.1.2-04262021
141
215
 
142
216
  1. Added function `ViralSeq::DRMs.sdrm_json` to export SDRM as json object.
143
217
  2. Added a random string to the temp file names for `muscle_bio` to avoid issues when running scripts in parallel.
144
- 3. Added `--keep-original` flag to the `tcs` pipeline.
218
+ 3. Added `--keep-original` flag to the `tcs` pipeline.
145
219
 
146
220
  ### Version 1.1.1-04012021
147
221
 
data/bin/tcs CHANGED
@@ -101,395 +101,425 @@ log = File.open(runtime_log_file, "w")
101
101
  log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
102
102
  log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
103
103
  log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
104
+ File.unlink(File.join(indir, ".tcs_error")) if File.exist?(File.join(indir, ".tcs_error"))
105
+
106
+ begin
107
+ libname = File.basename indir
108
+ seq_files = ViralSeq::TcsCore.r1r2 indir
109
+
110
+ if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
111
+ r1_f = seq_files[:r1_file]
112
+ r2_f = seq_files[:r2_file]
113
+ elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
114
+ raise StandardError.new "Missing R2 file."
115
+ elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
116
+ raise StandardError.new "Missing R1 file."
117
+ else
118
+ raise StandardError.new "Cannot determine R1 R2 file in #{indir}."
119
+ end
104
120
 
105
- libname = File.basename indir
121
+ r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
122
+ r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
106
123
 
107
- seq_files = ViralSeq::TcsCore.r1r2 indir
124
+ raw_sequence_number = r1_fastq_sh.size
125
+ log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
108
126
 
109
- if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
110
- r1_f = seq_files[:r1_file]
111
- r2_f = seq_files[:r2_file]
112
- elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
113
- exit_sig = "Missing R2 file. Aborted."
114
- elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
115
- exit_sig = "Missing R1 file. Aborted."
116
- else
117
- exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
118
- end
127
+ if params[:platform_error_rate]
128
+ error_rate = params[:platform_error_rate]
129
+ else
130
+ error_rate = 0.02
131
+ end
119
132
 
120
- if exit_sig
121
- ViralSeq::TcsCore.log_and_abort log, exit_sig
122
- end
133
+ if params[:platform_format]
134
+ $platform_sequencing_length = params[:platform_format]
135
+ else
136
+ $platform_sequencing_length = 300
137
+ end
123
138
 
124
- r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
125
- r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
139
+ primers = params[:primer_pairs]
140
+ if primers.empty? or primers.nil?
141
+ ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
142
+ end
126
143
 
127
- raw_sequence_number = r1_fastq_sh.size
128
- log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
129
144
 
130
- if params[:platform_error_rate]
131
- error_rate = params[:platform_error_rate]
132
- else
133
- error_rate = 0.02
134
- end
145
+ primers.each do |primer|
146
+ summary_json = {}
147
+ summary_json[:warnings] = []
148
+ summary_json[:tcs_version] = ViralSeq::TCS_VERSION
149
+ summary_json[:viralseq_version] = ViralSeq::VERSION
150
+ summary_json[:runtime] = Time.now.to_s
135
151
 
136
- if params[:platform_format]
137
- $platform_sequencing_length = params[:platform_format]
138
- else
139
- $platform_sequencing_length = 300
140
- end
152
+ primer[:region] ? region = primer[:region] : region = "region"
153
+ summary_json[:primer_set_name] = region
141
154
 
142
- primers = params[:primer_pairs]
143
- if primers.empty?
144
- ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
145
- end
155
+ cdna_primer = primer[:cdna]
156
+ forward_primer = primer[:forward]
146
157
 
158
+ export_raw = primer[:export_raw]
159
+ limit_raw = primer[:limit_raw]
147
160
 
148
- primers.each do |primer|
149
- summary_json = {}
150
- summary_json[:warnings] = []
151
- summary_json[:tcs_version] = ViralSeq::TCS_VERSION
152
- summary_json[:viralseq_version] = ViralSeq::VERSION
153
- summary_json[:runtime] = Time.now.to_s
161
+ unless cdna_primer
162
+ log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
163
+ end
164
+ unless forward_primer
165
+ log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
166
+ end
167
+ summary_json[:cdan_primer] = cdna_primer
168
+ summary_json[:forward_primer] = forward_primer
169
+
170
+ primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
171
+ summary_json[:majority_cut_off] = majority_cut_off
172
+
173
+ summary_json[:total_raw_sequence] = raw_sequence_number
174
+
175
+ log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
176
+
177
+ # filter R1
178
+ log.puts Time.now.to_s + "\t" + "filtering R1..."
179
+ filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
180
+ r1_passed_seq = filter_r1[:r1_passed_seq]
181
+ log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
182
+ summary_json[:r1_filtered_raw] = r1_passed_seq.size
183
+
184
+ # filter R2
185
+ log.puts Time.now.to_s + "\t" + "filtering R2..."
186
+ filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
187
+ r2_passed_seq = filter_r2[:r2_passed_seq]
188
+ pid_length = filter_r2[:pid_length]
189
+ log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
190
+ summary_json[:r2_filtered_raw] = r2_passed_seq.size
191
+
192
+ # pair-end
193
+ log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
194
+ id = {} # hash for :sequence_tag => primer_id
195
+ bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
196
+ bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
197
+ common_keys = r1_passed_seq.keys & r2_passed_seq.keys
198
+ paired_seq_number = common_keys.size
199
+ log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
200
+ summary_json[:paired_raw_sequence] = paired_seq_number
201
+ if paired_seq_number < raw_sequence_number * 0.001
202
+ summary_json[:warnings] <<
203
+ "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
204
+ end
154
205
 
155
- primer[:region] ? region = primer[:region] : region = "region"
156
- summary_json[:primer_set_name] = region
206
+ common_keys.each do |seqtag|
207
+ r1_seq = r1_passed_seq[seqtag]
208
+ r2_seq = r2_passed_seq[seqtag]
209
+ pid = r2_seq[0, pid_length]
210
+ id[seqtag] = pid
211
+ bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
212
+ bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
213
+ end
157
214
 
158
- cdna_primer = primer[:cdna]
159
- forward_primer = primer[:forward]
215
+ # TCS cut-off
216
+ log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
160
217
 
161
- export_raw = primer[:export_raw]
162
- limit_raw = primer[:limit_raw]
218
+ primer_id_list = id.values
219
+ primer_id_count = primer_id_list.count_freq
220
+ primer_id_dis = primer_id_count.values.count_freq
163
221
 
164
- unless cdna_primer
165
- log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
166
- end
167
- unless forward_primer
168
- log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
169
- end
170
- summary_json[:cdan_primer] = cdna_primer
171
- summary_json[:forward_primer] = forward_primer
172
-
173
- primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
174
- summary_json[:majority_cut_off] = majority_cut_off
175
-
176
- summary_json[:total_raw_sequence] = raw_sequence_number
177
-
178
- log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
179
-
180
- # filter R1
181
- log.puts Time.now.to_s + "\t" + "filtering R1..."
182
- filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
183
- r1_passed_seq = filter_r1[:r1_passed_seq]
184
- log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
185
- summary_json[:r1_filtered_raw] = r1_passed_seq.size
186
-
187
- # filter R2
188
- log.puts Time.now.to_s + "\t" + "filtering R2..."
189
- filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
190
- r2_passed_seq = filter_r2[:r2_passed_seq]
191
- pid_length = filter_r2[:pid_length]
192
- log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
193
- summary_json[:r2_filtered_raw] = r2_passed_seq.size
194
-
195
- # pair-end
196
- log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
197
- id = {} # hash for :sequence_tag => primer_id
198
- bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
199
- bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
200
- common_keys = r1_passed_seq.keys & r2_passed_seq.keys
201
- paired_seq_number = common_keys.size
202
- log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
203
- summary_json[:paired_raw_sequence] = paired_seq_number
204
- if paired_seq_number < raw_sequence_number * 0.001
205
- summary_json[:warnings] <<
206
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
207
- end
222
+ # calculate distinct_to_raw
223
+ distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
224
+ summary_json[:distinct_to_raw] = distinct_to_raw
208
225
 
209
- common_keys.each do |seqtag|
210
- r1_seq = r1_passed_seq[seqtag]
211
- r2_seq = r2_passed_seq[seqtag]
212
- pid = r2_seq[0, pid_length]
213
- id[seqtag] = pid
214
- bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
215
- bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
216
- end
226
+ if primer_id_dis.keys.size < 5
227
+ log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
228
+ next
229
+ end
217
230
 
218
- # TCS cut-off
219
- log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
231
+ max_id = primer_id_dis.keys.sort[-5..-1].mean
232
+ consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
233
+ log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
234
+ summary_json[:consensus_cutoff] = consensus_cutoff
235
+ summary_json[:length_of_pid] = pid_length
236
+ log.puts Time.now.to_s + "\t" + "Creating consensus..."
237
+
238
+ # Primer ID over the cut-off
239
+ primer_id_count_over_n = []
240
+ primer_id_count.each do |primer_id,count|
241
+ primer_id_count_over_n << primer_id if count > consensus_cutoff
242
+ end
243
+ pid_to_process = primer_id_count_over_n.size
244
+ log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
245
+ summary_json[:total_tcs_with_ambiguities] = pid_to_process
220
246
 
221
- primer_id_list = id.values
222
- primer_id_count = primer_id_list.count_freq
223
- primer_id_dis = primer_id_count.values.count_freq
247
+ # setup output path
248
+ out_dir_set = File.join(indir, region)
249
+ Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
250
+ out_dir_consensus = File.join(out_dir_set, "consensus")
251
+ Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
224
252
 
225
- # calculate distinct_to_raw
226
- distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
227
- summary_json[:distinct_to_raw] = distinct_to_raw
253
+ outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
254
+ outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
255
+ outfile_log = File.join(out_dir_set, 'log.json')
228
256
 
229
- if primer_id_dis.keys.size < 5
230
- log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
231
- next
232
- end
257
+ # if export_raw is true, create dir for raw sequence
258
+ if export_raw
259
+ out_dir_raw = File.join(out_dir_set, "raw")
260
+ Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
261
+ outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
262
+ outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
263
+ raw_r1_f = File.open(outfile_raw_r1, 'w')
264
+ raw_r2_f = File.open(outfile_raw_r2, 'w')
265
+
266
+ if limit_raw
267
+ raw_keys = bio_r1.keys.sample(limit_raw.to_i)
268
+ else
269
+ raw_keys = bio_r1.keys
270
+ end
233
271
 
234
- max_id = primer_id_dis.keys.sort[-5..-1].mean
235
- consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
236
- log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
237
- summary_json[:consensus_cutoff] = consensus_cutoff
238
- summary_json[:length_of_pid] = pid_length
239
- log.puts Time.now.to_s + "\t" + "Creating consensus..."
240
-
241
- # Primer ID over the cut-off
242
- primer_id_count_over_n = []
243
- primer_id_count.each do |primer_id,count|
244
- primer_id_count_over_n << primer_id if count > consensus_cutoff
245
- end
246
- pid_to_process = primer_id_count_over_n.size
247
- log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
248
- summary_json[:total_tcs_with_ambiguities] = pid_to_process
249
-
250
- # setup output path
251
- out_dir_set = File.join(indir, region)
252
- Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
253
- out_dir_consensus = File.join(out_dir_set, "consensus")
254
- Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
255
-
256
- outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
257
- outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
258
- outfile_log = File.join(out_dir_set, 'log.json')
259
-
260
- # if export_raw is true, create dir for raw sequence
261
- if export_raw
262
- out_dir_raw = File.join(out_dir_set, "raw")
263
- Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
264
- outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
265
- outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
266
- raw_r1_f = File.open(outfile_raw_r1, 'w')
267
- raw_r2_f = File.open(outfile_raw_r2, 'w')
268
-
269
- if limit_raw
270
- raw_keys = bio_r1.keys.sample(limit_raw.to_i)
271
- else
272
- raw_keys = bio_r1.keys
273
- end
272
+ raw_keys.each do |k|
273
+ raw_r1_f.puts k + "_r1"
274
+ raw_r2_f.puts k + "_r2"
275
+ raw_r1_f.puts bio_r1[k]
276
+ raw_r2_f.puts bio_r2[k].rc
277
+ end
274
278
 
275
- raw_keys.each do |k|
276
- raw_r1_f.puts k + "_r1"
277
- raw_r2_f.puts k + "_r2"
278
- raw_r1_f.puts bio_r1[k]
279
- raw_r2_f.puts bio_r2[k].rc
279
+ raw_r1_f.close
280
+ raw_r2_f.close
280
281
  end
281
282
 
282
- raw_r1_f.close
283
- raw_r2_f.close
284
- end
283
+ # create TCS
285
284
 
286
- # create TCS
285
+ pid_seqtag_hash = {}
286
+ id.each do |name, pid|
287
+ if pid_seqtag_hash[pid]
288
+ pid_seqtag_hash[pid] << name
289
+ else
290
+ pid_seqtag_hash[pid] = []
291
+ pid_seqtag_hash[pid] << name
292
+ end
293
+ end
287
294
 
288
- pid_seqtag_hash = {}
289
- id.each do |name, pid|
290
- if pid_seqtag_hash[pid]
291
- pid_seqtag_hash[pid] << name
295
+ consensus = {}
296
+ r1_temp = {}
297
+ r2_temp = {}
298
+ m = 0
299
+ primer_id_count_over_n.each do |primer_id|
300
+ m += 1
301
+ log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
302
+ seq_with_same_primer_id = pid_seqtag_hash[primer_id]
303
+ r1_sub_seq = []
304
+ r2_sub_seq = []
305
+ seq_with_same_primer_id.each do |seq_name|
306
+ r1_sub_seq << bio_r1[seq_name]
307
+ r2_sub_seq << bio_r2[seq_name]
308
+ end
309
+ #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
310
+ consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
311
+ r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
312
+ r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
313
+
314
+ # hide the following two lines if allowing sequence to have ambiguities.
315
+ next if r1_consensus =~ /[^ATCG]/
316
+ next if r2_consensus =~ /[^ATCG]/
317
+
318
+ # reverse complement sequence of the R2 region
319
+ r2_consensus = r2_consensus.rc
320
+ consensus[consensus_name] = [r1_consensus, r2_consensus]
321
+ r1_temp[consensus_name] = r1_consensus
322
+ r2_temp[consensus_name] = r2_consensus
323
+ end
324
+ r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
325
+ r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
326
+
327
+ # filter consensus sequences for residual offspring PIDs
328
+ consensus_filtered = {}
329
+ consensus_number_temp = consensus.size
330
+ max_pid_comb = 4**pid_length
331
+ if consensus_number_temp < 0.003*max_pid_comb
332
+ log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
333
+ r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
334
+ r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
335
+ common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
336
+ common_pid.each do |pid|
337
+ consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
338
+ end
292
339
  else
293
- pid_seqtag_hash[pid] = []
294
- pid_seqtag_hash[pid] << name
340
+ consensus_filtered = consensus
295
341
  end
296
- end
297
-
298
- consensus = {}
299
- r1_temp = {}
300
- r2_temp = {}
301
- m = 0
302
- primer_id_count_over_n.each do |primer_id|
303
- m += 1
304
- log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
305
- seq_with_same_primer_id = pid_seqtag_hash[primer_id]
306
- r1_sub_seq = []
307
- r2_sub_seq = []
308
- seq_with_same_primer_id.each do |seq_name|
309
- r1_sub_seq << bio_r1[seq_name]
310
- r2_sub_seq << bio_r2[seq_name]
342
+ n_con = consensus_filtered.size
343
+ log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
344
+ summary_json[:total_tcs] = n_con
345
+ summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
346
+
347
+ log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
348
+ # r1_file output
349
+ f1 = File.open(outfile_r1, 'w')
350
+ f2 = File.open(outfile_r2, 'w')
351
+ primer_id_in_use = {}
352
+ if n_con > 0
353
+ r1_seq_length = consensus_filtered.values[0][0].size
354
+ r2_seq_length = consensus_filtered.values[0][1].size
355
+ else
356
+ next
311
357
  end
312
- #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
313
- consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
314
- r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
315
- r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
316
-
317
- # hide the following two lines if allowing sequence to have ambiguities.
318
- next if r1_consensus =~ /[^ATCG]/
319
- next if r2_consensus =~ /[^ATCG]/
320
-
321
- # reverse complement sequence of the R2 region
322
- r2_consensus = r2_consensus.rc
323
- consensus[consensus_name] = [r1_consensus, r2_consensus]
324
- r1_temp[consensus_name] = r1_consensus
325
- r2_temp[consensus_name] = r2_consensus
326
- end
327
- r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
328
- r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
329
-
330
- # filter consensus sequences for residual offspring PIDs
331
- consensus_filtered = {}
332
- consensus_number_temp = consensus.size
333
- max_pid_comb = 4**pid_length
334
- if consensus_number_temp < 0.003*max_pid_comb
335
- log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
336
- r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
337
- r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
338
- common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
339
- common_pid.each do |pid|
340
- consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
358
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
359
+ log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
360
+ consensus_filtered.each do |seq_name,seq|
361
+ f1.print seq_name + "_r1\n" + seq[0] + "\n"
362
+ f2.print seq_name + "_r2\n" + seq[1] + "\n"
363
+ primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
341
364
  end
342
- else
343
- consensus_filtered = consensus
344
- end
345
- n_con = consensus_filtered.size
346
- log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
347
- summary_json[:total_tcs] = n_con
348
- summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
349
-
350
- log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
351
- # r1_file output
352
- f1 = File.open(outfile_r1, 'w')
353
- f2 = File.open(outfile_r2, 'w')
354
- primer_id_in_use = {}
355
- if n_con > 0
356
- r1_seq_length = consensus_filtered.values[0][0].size
357
- r2_seq_length = consensus_filtered.values[0][1].size
358
- else
359
- next
360
- end
361
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
362
- log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
363
- consensus_filtered.each do |seq_name,seq|
364
- f1.print seq_name + "_r1\n" + seq[0] + "\n"
365
- f2.print seq_name + "_r2\n" + seq[1] + "\n"
366
- primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
367
- end
368
- f1.close
369
- f2.close
370
-
371
- # Primer ID distribution in .json file
372
- out_pid_json = File.join(out_dir_set, 'primer_id.json')
373
- pid_json = {}
374
- pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
375
- pid_json[:primer_id_distribution] = Hash[*(primer_id_dis.sort_by{|k,v| k}.flatten)]
376
- pid_json[:primer_id_frequency] = Hash[*(primer_id_count.sort_by {|k, v| [-v,k]}.flatten)]
377
- File.open(out_pid_json, 'w') do |f|
378
- f.puts JSON.pretty_generate(pid_json)
379
- end
380
-
381
- # start end-join
382
- def end_join(dir, option, overlap)
383
- shp = ViralSeq::SeqHashPair.fa(dir)
384
- case option
385
- when 1
386
- joined_sh = shp.join1()
387
- when 2
388
- joined_sh = shp.join1(overlap)
389
- when 3
390
- joined_sh = shp.join2
391
- when 4
392
- joined_sh = shp.join2(model: :indiv)
365
+ f1.close
366
+ f2.close
367
+
368
+ # Primer ID distribution in .json file
369
+ out_pid_json = File.join(out_dir_set, 'primer_id.json')
370
+ pid_json = {}
371
+ pid_json[:primer_id_in_use] = {}
372
+ primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
373
+ pid_json[:primer_id_in_use][k] = v
393
374
  end
394
- return joined_sh
395
- end
396
-
397
- if primer[:end_join]
398
- log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
399
- shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
400
- joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
401
- log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
402
375
 
403
- summary_json[:combined_tcs] = joined_sh.size
404
-
405
- if export_raw
406
- joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
376
+ pid_json[:primer_id_distribution] = {}
377
+ primer_id_dis.sort_by{|k,v| k}.each do |k,v|
378
+ pid_json[:primer_id_distribution][k] = v
407
379
  end
408
380
 
409
- else
410
- File.open(outfile_log, "w") do |f|
411
- f.puts JSON.pretty_generate(summary_json)
381
+ pid_json[:primer_id_frequency] = {}
382
+ primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
383
+ pid_json[:primer_id_frequency][k] = v
412
384
  end
413
- next
414
- end
415
385
 
416
- if primer[:TCS_QC]
417
- ref_start = primer[:ref_start]
418
- ref_end = primer[:ref_end]
419
- ref_genome = primer[:ref_genome].to_sym
420
- indel = primer[:indel]
421
- if ref_start == 0
422
- ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
423
- end
424
- if ref_end == 0
425
- ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
386
+ File.open(out_pid_json, 'w') do |f|
387
+ f.puts JSON.pretty_generate(pid_json)
426
388
  end
427
- if primer[:end_join_option] == 1 and primer[:overlap] == 0
428
- r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
429
- r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
430
- r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
431
- r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
432
- new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
433
- new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
434
- joined_seq = {}
435
- new_r1_seq.each do |seq_name, seq|
436
- next unless seq
437
- next unless new_r2_seq[seq_name]
438
- joined_seq[seq_name] = seq + new_r2_seq[seq_name]
389
+
390
+ # start end-join
391
+ def end_join(dir, option, overlap)
392
+ shp = ViralSeq::SeqHashPair.fa(dir)
393
+ case option
394
+ when 1
395
+ joined_sh = shp.join1()
396
+ when 2
397
+ joined_sh = shp.join1(overlap)
398
+ when 3
399
+ joined_sh = shp.join2
400
+ when 4
401
+ joined_sh = shp.join2(model: :indiv)
439
402
  end
440
- joined_sh = ViralSeq::SeqHash.new(joined_seq)
403
+ return joined_sh
404
+ end
405
+
406
+ if primer[:end_join]
407
+ log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
408
+ shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
409
+ joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
410
+ log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
411
+
412
+ summary_json[:combined_tcs] = joined_sh.size
441
413
 
442
414
  if export_raw
443
- r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
444
- r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
445
- r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
446
- r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
447
- new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
448
- new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
449
- joined_seq_raw = {}
450
- new_r1_seq_raw.each do |seq_name, seq|
451
- next unless seq
452
- next unless new_r2_seq_raw[seq_name]
453
- joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
454
- end
455
- joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
415
+ joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
456
416
  end
457
- else
458
- joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
459
417
 
460
- if export_raw
461
- joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
418
+ if primer[:TCS_QC]
419
+ ref_start = primer[:ref_start]
420
+ ref_end = primer[:ref_end]
421
+ ref_genome = primer[:ref_genome].to_sym
422
+ indel = primer[:indel]
423
+ if ref_start == 0
424
+ ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
425
+ end
426
+ if ref_end == 0
427
+ ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
428
+ end
429
+ if primer[:end_join_option] == 1 and primer[:overlap] == 0
430
+ r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
431
+ r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
432
+ r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
433
+ r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
434
+ new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
435
+ new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
436
+ joined_seq = {}
437
+ new_r1_seq.each do |seq_name, seq|
438
+ next unless seq
439
+ next unless new_r2_seq[seq_name]
440
+ joined_seq[seq_name] = seq + new_r2_seq[seq_name]
441
+ end
442
+ joined_sh = ViralSeq::SeqHash.new(joined_seq)
443
+
444
+ if export_raw
445
+ r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
446
+ r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
447
+ r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
448
+ r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
449
+ new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
450
+ new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
451
+ joined_seq_raw = {}
452
+ new_r1_seq_raw.each do |seq_name, seq|
453
+ next unless seq
454
+ next unless new_r2_seq_raw[seq_name]
455
+ joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
456
+ end
457
+ joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
458
+ end
459
+ else
460
+ joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
461
+
462
+ if export_raw
463
+ joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
464
+ end
465
+ end
466
+
467
+ log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
468
+ summary_json[:combined_tcs_after_qc] = joined_sh.size
469
+ if primer[:trim]
470
+ trim_start = primer[:trim_ref_start]
471
+ trim_end = primer[:trim_ref_end]
472
+ trim_ref = primer[:trim_ref].to_sym
473
+ joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
474
+ if export_raw
475
+ joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
476
+ end
477
+ end
462
478
  end
463
- end
464
479
 
465
- log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
466
- summary_json[:combined_tcs_after_qc] = joined_sh.size
467
- if primer[:trim]
468
- trim_start = primer[:trim_ref_start]
469
- trim_end = primer[:trim_ref_end]
470
- trim_ref = primer[:trim_ref].to_sym
471
- joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
480
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
472
481
  if export_raw
473
- joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
482
+ joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
474
483
  end
484
+
475
485
  end
476
486
 
477
- joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
478
- if export_raw
479
- joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
487
+ File.open(outfile_log, "w") do |f|
488
+ f.puts JSON.pretty_generate(summary_json)
480
489
  end
481
- end
482
490
 
483
- File.open(outfile_log, "w") do |f|
484
- f.puts JSON.pretty_generate(summary_json)
485
491
  end
486
- end
487
492
 
488
- unless options[:keep]
489
- log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
490
- File.unlink(r1_f)
491
- File.unlink(r2_f)
493
+ unless options[:keep]
494
+ log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
495
+ File.unlink(r1_f)
496
+ File.unlink(r2_f)
497
+ end
498
+ log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
499
+ log.close
500
+ puts "DONE!"
501
+ rescue => e
502
+ puts "`tcs` pipeline run with errors: " + e.message.red
503
+ puts "`tcs` pipeline aborted.".red.bold
504
+ log.puts Time.now.to_s + "\t" + e.full_message
505
+ log.puts Time.now.to_s + "\tAborted."
506
+ log.close
507
+ error_hash = {}
508
+ error_hash[:directory] = indir
509
+ error_hash[:tcs_version] = ViralSeq::TCS_VERSION
510
+ error_hash[:viralSeq_version] = ViralSeq::VERSION
511
+ error_hash[:time] = Time.now
512
+ error_hash[:error] = e.full_message
513
+ File.open(File.join(indir, ".tcs_error"), 'w') do |f|
514
+ f.puts JSON.pretty_generate([error_hash])
515
+ end
516
+ master_error_file = File.join(File.dirname(indir), ".tcs_error")
517
+ master_errors = []
518
+ if File.exist? master_error_file
519
+ master_errors << JSON.parse(File.read(master_error_file), symbolize_names: true)
520
+ end
521
+ master_errors << error_hash
522
+ File.open(master_error_file, 'w') do |f|
523
+ f.puts JSON.pretty_generate(master_errors)
524
+ end
492
525
  end
493
- log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly exercuted."
494
- log.close
495
- puts "DONE!"