viral_seq 1.2.2 → 1.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -1
- data/bin/tcs +370 -351
- data/lib/viral_seq/seq_hash.rb +5 -1
- data/lib/viral_seq/tcs_core.rb +2 -1
- data/lib/viral_seq/tcs_dr.rb +4 -4
- data/lib/viral_seq/version.rb +2 -2
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bb2f3be04857f96f8cf15b02e67a585771c04d45d4e4be68e566b4226342b5f0
|
|
4
|
+
data.tar.gz: 2c36ff5494bcf415796a5a8b10da22721a6ea6f574a97601d0a9ad487236e70e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: abc5622ae5dc8d5e1343f8e85557fc03e169146a34f6d980bf0d5db4dc973d41d81cc89bb0ce9cc0dd5a090d1c58357834431d9d1ad106f31c3a8f0a739ce4b9
|
|
7
|
+
data.tar.gz: a890a557536ff43073258220801bdc8b6294b3c22bc8ec030f245bb9acccfeaebf8f0d3b228233de4e980558e88c953143d57f1ff3bc2a5b366026d7371df67d
|
data/README.md
CHANGED
|
@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
|
10
10
|
|
|
11
11
|
Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
12
12
|
|
|
13
|
+
#### tcs web app - https://primer-id.org/
|
|
14
|
+
|
|
13
15
|
## Illustration for the Primer ID Sequencing
|
|
14
16
|
|
|
15
17
|
|
|
@@ -31,7 +33,9 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
|
31
33
|
### Executables
|
|
32
34
|
|
|
33
35
|
### `tcs`
|
|
34
|
-
Use executable `tcs` pipeline
|
|
36
|
+
Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
|
|
37
|
+
|
|
38
|
+
Web-based `tcs` analysis can be accessed at https://primer-id.org/
|
|
35
39
|
|
|
36
40
|
Example commands:
|
|
37
41
|
```bash
|
|
@@ -175,6 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
|
175
179
|
|
|
176
180
|
## Updates
|
|
177
181
|
|
|
182
|
+
### Version 1.2.8-07292021
|
|
183
|
+
|
|
184
|
+
1. Fixed an issue when reading .fastq files containing blank lines.
|
|
185
|
+
|
|
186
|
+
### Version 1.2.7-07152021
|
|
187
|
+
|
|
188
|
+
1. Optimized the workflow of the `tcs` pipeline on raw data with uneven lengths.
|
|
189
|
+
`tcs` pipeline updated to v2.3.6.
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
### Version 1.2.6-07122021
|
|
193
|
+
|
|
194
|
+
1. Optimized the workflow of the `tcs` pipeline in the "end-join/QC/Trimming" section.
|
|
195
|
+
`tcs` pipeline updated to v2.3.5.
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
### Version 1.2.5-06232021
|
|
199
|
+
|
|
200
|
+
1. Add error rescue and report in the `tcs` pipeline.
|
|
201
|
+
Error messages are stored in the `.tcs_error` file. `tcs` pipeline updated to v2.3.4.
|
|
202
|
+
2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
|
|
203
|
+
|
|
178
204
|
### Version 1.2.2-05272021
|
|
179
205
|
|
|
180
206
|
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
data/bin/tcs
CHANGED
|
@@ -101,407 +101,426 @@ log = File.open(runtime_log_file, "w")
|
|
|
101
101
|
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
|
102
102
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
|
103
103
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
|
104
|
+
File.unlink(File.join(indir, ".tcs_error")) if File.exist?(File.join(indir, ".tcs_error"))
|
|
105
|
+
|
|
106
|
+
begin
|
|
107
|
+
libname = File.basename indir
|
|
108
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
|
109
|
+
|
|
110
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
|
111
|
+
r1_f = seq_files[:r1_file]
|
|
112
|
+
r2_f = seq_files[:r2_file]
|
|
113
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
|
114
|
+
raise StandardError.new "Missing R2 file."
|
|
115
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
|
116
|
+
raise StandardError.new "Missing R1 file."
|
|
117
|
+
else
|
|
118
|
+
raise StandardError.new "Cannot determine R1 R2 file in #{indir}."
|
|
119
|
+
end
|
|
104
120
|
|
|
105
|
-
|
|
121
|
+
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
|
122
|
+
r2_fastq_sh = ViralSeq::SeqHash.fq(r2_f)
|
|
106
123
|
|
|
107
|
-
|
|
124
|
+
raw_sequence_number = r1_fastq_sh.size
|
|
125
|
+
log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
|
|
108
126
|
|
|
109
|
-
if
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
|
115
|
-
exit_sig = "Missing R1 file. Aborted."
|
|
116
|
-
else
|
|
117
|
-
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
|
118
|
-
end
|
|
127
|
+
if params[:platform_error_rate]
|
|
128
|
+
error_rate = params[:platform_error_rate]
|
|
129
|
+
else
|
|
130
|
+
error_rate = 0.02
|
|
131
|
+
end
|
|
119
132
|
|
|
120
|
-
if
|
|
121
|
-
|
|
122
|
-
|
|
133
|
+
if params[:platform_format]
|
|
134
|
+
$platform_sequencing_length = params[:platform_format]
|
|
135
|
+
else
|
|
136
|
+
$platform_sequencing_length = 300
|
|
137
|
+
end
|
|
123
138
|
|
|
124
|
-
|
|
125
|
-
|
|
139
|
+
primers = params[:primer_pairs]
|
|
140
|
+
if primers.empty? or primers.nil?
|
|
141
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
|
142
|
+
end
|
|
126
143
|
|
|
127
|
-
raw_sequence_number = r1_fastq_sh.size
|
|
128
|
-
log.puts Time.now.to_s + "\tRaw sequence number: #{raw_sequence_number.to_s}"
|
|
129
144
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
145
|
+
primers.each do |primer|
|
|
146
|
+
summary_json = {}
|
|
147
|
+
summary_json[:warnings] = []
|
|
148
|
+
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
|
149
|
+
summary_json[:viralseq_version] = ViralSeq::VERSION
|
|
150
|
+
summary_json[:runtime] = Time.now.to_s
|
|
135
151
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
else
|
|
139
|
-
$platform_sequencing_length = 300
|
|
140
|
-
end
|
|
152
|
+
primer[:region] ? region = primer[:region] : region = "region"
|
|
153
|
+
summary_json[:primer_set_name] = region
|
|
141
154
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
|
145
|
-
end
|
|
155
|
+
cdna_primer = primer[:cdna]
|
|
156
|
+
forward_primer = primer[:forward]
|
|
146
157
|
|
|
158
|
+
export_raw = primer[:export_raw]
|
|
159
|
+
limit_raw = primer[:limit_raw]
|
|
147
160
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
161
|
+
unless cdna_primer
|
|
162
|
+
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
|
163
|
+
end
|
|
164
|
+
unless forward_primer
|
|
165
|
+
log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
|
|
166
|
+
end
|
|
167
|
+
summary_json[:cdan_primer] = cdna_primer
|
|
168
|
+
summary_json[:forward_primer] = forward_primer
|
|
169
|
+
|
|
170
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
|
171
|
+
summary_json[:majority_cut_off] = majority_cut_off
|
|
172
|
+
|
|
173
|
+
summary_json[:total_raw_sequence] = raw_sequence_number
|
|
174
|
+
|
|
175
|
+
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
|
176
|
+
|
|
177
|
+
# filter R1
|
|
178
|
+
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
|
179
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
|
180
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
|
181
|
+
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
|
182
|
+
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
|
183
|
+
|
|
184
|
+
# filter R2
|
|
185
|
+
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
|
186
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
|
187
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
|
188
|
+
pid_length = filter_r2[:pid_length]
|
|
189
|
+
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
|
190
|
+
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
|
191
|
+
|
|
192
|
+
# pair-end
|
|
193
|
+
log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
|
|
194
|
+
id = {} # hash for :sequence_tag => primer_id
|
|
195
|
+
bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
|
|
196
|
+
bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
|
|
197
|
+
common_keys = r1_passed_seq.keys & r2_passed_seq.keys
|
|
198
|
+
paired_seq_number = common_keys.size
|
|
199
|
+
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
|
200
|
+
summary_json[:paired_raw_sequence] = paired_seq_number
|
|
201
|
+
if paired_seq_number < raw_sequence_number * 0.001
|
|
202
|
+
summary_json[:warnings] <<
|
|
203
|
+
"WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
|
|
204
|
+
end
|
|
154
205
|
|
|
155
|
-
|
|
156
|
-
|
|
206
|
+
common_keys.each do |seqtag|
|
|
207
|
+
r1_seq = r1_passed_seq[seqtag]
|
|
208
|
+
r2_seq = r2_passed_seq[seqtag]
|
|
209
|
+
pid = r2_seq[0, pid_length]
|
|
210
|
+
id[seqtag] = pid
|
|
211
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
|
212
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
|
213
|
+
end
|
|
157
214
|
|
|
158
|
-
|
|
159
|
-
|
|
215
|
+
# TCS cut-off
|
|
216
|
+
log.puts Time.now.to_s + "\t" + "Calculate consensus cutoff...."
|
|
160
217
|
|
|
161
|
-
|
|
162
|
-
|
|
218
|
+
primer_id_list = id.values
|
|
219
|
+
primer_id_count = primer_id_list.count_freq
|
|
220
|
+
primer_id_dis = primer_id_count.values.count_freq
|
|
163
221
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
unless forward_primer
|
|
168
|
-
log.puts Time.now.to_s + "\t" + region + " does not have forward primer sequence. #{region} skipped."
|
|
169
|
-
end
|
|
170
|
-
summary_json[:cdan_primer] = cdna_primer
|
|
171
|
-
summary_json[:forward_primer] = forward_primer
|
|
172
|
-
|
|
173
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
|
174
|
-
summary_json[:majority_cut_off] = majority_cut_off
|
|
175
|
-
|
|
176
|
-
summary_json[:total_raw_sequence] = raw_sequence_number
|
|
177
|
-
|
|
178
|
-
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
|
179
|
-
|
|
180
|
-
# filter R1
|
|
181
|
-
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
|
182
|
-
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
|
183
|
-
r1_passed_seq = filter_r1[:r1_passed_seq]
|
|
184
|
-
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
|
185
|
-
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
|
186
|
-
|
|
187
|
-
# filter R2
|
|
188
|
-
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
|
189
|
-
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
|
190
|
-
r2_passed_seq = filter_r2[:r2_passed_seq]
|
|
191
|
-
pid_length = filter_r2[:pid_length]
|
|
192
|
-
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
|
193
|
-
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
|
194
|
-
|
|
195
|
-
# pair-end
|
|
196
|
-
log.puts Time.now.to_s + "\t" + "Pairing R1 and R2 seqs..."
|
|
197
|
-
id = {} # hash for :sequence_tag => primer_id
|
|
198
|
-
bio_r2 = {} # hash for :sequence_tag => primer_trimmed_r2_sequence
|
|
199
|
-
bio_r1 = {} # hash for :sequence_tag => primer_trimmed_r1_sequence
|
|
200
|
-
common_keys = r1_passed_seq.keys & r2_passed_seq.keys
|
|
201
|
-
paired_seq_number = common_keys.size
|
|
202
|
-
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
|
203
|
-
summary_json[:paired_raw_sequence] = paired_seq_number
|
|
204
|
-
if paired_seq_number < raw_sequence_number * 0.001
|
|
205
|
-
summary_json[:warnings] <<
|
|
206
|
-
"WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
|
|
207
|
-
end
|
|
222
|
+
# calculate distinct_to_raw
|
|
223
|
+
distinct_to_raw = (primer_id_count.size/primer_id_list.size.to_f).round(3)
|
|
224
|
+
summary_json[:distinct_to_raw] = distinct_to_raw
|
|
208
225
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
id[seqtag] = pid
|
|
214
|
-
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
|
215
|
-
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
|
216
|
-
end
|
|
226
|
+
if primer_id_dis.keys.size < 5
|
|
227
|
+
log.puts Time.now.to_s + "\t" + "Less than 5 Primer IDs detected. Region #{region} aborted."
|
|
228
|
+
next
|
|
229
|
+
end
|
|
217
230
|
|
|
218
|
-
|
|
219
|
-
|
|
231
|
+
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
|
232
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
|
233
|
+
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
|
234
|
+
summary_json[:consensus_cutoff] = consensus_cutoff
|
|
235
|
+
summary_json[:length_of_pid] = pid_length
|
|
236
|
+
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
|
237
|
+
|
|
238
|
+
# Primer ID over the cut-off
|
|
239
|
+
primer_id_count_over_n = []
|
|
240
|
+
primer_id_count.each do |primer_id,count|
|
|
241
|
+
primer_id_count_over_n << primer_id if count > consensus_cutoff
|
|
242
|
+
end
|
|
243
|
+
pid_to_process = primer_id_count_over_n.size
|
|
244
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
|
|
245
|
+
summary_json[:total_tcs_with_ambiguities] = pid_to_process
|
|
220
246
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
247
|
+
# setup output path
|
|
248
|
+
out_dir_set = File.join(indir, region)
|
|
249
|
+
Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
|
|
250
|
+
out_dir_consensus = File.join(out_dir_set, "consensus")
|
|
251
|
+
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
|
224
252
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
253
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
|
254
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
|
255
|
+
outfile_log = File.join(out_dir_set, 'log.json')
|
|
228
256
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
257
|
+
# if export_raw is true, create dir for raw sequence
|
|
258
|
+
if export_raw
|
|
259
|
+
out_dir_raw = File.join(out_dir_set, "raw")
|
|
260
|
+
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
|
261
|
+
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
|
262
|
+
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
|
263
|
+
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
|
264
|
+
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
|
265
|
+
|
|
266
|
+
if limit_raw
|
|
267
|
+
raw_keys = bio_r1.keys.sample(limit_raw.to_i)
|
|
268
|
+
else
|
|
269
|
+
raw_keys = bio_r1.keys
|
|
270
|
+
end
|
|
233
271
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
# Primer ID over the cut-off
|
|
242
|
-
primer_id_count_over_n = []
|
|
243
|
-
primer_id_count.each do |primer_id,count|
|
|
244
|
-
primer_id_count_over_n << primer_id if count > consensus_cutoff
|
|
245
|
-
end
|
|
246
|
-
pid_to_process = primer_id_count_over_n.size
|
|
247
|
-
log.puts Time.now.to_s + "\t" + "Number of consensus to process: #{pid_to_process.to_s}"
|
|
248
|
-
summary_json[:total_tcs_with_ambiguities] = pid_to_process
|
|
249
|
-
|
|
250
|
-
# setup output path
|
|
251
|
-
out_dir_set = File.join(indir, region)
|
|
252
|
-
Dir.mkdir(out_dir_set) unless File.directory?(out_dir_set)
|
|
253
|
-
out_dir_consensus = File.join(out_dir_set, "consensus")
|
|
254
|
-
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
|
255
|
-
|
|
256
|
-
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
|
257
|
-
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
|
258
|
-
outfile_log = File.join(out_dir_set, 'log.json')
|
|
259
|
-
|
|
260
|
-
# if export_raw is true, create dir for raw sequence
|
|
261
|
-
if export_raw
|
|
262
|
-
out_dir_raw = File.join(out_dir_set, "raw")
|
|
263
|
-
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
|
264
|
-
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
|
265
|
-
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
|
266
|
-
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
|
267
|
-
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
|
268
|
-
|
|
269
|
-
if limit_raw
|
|
270
|
-
raw_keys = bio_r1.keys.sample(limit_raw.to_i)
|
|
271
|
-
else
|
|
272
|
-
raw_keys = bio_r1.keys
|
|
273
|
-
end
|
|
272
|
+
raw_keys.each do |k|
|
|
273
|
+
raw_r1_f.puts k + "_r1"
|
|
274
|
+
raw_r2_f.puts k + "_r2"
|
|
275
|
+
raw_r1_f.puts bio_r1[k]
|
|
276
|
+
raw_r2_f.puts bio_r2[k].rc
|
|
277
|
+
end
|
|
274
278
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
raw_r2_f.puts k + "_r2"
|
|
278
|
-
raw_r1_f.puts bio_r1[k]
|
|
279
|
-
raw_r2_f.puts bio_r2[k].rc
|
|
279
|
+
raw_r1_f.close
|
|
280
|
+
raw_r2_f.close
|
|
280
281
|
end
|
|
281
282
|
|
|
282
|
-
|
|
283
|
-
raw_r2_f.close
|
|
284
|
-
end
|
|
283
|
+
# create TCS
|
|
285
284
|
|
|
286
|
-
|
|
285
|
+
pid_seqtag_hash = {}
|
|
286
|
+
id.each do |name, pid|
|
|
287
|
+
if pid_seqtag_hash[pid]
|
|
288
|
+
pid_seqtag_hash[pid] << name
|
|
289
|
+
else
|
|
290
|
+
pid_seqtag_hash[pid] = []
|
|
291
|
+
pid_seqtag_hash[pid] << name
|
|
292
|
+
end
|
|
293
|
+
end
|
|
287
294
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
295
|
+
consensus = {}
|
|
296
|
+
r1_temp = {}
|
|
297
|
+
r2_temp = {}
|
|
298
|
+
m = 0
|
|
299
|
+
primer_id_count_over_n.each do |primer_id|
|
|
300
|
+
m += 1
|
|
301
|
+
log.puts Time.now.to_s + "\t" + "Now processing number #{m}" if m%100 == 0
|
|
302
|
+
seq_with_same_primer_id = pid_seqtag_hash[primer_id]
|
|
303
|
+
r1_sub_seq = []
|
|
304
|
+
r2_sub_seq = []
|
|
305
|
+
seq_with_same_primer_id.each do |seq_name|
|
|
306
|
+
r1_sub_seq << bio_r1[seq_name]
|
|
307
|
+
r2_sub_seq << bio_r2[seq_name]
|
|
308
|
+
end
|
|
309
|
+
#consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
|
|
310
|
+
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
|
311
|
+
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
|
312
|
+
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
|
313
|
+
|
|
314
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
|
315
|
+
next if r1_consensus =~ /[^ATCG]/
|
|
316
|
+
next if r2_consensus =~ /[^ATCG]/
|
|
317
|
+
|
|
318
|
+
# reverse complement sequence of the R2 region
|
|
319
|
+
r2_consensus = r2_consensus.rc
|
|
320
|
+
consensus[consensus_name] = [r1_consensus, r2_consensus]
|
|
321
|
+
r1_temp[consensus_name] = r1_consensus
|
|
322
|
+
r2_temp[consensus_name] = r2_consensus
|
|
323
|
+
end
|
|
324
|
+
r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
|
|
325
|
+
r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
|
|
326
|
+
|
|
327
|
+
# filter consensus sequences for residual offspring PIDs
|
|
328
|
+
consensus_filtered = {}
|
|
329
|
+
consensus_number_temp = consensus.size
|
|
330
|
+
max_pid_comb = 4**pid_length
|
|
331
|
+
if consensus_number_temp < 0.003*max_pid_comb
|
|
332
|
+
log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
|
|
333
|
+
r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
|
|
334
|
+
r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
|
|
335
|
+
common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
|
|
336
|
+
common_pid.each do |pid|
|
|
337
|
+
consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
|
|
338
|
+
end
|
|
292
339
|
else
|
|
293
|
-
|
|
294
|
-
pid_seqtag_hash[pid] << name
|
|
340
|
+
consensus_filtered = consensus
|
|
295
341
|
end
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
342
|
+
n_con = consensus_filtered.size
|
|
343
|
+
log.puts Time.now.to_s + "\t" + "Number of consensus sequences: " + n_con.to_s
|
|
344
|
+
summary_json[:total_tcs] = n_con
|
|
345
|
+
summary_json[:resampling_param] = (n_con/pid_to_process.to_f).round(3)
|
|
346
|
+
|
|
347
|
+
log.puts Time.now.to_s + "\t" + "Writing R1 and R2 files..."
|
|
348
|
+
# r1_file output
|
|
349
|
+
f1 = File.open(outfile_r1, 'w')
|
|
350
|
+
f2 = File.open(outfile_r2, 'w')
|
|
351
|
+
primer_id_in_use = {}
|
|
352
|
+
if n_con > 0
|
|
353
|
+
r1_seq_length = consensus_filtered.values[0][0].size
|
|
354
|
+
r2_seq_length = consensus_filtered.values[0][1].size
|
|
355
|
+
else
|
|
356
|
+
r1_seq_length = "n/a"
|
|
357
|
+
r2_seq_length = "n/a"
|
|
311
358
|
end
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
next if r1_consensus =~ /[^ATCG]/
|
|
319
|
-
next if r2_consensus =~ /[^ATCG]/
|
|
320
|
-
|
|
321
|
-
# reverse complement sequence of the R2 region
|
|
322
|
-
r2_consensus = r2_consensus.rc
|
|
323
|
-
consensus[consensus_name] = [r1_consensus, r2_consensus]
|
|
324
|
-
r1_temp[consensus_name] = r1_consensus
|
|
325
|
-
r2_temp[consensus_name] = r2_consensus
|
|
326
|
-
end
|
|
327
|
-
r1_temp_sh = ViralSeq::SeqHash.new(r1_temp)
|
|
328
|
-
r2_temp_sh = ViralSeq::SeqHash.new(r2_temp)
|
|
329
|
-
|
|
330
|
-
# filter consensus sequences for residual offspring PIDs
|
|
331
|
-
consensus_filtered = {}
|
|
332
|
-
consensus_number_temp = consensus.size
|
|
333
|
-
max_pid_comb = 4**pid_length
|
|
334
|
-
if consensus_number_temp < 0.003*max_pid_comb
|
|
335
|
-
log.puts Time.now.to_s + "\t" + "Applying PID post TCS filter..."
|
|
336
|
-
r1_consensus_filtered = r1_temp_sh.filter_similar_pid.dna_hash
|
|
337
|
-
r2_consensus_filtered = r2_temp_sh.filter_similar_pid.dna_hash
|
|
338
|
-
common_pid = r1_consensus_filtered.keys & r2_consensus_filtered.keys
|
|
339
|
-
common_pid.each do |pid|
|
|
340
|
-
consensus_filtered[pid] = [r1_consensus_filtered[pid], r2_consensus_filtered[pid]]
|
|
359
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
|
360
|
+
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
|
361
|
+
consensus_filtered.each do |seq_name,seq|
|
|
362
|
+
f1.print seq_name + "_r1\n" + seq[0] + "\n"
|
|
363
|
+
f2.print seq_name + "_r2\n" + seq[1] + "\n"
|
|
364
|
+
primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
|
|
341
365
|
end
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
# r1_file output
|
|
352
|
-
f1 = File.open(outfile_r1, 'w')
|
|
353
|
-
f2 = File.open(outfile_r2, 'w')
|
|
354
|
-
primer_id_in_use = {}
|
|
355
|
-
if n_con > 0
|
|
356
|
-
r1_seq_length = consensus_filtered.values[0][0].size
|
|
357
|
-
r2_seq_length = consensus_filtered.values[0][1].size
|
|
358
|
-
else
|
|
359
|
-
next
|
|
360
|
-
end
|
|
361
|
-
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
|
362
|
-
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
|
363
|
-
consensus_filtered.each do |seq_name,seq|
|
|
364
|
-
f1.print seq_name + "_r1\n" + seq[0] + "\n"
|
|
365
|
-
f2.print seq_name + "_r2\n" + seq[1] + "\n"
|
|
366
|
-
primer_id_in_use[seq_name.split("_")[0][1..-1]] = seq_name.split("_")[1].to_i
|
|
367
|
-
end
|
|
368
|
-
f1.close
|
|
369
|
-
f2.close
|
|
370
|
-
|
|
371
|
-
# Primer ID distribution in .json file
|
|
372
|
-
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
|
373
|
-
pid_json = {}
|
|
374
|
-
pid_json[:primer_id_in_use] = {}
|
|
375
|
-
primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
|
|
376
|
-
pid_json[:primer_id_in_use][k] = v
|
|
377
|
-
end
|
|
378
|
-
|
|
379
|
-
pid_json[:primer_id_distribution] = {}
|
|
380
|
-
primer_id_dis.sort_by{|k,v| k}.each do |k,v|
|
|
381
|
-
pid_json[:primer_id_distribution][k] = v
|
|
382
|
-
end
|
|
383
|
-
|
|
384
|
-
pid_json[:primer_id_frequency] = {}
|
|
385
|
-
primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
|
|
386
|
-
pid_json[:primer_id_frequency][k] = v
|
|
387
|
-
end
|
|
388
|
-
|
|
389
|
-
File.open(out_pid_json, 'w') do |f|
|
|
390
|
-
f.puts JSON.pretty_generate(pid_json)
|
|
391
|
-
end
|
|
392
|
-
|
|
393
|
-
# start end-join
|
|
394
|
-
def end_join(dir, option, overlap)
|
|
395
|
-
shp = ViralSeq::SeqHashPair.fa(dir)
|
|
396
|
-
case option
|
|
397
|
-
when 1
|
|
398
|
-
joined_sh = shp.join1()
|
|
399
|
-
when 2
|
|
400
|
-
joined_sh = shp.join1(overlap)
|
|
401
|
-
when 3
|
|
402
|
-
joined_sh = shp.join2
|
|
403
|
-
when 4
|
|
404
|
-
joined_sh = shp.join2(model: :indiv)
|
|
366
|
+
f1.close
|
|
367
|
+
f2.close
|
|
368
|
+
|
|
369
|
+
# Primer ID distribution in .json file
|
|
370
|
+
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
|
371
|
+
pid_json = {}
|
|
372
|
+
pid_json[:primer_id_in_use] = {}
|
|
373
|
+
primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
|
|
374
|
+
pid_json[:primer_id_in_use][k] = v
|
|
405
375
|
end
|
|
406
|
-
return joined_sh
|
|
407
|
-
end
|
|
408
|
-
|
|
409
|
-
if primer[:end_join]
|
|
410
|
-
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
|
411
|
-
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
|
412
|
-
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
|
413
|
-
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
|
414
|
-
|
|
415
|
-
summary_json[:combined_tcs] = joined_sh.size
|
|
416
376
|
|
|
417
|
-
|
|
418
|
-
|
|
377
|
+
pid_json[:primer_id_distribution] = {}
|
|
378
|
+
primer_id_dis.sort_by{|k,v| k}.each do |k,v|
|
|
379
|
+
pid_json[:primer_id_distribution][k] = v
|
|
419
380
|
end
|
|
420
381
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
382
|
+
pid_json[:primer_id_frequency] = {}
|
|
383
|
+
primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
|
|
384
|
+
pid_json[:primer_id_frequency][k] = v
|
|
424
385
|
end
|
|
425
|
-
next
|
|
426
|
-
end
|
|
427
386
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
ref_end = primer[:ref_end]
|
|
431
|
-
ref_genome = primer[:ref_genome].to_sym
|
|
432
|
-
indel = primer[:indel]
|
|
433
|
-
if ref_start == 0
|
|
434
|
-
ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
|
435
|
-
end
|
|
436
|
-
if ref_end == 0
|
|
437
|
-
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
|
387
|
+
File.open(out_pid_json, 'w') do |f|
|
|
388
|
+
f.puts JSON.pretty_generate(pid_json)
|
|
438
389
|
end
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
390
|
+
|
|
391
|
+
# start end-join
|
|
392
|
+
def end_join(dir, option, overlap)
|
|
393
|
+
shp = ViralSeq::SeqHashPair.fa(dir)
|
|
394
|
+
case option
|
|
395
|
+
when 1
|
|
396
|
+
joined_sh = shp.join1()
|
|
397
|
+
when 2
|
|
398
|
+
joined_sh = shp.join1(overlap)
|
|
399
|
+
when 3
|
|
400
|
+
joined_sh = shp.join2
|
|
401
|
+
when 4
|
|
402
|
+
joined_sh = shp.join2(model: :indiv)
|
|
451
403
|
end
|
|
452
|
-
joined_sh
|
|
404
|
+
return joined_sh
|
|
405
|
+
end
|
|
406
|
+
|
|
407
|
+
if primer[:end_join]
|
|
408
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
|
409
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
|
410
|
+
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
|
411
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
|
412
|
+
|
|
413
|
+
summary_json[:combined_tcs] = joined_sh.size
|
|
453
414
|
|
|
454
415
|
if export_raw
|
|
455
|
-
|
|
456
|
-
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
|
457
|
-
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
|
458
|
-
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
|
459
|
-
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
460
|
-
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
461
|
-
joined_seq_raw = {}
|
|
462
|
-
new_r1_seq_raw.each do |seq_name, seq|
|
|
463
|
-
next unless seq
|
|
464
|
-
next unless new_r2_seq_raw[seq_name]
|
|
465
|
-
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
|
466
|
-
end
|
|
467
|
-
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
|
416
|
+
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
|
468
417
|
end
|
|
469
|
-
else
|
|
470
|
-
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
|
471
418
|
|
|
472
|
-
if
|
|
473
|
-
|
|
419
|
+
if primer[:TCS_QC]
|
|
420
|
+
ref_start = primer[:ref_start]
|
|
421
|
+
ref_end = primer[:ref_end]
|
|
422
|
+
ref_genome = primer[:ref_genome].to_sym
|
|
423
|
+
indel = primer[:indel]
|
|
424
|
+
if ref_start == 0
|
|
425
|
+
ref_start = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
|
426
|
+
end
|
|
427
|
+
if ref_end == 0
|
|
428
|
+
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
|
429
|
+
end
|
|
430
|
+
if primer[:end_join_option] == 1
|
|
431
|
+
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
|
432
|
+
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
|
433
|
+
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
|
434
|
+
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
|
435
|
+
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
436
|
+
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
437
|
+
joined_seq = {}
|
|
438
|
+
new_r1_seq.each do |seq_name, seq|
|
|
439
|
+
next unless seq
|
|
440
|
+
next unless new_r2_seq[seq_name]
|
|
441
|
+
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
|
442
|
+
end
|
|
443
|
+
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
|
444
|
+
|
|
445
|
+
if export_raw
|
|
446
|
+
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
|
447
|
+
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
|
448
|
+
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
|
449
|
+
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
|
450
|
+
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
451
|
+
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
|
452
|
+
joined_seq_raw = {}
|
|
453
|
+
new_r1_seq_raw.each do |seq_name, seq|
|
|
454
|
+
next unless seq
|
|
455
|
+
next unless new_r2_seq_raw[seq_name]
|
|
456
|
+
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
|
457
|
+
end
|
|
458
|
+
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
|
459
|
+
end
|
|
460
|
+
else
|
|
461
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
|
462
|
+
|
|
463
|
+
if export_raw
|
|
464
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
|
465
|
+
end
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
|
469
|
+
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
|
470
|
+
if primer[:trim]
|
|
471
|
+
trim_start = primer[:trim_ref_start]
|
|
472
|
+
trim_end = primer[:trim_ref_end]
|
|
473
|
+
trim_ref = primer[:trim_ref].to_sym
|
|
474
|
+
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
|
475
|
+
if export_raw
|
|
476
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
|
477
|
+
end
|
|
478
|
+
end
|
|
474
479
|
end
|
|
475
|
-
end
|
|
476
480
|
|
|
477
|
-
|
|
478
|
-
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
|
479
|
-
if primer[:trim]
|
|
480
|
-
trim_start = primer[:trim_ref_start]
|
|
481
|
-
trim_end = primer[:trim_ref_end]
|
|
482
|
-
trim_ref = primer[:trim_ref].to_sym
|
|
483
|
-
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
|
481
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
|
484
482
|
if export_raw
|
|
485
|
-
joined_sh_raw
|
|
483
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
|
486
484
|
end
|
|
485
|
+
|
|
487
486
|
end
|
|
488
487
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
|
488
|
+
File.open(outfile_log, "w") do |f|
|
|
489
|
+
f.puts JSON.pretty_generate(summary_json)
|
|
492
490
|
end
|
|
493
|
-
end
|
|
494
491
|
|
|
495
|
-
File.open(outfile_log, "w") do |f|
|
|
496
|
-
f.puts JSON.pretty_generate(summary_json)
|
|
497
492
|
end
|
|
498
|
-
end
|
|
499
493
|
|
|
500
|
-
unless options[:keep]
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
494
|
+
unless options[:keep]
|
|
495
|
+
log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
|
|
496
|
+
File.unlink(r1_f)
|
|
497
|
+
File.unlink(r2_f)
|
|
498
|
+
end
|
|
499
|
+
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
|
|
500
|
+
log.close
|
|
501
|
+
puts "DONE!"
|
|
502
|
+
rescue => e
|
|
503
|
+
puts "`tcs` pipeline run with errors: " + e.message.red
|
|
504
|
+
puts "`tcs` pipeline aborted.".red.bold
|
|
505
|
+
log.puts Time.now.to_s + "\t" + e.full_message
|
|
506
|
+
log.puts Time.now.to_s + "\tAborted."
|
|
507
|
+
log.close
|
|
508
|
+
error_hash = {}
|
|
509
|
+
error_hash[:directory] = indir
|
|
510
|
+
error_hash[:tcs_version] = ViralSeq::TCS_VERSION
|
|
511
|
+
error_hash[:viralSeq_version] = ViralSeq::VERSION
|
|
512
|
+
error_hash[:time] = Time.now
|
|
513
|
+
error_hash[:error] = e.full_message
|
|
514
|
+
File.open(File.join(indir, ".tcs_error"), 'w') do |f|
|
|
515
|
+
f.puts JSON.pretty_generate([error_hash])
|
|
516
|
+
end
|
|
517
|
+
master_error_file = File.join(File.dirname(indir), ".tcs_error")
|
|
518
|
+
master_errors = []
|
|
519
|
+
if File.exist? master_error_file
|
|
520
|
+
master_errors << JSON.parse(File.read(master_error_file), symbolize_names: true)
|
|
521
|
+
end
|
|
522
|
+
master_errors << error_hash
|
|
523
|
+
File.open(master_error_file, 'w') do |f|
|
|
524
|
+
f.puts JSON.pretty_generate(master_errors)
|
|
525
|
+
end
|
|
504
526
|
end
|
|
505
|
-
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
|
|
506
|
-
log.close
|
|
507
|
-
puts "DONE!"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
|
@@ -116,6 +116,8 @@ module ViralSeq
|
|
|
116
116
|
|
|
117
117
|
File.open(fastq_file,'r') do |file|
|
|
118
118
|
file.readlines.collect do |line|
|
|
119
|
+
line.tr!("\u0000","")
|
|
120
|
+
next if line == "\n"
|
|
119
121
|
count +=1
|
|
120
122
|
count_m = count % 4
|
|
121
123
|
if count_m == 1
|
|
@@ -397,7 +399,9 @@ module ViralSeq
|
|
|
397
399
|
(0..(seq_length - 1)).each do |position|
|
|
398
400
|
all_base = []
|
|
399
401
|
seq_array.each do |seq|
|
|
400
|
-
|
|
402
|
+
if seq[position]
|
|
403
|
+
all_base << seq[position]
|
|
404
|
+
end
|
|
401
405
|
end
|
|
402
406
|
base_count = all_base.count_freq
|
|
403
407
|
max_base_list = []
|
data/lib/viral_seq/tcs_core.rb
CHANGED
|
@@ -305,7 +305,8 @@ module ViralSeq
|
|
|
305
305
|
end
|
|
306
306
|
|
|
307
307
|
def general_filter(seq)
|
|
308
|
-
|
|
308
|
+
return false unless seq
|
|
309
|
+
if seq.size < ($platform_sequencing_length - 10)
|
|
309
310
|
return false
|
|
310
311
|
elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
|
311
312
|
return false
|
data/lib/viral_seq/tcs_dr.rb
CHANGED
|
@@ -8,7 +8,7 @@ module ViralSeq
|
|
|
8
8
|
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
|
|
9
9
|
:forward=>
|
|
10
10
|
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
|
|
11
|
-
:majority=>0
|
|
11
|
+
:majority=>0,
|
|
12
12
|
:end_join=>true,
|
|
13
13
|
:end_join_option=>1,
|
|
14
14
|
:overlap=>0,
|
|
@@ -23,7 +23,7 @@ module ViralSeq
|
|
|
23
23
|
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
|
|
24
24
|
:forward=>
|
|
25
25
|
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
|
|
26
|
-
:majority=>0
|
|
26
|
+
:majority=>0,
|
|
27
27
|
:end_join=>true,
|
|
28
28
|
:end_join_option=>3,
|
|
29
29
|
:TCS_QC=>true,
|
|
@@ -39,7 +39,7 @@ module ViralSeq
|
|
|
39
39
|
:cdna=>
|
|
40
40
|
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
|
|
41
41
|
:forward=>"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
|
|
42
|
-
:majority=>0
|
|
42
|
+
:majority=>0,
|
|
43
43
|
:end_join=>true,
|
|
44
44
|
:end_join_option=>3,
|
|
45
45
|
:overlap=>171,
|
|
@@ -54,7 +54,7 @@ module ViralSeq
|
|
|
54
54
|
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
|
|
55
55
|
:forward=>
|
|
56
56
|
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
|
|
57
|
-
:majority=>0
|
|
57
|
+
:majority=>0,
|
|
58
58
|
:end_join=>true,
|
|
59
59
|
:end_join_option=>1,
|
|
60
60
|
:overlap=>0,
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: viral_seq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.2.2
|
|
4
|
+
version: 1.2.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Shuntai Zhou
|
|
8
8
|
- Michael Clark
|
|
9
|
-
autorequire:
|
|
9
|
+
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2021-
|
|
12
|
+
date: 2021-07-30 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: bundler
|
|
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
214
214
|
requirements:
|
|
215
215
|
- R required for some functions
|
|
216
216
|
rubygems_version: 3.2.2
|
|
217
|
-
signing_key:
|
|
217
|
+
signing_key:
|
|
218
218
|
specification_version: 4
|
|
219
219
|
summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
220
220
|
test_files: []
|