viral_seq 1.1.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +16 -3
- data/README.md +99 -12
- data/bin/tcs +54 -10
- data/bin/tcs_log +20 -1
- data/bin/tcs_sdrm +409 -0
- data/docs/assets/img/cover.jpg +0 -0
- data/{doc → docs}/dr.json +0 -1
- data/docs/sample_miseq_data/hivdr_control/r1.fastq.gz +0 -0
- data/docs/sample_miseq_data/hivdr_control/r2.fastq.gz +0 -0
- data/lib/viral_seq.rb +5 -1
- data/lib/viral_seq/constant.rb +41 -4
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/muscle.rb +3 -2
- data/lib/viral_seq/recency.rb +52 -0
- data/lib/viral_seq/sdrm.rb +101 -35
- data/lib/viral_seq/seq_hash.rb +24 -4
- data/lib/viral_seq/sequence.rb +1 -84
- data/lib/viral_seq/tcs_dr.rb +71 -0
- data/lib/viral_seq/version.rb +2 -2
- data/viral_seq.gemspec +11 -0
- metadata +72 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a235cae95121a8522a47620eb9f8c05a3e2e416084743cd23df43aff7870a2c4
|
|
4
|
+
data.tar.gz: f0ce3a9412774eed703b0b0b663e7bb2dccf340f3f558cffdca85e920291794d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b97f98e40b8257281bd29cee40942d16084cf175933fc8357838ebb2a9eede1ab93ba323dbf315afb300f0a7852b2c6d939235831124710fc6f16f109e3eafc5
|
|
7
|
+
data.tar.gz: 4d660da22c69ce1ff929ed7f67d2b03aad662bb0237e9a93d9a8ea6bd1866d8544ad108db9ab8a11eee2df992395e41b68ffc43a8d1dbb132cc1f83a897676ef
|
data/Gemfile.lock
CHANGED
|
@@ -1,16 +1,27 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
viral_seq (1.
|
|
5
|
-
colorize (
|
|
6
|
-
|
|
4
|
+
viral_seq (1.1.1)
|
|
5
|
+
colorize (>= 0.1)
|
|
6
|
+
combine_pdf (>= 1.0.0)
|
|
7
|
+
muscle_bio (>= 0.4)
|
|
8
|
+
prawn (>= 2.3.0)
|
|
9
|
+
prawn-table (>= 0.2.0)
|
|
7
10
|
|
|
8
11
|
GEM
|
|
9
12
|
remote: https://rubygems.org/
|
|
10
13
|
specs:
|
|
11
14
|
colorize (0.8.1)
|
|
15
|
+
combine_pdf (1.0.21)
|
|
16
|
+
ruby-rc4 (>= 0.1.5)
|
|
12
17
|
diff-lcs (1.3)
|
|
13
18
|
muscle_bio (0.4.0)
|
|
19
|
+
pdf-core (0.9.0)
|
|
20
|
+
prawn (2.4.0)
|
|
21
|
+
pdf-core (~> 0.9.0)
|
|
22
|
+
ttfunk (~> 1.7)
|
|
23
|
+
prawn-table (0.2.2)
|
|
24
|
+
prawn (>= 1.3.0, < 3.0.0)
|
|
14
25
|
rake (13.0.1)
|
|
15
26
|
rspec (3.8.0)
|
|
16
27
|
rspec-core (~> 3.8.0)
|
|
@@ -25,6 +36,8 @@ GEM
|
|
|
25
36
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
26
37
|
rspec-support (~> 3.8.0)
|
|
27
38
|
rspec-support (3.8.0)
|
|
39
|
+
ruby-rc4 (0.1.5)
|
|
40
|
+
ttfunk (1.7.0)
|
|
28
41
|
|
|
29
42
|
PLATFORMS
|
|
30
43
|
ruby
|
data/README.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# ViralSeq
|
|
2
2
|
|
|
3
|
+
[](https://rubygems.org/gems/viral_seq)
|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+
[](https://gitter.im/viral_seq/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
|
8
|
+
|
|
3
9
|
A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
4
10
|
|
|
5
11
|
Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
@@ -7,11 +13,12 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
|
7
13
|
## Illustration for the Primer ID Sequencing
|
|
8
14
|
|
|
9
15
|
|
|
10
|
-

|
|
11
17
|
|
|
12
18
|
### Reference readings on the Primer ID sequencing
|
|
13
|
-
[Primer ID
|
|
14
|
-
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
|
19
|
+
[Explanation of Primer ID sequencing](https://doi.org/10.21769/BioProtoc.3938)
|
|
20
|
+
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
|
21
|
+
[Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
|
|
15
22
|
|
|
16
23
|
## Install
|
|
17
24
|
|
|
@@ -24,14 +31,23 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
|
24
31
|
### Executables
|
|
25
32
|
|
|
26
33
|
### `tcs`
|
|
27
|
-
Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
|
|
34
|
+
Use executable `tcs` pipeline (v2.3.2) to process **Primer ID MiSeq sequencing** data.
|
|
28
35
|
|
|
29
36
|
Example commands:
|
|
30
37
|
```bash
|
|
31
38
|
$ tcs -p params.json # run TCS pipeline with params.json
|
|
39
|
+
$ tcs -p params.json -i DIRECTORY
|
|
40
|
+
# run TCS pipeline with params.json and DIRECTORY
|
|
41
|
+
# if DIRECTORY is not defined in params.json
|
|
42
|
+
$ tcs -dr -i DIRECTORY
|
|
43
|
+
# run tcs-dr (MPID HIV drug resistance sequencing) pipeline
|
|
44
|
+
# DIRECTORY needs to be given.
|
|
32
45
|
$ tcs -j # CLI to generate params.json
|
|
33
46
|
$ tcs -h # print out the help
|
|
34
47
|
```
|
|
48
|
+
|
|
49
|
+
[sample params.json for the tcs-dr pipeline](./docs/dr.json)
|
|
50
|
+
|
|
35
51
|
---
|
|
36
52
|
### `tcs_log`
|
|
37
53
|
|
|
@@ -53,6 +69,44 @@ Example command:
|
|
|
53
69
|
$ tcs_log batch_tcs_jobs
|
|
54
70
|
```
|
|
55
71
|
|
|
72
|
+
---
|
|
73
|
+
### `tcs_sdrm`
|
|
74
|
+
|
|
75
|
+
Use `tcs_sdrm` pipeline for HIV-1 drug resistance mutation and recency.
|
|
76
|
+
|
|
77
|
+
Example command:
|
|
78
|
+
```bash
|
|
79
|
+
$ tcs_sdrm libs_dir
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
libs_dir file structure:
|
|
83
|
+
```
|
|
84
|
+
libs_dir/
|
|
85
|
+
├── lib1
|
|
86
|
+
├── lib1_RT
|
|
87
|
+
├── lib1_PR
|
|
88
|
+
├── lib1_IN
|
|
89
|
+
├── lib1_V1V3
|
|
90
|
+
├── lib2
|
|
91
|
+
├── lib2_RT
|
|
92
|
+
├── lib2_PR
|
|
93
|
+
├── lib2_IN
|
|
94
|
+
├── lib2_V1V3
|
|
95
|
+
├── ...
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Output data in a new dir as 'libs_dir_SDRM'
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
**Note: [R](https://www.r-project.org/) and the following R libraries are required:**
|
|
102
|
+
- phangorn
|
|
103
|
+
- ape
|
|
104
|
+
- scales
|
|
105
|
+
- ggforce
|
|
106
|
+
- cowplot
|
|
107
|
+
- magrittr
|
|
108
|
+
- gridExtra
|
|
109
|
+
|
|
56
110
|
---
|
|
57
111
|
|
|
58
112
|
### `locator`
|
|
@@ -93,7 +147,7 @@ qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
|
|
93
147
|
Further filter out sequences with Apobec3g/f hypermutations
|
|
94
148
|
|
|
95
149
|
```ruby
|
|
96
|
-
qc_seqhash = qc_seqhash.a3g
|
|
150
|
+
qc_seqhash = qc_seqhash.a3g[:filtered_seq]
|
|
97
151
|
```
|
|
98
152
|
|
|
99
153
|
Calculate nucleotide diversity π
|
|
@@ -121,15 +175,48 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
|
121
175
|
|
|
122
176
|
## Updates
|
|
123
177
|
|
|
178
|
+
### Version 1.2.2-05272021
|
|
179
|
+
|
|
180
|
+
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
|
181
|
+
`tcs` pipeline upgraded to v2.3.2
|
|
182
|
+
|
|
183
|
+
### Version 1.2.1-05172021
|
|
184
|
+
|
|
185
|
+
1. Added a function in R to check and install missing R packages for `tcs_sdrm` pipeline.
|
|
186
|
+
|
|
187
|
+
### Version 1.2.0-05102021
|
|
188
|
+
|
|
189
|
+
1. Added `tcs_sdrm` pipeline as an executable.
|
|
190
|
+
`tcs_sdrm` processes `tcs`-processed HIV MPID-NGS data for drug resistance mutations, recency and phylogenetic analysis.
|
|
191
|
+
|
|
192
|
+
2. Added function ViralSeq::SeqHash#sample.
|
|
193
|
+
|
|
194
|
+
3. Added recency determining function `ViralSeq::Recency::define`
|
|
195
|
+
|
|
196
|
+
4. Fixed a few bugs related to `tcs_sdrm`.
|
|
197
|
+
|
|
198
|
+
### Version 1.1.2-04262021
|
|
199
|
+
|
|
200
|
+
1. Added function `ViralSeq::DRMs.sdrm_json` to export SDRM as json object.
|
|
201
|
+
2. Added a random string to the temp file names for `muscle_bio` to avoid issues when running scripts in parallel.
|
|
202
|
+
3. Added `--keep-original` flag to the `tcs` pipeline.
|
|
203
|
+
|
|
204
|
+
### Version 1.1.1-04012021
|
|
205
|
+
|
|
206
|
+
1. Added a warning when paired_raw_sequence is less than 0.1% of total_raw_sequence.
|
|
207
|
+
2. Added option `-i WORKING_DIRECTORY` to the `tcs` script.
|
|
208
|
+
If the `params.json` file does not contain the path to the working directory, it will append path to the run params.
|
|
209
|
+
3. Added option `-dr` to the `tcs` script.
|
|
210
|
+
|
|
124
211
|
### Version 1.1.0-03252021
|
|
125
212
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
213
|
+
1. Optimized the algorithm of end-join.
|
|
214
|
+
2. Fixed a bug in the `tcs` pipeline where combined tcs files were sometimes not saved.
|
|
215
|
+
3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
|
|
216
|
+
4. Added the preset of MPID-HIVDR params file [***dr.json***](./docs/dr.json) in /docs.
|
|
217
|
+
5. Added `platform_format` option in the json generator of the `tcs` Pipeline.
|
|
218
|
+
Users can choose from 3 MiSeq platforms for processing their sequencing data.
|
|
219
|
+
MiSeq 300x7x300 is the default option.
|
|
133
220
|
|
|
134
221
|
### Version 1.0.14-03052021
|
|
135
222
|
|
data/bin/tcs
CHANGED
|
@@ -46,11 +46,23 @@ OptionParser.new do |opts|
|
|
|
46
46
|
options[:params_json] = p
|
|
47
47
|
end
|
|
48
48
|
|
|
49
|
+
opts.on("-i", "--input PATH_TO_WORKING_DIRECTORY", "Path to the working directory") do |p|
|
|
50
|
+
options[:input] = p
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
opts.on("-dr", "--dr_pipeline", "HIV drug resistance MPID pipeline") do |p|
|
|
54
|
+
options[:dr] = true
|
|
55
|
+
end
|
|
56
|
+
|
|
49
57
|
opts.on("-h", "--help", "Prints this help") do
|
|
50
58
|
puts opts
|
|
51
59
|
exit
|
|
52
60
|
end
|
|
53
61
|
|
|
62
|
+
opts.on("--keep-original", "keep raw sequence files") do
|
|
63
|
+
options[:keep] = true
|
|
64
|
+
end
|
|
65
|
+
|
|
54
66
|
opts.on("-v", "--version", "Version info") do
|
|
55
67
|
puts "tcs version: " + ViralSeq::TCS_VERSION.red.bold
|
|
56
68
|
puts "viral_seq version: " + ViralSeq::VERSION.red.bold
|
|
@@ -64,15 +76,21 @@ end.parse!
|
|
|
64
76
|
|
|
65
77
|
if options[:json_generator]
|
|
66
78
|
params = ViralSeq::TcsJson.generate
|
|
79
|
+
elsif options[:dr]
|
|
80
|
+
params = ViralSeq::TcsDr::PARAMS
|
|
67
81
|
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
|
68
82
|
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
|
69
83
|
else
|
|
70
84
|
abort "No params JSON file found. Script terminated.".red
|
|
71
85
|
end
|
|
72
86
|
|
|
73
|
-
|
|
87
|
+
if options[:input]
|
|
88
|
+
indir = options[:input]
|
|
89
|
+
else
|
|
90
|
+
indir = params[:raw_sequence_dir]
|
|
91
|
+
end
|
|
74
92
|
|
|
75
|
-
unless File.exist?(indir)
|
|
93
|
+
unless indir and File.exist?(indir)
|
|
76
94
|
abort "No input sequence directory found. Script terminated.".red.bold
|
|
77
95
|
end
|
|
78
96
|
|
|
@@ -129,6 +147,7 @@ end
|
|
|
129
147
|
|
|
130
148
|
primers.each do |primer|
|
|
131
149
|
summary_json = {}
|
|
150
|
+
summary_json[:warnings] = []
|
|
132
151
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
|
133
152
|
summary_json[:viralseq_version] = ViralSeq::VERSION
|
|
134
153
|
summary_json[:runtime] = Time.now.to_s
|
|
@@ -140,6 +159,7 @@ primers.each do |primer|
|
|
|
140
159
|
forward_primer = primer[:forward]
|
|
141
160
|
|
|
142
161
|
export_raw = primer[:export_raw]
|
|
162
|
+
limit_raw = primer[:limit_raw]
|
|
143
163
|
|
|
144
164
|
unless cdna_primer
|
|
145
165
|
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
|
@@ -181,6 +201,10 @@ primers.each do |primer|
|
|
|
181
201
|
paired_seq_number = common_keys.size
|
|
182
202
|
log.puts Time.now.to_s + "\t" + "Paired raw sequences are : #{paired_seq_number.to_s}"
|
|
183
203
|
summary_json[:paired_raw_sequence] = paired_seq_number
|
|
204
|
+
if paired_seq_number < raw_sequence_number * 0.001
|
|
205
|
+
summary_json[:warnings] <<
|
|
206
|
+
"WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
|
|
207
|
+
end
|
|
184
208
|
|
|
185
209
|
common_keys.each do |seqtag|
|
|
186
210
|
r1_seq = r1_passed_seq[seqtag]
|
|
@@ -242,7 +266,13 @@ primers.each do |primer|
|
|
|
242
266
|
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
|
243
267
|
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
|
244
268
|
|
|
245
|
-
|
|
269
|
+
if limit_raw
|
|
270
|
+
raw_keys = bio_r1.keys.sample(limit_raw.to_i)
|
|
271
|
+
else
|
|
272
|
+
raw_keys = bio_r1.keys
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
raw_keys.each do |k|
|
|
246
276
|
raw_r1_f.puts k + "_r1"
|
|
247
277
|
raw_r2_f.puts k + "_r2"
|
|
248
278
|
raw_r1_f.puts bio_r1[k]
|
|
@@ -341,9 +371,21 @@ primers.each do |primer|
|
|
|
341
371
|
# Primer ID distribution in .json file
|
|
342
372
|
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
|
343
373
|
pid_json = {}
|
|
344
|
-
pid_json[:primer_id_in_use] =
|
|
345
|
-
|
|
346
|
-
|
|
374
|
+
pid_json[:primer_id_in_use] = {}
|
|
375
|
+
primer_id_in_use.sort_by {|k, v| [-v,k]}.each do |k,v|
|
|
376
|
+
pid_json[:primer_id_in_use][k] = v
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
pid_json[:primer_id_distribution] = {}
|
|
380
|
+
primer_id_dis.sort_by{|k,v| k}.each do |k,v|
|
|
381
|
+
pid_json[:primer_id_distribution][k] = v
|
|
382
|
+
end
|
|
383
|
+
|
|
384
|
+
pid_json[:primer_id_frequency] = {}
|
|
385
|
+
primer_id_count.sort_by {|k,v| [-v,k]}.each do |k,v|
|
|
386
|
+
pid_json[:primer_id_frequency][k] = v
|
|
387
|
+
end
|
|
388
|
+
|
|
347
389
|
File.open(out_pid_json, 'w') do |f|
|
|
348
390
|
f.puts JSON.pretty_generate(pid_json)
|
|
349
391
|
end
|
|
@@ -455,9 +497,11 @@ primers.each do |primer|
|
|
|
455
497
|
end
|
|
456
498
|
end
|
|
457
499
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
File.unlink(
|
|
461
|
-
|
|
500
|
+
unless options[:keep]
|
|
501
|
+
log.puts Time.now.to_s + "\t" + "Removing raw sequence files..."
|
|
502
|
+
File.unlink(r1_f)
|
|
503
|
+
File.unlink(r2_f)
|
|
504
|
+
end
|
|
505
|
+
log.puts Time.now.to_s + "\t" + "TCS pipeline successfuly executed."
|
|
462
506
|
log.close
|
|
463
507
|
puts "DONE!"
|
data/bin/tcs_log
CHANGED
|
@@ -37,8 +37,26 @@ Dir.mkdir(outdir4) unless File.directory?(outdir4)
|
|
|
37
37
|
|
|
38
38
|
log_file = File.join(tcs_dir,"log.csv")
|
|
39
39
|
log = File.open(log_file,'w')
|
|
40
|
-
log.puts "lib name,Region,Raw Sequences per barcode,R1 Raw,R2 Raw,Paired Raw,Cutoff,PID Length,Consensus1,Consensus2,Distinct to Raw,Resampling index,Combined TCS,Combined TCS after QC"
|
|
41
40
|
|
|
41
|
+
header = %w{
|
|
42
|
+
lib_name
|
|
43
|
+
Region
|
|
44
|
+
Raw_Sequences_per_barcode
|
|
45
|
+
R1_Raw
|
|
46
|
+
R2_Raw
|
|
47
|
+
Paired_Raw
|
|
48
|
+
Cutoff
|
|
49
|
+
PID_Length
|
|
50
|
+
Consensus1
|
|
51
|
+
Consensus2
|
|
52
|
+
Distinct_to_Raw
|
|
53
|
+
Resampling_index
|
|
54
|
+
Combined_TCS
|
|
55
|
+
Combined_TCS_after_QC
|
|
56
|
+
WARNINGS
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
log.puts header.join(',')
|
|
42
60
|
libs.each do |lib|
|
|
43
61
|
Dir.mkdir(File.join(outdir2, lib)) unless File.directory?(File.join(outdir2, lib))
|
|
44
62
|
fasta_files = []
|
|
@@ -77,6 +95,7 @@ libs.each do |lib|
|
|
|
77
95
|
json_log[:resampling_param],
|
|
78
96
|
json_log[:combined_tcs],
|
|
79
97
|
json_log[:combined_tcs_after_qc],
|
|
98
|
+
json_log[:warnings],
|
|
80
99
|
].join(',') + "\n"
|
|
81
100
|
end
|
|
82
101
|
end
|
data/bin/tcs_sdrm
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# tcs/sdrm pipeline for HIV-1 drug resistance mutation and recency
|
|
3
|
+
#
|
|
4
|
+
# command example:
|
|
5
|
+
# $ tcs_sdrm libs_dir
|
|
6
|
+
#
|
|
7
|
+
# lib_dir file structure:
|
|
8
|
+
# libs_dir
|
|
9
|
+
# ├── lib1
|
|
10
|
+
# ├── lib1_RT
|
|
11
|
+
# ├── lib1_PR
|
|
12
|
+
# ├── lib1_IN
|
|
13
|
+
# ├── lib1_V1V3
|
|
14
|
+
# ├── lib2
|
|
15
|
+
# ├── lib1_RT
|
|
16
|
+
# ├── lib1_PR
|
|
17
|
+
# ├── lib1_IN
|
|
18
|
+
# ├── lib1_V1V3
|
|
19
|
+
# ├── ...
|
|
20
|
+
#
|
|
21
|
+
# output data in a new dir as 'libs_dir_SDRM'
|
|
22
|
+
|
|
23
|
+
require 'viral_seq'
|
|
24
|
+
require 'json'
|
|
25
|
+
require 'csv'
|
|
26
|
+
require 'fileutils'
|
|
27
|
+
require 'prawn'
|
|
28
|
+
require 'prawn/table'
|
|
29
|
+
require 'combine_pdf'
|
|
30
|
+
|
|
31
|
+
unless ARGV[0] && File.directory?(ARGV[0])
|
|
32
|
+
abort "No sequence data provided. `tcs_sdrm` pipeline aborted. "
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
begin
|
|
36
|
+
r_version = `R --version`.split("\n")[0]
|
|
37
|
+
r_check = `R -e '#{ViralSeq::R_SCRIPT_CHECK_PACKAGES}' > /dev/null 2>&1`
|
|
38
|
+
rescue Errno::ENOENT
|
|
39
|
+
abort '"R" is not installed. Install R at https://www.r-project.org/' +
|
|
40
|
+
"\n`tcs_sdrm` pipeline aborted."
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def abstract_line(data)
|
|
44
|
+
return_data = data[3] + data[2] + data[4] + ":" +
|
|
45
|
+
(data[6].to_f * 100).round(2).to_s + "(" +
|
|
46
|
+
(data[7].to_f * 100).round(2).to_s + "-" +
|
|
47
|
+
(data[8].to_f * 100).round(2).to_s + "); "
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# run params
|
|
51
|
+
log = []
|
|
52
|
+
|
|
53
|
+
log << { time: Time.now }
|
|
54
|
+
log << { viral_seq_version: ViralSeq::VERSION }
|
|
55
|
+
log << { tcs_version: ViralSeq::TCS_VERSION }
|
|
56
|
+
log << { R_version: r_version}
|
|
57
|
+
sdrm_list = {}
|
|
58
|
+
sdrm_list[:nrti] = ViralSeq::DRMs.sdrm_json(:nrti)
|
|
59
|
+
sdrm_list[:nnrti] = ViralSeq::DRMs.sdrm_json(:nnrti)
|
|
60
|
+
sdrm_list[:hiv_pr] = ViralSeq::DRMs.sdrm_json(:hiv_pr)
|
|
61
|
+
sdrm_list[:hiv_in] = ViralSeq::DRMs.sdrm_json(:hiv_in)
|
|
62
|
+
log << { sdrm_list: sdrm_list }
|
|
63
|
+
|
|
64
|
+
# input dir
|
|
65
|
+
indir = ARGV[0]
|
|
66
|
+
libs = Dir[indir + "/*"]
|
|
67
|
+
log << { processed_libs: libs }
|
|
68
|
+
|
|
69
|
+
#output dir
|
|
70
|
+
outdir = indir + "_SDRM"
|
|
71
|
+
Dir.mkdir(outdir) unless File.directory?(outdir)
|
|
72
|
+
|
|
73
|
+
libs.each do |lib|
|
|
74
|
+
|
|
75
|
+
r_script = ViralSeq::R_SCRIPT.dup
|
|
76
|
+
|
|
77
|
+
next unless File.directory?(lib)
|
|
78
|
+
|
|
79
|
+
lib_name = File.basename(lib)
|
|
80
|
+
out_lib_dir = File.join(outdir, lib_name)
|
|
81
|
+
Dir.mkdir(out_lib_dir) unless File.directory?(out_lib_dir)
|
|
82
|
+
|
|
83
|
+
sub_seq_files = Dir[lib + "/*"]
|
|
84
|
+
|
|
85
|
+
seq_summary_file = File.join(out_lib_dir, (lib_name + "_summary.csv"))
|
|
86
|
+
seq_summary_out = File.open(seq_summary_file, "w")
|
|
87
|
+
seq_summary_out.puts 'Region,TCS,TCS with A3G/F hypermutation,TCS with stop codon,' +
|
|
88
|
+
'TCS w/o hypermutation and stop codon,' +
|
|
89
|
+
'Poisson cutoff for minority mutation (>=),Pi,Dist20'
|
|
90
|
+
|
|
91
|
+
point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
|
|
92
|
+
point_mutation_out = File.open(point_mutation_file, "w")
|
|
93
|
+
point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
|
|
94
|
+
"number,percentage,95% CI low, 95% CI high, notes"
|
|
95
|
+
|
|
96
|
+
linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
|
|
97
|
+
linkage_out = File.open(linkage_file, "w")
|
|
98
|
+
linkage_out.puts "region,TCS,mutation linkage,number," +
|
|
99
|
+
"percentage,95% CI low, 95% CI high, notes"
|
|
100
|
+
|
|
101
|
+
aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
|
|
102
|
+
aa_report_out = File.open(aa_report_file, "w")
|
|
103
|
+
aa_report_out.puts "region,ref.aa.positions,TCS.number," +
|
|
104
|
+
ViralSeq::AMINO_ACID_LIST.join(",")
|
|
105
|
+
|
|
106
|
+
summary_json_file = File.join(out_lib_dir, (lib_name + "_summary.json"))
|
|
107
|
+
summary_json_out = File.open(summary_json_file,"w")
|
|
108
|
+
|
|
109
|
+
filtered_seq_dir = File.join(out_lib_dir, (lib_name + "_filtered_seq"))
|
|
110
|
+
Dir.mkdir(filtered_seq_dir) unless File.directory?(filtered_seq_dir)
|
|
111
|
+
|
|
112
|
+
aln_seq_dir = File.join(out_lib_dir, (lib_name + "_aln_seq"))
|
|
113
|
+
Dir.mkdir(aln_seq_dir) unless File.directory?(aln_seq_dir)
|
|
114
|
+
|
|
115
|
+
point_mutation_list = []
|
|
116
|
+
linkage_list = []
|
|
117
|
+
aa_report_list = []
|
|
118
|
+
summary_hash = {}
|
|
119
|
+
|
|
120
|
+
sub_seq_files.each do |sub_seq|
|
|
121
|
+
seq_basename = File.basename(sub_seq)
|
|
122
|
+
seqs = ViralSeq::SeqHash.fa(sub_seq)
|
|
123
|
+
next if seqs.size < 3
|
|
124
|
+
if seq_basename =~ /V1V3/i
|
|
125
|
+
summary_hash[:V1V3] = "#{seqs.size.to_s},NA,NA,NA,NA"
|
|
126
|
+
FileUtils.cp(sub_seq, filtered_seq_dir)
|
|
127
|
+
elsif seq_basename =~ /PR/i
|
|
128
|
+
a3g_check = seqs.a3g
|
|
129
|
+
a3g_seqs = a3g_check[:a3g_seq]
|
|
130
|
+
a3g_filtered_seqs = a3g_check[:filtered_seq]
|
|
131
|
+
stop_codon_check = a3g_filtered_seqs.stop_codon
|
|
132
|
+
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
|
133
|
+
filtered_seqs = stop_codon_check[:without_stop_codon]
|
|
134
|
+
poisson_minority_cutoff = filtered_seqs.pm
|
|
135
|
+
summary_hash[:PR] = [
|
|
136
|
+
seqs.size.to_s,
|
|
137
|
+
a3g_seqs.size.to_s,
|
|
138
|
+
stop_codon_seqs.size.to_s,
|
|
139
|
+
filtered_seqs.size.to_s,
|
|
140
|
+
poisson_minority_cutoff.to_s
|
|
141
|
+
].join(',')
|
|
142
|
+
next if filtered_seqs.size < 3
|
|
143
|
+
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
|
144
|
+
|
|
145
|
+
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
|
|
146
|
+
point_mutation_list += sdrm[0]
|
|
147
|
+
linkage_list += sdrm[1]
|
|
148
|
+
aa_report_list += sdrm[2]
|
|
149
|
+
|
|
150
|
+
elsif seq_basename =~/IN/i
|
|
151
|
+
a3g_check = seqs.a3g
|
|
152
|
+
a3g_seqs = a3g_check[:a3g_seq]
|
|
153
|
+
a3g_filtered_seqs = a3g_check[:filtered_seq]
|
|
154
|
+
stop_codon_check = a3g_filtered_seqs.stop_codon(2)
|
|
155
|
+
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
|
156
|
+
filtered_seqs = stop_codon_check[:without_stop_codon]
|
|
157
|
+
poisson_minority_cutoff = filtered_seqs.pm
|
|
158
|
+
summary_hash[:IN] = [
|
|
159
|
+
seqs.size.to_s,
|
|
160
|
+
a3g_seqs.size.to_s,
|
|
161
|
+
stop_codon_seqs.size.to_s,
|
|
162
|
+
filtered_seqs.size.to_s,
|
|
163
|
+
poisson_minority_cutoff.to_s
|
|
164
|
+
].join(',')
|
|
165
|
+
next if filtered_seqs.size < 3
|
|
166
|
+
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
|
167
|
+
|
|
168
|
+
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
|
|
169
|
+
point_mutation_list += sdrm[0]
|
|
170
|
+
linkage_list += sdrm[1]
|
|
171
|
+
aa_report_list += sdrm[2]
|
|
172
|
+
|
|
173
|
+
elsif seq_basename =~/RT/i
|
|
174
|
+
rt_seq1 = {}
|
|
175
|
+
rt_seq2 = {}
|
|
176
|
+
seqs.dna_hash.each do |k,v|
|
|
177
|
+
rt_seq1[k] = v[0,267]
|
|
178
|
+
rt_seq2[k] = v[267..-1]
|
|
179
|
+
end
|
|
180
|
+
rt1 = ViralSeq::SeqHash.new(rt_seq1)
|
|
181
|
+
rt2 = ViralSeq::SeqHash.new(rt_seq2)
|
|
182
|
+
rt1_a3g = rt1.a3g
|
|
183
|
+
rt2_a3g = rt2.a3g
|
|
184
|
+
hypermut_seq_rt1 = rt1_a3g[:a3g_seq]
|
|
185
|
+
hypermut_seq_rt2 = rt2_a3g[:a3g_seq]
|
|
186
|
+
rt1_stop_codon = rt1.stop_codon(1)[:with_stop_codon]
|
|
187
|
+
rt2_stop_codon = rt2.stop_codon(2)[:with_stop_codon]
|
|
188
|
+
hypermut_seq_keys = (hypermut_seq_rt1.dna_hash.keys | hypermut_seq_rt2.dna_hash.keys)
|
|
189
|
+
stop_codon_seq_keys = (rt1_stop_codon.dna_hash.keys | rt2_stop_codon.dna_hash.keys)
|
|
190
|
+
reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
|
|
191
|
+
filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
|
|
192
|
+
poisson_minority_cutoff = filtered_seqs.pm
|
|
193
|
+
summary_hash[:RT] = [
|
|
194
|
+
seqs.size.to_s,
|
|
195
|
+
hypermut_seq_keys.size.to_s,
|
|
196
|
+
stop_codon_seq_keys.size.to_s,
|
|
197
|
+
filtered_seqs.size.to_s,
|
|
198
|
+
poisson_minority_cutoff.to_s
|
|
199
|
+
].join(',')
|
|
200
|
+
next if filtered_seqs.size < 3
|
|
201
|
+
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
|
202
|
+
|
|
203
|
+
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
|
|
204
|
+
point_mutation_list += sdrm[0]
|
|
205
|
+
linkage_list += sdrm[1]
|
|
206
|
+
aa_report_list += sdrm[2]
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
point_mutation_list.each do |record|
|
|
211
|
+
point_mutation_out.puts record.join(",")
|
|
212
|
+
end
|
|
213
|
+
linkage_list.each do |record|
|
|
214
|
+
linkage_out.puts record.join(",")
|
|
215
|
+
end
|
|
216
|
+
aa_report_list.each do |record|
|
|
217
|
+
aa_report_out.puts record.join(",")
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
filtered_seq_files = Dir[filtered_seq_dir + "/*"]
|
|
221
|
+
|
|
222
|
+
out_r_csv = File.join(out_lib_dir, (lib_name + "_pi.csv"))
|
|
223
|
+
out_r_pdf = File.join(out_lib_dir, (lib_name + "_pi.pdf"))
|
|
224
|
+
|
|
225
|
+
if filtered_seq_files.size > 0
|
|
226
|
+
filtered_seq_files.each do |seq_file|
|
|
227
|
+
filtered_sh = ViralSeq::SeqHash.fa(seq_file)
|
|
228
|
+
next if filtered_sh.size < 3
|
|
229
|
+
aligned_sh = filtered_sh.random_select(1000).align
|
|
230
|
+
aligned_sh.write_nt_fa(File.join(aln_seq_dir, File.basename(seq_file)))
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
r_script.gsub!(/PATH_TO_FASTA/,aln_seq_dir)
|
|
234
|
+
File.unlink(out_r_csv) if File.exist?(out_r_csv)
|
|
235
|
+
File.unlink(out_r_pdf) if File.exist?(out_r_pdf)
|
|
236
|
+
r_script.gsub!(/OUTPUT_CSV/,out_r_csv)
|
|
237
|
+
r_script.gsub!(/OUTPUT_PDF/,out_r_pdf)
|
|
238
|
+
r_script_file = File.join(out_lib_dir, "/pi.R")
|
|
239
|
+
File.open(r_script_file,"w") {|line| line.puts r_script}
|
|
240
|
+
print `Rscript #{r_script_file} 1> /dev/null 2> /dev/null`
|
|
241
|
+
if File.exist?(out_r_csv)
|
|
242
|
+
pi_csv = File.readlines(out_r_csv)
|
|
243
|
+
pi_csv.each do |line|
|
|
244
|
+
line.chomp!
|
|
245
|
+
data = line.split(",")
|
|
246
|
+
tag = data[0].split("_")[-1].gsub(/\W/,"").to_sym
|
|
247
|
+
summary_hash[tag] += "," + data[1].to_f.round(4).to_s + "," + data[2].to_f.round(4).to_s
|
|
248
|
+
end
|
|
249
|
+
[:PR, :RT, :IN, :V1V3].each do |regions|
|
|
250
|
+
next unless summary_hash[regions]
|
|
251
|
+
seq_summary_out.puts regions.to_s + "," + summary_hash[regions]
|
|
252
|
+
end
|
|
253
|
+
File.unlink(out_r_csv)
|
|
254
|
+
end
|
|
255
|
+
File.unlink(r_script_file)
|
|
256
|
+
end
|
|
257
|
+
|
|
258
|
+
seq_summary_out.close
|
|
259
|
+
point_mutation_out.close
|
|
260
|
+
linkage_out.close
|
|
261
|
+
aa_report_out.close
|
|
262
|
+
|
|
263
|
+
summary_lines = File.readlines(seq_summary_file)
|
|
264
|
+
summary_lines.shift
|
|
265
|
+
|
|
266
|
+
tcs_PR = 0
|
|
267
|
+
tcs_RT = 0
|
|
268
|
+
tcs_IN = 0
|
|
269
|
+
tcs_V1V3 = 0
|
|
270
|
+
pi_RT = 0.0
|
|
271
|
+
pi_V1V3 = 0.0
|
|
272
|
+
dist20_RT = 0.0
|
|
273
|
+
dist20_V1V3 = 0.0
|
|
274
|
+
summary_lines.each do |line|
|
|
275
|
+
data = line.chomp.split(",")
|
|
276
|
+
if data[0] == "PR"
|
|
277
|
+
tcs_PR = data[4].to_i
|
|
278
|
+
elsif data[0] == "RT"
|
|
279
|
+
tcs_RT = data[4].to_i
|
|
280
|
+
pi_RT = data[6].to_f
|
|
281
|
+
dist20_RT = data[7].to_f
|
|
282
|
+
elsif data[0] == "IN"
|
|
283
|
+
tcs_IN = data[4].to_i
|
|
284
|
+
elsif data[0] == "V1V3"
|
|
285
|
+
tcs_V1V3 = data[1].to_i
|
|
286
|
+
pi_V1V3 = data[6].to_f
|
|
287
|
+
dist20_V1V3 = data[7].to_f
|
|
288
|
+
end
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
recency = ViralSeq::Recency.define(
|
|
292
|
+
tcs_RT: tcs_RT,
|
|
293
|
+
tcs_V1V3: tcs_V1V3,
|
|
294
|
+
pi_RT: pi_RT,
|
|
295
|
+
dist20_RT: dist20_RT,
|
|
296
|
+
pi_V1V3: pi_V1V3,
|
|
297
|
+
dist20_V1V3: dist20_V1V3
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
sdrm_lines = File.readlines(point_mutation_file)
|
|
301
|
+
sdrm_lines.shift
|
|
302
|
+
sdrm_PR = ""
|
|
303
|
+
sdrm_RT = ""
|
|
304
|
+
sdrm_IN = ""
|
|
305
|
+
sdrm_lines.each do |line|
|
|
306
|
+
data = line.chomp.split(",")
|
|
307
|
+
next if data[-1] == "*"
|
|
308
|
+
if data[0] == "PR"
|
|
309
|
+
sdrm_PR += abstract_line(data)
|
|
310
|
+
elsif data[0] =~ /NRTI/
|
|
311
|
+
sdrm_RT += abstract_line(data)
|
|
312
|
+
elsif data[0] == "IN"
|
|
313
|
+
sdrm_IN += abstract_line(data)
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
summary_json = [
|
|
318
|
+
sample_id: lib_name,
|
|
319
|
+
tcs_PR: tcs_PR,
|
|
320
|
+
tcs_RT: tcs_RT,
|
|
321
|
+
tcs_IN: tcs_IN,
|
|
322
|
+
tcs_V1V3: tcs_V1V3,
|
|
323
|
+
pi_RT: pi_RT,
|
|
324
|
+
dist20_RT: dist20_RT,
|
|
325
|
+
dist20_V1V3: dist20_V1V3,
|
|
326
|
+
recency: recency,
|
|
327
|
+
sdrm_PR: sdrm_PR,
|
|
328
|
+
sdrm_RT: sdrm_RT,
|
|
329
|
+
sdrm_IN: sdrm_IN
|
|
330
|
+
]
|
|
331
|
+
|
|
332
|
+
summary_json_out.puts JSON.pretty_generate(summary_json)
|
|
333
|
+
summary_json_out.close
|
|
334
|
+
|
|
335
|
+
csvs = [
|
|
336
|
+
{
|
|
337
|
+
name: "summary",
|
|
338
|
+
title: "Summary",
|
|
339
|
+
file: seq_summary_file,
|
|
340
|
+
newPDF: "",
|
|
341
|
+
table_width: [65,55,110,110,110,110,60,60],
|
|
342
|
+
extra_text: ""
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
name: "substitution",
|
|
346
|
+
title: "Surveillance Drug Resistance Mutations",
|
|
347
|
+
file: point_mutation_file,
|
|
348
|
+
newPDF: "",
|
|
349
|
+
table_width: [65,55,85,80,60,65,85,85,85,45],
|
|
350
|
+
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
name: "linkage",
|
|
354
|
+
title: "Mutation Linkage",
|
|
355
|
+
file: linkage_file,
|
|
356
|
+
newPDF: "",
|
|
357
|
+
table_width: [55,50,250,60,80,80,80,45],
|
|
358
|
+
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
|
359
|
+
}
|
|
360
|
+
]
|
|
361
|
+
|
|
362
|
+
csvs.each do |csv|
|
|
363
|
+
file_name = File.join(out_lib_dir, (csv[:name] + ".pdf"))
|
|
364
|
+
next unless File.exist? csv[:file]
|
|
365
|
+
Prawn::Document.generate(file_name, :page_layout => :landscape) do |pdf|
|
|
366
|
+
pdf.text((File.basename(lib, ".*") + ': ' + csv[:title]),
|
|
367
|
+
:size => 20,
|
|
368
|
+
:align => :center,
|
|
369
|
+
:style => :bold)
|
|
370
|
+
pdf.move_down 20
|
|
371
|
+
table_data = CSV.open(csv[:file]).to_a
|
|
372
|
+
header = table_data.first
|
|
373
|
+
pdf.table(table_data,
|
|
374
|
+
:header => header,
|
|
375
|
+
:position => :center,
|
|
376
|
+
:column_widths => csv[:table_width],
|
|
377
|
+
:row_colors => ["B6B6B6", "FFFFFF"],
|
|
378
|
+
:cell_style => {:align => :center, :size => 10}) do |table|
|
|
379
|
+
table.row(0).style :font_style => :bold, :size => 12 #, :background_color => 'ff00ff'
|
|
380
|
+
end
|
|
381
|
+
pdf.move_down 5
|
|
382
|
+
pdf.text(csv[:extra_text], :size => 8, :align => :justify,)
|
|
383
|
+
end
|
|
384
|
+
csv[:newPDF] = file_name
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
pdf = CombinePDF.new
|
|
388
|
+
csvs.each do |csv|
|
|
389
|
+
pdf << CombinePDF.load(csv[:newPDF]) if File.exist?(csv[:newPDF])
|
|
390
|
+
end
|
|
391
|
+
pdf << CombinePDF.load(out_r_pdf) if File.exist?(out_r_pdf)
|
|
392
|
+
|
|
393
|
+
pdf.number_pages location: [:bottom_right],
|
|
394
|
+
number_format: "Swanstrom\'s lab HIV SDRM Pipeline, version #{$sdrm_version_number} by S.Z. and M.U.C. Page %s",
|
|
395
|
+
font_size: 6,
|
|
396
|
+
opacity: 0.5
|
|
397
|
+
|
|
398
|
+
pdf.save File.join(out_lib_dir, (lib_name + ".pdf"))
|
|
399
|
+
|
|
400
|
+
csvs.each do |csv|
|
|
401
|
+
File.unlink csv[:newPDF]
|
|
402
|
+
end
|
|
403
|
+
end
|
|
404
|
+
|
|
405
|
+
log_file = File.join(File.dirname(indir), "sdrm_log.json")
|
|
406
|
+
|
|
407
|
+
File.open(log_file, 'w') { |f| f.puts JSON.pretty_generate(log) }
|
|
408
|
+
|
|
409
|
+
FileUtils.touch(File.join(outdir, ".done"))
|