viral_seq 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +45 -32
- data/bin/tcs +72 -141
- data/lib/viral_seq.rb +3 -0
- data/lib/viral_seq/seq_hash.rb +13 -6
- data/lib/viral_seq/seq_hash_pair.rb +6 -0
- data/lib/viral_seq/tcs_core.rb +303 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +4 -4
- data/bin/tcs_json_generator +0 -166
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14d880e9f39b2b87892bec9d4377b358643c880cf32c81872cff51e1007bc23b
|
4
|
+
data.tar.gz: 6ee1c3293e2b0403a2eac033335f7575625b2d35f32127b5b57be53e94b4ec7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 951b75ced84aa21cf5650baa6970f60a617d3f29d20c14acadacefabea23d6b584f25990453c2008f30197aaef055a94edbdbb45494bb12b6343d90bc6bd45fb
|
7
|
+
data.tar.gz: 68ac69b4ebd5438a8f73780db823c94aa5a78c7c26d02cfd6bec979244dd1d6452c3698ade0606ddbbaccc480ad85e603171c11648dbb0110c2f5dbb3355bb35
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -4,72 +4,76 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
4
4
|
|
5
5
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
6
6
|
|
7
|
-
##
|
7
|
+
## Install
|
8
8
|
|
9
|
+
```bash
|
9
10
|
$ gem install viral_seq
|
11
|
+
```
|
10
12
|
|
11
13
|
## Usage
|
12
14
|
|
13
|
-
|
15
|
+
### Executables
|
14
16
|
|
15
|
-
|
16
|
-
#!/usr/bin/env ruby
|
17
|
-
require 'viral_seq'
|
18
|
-
```
|
19
|
-
|
20
|
-
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
17
|
+
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
21
18
|
|
19
|
+
```bash
|
22
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
|
+
```
|
23
22
|
|
23
|
+
Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
$ tcs params.json
|
28
|
-
|
29
|
-
|
25
|
+
```bash
|
26
|
+
$ tcs -p params.json # run TCS pipeline with params.json
|
27
|
+
$ tcs -j # CLI to generate params.json
|
28
|
+
$ tcs -h # print out the help
|
29
|
+
```
|
30
30
|
|
31
|
-
|
31
|
+
## Some Examples
|
32
32
|
|
33
|
+
Load all ViralSeq classes by requiring 'viral_seq.rb' in your Ruby scripts.
|
33
34
|
|
34
|
-
|
35
|
+
```ruby
|
36
|
+
#!/usr/bin/env ruby
|
37
|
+
require 'viral_seq'
|
38
|
+
```
|
35
39
|
|
36
|
-
|
40
|
+
Load nucleotide sequences from a FASTA format sequence file
|
37
41
|
|
38
42
|
```ruby
|
39
43
|
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
40
44
|
```
|
41
45
|
|
42
|
-
|
46
|
+
Make an alignment (using MUSCLE)
|
43
47
|
|
44
48
|
```ruby
|
45
49
|
aligned_seqhash = my_seqhash.align
|
46
50
|
```
|
47
51
|
|
48
|
-
|
52
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
49
53
|
|
50
54
|
```ruby
|
51
55
|
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
52
56
|
```
|
53
57
|
|
54
|
-
|
58
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
55
59
|
|
56
60
|
```ruby
|
57
61
|
qc_seqhash = qc_seqhash.a3g
|
58
62
|
```
|
59
63
|
|
60
|
-
|
64
|
+
Calculate nucleotide diversity π
|
61
65
|
|
62
66
|
```ruby
|
63
67
|
qc_seqhash.pi
|
64
68
|
```
|
65
69
|
|
66
|
-
|
70
|
+
Calculate cut-off for minority variants based on Poisson model
|
67
71
|
|
68
72
|
```ruby
|
69
73
|
cut_off = qc_seqhash.pm
|
70
74
|
```
|
71
75
|
|
72
|
-
|
76
|
+
Examine for drug resistance mutations for HIV PR region
|
73
77
|
|
74
78
|
```ruby
|
75
79
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
@@ -77,13 +81,22 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
77
81
|
|
78
82
|
## Updates
|
79
83
|
|
80
|
-
Version 1.0
|
84
|
+
### Version 1.1.0-11112020:
|
85
|
+
|
86
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
87
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
88
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
89
|
+
4. a few optimizations.
|
90
|
+
5. TCS 2.1.0 delivered.
|
91
|
+
6. Tried parallel processing. Cannot achieve the goal because the `parallel` gem by default can't pool data from the memory of child processes, and `in_threads` does not help with the speed.
|
92
|
+
|
93
|
+
### Version 1.0.9-07182020:
|
81
94
|
|
82
95
|
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
83
96
|
|
84
97
|
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
85
98
|
|
86
|
-
Version 1.0.8-02282020:
|
99
|
+
### Version 1.0.8-02282020:
|
87
100
|
|
88
101
|
1. TCS pipeline (version 2.0.0) added as executable.
|
89
102
|
tcs - main TCS pipeline script.
|
@@ -94,14 +107,14 @@ Version 1.0.8-02282020:
|
|
94
107
|
|
95
108
|
3. Bug fix for several methods.
|
96
109
|
|
97
|
-
Version 1.0.7-01282020:
|
110
|
+
### Version 1.0.7-01282020:
|
98
111
|
|
99
112
|
1. Several methods added, including
|
100
113
|
ViralSeq::SeqHash#error_table
|
101
114
|
ViralSeq::SeqHash#random_select
|
102
115
|
2. Improved performance for several functions.
|
103
116
|
|
104
|
-
Version 1.0.6-07232019:
|
117
|
+
### Version 1.0.6-07232019:
|
105
118
|
|
106
119
|
1. Several methods added to ViralSeq::SeqHash, including
|
107
120
|
ViralSeq::SeqHash#size
|
@@ -110,33 +123,33 @@ Version 1.0.6-07232019:
|
|
110
123
|
ViralSeq::SeqHash#mutation
|
111
124
|
2. Update documentations and rspec samples.
|
112
125
|
|
113
|
-
Version 1.0.5-07112019:
|
126
|
+
### Version 1.0.5-07112019:
|
114
127
|
|
115
128
|
1. Update ViralSeq::SeqHash#sequence_locator.
|
116
129
|
Program will try to determine the direction (`+` or `-` of the query sequence)
|
117
130
|
2. update executable `locator` to have a column of `direction` in output .csv file
|
118
131
|
|
119
|
-
Version 1.0.4-07102019:
|
132
|
+
### Version 1.0.4-07102019:
|
120
133
|
|
121
134
|
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
122
135
|
2. Fix bugs in bin `locator`
|
123
136
|
|
124
|
-
Version 1.0.3-07102019:
|
137
|
+
### Version 1.0.3-07102019:
|
125
138
|
|
126
139
|
1. Bug fix.
|
127
140
|
|
128
|
-
Version 1.0.2-07102019:
|
141
|
+
### Version 1.0.2-07102019:
|
129
142
|
|
130
143
|
1. Fixed a gem loading issue.
|
131
144
|
|
132
|
-
Version 1.0.1-07102019:
|
145
|
+
### Version 1.0.1-07102019:
|
133
146
|
|
134
147
|
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
135
148
|
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
136
149
|
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
137
150
|
4. update documentations
|
138
151
|
|
139
|
-
Version 1.0.0-07092019:
|
152
|
+
### Version 1.0.0-07092019:
|
140
153
|
|
141
154
|
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
142
155
|
|
data/bin/tcs
CHANGED
@@ -28,114 +28,79 @@
|
|
28
28
|
require 'viral_seq'
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
|
+
require 'OptionParser'
|
31
32
|
|
33
|
+
options = {}
|
32
34
|
|
33
|
-
|
35
|
+
banner = '-'*50 + "\n" +
|
36
|
+
'| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
|
37
|
+
'-'*50 + "\n"
|
34
38
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
n = 2
|
41
|
-
else
|
42
|
-
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
43
|
-
end
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = banner + "Usage: tcs -j"
|
41
|
+
opts.on "-j", "--json_generator", "Command line interfac to generate new params json file" do |j|
|
42
|
+
options[:json_generator] = true
|
43
|
+
end
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
50
|
-
end
|
45
|
+
opts.on("-p", "--params PARAMS_JSON", "Execute the pipeline with input params json file") do |p|
|
46
|
+
options[:params_json] = p
|
47
|
+
end
|
51
48
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
elsif m <= 8500
|
56
|
-
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
57
|
-
else
|
58
|
-
n = 0.0079 * m + 9.4869
|
59
|
-
end
|
49
|
+
opts.on("-h", "--help", "Prints this help") do
|
50
|
+
puts opts
|
51
|
+
exit
|
60
52
|
end
|
61
53
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
54
|
+
opts.on("-v", "--version", "Version info") do
|
55
|
+
puts "tcs version: " + ViralSeq::TCS_VERSION.red.bold
|
56
|
+
puts "viral_seq version: " + ViralSeq::VERSION.red.bold
|
57
|
+
exit
|
58
|
+
end
|
66
59
|
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
# opts.on("--no-parallel", "toggle off parallel processing") do
|
61
|
+
# options[:no_parallel] = true
|
62
|
+
# end
|
63
|
+
end.parse!
|
70
64
|
|
71
|
-
|
72
|
-
|
65
|
+
if options[:json_generator]
|
66
|
+
params = ViralSeq::TcsJson.generate
|
67
|
+
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
68
|
+
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
69
|
+
else
|
70
|
+
abort "No params JSON file found. Script terminated.".red
|
73
71
|
end
|
74
72
|
|
75
|
-
params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
|
76
|
-
|
77
73
|
indir = params[:raw_sequence_dir]
|
78
74
|
|
79
75
|
unless File.exist?(indir)
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
libname = File.basename(indir)
|
84
|
-
|
85
|
-
# obtain R1 and R2 file path
|
86
|
-
files = []
|
87
|
-
Dir.chdir(indir) do
|
88
|
-
files = Dir.glob("*")
|
76
|
+
abort "No input sequence directory found. Script terminated.".red.bold
|
89
77
|
end
|
90
78
|
|
91
|
-
|
92
|
-
raise "Input dir does not contain files. Script terminated."
|
93
|
-
end
|
79
|
+
# log file
|
94
80
|
|
95
|
-
r1_f = ""
|
96
|
-
r2_f = ""
|
97
|
-
|
98
|
-
# unzip .fasta.gz
|
99
|
-
def unzip_r(indir, f)
|
100
|
-
r_file = indir + "/" + f
|
101
|
-
if f =~ /.gz/
|
102
|
-
`gzip -d #{r_file}`
|
103
|
-
new_f = f.sub ".gz", ""
|
104
|
-
r_file = File.join(indir, new_f)
|
105
|
-
end
|
106
|
-
return r_file
|
107
|
-
end
|
108
81
|
runtime_log_file = File.join(indir,"runtime.log")
|
109
82
|
log = File.open(runtime_log_file, "w")
|
110
83
|
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
111
84
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
112
85
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
113
86
|
|
87
|
+
libname = File.basename indir
|
114
88
|
|
115
|
-
|
116
|
-
t = f.split("_")
|
117
|
-
if t.size == 1
|
118
|
-
tag = f
|
119
|
-
else
|
120
|
-
tag = f.split("_")[1..-1].join("_")
|
121
|
-
end
|
89
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
122
90
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
log.puts "R1 file not found. Script terminated."
|
133
|
-
raise "R1 file not found. Script terminated."
|
91
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
92
|
+
r1_f = seq_files[:r1_file]
|
93
|
+
r2_f = seq_files[:r2_file]
|
94
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
95
|
+
exit_sig = "Missing R2 file. Aborted."
|
96
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
97
|
+
exit_sig = "Missing R1 file. Aborted."
|
98
|
+
else
|
99
|
+
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
134
100
|
end
|
135
101
|
|
136
|
-
|
137
|
-
|
138
|
-
raise "R2 file not found. Script terminated."
|
102
|
+
if exit_sig
|
103
|
+
ViralSeq::TcsCore.log_and_abort log, exit_sig
|
139
104
|
end
|
140
105
|
|
141
106
|
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
@@ -152,10 +117,10 @@ end
|
|
152
117
|
|
153
118
|
primers = params[:primer_pairs]
|
154
119
|
if primers.empty?
|
155
|
-
|
156
|
-
raise "No primer information. Script terminated."
|
120
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
157
121
|
end
|
158
122
|
|
123
|
+
|
159
124
|
primers.each do |primer|
|
160
125
|
summary_json = {}
|
161
126
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -179,66 +144,25 @@ primers.each do |primer|
|
|
179
144
|
summary_json[:cdan_primer] = cdna_primer
|
180
145
|
summary_json[:forward_primer] = forward_primer
|
181
146
|
|
182
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
147
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
183
148
|
summary_json[:majority_cut_off] = majority_cut_off
|
184
149
|
|
185
150
|
summary_json[:total_raw_sequence] = raw_sequence_number
|
186
151
|
|
187
152
|
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
188
153
|
|
189
|
-
|
190
|
-
r2_raw = r2_fastq_sh.dna_hash
|
191
|
-
|
154
|
+
# filter R1
|
192
155
|
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
193
|
-
|
194
|
-
|
195
|
-
forward_n = $1.size
|
196
|
-
forward_bio_primer = $2
|
197
|
-
else
|
198
|
-
forward_n = 0
|
199
|
-
forward_bio_primer = forward_primer
|
200
|
-
end
|
201
|
-
forward_bio_primer_size = forward_bio_primer.size
|
202
|
-
forward_starting_number = forward_n + forward_bio_primer_size
|
203
|
-
|
204
|
-
# filter R1 sequences with forward primers.
|
205
|
-
forward_primer_ref = forward_bio_primer.nt_parser
|
206
|
-
r1_passed_seq = {}
|
207
|
-
r1_raw.each do |name,seq|
|
208
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
209
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
210
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
211
|
-
|
212
|
-
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
213
|
-
if primer_region_seq =~ forward_primer_ref
|
214
|
-
r1_passed_seq[name.split("\s")[0]] = seq
|
215
|
-
end
|
216
|
-
end
|
156
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
157
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
217
158
|
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
218
|
-
|
219
159
|
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
220
160
|
|
161
|
+
# filter R2
|
221
162
|
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
222
|
-
|
223
|
-
|
224
|
-
pid_length =
|
225
|
-
cdna_bio_primer = $2
|
226
|
-
cdna_bio_primer_size = cdna_bio_primer.size
|
227
|
-
reverse_starting_number = pid_length + cdna_bio_primer_size
|
228
|
-
|
229
|
-
# filter R2 sequences with cDNA primers.
|
230
|
-
cdna_primer_ref = cdna_bio_primer.nt_parser
|
231
|
-
r2_passed_seq = {}
|
232
|
-
r2_raw.each do |name, seq|
|
233
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
234
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
235
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
236
|
-
|
237
|
-
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
238
|
-
if primer_region_seq =~ cdna_primer_ref
|
239
|
-
r2_passed_seq[name.split("\s")[0]] = seq
|
240
|
-
end
|
241
|
-
end
|
163
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
164
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
165
|
+
pid_length = filter_r2[:pid_length]
|
242
166
|
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
243
167
|
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
244
168
|
|
@@ -257,8 +181,8 @@ primers.each do |primer|
|
|
257
181
|
r2_seq = r2_passed_seq[seqtag]
|
258
182
|
pid = r2_seq[0, pid_length]
|
259
183
|
id[seqtag] = pid
|
260
|
-
bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
|
261
|
-
bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
|
184
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
185
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
262
186
|
end
|
263
187
|
|
264
188
|
# TCS cut-off
|
@@ -278,11 +202,10 @@ primers.each do |primer|
|
|
278
202
|
end
|
279
203
|
|
280
204
|
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
281
|
-
consensus_cutoff = calculate_cut_off(max_id,error_rate)
|
205
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
282
206
|
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
283
207
|
summary_json[:consensus_cutoff] = consensus_cutoff
|
284
208
|
summary_json[:length_of_pid] = pid_length
|
285
|
-
|
286
209
|
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
287
210
|
|
288
211
|
# Primer ID over the cut-off
|
@@ -355,6 +278,8 @@ primers.each do |primer|
|
|
355
278
|
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
356
279
|
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
357
280
|
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
281
|
+
|
282
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
358
283
|
next if r1_consensus =~ /[^ATCG]/
|
359
284
|
next if r2_consensus =~ /[^ATCG]/
|
360
285
|
|
@@ -404,6 +329,7 @@ primers.each do |primer|
|
|
404
329
|
f1.close
|
405
330
|
f2.close
|
406
331
|
|
332
|
+
# Primer ID distribution in .json file
|
407
333
|
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
408
334
|
pid_json = {}
|
409
335
|
pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
|
@@ -413,11 +339,14 @@ primers.each do |primer|
|
|
413
339
|
f.puts JSON.pretty_generate(pid_json)
|
414
340
|
end
|
415
341
|
|
342
|
+
# start end-join
|
416
343
|
def end_join(dir, option, overlap)
|
417
344
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
418
345
|
case option
|
419
346
|
when 1
|
420
347
|
joined_sh = shp.join1()
|
348
|
+
when 2
|
349
|
+
joined_sh = shp.join1(overlap)
|
421
350
|
when 3
|
422
351
|
joined_sh = shp.join2
|
423
352
|
when 4
|
@@ -489,9 +418,10 @@ primers.each do |primer|
|
|
489
418
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
490
419
|
|
491
420
|
if export_raw
|
492
|
-
joined_sh_raw =
|
421
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
493
422
|
end
|
494
423
|
end
|
424
|
+
|
495
425
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
496
426
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
497
427
|
if primer[:trim]
|
@@ -499,10 +429,11 @@ primers.each do |primer|
|
|
499
429
|
trim_end = primer[:trim_ref_end]
|
500
430
|
trim_ref = primer[:trim_ref].to_sym
|
501
431
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
432
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
433
|
+
if export_raw
|
434
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
435
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
436
|
+
end
|
506
437
|
end
|
507
438
|
end
|
508
439
|
|
data/lib/viral_seq.rb
CHANGED
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -351,7 +351,7 @@ module ViralSeq
|
|
351
351
|
|
352
352
|
|
353
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
354
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will use a simple majority rule (no cutoff)
|
355
355
|
# @return [String] consensus sequence
|
356
356
|
# @example consensus sequence from an array of sequences.
|
357
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -383,11 +383,18 @@ module ViralSeq
|
|
383
383
|
base_count = all_base.count_freq
|
384
384
|
max_base_list = []
|
385
385
|
|
386
|
-
|
387
|
-
|
388
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
389
395
|
end
|
390
396
|
end
|
397
|
+
|
391
398
|
consensus_seq += call_consensus_base(max_base_list)
|
392
399
|
end
|
393
400
|
return consensus_seq
|
@@ -398,7 +405,7 @@ module ViralSeq
|
|
398
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
399
406
|
# # use the sample consensus to determine potential a3g sites
|
400
407
|
# # Two criteria to identify hypermutation
|
401
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
402
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
403
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
404
411
|
# # b/c Poisson model does not do well on small sample size.
|
@@ -80,6 +80,12 @@ module ViralSeq
|
|
80
80
|
alias_method :fa, :new_from_fasta
|
81
81
|
end
|
82
82
|
|
83
|
+
# the size of nt sequence hash of the SeqHashPair object
|
84
|
+
# @return [Integer] size of nt sequence hash of the SeqHashPair object
|
85
|
+
def size
|
86
|
+
self.dna_hash.size
|
87
|
+
end
|
88
|
+
|
83
89
|
# Pair-end join function for KNOWN overlap size.
|
84
90
|
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
91
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
@@ -0,0 +1,303 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
|
3
|
+
# Core functions for `tcs` pipeline
|
4
|
+
|
5
|
+
class TcsCore
|
6
|
+
class << self
|
7
|
+
|
8
|
+
# methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
|
9
|
+
|
10
|
+
def calculate_cut_off(m, error_rate = 0.02)
|
11
|
+
n = 0
|
12
|
+
case error_rate
|
13
|
+
when 0.005...0.015
|
14
|
+
if m <= 10
|
15
|
+
n = 2
|
16
|
+
else
|
17
|
+
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
18
|
+
end
|
19
|
+
|
20
|
+
when 0...0.005
|
21
|
+
if m <= 10
|
22
|
+
n = 2
|
23
|
+
else
|
24
|
+
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
25
|
+
end
|
26
|
+
|
27
|
+
else
|
28
|
+
if m <= 10
|
29
|
+
n = 2
|
30
|
+
elsif m <= 8500
|
31
|
+
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
32
|
+
else
|
33
|
+
n = 0.0079 * m + 9.4869
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
n = n.round
|
38
|
+
n = 2 if n < 3
|
39
|
+
return n
|
40
|
+
end
|
41
|
+
|
42
|
+
# identify which file in the directory is R1 file, and which is R2 file based on file names
|
43
|
+
# input as directory (Dir object or a string of path)
|
44
|
+
# by default, .gz files will be unzipped.
|
45
|
+
# return as a hash of {r1_file: file1, r2_file: file2}
|
46
|
+
def r1r2(directory, unzip = true)
|
47
|
+
files = []
|
48
|
+
Dir.chdir(directory) { files = Dir.glob "*" }
|
49
|
+
r1_file = ""
|
50
|
+
r2_file = ""
|
51
|
+
files.each do |f|
|
52
|
+
tag = parser_file_name(f)[:tag]
|
53
|
+
|
54
|
+
if tag.include? "R1"
|
55
|
+
unzip ? r1_file = unzip_r(directory, f) : r1_file = File.join(directory, f)
|
56
|
+
elsif tag.include? "R2"
|
57
|
+
unzip ? r2_file = unzip_r(directory, f) : r2_file = File.join(directory, f)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
return { r1_file: r1_file, r2_file: r2_file }
|
61
|
+
end # end of ViralSeq:TcsCore.r1r2
|
62
|
+
|
63
|
+
# sort directories containing multiple r1 and r2 files.
|
64
|
+
# use the library name (first string before "_") to separate libraries
|
65
|
+
# out_dir is the Dir object or string of the output directory, by default named as directory + "_sorted"
|
66
|
+
# return a hash as { with_both_r1_r2: [lib1, lib2, ...], missing_r1: [lib1, lib2, ...], missing_r2: [lib1, lib2, ...], error: [lib1, lib2, ...]}
|
67
|
+
|
68
|
+
def sort_by_lib(directory, out_dir = directory + "_sorted")
|
69
|
+
Dir.mkdir(out_dir) unless File.directory?(out_dir)
|
70
|
+
files = []
|
71
|
+
Dir.chdir(directory) {files = Dir.glob("*")}
|
72
|
+
|
73
|
+
files.each do |file|
|
74
|
+
path = File.join(directory,file)
|
75
|
+
index = file.split("_")[0]
|
76
|
+
index_dir = File.join(out_dir, index)
|
77
|
+
Dir.mkdir(index_dir) unless File.directory?(index_dir)
|
78
|
+
File.rename(path, File.join(index_dir, file))
|
79
|
+
end
|
80
|
+
|
81
|
+
return_obj = { with_both_r1_r2: [],
|
82
|
+
missing_r1: [],
|
83
|
+
missing_r2: [],
|
84
|
+
error: []
|
85
|
+
}
|
86
|
+
|
87
|
+
libs = []
|
88
|
+
Dir.chdir(out_dir) { libs = Dir.glob('*') }
|
89
|
+
libs.each do |lib|
|
90
|
+
file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
|
91
|
+
if !file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
92
|
+
return_obj[:with_both_r1_r2] << lib
|
93
|
+
elsif file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
94
|
+
return_obj[:missing_r1] << lib
|
95
|
+
elsif file_check[:r2_file].empty? and !file_check[:r1_file].empty?
|
96
|
+
return_obj[:missing_r2] << lib
|
97
|
+
else
|
98
|
+
return_obj[:error] << lib
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return return_obj
|
102
|
+
end
|
103
|
+
|
104
|
+
# sort array of file names to determine if there are potential errors
|
105
|
+
# input name_array array of file names
|
106
|
+
# output hash { }
|
107
|
+
|
108
|
+
def validate_file_name(name_array)
|
109
|
+
errors = { file_type_error: [] ,
|
110
|
+
missing_r1_file: [] ,
|
111
|
+
missing_r2_file: [] ,
|
112
|
+
extra_r1_r2_file: [],
|
113
|
+
no_region_tag: [] ,
|
114
|
+
multiple_region_tag: []}
|
115
|
+
|
116
|
+
passed_libs = {}
|
117
|
+
|
118
|
+
name_with_r1_r2 = []
|
119
|
+
|
120
|
+
name_array.each do |name|
|
121
|
+
tag = parser_file_name(name)[:tag]
|
122
|
+
if name !~ /\.fastq\Z|\.fastq\.gz\Z/
|
123
|
+
errors[:file_type_error] << name
|
124
|
+
elsif tag.count("R1") == 0 and tag.count("R2") == 0
|
125
|
+
errors[:no_region_tag] << name
|
126
|
+
elsif tag.count("R1") > 0 and tag.count("R2") > 0
|
127
|
+
errors[:multiple_region_tag] << name
|
128
|
+
elsif tag.count("R1") > 1 or tag.count("R2") > 1
|
129
|
+
errors[:multiple_region_tag] << name
|
130
|
+
else
|
131
|
+
name_with_r1_r2 << name
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
libs = {}
|
136
|
+
|
137
|
+
name_with_r1_r2.map do |name|
|
138
|
+
libname = parser_file_name(name)[:libname]
|
139
|
+
libs[libname] ||= []
|
140
|
+
libs[libname] << name
|
141
|
+
end
|
142
|
+
|
143
|
+
libs.each do |libname, files|
|
144
|
+
count_r1_file = 0
|
145
|
+
count_r2_file = 0
|
146
|
+
files.each do |name|
|
147
|
+
tag = parser_file_name(name)[:tag]
|
148
|
+
if tag.include? "R1"
|
149
|
+
count_r1_file += 1
|
150
|
+
elsif tag.include? "R2"
|
151
|
+
count_r2_file += 1
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
if count_r1_file > 1 or count_r2_file > 1
|
156
|
+
errors[:extra_r1_r2_file] += files
|
157
|
+
elsif count_r1_file.zero?
|
158
|
+
errors[:missing_r1_file] += files
|
159
|
+
elsif count_r2_file.zero?
|
160
|
+
errors[:missing_r2_file] += files
|
161
|
+
else
|
162
|
+
passed_libs[libname] = files
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
passed_names = []
|
167
|
+
|
168
|
+
passed_libs.values.each { |names| passed_names += names}
|
169
|
+
|
170
|
+
if passed_names.size < name_array.size
|
171
|
+
pass = false
|
172
|
+
else
|
173
|
+
pass = true
|
174
|
+
end
|
175
|
+
|
176
|
+
return { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
|
177
|
+
end
|
178
|
+
|
179
|
+
# filter r1 raw sequences for non-specific primers.
|
180
|
+
# input r1_sh, SeqHash obj.
|
181
|
+
# return a hash { r1_passed_seq: filtered Hash of sequence name and seq pair, forward_starting_number: forward_n + size of the forward bio primer }
|
182
|
+
|
183
|
+
def filter_r1(r1_sh, forward_primer)
|
184
|
+
if forward_primer.match(/(N+)(\w+)$/)
|
185
|
+
forward_n = $1.size
|
186
|
+
forward_bio_primer = $2
|
187
|
+
else
|
188
|
+
forward_n = 0
|
189
|
+
forward_bio_primer = forward_primer
|
190
|
+
end
|
191
|
+
forward_bio_primer_size = forward_bio_primer.size
|
192
|
+
forward_starting_number = forward_n + forward_bio_primer_size
|
193
|
+
forward_primer_ref = forward_bio_primer.nt_parser
|
194
|
+
|
195
|
+
r1_passed_seq = {}
|
196
|
+
r1_raw = r1_sh.dna_hash
|
197
|
+
|
198
|
+
proc_filter = proc do |name|
|
199
|
+
seq = r1_raw[name]
|
200
|
+
next unless general_filter seq
|
201
|
+
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
202
|
+
if primer_region_seq =~ forward_primer_ref
|
203
|
+
new_name = remove_tag name
|
204
|
+
r1_passed_seq[new_name] = seq
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
r1_raw.keys.map do |name|
|
209
|
+
proc_filter.call name
|
210
|
+
end
|
211
|
+
|
212
|
+
return { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
|
213
|
+
end # end of filter_r1
|
214
|
+
|
215
|
+
# filter r2 raw sequences for non-specific primers.
|
216
|
+
# input r2_sh, SeqHash obj.
|
217
|
+
# return filtered Hash of sequence name and seq pair, as well as the length of PID.
|
218
|
+
def filter_r2(r2_sh, cdna_primer)
|
219
|
+
r2_raw = r2_sh.dna_hash
|
220
|
+
cdna_primer.match(/(N+)(\w+)$/)
|
221
|
+
pid_length = $1.size
|
222
|
+
cdna_bio_primer = $2
|
223
|
+
cdna_bio_primer_size = cdna_bio_primer.size
|
224
|
+
reverse_starting_number = pid_length + cdna_bio_primer_size
|
225
|
+
cdna_primer_ref = cdna_bio_primer.nt_parser
|
226
|
+
r2_passed_seq = {}
|
227
|
+
proc_filter = proc do |name|
|
228
|
+
seq = r2_raw[name]
|
229
|
+
next unless general_filter seq
|
230
|
+
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
231
|
+
if primer_region_seq =~ cdna_primer_ref
|
232
|
+
new_name = remove_tag name
|
233
|
+
r2_passed_seq[new_name] = seq
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
r2_raw.keys.map do |name|
|
238
|
+
proc_filter.call name
|
239
|
+
end
|
240
|
+
|
241
|
+
return { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
|
242
|
+
end # end of filter_r2
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
# puts error message in the log file handler, and abort with the same infor
|
247
|
+
|
248
|
+
# Record a fatal message and terminate.
#
# log   - open, writable IO-like log handle; it is closed by this method.
# infor - String message; written to the log prefixed by the current time
#         and a tab, then used (colorized) as the abort message.
#
# Does not return: Kernel#abort exits the process.
def log_and_abort(log, infor)
  timestamp = Time.now.to_s
  log.puts(timestamp + "\t" + infor)
  log.close
  abort(infor.red.bold)
end
|
253
|
+
|
254
|
+
private
|
255
|
+
|
256
|
+
# Resolve the path of a raw read file, decompressing it first when gzipped.
#
# indir - directory containing the file.
# f     - file name inside indir.
#
# Only names that END in ".gz" are treated as gzip archives. The archive is
# decompressed in place with `gzip -d` (which removes the .gz file) and the
# path without the ".gz" suffix is returned. Any other name is returned as
# File.join(indir, f) untouched.
#
# gzip is invoked with an argument vector rather than through a shell, so
# paths containing spaces or shell metacharacters are handled safely.
# A non-zero gzip exit status is not treated as an error.
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  return r_file unless f.end_with?(".gz")

  IO.popen(["gzip", "-d", r_file], &:read)
  r_file.sub(/\.gz\z/, "")
end
|
265
|
+
|
266
|
+
# Split a read file name into a library name and upper-cased tags.
#
# Only the part before the first "." is considered; it is split on "_".
#   "lib1_r1.fastq.gz" => { libname: "lib1", tag: ["R1"] }
#   "r1.fastq"         => { libname: "lib",  tag: ["R1"] }
#
# Names with nothing before the first "." (e.g. "" or ".DS_Store") yield
# { libname: "lib", tag: [] } instead of raising, so stray dotfiles in the
# input directory do not crash the caller.
#
# Returns a Hash with :libname (String) and :tag (Array of String).
def parser_file_name(file_name)
  fields = file_name.split(".").first.to_s.split("_")
  if fields.size > 1
    { libname: fields.first, tag: fields.drop(1).map(&:upcase) }
  else
    # Zero or one field: no library prefix was given, fall back to "lib".
    { libname: "lib", tag: fields.map(&:upcase) }
  end
end
|
277
|
+
|
278
|
+
# Quality gate applied to every raw read.
#
# A read is rejected (false) when:
#   * it contains an "N" anywhere except the first or last position, or
#   * it contains 11 or more consecutive A's (poly-A indicates adaptor), or
#   * it contains 11 or more consecutive T's (poly-T indicates adaptor).
#
# Returns true when the read passes all checks.
def general_filter(seq)
  return false if seq[1..-2].to_s.include?("N")

  ["A" * 11, "T" * 11].none? { |homopolymer| seq.include?(homopolymer) }
end
|
289
|
+
|
290
|
+
# remove region info tags from the raw MiSeq sequences.
|
291
|
+
# Strip the trailing read/region tag from a raw sequence name.
#
# When the name contains whitespace, everything before the first whitespace
# character is returned. Otherwise the last two characters are dropped
# (e.g. a "/1" or "_2" style suffix).
#
# Returns the shortened name as a String.
def remove_tag(seq_name)
  head, separator, = seq_name.partition(/\s/)
  separator.empty? ? seq_name[0..-3] : head
end
|
298
|
+
|
299
|
+
end # end of class << self
|
300
|
+
|
301
|
+
end # end of TcsCore module
|
302
|
+
|
303
|
+
end # end of main module
|
@@ -0,0 +1,178 @@
|
|
1
|
+
module ViralSeq
  # Interactive command-line wizard that asks for the parameters of a TCS
  # run on standard input and assembles them into a Hash (printed as JSON
  # and optionally saved to a file).
  class TcsJson
    class << self

      # Run the wizard.
      #
      # Prompts for the raw sequence directory, the platform error rate
      # (default 0.02 when the answer is empty or not a non-zero number) and
      # one or more primer pairs, prints the resulting JSON, and optionally
      # writes it to a user-supplied path.
      #
      # Returns the params Hash when the user chooses to execute the
      # pipeline right away; otherwise aborts the process with a hint on
      # how to run `tcs` with the saved file.
      def generate
        puts '-'*58
        puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
        puts '-'*58 + "\n"

        param = {}
        param[:raw_sequence_dir] = ask("Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file\n> ")

        input_error = ask('Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold + "\n> ").to_f
        param[:platform_error_rate] = input_error == 0.0 ? 0.02 : input_error

        param[:primer_pairs] = []
        loop do
          param[:primer_pairs] << ask_primer_pair
          break unless yes?(ask("Do you wish to conintue? Y/N \n> "))
        end

        puts "\nYour JSON string is:"
        puts JSON.pretty_generate(param)

        if yes?(ask("\nDo you wish to save it as a file? Y/N \n> "))
          path = ask("Path to save JSON file:\n> ")
          File.open(path, 'w') { |f| f.puts JSON.pretty_generate(param) }
        end

        if yes?(ask("\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "))
          param
        else
          abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
        end
      end

      private

      # Print +text+ (no newline is appended) and return the next line of
      # input with the line terminator and trailing whitespace removed.
      def ask(text)
        print text
        gets.chomp.rstrip
      end

      # True when the answer starts with "y"/"Y" (leading blanks ignored).
      # Anchored on purpose: an answer that merely contains a "y"
      # somewhere must not count as a yes.
      def yes?(answer)
        !(answer =~ /\A\s*y/i).nil?
      end

      # True when the answer starts with "n"/"N" (leading blanks ignored).
      def no?(answer)
        !(answer =~ /\A\s*n/i).nil?
      end

      # Ask for one sequenced region and return its settings as a Hash.
      # QC and trimming questions are only asked when end-join is wanted.
      def ask_primer_pair
        data = {}
        data[:region]  = ask("Enter the name for the sequenced region: \n> ")
        data[:cdna]    = ask("Enter the #{"cDNA".red.bold} primer sequence: \n> ")
        data[:forward] = ask("Enter the #{"forward".blue.bold} primer sequence: \n> ")

        mj = ask("Enter supermajority cut-off (0.5 - 1.0). Default Simple Majority\n> ").to_f
        # 0 is stored when no valid supermajority cut-off was entered.
        data[:majority] = (0.5..1.0).include?(mj) ? mj : 0

        unless yes?(ask("Need end-join? Y/N \n> "))
          data[:end_join] = false
          return data
        end

        data[:end_join] = true
        data.merge!(ask_end_join_option)
        data.merge!(ask_qc)
        data.merge!(ask_trim)
        data
      end

      # Ask which end-join strategy to use; re-prompts until 1-4 is entered.
      # Menu choices 1 and 2 are both stored as end_join_option 1 and differ
      # only in :overlap (0 versus the number of bases the user enters).
      def ask_end_join_option
        # Menu text; the blank lines between entries are intentional.
        menu = "End-join option? Choose from (1-4):\n\n" \
               "1: simple join, no overlap\n" \
               "2: known overlap \n\n" \
               "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n\n" \
               "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n\n" \
               "> "
        ej_option = ask(menu)
        # Keep the raw String between retries so it can be colorized in the
        # message; converting it to Integer here used to crash on `.red`.
        until [1, 2, 3, 4].include?(ej_option.to_i)
          ej_option = ask("Entered end-join option #{ej_option.to_s.red.bold} not valid (choose 1-4), try again\n> ")
        end

        case ej_option.to_i
        when 1
          { end_join_option: 1, overlap: 0 }
        when 2
          { end_join_option: 1, overlap: ask("overlap bases: \n> ").to_i }
        when 3
          { end_join_option: 3 }
        when 4
          { end_join_option: 4 }
        end
      end

      # Ask for the TCS QC settings and return them as a Hash.
      # NOTE(review): the prompts mention a "position range" but the answer
      # is reduced to a single Integer with to_i - confirm ranges are not
      # expected to survive here.
      def ask_qc
        return { TCS_QC: false } unless yes?(ask("Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "))

        qc = { TCS_QC: true }
        qc[:ref_genome] = get_ref
        qc[:ref_start] = ask("reference 5'end ref position or posiiton range, 0 if no need to match this end \n> ").to_i
        qc[:ref_end] = ask("reference 3'end ref position or posiiton range: 0 if no need to match this end \n> ").to_i
        # Indels are allowed unless the user explicitly answers no.
        qc[:indel] = !no?(ask("allow indels? (default as yes) Y/N \n> "))
        qc
      end

      # Ask for the reference-trimming settings and return them as a Hash.
      def ask_trim
        return { trim: false } unless yes?(ask("Need trimming to a reference genome? Y/N \n> "))

        trim = { trim: true }
        trim[:trim_ref] = get_ref
        trim[:trim_ref_start] = ask("reference 5'end ref position \n> ").to_i
        trim[:trim_ref_end] = ask("reference 3'end ref position \n> ").to_i
        trim
      end

      # Ask the user to pick a reference genome; re-prompts until 1-3 is
      # entered. Returns :HXB2, :NL43 or :MAC239.
      def get_ref
        genomes = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }
        puts "Choose reference genome (1-3):"
        puts "1. HIV-1 HXB2".red.bold
        puts "2. HIV-1 NL4-3".blue.bold
        puts "3. SIV MAC239".magenta.bold
        ref_option = ask("> ")
        until genomes.key?(ref_option.to_i)
          ref_option = ask("Entered reference genome option #{ref_option.to_s.red.bold} not valid (choose 1-3), try again\n> ")
        end
        genomes[ref_option.to_i]
      end
    end
  end # end TcsJson
end # end main module
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-11-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -90,7 +90,6 @@ email:
|
|
90
90
|
executables:
|
91
91
|
- locator
|
92
92
|
- tcs
|
93
|
-
- tcs_json_generator
|
94
93
|
extensions: []
|
95
94
|
extra_rdoc_files: []
|
96
95
|
files:
|
@@ -105,7 +104,6 @@ files:
|
|
105
104
|
- Rakefile
|
106
105
|
- bin/locator
|
107
106
|
- bin/tcs
|
108
|
-
- bin/tcs_json_generator
|
109
107
|
- lib/viral_seq.rb
|
110
108
|
- lib/viral_seq/constant.rb
|
111
109
|
- lib/viral_seq/enumerable.rb
|
@@ -120,6 +118,8 @@ files:
|
|
120
118
|
- lib/viral_seq/seq_hash_pair.rb
|
121
119
|
- lib/viral_seq/sequence.rb
|
122
120
|
- lib/viral_seq/string.rb
|
121
|
+
- lib/viral_seq/tcs_core.rb
|
122
|
+
- lib/viral_seq/tcs_json.rb
|
123
123
|
- lib/viral_seq/version.rb
|
124
124
|
- viral_seq.gemspec
|
125
125
|
homepage: https://github.com/ViralSeq/viral_seq
|
data/bin/tcs_json_generator
DELETED
@@ -1,166 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# TCS pipeline JSON params generator.
|
4
|
-
|
5
|
-
require 'viral_seq'
|
6
|
-
require 'colorize'
|
7
|
-
require 'json'
|
8
|
-
|
9
|
-
# Ask the user to pick a reference genome on standard input.
#
# Prints the three choices, then re-prompts until the answer converts
# (via to_i) to 1, 2 or 3.
#
# Returns :HXB2, :NL43 or :MAC239.
def get_ref
  genomes = { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }
  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "
  ref_option = gets.chomp.rstrip
  until genomes.key?(ref_option.to_i)
    # Keep the raw answer as a String so the message echoes what was typed.
    print "Entered reference genome option #{ref_option.red.bold} not valid (choose 1-3), try again\n> "
    ref_option = gets.chomp.rstrip
  end
  genomes[ref_option.to_i]
end
|
29
|
-
|
30
|
-
# Interactive wizard: collects the TCS run parameters on standard input,
# prints them as JSON and optionally saves them to a file.

# Print the prompt text (no newline appended) and return the next input
# line without its terminator or trailing whitespace.
read_answer = lambda do |prompt_text|
  print prompt_text
  gets.chomp.rstrip
end

# True when the answer starts with "y"/"Y" (leading blanks ignored);
# anchored so an answer merely containing a "y" is not taken as yes.
affirmative = ->(answer) { !(answer =~ /\A\s*y/i).nil? }

puts "\n" + '-'*58
puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
puts '-'*58 + "\n"

param = {}

param[:raw_sequence_dir] = read_answer.call("Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file\n> ")

input_error = read_answer.call('Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold + "\n> ").to_f
# An empty or non-numeric answer converts to 0.0 and selects the default.
param[:platform_error_rate] = input_error == 0.0 ? 0.02 : input_error

param[:primer_pairs] = []

loop do
  data = {}
  data[:region] = read_answer.call("Enter the name for the sequenced region: \n> ")
  data[:cdna] = read_answer.call("Enter the #{"cDNA".red.bold} primer sequence: \n> ")
  data[:forward] = read_answer.call("Enter the #{"forward".blue.bold} primer sequence: \n> ")

  mj = read_answer.call("Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)" + "\n> ").to_f
  data[:majority] = (0.5..0.9).include?(mj) ? mj : 0.5

  if affirmative.call(read_answer.call("Need end-join? Y/N \n> "))
    data[:end_join] = true

    # Menu text; the blank lines between entries are intentional.
    ej_menu = "End-join option? Choose from (1-4):\n\n" \
              "1: simple join, no overlap\n" \
              "2: known overlap \n\n" \
              "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n\n" \
              "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n\n" \
              "> "
    ej_option = read_answer.call(ej_menu)
    # Keep the raw String between retries; converting it to Integer here
    # used to crash on `.red` when a second invalid answer was entered.
    until [1, 2, 3, 4].include?(ej_option.to_i)
      ej_option = read_answer.call("Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again\n> ")
    end

    # Menu choices 1 and 2 are both stored as option 1; they differ only in
    # the overlap value.
    case ej_option.to_i
    when 1
      data[:end_join_option] = 1
      data[:overlap] = 0
    when 2
      data[:end_join_option] = 1
      data[:overlap] = read_answer.call("overlap bases: \n> ").to_i
    when 3
      data[:end_join_option] = 3
    when 4
      data[:end_join_option] = 4
    end

    if affirmative.call(read_answer.call("Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "))
      data[:TCS_QC] = true
      data[:ref_genome] = get_ref
      data[:ref_start] = read_answer.call("reference 5'end ref position or posiiton range, 0 if no need to match this end \n> ").to_i
      data[:ref_end] = read_answer.call("reference 3'end ref position or posiiton range: 0 if no need to match this end \n> ").to_i

      indel = read_answer.call("allow indels? (default as yes) Y/N \n> ")
      # Indels stay allowed unless the answer starts with "n"/"N".
      data[:indel] = (indel =~ /\A\s*n/i).nil?
    else
      data[:TCS_QC] = false
    end

    if affirmative.call(read_answer.call("Need trimming to a reference genome? Y/N \n> "))
      data[:trim] = true
      data[:trim_ref] = get_ref
      data[:trim_ref_start] = read_answer.call("reference 5'end ref position \n> ").to_i
      data[:trim_ref_end] = read_answer.call("reference 3'end ref position \n> ").to_i
    else
      data[:trim] = false
    end
  else
    data[:end_join] = false
  end

  param[:primer_pairs] << data
  break unless affirmative.call(read_answer.call("Do you wish to conintue? Y/N \n> "))
end

puts "\nYour JSON string is:"
puts JSON.pretty_generate(param)

if affirmative.call(read_answer.call("\nDo you wish to save it as a file? Y/N \n> "))
  path = read_answer.call("Path to save JSON file:\n> ")
  File.open(path, 'w') { |f| f.puts JSON.pretty_generate(param) }
end
|