viral_seq 1.0.9 → 1.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +45 -32
- data/bin/tcs +72 -141
- data/lib/viral_seq.rb +3 -0
- data/lib/viral_seq/seq_hash.rb +13 -6
- data/lib/viral_seq/seq_hash_pair.rb +6 -0
- data/lib/viral_seq/tcs_core.rb +303 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +4 -4
- data/bin/tcs_json_generator +0 -166
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14d880e9f39b2b87892bec9d4377b358643c880cf32c81872cff51e1007bc23b
|
4
|
+
data.tar.gz: 6ee1c3293e2b0403a2eac033335f7575625b2d35f32127b5b57be53e94b4ec7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 951b75ced84aa21cf5650baa6970f60a617d3f29d20c14acadacefabea23d6b584f25990453c2008f30197aaef055a94edbdbb45494bb12b6343d90bc6bd45fb
|
7
|
+
data.tar.gz: 68ac69b4ebd5438a8f73780db823c94aa5a78c7c26d02cfd6bec979244dd1d6452c3698ade0606ddbbaccc480ad85e603171c11648dbb0110c2f5dbb3355bb35
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -4,72 +4,76 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
4
4
|
|
5
5
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
6
6
|
|
7
|
-
##
|
7
|
+
## Install
|
8
8
|
|
9
|
+
```bash
|
9
10
|
$ gem install viral_seq
|
11
|
+
```
|
10
12
|
|
11
13
|
## Usage
|
12
14
|
|
13
|
-
|
15
|
+
### Executables
|
14
16
|
|
15
|
-
|
16
|
-
#!/usr/bin/env ruby
|
17
|
-
require 'viral_seq'
|
18
|
-
```
|
19
|
-
|
20
|
-
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
17
|
+
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
21
18
|
|
19
|
+
```bash
|
22
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
|
+
```
|
23
22
|
|
23
|
+
Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
$ tcs params.json
|
28
|
-
|
29
|
-
|
25
|
+
```bash
|
26
|
+
$ tcs -p params.json # run TCS pipeline with params.json
|
27
|
+
$ tcs -j # CLI to generate params.json
|
28
|
+
$ tcs -h # print out the help
|
29
|
+
```
|
30
30
|
|
31
|
-
|
31
|
+
## Some Examples
|
32
32
|
|
33
|
+
Load all ViralSeq classes by requiring 'viral_seq.rb' in your Ruby scripts.
|
33
34
|
|
34
|
-
|
35
|
+
```ruby
|
36
|
+
#!/usr/bin/env ruby
|
37
|
+
require 'viral_seq'
|
38
|
+
```
|
35
39
|
|
36
|
-
|
40
|
+
Load nucleotide sequences from a FASTA format sequence file
|
37
41
|
|
38
42
|
```ruby
|
39
43
|
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
40
44
|
```
|
41
45
|
|
42
|
-
|
46
|
+
Make an alignment (using MUSCLE)
|
43
47
|
|
44
48
|
```ruby
|
45
49
|
aligned_seqhash = my_seqhash.align
|
46
50
|
```
|
47
51
|
|
48
|
-
|
52
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
49
53
|
|
50
54
|
```ruby
|
51
55
|
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
52
56
|
```
|
53
57
|
|
54
|
-
|
58
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
55
59
|
|
56
60
|
```ruby
|
57
61
|
qc_seqhash = qc_seqhash.a3g
|
58
62
|
```
|
59
63
|
|
60
|
-
|
64
|
+
Calculate nucleotide diversity π
|
61
65
|
|
62
66
|
```ruby
|
63
67
|
qc_seqhash.pi
|
64
68
|
```
|
65
69
|
|
66
|
-
|
70
|
+
Calculate cut-off for minority variants based on Poisson model
|
67
71
|
|
68
72
|
```ruby
|
69
73
|
cut_off = qc_seqhash.pm
|
70
74
|
```
|
71
75
|
|
72
|
-
|
76
|
+
Examine for drug resistance mutations for HIV PR region
|
73
77
|
|
74
78
|
```ruby
|
75
79
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
@@ -77,13 +81,22 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
77
81
|
|
78
82
|
## Updates
|
79
83
|
|
80
|
-
Version 1.0
|
84
|
+
### Version 1.1.0-11112020:
|
85
|
+
|
86
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
87
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
88
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
89
|
+
4. a few optimizations.
|
90
|
+
5. TCS 2.1.0 delivered.
|
91
|
+
6. Tried parallel processing. Cannot achieve the goal because the `parallel` gem by default can't pool data from the memory of child processes, and `in_threads` does not help with the speed.
|
92
|
+
|
93
|
+
### Version 1.0.9-07182020:
|
81
94
|
|
82
95
|
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
83
96
|
|
84
97
|
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
85
98
|
|
86
|
-
Version 1.0.8-02282020:
|
99
|
+
### Version 1.0.8-02282020:
|
87
100
|
|
88
101
|
1. TCS pipeline (version 2.0.0) added as executable.
|
89
102
|
tcs - main TCS pipeline script.
|
@@ -94,14 +107,14 @@ Version 1.0.8-02282020:
|
|
94
107
|
|
95
108
|
3. Bug fix for several methods.
|
96
109
|
|
97
|
-
Version 1.0.7-01282020:
|
110
|
+
### Version 1.0.7-01282020:
|
98
111
|
|
99
112
|
1. Several methods added, including
|
100
113
|
ViralSeq::SeqHash#error_table
|
101
114
|
ViralSeq::SeqHash#random_select
|
102
115
|
2. Improved performance for several functions.
|
103
116
|
|
104
|
-
Version 1.0.6-07232019:
|
117
|
+
### Version 1.0.6-07232019:
|
105
118
|
|
106
119
|
1. Several methods added to ViralSeq::SeqHash, including
|
107
120
|
ViralSeq::SeqHash#size
|
@@ -110,33 +123,33 @@ Version 1.0.6-07232019:
|
|
110
123
|
ViralSeq::SeqHash#mutation
|
111
124
|
2. Update documentations and rspec samples.
|
112
125
|
|
113
|
-
Version 1.0.5-07112019:
|
126
|
+
### Version 1.0.5-07112019:
|
114
127
|
|
115
128
|
1. Update ViralSeq::SeqHash#sequence_locator.
|
116
129
|
Program will try to determine the direction (`+` or `-` of the query sequence)
|
117
130
|
2. update executable `locator` to have a column of `direction` in output .csv file
|
118
131
|
|
119
|
-
Version 1.0.4-07102019:
|
132
|
+
### Version 1.0.4-07102019:
|
120
133
|
|
121
134
|
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
122
135
|
2. Fix bugs in bin `locator`
|
123
136
|
|
124
|
-
Version 1.0.3-07102019:
|
137
|
+
### Version 1.0.3-07102019:
|
125
138
|
|
126
139
|
1. Bug fix.
|
127
140
|
|
128
|
-
Version 1.0.2-07102019:
|
141
|
+
### Version 1.0.2-07102019:
|
129
142
|
|
130
143
|
1. Fixed a gem loading issue.
|
131
144
|
|
132
|
-
Version 1.0.1-07102019:
|
145
|
+
### Version 1.0.1-07102019:
|
133
146
|
|
134
147
|
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
135
148
|
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
136
149
|
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
137
150
|
4. update documentations
|
138
151
|
|
139
|
-
Version 1.0.0-07092019:
|
152
|
+
### Version 1.0.0-07092019:
|
140
153
|
|
141
154
|
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
142
155
|
|
data/bin/tcs
CHANGED
@@ -28,114 +28,79 @@
|
|
28
28
|
require 'viral_seq'
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
|
+
require 'OptionParser'
|
31
32
|
|
33
|
+
options = {}
|
32
34
|
|
33
|
-
|
35
|
+
banner = '-'*50 + "\n" +
|
36
|
+
'| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
|
37
|
+
'-'*50 + "\n"
|
34
38
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
n = 2
|
41
|
-
else
|
42
|
-
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
43
|
-
end
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = banner + "Usage: tcs -j"
|
41
|
+
opts.on "-j", "--json_generator", "Command line interfac to generate new params json file" do |j|
|
42
|
+
options[:json_generator] = true
|
43
|
+
end
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
50
|
-
end
|
45
|
+
opts.on("-p", "--params PARAMS_JSON", "Execute the pipeline with input params json file") do |p|
|
46
|
+
options[:params_json] = p
|
47
|
+
end
|
51
48
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
elsif m <= 8500
|
56
|
-
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
57
|
-
else
|
58
|
-
n = 0.0079 * m + 9.4869
|
59
|
-
end
|
49
|
+
opts.on("-h", "--help", "Prints this help") do
|
50
|
+
puts opts
|
51
|
+
exit
|
60
52
|
end
|
61
53
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
54
|
+
opts.on("-v", "--version", "Version info") do
|
55
|
+
puts "tcs version: " + ViralSeq::TCS_VERSION.red.bold
|
56
|
+
puts "viral_seq version: " + ViralSeq::VERSION.red.bold
|
57
|
+
exit
|
58
|
+
end
|
66
59
|
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
# opts.on("--no-parallel", "toggle off parallel processing") do
|
61
|
+
# options[:no_parallel] = true
|
62
|
+
# end
|
63
|
+
end.parse!
|
70
64
|
|
71
|
-
|
72
|
-
|
65
|
+
if options[:json_generator]
|
66
|
+
params = ViralSeq::TcsJson.generate
|
67
|
+
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
68
|
+
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
69
|
+
else
|
70
|
+
abort "No params JSON file found. Script terminated.".red
|
73
71
|
end
|
74
72
|
|
75
|
-
params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
|
76
|
-
|
77
73
|
indir = params[:raw_sequence_dir]
|
78
74
|
|
79
75
|
unless File.exist?(indir)
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
libname = File.basename(indir)
|
84
|
-
|
85
|
-
# obtain R1 and R2 file path
|
86
|
-
files = []
|
87
|
-
Dir.chdir(indir) do
|
88
|
-
files = Dir.glob("*")
|
76
|
+
abort "No input sequence directory found. Script terminated.".red.bold
|
89
77
|
end
|
90
78
|
|
91
|
-
|
92
|
-
raise "Input dir does not contain files. Script terminated."
|
93
|
-
end
|
79
|
+
# log file
|
94
80
|
|
95
|
-
r1_f = ""
|
96
|
-
r2_f = ""
|
97
|
-
|
98
|
-
# unzip .fasta.gz
|
99
|
-
def unzip_r(indir, f)
|
100
|
-
r_file = indir + "/" + f
|
101
|
-
if f =~ /.gz/
|
102
|
-
`gzip -d #{r_file}`
|
103
|
-
new_f = f.sub ".gz", ""
|
104
|
-
r_file = File.join(indir, new_f)
|
105
|
-
end
|
106
|
-
return r_file
|
107
|
-
end
|
108
81
|
runtime_log_file = File.join(indir,"runtime.log")
|
109
82
|
log = File.open(runtime_log_file, "w")
|
110
83
|
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
111
84
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
112
85
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
113
86
|
|
87
|
+
libname = File.basename indir
|
114
88
|
|
115
|
-
|
116
|
-
t = f.split("_")
|
117
|
-
if t.size == 1
|
118
|
-
tag = f
|
119
|
-
else
|
120
|
-
tag = f.split("_")[1..-1].join("_")
|
121
|
-
end
|
89
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
122
90
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
log.puts "R1 file not found. Script terminated."
|
133
|
-
raise "R1 file not found. Script terminated."
|
91
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
92
|
+
r1_f = seq_files[:r1_file]
|
93
|
+
r2_f = seq_files[:r2_file]
|
94
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
95
|
+
exit_sig = "Missing R2 file. Aborted."
|
96
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
97
|
+
exit_sig = "Missing R1 file. Aborted."
|
98
|
+
else
|
99
|
+
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
134
100
|
end
|
135
101
|
|
136
|
-
|
137
|
-
|
138
|
-
raise "R2 file not found. Script terminated."
|
102
|
+
if exit_sig
|
103
|
+
ViralSeq::TcsCore.log_and_abort log, exit_sig
|
139
104
|
end
|
140
105
|
|
141
106
|
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
@@ -152,10 +117,10 @@ end
|
|
152
117
|
|
153
118
|
primers = params[:primer_pairs]
|
154
119
|
if primers.empty?
|
155
|
-
|
156
|
-
raise "No primer information. Script terminated."
|
120
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
157
121
|
end
|
158
122
|
|
123
|
+
|
159
124
|
primers.each do |primer|
|
160
125
|
summary_json = {}
|
161
126
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -179,66 +144,25 @@ primers.each do |primer|
|
|
179
144
|
summary_json[:cdan_primer] = cdna_primer
|
180
145
|
summary_json[:forward_primer] = forward_primer
|
181
146
|
|
182
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
147
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
183
148
|
summary_json[:majority_cut_off] = majority_cut_off
|
184
149
|
|
185
150
|
summary_json[:total_raw_sequence] = raw_sequence_number
|
186
151
|
|
187
152
|
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
188
153
|
|
189
|
-
|
190
|
-
r2_raw = r2_fastq_sh.dna_hash
|
191
|
-
|
154
|
+
# filter R1
|
192
155
|
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
193
|
-
|
194
|
-
|
195
|
-
forward_n = $1.size
|
196
|
-
forward_bio_primer = $2
|
197
|
-
else
|
198
|
-
forward_n = 0
|
199
|
-
forward_bio_primer = forward_primer
|
200
|
-
end
|
201
|
-
forward_bio_primer_size = forward_bio_primer.size
|
202
|
-
forward_starting_number = forward_n + forward_bio_primer_size
|
203
|
-
|
204
|
-
# filter R1 sequences with forward primers.
|
205
|
-
forward_primer_ref = forward_bio_primer.nt_parser
|
206
|
-
r1_passed_seq = {}
|
207
|
-
r1_raw.each do |name,seq|
|
208
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
209
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
210
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
211
|
-
|
212
|
-
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
213
|
-
if primer_region_seq =~ forward_primer_ref
|
214
|
-
r1_passed_seq[name.split("\s")[0]] = seq
|
215
|
-
end
|
216
|
-
end
|
156
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
157
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
217
158
|
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
218
|
-
|
219
159
|
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
220
160
|
|
161
|
+
# filter R2
|
221
162
|
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
222
|
-
|
223
|
-
|
224
|
-
pid_length =
|
225
|
-
cdna_bio_primer = $2
|
226
|
-
cdna_bio_primer_size = cdna_bio_primer.size
|
227
|
-
reverse_starting_number = pid_length + cdna_bio_primer_size
|
228
|
-
|
229
|
-
# filter R2 sequences with cDNA primers.
|
230
|
-
cdna_primer_ref = cdna_bio_primer.nt_parser
|
231
|
-
r2_passed_seq = {}
|
232
|
-
r2_raw.each do |name, seq|
|
233
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
234
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
235
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
236
|
-
|
237
|
-
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
238
|
-
if primer_region_seq =~ cdna_primer_ref
|
239
|
-
r2_passed_seq[name.split("\s")[0]] = seq
|
240
|
-
end
|
241
|
-
end
|
163
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
164
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
165
|
+
pid_length = filter_r2[:pid_length]
|
242
166
|
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
243
167
|
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
244
168
|
|
@@ -257,8 +181,8 @@ primers.each do |primer|
|
|
257
181
|
r2_seq = r2_passed_seq[seqtag]
|
258
182
|
pid = r2_seq[0, pid_length]
|
259
183
|
id[seqtag] = pid
|
260
|
-
bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
|
261
|
-
bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
|
184
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
185
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
262
186
|
end
|
263
187
|
|
264
188
|
# TCS cut-off
|
@@ -278,11 +202,10 @@ primers.each do |primer|
|
|
278
202
|
end
|
279
203
|
|
280
204
|
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
281
|
-
consensus_cutoff = calculate_cut_off(max_id,error_rate)
|
205
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
282
206
|
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
283
207
|
summary_json[:consensus_cutoff] = consensus_cutoff
|
284
208
|
summary_json[:length_of_pid] = pid_length
|
285
|
-
|
286
209
|
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
287
210
|
|
288
211
|
# Primer ID over the cut-off
|
@@ -355,6 +278,8 @@ primers.each do |primer|
|
|
355
278
|
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
356
279
|
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
357
280
|
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
281
|
+
|
282
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
358
283
|
next if r1_consensus =~ /[^ATCG]/
|
359
284
|
next if r2_consensus =~ /[^ATCG]/
|
360
285
|
|
@@ -404,6 +329,7 @@ primers.each do |primer|
|
|
404
329
|
f1.close
|
405
330
|
f2.close
|
406
331
|
|
332
|
+
# Primer ID distribution in .json file
|
407
333
|
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
408
334
|
pid_json = {}
|
409
335
|
pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
|
@@ -413,11 +339,14 @@ primers.each do |primer|
|
|
413
339
|
f.puts JSON.pretty_generate(pid_json)
|
414
340
|
end
|
415
341
|
|
342
|
+
# start end-join
|
416
343
|
def end_join(dir, option, overlap)
|
417
344
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
418
345
|
case option
|
419
346
|
when 1
|
420
347
|
joined_sh = shp.join1()
|
348
|
+
when 2
|
349
|
+
joined_sh = shp.join1(overlap)
|
421
350
|
when 3
|
422
351
|
joined_sh = shp.join2
|
423
352
|
when 4
|
@@ -489,9 +418,10 @@ primers.each do |primer|
|
|
489
418
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
490
419
|
|
491
420
|
if export_raw
|
492
|
-
joined_sh_raw =
|
421
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
493
422
|
end
|
494
423
|
end
|
424
|
+
|
495
425
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
496
426
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
497
427
|
if primer[:trim]
|
@@ -499,10 +429,11 @@ primers.each do |primer|
|
|
499
429
|
trim_end = primer[:trim_ref_end]
|
500
430
|
trim_ref = primer[:trim_ref].to_sym
|
501
431
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
432
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
433
|
+
if export_raw
|
434
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
435
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
436
|
+
end
|
506
437
|
end
|
507
438
|
end
|
508
439
|
|
data/lib/viral_seq.rb
CHANGED
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
|
|
35
35
|
require_relative "viral_seq/sequence"
|
36
36
|
require_relative "viral_seq/string"
|
37
37
|
require_relative "viral_seq/version"
|
38
|
+
require_relative "viral_seq/tcs_core"
|
39
|
+
require_relative "viral_seq/tcs_json"
|
40
|
+
|
38
41
|
|
39
42
|
require "muscle_bio"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -9,7 +9,7 @@ module ViralSeq
|
|
9
9
|
# # align with MUSCLE
|
10
10
|
# filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
11
11
|
# # filter nt sequences with the reference coordinates
|
12
|
-
# filtered_seqhash = aligned_pr_seqhash.stop_codon[
|
12
|
+
# filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
|
13
13
|
# # return a new ViralSeq::SeqHash object without stop codons
|
14
14
|
# filtered_seqhash = filtered_seqhash.a3g[1]
|
15
15
|
# # further filter out sequences with A3G hypermutations
|
@@ -351,7 +351,7 @@ module ViralSeq
|
|
351
351
|
|
352
352
|
|
353
353
|
# create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
|
354
|
-
# @param cutoff [Float] majority cut-off for calling consensus bases. defult at
|
354
|
+
# @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will use the simple majority rule (no cutoff)
|
355
355
|
# @return [String] consensus sequence
|
356
356
|
# @example consensus sequence from an array of sequences.
|
357
357
|
# seq_array = %w{ ATTTTTTTTT
|
@@ -383,11 +383,18 @@ module ViralSeq
|
|
383
383
|
base_count = all_base.count_freq
|
384
384
|
max_base_list = []
|
385
385
|
|
386
|
-
|
387
|
-
|
388
|
-
|
386
|
+
if cutoff.zero?
|
387
|
+
max_count = base_count.values.max
|
388
|
+
max_base_hash = base_count.select {|_k,v| v == max_count}
|
389
|
+
max_base_list = max_base_hash.keys
|
390
|
+
else
|
391
|
+
base_count.each do |k,v|
|
392
|
+
if v/seq_size.to_f >= cutoff
|
393
|
+
max_base_list << k
|
394
|
+
end
|
389
395
|
end
|
390
396
|
end
|
397
|
+
|
391
398
|
consensus_seq += call_consensus_base(max_base_list)
|
392
399
|
end
|
393
400
|
return consensus_seq
|
@@ -398,7 +405,7 @@ module ViralSeq
|
|
398
405
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
399
406
|
# # use the sample consensus to determine potential a3g sites
|
400
407
|
# # Two criteria to identify hypermutation
|
401
|
-
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G
|
408
|
+
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
402
409
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
403
410
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
404
411
|
# # b/c Poisson model does not do well on small sample size.
|
@@ -80,6 +80,12 @@ module ViralSeq
|
|
80
80
|
alias_method :fa, :new_from_fasta
|
81
81
|
end
|
82
82
|
|
83
|
+
# the size of nt sequence hash of the SeqHashPair object
|
84
|
+
# @return [Integer] size of nt sequence hash of the SeqHashPair object
|
85
|
+
def size
|
86
|
+
self.dna_hash.size
|
87
|
+
end
|
88
|
+
|
83
89
|
# Pair-end join function for KNOWN overlap size.
|
84
90
|
# @param overlap [Integer] how many bases are overlapped. `0` means no overlap, R1 and R2 will be simply put together.
|
85
91
|
# @param diff [Integer, Float] the maximum mismatch rate allowed for the overlapping region. default at 0.0, i.e. no mis-match allowed.
|
@@ -0,0 +1,303 @@
|
|
1
|
+
module ViralSeq
|
2
|
+
|
3
|
+
# Core functions for `tcs` pipeline
|
4
|
+
|
5
|
+
class TcsCore
|
6
|
+
class << self
|
7
|
+
|
8
|
+
# methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
|
9
|
+
|
10
|
+
def calculate_cut_off(m, error_rate = 0.02)
|
11
|
+
n = 0
|
12
|
+
case error_rate
|
13
|
+
when 0.005...0.015
|
14
|
+
if m <= 10
|
15
|
+
n = 2
|
16
|
+
else
|
17
|
+
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
18
|
+
end
|
19
|
+
|
20
|
+
when 0...0.005
|
21
|
+
if m <= 10
|
22
|
+
n = 2
|
23
|
+
else
|
24
|
+
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
25
|
+
end
|
26
|
+
|
27
|
+
else
|
28
|
+
if m <= 10
|
29
|
+
n = 2
|
30
|
+
elsif m <= 8500
|
31
|
+
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
32
|
+
else
|
33
|
+
n = 0.0079 * m + 9.4869
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
n = n.round
|
38
|
+
n = 2 if n < 3
|
39
|
+
return n
|
40
|
+
end
|
41
|
+
|
42
|
+
# identify which file in the directory is R1 file, and which is R2 file based on file names
|
43
|
+
# input as directory (Dir object or a string of path)
|
44
|
+
# by default, .gz files will be unzipped.
|
45
|
+
# return as a hash of {r1_file: file1, r2_file: file2}
|
46
|
+
def r1r2(directory, unzip = true)
|
47
|
+
files = []
|
48
|
+
Dir.chdir(directory) { files = Dir.glob "*" }
|
49
|
+
r1_file = ""
|
50
|
+
r2_file = ""
|
51
|
+
files.each do |f|
|
52
|
+
tag = parser_file_name(f)[:tag]
|
53
|
+
|
54
|
+
if tag.include? "R1"
|
55
|
+
unzip ? r1_file = unzip_r(directory, f) : r1_file = File.join(directory, f)
|
56
|
+
elsif tag.include? "R2"
|
57
|
+
unzip ? r2_file = unzip_r(directory, f) : r2_file = File.join(directory, f)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
return { r1_file: r1_file, r2_file: r2_file }
|
61
|
+
end # end of ViralSeq:TcsCore.r1r2
|
62
|
+
|
63
|
+
# sort directories containing multiple r1 and r2 files.
|
64
|
+
# use the library name (first string before "_") to separate libraries
|
65
|
+
# out_dir is the Dir object or string of the output directory, by default named as directory + "_sorted"
|
66
|
+
# return a hash as { with_both_r1_r2: [lib1, lib2, ...], missing_r1: [lib1, lib2, ...], missing_r2: [lib1, lib2, ...], error: [lib1, lib2, ...]}
|
67
|
+
|
68
|
+
def sort_by_lib(directory, out_dir = directory + "_sorted")
|
69
|
+
Dir.mkdir(out_dir) unless File.directory?(out_dir)
|
70
|
+
files = []
|
71
|
+
Dir.chdir(directory) {files = Dir.glob("*")}
|
72
|
+
|
73
|
+
files.each do |file|
|
74
|
+
path = File.join(directory,file)
|
75
|
+
index = file.split("_")[0]
|
76
|
+
index_dir = File.join(out_dir, index)
|
77
|
+
Dir.mkdir(index_dir) unless File.directory?(index_dir)
|
78
|
+
File.rename(path, File.join(index_dir, file))
|
79
|
+
end
|
80
|
+
|
81
|
+
return_obj = { with_both_r1_r2: [],
|
82
|
+
missing_r1: [],
|
83
|
+
missing_r2: [],
|
84
|
+
error: []
|
85
|
+
}
|
86
|
+
|
87
|
+
libs = []
|
88
|
+
Dir.chdir(out_dir) { libs = Dir.glob('*') }
|
89
|
+
libs.each do |lib|
|
90
|
+
file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
|
91
|
+
if !file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
92
|
+
return_obj[:with_both_r1_r2] << lib
|
93
|
+
elsif file_check[:r1_file].empty? and !file_check[:r2_file].empty?
|
94
|
+
return_obj[:missing_r1] << lib
|
95
|
+
elsif file_check[:r2_file].empty? and !file_check[:r1_file].empty?
|
96
|
+
return_obj[:missing_r2] << lib
|
97
|
+
else
|
98
|
+
return_obj[:error] << lib
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return return_obj
|
102
|
+
end
|
103
|
+
|
104
|
+
# sort array of file names to determine if there is potential errors
|
105
|
+
# input name_array array of file names
|
106
|
+
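# output hash { errors: {error_type: [file names]}, all_pass: true/false, passed_names: [file names], passed_libs: {libname: [file names]} }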
|
107
|
+
|
108
|
+
def validate_file_name(name_array)
|
109
|
+
errors = { file_type_error: [] ,
|
110
|
+
missing_r1_file: [] ,
|
111
|
+
missing_r2_file: [] ,
|
112
|
+
extra_r1_r2_file: [],
|
113
|
+
no_region_tag: [] ,
|
114
|
+
multiple_region_tag: []}
|
115
|
+
|
116
|
+
passed_libs = {}
|
117
|
+
|
118
|
+
name_with_r1_r2 = []
|
119
|
+
|
120
|
+
name_array.each do |name|
|
121
|
+
tag = parser_file_name(name)[:tag]
|
122
|
+
if name !~ /\.fastq\Z|\.fastq\.gz\Z/
|
123
|
+
errors[:file_type_error] << name
|
124
|
+
elsif tag.count("R1") == 0 and tag.count("R2") == 0
|
125
|
+
errors[:no_region_tag] << name
|
126
|
+
elsif tag.count("R1") > 0 and tag.count("R2") > 0
|
127
|
+
errors[:multiple_region_tag] << name
|
128
|
+
elsif tag.count("R1") > 1 or tag.count("R2") > 1
|
129
|
+
errors[:multiple_region_tag] << name
|
130
|
+
else
|
131
|
+
name_with_r1_r2 << name
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
libs = {}
|
136
|
+
|
137
|
+
name_with_r1_r2.map do |name|
|
138
|
+
libname = parser_file_name(name)[:libname]
|
139
|
+
libs[libname] ||= []
|
140
|
+
libs[libname] << name
|
141
|
+
end
|
142
|
+
|
143
|
+
libs.each do |libname, files|
|
144
|
+
count_r1_file = 0
|
145
|
+
count_r2_file = 0
|
146
|
+
files.each do |name|
|
147
|
+
tag = parser_file_name(name)[:tag]
|
148
|
+
if tag.include? "R1"
|
149
|
+
count_r1_file += 1
|
150
|
+
elsif tag.include? "R2"
|
151
|
+
count_r2_file += 1
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
if count_r1_file > 1 or count_r2_file > 1
|
156
|
+
errors[:extra_r1_r2_file] += files
|
157
|
+
elsif count_r1_file.zero?
|
158
|
+
errors[:missing_r1_file] += files
|
159
|
+
elsif count_r2_file.zero?
|
160
|
+
errors[:missing_r2_file] += files
|
161
|
+
else
|
162
|
+
passed_libs[libname] = files
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
passed_names = []
|
167
|
+
|
168
|
+
passed_libs.values.each { |names| passed_names += names}
|
169
|
+
|
170
|
+
if passed_names.size < name_array.size
|
171
|
+
pass = false
|
172
|
+
else
|
173
|
+
pass = true
|
174
|
+
end
|
175
|
+
|
176
|
+
return { errors: errors, all_pass: pass, passed_names: passed_names, passed_libs: passed_libs }
|
177
|
+
end
|
178
|
+
|
179
|
+
# filter r1 raw sequences for non-specific primers.
|
180
|
+
# input r1_sh, SeqHash obj.
|
181
|
+
# return a Hash { r1_passed_seq: Hash of filtered sequence name and seq pairs, forward_starting_number: length of the leading N's plus the forward primer }
|
182
|
+
|
183
|
+
def filter_r1(r1_sh, forward_primer)
|
184
|
+
if forward_primer.match(/(N+)(\w+)$/)
|
185
|
+
forward_n = $1.size
|
186
|
+
forward_bio_primer = $2
|
187
|
+
else
|
188
|
+
forward_n = 0
|
189
|
+
forward_bio_primer = forward_primer
|
190
|
+
end
|
191
|
+
forward_bio_primer_size = forward_bio_primer.size
|
192
|
+
forward_starting_number = forward_n + forward_bio_primer_size
|
193
|
+
forward_primer_ref = forward_bio_primer.nt_parser
|
194
|
+
|
195
|
+
r1_passed_seq = {}
|
196
|
+
r1_raw = r1_sh.dna_hash
|
197
|
+
|
198
|
+
proc_filter = proc do |name|
|
199
|
+
seq = r1_raw[name]
|
200
|
+
next unless general_filter seq
|
201
|
+
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
202
|
+
if primer_region_seq =~ forward_primer_ref
|
203
|
+
new_name = remove_tag name
|
204
|
+
r1_passed_seq[new_name] = seq
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
r1_raw.keys.map do |name|
|
209
|
+
proc_filter.call name
|
210
|
+
end
|
211
|
+
|
212
|
+
return { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
|
213
|
+
end # end of filter_r1
|
214
|
+
|
215
|
+
# filter r2 raw sequences for non-specific primers.
|
216
|
+
# input r2_sh, SeqHash obj.
|
217
|
+
# return filtered Hash of sequence name and seq pair, as well as the length of PID.
|
218
|
+
def filter_r2(r2_sh, cdna_primer)
|
219
|
+
r2_raw = r2_sh.dna_hash
|
220
|
+
cdna_primer.match(/(N+)(\w+)$/)
|
221
|
+
pid_length = $1.size
|
222
|
+
cdna_bio_primer = $2
|
223
|
+
cdna_bio_primer_size = cdna_bio_primer.size
|
224
|
+
reverse_starting_number = pid_length + cdna_bio_primer_size
|
225
|
+
cdna_primer_ref = cdna_bio_primer.nt_parser
|
226
|
+
r2_passed_seq = {}
|
227
|
+
proc_filter = proc do |name|
|
228
|
+
seq = r2_raw[name]
|
229
|
+
next unless general_filter seq
|
230
|
+
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
231
|
+
if primer_region_seq =~ cdna_primer_ref
|
232
|
+
new_name = remove_tag name
|
233
|
+
r2_passed_seq[new_name] = seq
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
r2_raw.keys.map do |name|
|
238
|
+
proc_filter.call name
|
239
|
+
end
|
240
|
+
|
241
|
+
return { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
|
242
|
+
end # end of filter_r2
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
# write the error message to the log file handle, close the log, and abort with the same message
|
247
|
+
|
248
|
+
# Write a timestamped, tab-separated message to the log handle, close the
# handle, then abort the program with the same message (colorized via the
# String#red / #bold helpers).
def log_and_abort(log, infor)
  entry = Time.now.to_s + "\t" + infor
  log.puts entry
  log.close
  abort(infor.red.bold)
end
|
253
|
+
|
254
|
+
private
|
255
|
+
|
256
|
+
# Resolve the path of a raw read file, decompressing it in place first when
# its name ends in ".gz".
#
# @param indir [String] directory containing the file
# @param f [String] file name inside +indir+
# @return [String] path to the file, without the ".gz" suffix if it had one
def unzip_r(indir, f)
  r_file = File.join(indir, f)
  # Match a literal ".gz" suffix only; the previous unescaped, unanchored /.gz/
  # also fired on names such as "agz_r1.fastq".
  return r_file unless f =~ /\.gz\z/

  # Argument-vector form bypasses the shell, so spaces or metacharacters in the
  # path cannot split or inject into the command. As before, gzip's exit status
  # is not checked here.
  system("gzip", "-d", r_file)
  File.join(indir, f.sub(/\.gz\z/, ""))
end
|
265
|
+
|
266
|
+
# Split a file name into a library name and upper-cased tags.
# Only the text before the first "." is considered, and it is split on "_".
# A single piece means there is no library name: "lib" is used and the piece
# becomes the only tag. Otherwise the first piece is the library name and the
# remaining pieces are the tags.
def parser_file_name(file_name)
  pieces = file_name.split(".")[0].split("_")
  return { libname: "lib", tag: [pieces[0].upcase] } if pieces.size == 1

  { libname: pieces[0], tag: pieces[1..-1].map(&:upcase) }
end
|
277
|
+
|
278
|
+
# Decide whether a raw read is usable.
# Returns false when the read has an N anywhere except the first or last
# position, or contains a run of 11 A's or 11 T's (poly-A / poly-T indicates
# adaptor sequence); returns true otherwise.
def general_filter(seq)
  interior = seq[1..-2]
  return false if interior =~ /N/
  return false if seq =~ /A{11}/ || seq =~ /T{11}/

  true
end
|
289
|
+
|
290
|
+
# remove region info tags from the raw MiSeq sequences.
|
291
|
+
# Strip the trailing tag from a raw read name.
# When the name contains whitespace, everything before the first whitespace
# character is returned; otherwise the last two characters are dropped.
def remove_tag(seq_name)
  gap = seq_name =~ /\s/
  gap ? seq_name[0, gap] : seq_name[0..-3]
end
|
298
|
+
|
299
|
+
end # end of class << self
|
300
|
+
|
301
|
+
end # end of TcsCore module
|
302
|
+
|
303
|
+
end # end of main module
|
@@ -0,0 +1,178 @@
|
|
1
|
+
module ViralSeq
  # Interactive, terminal-based generator for the TCS pipeline's JSON params.
  class TcsJson
    class << self

      # Ask the user for every pipeline parameter, show the resulting JSON,
      # optionally save it to a file, and either return the params or abort.
      #
      # @return [Hash] the collected params, returned only when the user chooses
      #   to execute the pipeline right away; otherwise the process aborts with
      #   a hint on how to run `tcs` later
      # @raise [EOFError] when standard input ends before all answers are given
      def generate
        puts '-'*58
        puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
        puts '-'*58 + "\n"

        param = {}

        puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
        print '> '
        param[:raw_sequence_dir] = read_answer

        puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
        print '> '
        input_error = read_answer.to_f
        # to_f gives 0.0 for blank or non-numeric input, which selects the default.
        param[:platform_error_rate] = input_error == 0.0 ? 0.02 : input_error

        param[:primer_pairs] = []

        loop do
          param[:primer_pairs] << primer_pair_params
          print "Do you wish to conintue? Y/N \n> "
          break unless yes?(read_answer)
        end

        puts "\nYour JSON string is:"
        puts JSON.pretty_generate(param)

        print "\nDo you wish to save it as a file? Y/N \n> "
        if yes?(read_answer)
          print "Path to save JSON file:\n> "
          path = read_answer
          File.open(path, 'w') { |f| f.puts JSON.pretty_generate(param) }
        end

        print "\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "
        return param if yes?(read_answer)

        abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
      end

      private

      # Read one answer from standard input with the line ending and trailing
      # whitespace removed. Reads $stdin directly: bare Kernel#gets goes through
      # ARGF and would try to open leftover command-line arguments as files.
      def read_answer
        line = $stdin.gets
        raise EOFError, "input ended before all parameters were entered" if line.nil?

        line.chomp.rstrip
      end

      # True when the answer starts with "y"/"Y". The former /y|yes/i matched
      # any answer merely containing a "y" (e.g. "nay").
      def yes?(answer)
        answer.match?(/\A\s*y/i)
      end

      # True when the answer starts with "n"/"N".
      def no?(answer)
        answer.match?(/\A\s*n/i)
      end

      # Keep asking until the user enters one of +choices+ (Integers).
      # The raw answer stays a String so it can be echoed back colorized;
      # converting it to Integer inside the retry loop crashed on `.red`.
      #
      # @param choices [Array<Integer>] accepted option numbers
      # @param label [String] option name used in the retry message
      # @return [Integer] the chosen option
      def choose_option(choices, label)
        answer = read_answer
        until choices.include?(answer.to_i)
          print "Entered #{label} option #{answer.red.bold} not valid (choose #{choices.first}-#{choices.last}), try again\n> "
          answer = read_answer
        end
        answer.to_i
      end

      # Collect the settings for one sequenced region / primer pair.
      # QC and trimming are only asked when end-join is requested.
      #
      # @return [Hash] settings for the region
      def primer_pair_params
        data = {}
        puts "Enter the name for the sequenced region: "
        print '> '
        data[:region] = read_answer

        puts "Enter the #{"cDNA".red.bold} primer sequence: "
        print '> '
        data[:cdna] = read_answer

        puts "Enter the #{"forward".blue.bold} primer sequence: "
        print '> '
        data[:forward] = read_answer

        puts "Enter supermajority cut-off (0.5 - 1.0). Default Simple Majority"
        print '> '
        mj = read_answer.to_f
        # 0 is stored when the answer is blank or outside 0.5..1.0.
        data[:majority] = (0.5..1.0).include?(mj) ? mj : 0

        print "Need end-join? Y/N \n> "
        data[:end_join] = yes?(read_answer)
        return data unless data[:end_join]

        print "End-join option? Choose from (1-4):\n
        1: simple join, no overlap
        2: known overlap \n
        3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
        4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
        > "
        case choose_option([1, 2, 3, 4], "end-join")
        when 1
          data[:end_join_option] = 1
          data[:overlap] = 0
        when 2
          # Stored as option 1 plus an explicit overlap, as in the previous code.
          # NOTE(review): confirm this mapping is what the pipeline expects.
          data[:end_join_option] = 1
          print "overlap bases: \n> "
          data[:overlap] = read_answer.to_i
        when 3
          data[:end_join_option] = 3
        when 4
          data[:end_join_option] = 4
        end

        print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
        data[:TCS_QC] = yes?(read_answer)
        if data[:TCS_QC]
          data[:ref_genome] = get_ref

          print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
          data[:ref_start] = read_answer.to_i

          print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
          data[:ref_end] = read_answer.to_i

          print "allow indels? (default as yes) Y/N \n> "
          # Anything that is not an explicit "no" keeps the default (allowed).
          data[:indel] = !no?(read_answer)
        end

        print "Need trimming to a reference genome? Y/N \n> "
        data[:trim] = yes?(read_answer)
        if data[:trim]
          data[:trim_ref] = get_ref

          print "reference 5'end ref position \n> "
          data[:trim_ref_start] = read_answer.to_i

          print "reference 3'end ref position \n> "
          data[:trim_ref_end] = read_answer.to_i
        end

        data
      end

      # Ask which reference genome to use.
      #
      # @return [Symbol] :HXB2, :NL43 or :MAC239
      def get_ref
        puts "Choose reference genome (1-3):"
        puts "1. HIV-1 HXB2".red.bold
        puts "2. HIV-1 NL4-3".blue.bold
        puts "3. SIV MAC239".magenta.bold
        print "> "
        { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }.fetch(choose_option([1, 2, 3], "reference genome"))
      end
    end
  end # end TcsJson
end # end main module
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-11-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -90,7 +90,6 @@ email:
|
|
90
90
|
executables:
|
91
91
|
- locator
|
92
92
|
- tcs
|
93
|
-
- tcs_json_generator
|
94
93
|
extensions: []
|
95
94
|
extra_rdoc_files: []
|
96
95
|
files:
|
@@ -105,7 +104,6 @@ files:
|
|
105
104
|
- Rakefile
|
106
105
|
- bin/locator
|
107
106
|
- bin/tcs
|
108
|
-
- bin/tcs_json_generator
|
109
107
|
- lib/viral_seq.rb
|
110
108
|
- lib/viral_seq/constant.rb
|
111
109
|
- lib/viral_seq/enumerable.rb
|
@@ -120,6 +118,8 @@ files:
|
|
120
118
|
- lib/viral_seq/seq_hash_pair.rb
|
121
119
|
- lib/viral_seq/sequence.rb
|
122
120
|
- lib/viral_seq/string.rb
|
121
|
+
- lib/viral_seq/tcs_core.rb
|
122
|
+
- lib/viral_seq/tcs_json.rb
|
123
123
|
- lib/viral_seq/version.rb
|
124
124
|
- viral_seq.gemspec
|
125
125
|
homepage: https://github.com/ViralSeq/viral_seq
|
data/bin/tcs_json_generator
DELETED
@@ -1,166 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# TCS pipeline JSON params generator.
|
4
|
-
|
5
|
-
require 'viral_seq'
|
6
|
-
require 'colorize'
|
7
|
-
require 'json'
|
8
|
-
|
9
|
-
# Read one answer from standard input with the line ending and trailing
# whitespace removed. Reads $stdin directly: bare Kernel#gets goes through ARGF
# and would try to open leftover command-line arguments as files.
def read_answer
  line = $stdin.gets
  raise EOFError, "input ended before all parameters were entered" if line.nil?

  line.chomp.rstrip
end

# True when the answer starts with "y"/"Y" (the former /y|yes/i matched any
# answer merely containing a "y").
def yes?(answer)
  answer.match?(/\A\s*y/i)
end

# Keep asking until the user enters one of +choices+ (Integers) and return it.
# The raw answer stays a String so it can be echoed back colorized; converting
# it to Integer inside the retry loop crashed on `.red` at the second bad entry.
def choose_option(choices, label)
  answer = read_answer
  until choices.include?(answer.to_i)
    print "Entered #{label} option #{answer.red.bold} not valid (choose #{choices.first}-#{choices.last}), try again\n> "
    answer = read_answer
  end
  answer.to_i
end

# Ask which reference genome to use; returns :HXB2, :NL43 or :MAC239.
def get_ref
  puts "Choose reference genome (1-3):"
  puts "1. HIV-1 HXB2".red.bold
  puts "2. HIV-1 NL4-3".blue.bold
  puts "3. SIV MAC239".magenta.bold
  print "> "
  { 1 => :HXB2, 2 => :NL43, 3 => :MAC239 }.fetch(choose_option([1, 2, 3], "reference genome"))
end

puts "\n" + '-'*58
puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
puts '-'*58 + "\n"

param = {}

puts 'Enter the path to the directory that contains the MiSeq pair-end R1 and R2 .fastq or .fastq.gz file'
print '> '
param[:raw_sequence_dir] = read_answer

puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
print '> '
input_error = read_answer.to_f
# to_f gives 0.0 for blank or non-numeric input, which selects the default.
param[:platform_error_rate] = input_error == 0.0 ? 0.02 : input_error

param[:primer_pairs] = []

# One pass per sequenced region; QC and trimming are only asked when end-join
# is requested.
loop do
  data = {}
  puts "Enter the name for the sequenced region: "
  print '> '
  data[:region] = read_answer

  puts "Enter the #{"cDNA".red.bold} primer sequence: "
  print '> '
  data[:cdna] = read_answer

  puts "Enter the #{"forward".blue.bold} primer sequence: "
  print '> '
  data[:forward] = read_answer

  puts "Enter supermajority cut-off (0.5 - 0.9). Default: " + "0.5".blue.bold + " (simple majority)"
  print '> '
  mj = read_answer.to_f
  data[:majority] = (0.5..0.9).include?(mj) ? mj : 0.5

  print "Need end-join? Y/N \n> "
  data[:end_join] = yes?(read_answer)

  if data[:end_join]
    print "End-join option? Choose from (1-4):\n
    1: simple join, no overlap
    2: known overlap \n
    3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
    4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
    > "
    case choose_option([1, 2, 3, 4], "end-join")
    when 1
      data[:end_join_option] = 1
      data[:overlap] = 0
    when 2
      # Stored as option 1 plus an explicit overlap, as in the previous code.
      # NOTE(review): confirm this mapping is what the pipeline expects.
      data[:end_join_option] = 1
      print "overlap bases: \n> "
      data[:overlap] = read_answer.to_i
    when 3
      data[:end_join_option] = 3
    when 4
      data[:end_join_option] = 4
    end

    print "Need QC for TCS? (support for HIV-1 and SIV)? Y/N \n> "
    data[:TCS_QC] = yes?(read_answer)
    if data[:TCS_QC]
      data[:ref_genome] = get_ref

      print "reference 5'end ref position or posiiton range, 0 if no need to match this end \n> "
      data[:ref_start] = read_answer.to_i

      print "reference 3'end ref position or posiiton range: 0 if no need to match this end \n> "
      data[:ref_end] = read_answer.to_i

      print "allow indels? (default as yes) Y/N \n> "
      # Anything that is not an explicit "no" keeps the default (allowed).
      data[:indel] = !read_answer.match?(/\A\s*n/i)
    end

    print "Need trimming to a reference genome? Y/N \n> "
    data[:trim] = yes?(read_answer)
    if data[:trim]
      data[:trim_ref] = get_ref

      print "reference 5'end ref position \n> "
      data[:trim_ref_start] = read_answer.to_i

      print "reference 3'end ref position \n> "
      data[:trim_ref_end] = read_answer.to_i
    end
  end

  param[:primer_pairs] << data
  print "Do you wish to conintue? Y/N \n> "
  break unless yes?(read_answer)
end

puts "\nYour JSON string is:"
puts JSON.pretty_generate(param)

print "\nDo you wish to save it as a file? Y/N \n> "
if yes?(read_answer)
  print "Path to save JSON file:\n> "
  path = read_answer
  File.open(path, 'w') { |f| f.puts JSON.pretty_generate(param) }
end
|