viral_seq 1.0.9 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +67 -32
- data/bin/tcs +78 -143
- data/lib/viral_seq.rb +3 -0
- data/lib/viral_seq/constant.rb +5 -1
- data/lib/viral_seq/enumerable.rb +0 -10
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/math.rb +3 -3
- data/lib/viral_seq/sdrm.rb +43 -0
- data/lib/viral_seq/seq_hash.rb +15 -8
- data/lib/viral_seq/seq_hash_pair.rb +6 -0
- data/lib/viral_seq/tcs_core.rb +332 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +6 -5
- data/bin/tcs_json_generator +0 -166
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '048e85ab67fbb667919d02d4509a15111798b116b3f927c921d203dc8565a1a2'
|
4
|
+
data.tar.gz: 6951e410bd4f9b727a44fab1aa88f9cc263151cf9aed2a9c25ae9d866ed72450
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02cc87e245918a5c8f1b16b0db978da66e3bf7e83c6c6140c394c560d31c86ab1845b337a4f53b4b7883ff8e452e8caef0b036ea113c4416d6a29d16f419eb81
|
7
|
+
data.tar.gz: 9f53bd6c46f4a49b5c14b8b8019ffa3e1abcf442bf0a6cc09a7dbc768a474f2afd81d5cf168f38eb8ab5abbd601c60f1f824f753bc77dbcc0d3c0d93568b9ae3
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -4,86 +4,121 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
4
4
|
|
5
5
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
6
6
|
|
7
|
-
##
|
7
|
+
## Install
|
8
8
|
|
9
|
+
```bash
|
9
10
|
$ gem install viral_seq
|
11
|
+
```
|
10
12
|
|
11
13
|
## Usage
|
12
14
|
|
13
|
-
|
15
|
+
### Executables
|
14
16
|
|
15
|
-
|
16
|
-
#!/usr/bin/env ruby
|
17
|
-
require 'viral_seq'
|
18
|
-
```
|
19
|
-
|
20
|
-
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
17
|
+
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
21
18
|
|
19
|
+
```bash
|
22
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
|
+
```
|
23
22
|
|
23
|
+
Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
$ tcs params.json
|
28
|
-
|
29
|
-
|
25
|
+
```bash
|
26
|
+
$ tcs -p params.json # run TCS pipeline with params.json
|
27
|
+
$ tcs -j # CLI to generate params.json
|
28
|
+
$ tcs -h # print out the help
|
29
|
+
```
|
30
30
|
|
31
|
-
|
31
|
+
## Some Examples
|
32
32
|
|
33
|
+
Load all ViralSeq classes by requiring 'viral_seq.rb' in your Ruby scripts.
|
33
34
|
|
34
|
-
|
35
|
+
```ruby
|
36
|
+
#!/usr/bin/env ruby
|
37
|
+
require 'viral_seq'
|
38
|
+
```
|
35
39
|
|
36
|
-
|
40
|
+
Load nucleotide sequences from a FASTA format sequence file
|
37
41
|
|
38
42
|
```ruby
|
39
43
|
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
40
44
|
```
|
41
45
|
|
42
|
-
|
46
|
+
Make an alignment (using MUSCLE)
|
43
47
|
|
44
48
|
```ruby
|
45
49
|
aligned_seqhash = my_seqhash.align
|
46
50
|
```
|
47
51
|
|
48
|
-
|
52
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
49
53
|
|
50
54
|
```ruby
|
51
55
|
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
52
56
|
```
|
53
57
|
|
54
|
-
|
58
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
55
59
|
|
56
60
|
```ruby
|
57
61
|
qc_seqhash = qc_seqhash.a3g
|
58
62
|
```
|
59
63
|
|
60
|
-
|
64
|
+
Calculate nucleotide diversity π
|
61
65
|
|
62
66
|
```ruby
|
63
67
|
qc_seqhash.pi
|
64
68
|
```
|
65
69
|
|
66
|
-
|
70
|
+
Calculate cut-off for minority variants based on Poisson model
|
67
71
|
|
68
72
|
```ruby
|
69
73
|
cut_off = qc_seqhash.pm
|
70
74
|
```
|
71
75
|
|
72
|
-
|
76
|
+
Examine for drug resistance mutations for HIV PR region
|
73
77
|
|
74
78
|
```ruby
|
75
79
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
76
80
|
```
|
81
|
+
## Known issues
|
82
|
+
|
83
|
+
1. ~~have a conflict with rails.~~
|
84
|
+
2. ~~Update on 03032021. Still have conflict. But in the Rails Gemfile, you can just use `require: false` globally and only require "viral_seq" when the module is needed in a controller.~~
|
85
|
+
3. The conflict seems to be resolved. It was caused by a combination of using `!` as a function for factorial and the gem name `viral_seq`. @_@
|
77
86
|
|
78
87
|
## Updates
|
79
88
|
|
80
|
-
Version 1.0.
|
89
|
+
### Version 1.0.14-03052021
|
90
|
+
|
91
|
+
1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
|
92
|
+
|
93
|
+
### Version 1.0.13-03032021
|
94
|
+
|
95
|
+
1. Fixed the conflict with rails.
|
96
|
+
|
97
|
+
### Version 1.0.12-03032021
|
98
|
+
|
99
|
+
1. Fixed an issue that may cause conflicts with ActiveRecord.
|
100
|
+
|
101
|
+
### Version 1.0.11-03022021
|
102
|
+
|
103
|
+
1. Fixed an issue when calculating Poisson cutoff for minority mutations `ViralSeq::SeqHash.pm`.
|
104
|
+
2. Fixed an issue loading class 'OptionParser' in some Ruby environments.
|
105
|
+
|
106
|
+
### Version 1.0.10-11112020:
|
107
|
+
|
108
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
109
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
110
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
111
|
+
4. a few optimizations.
|
112
|
+
5. TCS 2.1.0 delivered.
|
113
|
+
6. Tried parallel processing. Could not achieve the goal because the `parallel` gem by default can't pool data from the memory of child processes, and `in_threads` does not help with the speed.
|
114
|
+
|
115
|
+
### Version 1.0.9-07182020:
|
81
116
|
|
82
117
|
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
83
118
|
|
84
119
|
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
85
120
|
|
86
|
-
Version 1.0.8-02282020:
|
121
|
+
### Version 1.0.8-02282020:
|
87
122
|
|
88
123
|
1. TCS pipeline (version 2.0.0) added as executable.
|
89
124
|
tcs - main TCS pipeline script.
|
@@ -94,14 +129,14 @@ Version 1.0.8-02282020:
|
|
94
129
|
|
95
130
|
3. Bug fix for several methods.
|
96
131
|
|
97
|
-
Version 1.0.7-01282020:
|
132
|
+
### Version 1.0.7-01282020:
|
98
133
|
|
99
134
|
1. Several methods added, including
|
100
135
|
ViralSeq::SeqHash#error_table
|
101
136
|
ViralSeq::SeqHash#random_select
|
102
137
|
2. Improved performance for several functions.
|
103
138
|
|
104
|
-
Version 1.0.6-07232019:
|
139
|
+
### Version 1.0.6-07232019:
|
105
140
|
|
106
141
|
1. Several methods added to ViralSeq::SeqHash, including
|
107
142
|
ViralSeq::SeqHash#size
|
@@ -110,33 +145,33 @@ Version 1.0.6-07232019:
|
|
110
145
|
ViralSeq::SeqHash#mutation
|
111
146
|
2. Update documentations and rspec samples.
|
112
147
|
|
113
|
-
Version 1.0.5-07112019:
|
148
|
+
### Version 1.0.5-07112019:
|
114
149
|
|
115
150
|
1. Update ViralSeq::SeqHash#sequence_locator.
|
116
151
|
Program will try to determine the direction (`+` or `-` of the query sequence)
|
117
152
|
2. update executable `locator` to have a column of `direction` in output .csv file
|
118
153
|
|
119
|
-
Version 1.0.4-07102019:
|
154
|
+
### Version 1.0.4-07102019:
|
120
155
|
|
121
156
|
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
122
157
|
2. Fix bugs in bin `locator`
|
123
158
|
|
124
|
-
Version 1.0.3-07102019:
|
159
|
+
### Version 1.0.3-07102019:
|
125
160
|
|
126
161
|
1. Bug fix.
|
127
162
|
|
128
|
-
Version 1.0.2-07102019:
|
163
|
+
### Version 1.0.2-07102019:
|
129
164
|
|
130
165
|
1. Fixed a gem loading issue.
|
131
166
|
|
132
|
-
Version 1.0.1-07102019:
|
167
|
+
### Version 1.0.1-07102019:
|
133
168
|
|
134
169
|
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
135
170
|
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
136
171
|
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
137
172
|
4. update documentations
|
138
173
|
|
139
|
-
Version 1.0.0-07092019:
|
174
|
+
### Version 1.0.0-07092019:
|
140
175
|
|
141
176
|
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
142
177
|
|
data/bin/tcs
CHANGED
@@ -28,114 +28,79 @@
|
|
28
28
|
require 'viral_seq'
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
|
+
require 'optparse'
|
31
32
|
|
33
|
+
options = {}
|
32
34
|
|
33
|
-
|
35
|
+
banner = '-'*50 + "\n" +
|
36
|
+
'| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
|
37
|
+
'-'*50 + "\n"
|
34
38
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
n = 2
|
41
|
-
else
|
42
|
-
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
43
|
-
end
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = banner + "Usage: tcs -j"
|
41
|
+
opts.on "-j", "--json_generator", "Command line interfac to generate new params json file" do |j|
|
42
|
+
options[:json_generator] = true
|
43
|
+
end
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
50
|
-
end
|
45
|
+
opts.on("-p", "--params PARAMS_JSON", "Execute the pipeline with input params json file") do |p|
|
46
|
+
options[:params_json] = p
|
47
|
+
end
|
51
48
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
elsif m <= 8500
|
56
|
-
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
57
|
-
else
|
58
|
-
n = 0.0079 * m + 9.4869
|
59
|
-
end
|
49
|
+
opts.on("-h", "--help", "Prints this help") do
|
50
|
+
puts opts
|
51
|
+
exit
|
60
52
|
end
|
61
53
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
54
|
+
opts.on("-v", "--version", "Version info") do
|
55
|
+
puts "tcs version: " + ViralSeq::TCS_VERSION.red.bold
|
56
|
+
puts "viral_seq version: " + ViralSeq::VERSION.red.bold
|
57
|
+
exit
|
58
|
+
end
|
66
59
|
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
# opts.on("--no-parallel", "toggle off parallel processing") do
|
61
|
+
# options[:no_parallel] = true
|
62
|
+
# end
|
63
|
+
end.parse!
|
70
64
|
|
71
|
-
|
72
|
-
|
65
|
+
if options[:json_generator]
|
66
|
+
params = ViralSeq::TcsJson.generate
|
67
|
+
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
68
|
+
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
69
|
+
else
|
70
|
+
abort "No params JSON file found. Script terminated.".red
|
73
71
|
end
|
74
72
|
|
75
|
-
params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
|
76
|
-
|
77
73
|
indir = params[:raw_sequence_dir]
|
78
74
|
|
79
75
|
unless File.exist?(indir)
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
libname = File.basename(indir)
|
84
|
-
|
85
|
-
# obtain R1 and R2 file path
|
86
|
-
files = []
|
87
|
-
Dir.chdir(indir) do
|
88
|
-
files = Dir.glob("*")
|
89
|
-
end
|
90
|
-
|
91
|
-
if files.empty?
|
92
|
-
raise "Input dir does not contain files. Script terminated."
|
76
|
+
abort "No input sequence directory found. Script terminated.".red.bold
|
93
77
|
end
|
94
78
|
|
95
|
-
|
96
|
-
r2_f = ""
|
79
|
+
# log file
|
97
80
|
|
98
|
-
# unzip .fasta.gz
|
99
|
-
def unzip_r(indir, f)
|
100
|
-
r_file = indir + "/" + f
|
101
|
-
if f =~ /.gz/
|
102
|
-
`gzip -d #{r_file}`
|
103
|
-
new_f = f.sub ".gz", ""
|
104
|
-
r_file = File.join(indir, new_f)
|
105
|
-
end
|
106
|
-
return r_file
|
107
|
-
end
|
108
81
|
runtime_log_file = File.join(indir,"runtime.log")
|
109
82
|
log = File.open(runtime_log_file, "w")
|
110
83
|
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
111
84
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
112
85
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
113
86
|
|
87
|
+
libname = File.basename indir
|
114
88
|
|
115
|
-
|
116
|
-
t = f.split("_")
|
117
|
-
if t.size == 1
|
118
|
-
tag = f
|
119
|
-
else
|
120
|
-
tag = f.split("_")[1..-1].join("_")
|
121
|
-
end
|
122
|
-
|
123
|
-
if tag =~ /r1/i
|
124
|
-
r1_f = unzip_r(indir, f)
|
125
|
-
elsif tag =~ /r2/i
|
126
|
-
r2_f = unzip_r(indir, f)
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
89
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
130
90
|
|
131
|
-
|
132
|
-
|
133
|
-
|
91
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
92
|
+
r1_f = seq_files[:r1_file]
|
93
|
+
r2_f = seq_files[:r2_file]
|
94
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
95
|
+
exit_sig = "Missing R2 file. Aborted."
|
96
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
97
|
+
exit_sig = "Missing R1 file. Aborted."
|
98
|
+
else
|
99
|
+
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
134
100
|
end
|
135
101
|
|
136
|
-
|
137
|
-
|
138
|
-
raise "R2 file not found. Script terminated."
|
102
|
+
if exit_sig
|
103
|
+
ViralSeq::TcsCore.log_and_abort log, exit_sig
|
139
104
|
end
|
140
105
|
|
141
106
|
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
@@ -152,10 +117,10 @@ end
|
|
152
117
|
|
153
118
|
primers = params[:primer_pairs]
|
154
119
|
if primers.empty?
|
155
|
-
|
156
|
-
raise "No primer information. Script terminated."
|
120
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
157
121
|
end
|
158
122
|
|
123
|
+
|
159
124
|
primers.each do |primer|
|
160
125
|
summary_json = {}
|
161
126
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -179,66 +144,25 @@ primers.each do |primer|
|
|
179
144
|
summary_json[:cdan_primer] = cdna_primer
|
180
145
|
summary_json[:forward_primer] = forward_primer
|
181
146
|
|
182
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
147
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
183
148
|
summary_json[:majority_cut_off] = majority_cut_off
|
184
149
|
|
185
150
|
summary_json[:total_raw_sequence] = raw_sequence_number
|
186
151
|
|
187
152
|
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
188
153
|
|
189
|
-
|
190
|
-
r2_raw = r2_fastq_sh.dna_hash
|
191
|
-
|
154
|
+
# filter R1
|
192
155
|
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
193
|
-
|
194
|
-
|
195
|
-
forward_n = $1.size
|
196
|
-
forward_bio_primer = $2
|
197
|
-
else
|
198
|
-
forward_n = 0
|
199
|
-
forward_bio_primer = forward_primer
|
200
|
-
end
|
201
|
-
forward_bio_primer_size = forward_bio_primer.size
|
202
|
-
forward_starting_number = forward_n + forward_bio_primer_size
|
203
|
-
|
204
|
-
# filter R1 sequences with forward primers.
|
205
|
-
forward_primer_ref = forward_bio_primer.nt_parser
|
206
|
-
r1_passed_seq = {}
|
207
|
-
r1_raw.each do |name,seq|
|
208
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
209
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
210
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
211
|
-
|
212
|
-
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
213
|
-
if primer_region_seq =~ forward_primer_ref
|
214
|
-
r1_passed_seq[name.split("\s")[0]] = seq
|
215
|
-
end
|
216
|
-
end
|
156
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
157
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
217
158
|
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
218
|
-
|
219
159
|
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
220
160
|
|
161
|
+
# filter R2
|
221
162
|
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
222
|
-
|
223
|
-
|
224
|
-
pid_length =
|
225
|
-
cdna_bio_primer = $2
|
226
|
-
cdna_bio_primer_size = cdna_bio_primer.size
|
227
|
-
reverse_starting_number = pid_length + cdna_bio_primer_size
|
228
|
-
|
229
|
-
# filter R2 sequences with cDNA primers.
|
230
|
-
cdna_primer_ref = cdna_bio_primer.nt_parser
|
231
|
-
r2_passed_seq = {}
|
232
|
-
r2_raw.each do |name, seq|
|
233
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
234
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
235
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
236
|
-
|
237
|
-
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
238
|
-
if primer_region_seq =~ cdna_primer_ref
|
239
|
-
r2_passed_seq[name.split("\s")[0]] = seq
|
240
|
-
end
|
241
|
-
end
|
163
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
164
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
165
|
+
pid_length = filter_r2[:pid_length]
|
242
166
|
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
243
167
|
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
244
168
|
|
@@ -257,8 +181,8 @@ primers.each do |primer|
|
|
257
181
|
r2_seq = r2_passed_seq[seqtag]
|
258
182
|
pid = r2_seq[0, pid_length]
|
259
183
|
id[seqtag] = pid
|
260
|
-
bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
|
261
|
-
bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
|
184
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
185
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
262
186
|
end
|
263
187
|
|
264
188
|
# TCS cut-off
|
@@ -278,11 +202,10 @@ primers.each do |primer|
|
|
278
202
|
end
|
279
203
|
|
280
204
|
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
281
|
-
consensus_cutoff = calculate_cut_off(max_id,error_rate)
|
205
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
282
206
|
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
283
207
|
summary_json[:consensus_cutoff] = consensus_cutoff
|
284
208
|
summary_json[:length_of_pid] = pid_length
|
285
|
-
|
286
209
|
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
287
210
|
|
288
211
|
# Primer ID over the cut-off
|
@@ -355,6 +278,8 @@ primers.each do |primer|
|
|
355
278
|
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
356
279
|
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
357
280
|
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
281
|
+
|
282
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
358
283
|
next if r1_consensus =~ /[^ATCG]/
|
359
284
|
next if r2_consensus =~ /[^ATCG]/
|
360
285
|
|
@@ -392,8 +317,12 @@ primers.each do |primer|
|
|
392
317
|
f1 = File.open(outfile_r1, 'w')
|
393
318
|
f2 = File.open(outfile_r2, 'w')
|
394
319
|
primer_id_in_use = {}
|
395
|
-
|
396
|
-
|
320
|
+
if n_con > 0
|
321
|
+
r1_seq_length = consensus_filtered.values[0][0].size
|
322
|
+
r2_seq_length = consensus_filtered.values[0][1].size
|
323
|
+
else
|
324
|
+
next
|
325
|
+
end
|
397
326
|
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
398
327
|
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
399
328
|
consensus_filtered.each do |seq_name,seq|
|
@@ -404,6 +333,7 @@ primers.each do |primer|
|
|
404
333
|
f1.close
|
405
334
|
f2.close
|
406
335
|
|
336
|
+
# Primer ID distribution in .json file
|
407
337
|
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
408
338
|
pid_json = {}
|
409
339
|
pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
|
@@ -413,11 +343,14 @@ primers.each do |primer|
|
|
413
343
|
f.puts JSON.pretty_generate(pid_json)
|
414
344
|
end
|
415
345
|
|
346
|
+
# start end-join
|
416
347
|
def end_join(dir, option, overlap)
|
417
348
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
418
349
|
case option
|
419
350
|
when 1
|
420
351
|
joined_sh = shp.join1()
|
352
|
+
when 2
|
353
|
+
joined_sh = shp.join1(overlap)
|
421
354
|
when 3
|
422
355
|
joined_sh = shp.join2
|
423
356
|
when 4
|
@@ -489,9 +422,10 @@ primers.each do |primer|
|
|
489
422
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
490
423
|
|
491
424
|
if export_raw
|
492
|
-
joined_sh_raw =
|
425
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
493
426
|
end
|
494
427
|
end
|
428
|
+
|
495
429
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
496
430
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
497
431
|
if primer[:trim]
|
@@ -499,10 +433,11 @@ primers.each do |primer|
|
|
499
433
|
trim_end = primer[:trim_ref_end]
|
500
434
|
trim_ref = primer[:trim_ref].to_sym
|
501
435
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
436
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
437
|
+
if export_raw
|
438
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
439
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
440
|
+
end
|
506
441
|
end
|
507
442
|
end
|
508
443
|
|