viral_seq 1.0.8 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +120 -57
- data/bin/tcs +140 -214
- data/lib/viral_seq.rb +3 -0
- data/lib/viral_seq/constant.rb +5 -1
- data/lib/viral_seq/enumerable.rb +0 -10
- data/lib/viral_seq/hivdr.rb +1 -1
- data/lib/viral_seq/math.rb +3 -3
- data/lib/viral_seq/sdrm.rb +43 -0
- data/lib/viral_seq/seq_hash.rb +38 -24
- data/lib/viral_seq/seq_hash_pair.rb +6 -0
- data/lib/viral_seq/tcs_core.rb +305 -0
- data/lib/viral_seq/tcs_json.rb +178 -0
- data/lib/viral_seq/version.rb +2 -2
- data/viral_seq.gemspec +1 -1
- metadata +8 -7
- data/bin/tcs_json_generator +0 -170
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7816fd2b8da8109a24a33b8663e5f4fa5f098ed590c7403f909b593ebdd78c2f
|
4
|
+
data.tar.gz: adaffa3e35268eaed0bb2d0c5a6ba387f8b09bc04561a3714f9e155a55466cd5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5bfbb3c2e78ae8ef01b1750b5135a76b7fdf65ecc00ccfe141e488154adfc9b0ddff42a58ee9f682f46576060632d12cd6e540bea26e81d8bd9e346f5e7bca84
|
7
|
+
data.tar.gz: 3b491d3070f2e7aacc73c1c9f4942fe770f2c0e4eebe5c021e5558ce7fa0e4299c44ddde51b07b4c3118d7832236be9703707209ea0ebfb6591a576871ad0804
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.0.
|
4
|
+
viral_seq (1.0.13)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
muscle_bio (~> 0.4)
|
7
7
|
|
@@ -11,7 +11,7 @@ GEM
|
|
11
11
|
colorize (0.8.1)
|
12
12
|
diff-lcs (1.3)
|
13
13
|
muscle_bio (0.4.0)
|
14
|
-
rake (
|
14
|
+
rake (13.0.1)
|
15
15
|
rspec (3.8.0)
|
16
16
|
rspec-core (~> 3.8.0)
|
17
17
|
rspec-expectations (~> 3.8.0)
|
@@ -31,7 +31,7 @@ PLATFORMS
|
|
31
31
|
|
32
32
|
DEPENDENCIES
|
33
33
|
bundler (~> 2.0)
|
34
|
-
rake (~>
|
34
|
+
rake (~> 13.0)
|
35
35
|
rspec (~> 3.0)
|
36
36
|
viral_seq!
|
37
37
|
|
data/README.md
CHANGED
@@ -4,109 +4,172 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
4
4
|
|
5
5
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
6
6
|
|
7
|
-
##
|
7
|
+
## Install
|
8
8
|
|
9
|
+
```bash
|
9
10
|
$ gem install viral_seq
|
11
|
+
```
|
10
12
|
|
11
13
|
## Usage
|
12
14
|
|
13
|
-
|
15
|
+
### Executables
|
14
16
|
|
15
|
-
|
16
|
-
require 'viral_seq'
|
17
|
-
|
18
|
-
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
17
|
+
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
19
18
|
|
19
|
+
```bash
|
20
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
|
+
```
|
22
|
+
|
23
|
+
Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
|
24
|
+
|
25
|
+
```bash
|
26
|
+
$ tcs -p params.json # run TCS pipeline with params.json
|
27
|
+
$ tcs -j # CLI to generate params.json
|
28
|
+
$ tcs -h # print out the help
|
29
|
+
```
|
21
30
|
|
22
31
|
## Some Examples
|
23
32
|
|
24
|
-
|
33
|
+
Load all ViralSeq classes by requiring 'viral_seq.rb' in your Ruby scripts.
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
#!/usr/bin/env ruby
|
37
|
+
require 'viral_seq'
|
38
|
+
```
|
39
|
+
|
40
|
+
Load nucleotide sequences from a FASTA format sequence file
|
25
41
|
|
26
|
-
|
42
|
+
```ruby
|
43
|
+
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
44
|
+
```
|
27
45
|
|
28
|
-
|
46
|
+
Make an alignment (using MUSCLE)
|
29
47
|
|
30
|
-
|
48
|
+
```ruby
|
49
|
+
aligned_seqhash = my_seqhash.align
|
50
|
+
```
|
31
51
|
|
32
|
-
|
52
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
33
53
|
|
34
|
-
|
54
|
+
```ruby
|
55
|
+
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
56
|
+
```
|
35
57
|
|
36
|
-
|
58
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
37
59
|
|
38
|
-
|
60
|
+
```ruby
|
61
|
+
qc_seqhash = qc_seqhash.a3g
|
62
|
+
```
|
39
63
|
|
40
|
-
|
64
|
+
Calculate nucleotide diversity π
|
41
65
|
|
42
|
-
|
66
|
+
```ruby
|
67
|
+
qc_seqhash.pi
|
68
|
+
```
|
43
69
|
|
44
|
-
|
70
|
+
Calculate cut-off for minority variants based on Poisson model
|
45
71
|
|
46
|
-
|
72
|
+
```ruby
|
73
|
+
cut_off = qc_seqhash.pm
|
74
|
+
```
|
47
75
|
|
48
|
-
|
76
|
+
Examine for drug resistance mutations for HIV PR region
|
49
77
|
|
50
|
-
|
78
|
+
```ruby
|
79
|
+
qc_seqhash.sdrm_hiv_pr(cut_off)
|
80
|
+
```
|
81
|
+
## Known issues
|
82
|
+
|
83
|
+
1. ~~have a conflict with rails.~~
|
84
|
+
2. ~~Update on 03032021. Still have conflict. But in rails gem file, can just use `requires: false` globally and only require "viral_seq" when the module is needed in controller.~~
|
85
|
+
3. The conflict seems to be resolved. It was from a combination of using `!` as a function for factorial and the gem name `viral_seq`. @_@
|
51
86
|
|
52
87
|
## Updates
|
53
88
|
|
54
|
-
Version 1.
|
89
|
+
### Version 1.1.3-03032021
|
90
|
+
|
91
|
+
1. Fixed the conflict with rails.
|
92
|
+
|
93
|
+
### Version 1.1.2-03032021
|
94
|
+
|
95
|
+
1. Fixed an issue that may cause conflicts with ActiveRecord.
|
96
|
+
|
97
|
+
### Version 1.1.1-03022021
|
98
|
+
|
99
|
+
1. Fixed an issue when calculating Poisson cutoff for minority mutations `ViralSeq::SeqHash.pm`.
|
100
|
+
2. Fixed an issue loading class 'OptionParser' in some Ruby environments.
|
101
|
+
|
102
|
+
### Version 1.1.0-11112020:
|
103
|
+
|
104
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
105
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
106
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
107
|
+
4. a few optimizations.
|
108
|
+
5. TCS 2.1.0 delivered.
|
109
|
+
6. Tried parallel processing. Cannot achieve the goal because `parallel` gem by default can't pool data from memory of child processes and `in_threads` does not help with the speed.
|
110
|
+
|
111
|
+
### Version 1.0.9-07182020:
|
112
|
+
|
113
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
114
|
+
|
115
|
+
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
116
|
+
|
117
|
+
### Version 1.0.8-02282020:
|
55
118
|
|
56
|
-
|
57
|
-
|
58
|
-
|
119
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
120
|
+
tcs - main TCS pipeline script.
|
121
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
59
122
|
|
60
|
-
|
61
|
-
|
123
|
+
2. Methods added:
|
124
|
+
ViralSeq::SeqHash#trim
|
62
125
|
|
63
|
-
|
126
|
+
3. Bug fix for several methods.
|
64
127
|
|
65
|
-
Version 1.0.7-01282020:
|
128
|
+
### Version 1.0.7-01282020:
|
66
129
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
130
|
+
1. Several methods added, including
|
131
|
+
ViralSeq::SeqHash#error_table
|
132
|
+
ViralSeq::SeqHash#random_select
|
133
|
+
2. Improved performance for several functions.
|
71
134
|
|
72
|
-
Version 1.0.6-07232019:
|
135
|
+
### Version 1.0.6-07232019:
|
73
136
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
137
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
138
|
+
ViralSeq::SeqHash#size
|
139
|
+
ViralSeq::SeqHash#+
|
140
|
+
ViralSeq::SeqHash#write_nt_fa
|
141
|
+
ViralSeq::SeqHash#mutation
|
142
|
+
2. Update documentations and rspec samples.
|
80
143
|
|
81
|
-
Version 1.0.5-07112019:
|
144
|
+
### Version 1.0.5-07112019:
|
82
145
|
|
83
|
-
|
84
|
-
|
85
|
-
|
146
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
147
|
+
Program will try to determine the direction (`+` or `-` of the query sequence)
|
148
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
86
149
|
|
87
|
-
Version 1.0.4-07102019:
|
150
|
+
### Version 1.0.4-07102019:
|
88
151
|
|
89
|
-
|
90
|
-
|
152
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
153
|
+
2. Fix bugs in bin `locator`
|
91
154
|
|
92
|
-
Version 1.0.3-07102019:
|
155
|
+
### Version 1.0.3-07102019:
|
93
156
|
|
94
|
-
|
157
|
+
1. Bug fix.
|
95
158
|
|
96
|
-
Version 1.0.2-07102019:
|
159
|
+
### Version 1.0.2-07102019:
|
97
160
|
|
98
|
-
|
161
|
+
1. Fixed a gem loading issue.
|
99
162
|
|
100
|
-
Version 1.0.1-07102019:
|
163
|
+
### Version 1.0.1-07102019:
|
101
164
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
165
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
166
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
167
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
168
|
+
4. update documentations
|
106
169
|
|
107
|
-
Version 1.0.0-07092019:
|
170
|
+
### Version 1.0.0-07092019:
|
108
171
|
|
109
|
-
|
172
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
110
173
|
|
111
174
|
## Development
|
112
175
|
|
data/bin/tcs
CHANGED
@@ -28,180 +28,79 @@
|
|
28
28
|
require 'viral_seq'
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
|
+
require 'optparse'
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
module ViralSeq
|
35
|
-
class SeqHash
|
36
|
-
def self.new_from_fastq(fastq_file)
|
37
|
-
count = 0
|
38
|
-
sequence_a = []
|
39
|
-
quality_a = []
|
40
|
-
count_seq = 0
|
41
|
-
|
42
|
-
File.open(fastq_file,'r') do |file|
|
43
|
-
file.readlines.collect do |line|
|
44
|
-
count +=1
|
45
|
-
count_m = count % 4
|
46
|
-
if count_m == 1
|
47
|
-
line.tr!('@','>')
|
48
|
-
sequence_a << line.chomp
|
49
|
-
quality_a << line.chomp
|
50
|
-
count_seq += 1
|
51
|
-
elsif count_m == 2
|
52
|
-
sequence_a << line.chomp
|
53
|
-
elsif count_m == 0
|
54
|
-
quality_a << line.chomp
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
59
|
-
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
60
|
-
|
61
|
-
seq_hash = ViralSeq::SeqHash.new
|
62
|
-
seq_hash.dna_hash = sequence_hash
|
63
|
-
seq_hash.qc_hash = quality_hash
|
64
|
-
seq_hash.title = File.basename(fastq_file,".*")
|
65
|
-
seq_hash.file = fastq_file
|
66
|
-
return seq_hash
|
67
|
-
end # end of ::new_from_fastq
|
68
|
-
|
69
|
-
class << self
|
70
|
-
alias_method :fq, :new_from_fastq
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
module ViralSeq
|
76
|
-
class SeqHash
|
77
|
-
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
78
|
-
seq_hash = self.dna_hash.dup
|
79
|
-
seq_hash_unique = seq_hash.uniq_hash
|
80
|
-
trimmed_seq_hash = {}
|
81
|
-
seq_hash_unique.each do |seq, names|
|
82
|
-
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
83
|
-
names.each do |name|
|
84
|
-
trimmed_seq_hash[name] = trimmed_seq
|
85
|
-
end
|
86
|
-
end
|
87
|
-
return_seq_hash = self.dup
|
88
|
-
return_seq_hash.dna_hash = trimmed_seq_hash
|
89
|
-
return return_seq_hash
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
# end of additonal methods. Delete before publish
|
95
|
-
|
96
|
-
# calculate consensus cutoff
|
97
|
-
|
98
|
-
def calculate_cut_off(m, error_rate = 0.02)
|
99
|
-
n = 0
|
100
|
-
case error_rate
|
101
|
-
when 0.005...0.015
|
102
|
-
if m <= 10
|
103
|
-
n = 2
|
104
|
-
else
|
105
|
-
n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
|
106
|
-
end
|
33
|
+
options = {}
|
107
34
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
else
|
112
|
-
n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
|
113
|
-
end
|
35
|
+
banner = '-'*50 + "\n" +
|
36
|
+
'| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
|
37
|
+
'-'*50 + "\n"
|
114
38
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
|
120
|
-
else
|
121
|
-
n = 0.0079 * m + 9.4869
|
122
|
-
end
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = banner + "Usage: tcs -j"
|
41
|
+
opts.on "-j", "--json_generator", "Command line interfac to generate new params json file" do |j|
|
42
|
+
options[:json_generator] = true
|
123
43
|
end
|
124
44
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
end
|
45
|
+
opts.on("-p", "--params PARAMS_JSON", "Execute the pipeline with input params json file") do |p|
|
46
|
+
options[:params_json] = p
|
47
|
+
end
|
129
48
|
|
49
|
+
opts.on("-h", "--help", "Prints this help") do
|
50
|
+
puts opts
|
51
|
+
exit
|
52
|
+
end
|
130
53
|
|
131
|
-
|
54
|
+
opts.on("-v", "--version", "Version info") do
|
55
|
+
puts "tcs version: " + ViralSeq::TCS_VERSION.red.bold
|
56
|
+
puts "viral_seq version: " + ViralSeq::VERSION.red.bold
|
57
|
+
exit
|
58
|
+
end
|
132
59
|
|
133
|
-
|
134
|
-
|
135
|
-
|
60
|
+
# opts.on("--no-parallel", "toggle off parallel processing") do
|
61
|
+
# options[:no_parallel] = true
|
62
|
+
# end
|
63
|
+
end.parse!
|
136
64
|
|
137
|
-
|
138
|
-
|
65
|
+
if options[:json_generator]
|
66
|
+
params = ViralSeq::TcsJson.generate
|
67
|
+
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
68
|
+
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
69
|
+
else
|
70
|
+
abort "No params JSON file found. Script terminated.".red
|
139
71
|
end
|
140
72
|
|
141
|
-
params = JSON.parse(File.read(ARGV[0]), symbolize_names: true)
|
142
|
-
|
143
73
|
indir = params[:raw_sequence_dir]
|
144
74
|
|
145
75
|
unless File.exist?(indir)
|
146
|
-
|
147
|
-
end
|
148
|
-
|
149
|
-
libname = File.basename(indir)
|
150
|
-
|
151
|
-
# obtain R1 and R2 file path
|
152
|
-
files = []
|
153
|
-
Dir.chdir(indir) do
|
154
|
-
files = Dir.glob("*")
|
76
|
+
abort "No input sequence directory found. Script terminated.".red.bold
|
155
77
|
end
|
156
78
|
|
157
|
-
|
158
|
-
raise "Input dir does not contain files. Script terminated."
|
159
|
-
end
|
160
|
-
|
161
|
-
r1_f = ""
|
162
|
-
r2_f = ""
|
79
|
+
# log file
|
163
80
|
|
164
|
-
# unzip .fasta.gz
|
165
|
-
def unzip_r(indir, f)
|
166
|
-
r_file = indir + "/" + f
|
167
|
-
if f =~ /.gz/
|
168
|
-
`gzip -d #{r_file}`
|
169
|
-
new_f = f.sub ".gz", ""
|
170
|
-
r_file = File.join(indir, new_f)
|
171
|
-
end
|
172
|
-
return r_file
|
173
|
-
end
|
174
81
|
runtime_log_file = File.join(indir,"runtime.log")
|
175
82
|
log = File.open(runtime_log_file, "w")
|
176
|
-
log.puts "TSC pipeline Version " + TCS_VERSION.to_s
|
83
|
+
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
177
84
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
178
85
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
179
86
|
|
87
|
+
libname = File.basename indir
|
180
88
|
|
181
|
-
|
182
|
-
t = f.split("_")
|
183
|
-
if t.size == 1
|
184
|
-
tag = f
|
185
|
-
else
|
186
|
-
tag = f.split("_")[1..-1].join("_")
|
187
|
-
end
|
188
|
-
|
189
|
-
if tag =~ /r1/i
|
190
|
-
r1_f = unzip_r(indir, f)
|
191
|
-
elsif tag =~ /r2/i
|
192
|
-
r2_f = unzip_r(indir, f)
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
89
|
+
seq_files = ViralSeq::TcsCore.r1r2 indir
|
196
90
|
|
197
|
-
|
198
|
-
|
199
|
-
|
91
|
+
if seq_files[:r1_file].size > 0 and seq_files[:r2_file].size > 0
|
92
|
+
r1_f = seq_files[:r1_file]
|
93
|
+
r2_f = seq_files[:r2_file]
|
94
|
+
elsif seq_files[:r1_file].size > 0 and seq_files[:r2_file].empty?
|
95
|
+
exit_sig = "Missing R2 file. Aborted."
|
96
|
+
elsif seq_files[:r2_file].size > 0 and seq_files[:r1_file].empty?
|
97
|
+
exit_sig = "Missing R1 file. Aborted."
|
98
|
+
else
|
99
|
+
exit_sig = "Cannot determine R1 R2 file in #{indir}. Aborted."
|
200
100
|
end
|
201
101
|
|
202
|
-
|
203
|
-
|
204
|
-
raise "R2 file not found. Script terminated."
|
102
|
+
if exit_sig
|
103
|
+
ViralSeq::TcsCore.log_and_abort log, exit_sig
|
205
104
|
end
|
206
105
|
|
207
106
|
r1_fastq_sh = ViralSeq::SeqHash.fq(r1_f)
|
@@ -218,13 +117,13 @@ end
|
|
218
117
|
|
219
118
|
primers = params[:primer_pairs]
|
220
119
|
if primers.empty?
|
221
|
-
|
222
|
-
raise "No primer information. Script terminated."
|
120
|
+
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
223
121
|
end
|
224
122
|
|
123
|
+
|
225
124
|
primers.each do |primer|
|
226
125
|
summary_json = {}
|
227
|
-
summary_json[:tcs_version] = TCS_VERSION
|
126
|
+
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
228
127
|
summary_json[:viralseq_version] = ViralSeq::VERSION
|
229
128
|
summary_json[:runtime] = Time.now.to_s
|
230
129
|
|
@@ -233,6 +132,9 @@ primers.each do |primer|
|
|
233
132
|
|
234
133
|
cdna_primer = primer[:cdna]
|
235
134
|
forward_primer = primer[:forward]
|
135
|
+
|
136
|
+
export_raw = primer[:export_raw]
|
137
|
+
|
236
138
|
unless cdna_primer
|
237
139
|
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
238
140
|
end
|
@@ -242,66 +144,25 @@ primers.each do |primer|
|
|
242
144
|
summary_json[:cdan_primer] = cdna_primer
|
243
145
|
summary_json[:forward_primer] = forward_primer
|
244
146
|
|
245
|
-
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
147
|
+
primer[:majority] ? majority_cut_off = primer[:majority] : majority_cut_off = 0
|
246
148
|
summary_json[:majority_cut_off] = majority_cut_off
|
247
149
|
|
248
150
|
summary_json[:total_raw_sequence] = raw_sequence_number
|
249
151
|
|
250
152
|
log.puts Time.now.to_s + "\t" + "Porcessing #{region}..."
|
251
153
|
|
252
|
-
|
253
|
-
r2_raw = r2_fastq_sh.dna_hash
|
254
|
-
|
154
|
+
# filter R1
|
255
155
|
log.puts Time.now.to_s + "\t" + "filtering R1..."
|
256
|
-
|
257
|
-
|
258
|
-
forward_n = $1.size
|
259
|
-
forward_bio_primer = $2
|
260
|
-
else
|
261
|
-
forward_n = 0
|
262
|
-
forward_bio_primer = forward_primer
|
263
|
-
end
|
264
|
-
forward_bio_primer_size = forward_bio_primer.size
|
265
|
-
forward_starting_number = forward_n + forward_bio_primer_size
|
266
|
-
|
267
|
-
# filter R1 sequences with forward primers.
|
268
|
-
forward_primer_ref = forward_bio_primer.nt_parser
|
269
|
-
r1_passed_seq = {}
|
270
|
-
r1_raw.each do |name,seq|
|
271
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
272
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
273
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
274
|
-
|
275
|
-
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
276
|
-
if primer_region_seq =~ forward_primer_ref
|
277
|
-
r1_passed_seq[name.split("\s")[0]] = seq
|
278
|
-
end
|
279
|
-
end
|
156
|
+
filter_r1 = ViralSeq::TcsCore.filter_r1(r1_fastq_sh, forward_primer)
|
157
|
+
r1_passed_seq = filter_r1[:r1_passed_seq]
|
280
158
|
log.puts Time.now.to_s + "\t" + "R1 filtered: #{r1_passed_seq.size.to_s}"
|
281
|
-
|
282
159
|
summary_json[:r1_filtered_raw] = r1_passed_seq.size
|
283
160
|
|
161
|
+
# filter R2
|
284
162
|
log.puts Time.now.to_s + "\t" + "filtering R2..."
|
285
|
-
|
286
|
-
|
287
|
-
pid_length =
|
288
|
-
cdna_bio_primer = $2
|
289
|
-
cdna_bio_primer_size = cdna_bio_primer.size
|
290
|
-
reverse_starting_number = pid_length + cdna_bio_primer_size
|
291
|
-
|
292
|
-
# filter R2 sequences with cDNA primers.
|
293
|
-
cdna_primer_ref = cdna_bio_primer.nt_parser
|
294
|
-
r2_passed_seq = {}
|
295
|
-
r2_raw.each do |name, seq|
|
296
|
-
next if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
297
|
-
next if seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
298
|
-
next if seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
299
|
-
|
300
|
-
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
301
|
-
if primer_region_seq =~ cdna_primer_ref
|
302
|
-
r2_passed_seq[name.split("\s")[0]] = seq
|
303
|
-
end
|
304
|
-
end
|
163
|
+
filter_r2 = ViralSeq::TcsCore.filter_r2(r2_fastq_sh, cdna_primer)
|
164
|
+
r2_passed_seq = filter_r2[:r2_passed_seq]
|
165
|
+
pid_length = filter_r2[:pid_length]
|
305
166
|
log.puts Time.now.to_s + "\t" + "R2 filtered: #{r2_passed_seq.size.to_s}"
|
306
167
|
summary_json[:r2_filtered_raw] = r2_passed_seq.size
|
307
168
|
|
@@ -320,8 +181,8 @@ primers.each do |primer|
|
|
320
181
|
r2_seq = r2_passed_seq[seqtag]
|
321
182
|
pid = r2_seq[0, pid_length]
|
322
183
|
id[seqtag] = pid
|
323
|
-
bio_r2[seqtag] = r2_seq[reverse_starting_number..-2]
|
324
|
-
bio_r1[seqtag] = r1_seq[forward_starting_number..-2]
|
184
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
|
185
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
|
325
186
|
end
|
326
187
|
|
327
188
|
# TCS cut-off
|
@@ -341,11 +202,10 @@ primers.each do |primer|
|
|
341
202
|
end
|
342
203
|
|
343
204
|
max_id = primer_id_dis.keys.sort[-5..-1].mean
|
344
|
-
consensus_cutoff = calculate_cut_off(max_id,error_rate)
|
205
|
+
consensus_cutoff = ViralSeq::TcsCore.calculate_cut_off(max_id,error_rate)
|
345
206
|
log.puts Time.now.to_s + "\t" + "Consensus cut-off is #{consensus_cutoff.to_s}"
|
346
207
|
summary_json[:consensus_cutoff] = consensus_cutoff
|
347
208
|
summary_json[:length_of_pid] = pid_length
|
348
|
-
|
349
209
|
log.puts Time.now.to_s + "\t" + "Creating consensus..."
|
350
210
|
|
351
211
|
# Primer ID over the cut-off
|
@@ -363,10 +223,30 @@ primers.each do |primer|
|
|
363
223
|
out_dir_consensus = File.join(out_dir_set, "consensus")
|
364
224
|
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
365
225
|
|
366
|
-
outfile_r1 = File.join(out_dir_consensus, 'r1.
|
367
|
-
outfile_r2 = File.join(out_dir_consensus, 'r2.
|
226
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
227
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
368
228
|
outfile_log = File.join(out_dir_set, 'log.json')
|
369
229
|
|
230
|
+
# if export_raw is true, create dir for raw sequence
|
231
|
+
if export_raw
|
232
|
+
out_dir_raw = File.join(out_dir_set, "raw")
|
233
|
+
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
234
|
+
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
235
|
+
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
236
|
+
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
237
|
+
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
238
|
+
|
239
|
+
bio_r1.keys.each do |k|
|
240
|
+
raw_r1_f.puts k + "_r1"
|
241
|
+
raw_r2_f.puts k + "_r2"
|
242
|
+
raw_r1_f.puts bio_r1[k]
|
243
|
+
raw_r2_f.puts bio_r2[k].rc
|
244
|
+
end
|
245
|
+
|
246
|
+
raw_r1_f.close
|
247
|
+
raw_r2_f.close
|
248
|
+
end
|
249
|
+
|
370
250
|
# create TCS
|
371
251
|
|
372
252
|
pid_seqtag_hash = {}
|
@@ -398,6 +278,8 @@ primers.each do |primer|
|
|
398
278
|
consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
|
399
279
|
r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
|
400
280
|
r2_consensus = ViralSeq::SeqHash.array(r2_sub_seq).consensus(majority_cut_off)
|
281
|
+
|
282
|
+
# hide the following two lines if allowing sequence to have ambiguities.
|
401
283
|
next if r1_consensus =~ /[^ATCG]/
|
402
284
|
next if r2_consensus =~ /[^ATCG]/
|
403
285
|
|
@@ -435,8 +317,12 @@ primers.each do |primer|
|
|
435
317
|
f1 = File.open(outfile_r1, 'w')
|
436
318
|
f2 = File.open(outfile_r2, 'w')
|
437
319
|
primer_id_in_use = {}
|
438
|
-
|
439
|
-
|
320
|
+
if n_con > 0
|
321
|
+
r1_seq_length = consensus_filtered.values[0][0].size
|
322
|
+
r2_seq_length = consensus_filtered.values[0][1].size
|
323
|
+
else
|
324
|
+
next
|
325
|
+
end
|
440
326
|
log.puts Time.now.to_s + "\t" + "R1 sequence #{r1_seq_length} bp"
|
441
327
|
log.puts Time.now.to_s + "\t" + "R1 sequence #{r2_seq_length} bp"
|
442
328
|
consensus_filtered.each do |seq_name,seq|
|
@@ -447,6 +333,7 @@ primers.each do |primer|
|
|
447
333
|
f1.close
|
448
334
|
f2.close
|
449
335
|
|
336
|
+
# Primer ID distribution in .json file
|
450
337
|
out_pid_json = File.join(out_dir_set, 'primer_id.json')
|
451
338
|
pid_json = {}
|
452
339
|
pid_json[:primer_id_in_use] = Hash[*(primer_id_in_use.sort_by {|k, v| [-v,k]}.flatten)]
|
@@ -456,19 +343,33 @@ primers.each do |primer|
|
|
456
343
|
f.puts JSON.pretty_generate(pid_json)
|
457
344
|
end
|
458
345
|
|
459
|
-
|
460
|
-
|
461
|
-
shp = ViralSeq::SeqHashPair.fa(
|
462
|
-
case
|
346
|
+
# start end-join
|
347
|
+
def end_join(dir, option, overlap)
|
348
|
+
shp = ViralSeq::SeqHashPair.fa(dir)
|
349
|
+
case option
|
463
350
|
when 1
|
464
|
-
joined_sh = shp.join1(
|
351
|
+
joined_sh = shp.join1()
|
352
|
+
when 2
|
353
|
+
joined_sh = shp.join1(overlap)
|
465
354
|
when 3
|
466
355
|
joined_sh = shp.join2
|
467
356
|
when 4
|
468
357
|
joined_sh = shp.join2(model: :indiv)
|
469
358
|
end
|
359
|
+
return joined_sh
|
360
|
+
end
|
361
|
+
|
362
|
+
if primer[:end_join]
|
363
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
364
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
365
|
+
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
470
366
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
471
367
|
summary_json[:combined_tcs] = joined_sh.size
|
368
|
+
|
369
|
+
if export_raw
|
370
|
+
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
371
|
+
end
|
372
|
+
|
472
373
|
else
|
473
374
|
File.open(outfile_log, "w") do |f|
|
474
375
|
f.puts JSON.pretty_generate(summary_json)
|
@@ -501,9 +402,30 @@ primers.each do |primer|
|
|
501
402
|
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
502
403
|
end
|
503
404
|
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
405
|
+
|
406
|
+
if export_raw
|
407
|
+
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
408
|
+
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
409
|
+
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
410
|
+
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
411
|
+
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
412
|
+
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
413
|
+
joined_seq_raw = {}
|
414
|
+
new_r1_seq_raw.each do |seq_name, seq|
|
415
|
+
next unless seq
|
416
|
+
next unless new_r2_seq_raw[seq_name]
|
417
|
+
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
418
|
+
end
|
419
|
+
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
420
|
+
end
|
504
421
|
else
|
505
422
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
423
|
+
|
424
|
+
if export_raw
|
425
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
426
|
+
end
|
506
427
|
end
|
428
|
+
|
507
429
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
508
430
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
509
431
|
if primer[:trim]
|
@@ -511,8 +433,12 @@ primers.each do |primer|
|
|
511
433
|
trim_end = primer[:trim_ref_end]
|
512
434
|
trim_ref = primer[:trim_ref].to_sym
|
513
435
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
436
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
437
|
+
if export_raw
|
438
|
+
joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
|
439
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
440
|
+
end
|
514
441
|
end
|
515
|
-
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.txt"))
|
516
442
|
end
|
517
443
|
|
518
444
|
File.open(outfile_log, "w") do |f|
|