viral_seq 1.6.4 → 1.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +20 -0
- data/bin/tcs +44 -8
- data/bin/tcs_log +1 -1
- data/lib/viral_seq/seq_hash.rb +11 -8
- data/lib/viral_seq/sequence.rb +5 -5
- data/lib/viral_seq/string.rb +37 -0
- data/lib/viral_seq/tcs_core.rb +4 -4
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -3
- data/rc_swans.svc@longleaf.unc.edu +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e6d55ab37ecd3b9c5688c99772fc49792a5319bac853ac768367a8b42c0e0b6
|
4
|
+
data.tar.gz: a69e78c80f22848facb41ad4f9d9fb64e6d4e47ff6e18afa3421d64513ce6558
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae34ac12bd2b86d4c7fc040765b26b94d41cfe239a206b2e84bf55841988826bcfbf685e788b93224ee78e29b1280454059991d644f81cbf24f1b97fff3f2294
|
7
|
+
data.tar.gz: 254993ea2126ca51d0ad5e2b6be2dca90e1b3ed817266e46b5ca46f91d2a69288c0a87c58906ef3da8ad6465e08788c850d6bc72013e30f1ead13e186ba16dfd
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
|
10
10
|
|
11
11
|
Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
12
12
|
|
13
|
+
CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
|
14
|
+
|
13
15
|
#### tcs web app - https://primer-id.org/
|
14
16
|
|
15
17
|
## Illustration for the Primer ID Sequencing
|
@@ -22,6 +24,12 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
|
|
22
24
|
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
23
25
|
[Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
|
24
26
|
|
27
|
+
## Requirements
|
28
|
+
|
29
|
+
Required Ruby Version: >= 2.5
|
30
|
+
|
31
|
+
Required RubyGems version: >= 1.3.6
|
32
|
+
|
25
33
|
## Install
|
26
34
|
|
27
35
|
```bash
|
@@ -179,6 +187,18 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
187
|
|
180
188
|
## Updates
|
181
189
|
|
190
|
+
### Version-1.7.1-05122023
|
191
|
+
|
192
|
+
1. Add a size check for the raw sequences. If the actual size is smaller than the input params, an error message will be sent to the user. If the actual size is greater than the input params, the extra bases will be truncated.
|
193
|
+
2. Now allows mismatch for the primer region sequences. Forward primer region allows 2 nt differences and cDNA primer region allows 3 nt differences.
|
194
|
+
3. Bug fix.
|
195
|
+
4. TCS version to 2.5.2
|
196
|
+
|
197
|
+
### Version-1.7.0-08242022
|
198
|
+
|
199
|
+
1. Add warnings if the `tcs` pipeline is executed from source instead of being installed from `gem`.
|
200
|
+
2. Optimized the `ViralSeq::SeqHash#a3g` hypermut algorithm. It now allows an external reference other than the sample reference.
|
201
|
+
|
182
202
|
### Version-1.6.4-07182022
|
183
203
|
|
184
204
|
1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.
|
data/bin/tcs
CHANGED
@@ -22,20 +22,38 @@
|
|
22
22
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
23
23
|
# THE SOFTWARE.
|
24
24
|
|
25
|
+
# Install using `gem install viral_seq`
|
25
26
|
# Use JSON file as the run param
|
26
27
|
# run `tcs -j` to generate param json file.
|
27
28
|
|
28
|
-
|
29
|
+
def gem_installed?(gem_name)
|
30
|
+
found_gem = false
|
31
|
+
begin
|
32
|
+
found_gem = Gem::Specification.find_by_name(gem_name)
|
33
|
+
rescue Gem::LoadError
|
34
|
+
return false
|
35
|
+
else
|
36
|
+
return true
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
if gem_installed?('viral_seq')
|
41
|
+
require 'viral_seq'
|
42
|
+
else
|
43
|
+
printf "\n****************************************************\n"
|
44
|
+
printf "**** THIS PACKAGE CANNOT BE RUN FROM SOURCE ********\n"
|
45
|
+
printf "**** PLEASE INSTALL USING `gem install viral_seq` **\n"
|
46
|
+
printf "****************************************************\n\n"
|
47
|
+
exit 1
|
48
|
+
end
|
49
|
+
|
50
|
+
|
29
51
|
require 'json'
|
30
52
|
require 'colorize'
|
31
53
|
require 'optparse'
|
32
54
|
|
33
55
|
options = {}
|
34
56
|
|
35
|
-
# banner = '-'*50 + "\n" +
|
36
|
-
# '| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
|
37
|
-
# '-'*50 + "\n"
|
38
|
-
|
39
57
|
banner = "\n" +
|
40
58
|
"████████ ██████ ███████ ██████ ██ ██████ ███████ ██ ██ ███ ██ ███████\n".light_red +
|
41
59
|
" ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██\n".light_yellow +
|
@@ -86,7 +104,7 @@ end.parse!
|
|
86
104
|
if options[:json_generator]
|
87
105
|
params = ViralSeq::TcsJson.generate
|
88
106
|
elsif options[:dr]
|
89
|
-
params = ViralSeq::TcsDr::PARAMS
|
107
|
+
params = ViralSeq::TcsDr::PARAMS
|
90
108
|
elsif (options[:params_json] && File.exist?(options[:params_json]))
|
91
109
|
params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
|
92
110
|
else
|
@@ -145,6 +163,24 @@ begin
|
|
145
163
|
$platform_sequencing_length = 300
|
146
164
|
end
|
147
165
|
|
166
|
+
r1_raw_size = r1_fastq_sh.dna_hash.values[0].size
|
167
|
+
r2_raw_size = r2_fastq_sh.dna_hash.values[0].size
|
168
|
+
|
169
|
+
if r1_raw_size >= $platform_sequencing_length
|
170
|
+
r1_size_diff = r1_raw_size - $platform_sequencing_length
|
171
|
+
else
|
172
|
+
raise StandardError.new "R1 size smaller than the input platform format #{$platform_sequencing_length} bp."
|
173
|
+
end
|
174
|
+
|
175
|
+
if r2_raw_size >= $platform_sequencing_length
|
176
|
+
r2_size_diff = r2_raw_size - $platform_sequencing_length
|
177
|
+
else
|
178
|
+
raise StandardError.new "R2 size smaller than the input platform format #{$platform_sequencing_length} bp."
|
179
|
+
end
|
180
|
+
|
181
|
+
r1_truncate_base_number = 2 + r1_size_diff
|
182
|
+
r2_truncate_base_number = 2 + r2_size_diff
|
183
|
+
|
148
184
|
primers = params[:primer_pairs]
|
149
185
|
if primers.empty? or primers.nil?
|
150
186
|
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
@@ -217,8 +253,8 @@ begin
|
|
217
253
|
r2_seq = r2_passed_seq[seqtag]
|
218
254
|
pid = r2_seq[0, pid_length]
|
219
255
|
id[seqtag] = pid
|
220
|
-
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-
|
221
|
-
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-
|
256
|
+
bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-r2_truncate_base_number]
|
257
|
+
bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-r1_truncate_base_number]
|
222
258
|
end
|
223
259
|
|
224
260
|
# TCS cut-off
|
data/bin/tcs_log
CHANGED
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -450,7 +450,7 @@ module ViralSeq
|
|
450
450
|
# function to determine if the sequences have APOBEC3g/f hypermutation.
|
451
451
|
# # APOBEC3G/F pattern: GRD -> ARD
|
452
452
|
# # control pattern: G[YN|RC] -> A[YN|RC]
|
453
|
-
# # use the sample consensus to determine potential a3g sites
|
453
|
+
# # use the sample consensus to determine potential a3g sites (default) or provide external reference sequences as a `String`
|
454
454
|
# # Two criteria to identify hypermutation
|
455
455
|
# # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
456
456
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
@@ -486,7 +486,7 @@ module ViralSeq
|
|
486
486
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
487
487
|
# @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
|
488
488
|
|
489
|
-
def a3g_hypermut
|
489
|
+
def a3g_hypermut(ref = nil)
|
490
490
|
# mut_hash number of apobec3g/f mutations per sequence
|
491
491
|
mut_hash = {}
|
492
492
|
hm_hash = {}
|
@@ -495,8 +495,10 @@ module ViralSeq
|
|
495
495
|
# total G->A mutations at apobec3g/f positions.
|
496
496
|
total = 0
|
497
497
|
|
498
|
-
|
499
|
-
|
498
|
+
unless ref
|
499
|
+
# make consensus sequence for the input sequence hash
|
500
|
+
ref = self.consensus
|
501
|
+
end
|
500
502
|
|
501
503
|
# obtain apobec3g positions and control positions
|
502
504
|
apobec = apobec3gf(ref)
|
@@ -509,7 +511,6 @@ module ViralSeq
|
|
509
511
|
c = 0 # control muts
|
510
512
|
d = 0 # potential controls
|
511
513
|
mut.each do |n|
|
512
|
-
next if v[n] == "-"
|
513
514
|
if v[n] == "A"
|
514
515
|
a += 1
|
515
516
|
b += 1
|
@@ -521,7 +522,6 @@ module ViralSeq
|
|
521
522
|
total += a
|
522
523
|
|
523
524
|
control.each do |n|
|
524
|
-
next if v[n] == "-"
|
525
525
|
if v[n] == "A"
|
526
526
|
c += 1
|
527
527
|
d += 1
|
@@ -544,7 +544,7 @@ module ViralSeq
|
|
544
544
|
end
|
545
545
|
end
|
546
546
|
|
547
|
-
if self.dna_hash.size >
|
547
|
+
if self.dna_hash.size > 200
|
548
548
|
rate = total.to_f/(self.dna_hash.size)
|
549
549
|
count_mut = mut_hash.values.count_freq
|
550
550
|
maxi_count = count_mut.values.max
|
@@ -566,10 +566,12 @@ module ViralSeq
|
|
566
566
|
end
|
567
567
|
end
|
568
568
|
end
|
569
|
+
|
569
570
|
hm_seq_hash = ViralSeq::SeqHash.new
|
570
571
|
hm_hash.each do |k,_v|
|
571
572
|
hm_seq_hash.dna_hash[k] = self.dna_hash[k]
|
572
573
|
end
|
574
|
+
|
573
575
|
hm_seq_hash.title = self.title + "_hypermut"
|
574
576
|
hm_seq_hash.file = self.file
|
575
577
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
@@ -1356,7 +1358,7 @@ module ViralSeq
|
|
1356
1358
|
# APOBEC3G/F pattern: GRD -> ARD,
|
1357
1359
|
# control pattern: G[YN|RC] -> A[YN|RC],
|
1358
1360
|
def apobec3gf(seq = '')
|
1359
|
-
seq.tr!("-", "")
|
1361
|
+
#seq.tr!("-", "")
|
1360
1362
|
seq_length = seq.size
|
1361
1363
|
apobec_position = []
|
1362
1364
|
control_position = []
|
@@ -1368,6 +1370,7 @@ module ViralSeq
|
|
1368
1370
|
control_position << n
|
1369
1371
|
end
|
1370
1372
|
end
|
1373
|
+
|
1371
1374
|
return [apobec_position,control_position]
|
1372
1375
|
end # end of #apobec3gf
|
1373
1376
|
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -180,7 +180,7 @@ module ViralSeq
|
|
180
180
|
l1 = 0
|
181
181
|
l2 = 0
|
182
182
|
|
183
|
-
aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :
|
183
|
+
aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :Super5, path_to_muscle)
|
184
184
|
aln_test = aln_seq[1]
|
185
185
|
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
186
186
|
gap_begin = $1.size
|
@@ -214,7 +214,7 @@ module ViralSeq
|
|
214
214
|
l2 = l2 + (post_aln - b2)
|
215
215
|
end
|
216
216
|
|
217
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :
|
217
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
218
218
|
aln_test = aln_seq[1]
|
219
219
|
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
220
220
|
gap_begin = $1.size
|
@@ -263,7 +263,7 @@ module ViralSeq
|
|
263
263
|
end
|
264
264
|
|
265
265
|
while repeat == 1
|
266
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :
|
266
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
267
267
|
aln_test = aln_seq[1]
|
268
268
|
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
269
269
|
gap_begin = $1.size
|
@@ -293,7 +293,7 @@ module ViralSeq
|
|
293
293
|
end
|
294
294
|
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
295
295
|
|
296
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :
|
296
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
297
297
|
aln_test = aln_seq[1]
|
298
298
|
ref = aln_seq[0]
|
299
299
|
|
@@ -307,7 +307,7 @@ module ViralSeq
|
|
307
307
|
|
308
308
|
if (ori_ref_l - l2 - 1) >= l1
|
309
309
|
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
310
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :
|
310
|
+
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
311
311
|
aln_test = aln_seq[1]
|
312
312
|
ref = aln_seq[0]
|
313
313
|
|
data/lib/viral_seq/string.rb
CHANGED
@@ -56,6 +56,43 @@ class String
|
|
56
56
|
Regexp.new match
|
57
57
|
end
|
58
58
|
|
59
|
+
# parse the nucleotide sequences as an Array of Array
|
60
|
+
# @return [Array] Array of Array at each position
|
61
|
+
# @example parse a sequence with ambiguities to Array of Array
|
62
|
+
# "ATRWCG".nt_to_array
|
63
|
+
# => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
|
64
|
+
|
65
|
+
def nt_to_array
|
66
|
+
return_array = []
|
67
|
+
self.each_char.each do |base|
|
68
|
+
base_array = base.to_list
|
69
|
+
return_array.append base_array
|
70
|
+
end
|
71
|
+
return return_array
|
72
|
+
end
|
73
|
+
|
74
|
+
|
75
|
+
# compare the given nt sequence string with the ref sequence string
|
76
|
+
# @param ref [String] the ref sequence string to compare with
|
77
|
+
# @return [Integer] Number of differences
|
78
|
+
# @example count the positions where a sequence does not match a reference containing ambiguity codes
|
79
|
+
# "ATGACC".nt_diff("ATRWCG")
|
80
|
+
# => 1
|
81
|
+
|
82
|
+
def nt_diff(ref)
|
83
|
+
count_diff = 0
|
84
|
+
self_array = self.split("")
|
85
|
+
ref_array = ref.nt_to_array
|
86
|
+
self_array.each_with_index do |nt, i|
|
87
|
+
ref_nt = ref_array[i]
|
88
|
+
unless ref_nt.include? nt
|
89
|
+
count_diff += 1
|
90
|
+
end
|
91
|
+
end
|
92
|
+
return count_diff
|
93
|
+
end
|
94
|
+
|
95
|
+
|
59
96
|
# parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
|
60
97
|
# @return [Array] parsed nt bases
|
61
98
|
# @example parse IUPAC `R`
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -223,7 +223,7 @@ module ViralSeq
|
|
223
223
|
end
|
224
224
|
forward_bio_primer_size = forward_bio_primer.size
|
225
225
|
forward_starting_number = forward_n + forward_bio_primer_size
|
226
|
-
forward_primer_ref = forward_bio_primer.nt_parser
|
226
|
+
#forward_primer_ref = forward_bio_primer.nt_parser
|
227
227
|
|
228
228
|
r1_passed_seq = {}
|
229
229
|
r1_raw = r1_sh.dna_hash
|
@@ -232,7 +232,7 @@ module ViralSeq
|
|
232
232
|
seq = r1_raw[name]
|
233
233
|
next unless general_filter seq
|
234
234
|
primer_region_seq = seq[forward_n, forward_bio_primer_size]
|
235
|
-
if primer_region_seq
|
235
|
+
if primer_region_seq.nt_diff(forward_bio_primer) < 3
|
236
236
|
new_name = remove_tag name
|
237
237
|
r1_passed_seq[new_name] = seq
|
238
238
|
end
|
@@ -255,13 +255,13 @@ module ViralSeq
|
|
255
255
|
cdna_bio_primer = $2
|
256
256
|
cdna_bio_primer_size = cdna_bio_primer.size
|
257
257
|
reverse_starting_number = pid_length + cdna_bio_primer_size
|
258
|
-
|
258
|
+
# cdna_primer_ref = cdna_bio_primer.nt_to_array
|
259
259
|
r2_passed_seq = {}
|
260
260
|
proc_filter = proc do |name|
|
261
261
|
seq = r2_raw[name]
|
262
262
|
next unless general_filter seq
|
263
263
|
primer_region_seq = seq[pid_length, cdna_bio_primer_size]
|
264
|
-
if primer_region_seq
|
264
|
+
if primer_region_seq.nt_diff(cdna_bio_primer) < 4
|
265
265
|
new_name = remove_tag name
|
266
266
|
r2_passed_seq[new_name] = seq
|
267
267
|
end
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-05-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -193,7 +193,6 @@ files:
|
|
193
193
|
- lib/viral_seq/tcs_dr.rb
|
194
194
|
- lib/viral_seq/tcs_json.rb
|
195
195
|
- lib/viral_seq/version.rb
|
196
|
-
- rc_swans.svc@longleaf.unc.edu
|
197
196
|
- viral_seq.gemspec
|
198
197
|
homepage: https://github.com/ViralSeq/viral_seq
|
199
198
|
licenses:
|
Binary file
|