viral_seq 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +69 -37
- data/bin/tcs +70 -79
- data/bin/tcs_json_generator +7 -11
- data/lib/viral_seq/seq_hash.rb +23 -16
- data/lib/viral_seq/version.rb +2 -2
- data/viral_seq.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4921d3609d6ffc7fd6fbafd7a4a86e5818d47ed855393addd68b20f28b9d214f
|
4
|
+
data.tar.gz: a9e18c01b287885f8f6238343d9633a52d4ae5ea061347e73bd4f3e86788b2a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd21b57e17751f6c3e475f05b7a565d295ac7592b7c02f8d89ed49192834bee444f08ee9ebf48e41922c8caaf37a03651d5d0c9aa89d97ccc2edb9aad8224d5f
|
7
|
+
data.tar.gz: d1162424ea877d9839c179cacc330c81cd3508fcff07b64a1e753c7c706485d1dcb9a6b60aec9ce02ed33b91bbd4386ed58329c17e247ba086e7d81ed107bfd4
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.0.8)
|
4
|
+
viral_seq (1.0.9)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
muscle_bio (~> 0.4)
|
7
7
|
|
@@ -11,7 +11,7 @@ GEM
|
|
11
11
|
colorize (0.8.1)
|
12
12
|
diff-lcs (1.3)
|
13
13
|
muscle_bio (0.4.0)
|
14
|
-
rake (
|
14
|
+
rake (13.0.1)
|
15
15
|
rspec (3.8.0)
|
16
16
|
rspec-core (~> 3.8.0)
|
17
17
|
rspec-expectations (~> 3.8.0)
|
@@ -31,7 +31,7 @@ PLATFORMS
|
|
31
31
|
|
32
32
|
DEPENDENCIES
|
33
33
|
bundler (~> 2.0)
|
34
|
-
rake (~>
|
34
|
+
rake (~> 13.0)
|
35
35
|
rspec (~> 3.0)
|
36
36
|
viral_seq!
|
37
37
|
|
data/README.md
CHANGED
@@ -12,101 +12,133 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
12
12
|
|
13
13
|
#### Load all ViralSeq classes by requiring 'viral_seq.rb'
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
```ruby
|
16
|
+
#!/usr/bin/env ruby
|
17
|
+
require 'viral_seq'
|
18
|
+
```
|
17
19
|
|
18
20
|
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
19
21
|
|
20
22
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
23
|
|
24
|
+
|
25
|
+
#### Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data. Parameter json file can be generated using `tcs_json_generator` or at https://tcs-dr-dept-tcs.cloudapps.unc.edu/generator.php
|
26
|
+
|
27
|
+
$ tcs params.json
|
28
|
+
|
29
|
+
#### Use executable `tcs_json_generator` to generate params .json file for the `tcs` pipeline.
|
30
|
+
|
31
|
+
$ tcs_json_generator
|
32
|
+
|
33
|
+
|
22
34
|
## Some Examples
|
23
35
|
|
24
36
|
#### Load nucleotide sequences from a FASTA format sequence file
|
25
37
|
|
26
|
-
|
38
|
+
```ruby
|
39
|
+
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
40
|
+
```
|
27
41
|
|
28
42
|
#### Make an alignment (using MUSCLE)
|
29
43
|
|
30
|
-
|
44
|
+
```ruby
|
45
|
+
aligned_seqhash = my_seqhash.align
|
46
|
+
```
|
31
47
|
|
32
48
|
#### Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
33
49
|
|
34
|
-
|
50
|
+
```ruby
|
51
|
+
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
52
|
+
```
|
35
53
|
|
36
54
|
#### Further filter out sequences with Apobec3g/f hypermutations
|
37
55
|
|
38
|
-
|
56
|
+
```ruby
|
57
|
+
qc_seqhash = qc_seqhash.a3g
|
58
|
+
```
|
39
59
|
|
40
60
|
#### Calculate nucleotide diversity π
|
41
61
|
|
42
|
-
|
62
|
+
```ruby
|
63
|
+
qc_seqhash.pi
|
64
|
+
```
|
43
65
|
|
44
66
|
#### Calculate cut-off for minority variants based on Poisson model
|
45
67
|
|
46
|
-
|
68
|
+
```ruby
|
69
|
+
cut_off = qc_seqhash.pm
|
70
|
+
```
|
47
71
|
|
48
72
|
#### Examine for drug resistance mutations for HIV PR region
|
49
73
|
|
50
|
-
|
74
|
+
```ruby
|
75
|
+
qc_seqhash.sdrm_hiv_pr(cut_off)
|
76
|
+
```
|
51
77
|
|
52
78
|
## Updates
|
53
79
|
|
80
|
+
Version 1.0.9-07182020:
|
81
|
+
|
82
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
83
|
+
|
84
|
+
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (those that pass the quality filters) will be exported along with the TCS reads.
|
85
|
+
|
54
86
|
Version 1.0.8-02282020:
|
55
87
|
|
56
|
-
|
57
|
-
|
58
|
-
|
88
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
89
|
+
tcs - main TCS pipeline script.
|
90
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
59
91
|
|
60
|
-
|
61
|
-
|
92
|
+
2. Methods added:
|
93
|
+
ViralSeq::SeqHash#trim
|
62
94
|
|
63
|
-
|
95
|
+
3. Bug fix for several methods.
|
64
96
|
|
65
97
|
Version 1.0.7-01282020:
|
66
98
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
99
|
+
1. Several methods added, including
|
100
|
+
ViralSeq::SeqHash#error_table
|
101
|
+
ViralSeq::SeqHash#random_select
|
102
|
+
2. Improved performance for several functions.
|
71
103
|
|
72
104
|
Version 1.0.6-07232019:
|
73
105
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
106
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
107
|
+
ViralSeq::SeqHash#size
|
108
|
+
ViralSeq::SeqHash#+
|
109
|
+
ViralSeq::SeqHash#write_nt_fa
|
110
|
+
ViralSeq::SeqHash#mutation
|
111
|
+
2. Update documentation and rspec samples.
|
80
112
|
|
81
113
|
Version 1.0.5-07112019:
|
82
114
|
|
83
|
-
|
84
|
-
|
85
|
-
|
115
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
116
|
+
Program will try to determine the direction (`+` or `-`) of the query sequence.
|
117
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
86
118
|
|
87
119
|
Version 1.0.4-07102019:
|
88
120
|
|
89
|
-
|
90
|
-
|
121
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
122
|
+
2. Fix bugs in bin `locator`
|
91
123
|
|
92
124
|
Version 1.0.3-07102019:
|
93
125
|
|
94
|
-
|
126
|
+
1. Bug fix.
|
95
127
|
|
96
128
|
Version 1.0.2-07102019:
|
97
129
|
|
98
|
-
|
130
|
+
1. Fixed a gem loading issue.
|
99
131
|
|
100
132
|
Version 1.0.1-07102019:
|
101
133
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
134
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
135
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
136
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
137
|
+
4. Update documentation.
|
106
138
|
|
107
139
|
Version 1.0.0-07092019:
|
108
140
|
|
109
|
-
|
141
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
110
142
|
|
111
143
|
## Development
|
112
144
|
|
data/bin/tcs
CHANGED
@@ -29,69 +29,6 @@ require 'viral_seq'
|
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
31
|
|
32
|
-
# updated the ViralSeq module. Push with the new version.
|
33
|
-
|
34
|
-
module ViralSeq
|
35
|
-
class SeqHash
|
36
|
-
def self.new_from_fastq(fastq_file)
|
37
|
-
count = 0
|
38
|
-
sequence_a = []
|
39
|
-
quality_a = []
|
40
|
-
count_seq = 0
|
41
|
-
|
42
|
-
File.open(fastq_file,'r') do |file|
|
43
|
-
file.readlines.collect do |line|
|
44
|
-
count +=1
|
45
|
-
count_m = count % 4
|
46
|
-
if count_m == 1
|
47
|
-
line.tr!('@','>')
|
48
|
-
sequence_a << line.chomp
|
49
|
-
quality_a << line.chomp
|
50
|
-
count_seq += 1
|
51
|
-
elsif count_m == 2
|
52
|
-
sequence_a << line.chomp
|
53
|
-
elsif count_m == 0
|
54
|
-
quality_a << line.chomp
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
59
|
-
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
60
|
-
|
61
|
-
seq_hash = ViralSeq::SeqHash.new
|
62
|
-
seq_hash.dna_hash = sequence_hash
|
63
|
-
seq_hash.qc_hash = quality_hash
|
64
|
-
seq_hash.title = File.basename(fastq_file,".*")
|
65
|
-
seq_hash.file = fastq_file
|
66
|
-
return seq_hash
|
67
|
-
end # end of ::new_from_fastq
|
68
|
-
|
69
|
-
class << self
|
70
|
-
alias_method :fq, :new_from_fastq
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
module ViralSeq
|
76
|
-
class SeqHash
|
77
|
-
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
78
|
-
seq_hash = self.dna_hash.dup
|
79
|
-
seq_hash_unique = seq_hash.uniq_hash
|
80
|
-
trimmed_seq_hash = {}
|
81
|
-
seq_hash_unique.each do |seq, names|
|
82
|
-
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
83
|
-
names.each do |name|
|
84
|
-
trimmed_seq_hash[name] = trimmed_seq
|
85
|
-
end
|
86
|
-
end
|
87
|
-
return_seq_hash = self.dup
|
88
|
-
return_seq_hash.dna_hash = trimmed_seq_hash
|
89
|
-
return return_seq_hash
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
# end of additonal methods. Delete before publish
|
95
32
|
|
96
33
|
# calculate consensus cutoff
|
97
34
|
|
@@ -127,12 +64,9 @@ def calculate_cut_off(m, error_rate = 0.02)
|
|
127
64
|
return n
|
128
65
|
end
|
129
66
|
|
130
|
-
|
131
|
-
TCS_VERSION
|
132
|
-
|
133
|
-
puts "\n" + '-'*58
|
134
|
-
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
135
|
-
puts '-'*58 + "\n"
|
67
|
+
puts "\n" + '-'*50
|
68
|
+
puts '| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
69
|
+
puts '-'*50 + "\n"
|
136
70
|
|
137
71
|
unless ARGV[0]
|
138
72
|
raise "No JSON param file found. Script terminated."
|
@@ -173,7 +107,7 @@ def unzip_r(indir, f)
|
|
173
107
|
end
|
174
108
|
runtime_log_file = File.join(indir,"runtime.log")
|
175
109
|
log = File.open(runtime_log_file, "w")
|
176
|
-
log.puts "TSC pipeline Version " + TCS_VERSION.to_s
|
110
|
+
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
177
111
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
178
112
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
179
113
|
|
@@ -224,7 +158,7 @@ end
|
|
224
158
|
|
225
159
|
primers.each do |primer|
|
226
160
|
summary_json = {}
|
227
|
-
summary_json[:tcs_version] = TCS_VERSION
|
161
|
+
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
228
162
|
summary_json[:viralseq_version] = ViralSeq::VERSION
|
229
163
|
summary_json[:runtime] = Time.now.to_s
|
230
164
|
|
@@ -233,6 +167,9 @@ primers.each do |primer|
|
|
233
167
|
|
234
168
|
cdna_primer = primer[:cdna]
|
235
169
|
forward_primer = primer[:forward]
|
170
|
+
|
171
|
+
export_raw = primer[:export_raw]
|
172
|
+
|
236
173
|
unless cdna_primer
|
237
174
|
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
238
175
|
end
|
@@ -363,10 +300,30 @@ primers.each do |primer|
|
|
363
300
|
out_dir_consensus = File.join(out_dir_set, "consensus")
|
364
301
|
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
365
302
|
|
366
|
-
outfile_r1 = File.join(out_dir_consensus, 'r1.
|
367
|
-
outfile_r2 = File.join(out_dir_consensus, 'r2.
|
303
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
304
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
368
305
|
outfile_log = File.join(out_dir_set, 'log.json')
|
369
306
|
|
307
|
+
# if export_raw is true, create dir for raw sequence
|
308
|
+
if export_raw
|
309
|
+
out_dir_raw = File.join(out_dir_set, "raw")
|
310
|
+
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
311
|
+
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
312
|
+
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
313
|
+
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
314
|
+
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
315
|
+
|
316
|
+
bio_r1.keys.each do |k|
|
317
|
+
raw_r1_f.puts k + "_r1"
|
318
|
+
raw_r2_f.puts k + "_r2"
|
319
|
+
raw_r1_f.puts bio_r1[k]
|
320
|
+
raw_r2_f.puts bio_r2[k].rc
|
321
|
+
end
|
322
|
+
|
323
|
+
raw_r1_f.close
|
324
|
+
raw_r2_f.close
|
325
|
+
end
|
326
|
+
|
370
327
|
# create TCS
|
371
328
|
|
372
329
|
pid_seqtag_hash = {}
|
@@ -456,19 +413,30 @@ primers.each do |primer|
|
|
456
413
|
f.puts JSON.pretty_generate(pid_json)
|
457
414
|
end
|
458
415
|
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
case primer[:end_join_option]
|
416
|
+
def end_join(dir, option, overlap)
|
417
|
+
shp = ViralSeq::SeqHashPair.fa(dir)
|
418
|
+
case option
|
463
419
|
when 1
|
464
|
-
joined_sh = shp.join1(
|
420
|
+
joined_sh = shp.join1()
|
465
421
|
when 3
|
466
422
|
joined_sh = shp.join2
|
467
423
|
when 4
|
468
424
|
joined_sh = shp.join2(model: :indiv)
|
469
425
|
end
|
426
|
+
return joined_sh
|
427
|
+
end
|
428
|
+
|
429
|
+
if primer[:end_join]
|
430
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
431
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
432
|
+
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
470
433
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
471
434
|
summary_json[:combined_tcs] = joined_sh.size
|
435
|
+
|
436
|
+
if export_raw
|
437
|
+
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
438
|
+
end
|
439
|
+
|
472
440
|
else
|
473
441
|
File.open(outfile_log, "w") do |f|
|
474
442
|
f.puts JSON.pretty_generate(summary_json)
|
@@ -501,8 +469,28 @@ primers.each do |primer|
|
|
501
469
|
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
502
470
|
end
|
503
471
|
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
472
|
+
|
473
|
+
if export_raw
|
474
|
+
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
475
|
+
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
476
|
+
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
477
|
+
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
478
|
+
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
479
|
+
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
480
|
+
joined_seq_raw = {}
|
481
|
+
new_r1_seq_raw.each do |seq_name, seq|
|
482
|
+
next unless seq
|
483
|
+
next unless new_r2_seq_raw[seq_name]
|
484
|
+
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
485
|
+
end
|
486
|
+
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
487
|
+
end
|
504
488
|
else
|
505
489
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
490
|
+
|
491
|
+
if export_raw
|
492
|
+
joined_sh_raw = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
493
|
+
end
|
506
494
|
end
|
507
495
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
508
496
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
@@ -512,7 +500,10 @@ primers.each do |primer|
|
|
512
500
|
trim_ref = primer[:trim_ref].to_sym
|
513
501
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
514
502
|
end
|
515
|
-
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.
|
503
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
504
|
+
if export_raw
|
505
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.fasta"))
|
506
|
+
end
|
516
507
|
end
|
517
508
|
|
518
509
|
File.open(outfile_log, "w") do |f|
|
data/bin/tcs_json_generator
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
# TCS pipeline JSON params generator.
|
4
4
|
|
5
|
+
require 'viral_seq'
|
5
6
|
require 'colorize'
|
6
7
|
require 'json'
|
7
8
|
|
@@ -26,10 +27,8 @@ def get_ref
|
|
26
27
|
end
|
27
28
|
end
|
28
29
|
|
29
|
-
TCS_VERSION = "2.0.0"
|
30
|
-
|
31
30
|
puts "\n" + '-'*58
|
32
|
-
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
31
|
+
puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
33
32
|
puts '-'*58 + "\n"
|
34
33
|
|
35
34
|
param = {}
|
@@ -48,8 +47,8 @@ else
|
|
48
47
|
end
|
49
48
|
|
50
49
|
param[:primer_pairs] = []
|
51
|
-
|
52
|
-
|
50
|
+
|
51
|
+
loop do
|
53
52
|
data = {}
|
54
53
|
puts "Enter the name for the sequenced region: "
|
55
54
|
print '> '
|
@@ -147,14 +146,11 @@ while continue
|
|
147
146
|
data[:end_join] = false
|
148
147
|
end
|
149
148
|
|
149
|
+
param[:primer_pairs] << data
|
150
150
|
print "Do you wish to conintue? Y/N \n> "
|
151
151
|
continue_sig = gets.chomp.rstrip
|
152
|
-
|
153
|
-
|
154
|
-
else
|
155
|
-
continue = false
|
156
|
-
end
|
157
|
-
param[:primer_pairs] << data
|
152
|
+
break unless continue_sig =~ /y|yes/i
|
153
|
+
|
158
154
|
end
|
159
155
|
|
160
156
|
puts "\nYour JSON string is:"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -313,22 +313,22 @@ module ViralSeq
|
|
313
313
|
|
314
314
|
# screen for sequences with stop codons.
|
315
315
|
# @param (see #translate)
|
316
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
317
317
|
#
|
318
|
-
# #
|
319
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
320
320
|
# @example given a hash of sequences, return a sub-hash with only the sequences that contain stop codons
|
321
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
322
322
|
# my_seqhash.dna_hash
|
323
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
324
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
325
325
|
# stop_codon_seqhash.dna_hash
|
326
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
327
327
|
# stop_codon_seqhash.aa_hash
|
328
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
329
329
|
# stop_codon_seqhash.title
|
330
330
|
# => "my_fasta_file_stop"
|
331
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
332
332
|
# filtered_seqhash.aa_hash
|
333
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
334
334
|
|
@@ -343,7 +343,10 @@ module ViralSeq
|
|
343
343
|
seqhash1.title = self.title + "_stop"
|
344
344
|
keys2 = aa_seqs.keys - keys
|
345
345
|
seqhash2 = self.sub(keys2)
|
346
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
347
350
|
end #end of #stop_codon
|
348
351
|
|
349
352
|
|
@@ -399,10 +402,10 @@ module ViralSeq
|
|
399
402
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
400
403
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
401
404
|
# # b/c Poisson model does not do well on small sample size.
|
402
|
-
# @return [
|
403
|
-
#
|
404
|
-
#
|
405
|
-
#
|
405
|
+
# @return [Hash] three pairs.
|
406
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
407
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
408
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
406
409
|
# # sequence tag
|
407
410
|
# # G to A mutation numbers at potential a3g positions
|
408
411
|
# # total potential a3g G positions
|
@@ -413,17 +416,17 @@ module ViralSeq
|
|
413
416
|
# @example identify apobec3gf mutations from a sequence fasta file
|
414
417
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
415
418
|
# hypermut = my_seqhash.a3g
|
416
|
-
# hypermut[
|
419
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
417
420
|
# => [">Seq7", ">Seq14"]
|
418
|
-
# hypermut[
|
421
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
419
422
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
420
|
-
# hypermut[
|
423
|
+
# hypermut[:stats]
|
421
424
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
422
425
|
#
|
423
426
|
# @example identify apobec3gf mutations from another sequence fasta file
|
424
427
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
425
428
|
# hypermut = my_seqhash.a3g
|
426
|
-
# hypermut[
|
429
|
+
# hypermut[:stats]
|
427
430
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
428
431
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
429
432
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -516,7 +519,10 @@ module ViralSeq
|
|
516
519
|
hm_seq_hash.title = self.title + "_hypermut"
|
517
520
|
hm_seq_hash.file = self.file
|
518
521
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
519
|
-
return
|
522
|
+
return { a3g_seq: hm_seq_hash,
|
523
|
+
filtered_seq: filtered_seq_hash,
|
524
|
+
stats: hm_hash.values
|
525
|
+
}
|
520
526
|
end #end of #a3g_hypermut
|
521
527
|
|
522
528
|
alias_method :a3g, :a3g_hypermut
|
@@ -730,6 +736,7 @@ module ViralSeq
|
|
730
736
|
|
731
737
|
seq_hash_unique.each do |seq|
|
732
738
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
739
|
+
next unless loc # if locator tool fails, skip this seq.
|
733
740
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
734
741
|
if indel
|
735
742
|
seq_hash_unique_pass << seq
|
@@ -1151,7 +1158,7 @@ module ViralSeq
|
|
1151
1158
|
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1152
1159
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1153
1160
|
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1154
|
-
|
1161
|
+
|
1155
1162
|
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1156
1163
|
seq_hash = self.dna_hash.dup
|
1157
1164
|
seq_hash_unique = seq_hash.uniq_hash
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.post_install_message = "Thanks for installing!"
|
27
27
|
|
28
28
|
spec.add_development_dependency "bundler", "~> 2.0"
|
29
|
-
spec.add_development_dependency "rake", "~>
|
29
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.8
|
4
|
+
version: 1.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-07-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -31,14 +31,14 @@ dependencies:
|
|
31
31
|
requirements:
|
32
32
|
- - "~>"
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version: '
|
34
|
+
version: '13.0'
|
35
35
|
type: :development
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
39
|
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version: '
|
41
|
+
version: '13.0'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
43
|
name: rspec
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|