viral_seq 1.0.8 → 1.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +69 -37
- data/bin/tcs +70 -79
- data/bin/tcs_json_generator +7 -11
- data/lib/viral_seq/seq_hash.rb +23 -16
- data/lib/viral_seq/version.rb +2 -2
- data/viral_seq.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4921d3609d6ffc7fd6fbafd7a4a86e5818d47ed855393addd68b20f28b9d214f
|
4
|
+
data.tar.gz: a9e18c01b287885f8f6238343d9633a52d4ae5ea061347e73bd4f3e86788b2a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd21b57e17751f6c3e475f05b7a565d295ac7592b7c02f8d89ed49192834bee444f08ee9ebf48e41922c8caaf37a03651d5d0c9aa89d97ccc2edb9aad8224d5f
|
7
|
+
data.tar.gz: d1162424ea877d9839c179cacc330c81cd3508fcff07b64a1e753c7c706485d1dcb9a6b60aec9ce02ed33b91bbd4386ed58329c17e247ba086e7d81ed107bfd4
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.0.
|
4
|
+
viral_seq (1.0.9)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
muscle_bio (~> 0.4)
|
7
7
|
|
@@ -11,7 +11,7 @@ GEM
|
|
11
11
|
colorize (0.8.1)
|
12
12
|
diff-lcs (1.3)
|
13
13
|
muscle_bio (0.4.0)
|
14
|
-
rake (
|
14
|
+
rake (13.0.1)
|
15
15
|
rspec (3.8.0)
|
16
16
|
rspec-core (~> 3.8.0)
|
17
17
|
rspec-expectations (~> 3.8.0)
|
@@ -31,7 +31,7 @@ PLATFORMS
|
|
31
31
|
|
32
32
|
DEPENDENCIES
|
33
33
|
bundler (~> 2.0)
|
34
|
-
rake (~>
|
34
|
+
rake (~> 13.0)
|
35
35
|
rspec (~> 3.0)
|
36
36
|
viral_seq!
|
37
37
|
|
data/README.md
CHANGED
@@ -12,101 +12,133 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
12
12
|
|
13
13
|
#### Load all ViralSeq classes by requiring 'viral_seq.rb'
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
```ruby
|
16
|
+
#!/usr/bin/env ruby
|
17
|
+
require 'viral_seq'
|
18
|
+
```
|
17
19
|
|
18
20
|
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
19
21
|
|
20
22
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
21
23
|
|
24
|
+
|
25
|
+
#### Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data. Parameter json file can be generated using `tcs_json_generator` or at https://tcs-dr-dept-tcs.cloudapps.unc.edu/generator.php
|
26
|
+
|
27
|
+
$ tcs params.json
|
28
|
+
|
29
|
+
#### Use executable `tcs_json_generator` to generate params .json file for the `tcs` pipeline.
|
30
|
+
|
31
|
+
$ tcs_json_generator
|
32
|
+
|
33
|
+
|
22
34
|
## Some Examples
|
23
35
|
|
24
36
|
#### Load nucleotide sequences from a FASTA format sequence file
|
25
37
|
|
26
|
-
|
38
|
+
```ruby
|
39
|
+
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
40
|
+
```
|
27
41
|
|
28
42
|
#### Make an alignment (using MUSCLE)
|
29
43
|
|
30
|
-
|
44
|
+
```ruby
|
45
|
+
aligned_seqhash = my_seqhash.align
|
46
|
+
```
|
31
47
|
|
32
48
|
#### Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
33
49
|
|
34
|
-
|
50
|
+
```ruby
|
51
|
+
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
52
|
+
```
|
35
53
|
|
36
54
|
#### Further filter out sequences with Apobec3g/f hypermutations
|
37
55
|
|
38
|
-
|
56
|
+
```ruby
|
57
|
+
qc_seqhash = qc_seqhash.a3g
|
58
|
+
```
|
39
59
|
|
40
60
|
#### Calculate nucleotide diversity π
|
41
61
|
|
42
|
-
|
62
|
+
```ruby
|
63
|
+
qc_seqhash.pi
|
64
|
+
```
|
43
65
|
|
44
66
|
#### Calculate cut-off for minority variants based on Poisson model
|
45
67
|
|
46
|
-
|
68
|
+
```ruby
|
69
|
+
cut_off = qc_seqhash.pm
|
70
|
+
```
|
47
71
|
|
48
72
|
#### Examine for drug resistance mutations for HIV PR region
|
49
73
|
|
50
|
-
|
74
|
+
```ruby
|
75
|
+
qc_seqhash.sdrm_hiv_pr(cut_off)
|
76
|
+
```
|
51
77
|
|
52
78
|
## Updates
|
53
79
|
|
80
|
+
Version 1.0.9-07182020:
|
81
|
+
|
82
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
83
|
+
|
84
|
+
2. TCS pipeline updated to version 2.0.1. Added an optional `export_raw: TRUE/FALSE` key to the JSON params. If `export_raw` is `TRUE`, raw sequence reads that pass the quality filters are exported along with the TCS reads.
|
85
|
+
|
54
86
|
Version 1.0.8-02282020:
|
55
87
|
|
56
|
-
|
57
|
-
|
58
|
-
|
88
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
89
|
+
tcs - main TCS pipeline script.
|
90
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
59
91
|
|
60
|
-
|
61
|
-
|
92
|
+
2. Methods added:
|
93
|
+
ViralSeq::SeqHash#trim
|
62
94
|
|
63
|
-
|
95
|
+
3. Bug fix for several methods.
|
64
96
|
|
65
97
|
Version 1.0.7-01282020:
|
66
98
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
99
|
+
1. Several methods added, including
|
100
|
+
ViralSeq::SeqHash#error_table
|
101
|
+
ViralSeq::SeqHash#random_select
|
102
|
+
2. Improved performance for several functions.
|
71
103
|
|
72
104
|
Version 1.0.6-07232019:
|
73
105
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
106
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
107
|
+
ViralSeq::SeqHash#size
|
108
|
+
ViralSeq::SeqHash#+
|
109
|
+
ViralSeq::SeqHash#write_nt_fa
|
110
|
+
ViralSeq::SeqHash#mutation
|
111
|
+
2. Update documentation and RSpec samples.
|
80
112
|
|
81
113
|
Version 1.0.5-07112019:
|
82
114
|
|
83
|
-
|
84
|
-
|
85
|
-
|
115
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
116
|
+
Program will try to determine the direction (`+` or `-` of the query sequence)
|
117
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
86
118
|
|
87
119
|
Version 1.0.4-07102019:
|
88
120
|
|
89
|
-
|
90
|
-
|
121
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
122
|
+
2. Fix bugs in bin `locator`
|
91
123
|
|
92
124
|
Version 1.0.3-07102019:
|
93
125
|
|
94
|
-
|
126
|
+
1. Bug fix.
|
95
127
|
|
96
128
|
Version 1.0.2-07102019:
|
97
129
|
|
98
|
-
|
130
|
+
1. Fixed a gem loading issue.
|
99
131
|
|
100
132
|
Version 1.0.1-07102019:
|
101
133
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
134
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
135
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
136
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
137
|
+
4. Update documentation.
|
106
138
|
|
107
139
|
Version 1.0.0-07092019:
|
108
140
|
|
109
|
-
|
141
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
110
142
|
|
111
143
|
## Development
|
112
144
|
|
data/bin/tcs
CHANGED
@@ -29,69 +29,6 @@ require 'viral_seq'
|
|
29
29
|
require 'json'
|
30
30
|
require 'colorize'
|
31
31
|
|
32
|
-
# updated the ViralSeq module. Push with the new version.
|
33
|
-
|
34
|
-
module ViralSeq
|
35
|
-
class SeqHash
|
36
|
-
def self.new_from_fastq(fastq_file)
|
37
|
-
count = 0
|
38
|
-
sequence_a = []
|
39
|
-
quality_a = []
|
40
|
-
count_seq = 0
|
41
|
-
|
42
|
-
File.open(fastq_file,'r') do |file|
|
43
|
-
file.readlines.collect do |line|
|
44
|
-
count +=1
|
45
|
-
count_m = count % 4
|
46
|
-
if count_m == 1
|
47
|
-
line.tr!('@','>')
|
48
|
-
sequence_a << line.chomp
|
49
|
-
quality_a << line.chomp
|
50
|
-
count_seq += 1
|
51
|
-
elsif count_m == 2
|
52
|
-
sequence_a << line.chomp
|
53
|
-
elsif count_m == 0
|
54
|
-
quality_a << line.chomp
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
sequence_hash = Hash[sequence_a.each_slice(2).to_a]
|
59
|
-
quality_hash = Hash[quality_a.each_slice(2).to_a]
|
60
|
-
|
61
|
-
seq_hash = ViralSeq::SeqHash.new
|
62
|
-
seq_hash.dna_hash = sequence_hash
|
63
|
-
seq_hash.qc_hash = quality_hash
|
64
|
-
seq_hash.title = File.basename(fastq_file,".*")
|
65
|
-
seq_hash.file = fastq_file
|
66
|
-
return seq_hash
|
67
|
-
end # end of ::new_from_fastq
|
68
|
-
|
69
|
-
class << self
|
70
|
-
alias_method :fq, :new_from_fastq
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
module ViralSeq
|
76
|
-
class SeqHash
|
77
|
-
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
78
|
-
seq_hash = self.dna_hash.dup
|
79
|
-
seq_hash_unique = seq_hash.uniq_hash
|
80
|
-
trimmed_seq_hash = {}
|
81
|
-
seq_hash_unique.each do |seq, names|
|
82
|
-
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
|
83
|
-
names.each do |name|
|
84
|
-
trimmed_seq_hash[name] = trimmed_seq
|
85
|
-
end
|
86
|
-
end
|
87
|
-
return_seq_hash = self.dup
|
88
|
-
return_seq_hash.dna_hash = trimmed_seq_hash
|
89
|
-
return return_seq_hash
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
# end of additonal methods. Delete before publish
|
95
32
|
|
96
33
|
# calculate consensus cutoff
|
97
34
|
|
@@ -127,12 +64,9 @@ def calculate_cut_off(m, error_rate = 0.02)
|
|
127
64
|
return n
|
128
65
|
end
|
129
66
|
|
130
|
-
|
131
|
-
TCS_VERSION
|
132
|
-
|
133
|
-
puts "\n" + '-'*58
|
134
|
-
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
135
|
-
puts '-'*58 + "\n"
|
67
|
+
puts "\n" + '-'*50
|
68
|
+
puts '| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
69
|
+
puts '-'*50 + "\n"
|
136
70
|
|
137
71
|
unless ARGV[0]
|
138
72
|
raise "No JSON param file found. Script terminated."
|
@@ -173,7 +107,7 @@ def unzip_r(indir, f)
|
|
173
107
|
end
|
174
108
|
runtime_log_file = File.join(indir,"runtime.log")
|
175
109
|
log = File.open(runtime_log_file, "w")
|
176
|
-
log.puts "TSC pipeline Version " + TCS_VERSION.to_s
|
110
|
+
log.puts "TSC pipeline Version " + ViralSeq::TCS_VERSION.to_s
|
177
111
|
log.puts "viral_seq Version " + ViralSeq::VERSION.to_s
|
178
112
|
log.puts Time.now.to_s + "\t" + "Start TCS pipeline..."
|
179
113
|
|
@@ -224,7 +158,7 @@ end
|
|
224
158
|
|
225
159
|
primers.each do |primer|
|
226
160
|
summary_json = {}
|
227
|
-
summary_json[:tcs_version] = TCS_VERSION
|
161
|
+
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
228
162
|
summary_json[:viralseq_version] = ViralSeq::VERSION
|
229
163
|
summary_json[:runtime] = Time.now.to_s
|
230
164
|
|
@@ -233,6 +167,9 @@ primers.each do |primer|
|
|
233
167
|
|
234
168
|
cdna_primer = primer[:cdna]
|
235
169
|
forward_primer = primer[:forward]
|
170
|
+
|
171
|
+
export_raw = primer[:export_raw]
|
172
|
+
|
236
173
|
unless cdna_primer
|
237
174
|
log.puts Time.now.to_s + "\t" + region + " does not have cDNA primer sequence. #{region} skipped."
|
238
175
|
end
|
@@ -363,10 +300,30 @@ primers.each do |primer|
|
|
363
300
|
out_dir_consensus = File.join(out_dir_set, "consensus")
|
364
301
|
Dir.mkdir(out_dir_consensus) unless File.directory?(out_dir_consensus)
|
365
302
|
|
366
|
-
outfile_r1 = File.join(out_dir_consensus, 'r1.
|
367
|
-
outfile_r2 = File.join(out_dir_consensus, 'r2.
|
303
|
+
outfile_r1 = File.join(out_dir_consensus, 'r1.fasta')
|
304
|
+
outfile_r2 = File.join(out_dir_consensus, 'r2.fasta')
|
368
305
|
outfile_log = File.join(out_dir_set, 'log.json')
|
369
306
|
|
307
|
+
# if export_raw is true, create dir for raw sequence
|
308
|
+
if export_raw
|
309
|
+
out_dir_raw = File.join(out_dir_set, "raw")
|
310
|
+
Dir.mkdir(out_dir_raw) unless File.directory?(out_dir_raw)
|
311
|
+
outfile_raw_r1 = File.join(out_dir_raw, 'r1.raw.fasta')
|
312
|
+
outfile_raw_r2 = File.join(out_dir_raw, 'r2.raw.fasta')
|
313
|
+
raw_r1_f = File.open(outfile_raw_r1, 'w')
|
314
|
+
raw_r2_f = File.open(outfile_raw_r2, 'w')
|
315
|
+
|
316
|
+
bio_r1.keys.each do |k|
|
317
|
+
raw_r1_f.puts k + "_r1"
|
318
|
+
raw_r2_f.puts k + "_r2"
|
319
|
+
raw_r1_f.puts bio_r1[k]
|
320
|
+
raw_r2_f.puts bio_r2[k].rc
|
321
|
+
end
|
322
|
+
|
323
|
+
raw_r1_f.close
|
324
|
+
raw_r2_f.close
|
325
|
+
end
|
326
|
+
|
370
327
|
# create TCS
|
371
328
|
|
372
329
|
pid_seqtag_hash = {}
|
@@ -456,19 +413,30 @@ primers.each do |primer|
|
|
456
413
|
f.puts JSON.pretty_generate(pid_json)
|
457
414
|
end
|
458
415
|
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
case primer[:end_join_option]
|
416
|
+
def end_join(dir, option, overlap)
|
417
|
+
shp = ViralSeq::SeqHashPair.fa(dir)
|
418
|
+
case option
|
463
419
|
when 1
|
464
|
-
joined_sh = shp.join1(
|
420
|
+
joined_sh = shp.join1()
|
465
421
|
when 3
|
466
422
|
joined_sh = shp.join2
|
467
423
|
when 4
|
468
424
|
joined_sh = shp.join2(model: :indiv)
|
469
425
|
end
|
426
|
+
return joined_sh
|
427
|
+
end
|
428
|
+
|
429
|
+
if primer[:end_join]
|
430
|
+
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
431
|
+
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
432
|
+
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
470
433
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
471
434
|
summary_json[:combined_tcs] = joined_sh.size
|
435
|
+
|
436
|
+
if export_raw
|
437
|
+
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
438
|
+
end
|
439
|
+
|
472
440
|
else
|
473
441
|
File.open(outfile_log, "w") do |f|
|
474
442
|
f.puts JSON.pretty_generate(summary_json)
|
@@ -501,8 +469,28 @@ primers.each do |primer|
|
|
501
469
|
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
502
470
|
end
|
503
471
|
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
472
|
+
|
473
|
+
if export_raw
|
474
|
+
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
475
|
+
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
476
|
+
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
477
|
+
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
478
|
+
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
479
|
+
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
480
|
+
joined_seq_raw = {}
|
481
|
+
new_r1_seq_raw.each do |seq_name, seq|
|
482
|
+
next unless seq
|
483
|
+
next unless new_r2_seq_raw[seq_name]
|
484
|
+
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
485
|
+
end
|
486
|
+
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
487
|
+
end
|
504
488
|
else
|
505
489
|
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
490
|
+
|
491
|
+
if export_raw
|
492
|
+
joined_sh_raw = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
493
|
+
end
|
506
494
|
end
|
507
495
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
508
496
|
summary_json[:combined_tcs_after_qc] = joined_sh.size
|
@@ -512,7 +500,10 @@ primers.each do |primer|
|
|
512
500
|
trim_ref = primer[:trim_ref].to_sym
|
513
501
|
joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
|
514
502
|
end
|
515
|
-
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.
|
503
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
504
|
+
if export_raw
|
505
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.fasta"))
|
506
|
+
end
|
516
507
|
end
|
517
508
|
|
518
509
|
File.open(outfile_log, "w") do |f|
|
data/bin/tcs_json_generator
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
# TCS pipeline JSON params generator.
|
4
4
|
|
5
|
+
require 'viral_seq'
|
5
6
|
require 'colorize'
|
6
7
|
require 'json'
|
7
8
|
|
@@ -26,10 +27,8 @@ def get_ref
|
|
26
27
|
end
|
27
28
|
end
|
28
29
|
|
29
|
-
TCS_VERSION = "2.0.0"
|
30
|
-
|
31
30
|
puts "\n" + '-'*58
|
32
|
-
puts '| JSON Parameter Generator for ' + "TCS #{TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
31
|
+
puts '| JSON Parameter Generator for ' + "TCS #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |'
|
33
32
|
puts '-'*58 + "\n"
|
34
33
|
|
35
34
|
param = {}
|
@@ -48,8 +47,8 @@ else
|
|
48
47
|
end
|
49
48
|
|
50
49
|
param[:primer_pairs] = []
|
51
|
-
|
52
|
-
|
50
|
+
|
51
|
+
loop do
|
53
52
|
data = {}
|
54
53
|
puts "Enter the name for the sequenced region: "
|
55
54
|
print '> '
|
@@ -147,14 +146,11 @@ while continue
|
|
147
146
|
data[:end_join] = false
|
148
147
|
end
|
149
148
|
|
149
|
+
param[:primer_pairs] << data
|
150
150
|
print "Do you wish to conintue? Y/N \n> "
|
151
151
|
continue_sig = gets.chomp.rstrip
|
152
|
-
|
153
|
-
|
154
|
-
else
|
155
|
-
continue = false
|
156
|
-
end
|
157
|
-
param[:primer_pairs] << data
|
152
|
+
break unless continue_sig =~ /y|yes/i
|
153
|
+
|
158
154
|
end
|
159
155
|
|
160
156
|
puts "\nYour JSON string is:"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -313,22 +313,22 @@ module ViralSeq
|
|
313
313
|
|
314
314
|
# screen for sequences with stop codons.
|
315
315
|
# @param (see #translate)
|
316
|
-
# @return [
|
316
|
+
# @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
|
317
317
|
#
|
318
|
-
# #
|
319
|
-
# #
|
318
|
+
# # :with_stop_codon : ViralSeq::SeqHash object with stop codons
|
319
|
+
# # :without_stop_codon: ViralSeq::SeqHash object without stop codons
|
320
320
|
# @example given a hash of sequences, return a sub-hash of only the sequences that contain stop codons
|
321
321
|
# my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
|
322
322
|
# my_seqhash.dna_hash
|
323
323
|
# => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
324
|
-
# stop_codon_seqhash = my_seqhash.stop_codon[
|
324
|
+
# stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
|
325
325
|
# stop_codon_seqhash.dna_hash
|
326
326
|
# => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
|
327
327
|
# stop_codon_seqhash.aa_hash
|
328
328
|
# => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
|
329
329
|
# stop_codon_seqhash.title
|
330
330
|
# => "my_fasta_file_stop"
|
331
|
-
# filtered_seqhash = my_seqhash.stop_codon[
|
331
|
+
# filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
|
332
332
|
# filtered_seqhash.aa_hash
|
333
333
|
# {">seq1"=>"IRT", ">seq3"=>"MRT"}
|
334
334
|
|
@@ -343,7 +343,10 @@ module ViralSeq
|
|
343
343
|
seqhash1.title = self.title + "_stop"
|
344
344
|
keys2 = aa_seqs.keys - keys
|
345
345
|
seqhash2 = self.sub(keys2)
|
346
|
-
return
|
346
|
+
return {
|
347
|
+
with_stop_codon: seqhash1,
|
348
|
+
without_stop_codon: seqhash2
|
349
|
+
}
|
347
350
|
end #end of #stop_codon
|
348
351
|
|
349
352
|
|
@@ -399,10 +402,10 @@ module ViralSeq
|
|
399
402
|
# # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
400
403
|
# # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
|
401
404
|
# # b/c Poisson model does not do well on small sample size.
|
402
|
-
# @return [
|
403
|
-
#
|
404
|
-
#
|
405
|
-
#
|
405
|
+
# @return [Hash] three pairs.
|
406
|
+
# :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
|
407
|
+
# :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
|
408
|
+
# :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
|
406
409
|
# # sequence tag
|
407
410
|
# # G to A mutation numbers at potential a3g positions
|
408
411
|
# # total potential a3g G positions
|
@@ -413,17 +416,17 @@ module ViralSeq
|
|
413
416
|
# @example identify apobec3gf mutations from a sequence fasta file
|
414
417
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
|
415
418
|
# hypermut = my_seqhash.a3g
|
416
|
-
# hypermut[
|
419
|
+
# hypermut[:a3g_seq].dna_hash.keys
|
417
420
|
# => [">Seq7", ">Seq14"]
|
418
|
-
# hypermut[
|
421
|
+
# hypermut[:filtered_seq].dna_hash.keys
|
419
422
|
# => [">Seq1", ">Seq2", ">Seq5"]
|
420
|
-
# hypermut[
|
423
|
+
# hypermut[:stats]
|
421
424
|
# => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
|
422
425
|
#
|
423
426
|
# @example identify apobec3gf mutations from another sequence fasta file
|
424
427
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
|
425
428
|
# hypermut = my_seqhash.a3g
|
426
|
-
# hypermut[
|
429
|
+
# hypermut[:stats]
|
427
430
|
# => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
|
428
431
|
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
|
429
432
|
# # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
@@ -516,7 +519,10 @@ module ViralSeq
|
|
516
519
|
hm_seq_hash.title = self.title + "_hypermut"
|
517
520
|
hm_seq_hash.file = self.file
|
518
521
|
filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
|
519
|
-
return
|
522
|
+
return { a3g_seq: hm_seq_hash,
|
523
|
+
filtered_seq: filtered_seq_hash,
|
524
|
+
stats: hm_hash.values
|
525
|
+
}
|
520
526
|
end #end of #a3g_hypermut
|
521
527
|
|
522
528
|
alias_method :a3g, :a3g_hypermut
|
@@ -730,6 +736,7 @@ module ViralSeq
|
|
730
736
|
|
731
737
|
seq_hash_unique.each do |seq|
|
732
738
|
loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
|
739
|
+
next unless loc # if locator tool fails, skip this seq.
|
733
740
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
734
741
|
if indel
|
735
742
|
seq_hash_unique_pass << seq
|
@@ -1151,7 +1158,7 @@ module ViralSeq
|
|
1151
1158
|
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
1152
1159
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
1153
1160
|
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
|
1154
|
-
|
1161
|
+
|
1155
1162
|
def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
|
1156
1163
|
seq_hash = self.dna_hash.dup
|
1157
1164
|
seq_hash_unique = seq_hash.uniq_hash
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.post_install_message = "Thanks for installing!"
|
27
27
|
|
28
28
|
spec.add_development_dependency "bundler", "~> 2.0"
|
29
|
-
spec.add_development_dependency "rake", "~>
|
29
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2020-
|
12
|
+
date: 2020-07-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -31,14 +31,14 @@ dependencies:
|
|
31
31
|
requirements:
|
32
32
|
- - "~>"
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version: '
|
34
|
+
version: '13.0'
|
35
35
|
type: :development
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
39
|
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version: '
|
41
|
+
version: '13.0'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
43
|
name: rspec
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|