viral_seq 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +7 -2
- data/bin/tcs_sdrm +9 -6
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/seq_hash.rb +31 -0
- data/lib/viral_seq/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
|
4
|
+
data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
|
7
|
+
data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
5
|
-
colorize (
|
6
|
-
combine_pdf (>= 1.0.0)
|
7
|
-
muscle_bio (
|
8
|
-
prawn (>= 2.3.0)
|
9
|
-
prawn-table (>= 0.2.0)
|
4
|
+
viral_seq (1.3.0)
|
5
|
+
colorize (~> 0.1)
|
6
|
+
combine_pdf (~> 1.0, >= 1.0.0)
|
7
|
+
muscle_bio (~> 0.4)
|
8
|
+
prawn (~> 2.3, >= 2.3.0)
|
9
|
+
prawn-table (~> 0.2, >= 0.2.0)
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
data/README.md
CHANGED
@@ -179,14 +179,19 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
179
|
|
180
180
|
## Updates
|
181
181
|
|
182
|
+
### Version 1.4.0-10132021
|
183
|
+
|
184
|
+
1. Added a function to calculate the false detection rate (FDR, aka Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
185
|
+
2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
|
186
|
+
|
182
187
|
### Version 1.3.0-08302021
|
183
188
|
|
184
|
-
1. Fixed a bug in the `tcs` pipeline.
|
189
|
+
1. Fixed a bug in the `tcs` pipeline.
|
185
190
|
|
186
191
|
### Version 1.2.9-08022021
|
187
192
|
|
188
193
|
1. Fixed a bug when reading the input primer sequences in lowercases.
|
189
|
-
2.
|
194
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
190
195
|
|
191
196
|
### Version 1.2.8-07292021
|
192
197
|
|
data/bin/tcs_sdrm
CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
|
|
91
91
|
point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
|
92
92
|
point_mutation_out = File.open(point_mutation_file, "w")
|
93
93
|
point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
|
94
|
-
"number,
|
94
|
+
"number,frequency,95% CI low,95% CI high,fdr,notes"
|
95
95
|
|
96
96
|
linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
|
97
97
|
linkage_out = File.open(linkage_file, "w")
|
98
98
|
linkage_out.puts "region,TCS,mutation linkage,number," +
|
99
|
-
"
|
99
|
+
"frequency,95% CI low, 95% CI high, notes"
|
100
100
|
|
101
101
|
aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
|
102
102
|
aa_report_out = File.open(aa_report_file, "w")
|
@@ -132,6 +132,7 @@ libs.each do |lib|
|
|
132
132
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
133
133
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
134
134
|
poisson_minority_cutoff = filtered_seqs.pm
|
135
|
+
fdr_hash = filtered_seqs.fdr
|
135
136
|
summary_hash[:PR] = [
|
136
137
|
seqs.size.to_s,
|
137
138
|
a3g_seqs.size.to_s,
|
@@ -142,7 +143,7 @@ libs.each do |lib|
|
|
142
143
|
next if filtered_seqs.size < 3
|
143
144
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
144
145
|
|
145
|
-
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
|
146
|
+
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
|
146
147
|
point_mutation_list += sdrm[0]
|
147
148
|
linkage_list += sdrm[1]
|
148
149
|
aa_report_list += sdrm[2]
|
@@ -155,6 +156,7 @@ libs.each do |lib|
|
|
155
156
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
156
157
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
157
158
|
poisson_minority_cutoff = filtered_seqs.pm
|
159
|
+
fdr_hash = filtered_seqs.fdr
|
158
160
|
summary_hash[:IN] = [
|
159
161
|
seqs.size.to_s,
|
160
162
|
a3g_seqs.size.to_s,
|
@@ -165,7 +167,7 @@ libs.each do |lib|
|
|
165
167
|
next if filtered_seqs.size < 3
|
166
168
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
167
169
|
|
168
|
-
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
|
170
|
+
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
|
169
171
|
point_mutation_list += sdrm[0]
|
170
172
|
linkage_list += sdrm[1]
|
171
173
|
aa_report_list += sdrm[2]
|
@@ -190,6 +192,7 @@ libs.each do |lib|
|
|
190
192
|
reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
|
191
193
|
filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
|
192
194
|
poisson_minority_cutoff = filtered_seqs.pm
|
195
|
+
fdr_hash = filtered_seqs.fdr
|
193
196
|
summary_hash[:RT] = [
|
194
197
|
seqs.size.to_s,
|
195
198
|
hypermut_seq_keys.size.to_s,
|
@@ -200,7 +203,7 @@ libs.each do |lib|
|
|
200
203
|
next if filtered_seqs.size < 3
|
201
204
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
202
205
|
|
203
|
-
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
|
206
|
+
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
|
204
207
|
point_mutation_list += sdrm[0]
|
205
208
|
linkage_list += sdrm[1]
|
206
209
|
aa_report_list += sdrm[2]
|
@@ -346,7 +349,7 @@ libs.each do |lib|
|
|
346
349
|
title: "Surveillance Drug Resistance Mutations",
|
347
350
|
file: point_mutation_file,
|
348
351
|
newPDF: "",
|
349
|
-
table_width: [
|
352
|
+
table_width: [60,50,70,65,65,60,75,70,70,70,45],
|
350
353
|
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
351
354
|
},
|
352
355
|
{
|
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -9,10 +9,13 @@ module ViralSeq
|
|
9
9
|
# IN codon 53-174 (HXB2 4384-4751)
|
10
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
11
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
12
|
+
# @param fdr_hash [Hash] hash of events => (false detection rate)
|
13
|
+
# can be obtained using ViralSeq::SeqHash#fdr
|
14
|
+
#
|
12
15
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
13
16
|
#
|
14
17
|
# # point_mutation_list: two dimensional array for the following information,
|
15
|
-
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
|
18
|
+
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
|
16
19
|
# # linkage_list: two dimensional array for the following information,
|
17
20
|
# # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
|
18
21
|
# # report_list: two dimensional array for the following information,
|
@@ -20,12 +23,13 @@ module ViralSeq
|
|
20
23
|
# @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
|
21
24
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
|
22
25
|
# p_cut_off = my_seqhash.pm
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# =>
|
27
|
-
# => PR,396,
|
28
|
-
# => PR,396,
|
26
|
+
# fdr_hash = my_seqhash.fdr
|
27
|
+
# pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
|
28
|
+
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
|
29
|
+
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
|
30
|
+
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
|
31
|
+
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
|
32
|
+
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
|
29
33
|
#
|
30
34
|
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
|
31
35
|
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
@@ -136,7 +140,7 @@ module ViralSeq
|
|
136
140
|
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
137
141
|
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
142
|
|
139
|
-
def sdrm_hiv_pr(cutoff = 0)
|
143
|
+
def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
|
140
144
|
sequences = self.dna_hash
|
141
145
|
region = "PR"
|
142
146
|
rf_label = 0
|
@@ -167,8 +171,9 @@ module ViralSeq
|
|
167
171
|
count_mut_list = mut_list.count_freq
|
168
172
|
count_mut_list.each do |m,number|
|
169
173
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
174
|
+
fdr = fdr_hash[number].round(5)
|
170
175
|
label = number < cutoff ? "*" : ""
|
171
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
176
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
172
177
|
end
|
173
178
|
end
|
174
179
|
point_mutation_list.sort_by! {|record| record[2]}
|
@@ -229,7 +234,7 @@ module ViralSeq
|
|
229
234
|
# @param (see #sdrm_hiv_pr)
|
230
235
|
# @return (see #sdrm_hiv_pr)
|
231
236
|
|
232
|
-
def sdrm_hiv_rt(cutoff = 0)
|
237
|
+
def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
|
233
238
|
sequences = self.dna_hash
|
234
239
|
region = "RT"
|
235
240
|
rf_label = 1
|
@@ -280,8 +285,9 @@ module ViralSeq
|
|
280
285
|
count_mut_list = mut_list.count_freq
|
281
286
|
count_mut_list.each do |m,number|
|
282
287
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
288
|
+
fdr = fdr_hash[number].round(5)
|
283
289
|
label = number < cutoff ? "*" : ""
|
284
|
-
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
290
|
+
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
285
291
|
end
|
286
292
|
end
|
287
293
|
|
@@ -291,8 +297,9 @@ module ViralSeq
|
|
291
297
|
count_mut_list = mut_list.count_freq
|
292
298
|
count_mut_list.each do |m,number|
|
293
299
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
300
|
+
fdr = fdr_hash[number].round(5)
|
294
301
|
label = number < cutoff ? "*" : ""
|
295
|
-
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
302
|
+
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
296
303
|
end
|
297
304
|
end
|
298
305
|
|
@@ -365,7 +372,7 @@ module ViralSeq
|
|
365
372
|
# @param (see #sdrm_hiv_pr)
|
366
373
|
# @return (see #sdrm_hiv_pr)
|
367
374
|
|
368
|
-
def sdrm_hiv_in(cutoff = 0)
|
375
|
+
def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
|
369
376
|
sequences = self.dna_hash
|
370
377
|
region = "IN"
|
371
378
|
rf_label = 2
|
@@ -397,8 +404,9 @@ module ViralSeq
|
|
397
404
|
count_mut_list = mut_list.count_freq
|
398
405
|
count_mut_list.each do |m,number|
|
399
406
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
407
|
+
fdr = fdr_hash[number].round(5)
|
400
408
|
label = number < cutoff ? "*" : ""
|
401
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
409
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
402
410
|
end
|
403
411
|
end
|
404
412
|
point_mutation_list.sort_by! {|record| record[2]}
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -592,6 +592,37 @@ module ViralSeq
|
|
592
592
|
|
593
593
|
alias_method :pm, :poisson_minority_cutoff
|
594
594
|
|
595
|
+
# calculate false detection rate for minority mutations
|
596
|
+
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
|
597
|
+
# @param error_rate [Float] estimated sequencing error rate
|
598
|
+
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
|
599
|
+
# @example calculate FDR for mutations that appeared twice in the sample dataset
|
600
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
601
|
+
# fdr_hash = my_seqhash.fdr
|
602
|
+
# fdr_hash[2].round(5)
|
603
|
+
# => 0.00726 # means that mutations that appear twice have a 0.007261748 chance of being caused by residual errors.
|
604
|
+
|
605
|
+
def fdr(error_rate = 0.0001)
|
606
|
+
sequences = self.dna_hash.values
|
607
|
+
if sequences.size == 0
|
608
|
+
return {}
|
609
|
+
else
|
610
|
+
seq_count = self.size
|
611
|
+
observed_hash = variant_for_poisson(sequences)
|
612
|
+
p_unadjusted = []
|
613
|
+
observed_hash.each do |k, v|
|
614
|
+
p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
|
615
|
+
p_unadjusted += Array.new(v, p_value)
|
616
|
+
end
|
617
|
+
p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
|
618
|
+
vars_pair = observed_hash.to_a
|
619
|
+
fdr_hash = Hash.new(0)
|
620
|
+
(0..(p_fdr.size - 1)).each do |i|
|
621
|
+
fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
|
622
|
+
end
|
623
|
+
return fdr_hash
|
624
|
+
end
|
625
|
+
end #end of #fdr
|
595
626
|
|
596
627
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
628
|
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-10-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|