viral_seq 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +7 -2
- data/bin/tcs_sdrm +9 -6
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/seq_hash.rb +31 -0
- data/lib/viral_seq/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
|
4
|
+
data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
|
7
|
+
data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
5
|
-
colorize (
|
6
|
-
combine_pdf (>= 1.0.0)
|
7
|
-
muscle_bio (
|
8
|
-
prawn (>= 2.3.0)
|
9
|
-
prawn-table (>= 0.2.0)
|
4
|
+
viral_seq (1.3.0)
|
5
|
+
colorize (~> 0.1)
|
6
|
+
combine_pdf (~> 1.0, >= 1.0.0)
|
7
|
+
muscle_bio (~> 0.4)
|
8
|
+
prawn (~> 2.3, >= 2.3.0)
|
9
|
+
prawn-table (~> 0.2, >= 0.2.0)
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
data/README.md
CHANGED
@@ -179,14 +179,19 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
179
|
|
180
180
|
## Updates
|
181
181
|
|
182
|
+
### Version 1.4.0-10132021
|
183
|
+
|
184
|
+
1. Added a function to calculate the false detection rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences: `ViralSeq::SeqHash#fdr`.
|
185
|
+
2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
|
186
|
+
|
182
187
|
### Version 1.3.0-08302021
|
183
188
|
|
184
|
-
1. Fixed a bug in the `tcs` pipeline.
|
189
|
+
1. Fixed a bug in the `tcs` pipeline.
|
185
190
|
|
186
191
|
### Version 1.2.9-08022021
|
187
192
|
|
188
193
|
1. Fixed a bug when reading input primer sequences given in lowercase.
|
189
|
-
2.
|
194
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
190
195
|
|
191
196
|
### Version 1.2.8-07292021
|
192
197
|
|
data/bin/tcs_sdrm
CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
|
|
91
91
|
point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
|
92
92
|
point_mutation_out = File.open(point_mutation_file, "w")
|
93
93
|
point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
|
94
|
-
"number,
|
94
|
+
"number,frequency,95% CI low,95% CI high,fdr,notes"
|
95
95
|
|
96
96
|
linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
|
97
97
|
linkage_out = File.open(linkage_file, "w")
|
98
98
|
linkage_out.puts "region,TCS,mutation linkage,number," +
|
99
|
-
"
|
99
|
+
"frequency,95% CI low, 95% CI high, notes"
|
100
100
|
|
101
101
|
aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
|
102
102
|
aa_report_out = File.open(aa_report_file, "w")
|
@@ -132,6 +132,7 @@ libs.each do |lib|
|
|
132
132
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
133
133
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
134
134
|
poisson_minority_cutoff = filtered_seqs.pm
|
135
|
+
fdr_hash = filtered_seqs.fdr
|
135
136
|
summary_hash[:PR] = [
|
136
137
|
seqs.size.to_s,
|
137
138
|
a3g_seqs.size.to_s,
|
@@ -142,7 +143,7 @@ libs.each do |lib|
|
|
142
143
|
next if filtered_seqs.size < 3
|
143
144
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
144
145
|
|
145
|
-
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
|
146
|
+
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
|
146
147
|
point_mutation_list += sdrm[0]
|
147
148
|
linkage_list += sdrm[1]
|
148
149
|
aa_report_list += sdrm[2]
|
@@ -155,6 +156,7 @@ libs.each do |lib|
|
|
155
156
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
156
157
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
157
158
|
poisson_minority_cutoff = filtered_seqs.pm
|
159
|
+
fdr_hash = filtered_seqs.fdr
|
158
160
|
summary_hash[:IN] = [
|
159
161
|
seqs.size.to_s,
|
160
162
|
a3g_seqs.size.to_s,
|
@@ -165,7 +167,7 @@ libs.each do |lib|
|
|
165
167
|
next if filtered_seqs.size < 3
|
166
168
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
167
169
|
|
168
|
-
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
|
170
|
+
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
|
169
171
|
point_mutation_list += sdrm[0]
|
170
172
|
linkage_list += sdrm[1]
|
171
173
|
aa_report_list += sdrm[2]
|
@@ -190,6 +192,7 @@ libs.each do |lib|
|
|
190
192
|
reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
|
191
193
|
filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
|
192
194
|
poisson_minority_cutoff = filtered_seqs.pm
|
195
|
+
fdr_hash = filtered_seqs.fdr
|
193
196
|
summary_hash[:RT] = [
|
194
197
|
seqs.size.to_s,
|
195
198
|
hypermut_seq_keys.size.to_s,
|
@@ -200,7 +203,7 @@ libs.each do |lib|
|
|
200
203
|
next if filtered_seqs.size < 3
|
201
204
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
202
205
|
|
203
|
-
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
|
206
|
+
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
|
204
207
|
point_mutation_list += sdrm[0]
|
205
208
|
linkage_list += sdrm[1]
|
206
209
|
aa_report_list += sdrm[2]
|
@@ -346,7 +349,7 @@ libs.each do |lib|
|
|
346
349
|
title: "Surveillance Drug Resistance Mutations",
|
347
350
|
file: point_mutation_file,
|
348
351
|
newPDF: "",
|
349
|
-
table_width: [
|
352
|
+
table_width: [60,50,70,65,65,60,75,70,70,70,45],
|
350
353
|
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
351
354
|
},
|
352
355
|
{
|
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -9,10 +9,13 @@ module ViralSeq
|
|
9
9
|
# IN codon 53-174 (HXB2 4384-4751)
|
10
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
11
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
12
|
+
# @param fdr_hash [Hash] hash of events => (false detection rate)
|
13
|
+
# can be obtained using ViralSeq::SeqHash#fdr
|
14
|
+
#
|
12
15
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
13
16
|
#
|
14
17
|
# # point_mutation_list: two-dimensional array for the following information,
|
15
|
-
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
|
18
|
+
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
|
16
19
|
# # linkage_list: two-dimensional array for the following information,
|
17
20
|
# # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
|
18
21
|
# # report_list: two-dimensional array for the following information,
|
@@ -20,12 +23,13 @@ module ViralSeq
|
|
20
23
|
# @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
|
21
24
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
|
22
25
|
# p_cut_off = my_seqhash.pm
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# =>
|
27
|
-
# => PR,396,
|
28
|
-
# => PR,396,
|
26
|
+
# fdr_hash = my_seqhash.fdr
|
27
|
+
# pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
|
28
|
+
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
|
29
|
+
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
|
30
|
+
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
|
31
|
+
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
|
32
|
+
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
|
29
33
|
#
|
30
34
|
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
|
31
35
|
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
@@ -136,7 +140,7 @@ module ViralSeq
|
|
136
140
|
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
137
141
|
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
142
|
|
139
|
-
def sdrm_hiv_pr(cutoff = 0)
|
143
|
+
def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
|
140
144
|
sequences = self.dna_hash
|
141
145
|
region = "PR"
|
142
146
|
rf_label = 0
|
@@ -167,8 +171,9 @@ module ViralSeq
|
|
167
171
|
count_mut_list = mut_list.count_freq
|
168
172
|
count_mut_list.each do |m,number|
|
169
173
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
174
|
+
fdr = fdr_hash[number].round(5)
|
170
175
|
label = number < cutoff ? "*" : ""
|
171
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
176
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
172
177
|
end
|
173
178
|
end
|
174
179
|
point_mutation_list.sort_by! {|record| record[2]}
|
@@ -229,7 +234,7 @@ module ViralSeq
|
|
229
234
|
# @param (see #sdrm_hiv_pr)
|
230
235
|
# @return (see #sdrm_hiv_pr)
|
231
236
|
|
232
|
-
def sdrm_hiv_rt(cutoff = 0)
|
237
|
+
def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
|
233
238
|
sequences = self.dna_hash
|
234
239
|
region = "RT"
|
235
240
|
rf_label = 1
|
@@ -280,8 +285,9 @@ module ViralSeq
|
|
280
285
|
count_mut_list = mut_list.count_freq
|
281
286
|
count_mut_list.each do |m,number|
|
282
287
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
288
|
+
fdr = fdr_hash[number].round(5)
|
283
289
|
label = number < cutoff ? "*" : ""
|
284
|
-
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
290
|
+
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
285
291
|
end
|
286
292
|
end
|
287
293
|
|
@@ -291,8 +297,9 @@ module ViralSeq
|
|
291
297
|
count_mut_list = mut_list.count_freq
|
292
298
|
count_mut_list.each do |m,number|
|
293
299
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
300
|
+
fdr = fdr_hash[number].round(5)
|
294
301
|
label = number < cutoff ? "*" : ""
|
295
|
-
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
302
|
+
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
296
303
|
end
|
297
304
|
end
|
298
305
|
|
@@ -365,7 +372,7 @@ module ViralSeq
|
|
365
372
|
# @param (see #sdrm_hiv_pr)
|
366
373
|
# @return (see #sdrm_hiv_pr)
|
367
374
|
|
368
|
-
def sdrm_hiv_in(cutoff = 0)
|
375
|
+
def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
|
369
376
|
sequences = self.dna_hash
|
370
377
|
region = "IN"
|
371
378
|
rf_label = 2
|
@@ -397,8 +404,9 @@ module ViralSeq
|
|
397
404
|
count_mut_list = mut_list.count_freq
|
398
405
|
count_mut_list.each do |m,number|
|
399
406
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
407
|
+
fdr = fdr_hash[number].round(5)
|
400
408
|
label = number < cutoff ? "*" : ""
|
401
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
409
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
402
410
|
end
|
403
411
|
end
|
404
412
|
point_mutation_list.sort_by! {|record| record[2]}
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -592,6 +592,37 @@ module ViralSeq
|
|
592
592
|
|
593
593
|
alias_method :pm, :poisson_minority_cutoff
|
594
594
|
|
595
|
+
# calculate false detection rate for minority mutations
|
596
|
+
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
|
597
|
+
# @param error_rate [Float] estimated sequencing error rate
|
598
|
+
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
|
599
|
+
# @example calculate FDR for mutations that appeared twice in the sample dataset
|
600
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
601
|
+
# fdr_hash = my_seqhash.fdr
|
602
|
+
# fdr_hash[2].round(5)
|
603
|
+
# => 0.00726 # means that mutations that appear twice have a 0.007261748 chance of being caused by residual errors.
|
604
|
+
|
605
|
+
def fdr(error_rate = 0.0001)
|
606
|
+
sequences = self.dna_hash.values
|
607
|
+
if sequences.size == 0
|
608
|
+
return {}
|
609
|
+
else
|
610
|
+
seq_count = self.size
|
611
|
+
observed_hash = variant_for_poisson(sequences)
|
612
|
+
p_unadjusted = []
|
613
|
+
observed_hash.each do |k, v|
|
614
|
+
p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
|
615
|
+
p_unadjusted += Array.new(v, p_value)
|
616
|
+
end
|
617
|
+
p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
|
618
|
+
vars_pair = observed_hash.to_a
|
619
|
+
fdr_hash = Hash.new(0)
|
620
|
+
(0..(p_fdr.size - 1)).each do |i|
|
621
|
+
fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
|
622
|
+
end
|
623
|
+
return fdr_hash
|
624
|
+
end
|
625
|
+
end #end of #fdr
|
595
626
|
|
596
627
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
628
|
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-10-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|