viral_seq 1.2.9 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +24 -2
- data/bin/tcs +18 -5
- data/bin/tcs_log +614 -0
- data/bin/tcs_sdrm +9 -6
- data/docs/variants_structure.pdf +0 -0
- data/lib/viral_seq/constant.rb +3 -1
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/seq_hash.rb +139 -0
- data/lib/viral_seq/tcs_core.rb +21 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +3 -2
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -208,6 +208,31 @@ module ViralSeq
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
209
209
|
end
|
210
210
|
|
211
|
+
# return a new SeqHash object containing only the given range of nt sequence positions
|
212
|
+
# @param range [Range] range of positions on the nt sequence
|
213
|
+
# @return [ViralSeq::SeqHash] a sub SeqHash object
|
214
|
+
|
215
|
+
def nt_range(range)
|
216
|
+
dna_hash = self.dna_hash
|
217
|
+
new_hash = {}
|
218
|
+
dna_hash.each do |k,v|
|
219
|
+
new_hash[k] = v[range]
|
220
|
+
end
|
221
|
+
ViralSeq::SeqHash.new(new_hash)
|
222
|
+
end # end of #nt_range
|
223
|
+
|
224
|
+
# check the size range of the DNA sequences of the SeqHash object
|
225
|
+
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
|
226
|
+
|
227
|
+
def check_nt_size
|
228
|
+
dna_hash = self.dna_hash
|
229
|
+
size_array = []
|
230
|
+
dna_hash.values.each do |v|
|
231
|
+
size_array << v.size
|
232
|
+
end
|
233
|
+
return { max: size_array.max, min: size_array.min }
|
234
|
+
end
|
235
|
+
|
211
236
|
# write the nt sequences to a FASTA format file
|
212
237
|
# @param file [String] path to the FASTA output file
|
213
238
|
# @return [NilClass]
|
@@ -592,6 +617,98 @@ module ViralSeq
|
|
592
617
|
|
593
618
|
alias_method :pm, :poisson_minority_cutoff
|
594
619
|
|
620
|
+
# calculate false detection rate for minority mutations
|
621
|
+
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
|
622
|
+
# @param error_rate [Float] estimated sequencing error rate
|
623
|
+
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
|
624
|
+
# @example calculate FDR for mutations that appeared twice in the sample dataset
|
625
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
626
|
+
# fdr_hash = my_seqhash.fdr
|
627
|
+
# fdr_hash[2].round(5)
|
628
|
+
# => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.
|
629
|
+
|
630
|
+
def fdr(error_rate = 0.0001)
|
631
|
+
sequences = self.dna_hash.values
|
632
|
+
if sequences.size == 0
|
633
|
+
return {}
|
634
|
+
else
|
635
|
+
seq_count = self.size
|
636
|
+
observed_hash = variant_for_poisson(sequences)
|
637
|
+
p_unadjusted = []
|
638
|
+
observed_hash.each do |k, v|
|
639
|
+
p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
|
640
|
+
p_unadjusted += Array.new(v, p_value)
|
641
|
+
end
|
642
|
+
p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
|
643
|
+
vars_pair = observed_hash.to_a
|
644
|
+
fdr_hash = Hash.new(0)
|
645
|
+
(0..(p_fdr.size - 1)).each do |i|
|
646
|
+
fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
|
647
|
+
end
|
648
|
+
return fdr_hash
|
649
|
+
end
|
650
|
+
end #end of #fdr
|
651
|
+
|
652
|
+
# analysis for the nt sequence variants.
|
653
|
+
# @return [Hash] A Hash with information from the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
|
654
|
+
|
655
|
+
def nt_variants
|
656
|
+
return_obj = {}
|
657
|
+
nt_hash = self.dna_hash
|
658
|
+
tcs_number = self.size
|
659
|
+
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
660
|
+
fdr_hash = self.fdr
|
661
|
+
pm_cut_off = self.pm
|
662
|
+
con = self.consensus
|
663
|
+
return_obj[:tcs_number] = tcs_number
|
664
|
+
return_obj[:lower_detection_limit] = dl
|
665
|
+
return_obj[:pm_cut_off] = pm_cut_off
|
666
|
+
return_obj[:positions] = []
|
667
|
+
cis = {}
|
668
|
+
|
669
|
+
(0..(con.size - 1)).each do |p|
|
670
|
+
position_obj = {}
|
671
|
+
position_obj[:position] = p + 1
|
672
|
+
position_obj[:tcs_number] = tcs_number
|
673
|
+
position_obj[:lower_detection_limit] = dl
|
674
|
+
position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
|
675
|
+
|
676
|
+
nts = []
|
677
|
+
dna_hash.each do |n,s|
|
678
|
+
nts << s[p]
|
679
|
+
end
|
680
|
+
freq_hash = nts.count_freq
|
681
|
+
[:A, :C, :G, :T, :-].each do |k|
|
682
|
+
v = freq_hash[k.to_s]
|
683
|
+
position_obj[k] = {}
|
684
|
+
position_obj[k][:count] = v
|
685
|
+
if v > 0
|
686
|
+
if cis[[v, tcs_number]]
|
687
|
+
ci = cis[[v, tcs_number]]
|
688
|
+
else
|
689
|
+
ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
|
690
|
+
cis[[v, tcs_number]] = ci
|
691
|
+
end
|
692
|
+
position_obj[k][:freq] = ci.mean.round(4)
|
693
|
+
position_obj[k][:freq_ci_low] = ci.lower.round(4)
|
694
|
+
position_obj[k][:freq_ci_high] = ci.upper.round(4)
|
695
|
+
position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
|
696
|
+
position_obj[k][:fdr] = fdr_hash[v]
|
697
|
+
else
|
698
|
+
position_obj[k][:freq] = 0
|
699
|
+
position_obj[k][:freq_ci_low] = 0
|
700
|
+
position_obj[k][:freq_ci_high] = 0
|
701
|
+
position_obj[k][:greater_than_pm] = false
|
702
|
+
position_obj[k][:fdr] = nil
|
703
|
+
end
|
704
|
+
end
|
705
|
+
|
706
|
+
return_obj[:positions] << position_obj
|
707
|
+
end
|
708
|
+
|
709
|
+
return_obj
|
710
|
+
end # end of nt_variants
|
711
|
+
|
595
712
|
|
596
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
714
|
# @param path_to_muscle [String] path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
|
@@ -1183,6 +1300,28 @@ module ViralSeq
|
|
1183
1300
|
return new_sh
|
1184
1301
|
end
|
1185
1302
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels
|
1304
|
+
# @return [Hash] object containing two SeqHash objects {no_indel: seq_hash, has_indel: seq_hash}
|
1305
|
+
|
1306
|
+
def qc_indel
|
1307
|
+
con = self.consensus
|
1308
|
+
dna_hash = self.dna_hash
|
1309
|
+
names_passed = []
|
1310
|
+
names_indel = []
|
1311
|
+
dna_hash.uniq_hash.each do |seq, names|
|
1312
|
+
if seq.compare_with(con) < 4
|
1313
|
+
names_passed += names
|
1314
|
+
elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
|
1315
|
+
names_indel += names
|
1316
|
+
else
|
1317
|
+
names_passed += names
|
1318
|
+
end
|
1319
|
+
end
|
1320
|
+
return {no_indel: self.sub(names_passed),
|
1321
|
+
has_indel: self.sub(names_indel)}
|
1322
|
+
end # end of qc_indel
|
1323
|
+
|
1324
|
+
|
1186
1325
|
# trim dna sequences based on the provided reference coordinates.
|
1187
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1188
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -6,6 +6,10 @@ module ViralSeq
|
|
6
6
|
class << self
|
7
7
|
|
8
8
|
# methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
|
9
|
+
# @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
|
10
|
+
# @param m [Integer] PID abundance
|
11
|
+
# @param error_rate [Float] estimated platform error rate.
|
12
|
+
# @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
|
9
13
|
|
10
14
|
def calculate_cut_off(m, error_rate = 0.02)
|
11
15
|
n = 0
|
@@ -280,6 +284,23 @@ module ViralSeq
|
|
280
284
|
abort infor.red.bold
|
281
285
|
end
|
282
286
|
|
287
|
+
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
|
288
|
+
# R required.
|
289
|
+
# @param tcs_number [Integer] number of TCS
|
290
|
+
# @return [Float] lower detection limit
|
291
|
+
# @example calculate lower detection limit
|
292
|
+
# ViralSeq::TcsCore.detection_limit(100)
|
293
|
+
# => 0.0362
|
294
|
+
|
295
|
+
def detection_limit(tcs_number)
|
296
|
+
if ViralSeq::DETECT_SEN[tcs_number]
|
297
|
+
return ViralSeq::DETECT_SEN[tcs_number]
|
298
|
+
else
|
299
|
+
dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
|
300
|
+
dl.to_f
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
283
304
|
private
|
284
305
|
|
285
306
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.9
|
4
|
+
version: 1.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-01-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -172,6 +172,7 @@ files:
|
|
172
172
|
- docs/dr.json
|
173
173
|
- docs/sample_miseq_data/hivdr_control/r1.fastq.gz
|
174
174
|
- docs/sample_miseq_data/hivdr_control/r2.fastq.gz
|
175
|
+
- docs/variants_structure.pdf
|
175
176
|
- lib/viral_seq.rb
|
176
177
|
- lib/viral_seq/constant.rb
|
177
178
|
- lib/viral_seq/enumerable.rb
|