viral_seq 1.3.0 → 1.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +26 -3
- data/bin/tcs +14 -5
- data/bin/tcs_log +614 -0
- data/bin/tcs_sdrm +9 -6
- data/docs/variants_structure.pdf +0 -0
- data/lib/viral_seq/constant.rb +3 -1
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/seq_hash.rb +139 -0
- data/lib/viral_seq/tcs_core.rb +21 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +3 -2
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -208,6 +208,31 @@ module ViralSeq
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
209
209
|
end
|
210
210
|
|
211
|
+
# Return a new SeqHash object restricted to a given range of nt sequence positions.
# Every sequence keeps its name and is sliced with String#[] using +range+, so
# positions are 0-based and a range starting beyond the end of a sequence
# produces nil for that entry.
# Only the sliced nt sequences are passed to the new object.
# @param range [Range] range of positions on the nt sequence
# @return [ViralSeq::SeqHash] a sub SeqHash object
def nt_range(range)
  sliced = dna_hash.transform_values { |seq| seq[range] }
  ViralSeq::SeqHash.new(sliced)
end # end of #nt_range
|
223
|
+
|
224
|
+
# Check the size range of the DNA sequences of the SeqHash object.
# When the object holds no sequences both values are nil.
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
def check_nt_size
  shortest, longest = dna_hash.values.map(&:size).minmax
  { max: longest, min: shortest }
end
|
235
|
+
|
211
236
|
# write the nt sequences to a FASTA format file
|
212
237
|
# @param file [String] path to the FASTA output file
|
213
238
|
# @return [NilClass]
|
@@ -592,6 +617,98 @@ module ViralSeq
|
|
592
617
|
|
593
618
|
alias_method :pm, :poisson_minority_cutoff
|
594
619
|
|
620
|
+
# Calculate false detection rate (FDR) for minority mutations.
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
#
# For every occurrence count reported by #variant_for_poisson, an unadjusted
# exact p-value is computed with R's pbinom and repeated once per entry reported
# for that count. The whole vector is then adjusted with R's
# p.adjust(..., 'fdr') (Benjamini-Hochberg correction).
# R (Rscript on the PATH) is required.
#
# @param error_rate [Float] estimated sequencing error rate
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr).
#   Counts that were not observed read as 0. An empty Hash is returned when the
#   object holds no sequences.
# @example calculate FDR for mutations that appeared twice in the sample dataset
#   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
#   fdr_hash = my_seqhash.fdr
#   fdr_hash[2].round(5)
#   => 0.00726 # means that mutations appear twice have 0.007261748 chance to be caused by residual errors.
def fdr(error_rate = 0.0001)
  sequences = dna_hash.values
  return {} if sequences.empty?

  seq_count = size
  observed_hash = variant_for_poisson(sequences)

  # Unadjusted exact p-value, i.e. under the null, the probability of observing
  # k or more occurrences; repeated v times so each entry takes part in the
  # correction.
  p_unadjusted = observed_hash.flat_map do |k, v|
    p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f
    Array.new(v, p_value)
  end

  # Controls fdr, aka Benjamini-Hochberg correction. Values are read back in the
  # same order as p_unadjusted.
  p_adjusted = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s")

  # Walk the adjusted vector by running offset instead of de-duplicating it:
  # different counts can share the same adjusted value, and collapsing equal
  # values would pair later counts with the wrong FDR.
  fdr_hash = Hash.new(0)
  offset = 0
  observed_hash.each do |k, v|
    adjusted = p_adjusted[offset]
    fdr_hash[k] = adjusted.to_f if adjusted
    offset += v
  end
  fdr_hash
end # end of #fdr
|
651
|
+
|
652
|
+
# Analysis for the nt sequence variants.
# For every position of the sample consensus, counts A, C, G, T and "-" across
# all sequences and reports count, frequency with binomial confidence interval,
# whether the count reaches the Poisson minority cut-off, and the FDR of that count.
# Relies on #fdr and ViralSeq::TcsCore.detection_limit.
# @return [Hash] An Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf
def nt_variants
  sequences = self.dna_hash.values
  tcs_number = self.size
  dl = ViralSeq::TcsCore.detection_limit(tcs_number)
  fdr_hash = self.fdr
  pm_cut_off = self.pm
  con = self.consensus
  cis = {} # confidence intervals memoized by [count, tcs_number]

  positions = (0...con.size).map do |idx|
    position_obj = {
      position: idx + 1, # reported positions are 1-based
      tcs_number: tcs_number,
      lower_detection_limit: dl,
      # an infinite cut-off is stored as a String at position level
      pm_cut_off: (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
    }

    freq_hash = sequences.map { |seq| seq[idx] }.count_freq

    %i[A C G T -].each do |nt|
      count = freq_hash[nt.to_s]
      position_obj[nt] =
        if count > 0
          ci = (cis[[count, tcs_number]] ||= ViralSeq::Math::BinomCI.new(count, tcs_number))
          {
            count: count,
            freq: ci.mean.round(4),
            freq_ci_low: ci.lower.round(4),
            freq_ci_high: ci.upper.round(4),
            greater_than_pm: count >= pm_cut_off,
            fdr: fdr_hash[count]
          }
        else
          {
            count: count,
            freq: 0,
            freq_ci_low: 0,
            freq_ci_high: 0,
            greater_than_pm: false,
            fdr: nil
          }
        end
    end

    position_obj
  end

  {
    tcs_number: tcs_number,
    lower_detection_limit: dl,
    pm_cut_off: pm_cut_off,
    positions: positions
  }
end # end of nt_variants
|
711
|
+
|
595
712
|
|
596
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
714
|
# @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
|
@@ -1183,6 +1300,28 @@ module ViralSeq
|
|
1183
1300
|
return new_sh
|
1184
1301
|
end
|
1185
1302
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels.
# Unique sequences that differ from the consensus at fewer than 4 positions pass
# without alignment. The others are aligned to the consensus with
# ViralSeq::Muscle.align and flagged when the first element of the alignment
# result contains a gap ("-").
# NOTE(review): only the first element of the alignment result is inspected;
# presumably that is the aligned consensus — confirm against ViralSeq::Muscle.align.
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
def qc_indel
  con = self.consensus
  names_passed = []
  names_indel = []

  self.dna_hash.uniq_hash.each do |seq, names|
    # the cheap difference count runs first so near-identical sequences skip the alignment
    flagged = seq.compare_with(con) >= 4 && ViralSeq::Muscle.align(con, seq)[0].include?("-")
    (flagged ? names_indel : names_passed).concat(names)
  end

  { no_indel: self.sub(names_passed), has_indel: self.sub(names_indel) }
end # end of qc_indel
|
1323
|
+
|
1324
|
+
|
1186
1325
|
# trim dna sequences based on the provided reference coordinates.
|
1187
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1188
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -6,6 +6,10 @@ module ViralSeq
|
|
6
6
|
class << self
|
7
7
|
|
8
8
|
# methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
|
9
|
+
# @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
|
10
|
+
# @param m [Integer] PID abundance
|
11
|
+
# @param error_rate [Float] estimated platform error rate.
|
12
|
+
# @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
|
9
13
|
|
10
14
|
def calculate_cut_off(m, error_rate = 0.02)
|
11
15
|
n = 0
|
@@ -280,6 +284,23 @@ module ViralSeq
|
|
280
284
|
abort infor.red.bold
|
281
285
|
end
|
282
286
|
|
287
|
+
# Lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
# Values present in ViralSeq::DETECT_SEN are returned directly; otherwise R is
# invoked (R required, with the dplyr package) to compute the upper bound of the
# binom.test confidence interval for 0 successes, rounded to 4 digits.
# The R expression yields 0 when tcs_number is not greater than 2.
# stderr of the R call is discarded, so a failed call produces empty output,
# which #to_f turns into 0.0.
# @param tcs_number [Integer] number of TCS
# @return [Float] lower detection limit
# @example calculate lower detection limit
#   ViralSeq::TcsCore.detection_limit(100)
#   => 0.0362
def detection_limit(tcs_number)
  known = ViralSeq::DETECT_SEN[tcs_number]
  return known if known

  `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`.to_f
end
|
303
|
+
|
283
304
|
private
|
284
305
|
|
285
306
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-02-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -172,6 +172,7 @@ files:
|
|
172
172
|
- docs/dr.json
|
173
173
|
- docs/sample_miseq_data/hivdr_control/r1.fastq.gz
|
174
174
|
- docs/sample_miseq_data/hivdr_control/r2.fastq.gz
|
175
|
+
- docs/variants_structure.pdf
|
175
176
|
- lib/viral_seq.rb
|
176
177
|
- lib/viral_seq/constant.rb
|
177
178
|
- lib/viral_seq/enumerable.rb
|