viral_seq 1.2.9 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
211
# Return a new SeqHash restricted to a range of positions on the nt sequences.
#
# The range is handed unchanged to #[] on every value of the DNA hash, and
# only the resulting hash is passed to the new SeqHash object.
# @param range [Range] range of positions on the nt sequence
# @return [ViralSeq::SeqHash] a sub SeqHash object

def nt_range(range)
  # A slice that starts beyond the end of a sequence evaluates to nil; keep an
  # empty sequence in that case so the new hash never carries nil values.
  sliced = dna_hash.transform_values { |seq| seq[range] || "" }
  ViralSeq::SeqHash.new(sliced)
end # end of #nt_range
223
+
224
# Check the size range of the DNA sequences of the SeqHash object.
# Both values are nil when the object holds no sequences.
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}

def check_nt_size
  shortest, longest = dna_hash.values.map(&:size).minmax
  { max: longest, min: shortest }
end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -592,6 +617,98 @@ module ViralSeq
592
617
 
593
618
  alias_method :pm, :poisson_minority_cutoff
594
619
 
620
# calculate false detection rate for minority mutations
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
# Requires the `Rscript` executable (used for `pbinom` and `p.adjust`).
# @param error_rate [Float] estimated sequencing error rate
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
#   Empty when there are no sequences; otherwise a Hash whose default value is 0.
# @example calculate FDR for mutations that appeared twice in the sample dataset
#   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
#   fdr_hash = my_seqhash.fdr
#   fdr_hash[2].round(5)
#   => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.

def fdr(error_rate = 0.0001)
  sequences = dna_hash.values
  return {} if sequences.empty?

  seq_count = size
  observed_hash = variant_for_poisson(sequences)
  fdr_hash = Hash.new(0)
  return fdr_hash if observed_hash.empty?

  # Unadjusted exact p-value for every key k of observed_hash: under the null,
  # the probability of observing k or more, i.e. 1 - pbinom(k - 1, n, rate).
  # All keys go through one vectorised R call instead of one process per key.
  quantiles = observed_hash.keys.map { |k| "#{k}-1" }.join(',')
  cdf = `Rscript -e "cat(pbinom(c(#{quantiles}), #{seq_count}, #{error_rate}))"`.split

  # Each p-value is repeated once per occurrence (the hash value) so the
  # correction below sees the full set of tests.
  p_unadjusted = observed_hash.each_with_index.flat_map do |(_count, times), i|
    Array.new(times, 1 - cdf[i].to_f)
  end

  # controls fdr. aka Benjamini-Hochberg correction
  p_adjusted = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split.map(&:to_f)

  # p.adjust returns values in input order, so the first adjusted value of each
  # run belongs to that run's key. Walking by offset keeps the mapping right
  # even when different keys end up with identical adjusted p-values.
  offset = 0
  observed_hash.each do |count, times|
    fdr_hash[count] = p_adjusted[offset] if offset < p_adjusted.size
    offset += times
  end
  fdr_hash
end #end of #fdr
651
+
652
# analysis for the nt sequence variants.
#
# For every position of the consensus, counts A, C, G, T and "-" across all
# sequences and reports count, frequency with confidence interval, whether the
# count reaches the Poisson minority cut-off, and the false detection rate
# looked up by count.
# @return [Hash] A Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf

def nt_variants
  tcs_number = size
  dl = ViralSeq::TcsCore.detection_limit(tcs_number)
  fdr_hash = fdr
  pm_cut_off = pm
  con = consensus
  sequences = dna_hash.values

  # Per-position entries carry an infinite cut-off as its string form; the
  # top-level entry keeps the raw value.
  pm_label = pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off

  # Confidence intervals depend only on the count (tcs_number is fixed here),
  # so each distinct count is computed once.
  cis = {}

  positions = (0...con.size).map do |p|
    position_obj = {
      position: p + 1,
      tcs_number: tcs_number,
      lower_detection_limit: dl,
      pm_cut_off: pm_label
    }

    freq_hash = sequences.map { |s| s[p] }.count_freq
    [:A, :C, :G, :T, :-].each do |k|
      v = freq_hash[k.to_s]
      position_obj[k] =
        if v > 0
          ci = (cis[v] ||= ViralSeq::Math::BinomCI.new(v, tcs_number))
          {
            count: v,
            freq: ci.mean.round(4),
            freq_ci_low: ci.lower.round(4),
            freq_ci_high: ci.upper.round(4),
            greater_than_pm: v >= pm_cut_off,
            fdr: fdr_hash[v]
          }
        else
          {
            count: v,
            freq: 0,
            freq_ci_low: 0,
            freq_ci_high: 0,
            greater_than_pm: false,
            fdr: nil
          }
        end
    end

    position_obj
  end

  {
    tcs_number: tcs_number,
    lower_detection_limit: dl,
    pm_cut_off: pm_cut_off,
    positions: positions
  }
end # end of nt_variants
711
+
595
712
 
596
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
597
714
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -1183,6 +1300,28 @@ module ViralSeq
1183
1300
  return new_sh
1184
1301
  end
1185
1302
 
1303
# QC for each nucleotide sequence comparing with sample consensus for indels
#
# Every unique sequence is first compared with the consensus via #compare_with.
# Only when that result is 4 or more is the pair aligned with MUSCLE; the
# sequence is flagged when the first element of the alignment result contains
# "-" (presumably the aligned consensus -- confirm against ViralSeq::Muscle.align).
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}

def qc_indel
  con = consensus
  names_passed = []
  names_indel = []

  dna_hash.uniq_hash.each do |seq, names|
    # && short-circuits, so sequences below the threshold are never aligned.
    has_indel = seq.compare_with(con) >= 4 && ViralSeq::Muscle.align(con, seq)[0]["-"]
    (has_indel ? names_indel : names_passed).concat(names)
  end

  { no_indel: self.sub(names_passed), has_indel: self.sub(names_indel) }
end # end of qc_indel
1323
+
1324
+
1186
1325
  # trim dna sequences based on the provided reference coordinates.
1187
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1188
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -6,6 +6,10 @@ module ViralSeq
6
6
  class << self
7
7
 
8
8
  # methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
9
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
10
+ # @param m [Integer] PID abundance
11
+ # @param error_rate [Float] estimated platform error rate.
12
+ # @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
9
13
 
10
14
  def calculate_cut_off(m, error_rate = 0.02)
11
15
  n = 0
@@ -280,6 +284,23 @@ module ViralSeq
280
284
  abort infor.red.bold
281
285
  end
282
286
 
287
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
#
# Values present in ViralSeq::DETECT_SEN are returned as stored. Otherwise the
# limit is the upper bound of the exact two-sided 95% binomial confidence
# interval for zero observations out of tcs_number, which has the closed form
# 1 - 0.025 ** (1 / tcs_number). Computed in Ruby, so R is not needed here.
# @param tcs_number [Integer] number of TCS
# @return [Float] lower detection limit, rounded to 4 decimals; 0.0 when tcs_number is 2 or less
# @example calculate lower detection limit
#   ViralSeq::TcsCore.detection_limit(100)
#   => 0.0362

def detection_limit(tcs_number)
  cached = ViralSeq::DETECT_SEN[tcs_number]
  return cached if cached

  n = tcs_number.to_f
  return 0.0 unless n > 2

  (1 - 0.025**(1.0 / n)).round(4)
end
303
+
283
304
  private
284
305
 
285
306
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.9"
6
- TCS_VERSION = "2.3.8"
5
+ VERSION = "1.6.0"
6
+ TCS_VERSION = "2.5.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.9
4
+ version: 1.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-08-02 00:00:00.000000000 Z
12
+ date: 2022-01-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -172,6 +172,7 @@ files:
172
172
  - docs/dr.json
173
173
  - docs/sample_miseq_data/hivdr_control/r1.fastq.gz
174
174
  - docs/sample_miseq_data/hivdr_control/r2.fastq.gz
175
+ - docs/variants_structure.pdf
175
176
  - lib/viral_seq.rb
176
177
  - lib/viral_seq/constant.rb
177
178
  - lib/viral_seq/enumerable.rb