viral_seq 1.3.0 → 1.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
# Return a new SeqHash whose nt sequences are cut down to the given range.
# The range is handed directly to `[]` on every sequence, so positions are
# 0-based (assuming the sequences are Strings, a range starting past the end
# of a sequence yields nil for that entry).
# Only the sliced nt hash is passed to the SeqHash constructor.
# @param range [Range] range of (0-based) positions on the nt sequence
# @return [ViralSeq::SeqHash] a sub SeqHash object
def nt_range(range)
  sliced = dna_hash.transform_values { |seq| seq[range] }
  ViralSeq::SeqHash.new(sliced)
end # end of #nt_range
223
+
# Check the size range of the DNA sequences of the SeqHash object.
# Both values are nil when the object holds no sequences.
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
def check_nt_size
  shortest, longest = dna_hash.values.map(&:size).minmax
  { max: longest, min: shortest }
end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -592,6 +617,98 @@ module ViralSeq
592
617
 
593
618
  alias_method :pm, :poisson_minority_cutoff
594
619
 
# Calculate the false detection rate (FDR) for minority mutations.
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
# R (the `Rscript` executable) is required.
# @param error_rate [Float] estimated sequencing error rate
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
#   Keys that were not observed read as 0 (the Hash default); an empty Hash is
#   returned when there are no sequences.
# @example calculate FDR for mutations that appeared twice in the sample dataset
#   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
#   fdr_hash = my_seqhash.fdr
#   fdr_hash[2].round(5)
#   => 0.00726 # mutations that appear twice have a 0.007261748 chance of being caused by residual errors.
def fdr(error_rate = 0.0001)
  sequences = dna_hash.values
  return {} if sequences.empty?

  seq_count = size
  observed_hash = variant_for_poisson(sequences)

  # One unadjusted exact p-value per observation: under the null, the probability
  # of seeing k or more occurrences. Each key k contributes v copies, in hash order.
  p_unadjusted = []
  observed_hash.each do |k, v|
    p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f
    p_unadjusted.concat(Array.new(v, p_value))
  end

  # Benjamini-Hochberg correction (controls FDR). The output has one value per
  # input p-value, in the same order as p_unadjusted.
  adjusted = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split.map(&:to_f)

  # Walk the adjusted vector by offset instead of collapsing equal values:
  # different keys can share the same adjusted p-value, and collapsing them
  # would shift every later key onto the wrong value.
  fdr_hash = Hash.new(0)
  offset = 0
  observed_hash.each do |k, v|
    value = adjusted[offset]
    fdr_hash[k] = value if v.positive? && value
    offset += v
  end
  fdr_hash
end # end of #fdr
651
+
# Analysis for the nt sequence variants.
# For every position of the consensus, counts A/C/G/T/- across all sequences and
# attaches frequency, binomial confidence interval, Poisson minority cut-off
# comparison and FDR to each base.
# @return [Hash] A Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf
def nt_variants
  tcs_number = size
  dl = ViralSeq::TcsCore.detection_limit(tcs_number)
  fdr_hash = fdr
  pm_cut_off = pm
  con = consensus
  sequences = dna_hash.values

  # Per-position entries carry an infinite cut-off as a String; the top-level
  # entry keeps the raw value.
  pm_for_position = pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off

  # Confidence intervals depend only on the count (tcs_number is fixed for the
  # whole call), so each one is computed once and reused.
  cis = {}

  positions = (0...con.size).map do |idx|
    position_obj = {
      position: idx + 1,
      tcs_number: tcs_number,
      lower_detection_limit: dl,
      pm_cut_off: pm_for_position
    }

    # count_freq is expected to return a Hash that reads 0 for absent bases.
    freq_hash = sequences.map { |seq| seq[idx] }.count_freq

    [:A, :C, :G, :T, :-].each do |k|
      v = freq_hash[k.to_s]
      position_obj[k] =
        if v > 0
          ci = (cis[v] ||= ViralSeq::Math::BinomCI.new(v, tcs_number))
          {
            count: v,
            freq: ci.mean.round(4),
            freq_ci_low: ci.lower.round(4),
            freq_ci_high: ci.upper.round(4),
            greater_than_pm: v >= pm_cut_off,
            fdr: fdr_hash[v]
          }
        else
          {
            count: v,
            freq: 0,
            freq_ci_low: 0,
            freq_ci_high: 0,
            greater_than_pm: false,
            fdr: nil
          }
        end
    end

    position_obj
  end

  {
    tcs_number: tcs_number,
    lower_detection_limit: dl,
    pm_cut_off: pm_cut_off,
    positions: positions
  }
end # end of nt_variants
711
+
595
712
 
596
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
597
714
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -1183,6 +1300,28 @@ module ViralSeq
1183
1300
  return new_sh
1184
1301
  end
1185
1302
 
# QC for each nucleotide sequence comparing with sample consensus for indels.
# Unique sequences are first compared with the consensus via `compare_with`
# (presumably a difference count); only those scoring 4 or more are aligned
# against the consensus with MUSCLE. A sequence is flagged when the first
# element of the alignment result (presumably the aligned consensus) contains a gap.
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
def qc_indel
  con = consensus
  names_passed = []
  names_indel = []

  dna_hash.uniq_hash.each do |seq, names|
    # && short-circuits, so the alignment only runs for sufficiently different sequences.
    has_indel = seq.compare_with(con) >= 4 && ViralSeq::Muscle.align(con, seq)[0]["-"]
    (has_indel ? names_indel : names_passed).concat(names)
  end

  { no_indel: sub(names_passed), has_indel: sub(names_indel) }
end # end of qc_indel
1323
+
1324
+
1186
1325
  # trim dna sequences based on the provided reference coordinates.
1187
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1188
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -6,6 +6,10 @@ module ViralSeq
6
6
  class << self
7
7
 
8
8
  # methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.
9
+ # @see https://www.ncbi.nlm.nih.gov/pubmed/26041299 reference at Zhou et al. JVI 2016.
10
+ # @param m [Integer] PID abundance
11
+ # @param error_rate [Float] estimated platform error rate.
12
+ # @return [Integer] an abundance cut-off (Integer) for offspring Primer IDs.
9
13
 
10
14
  def calculate_cut_off(m, error_rate = 0.02)
11
15
  n = 0
@@ -280,6 +284,23 @@ module ViralSeq
280
284
  abort infor.red.bold
281
285
  end
282
286
 
# Lower detection sensitivity for minority mutations given the number of TCS,
# calculated based on binomial distribution.
# Values present in ViralSeq::DETECT_SEN are returned directly; otherwise R
# (the `Rscript` executable, base packages only) is required. If the R call
# produces no output, the result is 0.0.
# @param tcs_number [Integer] number of TCS
# @return [Float] lower detection limit
# @example calculate lower detection limit
#   ViralSeq::TcsCore.detection_limit(100)
#   => 0.0362
def detection_limit(tcs_number)
  cached = ViralSeq::DETECT_SEN[tcs_number]
  return cached if cached

  # Upper bound of the exact binomial CI for 0 successes out of tcs_number,
  # rounded to 4 digits; 0 when tcs_number is 2 or less.
  r_expr = "if (#{tcs_number} > 2) cat(round(binom.test(0, #{tcs_number})[['conf.int']][2], 4)) else cat(0)"
  `Rscript -e "#{r_expr}" 2>/dev/null`.to_f
end
303
+
283
304
  private
284
305
 
285
306
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.3.0"
6
- TCS_VERSION = "2.3.8"
5
+ VERSION = "1.6.1"
6
+ TCS_VERSION = "2.5.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-08-30 00:00:00.000000000 Z
12
+ date: 2022-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -172,6 +172,7 @@ files:
172
172
  - docs/dr.json
173
173
  - docs/sample_miseq_data/hivdr_control/r1.fastq.gz
174
174
  - docs/sample_miseq_data/hivdr_control/r2.fastq.gz
175
+ - docs/variants_structure.pdf
175
176
  - lib/viral_seq.rb
176
177
  - lib/viral_seq/constant.rb
177
178
  - lib/viral_seq/enumerable.rb