viral_seq 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
4
- data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
3
+ metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
4
+ data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
5
5
  SHA512:
6
- metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
7
- data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
6
+ metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
7
+ data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
data/README.md CHANGED
@@ -179,6 +179,14 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.5.0-01042022
183
+
184
+ 1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
185
+ 2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
186
+ 3. Added a function to quality-check DNA sequences for indels by comparing them with the sample consensus. `ViralSeq::SeqHash#qc_indel`
187
+ 4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
188
+ 5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
189
+
182
190
  ### Version 1.4.0-10132021
183
191
 
184
192
  1. Added a function to calculate the false discovery rate (FDR, aka Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
data/bin/tcs CHANGED
@@ -200,7 +200,7 @@ begin
200
200
  summary_json[:paired_raw_sequence] = paired_seq_number
201
201
  if paired_seq_number < raw_sequence_number * 0.001
202
202
  summary_json[:warnings] <<
203
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
203
+ "WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
204
204
  end
205
205
 
206
206
  common_keys.each do |seqtag|
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
211
+ # return a new SeqHash object containing only the given range of positions of each nt sequence
212
+ # @param range [Range] range of positions on the nt sequence
213
+ # @return [ViralSeq::SeqHash] a sub SeqHash object
214
+
215
+ def nt_range(range)
216
+ dna_hash = self.dna_hash
217
+ new_hash = {}
218
+ dna_hash.each do |k,v|
219
+ new_hash[k] = v[range]
220
+ end
221
+ ViralSeq::SeqHash.new(new_hash)
222
+ end # end of #nt_range
223
+
224
+ # check the size range of the DNA sequences of the SeqHash object
225
+ # @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
226
+
227
+ def check_nt_size
228
+ dna_hash = self.dna_hash
229
+ size_array = []
230
+ dna_hash.values.each do |v|
231
+ size_array << v.size
232
+ end
233
+ return { max: size_array.max, min: size_array.min }
234
+ end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -624,6 +649,67 @@ module ViralSeq
624
649
  end
625
650
  end #end of #fdr
626
651
 
652
+ # analysis for the nt sequence variants.
653
+ # @return [Hash] A Hash with the results of the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
654
+
655
+ def nt_variants
656
+ return_obj = {}
657
+ nt_hash = self.dna_hash
658
+ tcs_number = self.size
659
+ dl = ViralSeq::TcsCore.detection_limit(tcs_number)
660
+ fdr_hash = self.fdr
661
+ pm_cut_off = self.pm
662
+ con = self.consensus
663
+ return_obj[:tcs_number] = tcs_number
664
+ return_obj[:lower_detection_limit] = dl
665
+ return_obj[:pm_cut_off] = pm_cut_off
666
+ return_obj[:positions] = []
667
+ cis = {}
668
+
669
+ (0..(con.size - 1)).each do |p|
670
+ position_obj = {}
671
+ position_obj[:position] = p + 1
672
+ position_obj[:tcs_number] = tcs_number
673
+ position_obj[:lower_detection_limit] = dl
674
+ position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
675
+
676
+ nts = []
677
+ dna_hash.each do |n,s|
678
+ nts << s[p]
679
+ end
680
+ freq_hash = nts.count_freq
681
+ [:A, :C, :G, :T, :-].each do |k|
682
+ v = freq_hash[k.to_s]
683
+ position_obj[k] = {}
684
+ position_obj[k][:count] = v
685
+ if v > 0
686
+ if cis[[v, tcs_number]]
687
+ ci = cis[[v, tcs_number]]
688
+ else
689
+ ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
690
+ cis[[v, tcs_number]] = ci
691
+ end
692
+ position_obj[k][:freq] = ci.mean.round(4)
693
+ position_obj[k][:freq_ci_low] = ci.lower.round(4)
694
+ position_obj[k][:freq_ci_high] = ci.upper.round(4)
695
+ position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
696
+ position_obj[k][:fdr] = fdr_hash[v]
697
+ else
698
+ position_obj[k][:freq] = 0
699
+ position_obj[k][:freq_ci_low] = 0
700
+ position_obj[k][:freq_ci_high] = 0
701
+ position_obj[k][:greater_than_pm] = false
702
+ position_obj[k][:fdr] = nil
703
+ end
704
+ end
705
+
706
+ return_obj[:positions] << position_obj
707
+ end
708
+
709
+ return_obj
710
+ end # end of nt_variants
711
+
712
+
627
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
628
714
  # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
629
715
  # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
@@ -1214,6 +1300,28 @@ module ViralSeq
1214
1300
  return new_sh
1215
1301
  end
1216
1302
 
1303
+ # QC for each nucleotide sequence comparing with sample consensus for indels
1304
+ # @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
1305
+
1306
+ def qc_indel
1307
+ con = self.consensus
1308
+ dna_hash = self.dna_hash
1309
+ names_passed = []
1310
+ names_indel = []
1311
+ dna_hash.uniq_hash.each do |seq, names|
1312
+ if seq.compare_with(con) < 4
1313
+ names_passed += names
1314
+ elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
1315
+ names_indel += names
1316
+ else
1317
+ names_passed += names
1318
+ end
1319
+ end
1320
+ return {no_indel: self.sub(names_passed),
1321
+ has_indel: self.sub(names_indel)}
1322
+ end # end of qc_indel
1323
+
1324
+
1217
1325
  # trim dna sequences based on the provided reference coordinates.
1218
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1219
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -280,6 +280,19 @@ module ViralSeq
280
280
  abort infor.red.bold
281
281
  end
282
282
 
283
+ # lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
284
+ # R required.
285
+ # @param tcs_number [Integer] number of TCS
286
+ # @return [Float] lower detection limit
287
+ # @example calculate lower detection limit
288
+ # ViralSeq::TcsCore.detection_limit(100)
289
+ # => 0.0362
290
+
291
+ def detection_limit(tcs_number)
292
+ dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
293
+ dl.to_f
294
+ end
295
+
283
296
  private
284
297
 
285
298
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.4.0"
6
- TCS_VERSION = "2.3.8"
5
+ VERSION = "1.5.0"
6
+ TCS_VERSION = "2.4.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-10-14 00:00:00.000000000 Z
12
+ date: 2022-01-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler