viral_seq 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
4
- data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
3
+ metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
4
+ data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
5
5
  SHA512:
6
- metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
7
- data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
6
+ metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
7
+ data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
data/README.md CHANGED
@@ -179,6 +179,14 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.5.0-01042022
183
+
184
+ 1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
185
+ 2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
186
+ 3. Added a function to quality-check DNA sequences against the sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
187
+ 4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
188
+ 5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
189
+
182
190
  ### Version 1.4.0-10132021
183
191
 
184
192
  1. Added a function to calculate the false discovery rate (FDR, i.e., Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
data/bin/tcs CHANGED
@@ -200,7 +200,7 @@ begin
200
200
  summary_json[:paired_raw_sequence] = paired_seq_number
201
201
  if paired_seq_number < raw_sequence_number * 0.001
202
202
  summary_json[:warnings] <<
203
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
203
+ "WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
204
204
  end
205
205
 
206
206
  common_keys.each do |seqtag|
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
211
+ # return a new SeqHash object containing only the given range of nt sequence positions
212
+ # @param range [Range] range of positions on the nt sequence
213
+ # @return [ViralSeq::SeqHash] a sub SeqHash object
214
+
215
+ def nt_range(range)
216
+ dna_hash = self.dna_hash
217
+ new_hash = {}
218
+ dna_hash.each do |k,v|
219
+ new_hash[k] = v[range]
220
+ end
221
+ ViralSeq::SeqHash.new(new_hash)
222
+ end # end of #nt_range
223
+
224
+ # check the size range of the DNA sequences of the SeqHash object
225
+ # @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
226
+
227
+ def check_nt_size
228
+ dna_hash = self.dna_hash
229
+ size_array = []
230
+ dna_hash.values.each do |v|
231
+ size_array << v.size
232
+ end
233
+ return { max: size_array.max, min: size_array.min }
234
+ end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -624,6 +649,67 @@ module ViralSeq
624
649
  end
625
650
  end #end of #fdr
626
651
 
652
+ # analysis for the nt sequence variants.
653
+ # @return [Hash] A Hash with the results of the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
654
+
655
+ def nt_variants
656
+ return_obj = {}
657
+ nt_hash = self.dna_hash
658
+ tcs_number = self.size
659
+ dl = ViralSeq::TcsCore.detection_limit(tcs_number)
660
+ fdr_hash = self.fdr
661
+ pm_cut_off = self.pm
662
+ con = self.consensus
663
+ return_obj[:tcs_number] = tcs_number
664
+ return_obj[:lower_detection_limit] = dl
665
+ return_obj[:pm_cut_off] = pm_cut_off
666
+ return_obj[:positions] = []
667
+ cis = {}
668
+
669
+ (0..(con.size - 1)).each do |p|
670
+ position_obj = {}
671
+ position_obj[:position] = p + 1
672
+ position_obj[:tcs_number] = tcs_number
673
+ position_obj[:lower_detection_limit] = dl
674
+ position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
675
+
676
+ nts = []
677
+ dna_hash.each do |n,s|
678
+ nts << s[p]
679
+ end
680
+ freq_hash = nts.count_freq
681
+ [:A, :C, :G, :T, :-].each do |k|
682
+ v = freq_hash[k.to_s]
683
+ position_obj[k] = {}
684
+ position_obj[k][:count] = v
685
+ if v > 0
686
+ if cis[[v, tcs_number]]
687
+ ci = cis[[v, tcs_number]]
688
+ else
689
+ ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
690
+ cis[[v, tcs_number]] = ci
691
+ end
692
+ position_obj[k][:freq] = ci.mean.round(4)
693
+ position_obj[k][:freq_ci_low] = ci.lower.round(4)
694
+ position_obj[k][:freq_ci_high] = ci.upper.round(4)
695
+ position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
696
+ position_obj[k][:fdr] = fdr_hash[v]
697
+ else
698
+ position_obj[k][:freq] = 0
699
+ position_obj[k][:freq_ci_low] = 0
700
+ position_obj[k][:freq_ci_high] = 0
701
+ position_obj[k][:greater_than_pm] = false
702
+ position_obj[k][:fdr] = nil
703
+ end
704
+ end
705
+
706
+ return_obj[:positions] << position_obj
707
+ end
708
+
709
+ return_obj
710
+ end # end of nt_variants
711
+
712
+
627
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
628
714
  # @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
629
715
  # @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
@@ -1214,6 +1300,28 @@ module ViralSeq
1214
1300
  return new_sh
1215
1301
  end
1216
1302
 
1303
+ # QC for each nucleotide sequence comparing with sample consensus for indels
1304
+ # @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
1305
+
1306
+ def qc_indel
1307
+ con = self.consensus
1308
+ dna_hash = self.dna_hash
1309
+ names_passed = []
1310
+ names_indel = []
1311
+ dna_hash.uniq_hash.each do |seq, names|
1312
+ if seq.compare_with(con) < 4
1313
+ names_passed += names
1314
+ elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
1315
+ names_indel += names
1316
+ else
1317
+ names_passed += names
1318
+ end
1319
+ end
1320
+ return {no_indel: self.sub(names_passed),
1321
+ has_indel: self.sub(names_indel)}
1322
+ end # end of qc_indel
1323
+
1324
+
1217
1325
  # trim dna sequences based on the provided reference coordinates.
1218
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1219
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -280,6 +280,19 @@ module ViralSeq
280
280
  abort infor.red.bold
281
281
  end
282
282
 
283
+ # lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
284
+ # R required.
285
+ # @param tcs_number [Integer] number of TCS
286
+ # @return [Float] lower detection limit
287
+ # @example calculate lower detection limit
288
+ # ViralSeq::TcsCore.detection_limit(100)
289
+ # => 0.0362
290
+
291
+ def detection_limit(tcs_number)
292
+ dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
293
+ dl.to_f
294
+ end
295
+
283
296
  private
284
297
 
285
298
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.4.0"
6
- TCS_VERSION = "2.3.8"
5
+ VERSION = "1.5.0"
6
+ TCS_VERSION = "2.4.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-10-14 00:00:00.000000000 Z
12
+ date: 2022-01-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler