viral_seq 1.4.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/bin/tcs +1 -1
- data/lib/viral_seq/seq_hash.rb +108 -0
- data/lib/viral_seq/tcs_core.rb +13 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
|
4
|
+
data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
|
7
|
+
data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
|
data/README.md
CHANGED
@@ -179,6 +179,14 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
179
|
|
180
180
|
## Updates
|
181
181
|
|
182
|
+
### Version 1.5.0-01042022
|
183
|
+
|
184
|
+
1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
185
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
186
|
+
3. Added a function to quality-check DNA sequences for indels by comparing them with the sample consensus. `ViralSeq::SeqHash#qc_indel`
|
187
|
+
4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
188
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
189
|
+
|
182
190
|
### Version 1.4.0-10132021
|
183
191
|
|
184
192
|
1. Added a function to calculate the false discovery rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
data/bin/tcs
CHANGED
@@ -200,7 +200,7 @@ begin
|
|
200
200
|
summary_json[:paired_raw_sequence] = paired_seq_number
|
201
201
|
if paired_seq_number < raw_sequence_number * 0.001
|
202
202
|
summary_json[:warnings] <<
|
203
|
-
"WARNING: Filtered raw
|
203
|
+
"WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
|
204
204
|
end
|
205
205
|
|
206
206
|
common_keys.each do |seqtag|
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -208,6 +208,31 @@ module ViralSeq
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
209
209
|
end
|
210
210
|
|
211
|
+
# return a new SeqHash object restricted to a given range of nt sequence positions
|
212
|
+
# @param range [Range] range of positions on the nt sequence
|
213
|
+
# @return [ViralSeq::SeqHash] a sub SeqHash object
|
214
|
+
|
215
|
+
def nt_range(range)
|
216
|
+
dna_hash = self.dna_hash
|
217
|
+
new_hash = {}
|
218
|
+
dna_hash.each do |k,v|
|
219
|
+
new_hash[k] = v[range]
|
220
|
+
end
|
221
|
+
ViralSeq::SeqHash.new(new_hash)
|
222
|
+
end # end of #nt_range
|
223
|
+
|
224
|
+
# check the size range of the DNA sequences of the SeqHash object
|
225
|
+
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
|
226
|
+
|
227
|
+
def check_nt_size
|
228
|
+
dna_hash = self.dna_hash
|
229
|
+
size_array = []
|
230
|
+
dna_hash.values.each do |v|
|
231
|
+
size_array << v.size
|
232
|
+
end
|
233
|
+
return { max: size_array.max, min: size_array.min }
|
234
|
+
end
|
235
|
+
|
211
236
|
# write the nt sequences to a FASTA format file
|
212
237
|
# @param file [String] path to the FASTA output file
|
213
238
|
# @return [NilClass]
|
@@ -624,6 +649,67 @@ module ViralSeq
|
|
624
649
|
end
|
625
650
|
end #end of #fdr
|
626
651
|
|
652
|
+
# analysis for the nt sequence variants.
|
653
|
+
# @return [Hash] a Hash with the results of the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
|
654
|
+
|
655
|
+
def nt_variants
|
656
|
+
return_obj = {}
|
657
|
+
nt_hash = self.dna_hash
|
658
|
+
tcs_number = self.size
|
659
|
+
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
660
|
+
fdr_hash = self.fdr
|
661
|
+
pm_cut_off = self.pm
|
662
|
+
con = self.consensus
|
663
|
+
return_obj[:tcs_number] = tcs_number
|
664
|
+
return_obj[:lower_detection_limit] = dl
|
665
|
+
return_obj[:pm_cut_off] = pm_cut_off
|
666
|
+
return_obj[:positions] = []
|
667
|
+
cis = {}
|
668
|
+
|
669
|
+
(0..(con.size - 1)).each do |p|
|
670
|
+
position_obj = {}
|
671
|
+
position_obj[:position] = p + 1
|
672
|
+
position_obj[:tcs_number] = tcs_number
|
673
|
+
position_obj[:lower_detection_limit] = dl
|
674
|
+
position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
|
675
|
+
|
676
|
+
nts = []
|
677
|
+
dna_hash.each do |n,s|
|
678
|
+
nts << s[p]
|
679
|
+
end
|
680
|
+
freq_hash = nts.count_freq
|
681
|
+
[:A, :C, :G, :T, :-].each do |k|
|
682
|
+
v = freq_hash[k.to_s]
|
683
|
+
position_obj[k] = {}
|
684
|
+
position_obj[k][:count] = v
|
685
|
+
if v > 0
|
686
|
+
if cis[[v, tcs_number]]
|
687
|
+
ci = cis[[v, tcs_number]]
|
688
|
+
else
|
689
|
+
ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
|
690
|
+
cis[[v, tcs_number]] = ci
|
691
|
+
end
|
692
|
+
position_obj[k][:freq] = ci.mean.round(4)
|
693
|
+
position_obj[k][:freq_ci_low] = ci.lower.round(4)
|
694
|
+
position_obj[k][:freq_ci_high] = ci.upper.round(4)
|
695
|
+
position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
|
696
|
+
position_obj[k][:fdr] = fdr_hash[v]
|
697
|
+
else
|
698
|
+
position_obj[k][:freq] = 0
|
699
|
+
position_obj[k][:freq_ci_low] = 0
|
700
|
+
position_obj[k][:freq_ci_high] = 0
|
701
|
+
position_obj[k][:greater_than_pm] = false
|
702
|
+
position_obj[k][:fdr] = nil
|
703
|
+
end
|
704
|
+
end
|
705
|
+
|
706
|
+
return_obj[:positions] << position_obj
|
707
|
+
end
|
708
|
+
|
709
|
+
return_obj
|
710
|
+
end # end of nt_variants
|
711
|
+
|
712
|
+
|
627
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
628
714
|
# @param path_to_muscle [String] path to the MUSCLE executable. If not provided (the default), it will use RubyGem::MuscleBio
|
629
715
|
# @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
|
@@ -1214,6 +1300,28 @@ module ViralSeq
|
|
1214
1300
|
return new_sh
|
1215
1301
|
end
|
1216
1302
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels
|
1304
|
+
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
|
1305
|
+
|
1306
|
+
def qc_indel
|
1307
|
+
con = self.consensus
|
1308
|
+
dna_hash = self.dna_hash
|
1309
|
+
names_passed = []
|
1310
|
+
names_indel = []
|
1311
|
+
dna_hash.uniq_hash.each do |seq, names|
|
1312
|
+
if seq.compare_with(con) < 4
|
1313
|
+
names_passed += names
|
1314
|
+
elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
|
1315
|
+
names_indel += names
|
1316
|
+
else
|
1317
|
+
names_passed += names
|
1318
|
+
end
|
1319
|
+
end
|
1320
|
+
return {no_indel: self.sub(names_passed),
|
1321
|
+
has_indel: self.sub(names_indel)}
|
1322
|
+
end # end of qc_indel
|
1323
|
+
|
1324
|
+
|
1217
1325
|
# trim dna sequences based on the provided reference coordinates.
|
1218
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
1219
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -280,6 +280,19 @@ module ViralSeq
|
|
280
280
|
abort infor.red.bold
|
281
281
|
end
|
282
282
|
|
283
|
+
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
|
284
|
+
# R required.
|
285
|
+
# @param tcs_number [Integer] number of TCS
|
286
|
+
# @return [Float] lower detection limit
|
287
|
+
# @example calculate lower detection limit
|
288
|
+
# ViralSeq::TcsCore.detection_limit(100)
|
289
|
+
# => 0.0362
|
290
|
+
|
291
|
+
def detection_limit(tcs_number)
|
292
|
+
dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
|
293
|
+
dl.to_f
|
294
|
+
end
|
295
|
+
|
283
296
|
private
|
284
297
|
|
285
298
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.0
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-01-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|