viral_seq 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/bin/tcs +1 -1
- data/lib/viral_seq/seq_hash.rb +108 -0
- data/lib/viral_seq/tcs_core.rb +13 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
|
|
4
|
+
data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
|
|
7
|
+
data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
|
data/README.md
CHANGED
|
@@ -179,6 +179,14 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
|
179
179
|
|
|
180
180
|
## Updates
|
|
181
181
|
|
|
182
|
+
### Version 1.5.0-01042022
|
|
183
|
+
|
|
184
|
+
1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
|
185
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
|
186
|
+
3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
|
187
|
+
4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
|
188
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
|
189
|
+
|
|
182
190
|
### Version 1.4.0-10132021
|
|
183
191
|
|
|
184
192
|
1. Added a function to calculate the false discovery rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
data/bin/tcs
CHANGED
|
@@ -200,7 +200,7 @@ begin
|
|
|
200
200
|
summary_json[:paired_raw_sequence] = paired_seq_number
|
|
201
201
|
if paired_seq_number < raw_sequence_number * 0.001
|
|
202
202
|
summary_json[:warnings] <<
|
|
203
|
-
"WARNING: Filtered raw
|
|
203
|
+
"WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
|
|
204
204
|
end
|
|
205
205
|
|
|
206
206
|
common_keys.each do |seqtag|
|
data/lib/viral_seq/seq_hash.rb
CHANGED
|
@@ -208,6 +208,31 @@ module ViralSeq
|
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
|
209
209
|
end
|
|
210
210
|
|
|
211
|
+
# return a new SeqHash object restricted to the given range of nt sequence positions
|
|
212
|
+
# @param range [Range] range of positions on the nt sequence
|
|
213
|
+
# @return [ViralSeq::SeqHash] a sub SeqHash object
|
|
214
|
+
|
|
215
|
+
def nt_range(range)
|
|
216
|
+
dna_hash = self.dna_hash
|
|
217
|
+
new_hash = {}
|
|
218
|
+
dna_hash.each do |k,v|
|
|
219
|
+
new_hash[k] = v[range]
|
|
220
|
+
end
|
|
221
|
+
ViralSeq::SeqHash.new(new_hash)
|
|
222
|
+
end # end of #nt_range
|
|
223
|
+
|
|
224
|
+
# check the size range of the DNA sequences of the SeqHash object
|
|
225
|
+
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
|
|
226
|
+
|
|
227
|
+
def check_nt_size
|
|
228
|
+
dna_hash = self.dna_hash
|
|
229
|
+
size_array = []
|
|
230
|
+
dna_hash.values.each do |v|
|
|
231
|
+
size_array << v.size
|
|
232
|
+
end
|
|
233
|
+
return { max: size_array.max, min: size_array.min }
|
|
234
|
+
end
|
|
235
|
+
|
|
211
236
|
# write the nt sequences to a FASTA format file
|
|
212
237
|
# @param file [String] path to the FASTA output file
|
|
213
238
|
# @return [NilClass]
|
|
@@ -624,6 +649,67 @@ module ViralSeq
|
|
|
624
649
|
end
|
|
625
650
|
end #end of #fdr
|
|
626
651
|
|
|
652
|
+
# analysis for the nt sequence variants.
|
|
653
|
+
# @return [Hash] A Hash with the variant analysis information. For the keys/values of the returned object, see /docs/variants_structure.pdf
|
|
654
|
+
|
|
655
|
+
def nt_variants
|
|
656
|
+
return_obj = {}
|
|
657
|
+
nt_hash = self.dna_hash
|
|
658
|
+
tcs_number = self.size
|
|
659
|
+
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
|
660
|
+
fdr_hash = self.fdr
|
|
661
|
+
pm_cut_off = self.pm
|
|
662
|
+
con = self.consensus
|
|
663
|
+
return_obj[:tcs_number] = tcs_number
|
|
664
|
+
return_obj[:lower_detection_limit] = dl
|
|
665
|
+
return_obj[:pm_cut_off] = pm_cut_off
|
|
666
|
+
return_obj[:positions] = []
|
|
667
|
+
cis = {}
|
|
668
|
+
|
|
669
|
+
(0..(con.size - 1)).each do |p|
|
|
670
|
+
position_obj = {}
|
|
671
|
+
position_obj[:position] = p + 1
|
|
672
|
+
position_obj[:tcs_number] = tcs_number
|
|
673
|
+
position_obj[:lower_detection_limit] = dl
|
|
674
|
+
position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
|
|
675
|
+
|
|
676
|
+
nts = []
|
|
677
|
+
dna_hash.each do |n,s|
|
|
678
|
+
nts << s[p]
|
|
679
|
+
end
|
|
680
|
+
freq_hash = nts.count_freq
|
|
681
|
+
[:A, :C, :G, :T, :-].each do |k|
|
|
682
|
+
v = freq_hash[k.to_s]
|
|
683
|
+
position_obj[k] = {}
|
|
684
|
+
position_obj[k][:count] = v
|
|
685
|
+
if v > 0
|
|
686
|
+
if cis[[v, tcs_number]]
|
|
687
|
+
ci = cis[[v, tcs_number]]
|
|
688
|
+
else
|
|
689
|
+
ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
|
|
690
|
+
cis[[v, tcs_number]] = ci
|
|
691
|
+
end
|
|
692
|
+
position_obj[k][:freq] = ci.mean.round(4)
|
|
693
|
+
position_obj[k][:freq_ci_low] = ci.lower.round(4)
|
|
694
|
+
position_obj[k][:freq_ci_high] = ci.upper.round(4)
|
|
695
|
+
position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
|
|
696
|
+
position_obj[k][:fdr] = fdr_hash[v]
|
|
697
|
+
else
|
|
698
|
+
position_obj[k][:freq] = 0
|
|
699
|
+
position_obj[k][:freq_ci_low] = 0
|
|
700
|
+
position_obj[k][:freq_ci_high] = 0
|
|
701
|
+
position_obj[k][:greater_than_pm] = false
|
|
702
|
+
position_obj[k][:fdr] = nil
|
|
703
|
+
end
|
|
704
|
+
end
|
|
705
|
+
|
|
706
|
+
return_obj[:positions] << position_obj
|
|
707
|
+
end
|
|
708
|
+
|
|
709
|
+
return_obj
|
|
710
|
+
end # end of nt_variants
|
|
711
|
+
|
|
712
|
+
|
|
627
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
|
628
714
|
# @param path_to_muscle [String], path to MUSCLE executable. If not provided (as default), it will use RubyGem::MuscleBio
|
|
629
715
|
# @return [SeqHash] new SeqHash object of the aligned @dna_hash, the title has "_aligned"
|
|
@@ -1214,6 +1300,28 @@ module ViralSeq
|
|
|
1214
1300
|
return new_sh
|
|
1215
1301
|
end
|
|
1216
1302
|
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels
|
|
1304
|
+
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
|
|
1305
|
+
|
|
1306
|
+
def qc_indel
|
|
1307
|
+
con = self.consensus
|
|
1308
|
+
dna_hash = self.dna_hash
|
|
1309
|
+
names_passed = []
|
|
1310
|
+
names_indel = []
|
|
1311
|
+
dna_hash.uniq_hash.each do |seq, names|
|
|
1312
|
+
if seq.compare_with(con) < 4
|
|
1313
|
+
names_passed += names
|
|
1314
|
+
elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
|
|
1315
|
+
names_indel += names
|
|
1316
|
+
else
|
|
1317
|
+
names_passed += names
|
|
1318
|
+
end
|
|
1319
|
+
end
|
|
1320
|
+
return {no_indel: self.sub(names_passed),
|
|
1321
|
+
has_indel: self.sub(names_indel)}
|
|
1322
|
+
end # end of qc_indel
|
|
1323
|
+
|
|
1324
|
+
|
|
1217
1325
|
# trim dna sequences based on the provided reference coordinates.
|
|
1218
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
|
1219
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
|
@@ -280,6 +280,19 @@ module ViralSeq
|
|
|
280
280
|
abort infor.red.bold
|
|
281
281
|
end
|
|
282
282
|
|
|
283
|
+
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
|
|
284
|
+
# R required.
|
|
285
|
+
# @param tcs_number [Integer] number of TCS
|
|
286
|
+
# @return [Float] lower detection limit
|
|
287
|
+
# @example calculate lower detection limit
|
|
288
|
+
# ViralSeq::TcsCore.detection_limit(100)
|
|
289
|
+
# => 0.0362
|
|
290
|
+
|
|
291
|
+
def detection_limit(tcs_number)
|
|
292
|
+
dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
|
|
293
|
+
dl.to_f
|
|
294
|
+
end
|
|
295
|
+
|
|
283
296
|
private
|
|
284
297
|
|
|
285
298
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: viral_seq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.0
|
|
4
|
+
version: 1.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Shuntai Zhou
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date:
|
|
12
|
+
date: 2022-01-06 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: bundler
|