viral_seq 1.2.8 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +22 -0
- data/bin/tcs +8 -4
- data/bin/tcs_sdrm +9 -6
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/math.rb +1 -1
- data/lib/viral_seq/seq_hash.rb +139 -0
- data/lib/viral_seq/tcs_core.rb +13 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
|
4
|
+
data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
|
7
|
+
data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
5
|
-
colorize (
|
6
|
-
combine_pdf (>= 1.0.0)
|
7
|
-
muscle_bio (
|
8
|
-
prawn (>= 2.3.0)
|
9
|
-
prawn-table (>= 0.2.0)
|
4
|
+
viral_seq (1.3.0)
|
5
|
+
colorize (~> 0.1)
|
6
|
+
combine_pdf (~> 1.0, >= 1.0.0)
|
7
|
+
muscle_bio (~> 0.4)
|
8
|
+
prawn (~> 2.3, >= 2.3.0)
|
9
|
+
prawn-table (~> 0.2, >= 0.2.0)
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
data/README.md
CHANGED
@@ -179,6 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
179
|
|
180
180
|
## Updates
|
181
181
|
|
182
|
+
### Version 1.5.0-01042022
|
183
|
+
|
184
|
+
1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
185
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
186
|
+
3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
187
|
+
4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
188
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
189
|
+
|
190
|
+
### Version 1.4.0-10132021
|
191
|
+
|
192
|
+
1. Added a function to calculate the false detection rate (FDR, aka Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
193
|
+
2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
|
194
|
+
|
195
|
+
### Version 1.3.0-08302021
|
196
|
+
|
197
|
+
1. Fixed a bug in the `tcs` pipeline.
|
198
|
+
|
199
|
+
### Version 1.2.9-08022021
|
200
|
+
|
201
|
+
1. Fixed a bug when reading the input primer sequences in lowercases.
|
202
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
203
|
+
|
182
204
|
### Version 1.2.8-07292021
|
183
205
|
|
184
206
|
1. Fixed an issue when reading .fastq files containing blank_lines.
|
data/bin/tcs
CHANGED
@@ -152,8 +152,8 @@ begin
|
|
152
152
|
primer[:region] ? region = primer[:region] : region = "region"
|
153
153
|
summary_json[:primer_set_name] = region
|
154
154
|
|
155
|
-
cdna_primer = primer[:cdna]
|
156
|
-
forward_primer = primer[:forward]
|
155
|
+
cdna_primer = primer[:cdna].upcase
|
156
|
+
forward_primer = primer[:forward].upcase
|
157
157
|
|
158
158
|
export_raw = primer[:export_raw]
|
159
159
|
limit_raw = primer[:limit_raw]
|
@@ -200,7 +200,7 @@ begin
|
|
200
200
|
summary_json[:paired_raw_sequence] = paired_seq_number
|
201
201
|
if paired_seq_number < raw_sequence_number * 0.001
|
202
202
|
summary_json[:warnings] <<
|
203
|
-
"WARNING: Filtered raw
|
203
|
+
"WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
|
204
204
|
end
|
205
205
|
|
206
206
|
common_keys.each do |seqtag|
|
@@ -401,7 +401,11 @@ begin
|
|
401
401
|
when 4
|
402
402
|
joined_sh = shp.join2(model: :indiv)
|
403
403
|
end
|
404
|
-
|
404
|
+
if joined_sh
|
405
|
+
return joined_sh
|
406
|
+
else
|
407
|
+
joined_sh = ViralSeq::SeqHash.new
|
408
|
+
end
|
405
409
|
end
|
406
410
|
|
407
411
|
if primer[:end_join]
|
data/bin/tcs_sdrm
CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
|
|
91
91
|
point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
|
92
92
|
point_mutation_out = File.open(point_mutation_file, "w")
|
93
93
|
point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
|
94
|
-
"number,
|
94
|
+
"number,frequency,95% CI low,95% CI high,fdr,notes"
|
95
95
|
|
96
96
|
linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
|
97
97
|
linkage_out = File.open(linkage_file, "w")
|
98
98
|
linkage_out.puts "region,TCS,mutation linkage,number," +
|
99
|
-
"
|
99
|
+
"frequency,95% CI low, 95% CI high, notes"
|
100
100
|
|
101
101
|
aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
|
102
102
|
aa_report_out = File.open(aa_report_file, "w")
|
@@ -132,6 +132,7 @@ libs.each do |lib|
|
|
132
132
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
133
133
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
134
134
|
poisson_minority_cutoff = filtered_seqs.pm
|
135
|
+
fdr_hash = filtered_seqs.fdr
|
135
136
|
summary_hash[:PR] = [
|
136
137
|
seqs.size.to_s,
|
137
138
|
a3g_seqs.size.to_s,
|
@@ -142,7 +143,7 @@ libs.each do |lib|
|
|
142
143
|
next if filtered_seqs.size < 3
|
143
144
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
144
145
|
|
145
|
-
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
|
146
|
+
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
|
146
147
|
point_mutation_list += sdrm[0]
|
147
148
|
linkage_list += sdrm[1]
|
148
149
|
aa_report_list += sdrm[2]
|
@@ -155,6 +156,7 @@ libs.each do |lib|
|
|
155
156
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
156
157
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
157
158
|
poisson_minority_cutoff = filtered_seqs.pm
|
159
|
+
fdr_hash = filtered_seqs.fdr
|
158
160
|
summary_hash[:IN] = [
|
159
161
|
seqs.size.to_s,
|
160
162
|
a3g_seqs.size.to_s,
|
@@ -165,7 +167,7 @@ libs.each do |lib|
|
|
165
167
|
next if filtered_seqs.size < 3
|
166
168
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
167
169
|
|
168
|
-
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
|
170
|
+
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
|
169
171
|
point_mutation_list += sdrm[0]
|
170
172
|
linkage_list += sdrm[1]
|
171
173
|
aa_report_list += sdrm[2]
|
@@ -190,6 +192,7 @@ libs.each do |lib|
|
|
190
192
|
reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
|
191
193
|
filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
|
192
194
|
poisson_minority_cutoff = filtered_seqs.pm
|
195
|
+
fdr_hash = filtered_seqs.fdr
|
193
196
|
summary_hash[:RT] = [
|
194
197
|
seqs.size.to_s,
|
195
198
|
hypermut_seq_keys.size.to_s,
|
@@ -200,7 +203,7 @@ libs.each do |lib|
|
|
200
203
|
next if filtered_seqs.size < 3
|
201
204
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
202
205
|
|
203
|
-
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
|
206
|
+
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
|
204
207
|
point_mutation_list += sdrm[0]
|
205
208
|
linkage_list += sdrm[1]
|
206
209
|
aa_report_list += sdrm[2]
|
@@ -346,7 +349,7 @@ libs.each do |lib|
|
|
346
349
|
title: "Surveillance Drug Resistance Mutations",
|
347
350
|
file: point_mutation_file,
|
348
351
|
newPDF: "",
|
349
|
-
table_width: [
|
352
|
+
table_width: [60,50,70,65,65,60,75,70,70,70,45],
|
350
353
|
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
351
354
|
},
|
352
355
|
{
|
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -9,10 +9,13 @@ module ViralSeq
|
|
9
9
|
# IN codon 53-174 (HXB2 4384-4751)
|
10
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
11
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
12
|
+
# @param fdr_hash [Hash] hash of events => (false detection rate)
|
13
|
+
# can be obtained using ViralSeq::SeqHash#fdr
|
14
|
+
#
|
12
15
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
13
16
|
#
|
14
17
|
# # point_mutation_list: two dimensional array for the following information,
|
15
|
-
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
|
18
|
+
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
|
16
19
|
# # linkage_list: two dimensional array for the following information,
|
17
20
|
# # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
|
18
21
|
# # report_list: two dimensional array for the following information,
|
@@ -20,12 +23,13 @@ module ViralSeq
|
|
20
23
|
# @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
|
21
24
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
|
22
25
|
# p_cut_off = my_seqhash.pm
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# =>
|
27
|
-
# => PR,396,
|
28
|
-
# => PR,396,
|
26
|
+
# fdr_hash = my_seqhash.fdr
|
27
|
+
# pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
|
28
|
+
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
|
29
|
+
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
|
30
|
+
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
|
31
|
+
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
|
32
|
+
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
|
29
33
|
#
|
30
34
|
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
|
31
35
|
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
@@ -136,7 +140,7 @@ module ViralSeq
|
|
136
140
|
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
137
141
|
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
142
|
|
139
|
-
def sdrm_hiv_pr(cutoff = 0)
|
143
|
+
def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
|
140
144
|
sequences = self.dna_hash
|
141
145
|
region = "PR"
|
142
146
|
rf_label = 0
|
@@ -167,8 +171,9 @@ module ViralSeq
|
|
167
171
|
count_mut_list = mut_list.count_freq
|
168
172
|
count_mut_list.each do |m,number|
|
169
173
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
174
|
+
fdr = fdr_hash[number].round(5)
|
170
175
|
label = number < cutoff ? "*" : ""
|
171
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
176
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
172
177
|
end
|
173
178
|
end
|
174
179
|
point_mutation_list.sort_by! {|record| record[2]}
|
@@ -229,7 +234,7 @@ module ViralSeq
|
|
229
234
|
# @param (see #sdrm_hiv_pr)
|
230
235
|
# @return (see #sdrm_hiv_pr)
|
231
236
|
|
232
|
-
def sdrm_hiv_rt(cutoff = 0)
|
237
|
+
def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
|
233
238
|
sequences = self.dna_hash
|
234
239
|
region = "RT"
|
235
240
|
rf_label = 1
|
@@ -280,8 +285,9 @@ module ViralSeq
|
|
280
285
|
count_mut_list = mut_list.count_freq
|
281
286
|
count_mut_list.each do |m,number|
|
282
287
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
288
|
+
fdr = fdr_hash[number].round(5)
|
283
289
|
label = number < cutoff ? "*" : ""
|
284
|
-
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
290
|
+
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
285
291
|
end
|
286
292
|
end
|
287
293
|
|
@@ -291,8 +297,9 @@ module ViralSeq
|
|
291
297
|
count_mut_list = mut_list.count_freq
|
292
298
|
count_mut_list.each do |m,number|
|
293
299
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
300
|
+
fdr = fdr_hash[number].round(5)
|
294
301
|
label = number < cutoff ? "*" : ""
|
295
|
-
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
302
|
+
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
296
303
|
end
|
297
304
|
end
|
298
305
|
|
@@ -365,7 +372,7 @@ module ViralSeq
|
|
365
372
|
# @param (see #sdrm_hiv_pr)
|
366
373
|
# @return (see #sdrm_hiv_pr)
|
367
374
|
|
368
|
-
def sdrm_hiv_in(cutoff = 0)
|
375
|
+
def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
|
369
376
|
sequences = self.dna_hash
|
370
377
|
region = "IN"
|
371
378
|
rf_label = 2
|
@@ -397,8 +404,9 @@ module ViralSeq
|
|
397
404
|
count_mut_list = mut_list.count_freq
|
398
405
|
count_mut_list.each do |m,number|
|
399
406
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
407
|
+
fdr = fdr_hash[number].round(5)
|
400
408
|
label = number < cutoff ? "*" : ""
|
401
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
409
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
402
410
|
end
|
403
411
|
end
|
404
412
|
point_mutation_list.sort_by! {|record| record[2]}
|
data/lib/viral_seq/math.rb
CHANGED
@@ -31,7 +31,7 @@ module ViralSeq
|
|
31
31
|
def rand
|
32
32
|
if (@compute_next_pair = !@compute_next_pair)
|
33
33
|
theta = 2 * ::Math::PI * @rng.call
|
34
|
-
scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
|
34
|
+
scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
|
35
35
|
@g1 = @mean + scale * ::Math.sin(theta)
|
36
36
|
@g0 = @mean + scale * ::Math.cos(theta)
|
37
37
|
else
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -208,6 +208,31 @@ module ViralSeq
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
209
209
|
end
|
210
210
|
|
211
|
+
# Build a new SeqHash holding only the requested stretch of every nt sequence.
# Each sequence is sliced with `[]`, so the range follows Ruby indexing rules.
# Only the sliced nt sequences are handed to the new object.
# @param range [Range] range of positions on the nt sequence
# @return [ViralSeq::SeqHash] a sub SeqHash object

def nt_range(range)
  sliced = self.dna_hash.each_with_object({}) do |(name, seq), acc|
    acc[name] = seq[range]
  end
  ViralSeq::SeqHash.new(sliced)
end # end of #nt_range
|
223
|
+
|
224
|
+
# Report the longest and shortest nt sequence lengths held by the SeqHash object.
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}; both are nil when there are no sequences

def check_nt_size
  lengths = self.dna_hash.values.map(&:size)
  { max: lengths.max, min: lengths.min }
end
|
235
|
+
|
211
236
|
# write the nt sequences to a FASTA format file
|
212
237
|
# @param file [String] path to the FASTA output file
|
213
238
|
# @return [NilClass]
|
@@ -592,6 +617,98 @@ module ViralSeq
|
|
592
617
|
|
593
618
|
alias_method :pm, :poisson_minority_cutoff
|
594
619
|
|
620
|
+
# calculate false detection rate for minority mutations
# R required: both the exact binomial p-values and the adjustment are computed by shelling out to `Rscript`.
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
# @param error_rate [Float] estimated sequencing error rate
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
#   An empty Hash is returned when the object holds no sequences; otherwise the
#   Hash defaults to 0 for frequencies that were not observed.
# @example calculate FDR for mutations that appeared twice in the sample dataset
#   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
#   fdr_hash = my_seqhash.fdr
#   fdr_hash[2].round(5)
#   => 0.00726 # means that mutations appear twice have 0.007261748 chance to be caused by residual errors.

def fdr(error_rate = 0.0001)
  sequences = self.dna_hash.values
  return {} if sequences.empty?

  seq_count = self.size
  # presumably { times_a_variant_was_seen => number_of_such_variants } -- confirm against #variant_for_poisson
  observed_hash = variant_for_poisson(sequences)

  # One unadjusted exact p-value per observed event: under the null, the
  # probability of seeing a variant k or more times among seq_count sequences.
  p_unadjusted = []
  observed_hash.each do |k, v|
    p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f
    p_unadjusted.concat(Array.new(v, p_value))
  end

  # Controls fdr, aka Benjamini-Hochberg correction. p.adjust returns one value
  # per input, in input order.
  p_adjusted = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split.map(&:to_f)

  # Map each frequency to the adjusted value at the start of its own run of
  # inputs. De-duplicating the adjusted values and pairing by index breaks when
  # two different frequencies end up with the same adjusted value.
  fdr_hash = Hash.new(0)
  offset = 0
  observed_hash.each do |k, v|
    next if v.zero?
    adjusted = p_adjusted[offset]
    fdr_hash[k] = adjusted unless adjusted.nil?
    offset += v
  end
  fdr_hash
end #end of #fdr
|
651
|
+
|
652
|
+
# analysis for the nt sequence variants.
# For every position of the sample consensus, counts how often A, C, G, T and "-"
# occur across the sequences and reports the count, frequency with binomial CI,
# whether the count reaches the Poisson minority cut-off, and the FDR for that count.
# @return [Hash] An Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf

def nt_variants
  tcs_number = self.size
  dl = ViralSeq::TcsCore.detection_limit(tcs_number)
  fdr_hash = self.fdr
  pm_cut_off = self.pm
  con = self.consensus
  sequences = self.dna_hash.values

  # NOTE(review): the top-level :pm_cut_off keeps the raw value, while each
  # position stores Infinity as a String -- confirm whether that is intended.
  return_obj = {
    tcs_number: tcs_number,
    lower_detection_limit: dl,
    pm_cut_off: pm_cut_off,
    positions: []
  }
  position_pm = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)

  # The CI depends only on the count (tcs_number is fixed), so compute it once per count.
  cis = Hash.new do |cache, count|
    cache[count] = ViralSeq::Math::BinomCI.new(count, tcs_number)
  end

  con.size.times do |idx|
    position_obj = {
      position: idx + 1,
      tcs_number: tcs_number,
      lower_detection_limit: dl,
      pm_cut_off: position_pm
    }

    freq_hash = sequences.map { |seq| seq[idx] }.count_freq
    [:A, :C, :G, :T, :-].each do |base|
      count = freq_hash[base.to_s]
      entry = { count: count }
      if count > 0
        ci = cis[count]
        entry[:freq] = ci.mean.round(4)
        entry[:freq_ci_low] = ci.lower.round(4)
        entry[:freq_ci_high] = ci.upper.round(4)
        entry[:greater_than_pm] = (count >= pm_cut_off)
        entry[:fdr] = fdr_hash[count]
      else
        entry[:freq] = 0
        entry[:freq_ci_low] = 0
        entry[:freq_ci_high] = 0
        entry[:greater_than_pm] = false
        entry[:fdr] = nil
      end
      position_obj[base] = entry
    end

    return_obj[:positions] << position_obj
  end

  return_obj
end # end of nt_variants
|
711
|
+
|
595
712
|
|
596
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
714
|
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
@@ -1183,6 +1300,28 @@ module ViralSeq
|
|
1183
1300
|
return new_sh
|
1184
1301
|
end
|
1185
1302
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels.
# A unique sequence is flagged only when `compare_with` the consensus gives 4 or
# more AND the consensus side of the MUSCLE alignment contains a gap ("-").
# The aligner is only run for sequences that reach that threshold.
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}

def qc_indel
  con = self.consensus
  flagged, clean = self.dna_hash.uniq_hash.partition do |seq, _names|
    seq.compare_with(con) >= 4 && ViralSeq::Muscle.align(con, seq)[0]["-"]
  end
  names_passed = clean.inject([]) { |all, (_seq, names)| all + names }
  names_indel = flagged.inject([]) { |all, (_seq, names)| all + names }
  return {no_indel: self.sub(names_passed),
          has_indel: self.sub(names_indel)}
end # end of qc_indel
|
1323
|
+
|
1324
|
+
|
1186
1325
|
# trim dna sequences based on the provided reference coordinates.
|
1187
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1188
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -280,6 +280,19 @@ module ViralSeq
|
|
280
280
|
abort infor.red.bold
|
281
281
|
end
|
282
282
|
|
283
|
+
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
# The value is the upper bound of the exact (Clopper-Pearson) two-sided 95%
# confidence interval for 0 observations out of `tcs_number`, the same number
# R's `binom.test(0, n)` reports. With 0 successes that bound has the closed
# form 1 - 0.025^(1/n), so it is computed directly here instead of shelling out
# to Rscript (which also needed the dplyr package and returned 0 whenever the
# R call failed).
# @param tcs_number [Integer] number of TCS
# @return [Float] lower detection limit rounded to 4 decimals; 0.0 when tcs_number is 2 or less
# @example calculate lower detection limit
#   ViralSeq::TcsCore.detection_limit(100)
#   => 0.0362

def detection_limit(tcs_number)
  n = tcs_number.to_f
  return 0.0 unless n > 2

  # 0.025 is alpha / 2 for a 95% two-sided interval.
  (1 - 0.025**(1.0 / n)).round(4)
end
|
295
|
+
|
283
296
|
private
|
284
297
|
|
285
298
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-01-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|