viral_seq 1.2.8 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/README.md +22 -0
- data/bin/tcs +8 -4
- data/bin/tcs_sdrm +9 -6
- data/lib/viral_seq/hivdr.rb +22 -14
- data/lib/viral_seq/math.rb +1 -1
- data/lib/viral_seq/seq_hash.rb +139 -0
- data/lib/viral_seq/tcs_core.rb +13 -0
- data/lib/viral_seq/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
|
4
|
+
data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
|
7
|
+
data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
5
|
-
colorize (
|
6
|
-
combine_pdf (>= 1.0.0)
|
7
|
-
muscle_bio (
|
8
|
-
prawn (>= 2.3.0)
|
9
|
-
prawn-table (>= 0.2.0)
|
4
|
+
viral_seq (1.3.0)
|
5
|
+
colorize (~> 0.1)
|
6
|
+
combine_pdf (~> 1.0, >= 1.0.0)
|
7
|
+
muscle_bio (~> 0.4)
|
8
|
+
prawn (~> 2.3, >= 2.3.0)
|
9
|
+
prawn-table (~> 0.2, >= 0.2.0)
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
data/README.md
CHANGED
@@ -179,6 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
|
|
179
179
|
|
180
180
|
## Updates
|
181
181
|
|
182
|
+
### Version 1.5.0-01042022
|
183
|
+
|
184
|
+
1. Added a function to calculate the detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
185
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
186
|
+
3. Added a function to quality-check DNA sequences against the sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
187
|
+
4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
188
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
189
|
+
|
190
|
+
### Version 1.4.0-10132021
|
191
|
+
|
192
|
+
1. Added a function to calculate the false detection rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
193
|
+
2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
|
194
|
+
|
195
|
+
### Version 1.3.0-08302021
|
196
|
+
|
197
|
+
1. Fixed a bug in the `tcs` pipeline.
|
198
|
+
|
199
|
+
### Version 1.2.9-08022021
|
200
|
+
|
201
|
+
1. Fixed a bug when reading input primer sequences in lowercase.
|
202
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
203
|
+
|
182
204
|
### Version 1.2.8-07292021
|
183
205
|
|
184
206
|
1. Fixed an issue when reading .fastq files containing blank_lines.
|
data/bin/tcs
CHANGED
@@ -152,8 +152,8 @@ begin
|
|
152
152
|
primer[:region] ? region = primer[:region] : region = "region"
|
153
153
|
summary_json[:primer_set_name] = region
|
154
154
|
|
155
|
-
cdna_primer = primer[:cdna]
|
156
|
-
forward_primer = primer[:forward]
|
155
|
+
cdna_primer = primer[:cdna].upcase
|
156
|
+
forward_primer = primer[:forward].upcase
|
157
157
|
|
158
158
|
export_raw = primer[:export_raw]
|
159
159
|
limit_raw = primer[:limit_raw]
|
@@ -200,7 +200,7 @@ begin
|
|
200
200
|
summary_json[:paired_raw_sequence] = paired_seq_number
|
201
201
|
if paired_seq_number < raw_sequence_number * 0.001
|
202
202
|
summary_json[:warnings] <<
|
203
|
-
"WARNING: Filtered raw
|
203
|
+
"WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
|
204
204
|
end
|
205
205
|
|
206
206
|
common_keys.each do |seqtag|
|
@@ -401,7 +401,11 @@ begin
|
|
401
401
|
when 4
|
402
402
|
joined_sh = shp.join2(model: :indiv)
|
403
403
|
end
|
404
|
-
|
404
|
+
if joined_sh
|
405
|
+
return joined_sh
|
406
|
+
else
|
407
|
+
joined_sh = ViralSeq::SeqHash.new
|
408
|
+
end
|
405
409
|
end
|
406
410
|
|
407
411
|
if primer[:end_join]
|
data/bin/tcs_sdrm
CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
|
|
91
91
|
point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
|
92
92
|
point_mutation_out = File.open(point_mutation_file, "w")
|
93
93
|
point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
|
94
|
-
"number,
|
94
|
+
"number,frequency,95% CI low,95% CI high,fdr,notes"
|
95
95
|
|
96
96
|
linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
|
97
97
|
linkage_out = File.open(linkage_file, "w")
|
98
98
|
linkage_out.puts "region,TCS,mutation linkage,number," +
|
99
|
-
"
|
99
|
+
"frequency,95% CI low, 95% CI high, notes"
|
100
100
|
|
101
101
|
aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
|
102
102
|
aa_report_out = File.open(aa_report_file, "w")
|
@@ -132,6 +132,7 @@ libs.each do |lib|
|
|
132
132
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
133
133
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
134
134
|
poisson_minority_cutoff = filtered_seqs.pm
|
135
|
+
fdr_hash = filtered_seqs.fdr
|
135
136
|
summary_hash[:PR] = [
|
136
137
|
seqs.size.to_s,
|
137
138
|
a3g_seqs.size.to_s,
|
@@ -142,7 +143,7 @@ libs.each do |lib|
|
|
142
143
|
next if filtered_seqs.size < 3
|
143
144
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
144
145
|
|
145
|
-
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
|
146
|
+
sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
|
146
147
|
point_mutation_list += sdrm[0]
|
147
148
|
linkage_list += sdrm[1]
|
148
149
|
aa_report_list += sdrm[2]
|
@@ -155,6 +156,7 @@ libs.each do |lib|
|
|
155
156
|
stop_codon_seqs = stop_codon_check[:with_stop_codon]
|
156
157
|
filtered_seqs = stop_codon_check[:without_stop_codon]
|
157
158
|
poisson_minority_cutoff = filtered_seqs.pm
|
159
|
+
fdr_hash = filtered_seqs.fdr
|
158
160
|
summary_hash[:IN] = [
|
159
161
|
seqs.size.to_s,
|
160
162
|
a3g_seqs.size.to_s,
|
@@ -165,7 +167,7 @@ libs.each do |lib|
|
|
165
167
|
next if filtered_seqs.size < 3
|
166
168
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
167
169
|
|
168
|
-
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
|
170
|
+
sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
|
169
171
|
point_mutation_list += sdrm[0]
|
170
172
|
linkage_list += sdrm[1]
|
171
173
|
aa_report_list += sdrm[2]
|
@@ -190,6 +192,7 @@ libs.each do |lib|
|
|
190
192
|
reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
|
191
193
|
filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
|
192
194
|
poisson_minority_cutoff = filtered_seqs.pm
|
195
|
+
fdr_hash = filtered_seqs.fdr
|
193
196
|
summary_hash[:RT] = [
|
194
197
|
seqs.size.to_s,
|
195
198
|
hypermut_seq_keys.size.to_s,
|
@@ -200,7 +203,7 @@ libs.each do |lib|
|
|
200
203
|
next if filtered_seqs.size < 3
|
201
204
|
filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
|
202
205
|
|
203
|
-
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
|
206
|
+
sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
|
204
207
|
point_mutation_list += sdrm[0]
|
205
208
|
linkage_list += sdrm[1]
|
206
209
|
aa_report_list += sdrm[2]
|
@@ -346,7 +349,7 @@ libs.each do |lib|
|
|
346
349
|
title: "Surveillance Drug Resistance Mutations",
|
347
350
|
file: point_mutation_file,
|
348
351
|
newPDF: "",
|
349
|
-
table_width: [
|
352
|
+
table_width: [60,50,70,65,65,60,75,70,70,70,45],
|
350
353
|
extra_text: "* Mutation below Poisson cut-off for minority mutations"
|
351
354
|
},
|
352
355
|
{
|
data/lib/viral_seq/hivdr.rb
CHANGED
@@ -9,10 +9,13 @@ module ViralSeq
|
|
9
9
|
# IN codon 53-174 (HXB2 4384-4751)
|
10
10
|
# @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
|
11
11
|
# can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
|
12
|
+
# @param fdr_hash [Hash] hash of events => (false detection rate)
|
13
|
+
# can be obtained using ViralSeq::SeqHash#fdr
|
14
|
+
#
|
12
15
|
# @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
|
13
16
|
#
|
14
17
|
# # point_mutation_list: two-dimensional array for the following information,
|
15
|
-
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
|
18
|
+
# # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
|
16
19
|
# # linkage_list: two-dimensional array for the following information,
|
17
20
|
# # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
|
18
21
|
# # report_list: two-dimensional array for the following information,
|
@@ -20,12 +23,13 @@ module ViralSeq
|
|
20
23
|
# @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
|
21
24
|
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
|
22
25
|
# p_cut_off = my_seqhash.pm
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# =>
|
27
|
-
# => PR,396,
|
28
|
-
# => PR,396,
|
26
|
+
# fdr_hash = my_seqhash.fdr
|
27
|
+
# pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
|
28
|
+
# puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
|
29
|
+
# => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
|
30
|
+
# => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
|
31
|
+
# => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
|
32
|
+
# => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
|
29
33
|
#
|
30
34
|
# puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
|
31
35
|
# => region,tcs_number,linkage,count,%,CI_low,CI_high,label
|
@@ -136,7 +140,7 @@ module ViralSeq
|
|
136
140
|
# => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
|
137
141
|
# => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
|
138
142
|
|
139
|
-
def sdrm_hiv_pr(cutoff = 0)
|
143
|
+
def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
|
140
144
|
sequences = self.dna_hash
|
141
145
|
region = "PR"
|
142
146
|
rf_label = 0
|
@@ -167,8 +171,9 @@ module ViralSeq
|
|
167
171
|
count_mut_list = mut_list.count_freq
|
168
172
|
count_mut_list.each do |m,number|
|
169
173
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
174
|
+
fdr = fdr_hash[number].round(5)
|
170
175
|
label = number < cutoff ? "*" : ""
|
171
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
176
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
172
177
|
end
|
173
178
|
end
|
174
179
|
point_mutation_list.sort_by! {|record| record[2]}
|
@@ -229,7 +234,7 @@ module ViralSeq
|
|
229
234
|
# @param (see #sdrm_hiv_pr)
|
230
235
|
# @return (see #sdrm_hiv_pr)
|
231
236
|
|
232
|
-
def sdrm_hiv_rt(cutoff = 0)
|
237
|
+
def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
|
233
238
|
sequences = self.dna_hash
|
234
239
|
region = "RT"
|
235
240
|
rf_label = 1
|
@@ -280,8 +285,9 @@ module ViralSeq
|
|
280
285
|
count_mut_list = mut_list.count_freq
|
281
286
|
count_mut_list.each do |m,number|
|
282
287
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
288
|
+
fdr = fdr_hash[number].round(5)
|
283
289
|
label = number < cutoff ? "*" : ""
|
284
|
-
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
290
|
+
point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
285
291
|
end
|
286
292
|
end
|
287
293
|
|
@@ -291,8 +297,9 @@ module ViralSeq
|
|
291
297
|
count_mut_list = mut_list.count_freq
|
292
298
|
count_mut_list.each do |m,number|
|
293
299
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
300
|
+
fdr = fdr_hash[number].round(5)
|
294
301
|
label = number < cutoff ? "*" : ""
|
295
|
-
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
302
|
+
point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
296
303
|
end
|
297
304
|
end
|
298
305
|
|
@@ -365,7 +372,7 @@ module ViralSeq
|
|
365
372
|
# @param (see #sdrm_hiv_pr)
|
366
373
|
# @return (see #sdrm_hiv_pr)
|
367
374
|
|
368
|
-
def sdrm_hiv_in(cutoff = 0)
|
375
|
+
def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
|
369
376
|
sequences = self.dna_hash
|
370
377
|
region = "IN"
|
371
378
|
rf_label = 2
|
@@ -397,8 +404,9 @@ module ViralSeq
|
|
397
404
|
count_mut_list = mut_list.count_freq
|
398
405
|
count_mut_list.each do |m,number|
|
399
406
|
ci = ViralSeq::Math::BinomCI.new(number, n_seq)
|
407
|
+
fdr = fdr_hash[number].round(5)
|
400
408
|
label = number < cutoff ? "*" : ""
|
401
|
-
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
|
409
|
+
point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
|
402
410
|
end
|
403
411
|
end
|
404
412
|
point_mutation_list.sort_by! {|record| record[2]}
|
data/lib/viral_seq/math.rb
CHANGED
@@ -31,7 +31,7 @@ module ViralSeq
|
|
31
31
|
def rand
|
32
32
|
if (@compute_next_pair = !@compute_next_pair)
|
33
33
|
theta = 2 * ::Math::PI * @rng.call
|
34
|
-
scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
|
34
|
+
scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
|
35
35
|
@g1 = @mean + scale * ::Math.sin(theta)
|
36
36
|
@g0 = @mean + scale * ::Math.cos(theta)
|
37
37
|
else
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -208,6 +208,31 @@ module ViralSeq
|
|
208
208
|
return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
|
209
209
|
end
|
210
210
|
|
211
|
+
# return a new SeqHash object containing only the given range of positions of each nt sequence
|
212
|
+
# @param range [Range] range of positions on the nt sequence
|
213
|
+
# @return [ViralSeq::SeqHash] a sub SeqHash object
|
214
|
+
|
215
|
+
def nt_range(range)
|
216
|
+
dna_hash = self.dna_hash
|
217
|
+
new_hash = {}
|
218
|
+
dna_hash.each do |k,v|
|
219
|
+
new_hash[k] = v[range]
|
220
|
+
end
|
221
|
+
ViralSeq::SeqHash.new(new_hash)
|
222
|
+
end # end of #nt_range
|
223
|
+
|
224
|
+
# check the size range of the DNA sequences of the SeqHash object
|
225
|
+
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
|
226
|
+
|
227
|
+
def check_nt_size
|
228
|
+
dna_hash = self.dna_hash
|
229
|
+
size_array = []
|
230
|
+
dna_hash.values.each do |v|
|
231
|
+
size_array << v.size
|
232
|
+
end
|
233
|
+
return { max: size_array.max, min: size_array.min }
|
234
|
+
end
|
235
|
+
|
211
236
|
# write the nt sequences to a FASTA format file
|
212
237
|
# @param file [String] path to the FASTA output file
|
213
238
|
# @return [NilClass]
|
@@ -592,6 +617,98 @@ module ViralSeq
|
|
592
617
|
|
593
618
|
alias_method :pm, :poisson_minority_cutoff
|
594
619
|
|
620
|
+
# calculate false detection rate for minority mutations
|
621
|
+
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
|
622
|
+
# @param error_rate [Float] estimated sequencing error rate
|
623
|
+
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
|
624
|
+
# @example calculate FDR for mutations that appeared twice in the sample dataset
|
625
|
+
# my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
|
626
|
+
# fdr_hash = my_seqhash.fdr
|
627
|
+
# fdr_hash[2].round(5)
|
628
|
+
# => 0.00726 # means that mutations that appear twice have a 0.007261748 chance of being caused by residual errors.
|
629
|
+
|
630
|
+
def fdr(error_rate = 0.0001)
|
631
|
+
sequences = self.dna_hash.values
|
632
|
+
if sequences.size == 0
|
633
|
+
return {}
|
634
|
+
else
|
635
|
+
seq_count = self.size
|
636
|
+
observed_hash = variant_for_poisson(sequences)
|
637
|
+
p_unadjusted = []
|
638
|
+
observed_hash.each do |k, v|
|
639
|
+
p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
|
640
|
+
p_unadjusted += Array.new(v, p_value)
|
641
|
+
end
|
642
|
+
p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
|
643
|
+
vars_pair = observed_hash.to_a
|
644
|
+
fdr_hash = Hash.new(0)
|
645
|
+
(0..(p_fdr.size - 1)).each do |i|
|
646
|
+
fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
|
647
|
+
end
|
648
|
+
return fdr_hash
|
649
|
+
end
|
650
|
+
end #end of #fdr
|
651
|
+
|
652
|
+
# analysis for the nt sequence variants.
|
653
|
+
# @return [Hash] A Hash with information from the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
|
654
|
+
|
655
|
+
def nt_variants
|
656
|
+
return_obj = {}
|
657
|
+
nt_hash = self.dna_hash
|
658
|
+
tcs_number = self.size
|
659
|
+
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
660
|
+
fdr_hash = self.fdr
|
661
|
+
pm_cut_off = self.pm
|
662
|
+
con = self.consensus
|
663
|
+
return_obj[:tcs_number] = tcs_number
|
664
|
+
return_obj[:lower_detection_limit] = dl
|
665
|
+
return_obj[:pm_cut_off] = pm_cut_off
|
666
|
+
return_obj[:positions] = []
|
667
|
+
cis = {}
|
668
|
+
|
669
|
+
(0..(con.size - 1)).each do |p|
|
670
|
+
position_obj = {}
|
671
|
+
position_obj[:position] = p + 1
|
672
|
+
position_obj[:tcs_number] = tcs_number
|
673
|
+
position_obj[:lower_detection_limit] = dl
|
674
|
+
position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
|
675
|
+
|
676
|
+
nts = []
|
677
|
+
dna_hash.each do |n,s|
|
678
|
+
nts << s[p]
|
679
|
+
end
|
680
|
+
freq_hash = nts.count_freq
|
681
|
+
[:A, :C, :G, :T, :-].each do |k|
|
682
|
+
v = freq_hash[k.to_s]
|
683
|
+
position_obj[k] = {}
|
684
|
+
position_obj[k][:count] = v
|
685
|
+
if v > 0
|
686
|
+
if cis[[v, tcs_number]]
|
687
|
+
ci = cis[[v, tcs_number]]
|
688
|
+
else
|
689
|
+
ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
|
690
|
+
cis[[v, tcs_number]] = ci
|
691
|
+
end
|
692
|
+
position_obj[k][:freq] = ci.mean.round(4)
|
693
|
+
position_obj[k][:freq_ci_low] = ci.lower.round(4)
|
694
|
+
position_obj[k][:freq_ci_high] = ci.upper.round(4)
|
695
|
+
position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
|
696
|
+
position_obj[k][:fdr] = fdr_hash[v]
|
697
|
+
else
|
698
|
+
position_obj[k][:freq] = 0
|
699
|
+
position_obj[k][:freq_ci_low] = 0
|
700
|
+
position_obj[k][:freq_ci_high] = 0
|
701
|
+
position_obj[k][:greater_than_pm] = false
|
702
|
+
position_obj[k][:fdr] = nil
|
703
|
+
end
|
704
|
+
end
|
705
|
+
|
706
|
+
return_obj[:positions] << position_obj
|
707
|
+
end
|
708
|
+
|
709
|
+
return_obj
|
710
|
+
end # end of nt_variants
|
711
|
+
|
595
712
|
|
596
713
|
# align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
|
597
714
|
# @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
|
@@ -1183,6 +1300,28 @@ module ViralSeq
|
|
1183
1300
|
return new_sh
|
1184
1301
|
end
|
1185
1302
|
|
1303
|
+
# QC for each nucleotide sequence comparing with sample consensus for indels
|
1304
|
+
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
|
1305
|
+
|
1306
|
+
def qc_indel
|
1307
|
+
con = self.consensus
|
1308
|
+
dna_hash = self.dna_hash
|
1309
|
+
names_passed = []
|
1310
|
+
names_indel = []
|
1311
|
+
dna_hash.uniq_hash.each do |seq, names|
|
1312
|
+
if seq.compare_with(con) < 4
|
1313
|
+
names_passed += names
|
1314
|
+
elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
|
1315
|
+
names_indel += names
|
1316
|
+
else
|
1317
|
+
names_passed += names
|
1318
|
+
end
|
1319
|
+
end
|
1320
|
+
return {no_indel: self.sub(names_passed),
|
1321
|
+
has_indel: self.sub(names_indel)}
|
1322
|
+
end # end of qc_indel
|
1323
|
+
|
1324
|
+
|
1186
1325
|
# trim dna sequences based on the provided reference coordinates.
|
1187
1326
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
1188
1327
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -280,6 +280,19 @@ module ViralSeq
|
|
280
280
|
abort infor.red.bold
|
281
281
|
end
|
282
282
|
|
283
|
+
# lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
|
284
|
+
# R required.
|
285
|
+
# @param tcs_number [Integer] number of TCS
|
286
|
+
# @return [Float] lower detection limit
|
287
|
+
# @example calculate lower detection limit
|
288
|
+
# ViralSeq::TcsCore.detection_limit(100)
|
289
|
+
# => 0.0362
|
290
|
+
|
291
|
+
def detection_limit(tcs_number)
|
292
|
+
dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
|
293
|
+
dl.to_f
|
294
|
+
end
|
295
|
+
|
283
296
|
private
|
284
297
|
|
285
298
|
def unzip_r(indir, f)
|
data/lib/viral_seq/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-01-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|