viral_seq 1.2.8 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb2f3be04857f96f8cf15b02e67a585771c04d45d4e4be68e566b4226342b5f0
4
- data.tar.gz: 2c36ff5494bcf415796a5a8b10da22721a6ea6f574a97601d0a9ad487236e70e
3
+ metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
4
+ data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
5
5
  SHA512:
6
- metadata.gz: abc5622ae5dc8d5e1343f8e85557fc03e169146a34f6d980bf0d5db4dc973d41d81cc89bb0ce9cc0dd5a090d1c58357834431d9d1ad106f31c3a8f0a739ce4b9
7
- data.tar.gz: a890a557536ff43073258220801bdc8b6294b3c22bc8ec030f245bb9acccfeaebf8f0d3b228233de4e980558e88c953143d57f1ff3bc2a5b366026d7371df67d
6
+ metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
7
+ data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.1.1)
5
- colorize (>= 0.1)
6
- combine_pdf (>= 1.0.0)
7
- muscle_bio (>= 0.4)
8
- prawn (>= 2.3.0)
9
- prawn-table (>= 0.2.0)
4
+ viral_seq (1.3.0)
5
+ colorize (~> 0.1)
6
+ combine_pdf (~> 1.0, >= 1.0.0)
7
+ muscle_bio (~> 0.4)
8
+ prawn (~> 2.3, >= 2.3.0)
9
+ prawn-table (~> 0.2, >= 0.2.0)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -179,6 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.5.0-01042022
183
+
184
+ 1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
185
+ 2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
186
+ 3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
187
+ 4. Added a function for DNA variant analysis. Returns a Hash object that can be output as a JSON file. `ViralSeq::SeqHash#nt_variants`
188
+ 5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
189
+
190
+ ### Version 1.4.0-10132021
191
+
192
+ 1. Added a function to calculate false detection rate (FDR, aka, Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
193
+ 2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
194
+
195
+ ### Version 1.3.0-08302021
196
+
197
+ 1. Fixed a bug in the `tcs` pipeline.
198
+
199
+ ### Version 1.2.9-08022021
200
+
201
+ 1. Fixed a bug when reading the input primer sequences in lowercase.
202
+ 2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
203
+
182
204
  ### Version 1.2.8-07292021
183
205
 
184
206
  1. Fixed an issue when reading .fastq files containing blank_lines.
data/bin/tcs CHANGED
@@ -152,8 +152,8 @@ begin
152
152
  primer[:region] ? region = primer[:region] : region = "region"
153
153
  summary_json[:primer_set_name] = region
154
154
 
155
- cdna_primer = primer[:cdna]
156
- forward_primer = primer[:forward]
155
+ cdna_primer = primer[:cdna].upcase
156
+ forward_primer = primer[:forward].upcase
157
157
 
158
158
  export_raw = primer[:export_raw]
159
159
  limit_raw = primer[:limit_raw]
@@ -200,7 +200,7 @@ begin
200
200
  summary_json[:paired_raw_sequence] = paired_seq_number
201
201
  if paired_seq_number < raw_sequence_number * 0.001
202
202
  summary_json[:warnings] <<
203
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
203
+ "WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
204
204
  end
205
205
 
206
206
  common_keys.each do |seqtag|
@@ -401,7 +401,11 @@ begin
401
401
  when 4
402
402
  joined_sh = shp.join2(model: :indiv)
403
403
  end
404
- return joined_sh
404
+ if joined_sh
405
+ return joined_sh
406
+ else
407
+ joined_sh = ViralSeq::SeqHash.new
408
+ end
405
409
  end
406
410
 
407
411
  if primer[:end_join]
data/bin/tcs_sdrm CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
91
91
  point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
92
92
  point_mutation_out = File.open(point_mutation_file, "w")
93
93
  point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
94
- "number,percentage,95% CI low, 95% CI high, notes"
94
+ "number,frequency,95% CI low,95% CI high,fdr,notes"
95
95
 
96
96
  linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
97
97
  linkage_out = File.open(linkage_file, "w")
98
98
  linkage_out.puts "region,TCS,mutation linkage,number," +
99
- "percentage,95% CI low, 95% CI high, notes"
99
+ "frequency,95% CI low, 95% CI high, notes"
100
100
 
101
101
  aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
102
102
  aa_report_out = File.open(aa_report_file, "w")
@@ -132,6 +132,7 @@ libs.each do |lib|
132
132
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
133
133
  filtered_seqs = stop_codon_check[:without_stop_codon]
134
134
  poisson_minority_cutoff = filtered_seqs.pm
135
+ fdr_hash = filtered_seqs.fdr
135
136
  summary_hash[:PR] = [
136
137
  seqs.size.to_s,
137
138
  a3g_seqs.size.to_s,
@@ -142,7 +143,7 @@ libs.each do |lib|
142
143
  next if filtered_seqs.size < 3
143
144
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
144
145
 
145
- sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
146
+ sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
146
147
  point_mutation_list += sdrm[0]
147
148
  linkage_list += sdrm[1]
148
149
  aa_report_list += sdrm[2]
@@ -155,6 +156,7 @@ libs.each do |lib|
155
156
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
156
157
  filtered_seqs = stop_codon_check[:without_stop_codon]
157
158
  poisson_minority_cutoff = filtered_seqs.pm
159
+ fdr_hash = filtered_seqs.fdr
158
160
  summary_hash[:IN] = [
159
161
  seqs.size.to_s,
160
162
  a3g_seqs.size.to_s,
@@ -165,7 +167,7 @@ libs.each do |lib|
165
167
  next if filtered_seqs.size < 3
166
168
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
167
169
 
168
- sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
170
+ sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
169
171
  point_mutation_list += sdrm[0]
170
172
  linkage_list += sdrm[1]
171
173
  aa_report_list += sdrm[2]
@@ -190,6 +192,7 @@ libs.each do |lib|
190
192
  reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
191
193
  filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
192
194
  poisson_minority_cutoff = filtered_seqs.pm
195
+ fdr_hash = filtered_seqs.fdr
193
196
  summary_hash[:RT] = [
194
197
  seqs.size.to_s,
195
198
  hypermut_seq_keys.size.to_s,
@@ -200,7 +203,7 @@ libs.each do |lib|
200
203
  next if filtered_seqs.size < 3
201
204
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
202
205
 
203
- sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
206
+ sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
204
207
  point_mutation_list += sdrm[0]
205
208
  linkage_list += sdrm[1]
206
209
  aa_report_list += sdrm[2]
@@ -346,7 +349,7 @@ libs.each do |lib|
346
349
  title: "Surveillance Drug Resistance Mutations",
347
350
  file: point_mutation_file,
348
351
  newPDF: "",
349
- table_width: [65,55,85,80,60,65,85,85,85,45],
352
+ table_width: [60,50,70,65,65,60,75,70,70,70,45],
350
353
  extra_text: "* Mutation below Poisson cut-off for minority mutations"
351
354
  },
352
355
  {
@@ -9,10 +9,13 @@ module ViralSeq
9
9
  # IN codon 53-174 (HXB2 4384-4751)
10
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
11
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
12
+ # @param fdr [Hash] hash of events => (false detection rate)
13
+ # can be obtained using ViralSeq::SeqHash#fdr
14
+ #
12
15
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
13
16
  #
14
17
  # # point_mutation_list: two-dimensional array for the following information,
15
- # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
18
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
16
19
  # # linkage_list: two-dimensional array for the following information,
17
20
  # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
18
21
  # # report_list: two-dimensional array for the following information,
@@ -20,12 +23,13 @@ module ViralSeq
20
23
  # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
21
24
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
22
25
  # p_cut_off = my_seqhash.pm
23
- # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
24
- # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
25
- # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
26
- # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
27
- # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
28
- # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
26
+ # fdr_hash = my_seqhash.fdr
27
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
28
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
30
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
31
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
32
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
29
33
  #
30
34
  # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
31
35
  # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
@@ -136,7 +140,7 @@ module ViralSeq
136
140
  # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
137
141
  # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
142
 
139
- def sdrm_hiv_pr(cutoff = 0)
143
+ def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
140
144
  sequences = self.dna_hash
141
145
  region = "PR"
142
146
  rf_label = 0
@@ -167,8 +171,9 @@ module ViralSeq
167
171
  count_mut_list = mut_list.count_freq
168
172
  count_mut_list.each do |m,number|
169
173
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
174
+ fdr = fdr_hash[number].round(5)
170
175
  label = number < cutoff ? "*" : ""
171
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
176
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
172
177
  end
173
178
  end
174
179
  point_mutation_list.sort_by! {|record| record[2]}
@@ -229,7 +234,7 @@ module ViralSeq
229
234
  # @param (see #sdrm_hiv_pr)
230
235
  # @return (see #sdrm_hiv_pr)
231
236
 
232
- def sdrm_hiv_rt(cutoff = 0)
237
+ def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
233
238
  sequences = self.dna_hash
234
239
  region = "RT"
235
240
  rf_label = 1
@@ -280,8 +285,9 @@ module ViralSeq
280
285
  count_mut_list = mut_list.count_freq
281
286
  count_mut_list.each do |m,number|
282
287
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
288
+ fdr = fdr_hash[number].round(5)
283
289
  label = number < cutoff ? "*" : ""
284
- point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
290
+ point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
285
291
  end
286
292
  end
287
293
 
@@ -291,8 +297,9 @@ module ViralSeq
291
297
  count_mut_list = mut_list.count_freq
292
298
  count_mut_list.each do |m,number|
293
299
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
300
+ fdr = fdr_hash[number].round(5)
294
301
  label = number < cutoff ? "*" : ""
295
- point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
302
+ point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
296
303
  end
297
304
  end
298
305
 
@@ -365,7 +372,7 @@ module ViralSeq
365
372
  # @param (see #sdrm_hiv_pr)
366
373
  # @return (see #sdrm_hiv_pr)
367
374
 
368
- def sdrm_hiv_in(cutoff = 0)
375
+ def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
369
376
  sequences = self.dna_hash
370
377
  region = "IN"
371
378
  rf_label = 2
@@ -397,8 +404,9 @@ module ViralSeq
397
404
  count_mut_list = mut_list.count_freq
398
405
  count_mut_list.each do |m,number|
399
406
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
407
+ fdr = fdr_hash[number].round(5)
400
408
  label = number < cutoff ? "*" : ""
401
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
409
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
402
410
  end
403
411
  end
404
412
  point_mutation_list.sort_by! {|record| record[2]}
@@ -31,7 +31,7 @@ module ViralSeq
31
31
  def rand
32
32
  if (@compute_next_pair = !@compute_next_pair)
33
33
  theta = 2 * ::Math::PI * @rng.call
34
- scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
34
+ scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
35
35
  @g1 = @mean + scale * ::Math.sin(theta)
36
36
  @g0 = @mean + scale * ::Math.cos(theta)
37
37
  else
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
211
# Return a new SeqHash object holding only the given slice of every nt sequence.
# Only the DNA sequences are passed to the new object.
# @param range [Range] range of positions on the nt sequence. It is applied to each
#   sequence with `[]`, so for String sequences the positions are zero-based.
# @return [ViralSeq::SeqHash] a sub SeqHash object
# @note with String sequences, a range that starts past the end of a sequence
#   yields nil for that sequence (String#[] semantics).

def nt_range(range)
  sliced = self.dna_hash.transform_values { |seq| seq[range] }
  ViralSeq::SeqHash.new(sliced)
end # end of #nt_range
223
+
224
# Check the size range of the DNA sequences of the SeqHash object.
# @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE};
#   both values are nil when the object holds no sequences.

def check_nt_size
  min_size, max_size = self.dna_hash.values.map(&:size).minmax
  { max: max_size, min: min_size }
end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -592,6 +617,98 @@ module ViralSeq
592
617
 
593
618
  alias_method :pm, :poisson_minority_cutoff
594
619
 
620
# Calculate the false detection rate (FDR) for minority mutations.
# Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
# The `Rscript` executable is invoked for `pbinom` and `p.adjust`, so R is required.
# @param error_rate [Float] estimated sequencing error rate
# @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr).
#   Frequencies not present in the hash read as 0.
# @example calculate FDR for mutations that appeared twice in the sample dataset
#   my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
#   fdr_hash = my_seqhash.fdr
#   fdr_hash[2].round(5)
#   => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.

def fdr(error_rate = 0.0001)
  sequences = self.dna_hash.values
  # Same default (0) as the populated hash below, so lookups behave consistently.
  return Hash.new(0) if sequences.empty?

  seq_count = self.size
  # presumably maps "times a variant was observed" => "number of variants observed
  # that many times"; this is how the values are used below.
  observed_hash = variant_for_poisson(sequences)

  # One unadjusted exact p-value per observed variant: under the null, the
  # probability of seeing a variant k or more times among seq_count sequences.
  p_unadjusted = []
  observed_hash.each do |k, v|
    p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f
    p_unadjusted.concat(Array.new(v, p_value))
  end

  # Benjamini-Hochberg correction; p.adjust returns values in input order.
  p_adjusted = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split.map(&:to_f)

  # Each frequency k contributed v consecutive entries, so its adjusted value
  # sits at a running offset. Pairing by offset (rather than by distinct value)
  # stays correct when different frequencies share the same adjusted value.
  fdr_hash = Hash.new(0)
  offset = 0
  observed_hash.each do |k, v|
    fdr_hash[k] = p_adjusted[offset] if v > 0 && offset < p_adjusted.size
    offset += v
  end
  fdr_hash
end #end of #fdr
651
+
652
# Analysis for the nt sequence variants.
# For every position of the sample consensus, reports the count, frequency,
# confidence interval, Poisson cut-off status and FDR of A, C, G, T and "-".
# @return [Hash] A Hash with information of variant analysis. Key/values of the return object see /docs/variants_structure.pdf
# @note an infinite Poisson cut-off is reported as a String (`Float::INFINITY.to_s`)
#   both at the top level and per position, so the whole object can be serialized to JSON.

def nt_variants
  nt_hash = self.dna_hash
  tcs_number = self.size
  dl = ViralSeq::TcsCore.detection_limit(tcs_number)
  fdr_hash = self.fdr
  pm_cut_off = self.pm
  con = self.consensus

  # JSON has no representation for Infinity; export it as a String instead.
  pm_cut_off_out = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)

  return_obj = {
    tcs_number: tcs_number,
    lower_detection_limit: dl,
    pm_cut_off: pm_cut_off_out,
    positions: []
  }

  # tcs_number is fixed for the whole run, so the count alone identifies a CI.
  ci_cache = {}

  con.size.times do |p|
    position_obj = {
      position: p + 1, # reported positions are 1-based
      tcs_number: tcs_number,
      lower_detection_limit: dl,
      pm_cut_off: pm_cut_off_out
    }

    freq_hash = nt_hash.values.map { |s| s[p] }.count_freq

    [:A, :C, :G, :T, :-].each do |k|
      v = freq_hash[k.to_s]
      position_obj[k] =
        if v > 0
          ci = (ci_cache[v] ||= ViralSeq::Math::BinomCI.new(v, tcs_number))
          {
            count: v,
            freq: ci.mean.round(4),
            freq_ci_low: ci.lower.round(4),
            freq_ci_high: ci.upper.round(4),
            greater_than_pm: v >= pm_cut_off, # compared against the numeric cut-off
            fdr: fdr_hash[v]
          }
        else
          {
            count: v,
            freq: 0,
            freq_ci_low: 0,
            freq_ci_high: 0,
            greater_than_pm: false,
            fdr: nil
          }
        end
    end

    return_obj[:positions] << position_obj
  end

  return_obj
end # end of nt_variants
711
+
595
712
 
596
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
597
714
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -1183,6 +1300,28 @@ module ViralSeq
1183
1300
  return new_sh
1184
1301
  end
1185
1302
 
1303
# Quality check of each nucleotide sequence against the sample consensus for indels.
# Unique sequences for which `compare_with` the consensus returns less than 4 pass
# without being aligned. The others are aligned to the consensus with
# ViralSeq::Muscle.align and are flagged when the first element of the alignment
# result contains a "-".
# @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}

def qc_indel
  consensus_seq = self.consensus
  sequences = self.dna_hash
  passed = []
  flagged = []
  sequences.uniq_hash.each do |seq, names|
    near_consensus = seq.compare_with(consensus_seq) < 4
    # the alignment only runs for sequences that are not close to the consensus
    if !near_consensus && ViralSeq::Muscle.align(consensus_seq, seq)[0]["-"]
      flagged += names
    else
      passed += names
    end
  end
  { no_indel: self.sub(passed), has_indel: self.sub(flagged) }
end # end of qc_indel
1323
+
1324
+
1186
1325
  # trim dna sequences based on the provided reference coordinates.
1187
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1188
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -280,6 +280,19 @@ module ViralSeq
280
280
  abort infor.red.bold
281
281
  end
282
282
 
283
# Lower detection sensitivity for minority mutations given the number of TCS,
# calculated based on the binomial distribution.
# The value is the upper bound of the two-sided 95% exact (Clopper-Pearson)
# confidence interval for 0 observations out of `tcs_number`, which has the closed
# form 1 - 0.025^(1/n). It is computed in Ruby, so this method needs neither R nor dplyr.
# @param tcs_number [Integer] number of TCS
# @return [Float] lower detection limit rounded to 4 digits; 0.0 when tcs_number is 2 or less
# @example calculate lower detection limit
#   ViralSeq::TcsCore.detection_limit(100)
#   => 0.0362

def detection_limit(tcs_number)
  return 0.0 unless tcs_number > 2

  # 0.025 is alpha / 2 for a 95% interval
  (1 - 0.025**(1.0 / tcs_number)).round(4)
end
295
+
283
296
  private
284
297
 
285
298
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.8"
6
- TCS_VERSION = "2.3.7"
5
+ VERSION = "1.5.0"
6
+ TCS_VERSION = "2.4.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.8
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-30 00:00:00.000000000 Z
12
+ date: 2022-01-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler