viral_seq 1.2.8 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb2f3be04857f96f8cf15b02e67a585771c04d45d4e4be68e566b4226342b5f0
4
- data.tar.gz: 2c36ff5494bcf415796a5a8b10da22721a6ea6f574a97601d0a9ad487236e70e
3
+ metadata.gz: c678fb3b1c37bd996ccf65f1b062a044e60eee32c01b0d75f7c9b7859c3136dd
4
+ data.tar.gz: 7e42acba2e2ae0e3f17a2786cfa1b5b4a376e0b5723c73ed83d32e9d93509c34
5
5
  SHA512:
6
- metadata.gz: abc5622ae5dc8d5e1343f8e85557fc03e169146a34f6d980bf0d5db4dc973d41d81cc89bb0ce9cc0dd5a090d1c58357834431d9d1ad106f31c3a8f0a739ce4b9
7
- data.tar.gz: a890a557536ff43073258220801bdc8b6294b3c22bc8ec030f245bb9acccfeaebf8f0d3b228233de4e980558e88c953143d57f1ff3bc2a5b366026d7371df67d
6
+ metadata.gz: 527894489a0f2d899c449b802a6986f9ebad74fb324339c29a523f6a39cc2ce9d8639f39b837e32881e977e05ea639b4a3681d0b5cbdf87650f5749cfdc72b64
7
+ data.tar.gz: c6418b6b395fdc52e9ed53c4e26531f4baac0cf61356db83aa9089e3e37a348edc8f12d2ca1a6b9cc8d5609f1ff0bfd9ba3e98025c0e4d71d80702d662ecbaa6
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.1.1)
5
- colorize (>= 0.1)
6
- combine_pdf (>= 1.0.0)
7
- muscle_bio (>= 0.4)
8
- prawn (>= 2.3.0)
9
- prawn-table (>= 0.2.0)
4
+ viral_seq (1.3.0)
5
+ colorize (~> 0.1)
6
+ combine_pdf (~> 1.0, >= 1.0.0)
7
+ muscle_bio (~> 0.4)
8
+ prawn (~> 2.3, >= 2.3.0)
9
+ prawn-table (~> 0.2, >= 0.2.0)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -179,6 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.5.0-01042022
183
+
184
+ 1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
185
+ 2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
186
+ 3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
187
+ 4. Added a function for DNA variant analysis. Return a Hash object that can output as a JSON file. `ViralSeq::SeqHash#nt_variants`
188
+ 5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
189
+
190
+ ### Version 1.4.0-10132021
191
+
192
+ 1. Added a function to calculate false detection rate (FDR, aka, Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
193
+ 2. Updated `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
194
+
195
+ ### Version 1.3.0-08302021
196
+
197
+ 1. Fixed a bug in the `tcs` pipeline.
198
+
199
+ ### Version 1.2.9-08022021
200
+
201
+ 1. Fixed a bug when reading the input primer sequences in lowercase.
202
+ 2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
203
+
182
204
  ### Version 1.2.8-07292021
183
205
 
184
206
  1. Fixed an issue when reading .fastq files containing blank_lines.
data/bin/tcs CHANGED
@@ -152,8 +152,8 @@ begin
152
152
  primer[:region] ? region = primer[:region] : region = "region"
153
153
  summary_json[:primer_set_name] = region
154
154
 
155
- cdna_primer = primer[:cdna]
156
- forward_primer = primer[:forward]
155
+ cdna_primer = primer[:cdna].upcase
156
+ forward_primer = primer[:forward].upcase
157
157
 
158
158
  export_raw = primer[:export_raw]
159
159
  limit_raw = primer[:limit_raw]
@@ -200,7 +200,7 @@ begin
200
200
  summary_json[:paired_raw_sequence] = paired_seq_number
201
201
  if paired_seq_number < raw_sequence_number * 0.001
202
202
  summary_json[:warnings] <<
203
- "WARNING: Filtered raw sequneces less than 0.1% of the total raw sequences. Possible contamination."
203
+ "WARNING: Filtered raw sequences less than 0.1% of the total raw sequences. Possible contamination."
204
204
  end
205
205
 
206
206
  common_keys.each do |seqtag|
@@ -401,7 +401,11 @@ begin
401
401
  when 4
402
402
  joined_sh = shp.join2(model: :indiv)
403
403
  end
404
- return joined_sh
404
+ if joined_sh
405
+ return joined_sh
406
+ else
407
+ joined_sh = ViralSeq::SeqHash.new
408
+ end
405
409
  end
406
410
 
407
411
  if primer[:end_join]
data/bin/tcs_sdrm CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
91
91
  point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
92
92
  point_mutation_out = File.open(point_mutation_file, "w")
93
93
  point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
94
- "number,percentage,95% CI low, 95% CI high, notes"
94
+ "number,frequency,95% CI low,95% CI high,fdr,notes"
95
95
 
96
96
  linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
97
97
  linkage_out = File.open(linkage_file, "w")
98
98
  linkage_out.puts "region,TCS,mutation linkage,number," +
99
- "percentage,95% CI low, 95% CI high, notes"
99
+ "frequency,95% CI low, 95% CI high, notes"
100
100
 
101
101
  aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
102
102
  aa_report_out = File.open(aa_report_file, "w")
@@ -132,6 +132,7 @@ libs.each do |lib|
132
132
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
133
133
  filtered_seqs = stop_codon_check[:without_stop_codon]
134
134
  poisson_minority_cutoff = filtered_seqs.pm
135
+ fdr_hash = filtered_seqs.fdr
135
136
  summary_hash[:PR] = [
136
137
  seqs.size.to_s,
137
138
  a3g_seqs.size.to_s,
@@ -142,7 +143,7 @@ libs.each do |lib|
142
143
  next if filtered_seqs.size < 3
143
144
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
144
145
 
145
- sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
146
+ sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
146
147
  point_mutation_list += sdrm[0]
147
148
  linkage_list += sdrm[1]
148
149
  aa_report_list += sdrm[2]
@@ -155,6 +156,7 @@ libs.each do |lib|
155
156
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
156
157
  filtered_seqs = stop_codon_check[:without_stop_codon]
157
158
  poisson_minority_cutoff = filtered_seqs.pm
159
+ fdr_hash = filtered_seqs.fdr
158
160
  summary_hash[:IN] = [
159
161
  seqs.size.to_s,
160
162
  a3g_seqs.size.to_s,
@@ -165,7 +167,7 @@ libs.each do |lib|
165
167
  next if filtered_seqs.size < 3
166
168
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
167
169
 
168
- sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
170
+ sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
169
171
  point_mutation_list += sdrm[0]
170
172
  linkage_list += sdrm[1]
171
173
  aa_report_list += sdrm[2]
@@ -190,6 +192,7 @@ libs.each do |lib|
190
192
  reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
191
193
  filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
192
194
  poisson_minority_cutoff = filtered_seqs.pm
195
+ fdr_hash = filtered_seqs.fdr
193
196
  summary_hash[:RT] = [
194
197
  seqs.size.to_s,
195
198
  hypermut_seq_keys.size.to_s,
@@ -200,7 +203,7 @@ libs.each do |lib|
200
203
  next if filtered_seqs.size < 3
201
204
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
202
205
 
203
- sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
206
+ sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
204
207
  point_mutation_list += sdrm[0]
205
208
  linkage_list += sdrm[1]
206
209
  aa_report_list += sdrm[2]
@@ -346,7 +349,7 @@ libs.each do |lib|
346
349
  title: "Surveillance Drug Resistance Mutations",
347
350
  file: point_mutation_file,
348
351
  newPDF: "",
349
- table_width: [65,55,85,80,60,65,85,85,85,45],
352
+ table_width: [60,50,70,65,65,60,75,70,70,70,45],
350
353
  extra_text: "* Mutation below Poisson cut-off for minority mutations"
351
354
  },
352
355
  {
@@ -9,10 +9,13 @@ module ViralSeq
9
9
  # IN codon 53-174 (HXB2 4384-4751)
10
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
11
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
12
+ # @param fdr_hash [Hash] hash of events => (false detection rate)
13
+ # can be obtained using ViralSeq::SeqHash#fdr
14
+ #
12
15
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
13
16
  #
14
17
  # # point_mutation_list: two-dimensional array for the following information,
15
- # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
18
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
16
19
  # # linkage_list: two-dimensional array for the following information,
17
20
  # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
18
21
  # # report_list: two-dimensional array for the following information,
@@ -20,12 +23,13 @@ module ViralSeq
20
23
  # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
21
24
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
22
25
  # p_cut_off = my_seqhash.pm
23
- # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
24
- # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
25
- # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
26
- # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
27
- # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
28
- # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
26
+ # fdr_hash = my_seqhash.fdr
27
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
28
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
30
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
31
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
32
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
29
33
  #
30
34
  # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
31
35
  # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
@@ -136,7 +140,7 @@ module ViralSeq
136
140
  # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
137
141
  # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
142
 
139
- def sdrm_hiv_pr(cutoff = 0)
143
+ def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
140
144
  sequences = self.dna_hash
141
145
  region = "PR"
142
146
  rf_label = 0
@@ -167,8 +171,9 @@ module ViralSeq
167
171
  count_mut_list = mut_list.count_freq
168
172
  count_mut_list.each do |m,number|
169
173
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
174
+ fdr = fdr_hash[number].round(5)
170
175
  label = number < cutoff ? "*" : ""
171
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
176
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
172
177
  end
173
178
  end
174
179
  point_mutation_list.sort_by! {|record| record[2]}
@@ -229,7 +234,7 @@ module ViralSeq
229
234
  # @param (see #sdrm_hiv_pr)
230
235
  # @return (see #sdrm_hiv_pr)
231
236
 
232
- def sdrm_hiv_rt(cutoff = 0)
237
+ def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
233
238
  sequences = self.dna_hash
234
239
  region = "RT"
235
240
  rf_label = 1
@@ -280,8 +285,9 @@ module ViralSeq
280
285
  count_mut_list = mut_list.count_freq
281
286
  count_mut_list.each do |m,number|
282
287
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
288
+ fdr = fdr_hash[number].round(5)
283
289
  label = number < cutoff ? "*" : ""
284
- point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
290
+ point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
285
291
  end
286
292
  end
287
293
 
@@ -291,8 +297,9 @@ module ViralSeq
291
297
  count_mut_list = mut_list.count_freq
292
298
  count_mut_list.each do |m,number|
293
299
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
300
+ fdr = fdr_hash[number].round(5)
294
301
  label = number < cutoff ? "*" : ""
295
- point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
302
+ point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
296
303
  end
297
304
  end
298
305
 
@@ -365,7 +372,7 @@ module ViralSeq
365
372
  # @param (see #sdrm_hiv_pr)
366
373
  # @return (see #sdrm_hiv_pr)
367
374
 
368
- def sdrm_hiv_in(cutoff = 0)
375
+ def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
369
376
  sequences = self.dna_hash
370
377
  region = "IN"
371
378
  rf_label = 2
@@ -397,8 +404,9 @@ module ViralSeq
397
404
  count_mut_list = mut_list.count_freq
398
405
  count_mut_list.each do |m,number|
399
406
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
407
+ fdr = fdr_hash[number].round(5)
400
408
  label = number < cutoff ? "*" : ""
401
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
409
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
402
410
  end
403
411
  end
404
412
  point_mutation_list.sort_by! {|record| record[2]}
@@ -31,7 +31,7 @@ module ViralSeq
31
31
  def rand
32
32
  if (@compute_next_pair = !@compute_next_pair)
33
33
  theta = 2 * ::Math::PI * @rng.call
34
- scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
34
+ scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
35
35
  @g1 = @mean + scale * ::Math.sin(theta)
36
36
  @g0 = @mean + scale * ::Math.cos(theta)
37
37
  else
@@ -208,6 +208,31 @@ module ViralSeq
208
208
  return ViralSeq::SeqHash.new(sampled_nt, sampled_aa, sampled_qc, sampled_title, self.file)
209
209
  end
210
210
 
211
+ # return a new SeqHash object restricted to the given range of nt sequence positions
212
+ # @param range [Range] range of positions on the nt sequence
213
+ # @return [ViralSeq::SeqHash] a sub SeqHash object
214
+
215
+ def nt_range(range)
216
+ dna_hash = self.dna_hash
217
+ new_hash = {}
218
+ dna_hash.each do |k,v|
219
+ new_hash[k] = v[range]
220
+ end
221
+ ViralSeq::SeqHash.new(new_hash)
222
+ end # end of #nt_range
223
+
224
+ # check the size range of the DNA sequences of the SeqHash object
225
+ # @return [Hash] Hash of {max: MAX_SIZE, min: MIN_SIZE}
226
+
227
+ def check_nt_size
228
+ dna_hash = self.dna_hash
229
+ size_array = []
230
+ dna_hash.values.each do |v|
231
+ size_array << v.size
232
+ end
233
+ return { max: size_array.max, min: size_array.min }
234
+ end
235
+
211
236
  # write the nt sequences to a FASTA format file
212
237
  # @param file [String] path to the FASTA output file
213
238
  # @return [NilClass]
@@ -592,6 +617,98 @@ module ViralSeq
592
617
 
593
618
  alias_method :pm, :poisson_minority_cutoff
594
619
 
620
+ # calculate false detection rate for minority mutations
621
+ # Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
622
+ # @param error_rate [Float] estimated sequencing error rate
623
+ # @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
624
+ # @example calculate FDR for mutations that appeared twice in the sample dataset
625
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
626
+ # fdr_hash = my_seqhash.fdr
627
+ # fdr_hash[2].round(5)
628
+ # => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.
629
+
630
+ def fdr(error_rate = 0.0001)
631
+ sequences = self.dna_hash.values
632
+ if sequences.size == 0
633
+ return {}
634
+ else
635
+ seq_count = self.size
636
+ observed_hash = variant_for_poisson(sequences)
637
+ p_unadjusted = []
638
+ observed_hash.each do |k, v|
639
+ p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
640
+ p_unadjusted += Array.new(v, p_value)
641
+ end
642
+ p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
643
+ vars_pair = observed_hash.to_a
644
+ fdr_hash = Hash.new(0)
645
+ (0..(p_fdr.size - 1)).each do |i|
646
+ fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
647
+ end
648
+ return fdr_hash
649
+ end
650
+ end #end of #fdr
651
+
652
+ # analysis for the nt sequence variants.
653
+ # @return [Hash] A Hash with the results of the variant analysis. For the keys/values of the returned object, see /docs/variants_structure.pdf
654
+
655
+ def nt_variants
656
+ return_obj = {}
657
+ nt_hash = self.dna_hash
658
+ tcs_number = self.size
659
+ dl = ViralSeq::TcsCore.detection_limit(tcs_number)
660
+ fdr_hash = self.fdr
661
+ pm_cut_off = self.pm
662
+ con = self.consensus
663
+ return_obj[:tcs_number] = tcs_number
664
+ return_obj[:lower_detection_limit] = dl
665
+ return_obj[:pm_cut_off] = pm_cut_off
666
+ return_obj[:positions] = []
667
+ cis = {}
668
+
669
+ (0..(con.size - 1)).each do |p|
670
+ position_obj = {}
671
+ position_obj[:position] = p + 1
672
+ position_obj[:tcs_number] = tcs_number
673
+ position_obj[:lower_detection_limit] = dl
674
+ position_obj[:pm_cut_off] = (pm_cut_off == Float::INFINITY ? pm_cut_off.to_s : pm_cut_off)
675
+
676
+ nts = []
677
+ dna_hash.each do |n,s|
678
+ nts << s[p]
679
+ end
680
+ freq_hash = nts.count_freq
681
+ [:A, :C, :G, :T, :-].each do |k|
682
+ v = freq_hash[k.to_s]
683
+ position_obj[k] = {}
684
+ position_obj[k][:count] = v
685
+ if v > 0
686
+ if cis[[v, tcs_number]]
687
+ ci = cis[[v, tcs_number]]
688
+ else
689
+ ci = ViralSeq::Math::BinomCI.new(v, tcs_number)
690
+ cis[[v, tcs_number]] = ci
691
+ end
692
+ position_obj[k][:freq] = ci.mean.round(4)
693
+ position_obj[k][:freq_ci_low] = ci.lower.round(4)
694
+ position_obj[k][:freq_ci_high] = ci.upper.round(4)
695
+ position_obj[k][:greater_than_pm] = (v >= pm_cut_off ? true : false)
696
+ position_obj[k][:fdr] = fdr_hash[v]
697
+ else
698
+ position_obj[k][:freq] = 0
699
+ position_obj[k][:freq_ci_low] = 0
700
+ position_obj[k][:freq_ci_high] = 0
701
+ position_obj[k][:greater_than_pm] = false
702
+ position_obj[k][:fdr] = nil
703
+ end
704
+ end
705
+
706
+ return_obj[:positions] << position_obj
707
+ end
708
+
709
+ return_obj
710
+ end # end of nt_variants
711
+
595
712
 
596
713
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
597
714
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -1183,6 +1300,28 @@ module ViralSeq
1183
1300
  return new_sh
1184
1301
  end
1185
1302
 
1303
+ # QC for each nucleotide sequence comparing with sample consensus for indels
1304
+ # @return [Hash] object containing two SeqHash {no_indel: seq_hash, has_indel: seq_hash}
1305
+
1306
+ def qc_indel
1307
+ con = self.consensus
1308
+ dna_hash = self.dna_hash
1309
+ names_passed = []
1310
+ names_indel = []
1311
+ dna_hash.uniq_hash.each do |seq, names|
1312
+ if seq.compare_with(con) < 4
1313
+ names_passed += names
1314
+ elsif ViralSeq::Muscle.align(con, seq)[0]["-"]
1315
+ names_indel += names
1316
+ else
1317
+ names_passed += names
1318
+ end
1319
+ end
1320
+ return {no_indel: self.sub(names_passed),
1321
+ has_indel: self.sub(names_indel)}
1322
+ end # end of qc_indel
1323
+
1324
+
1186
1325
  # trim dna sequences based on the provided reference coordinates.
1187
1326
  # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1188
1327
  # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
@@ -280,6 +280,19 @@ module ViralSeq
280
280
  abort infor.red.bold
281
281
  end
282
282
 
283
+ # lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution.
284
+ # R required.
285
+ # @param tcs_number [Integer] number of TCS
286
+ # @return [Float] lower detection limit
287
+ # @example calculate lower detection limit
288
+ # ViralSeq::TcsCore.detection_limit(100)
289
+ # => 0.0362
290
+
291
+ def detection_limit(tcs_number)
292
+ dl = `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
293
+ dl.to_f
294
+ end
295
+
283
296
  private
284
297
 
285
298
  def unzip_r(indir, f)
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.8"
6
- TCS_VERSION = "2.3.7"
5
+ VERSION = "1.5.0"
6
+ TCS_VERSION = "2.4.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.8
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-30 00:00:00.000000000 Z
12
+ date: 2022-01-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler