viral_seq 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6ff64bcbe0e019c4e6842feae4c775e5975307a51ca111ea6629de5f918a163
4
- data.tar.gz: 464d2eeacc49243722c8f8121df7a118c9a433da8df0e2f02e6c5195aa03e1a9
3
+ metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
4
+ data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
5
5
  SHA512:
6
- metadata.gz: 9b0da2b42f7c1fb039995fa017cd6535a1253c6352be2f502ecf3ef25cbdc4bcdd7b8497e68ed00eab82e67007751be9474cc8a6f06ace3386a81456dae19a4e
7
- data.tar.gz: df93fd5689cfc8e6b5248febd0a57961cb6773b0e939cb3c980b9b13cae60ca9177209f24acd13e94748733c6c7dee165b89a86c4505f961618d68c4a05782d2
6
+ metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
7
+ data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.1.1)
5
- colorize (>= 0.1)
6
- combine_pdf (>= 1.0.0)
7
- muscle_bio (>= 0.4)
8
- prawn (>= 2.3.0)
9
- prawn-table (>= 0.2.0)
4
+ viral_seq (1.3.0)
5
+ colorize (~> 0.1)
6
+ combine_pdf (~> 1.0, >= 1.0.0)
7
+ muscle_bio (~> 0.4)
8
+ prawn (~> 2.3, >= 2.3.0)
9
+ prawn-table (~> 0.2, >= 0.2.0)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -179,14 +179,19 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.4.0-10132021
183
+
184
+ 1. Added a function to calculate the false detection rate (FDR, aka Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
185
+ 2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
186
+
182
187
  ### Version 1.3.0-08302021
183
188
 
184
- 1. Fixed a bug in the `tcs` pipeline.
189
+ 1. Fixed a bug in the `tcs` pipeline.
185
190
 
186
191
  ### Version 1.2.9-08022021
187
192
 
188
193
  1. Fixed a bug when reading the input primer sequences in lowercases.
189
- 2. Fix a bug in the method ViralSeq::Math::RandomGaussian
194
+ 2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
190
195
 
191
196
  ### Version 1.2.8-07292021
192
197
 
data/bin/tcs_sdrm CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
91
91
  point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
92
92
  point_mutation_out = File.open(point_mutation_file, "w")
93
93
  point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
94
- "number,percentage,95% CI low, 95% CI high, notes"
94
+ "number,frequency,95% CI low,95% CI high,fdr,notes"
95
95
 
96
96
  linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
97
97
  linkage_out = File.open(linkage_file, "w")
98
98
  linkage_out.puts "region,TCS,mutation linkage,number," +
99
- "percentage,95% CI low, 95% CI high, notes"
99
+ "frequency,95% CI low, 95% CI high, notes"
100
100
 
101
101
  aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
102
102
  aa_report_out = File.open(aa_report_file, "w")
@@ -132,6 +132,7 @@ libs.each do |lib|
132
132
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
133
133
  filtered_seqs = stop_codon_check[:without_stop_codon]
134
134
  poisson_minority_cutoff = filtered_seqs.pm
135
+ fdr_hash = filtered_seqs.fdr
135
136
  summary_hash[:PR] = [
136
137
  seqs.size.to_s,
137
138
  a3g_seqs.size.to_s,
@@ -142,7 +143,7 @@ libs.each do |lib|
142
143
  next if filtered_seqs.size < 3
143
144
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
144
145
 
145
- sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
146
+ sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
146
147
  point_mutation_list += sdrm[0]
147
148
  linkage_list += sdrm[1]
148
149
  aa_report_list += sdrm[2]
@@ -155,6 +156,7 @@ libs.each do |lib|
155
156
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
156
157
  filtered_seqs = stop_codon_check[:without_stop_codon]
157
158
  poisson_minority_cutoff = filtered_seqs.pm
159
+ fdr_hash = filtered_seqs.fdr
158
160
  summary_hash[:IN] = [
159
161
  seqs.size.to_s,
160
162
  a3g_seqs.size.to_s,
@@ -165,7 +167,7 @@ libs.each do |lib|
165
167
  next if filtered_seqs.size < 3
166
168
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
167
169
 
168
- sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
170
+ sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
169
171
  point_mutation_list += sdrm[0]
170
172
  linkage_list += sdrm[1]
171
173
  aa_report_list += sdrm[2]
@@ -190,6 +192,7 @@ libs.each do |lib|
190
192
  reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
191
193
  filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
192
194
  poisson_minority_cutoff = filtered_seqs.pm
195
+ fdr_hash = filtered_seqs.fdr
193
196
  summary_hash[:RT] = [
194
197
  seqs.size.to_s,
195
198
  hypermut_seq_keys.size.to_s,
@@ -200,7 +203,7 @@ libs.each do |lib|
200
203
  next if filtered_seqs.size < 3
201
204
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
202
205
 
203
- sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
206
+ sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
204
207
  point_mutation_list += sdrm[0]
205
208
  linkage_list += sdrm[1]
206
209
  aa_report_list += sdrm[2]
@@ -346,7 +349,7 @@ libs.each do |lib|
346
349
  title: "Surveillance Drug Resistance Mutations",
347
350
  file: point_mutation_file,
348
351
  newPDF: "",
349
- table_width: [65,55,85,80,60,65,85,85,85,45],
352
+ table_width: [60,50,70,65,65,60,75,70,70,70,45],
350
353
  extra_text: "* Mutation below Poisson cut-off for minority mutations"
351
354
  },
352
355
  {
@@ -9,10 +9,13 @@ module ViralSeq
9
9
  # IN codon 53-174 (HXB2 4384-4751)
10
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
11
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
12
+ # @param fdr [Hash] hash of events => (false detection rate)
13
+ # can be obtained using ViralSeq::SeqHash#fdr
14
+ #
12
15
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
13
16
  #
14
17
  # # point_mutation_list: two-dimensional array for the following information,
15
- # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
18
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
16
19
  # # linkage_list: two-dimensional array for the following information,
17
20
  # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
18
21
  # # report_list: two-dimensional array for the following information,
@@ -20,12 +23,13 @@ module ViralSeq
20
23
  # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
21
24
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
22
25
  # p_cut_off = my_seqhash.pm
23
- # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
24
- # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
25
- # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
26
- # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
27
- # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
28
- # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
26
+ # fdr_hash = my_seqhash.fdr
27
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
28
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
30
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
31
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
32
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
29
33
  #
30
34
  # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
31
35
  # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
@@ -136,7 +140,7 @@ module ViralSeq
136
140
  # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
137
141
  # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
142
 
139
- def sdrm_hiv_pr(cutoff = 0)
143
+ def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
140
144
  sequences = self.dna_hash
141
145
  region = "PR"
142
146
  rf_label = 0
@@ -167,8 +171,9 @@ module ViralSeq
167
171
  count_mut_list = mut_list.count_freq
168
172
  count_mut_list.each do |m,number|
169
173
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
174
+ fdr = fdr_hash[number].round(5)
170
175
  label = number < cutoff ? "*" : ""
171
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
176
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
172
177
  end
173
178
  end
174
179
  point_mutation_list.sort_by! {|record| record[2]}
@@ -229,7 +234,7 @@ module ViralSeq
229
234
  # @param (see #sdrm_hiv_pr)
230
235
  # @return (see #sdrm_hiv_pr)
231
236
 
232
- def sdrm_hiv_rt(cutoff = 0)
237
+ def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
233
238
  sequences = self.dna_hash
234
239
  region = "RT"
235
240
  rf_label = 1
@@ -280,8 +285,9 @@ module ViralSeq
280
285
  count_mut_list = mut_list.count_freq
281
286
  count_mut_list.each do |m,number|
282
287
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
288
+ fdr = fdr_hash[number].round(5)
283
289
  label = number < cutoff ? "*" : ""
284
- point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
290
+ point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
285
291
  end
286
292
  end
287
293
 
@@ -291,8 +297,9 @@ module ViralSeq
291
297
  count_mut_list = mut_list.count_freq
292
298
  count_mut_list.each do |m,number|
293
299
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
300
+ fdr = fdr_hash[number].round(5)
294
301
  label = number < cutoff ? "*" : ""
295
- point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
302
+ point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
296
303
  end
297
304
  end
298
305
 
@@ -365,7 +372,7 @@ module ViralSeq
365
372
  # @param (see #sdrm_hiv_pr)
366
373
  # @return (see #sdrm_hiv_pr)
367
374
 
368
- def sdrm_hiv_in(cutoff = 0)
375
+ def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
369
376
  sequences = self.dna_hash
370
377
  region = "IN"
371
378
  rf_label = 2
@@ -397,8 +404,9 @@ module ViralSeq
397
404
  count_mut_list = mut_list.count_freq
398
405
  count_mut_list.each do |m,number|
399
406
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
407
+ fdr = fdr_hash[number].round(5)
400
408
  label = number < cutoff ? "*" : ""
401
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
409
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
402
410
  end
403
411
  end
404
412
  point_mutation_list.sort_by! {|record| record[2]}
@@ -592,6 +592,37 @@ module ViralSeq
592
592
 
593
593
  alias_method :pm, :poisson_minority_cutoff
594
594
 
595
+ # Calculate the false detection rate (FDR) for minority mutations by applying a Benjamini-Hochberg correction to per-variant binomial p-values (shells out to Rscript).
596
+ # Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for FDR calculation
597
+ # @param error_rate [Float] estimated sequencing error rate (default 0.0001)
598
+ # @return [Hash] pairs of mutation occurrence count => false detection rate (missing keys default to 0); an empty Hash when there are no sequences
599
+ # @example calculate FDR for mutations that appeared twice in the sample dataset
600
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
601
+ # fdr_hash = my_seqhash.fdr
602
+ # fdr_hash[2].round(5)
603
+ # => 0.00726 # mutations that appear twice have a 0.00726 chance of being caused by residual errors.
604
+
605
+ def fdr(error_rate = 0.0001)
606
+ sequences = self.dna_hash.values
607
+ if sequences.size == 0
608
+ return {}
609
+ else
610
+ seq_count = self.size
611
+ observed_hash = variant_for_poisson(sequences)
612
+ p_unadjusted = []
613
+ observed_hash.each do |k, v|
614
+ p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # unadjusted exact p-value: under the null, the probability of k or more events in seq_count trials at error_rate
615
+ p_unadjusted += Array.new(v, p_value)
616
+ end
617
+ p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # Benjamini-Hochberg adjustment of all p-values; NOTE(review): collapsing with count_freq and pairing by index below presumably relies on each observed_hash key yielding a distinct adjusted value in the same order - confirm
618
+ vars_pair = observed_hash.to_a
619
+ fdr_hash = Hash.new(0)
620
+ (0..(p_fdr.size - 1)).each do |i|
621
+ fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
622
+ end
623
+ return fdr_hash
624
+ end
625
+ end # end of #fdr
595
626
 
596
627
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
597
628
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.3.0"
5
+ VERSION = "1.4.0"
6
6
  TCS_VERSION = "2.3.8"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-08-30 00:00:00.000000000 Z
12
+ date: 2021-10-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler