viral_seq 1.2.7 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 554845dba339d0e06b84c88bc117258516f391bdf58cce015c2669e7b2c6c0d5
4
- data.tar.gz: 870280337c90d1f5b9ecbea6e6478d7e2dc22aa70917c6b2ecd94afaa185c1c6
3
+ metadata.gz: 2bf2afba235cb99f680f10e0913b8e0f715dd2f9c831f2dcba4534a2607685e6
4
+ data.tar.gz: 8995b0417a1f6ca4de39e26405ab012766a31b1460f5c1c6d11147271f587044
5
5
  SHA512:
6
- metadata.gz: 54db76e6fd8333ccebb19dee602378ec8dbe5d196ec7bd675e55f65db80cb06ac2ab51ce1f13ab7ea65c0a50ad49978bd3e9581074c497b298f0912858946fa8
7
- data.tar.gz: 03d02329192465a9f278715c8a85e3a910e5c5c7252026980d29e669df823a5bdb4be323eeb56f7c9804b71fa8f1763c5a526227f3764315d6eb8e208934ce81
6
+ metadata.gz: 233485f39d610945794a033c1d2c53680d753ca0284c6b0b9075295352ceb765df11727816dbc061429ccabfb03204c0db82a24a4d1c4a6ebd5a99df770253ff
7
+ data.tar.gz: ae029b7ae6f530e748ba256a4ba9bb4af95de7e57cbdf49808f1b257794f22830fe3ffbd636a742bd2bae6f5535539b08d0d042b3100ec2b5ef8b59d83098ead
data/Gemfile.lock CHANGED
@@ -1,12 +1,12 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.1.1)
5
- colorize (>= 0.1)
6
- combine_pdf (>= 1.0.0)
7
- muscle_bio (>= 0.4)
8
- prawn (>= 2.3.0)
9
- prawn-table (>= 0.2.0)
4
+ viral_seq (1.3.0)
5
+ colorize (~> 0.1)
6
+ combine_pdf (~> 1.0, >= 1.0.0)
7
+ muscle_bio (~> 0.4)
8
+ prawn (~> 2.3, >= 2.3.0)
9
+ prawn-table (~> 0.2, >= 0.2.0)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -179,10 +179,28 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
179
 
180
180
  ## Updates
181
181
 
182
+ ### Version 1.4.0-10132021
183
+
184
+ 1. Added a function to calculate false detection rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
185
+ 2. Updated the `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
186
+
187
+ ### Version 1.3.0-08302021
188
+
189
+ 1. Fixed a bug in the `tcs` pipeline.
190
+
191
+ ### Version 1.2.9-08022021
192
+
193
+ 1. Fixed a bug when reading the input primer sequences in lowercase.
194
+ 2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
195
+
196
+ ### Version 1.2.8-07292021
197
+
198
+ 1. Fixed an issue when reading .fastq files containing blank lines.
199
+
182
200
  ### Version 1.2.7-07152021
183
201
 
184
- 1. Optimzed the workflow of the `tcs` pipeline on raw data with uneven lengths.
185
- `tcs` version to v2.3.5.
202
+ 1. Optimized the workflow of the `tcs` pipeline on raw data with uneven lengths.
203
+ `tcs` version to v2.3.6.
186
204
 
187
205
 
188
206
  ### Version 1.2.6-07122021
data/bin/tcs CHANGED
@@ -152,8 +152,8 @@ begin
152
152
  primer[:region] ? region = primer[:region] : region = "region"
153
153
  summary_json[:primer_set_name] = region
154
154
 
155
- cdna_primer = primer[:cdna]
156
- forward_primer = primer[:forward]
155
+ cdna_primer = primer[:cdna].upcase
156
+ forward_primer = primer[:forward].upcase
157
157
 
158
158
  export_raw = primer[:export_raw]
159
159
  limit_raw = primer[:limit_raw]
@@ -401,7 +401,11 @@ begin
401
401
  when 4
402
402
  joined_sh = shp.join2(model: :indiv)
403
403
  end
404
- return joined_sh
404
+ if joined_sh
405
+ return joined_sh
406
+ else
407
+ joined_sh = ViralSeq::SeqHash.new
408
+ end
405
409
  end
406
410
 
407
411
  if primer[:end_join]
data/bin/tcs_sdrm CHANGED
@@ -91,12 +91,12 @@ libs.each do |lib|
91
91
  point_mutation_file = File.join(out_lib_dir, (lib_name + "_substitution.csv"))
92
92
  point_mutation_out = File.open(point_mutation_file, "w")
93
93
  point_mutation_out.puts "region,TCS,AA position,wild type,mutation," +
94
- "number,percentage,95% CI low, 95% CI high, notes"
94
+ "number,frequency,95% CI low,95% CI high,fdr,notes"
95
95
 
96
96
  linkage_file = File.join(out_lib_dir, (lib_name + "_linkage.csv"))
97
97
  linkage_out = File.open(linkage_file, "w")
98
98
  linkage_out.puts "region,TCS,mutation linkage,number," +
99
- "percentage,95% CI low, 95% CI high, notes"
99
+ "frequency,95% CI low, 95% CI high, notes"
100
100
 
101
101
  aa_report_file = File.join(out_lib_dir, (lib_name + "_aa.csv"))
102
102
  aa_report_out = File.open(aa_report_file, "w")
@@ -132,6 +132,7 @@ libs.each do |lib|
132
132
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
133
133
  filtered_seqs = stop_codon_check[:without_stop_codon]
134
134
  poisson_minority_cutoff = filtered_seqs.pm
135
+ fdr_hash = filtered_seqs.fdr
135
136
  summary_hash[:PR] = [
136
137
  seqs.size.to_s,
137
138
  a3g_seqs.size.to_s,
@@ -142,7 +143,7 @@ libs.each do |lib|
142
143
  next if filtered_seqs.size < 3
143
144
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
144
145
 
145
- sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff)
146
+ sdrm = filtered_seqs.sdrm_hiv_pr(poisson_minority_cutoff, fdr_hash)
146
147
  point_mutation_list += sdrm[0]
147
148
  linkage_list += sdrm[1]
148
149
  aa_report_list += sdrm[2]
@@ -155,6 +156,7 @@ libs.each do |lib|
155
156
  stop_codon_seqs = stop_codon_check[:with_stop_codon]
156
157
  filtered_seqs = stop_codon_check[:without_stop_codon]
157
158
  poisson_minority_cutoff = filtered_seqs.pm
159
+ fdr_hash = filtered_seqs.fdr
158
160
  summary_hash[:IN] = [
159
161
  seqs.size.to_s,
160
162
  a3g_seqs.size.to_s,
@@ -165,7 +167,7 @@ libs.each do |lib|
165
167
  next if filtered_seqs.size < 3
166
168
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
167
169
 
168
- sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff)
170
+ sdrm = filtered_seqs.sdrm_hiv_in(poisson_minority_cutoff, fdr_hash)
169
171
  point_mutation_list += sdrm[0]
170
172
  linkage_list += sdrm[1]
171
173
  aa_report_list += sdrm[2]
@@ -190,6 +192,7 @@ libs.each do |lib|
190
192
  reject_keys = (hypermut_seq_keys | stop_codon_seq_keys)
191
193
  filtered_seqs = ViralSeq::SeqHash.new(seqs.dna_hash.reject {|k,v| reject_keys.include?(k) })
192
194
  poisson_minority_cutoff = filtered_seqs.pm
195
+ fdr_hash = filtered_seqs.fdr
193
196
  summary_hash[:RT] = [
194
197
  seqs.size.to_s,
195
198
  hypermut_seq_keys.size.to_s,
@@ -200,7 +203,7 @@ libs.each do |lib|
200
203
  next if filtered_seqs.size < 3
201
204
  filtered_seqs.write_nt_fa(File.join(filtered_seq_dir,seq_basename))
202
205
 
203
- sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff)
206
+ sdrm = filtered_seqs.sdrm_hiv_rt(poisson_minority_cutoff, fdr_hash)
204
207
  point_mutation_list += sdrm[0]
205
208
  linkage_list += sdrm[1]
206
209
  aa_report_list += sdrm[2]
@@ -346,7 +349,7 @@ libs.each do |lib|
346
349
  title: "Surveillance Drug Resistance Mutations",
347
350
  file: point_mutation_file,
348
351
  newPDF: "",
349
- table_width: [65,55,85,80,60,65,85,85,85,45],
352
+ table_width: [60,50,70,65,65,60,75,70,70,70,45],
350
353
  extra_text: "* Mutation below Poisson cut-off for minority mutations"
351
354
  },
352
355
  {
@@ -9,10 +9,13 @@ module ViralSeq
9
9
  # IN codon 53-174 (HXB2 4384-4751)
10
10
  # @param cutoff [Integer] cut-off for minimal abundance of a mutation to be called as valid mutation,
11
11
  # can be obtained using ViralSeq::SeqHash#poisson_minority_cutoff function
12
+ # @param fdr_hash [Hash] hash of events => (false detection rate)
13
+ # can be obtained using ViralSeq::SeqHash#fdr
14
+ #
12
15
  # @return [Array] three elements `[point_mutation_list, linkage_list, report_list]`
13
16
  #
14
17
  # # point_mutation_list: two-dimensional array for the following information,
15
- # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label]
18
+ # # [region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label]
16
19
  # # linkage_list: two-dimensional array for the following information,
17
20
  # # [region,tcs_number,linkage,count,%,CI_low,CI_high,label]
18
21
  # # report_list: two-dimensional array for the following information,
@@ -20,12 +23,13 @@ module ViralSeq
20
23
  # @example identify SDRMs from a FASTA sequence file of HIV PR sequences obtained after MPID-DR sequencing
21
24
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_dr_sequences/pr.fasta')
22
25
  # p_cut_off = my_seqhash.pm
23
- # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off)
24
- # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label"; pr_sdrm[0].each {|n| puts n.join(',')}
25
- # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,label
26
- # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,
27
- # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,*
28
- # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,
26
+ # fdr_hash = my_seqhash.fdr
27
+ # pr_sdrm = my_seqhash.sdrm_hiv_pr(p_cut_off, fdr_hash)
28
+ # puts "region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label"; pr_sdrm[0].each {|n| puts n.join(',')}
29
+ # => region,tcs_number,position,wildtype,mutation,count,%,CI_low,CI_high,fdr,label
30
+ # => PR,396,30,D,N,247,0.62374,0.57398,0.67163,0,
31
+ # => PR,396,50,I,V,1,0.00253,6.0e-05,0.01399,0.18905,*
32
+ # => PR,396,88,N,D,246,0.62121,0.57141,0.66919,0,
29
33
  #
30
34
  # puts "region,tcs_number,linkage,count,%,CI_low,CI_high,label"; pr_sdrm[1].each {|n| puts n.join(',')}
31
35
  # => region,tcs_number,linkage,count,%,CI_low,CI_high,label
@@ -136,7 +140,7 @@ module ViralSeq
136
140
  # => PR,98,396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.7475,0.0,0.0,0.0,0.2525,0.0,0.0,0.0,0.0,0.0
137
141
  # => PR,99,396,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138
142
 
139
- def sdrm_hiv_pr(cutoff = 0)
143
+ def sdrm_hiv_pr(cutoff = 0, fdr_hash = Hash.new(0))
140
144
  sequences = self.dna_hash
141
145
  region = "PR"
142
146
  rf_label = 0
@@ -167,8 +171,9 @@ module ViralSeq
167
171
  count_mut_list = mut_list.count_freq
168
172
  count_mut_list.each do |m,number|
169
173
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
174
+ fdr = fdr_hash[number].round(5)
170
175
  label = number < cutoff ? "*" : ""
171
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
176
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
172
177
  end
173
178
  end
174
179
  point_mutation_list.sort_by! {|record| record[2]}
@@ -229,7 +234,7 @@ module ViralSeq
229
234
  # @param (see #sdrm_hiv_pr)
230
235
  # @return (see #sdrm_hiv_pr)
231
236
 
232
- def sdrm_hiv_rt(cutoff = 0)
237
+ def sdrm_hiv_rt(cutoff = 0, fdr_hash = Hash.new(0))
233
238
  sequences = self.dna_hash
234
239
  region = "RT"
235
240
  rf_label = 1
@@ -280,8 +285,9 @@ module ViralSeq
280
285
  count_mut_list = mut_list.count_freq
281
286
  count_mut_list.each do |m,number|
282
287
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
288
+ fdr = fdr_hash[number].round(5)
283
289
  label = number < cutoff ? "*" : ""
284
- point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
290
+ point_mutation_list << ["NRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
285
291
  end
286
292
  end
287
293
 
@@ -291,8 +297,9 @@ module ViralSeq
291
297
  count_mut_list = mut_list.count_freq
292
298
  count_mut_list.each do |m,number|
293
299
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
300
+ fdr = fdr_hash[number].round(5)
294
301
  label = number < cutoff ? "*" : ""
295
- point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
302
+ point_mutation_list << ["NNRTI", n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
296
303
  end
297
304
  end
298
305
 
@@ -365,7 +372,7 @@ module ViralSeq
365
372
  # @param (see #sdrm_hiv_pr)
366
373
  # @return (see #sdrm_hiv_pr)
367
374
 
368
- def sdrm_hiv_in(cutoff = 0)
375
+ def sdrm_hiv_in(cutoff = 0, fdr_hash = Hash.new(0))
369
376
  sequences = self.dna_hash
370
377
  region = "IN"
371
378
  rf_label = 2
@@ -397,8 +404,9 @@ module ViralSeq
397
404
  count_mut_list = mut_list.count_freq
398
405
  count_mut_list.each do |m,number|
399
406
  ci = ViralSeq::Math::BinomCI.new(number, n_seq)
407
+ fdr = fdr_hash[number].round(5)
400
408
  label = number < cutoff ? "*" : ""
401
- point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
409
+ point_mutation_list << [region, n_seq, position, wt, m, number, ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
402
410
  end
403
411
  end
404
412
  point_mutation_list.sort_by! {|record| record[2]}
@@ -31,7 +31,7 @@ module ViralSeq
31
31
  def rand
32
32
  if (@compute_next_pair = !@compute_next_pair)
33
33
  theta = 2 * ::Math::PI * @rng.call
34
- scale = @sd * ::Math.sqrt(-2 * Math.log(1 - @rng.call))
34
+ scale = @sd * ::Math.sqrt(-2 * ::Math.log(1 - @rng.call))
35
35
  @g1 = @mean + scale * ::Math.sin(theta)
36
36
  @g0 = @mean + scale * ::Math.cos(theta)
37
37
  else
@@ -116,6 +116,8 @@ module ViralSeq
116
116
 
117
117
  File.open(fastq_file,'r') do |file|
118
118
  file.readlines.collect do |line|
119
+ line.tr!("\u0000","")
120
+ next if line == "\n"
119
121
  count +=1
120
122
  count_m = count % 4
121
123
  if count_m == 1
@@ -590,6 +592,37 @@ module ViralSeq
590
592
 
591
593
  alias_method :pm, :poisson_minority_cutoff
592
594
 
595
+ # calculate false detection rate for minority mutations
596
+ # Credit: Prof. Michael G. Hudgens from UNC-CH for providing the method for fdr calculation
597
+ # @param error_rate [Float] estimated sequencing error rate
598
+ # @return [Hash] pair of mutation frequency to false detection rate. (freq => fdr)
599
+ # @example calculate FDR for mutations that appeared twice in the sample dataset
600
+ # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_sequence_for_poisson.fasta')
601
+ # fdr_hash = my_seqhash.fdr
602
+ # fdr_hash[2].round(5)
603
+ # => 0.00726 # means that mutations appearing twice have a 0.007261748 chance of being caused by residual errors.
604
+
605
+ def fdr(error_rate = 0.0001)
606
+ sequences = self.dna_hash.values
607
+ if sequences.size == 0
608
+ return {}
609
+ else
610
+ seq_count = self.size
611
+ observed_hash = variant_for_poisson(sequences)
612
+ p_unadjusted = []
613
+ observed_hash.each do |k, v|
614
+ p_value = 1 - `Rscript -e "cat(pbinom(#{k}-1, #{seq_count}, #{error_rate}))"`.to_f # compute unadjusted exact p-value, ie under null, probability of observing observed_hash[k] or more extreme
615
+ p_unadjusted += Array.new(v, p_value)
616
+ end
617
+ p_fdr = `Rscript -e "cat(p.adjust(c(#{p_unadjusted.join(',')}), 'fdr'))"`.split("\s").count_freq.to_a # controls fdr. aka Benjamini-Hochberg correction
618
+ vars_pair = observed_hash.to_a
619
+ fdr_hash = Hash.new(0)
620
+ (0..(p_fdr.size - 1)).each do |i|
621
+ fdr_hash[vars_pair[i][0]] = p_fdr[i][0].to_f
622
+ end
623
+ return fdr_hash
624
+ end
625
+ end #end of #fdr
593
626
 
594
627
  # align the @dna_hash sequences, return a new ViralSeq::SeqHash object with aligned @dna_hash using MUSCLE
595
628
  # @param path_to_muscle [String], path to MUSCLE executable. if not provided (as default), it will use RubyGem::MuscleBio
@@ -305,7 +305,8 @@ module ViralSeq
305
305
  end
306
306
 
307
307
  def general_filter(seq)
308
- if seq.size < ($platform_sequencing_length - 1)
308
+ return false unless seq
309
+ if seq.size < ($platform_sequencing_length - 10)
309
310
  return false
310
311
  elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
311
312
  return false
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.2.7"
6
- TCS_VERSION = "2.3.6"
5
+ VERSION = "1.4.0"
6
+ TCS_VERSION = "2.3.8"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.7
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-15 00:00:00.000000000 Z
12
+ date: 2021-10-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler