viral_seq 1.8.1.1 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -154,7 +154,7 @@ module ViralSeq
154
154
  s = ViralSeq::Sequence.new(name,seq)
155
155
  s.translate(rf_label)
156
156
  aa[name] = s.aa_string
157
- record = s.sdrm(:hiv_pr)
157
+ record = s.sdrm(:PI)
158
158
  mut_com << record
159
159
  record.each do |position,mutation|
160
160
  if mut[position]
@@ -386,7 +386,7 @@ module ViralSeq
386
386
  s = ViralSeq::Sequence.new(name,seq)
387
387
  s.translate(rf_label)
388
388
  aa[name] = s.aa_string
389
- record = s.sdrm(:hiv_in, start_codon_number)
389
+ record = s.sdrm(:INSTI, start_codon_number)
390
390
  mut_com << record
391
391
  record.each do |position,mutation|
392
392
  if mut[position]
@@ -460,5 +460,209 @@ module ViralSeq
460
460
  return [point_mutation_list, linkage_list, report_list]
461
461
  end
462
462
 
463
+
464
+ # wrapper function for #a3g_hypermut and #stop_codon with ViralSeq::DrmRegionConfig as a param.
465
+
466
# Screen the sequences with the #a3g and #stop_codon filters before DRM calling.
#
# Ungapped region (no "gap" entry in +seq_coord+): #a3g runs on the whole set
# and #stop_codon runs only on the sequences that passed #a3g, so the three
# returned sets are disjoint.
#
# Gapped region: every sequence is cut into an R1 part (the first
# +r1_length+ bases) and an R2 part (the following +r2_length+ bases). Both
# filters run on each part independently, and a sequence is flagged when
# either part is flagged. In this branch a sequence can appear in both
# :a3g_seq and :stop_codon_seq.
#
# @param region_config [ViralSeq::DrmRegionConfig] must respond to
#   #seq_coord, #get_reading_frame_number and, for gapped regions, #r1_r2_length
# @return [Hash] :filtered_seq, :a3g_seq and :stop_codon_seq, each a SeqHash
def filter_for_drm(region_config)
  seq_coord = region_config.seq_coord
  reading_frame_number = region_config.get_reading_frame_number

  unless seq_coord["gap"]
    a3g_check = self.a3g
    stop_codon_check = a3g_check[:filtered_seq].stop_codon(reading_frame_number[0])

    return {
      filtered_seq: stop_codon_check[:without_stop_codon],
      a3g_seq: a3g_check[:a3g_seq],
      stop_codon_seq: stop_codon_check[:with_stop_codon]
    }
  end

  r1_length, r2_length = region_config.r1_r2_length.values

  r1_seqs = {}
  r2_seqs = {}
  self.dna_hash.each do |name, seq|
    r1_seqs[name] = seq[0, r1_length]
    r2_seqs[name] = seq[r1_length, r2_length]
  end

  r1_sh = ViralSeq::SeqHash.new(r1_seqs)
  r2_sh = ViralSeq::SeqHash.new(r2_seqs)

  a3g_seqs_r1 = r1_sh.a3g[:a3g_seq]
  a3g_seqs_r2 = r2_sh.a3g[:a3g_seq]

  stop_codon_r1 = r1_sh.stop_codon(reading_frame_number[0])[:with_stop_codon]
  stop_codon_r2 = r2_sh.stop_codon(reading_frame_number[1])[:with_stop_codon]

  # Hash lookups keep the three passes over dna_hash linear; scanning a key
  # array for every sequence would be quadratic in the number of flagged reads.
  a3g_names = {}
  (a3g_seqs_r1.dna_hash.keys + a3g_seqs_r2.dna_hash.keys).each { |name| a3g_names[name] = true }

  stop_codon_names = {}
  (stop_codon_r1.dna_hash.keys + stop_codon_r2.dna_hash.keys).each { |name| stop_codon_names[name] = true }

  # Rebuild every result set from the full-length sequences, not the R1/R2 parts.
  a3g_seqs = ViralSeq::SeqHash.new(self.dna_hash.select { |name, _seq| a3g_names.key?(name) })
  stop_codon_seqs = ViralSeq::SeqHash.new(self.dna_hash.select { |name, _seq| stop_codon_names.key?(name) })
  filtered_seqs = ViralSeq::SeqHash.new(
    self.dna_hash.reject { |name, _seq| a3g_names.key?(name) || stop_codon_names.key?(name) }
  )

  {
    filtered_seq: filtered_seqs,
    a3g_seq: a3g_seqs,
    stop_codon_seq: stop_codon_seqs
  }
end # end of #filter_for_drm
526
+
527
+
528
+ # insert the partial genome into the whole gene for HIV resistance analysis
529
+
530
+
531
# Pad every sequence with reference bases so that it spans the whole
# reference window given by +ref_info["ref_coord"]+.
#
# All coordinates are 1-based and inclusive. For a gapped region the part of
# each sequence before the gap is taken to be (gap minimum - seq minimum)
# bases long; the gap itself is filled from the reference.
#
# @param region_config [ViralSeq::DrmRegionConfig] must respond to
#   #seq_coord ("minimum", "maximum", optional "gap" with "minimum"/"maximum")
#   and #ref_info ("ref_type", "ref_coord" => [start, end])
# @return [ViralSeq::SeqHash] the padded sequences, keyed by the original names
def complete_with_ref(region_config)
  seq_coord = region_config.seq_coord
  ref_info = region_config.ref_info

  ref = ViralSeq::RefSeq.get(ref_info["ref_type"].to_sym)
  ref_start = ref_info["ref_coord"][0]
  ref_end = ref_info["ref_coord"][1]
  seq_start = seq_coord["minimum"]
  seq_end = seq_coord["maximum"]

  # The reference flanks are the same for every sequence, so slice them once.
  # Exclusive ranges are used because an inclusive end of (seq_start - 2)
  # becomes -1 when seq_start is 1, and -1 means "to the end of the string",
  # which would prepend the whole reference instead of nothing.
  upstream = ref[(ref_start - 1)...(seq_start - 1)]
  downstream = ref[seq_end...ref_end]

  complete_seqs = {}
  gap = seq_coord["gap"]

  if gap
    gap_start = gap["minimum"]
    gap_end = gap["maximum"]
    r1_size = gap_start - seq_start
    gap_fill = ref[(gap_start - 1)...gap_end]

    self.dna_hash.each do |name, seq|
      complete_seqs[name] = upstream + seq[0, r1_size] + gap_fill + seq[r1_size..-1] + downstream
    end
  else
    self.dna_hash.each do |name, seq|
      complete_seqs[name] = upstream + seq + downstream
    end
  end

  ViralSeq::SeqHash.new(complete_seqs)
end # end of #complete_with_ref
556
+
557
+
558
+ # function to interpret HIV drms with ViralSeq::DrmRegionConfig as a param.
559
+
560
# Call drug-resistance mutations for one region described by a
# ViralSeq::DrmRegionConfig.
#
# Steps: run #fdr on the raw sequences, pad them with #complete_with_ref,
# translate each padded sequence, check it against every DRM class in
# +region_config.drm_list+, then tabulate the results.
#
# @param region_config [ViralSeq::DrmRegionConfig] must respond to #region and
#   #drm_list (a Hash of drm_class => list accepted by Sequence#check_drm), plus
#   whatever #complete_with_ref reads
# @return [Array<Array>] three lists:
#   1. point mutations, one row per (class, position, mutation):
#      [drm_class, n_seq, position, wild_type, mutation, count, mean, lower,
#      upper, fdr, label], sorted by position; label is "*" when fdr >= 0.05
#   2. linkage patterns, most frequent first:
#      [region, n_seq, pattern, count, mean, lower, upper, ""]; a sequence
#      without any DRM has pattern "WT"
#   3. amino-acid report, one row per position (1-based):
#      [region, position, n_seq, percent for each ViralSeq::AMINO_ACID_LIST entry]
#   All three lists are empty when there are no sequences.
def drm(region_config)
  region = region_config.region
  fdr_hash = self.fdr # must run fdr before the completion of the sequences

  complete_gene = self.complete_with_ref(region_config)
  sequences = complete_gene.dna_hash
  n_seq = sequences.size
  drm_list = region_config.drm_list

  aa = {}       # sequence name => amino-acid string
  mut = {}      # drm_class => { position => [wild_type, [observed mutations]] }
  mut_com = []  # per sequence: { position => [wild_type, mutation] }, sorted by position

  sequences.each do |name, seq|
    s = ViralSeq::Sequence.new(name, seq)
    s.translate
    aa[name] = s.aa_string

    records_per_seq = {}

    drm_list.each do |drm_class, list|
      mut[drm_class] ||= {}

      record = s.check_drm(list)
      records_per_seq = records_per_seq.merge(record)

      record.each do |position, mutation|
        mut[drm_class][position] ||= [mutation[0], []]
        mut[drm_class][position][1] << mutation[1]
      end
    end

    mut_com << records_per_seq.sort.to_h
  end

  # --- point mutations ---
  point_mutation_list = []
  mut.each do |drm_class, mutations|
    mutations.each do |position, mutation|
      wt, mut_list = mutation
      mut_list.count_freq.each do |m, number|
        ci = ViralSeq::Math::BinomCI.new(number, n_seq)
        fdr = fdr_hash[number].round(5)
        label = fdr >= 0.05 ? "*" : ""
        point_mutation_list << [drm_class, n_seq, position, wt, m, number,
                                ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), fdr, label]
      end
    end
  end
  point_mutation_list.sort_by! { |record| record[2] }

  # --- linkage: how often each combination of mutations occurs together ---
  link2 = {}
  mut_com.count_freq.each do |combination, count|
    pattern =
      if combination.empty?
        ['WT']
      else
        combination.map { |position, mutation| mutation[0] + position.to_s + mutation[1] }
      end
    link2[pattern.join("+")] = count
  end

  linkage_list = []
  link2.sort_by { |_pattern, count| count }.reverse.each do |pattern, count|
    ci = ViralSeq::Math::BinomCI.new(count, n_seq)
    label = ""
    linkage_list << [region, n_seq, pattern, count,
                     ci.mean.round(5), ci.lower.round(5), ci.upper.round(5), label]
  end

  # --- per-position amino-acid percentages ---
  report_list = []
  aa_strings = aa.values

  # With no sequences there is no first string to take the length from.
  unless aa_strings.empty?
    # The first translated sequence fixes the number of reported positions.
    (0...aa_strings[0].size).each do |index|
      counts = aa_strings.map { |aa_string| aa_string[index] }.count_freq
      record = [region, index + 1, n_seq]
      ViralSeq::AMINO_ACID_LIST.each do |amino_acid|
        record << (counts[amino_acid].to_f / n_seq * 100).round(4)
      end
      report_list << record
    end
  end

  [point_mutation_list, linkage_list, report_list]
end
665
+
463
666
  end # end of ViralSeq::SeqHash
667
+
464
668
  end # end of ViralSeq
@@ -1,109 +1,44 @@
1
1
  module ViralSeq
2
2
  class DRMs
3
- class << self
4
3
 
5
- # function to retrieve sdrm positions as a hash
6
- # @param ref_option [Symbol], name of reference genomes, options are `:hiv_pr`, `:hiv_rt`, `:hiv_in`, `hcv_ns5a`
7
- # @return [Hash] Hash of :position_number => [ 'wildtype_codon', ['mutation_codons']]
8
- def sdrm_hash(options)
4
+ # function to retrieve sdrm positions as a hash, DRM list are stored at `lib/viral_seq/util/drm_list.json`
5
+ # @param ref_option [Symbol], name of reference genomes, options are `:hiv_pr`, `:hiv_rt`, `:hiv_in`, `hcv_ns5a`
6
+ # @return [Hash] Hash of :position_number => [ 'wildtype_codon', ['mutation_codons']]
7
+
8
+ def self.sdrm_hash(options)
9
+ options = options.to_s.upcase
10
+ drm_data = JSON.parse(
11
+ File.read(
12
+ File.join(ViralSeq.root, 'viral_seq', 'util', 'drm_list.json')
13
+ )
14
+ )
15
+ if drm_data[options]
9
16
  sdrm = {}
10
- case options
11
- when :hcv_ns5a
12
- sdrm[28] = ['M',['T']]
13
- sdrm[30] = ['L',['H','K','R','Q','A','S','D']]
14
- sdrm[31] = ['L',['M','V','F']]
15
- sdrm[32] = ['P',['L']]
16
- sdrm[44] = ['K',['R']]
17
- sdrm[58] = ['H',['D','P','S']]
18
- sdrm[64] = ['T',['A','S']]
19
- sdrm[77] = ['P',['A','S']]
20
- sdrm[78] = ['R',['K']]
21
- sdrm[79] = ['T',['A']]
22
- sdrm[83] = ['T',['M']]
23
- sdrm[85] = ['S',['N','H','Y']]
24
- sdrm[92] = ['A',['P','T','K','E']]
25
- sdrm[93] = ['Y',['C','F','H','N']]
26
- sdrm[107] = ['K',['T','S']]
27
- sdrm[121] = ['I',['V']]
28
- sdrm[135] = ['T',['A']]
29
- when :nrti
30
- sdrm[41] = ['M',['L']]
31
- sdrm[65] = ['K',['R']]
32
- sdrm[67] = ['D',['N','G','E']]
33
- sdrm[69] = ['T',['D']]
34
- sdrm[70] = ['K',['R','E']]
35
- sdrm[74] = ['L',['V','I']]
36
- sdrm[75] = ['V',['M','T','A','S']]
37
- sdrm[77] = ['F',['L']]
38
- sdrm[115] = ['Y',['F']]
39
- sdrm[116] = ['F',['Y']]
40
- sdrm[151] = ['Q',['M']]
41
- sdrm[184] = ['M',['V','I']]
42
- sdrm[210] = ['L',['W']]
43
- sdrm[215] = ["T",["Y","F","I","C","D","V","E"]]
44
- sdrm[219] = ["K",["Q","E","N","R"]]
45
- when :nnrti
46
- sdrm[100] = ['L',['I']]
47
- sdrm[101] = ['K',['E','P']]
48
- sdrm[103] = ['K',['N','S']]
49
- sdrm[106] = ['V',['M','A']]
50
- sdrm[179] = ['V',['F','D']]
51
- sdrm[181] = ['Y',['C','I','V']]
52
- sdrm[188] = ['Y',['L','H','C']]
53
- sdrm[190] = ['G',['A','S','E']]
54
- sdrm[225] = ['P',['H']]
55
- sdrm[230] = ['M',['L']]
56
- when :hiv_pr
57
- sdrm[23] = ['L',['I']]
58
- sdrm[24] = ['L',['I']]
59
- sdrm[30] = ['D',['N']]
60
- sdrm[32] = ['V',['I']]
61
- sdrm[46] = ['M',['I','L']]
62
- sdrm[47] = ['I',['V','A']]
63
- sdrm[48] = ['G',['V','M']]
64
- sdrm[50] = ['I',['V','L']]
65
- sdrm[53] = ['F',['L']]
66
- sdrm[54] = ['I',['V','L','M','T','A','S']]
67
- sdrm[73] = ['G',['S','T','C','A']]
68
- sdrm[76] = ['L',['V']]
69
- sdrm[82] = ['V',['A','T','S','F','L','C','M']]
70
- sdrm[83] = ['N',['D']]
71
- sdrm[84] = ['I',['V','A','C']]
72
- sdrm[88] = ['N',['D','S']]
73
- sdrm[90] = ['L',['M']]
74
- when :hiv_in
75
- sdrm[66] = ['T',['A','I','K']]
76
- sdrm[74] = ['L',['M']]
77
- sdrm[92] = ['E',['Q']]
78
- sdrm[95] = ['Q',['K']]
79
- sdrm[97] = ['T',['A']]
80
- sdrm[121] = ['F',['Y']]
81
- sdrm[140] = ['G',['A','S','C']]
82
- sdrm[143] = ["Y",["C","H","R"]]
83
- sdrm[147] = ['S',['G']]
84
- sdrm[148] = ['Q',['H','K','R']]
85
- sdrm[155] = ['N',['S','H']]
86
- else raise "Input option `#{options}` for ViralSeq::Sequence.sdrm not supported"
17
+ drm_data[options].each do |record|
18
+ sdrm[record["position"]] = [record["wild-type"], record["mutations"]]
87
19
  end
88
- return sdrm
89
- end # end of #sdrm_hash
90
20
 
91
- # function to export SDRM positions as json object
92
- # @param (see #sdrm_hash)
93
- # @return [Array] json Array of SDRM positions
21
+ else
22
+ abort "Input option `#{options}` for ViralSeq::DRMs.sdrm_hash not supported. Program aborted.\nSupported type of mutations for '#{drm_data.keys.join(", ")}' only."
23
+ end
24
+ return sdrm
25
+ end # end of #sdrm_hash
26
+
27
+ # function to export SDRM positions as json object
28
+ # @param (see #sdrm_hash)
29
+ # @return [Array] json Array of SDRM positions
94
30
 
95
- def sdrm_json(options)
96
- sdrm = ViralSeq::DRMs.sdrm_hash(options)
97
- json_array = []
98
- sdrm.each do |pos, muts|
99
- mutation = {}
100
- mutation[:position] = pos
101
- mutation[:wildtypeCodon] = muts[0]
102
- mutation[:mutationCodons] = muts[1]
103
- json_array << mutation
104
- end
105
- return json_array
31
+ def self.sdrm_json(options)
32
+ sdrm = ViralSeq::DRMs.sdrm_hash(options)
33
+ json_array = []
34
+ sdrm.each do |pos, muts|
35
+ mutation = {}
36
+ mutation[:position] = pos
37
+ mutation[:wildtypeCodon] = muts[0]
38
+ mutation[:mutationCodons] = muts[1]
39
+ json_array << mutation
106
40
  end
107
- end
41
+ return json_array
42
+ end #end of #sdrm_json
108
43
  end
109
44
  end
@@ -136,7 +136,32 @@ module ViralSeq
136
136
  end
137
137
  end
138
138
  return out_hash
139
- end # end of #hcv_ns5a
139
+ end # end of #sdrm
140
+
141
+ # Similar to #sdrm, but takes a DRM list as a parameter
142
+
143
# Check the translated sequence against one DRM list.
#
# Each residue from #aa_array is split into single characters and
# intersected with the listed mutations, so a multi-character residue is
# handled the same way as a single one. The detected mutations are joined
# into one string.
#
# @param drm_list_single_type [Hash] position (1-based Integer) =>
#   [wild_type_aa, [mutation_aa, ...]]
# @return [Hash] position => [wild_type_aa, detected_mutations]; only
#   positions where at least one listed mutation was found are present.
#   Positions below 1 or beyond the end of the sequence are skipped.
def check_drm(drm_list_single_type)
  aa_array = self.aa_array
  out_hash = {}

  drm_list_single_type.each do |position, (wt_aa, mut_aas)|
    # Positions are 1-based; index -1 would silently read the last residue.
    next if position < 1

    test_aa = aa_array[position - 1]
    # The list may cover positions the sequence does not reach.
    next if test_aa.nil?

    mut_detected = test_aa.split("") & mut_aas
    out_hash[position] = [wt_aa, mut_detected.join] unless mut_detected.empty?
  end

  out_hash
end
140
165
 
141
166
  # HIV sequence locator function, resembling HIV Sequence Locator from LANL
142
167
  # # current version only supports nucleotide sequence, not for amino acid sequence.
@@ -126,7 +126,7 @@ module ViralSeq
126
126
  name_array.each do |name|
127
127
  tag = parser_file_name(name)[:tag]
128
128
  if name !~ /\.fastq\Z|\.fastq\.gz\Z/
129
- errors[:file_type_error] << name
129
+ name_array.delete(name)
130
130
  elsif tag.count("R1") == 0 and tag.count("R2") == 0
131
131
  errors[:no_region_tag] << name
132
132
  elsif tag.count("R1") > 0 and tag.count("R2") > 0
@@ -5,7 +5,7 @@ module ViralSeq
5
5
  # run `tcs --dr_params [VERSION]` to pull the params json string for each version of DR.
6
6
  module TcsDr
7
7
  PARAMS = {
8
- "v1" => {:platform_error_rate=>0.02,
8
+ "v1" => {:platform_error_rate=>0.01,
9
9
  :primer_pairs=>
10
10
  [{:region=>"RT",
11
11
  :cdna=>
@@ -68,7 +68,7 @@ module ViralSeq
68
68
  :ref_end=>7205..7210,
69
69
  :indel=>true,
70
70
  :trim=>false},
71
- {:region=>"P17",
71
+ {:region=>"CA",
72
72
  :cdna=>
73
73
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCAACAAGGTTTCTGTCATCCAATTTTTTAC",
74
74
  :forward=>
@@ -86,7 +86,7 @@ module ViralSeq
86
86
  ]
87
87
  },
88
88
 
89
- "v2" => {:platform_error_rate=>0.02,
89
+ "v2" => {:platform_error_rate=>0.01,
90
90
  :primer_pairs=>
91
91
  [{:region=>"RT",
92
92
  :cdna=>
@@ -149,7 +149,7 @@ module ViralSeq
149
149
  :ref_end=>7205..7210,
150
150
  :indel=>true,
151
151
  :trim=>false},
152
- {:region=>"P17",
152
+ {:region=>"CA",
153
153
  :cdna=>
154
154
  "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCAACAAGGTTTCTGTCATCCAATTTTTTAC",
155
155
  :forward=>
@@ -165,7 +165,89 @@ module ViralSeq
165
165
  :indel=>true,
166
166
  :trim=>false}
167
167
  ]
168
- }
168
+ },
169
+
170
+ "v3" => {:platform_error_rate=>0.01,
171
+ :primer_pairs=>
172
+ [{:region=>"RT",
173
+ :cdna=>
174
+ "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTAAGGAATGGAGGTTCTTTCTGATG",
175
+ :forward=>
176
+ "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
177
+ :majority=>0,
178
+ :end_join=>true,
179
+ :end_join_option=>1,
180
+ :overlap=>0,
181
+ :TCS_QC=>true,
182
+ :ref_genome=>"HXB2",
183
+ :ref_start=>2648,
184
+ :ref_end=>3209,
185
+ :indel=>true,
186
+ :trim=>false},
187
+ {:region=>"PR",
188
+ :cdna=>
189
+ "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNTTAACCTTTGGGCCATCCATTCC",
190
+ :forward=>
191
+ "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
192
+ :majority=>0,
193
+ :end_join=>true,
194
+ :end_join_option=>3,
195
+ :TCS_QC=>true,
196
+ :ref_genome=>"HXB2",
197
+ :ref_start=>0,
198
+ :ref_end=>2591,
199
+ :indel=>true,
200
+ :trim=>true,
201
+ :trim_ref=>"HXB2",
202
+ :trim_ref_start=>2253,
203
+ :trim_ref_end=>2549},
204
+ {:region=>"IN",
205
+ :cdna=>
206
+ "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCACAATCAKCACCTGCCATCTG",
207
+ :forward=>"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGCAGAAGTTATYCCAGCAGAAACA",
208
+ :majority=>0,
209
+ :end_join=>true,
210
+ :end_join_option=>3,
211
+ :TCS_QC=>true,
212
+ :ref_genome=>"HXB2",
213
+ :ref_start=>4509,
214
+ :ref_end=>5048,
215
+ :indel=>true,
216
+ :trim=>false},
217
+ {:region=>"V1V3",
218
+ :cdna=>
219
+ "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
220
+ :forward=>
221
+ "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
222
+ :majority=>0,
223
+ :end_join=>true,
224
+ :end_join_option=>1,
225
+ :overlap=>0,
226
+ :TCS_QC=>true,
227
+ :ref_genome=>"HXB2",
228
+ :ref_start=>6585,
229
+ :ref_end=>7205..7210,
230
+ :indel=>true,
231
+ :trim=>false},
232
+ {:region=>"CA",
233
+ :cdna=>
234
+ "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCAACAAGGTTTCTGTCATCCAATTTTTTAC",
235
+ :forward=>
236
+ "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGTCAGCCAAAATTACCCTATAGTGC",
237
+ :majority=>0,
238
+ :end_join=>true,
239
+ :end_join_option=>1,
240
+ :overlap=>0,
241
+ :TCS_QC=>true,
242
+ :ref_genome=>"HXB2",
243
+ :ref_start=>1196,
244
+ :ref_end=>1725,
245
+ :indel=>true,
246
+ :trim=>false}
247
+ ]
248
+ },
249
+
250
+
169
251
  }
170
252
 
171
253
  end