viral_seq 1.0.6 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/viral_seq.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2019 Shuntai Zhou (shuntai.zhou@gmail.com)
1
+ # Copyright (c) 2020 Shuntai Zhou (shuntai.zhou@gmail.com)
2
2
  #
3
3
  # Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  # of this software and associated documentation files (the "Software"), to deal
@@ -35,5 +35,8 @@ require_relative "viral_seq/seq_hash_pair"
35
35
  require_relative "viral_seq/sequence"
36
36
  require_relative "viral_seq/string"
37
37
  require_relative "viral_seq/version"
38
+ require_relative "viral_seq/tcs_core"
39
+ require_relative "viral_seq/tcs_json"
40
+
38
41
 
39
42
  require "muscle_bio"
@@ -1,7 +1,11 @@
1
1
  module ViralSeq
2
-
2
+
3
3
  # array for all amino acid one letter abbreviations
4
4
 
5
5
  AMINO_ACID_LIST = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*"]
6
6
 
7
+ SDRM_HIV_PR_LIST = {}
8
+ SDRM_HIV_RT_LIST = {}
9
+ SDRM_HIV_IN_LIST = {}
10
+
7
11
  end
@@ -1,4 +1,4 @@
1
- # addition methods for Class::Hash required for ViralSeq
1
+ # additional methods for Class::Hash required for ViralSeq
2
2
 
3
3
  class Hash
4
4
 
@@ -1,6 +1,6 @@
1
1
 
2
2
  module ViralSeq
3
- class SeqHash
3
+ class SDRM
4
4
 
5
5
  # functions to identify SDRMs from a ViralSeq::SeqHash object at HIV PR region.
6
6
  # works for MPID-DR protocol (dx.doi.org/10.17504/protocols.io.useewbe)
@@ -39,8 +39,8 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = temp_dir + "/_temp_muscle_in"
43
- temp_aln = temp_dir + "/_temp_muscle_aln"
42
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
43
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
44
44
  name = ">test"
45
45
  temp_in = File.open(temp_file,"w")
46
46
  temp_in.puts ">ref"
@@ -0,0 +1,43 @@
1
+ module ViralSeq
2
+ class DRMs
3
+ def initialize (mutation_list = {})
4
+ @mutation_list = mutation_list
5
+ end
6
+
7
+ attr_accessor :mutation_list
8
+ end
9
+
10
+ def self.sdrm_hiv_pr(seq_hash)
11
+ end
12
+
13
+ def self.sdrm_hiv_rt(seq_hash)
14
+ end
15
+
16
+ def self.sdrm_hiv_in(seq_hash)
17
+ end
18
+
19
+ def self.list_from_json(file)
20
+ end
21
+
22
+ def self.list_from_csv(file)
23
+ end
24
+
25
+ def self.export_list_hiv_pr(file, format = :json)
26
+ if format == :json
27
+
28
+ end
29
+ end
30
+
31
+ def self.export_list_hiv_rt(file, format = :json)
32
+
33
+ end
34
+
35
+ def self.export_list_hiv_in(file, format = :json)
36
+
37
+ end
38
+
39
+ def drm_analysis(seq_hash)
40
+ mutation_list = self.mutation_list
41
+
42
+ end
43
+ end
@@ -9,7 +9,7 @@ module ViralSeq
9
9
  # # align with MUSCLE
10
10
  # filtered_seqhash = aligned_pr_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
11
11
  # # filter nt sequences with the reference coordinates
12
- # filtered_seqhash = aligned_pr_seqhash.stop_codon[1]
12
+ # filtered_seqhash = aligned_pr_seqhash.stop_codon[:without_stop_codon]
13
13
  # # return a new ViralSeq::SeqHash object without stop codons
14
14
  # filtered_seqhash = filtered_seqhash.a3g[1]
15
15
  # # further filter out sequences with A3G hypermutations
@@ -130,8 +130,8 @@ module ViralSeq
130
130
  end
131
131
  end
132
132
  end
133
- sequence_hash = Hash[*sequence_a]
134
- quality_hash = Hash[*quality_a]
133
+ sequence_hash = Hash[sequence_a.each_slice(2).to_a]
134
+ quality_hash = Hash[quality_a.each_slice(2).to_a]
135
135
 
136
136
  seq_hash = ViralSeq::SeqHash.new
137
137
  seq_hash.dna_hash = sequence_hash
@@ -181,6 +181,7 @@ module ViralSeq
181
181
  new_seqhash = ViralSeq::SeqHash.new
182
182
  new_seqhash.dna_hash = self.dna_hash.merge(sh2.dna_hash)
183
183
  new_seqhash.aa_hash = self.aa_hash.merge(sh2.aa_hash)
184
+ new_seqhash.qc_hash = self.qc_hash.merge(sh2.qc_hash)
184
185
  new_seqhash.title = self.title + "_with_" + sh2.title
185
186
  new_seqhash.file = self.file + "," + sh2.file
186
187
  return new_seqhash
@@ -248,10 +249,12 @@ module ViralSeq
248
249
  def translate(codon_position = 0)
249
250
  seqs = self.dna_hash
250
251
  @aa_hash = {}
251
- seqs.each do |name, seq|
252
- s = ViralSeq::Sequence.new(name, seq)
252
+ seqs.uniq_hash.each do |seq, array_of_name|
253
+ s = ViralSeq::Sequence.new('name', seq)
253
254
  s.translate(codon_position)
254
- @aa_hash[name] = s.aa_string
255
+ array_of_name.each do |name|
256
+ @aa_hash[name] = s.aa_string
257
+ end
255
258
  end
256
259
  return nil
257
260
  end # end of #translate
@@ -310,41 +313,45 @@ module ViralSeq
310
313
 
311
314
  # screen for sequences with stop codons.
312
315
  # @param (see #translate)
313
- # @return [Array] of two elements [seqhash_stop_codon, seqhash_no_stop_codon],
316
+ # @return [Hash] of two SeqHash objects {with_stop_codon: seqHash, without_stop_codon: seqHash},
314
317
  #
315
- # # seqhash_stop_codon: ViralSeq::SeqHash object with stop codons
316
- # # seqhash_no_stop_codon: ViralSeq::SeqHash object without stop codons
318
+ # # :with_stop_codon : ViralSeq::SeqHash object with stop codons
319
+ # # :without_stop_codon: ViralSeq::SeqHash object without stop codons
317
320
  # @example given a hash of sequences, return a sub-hash containing only the sequences with stop codons
318
321
  # my_seqhash = ViralSeq::SeqHash.fa('my_fasta_file.fasta')
319
322
  # my_seqhash.dna_hash
320
323
  # => {">seq1"=>"ATAAGAACG", ">seq2"=>"ATATGAACG", ">seq3"=>"ATGAGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
321
- # stop_codon_seqhash = my_seqhash.stop_codon[0]
324
+ # stop_codon_seqhash = my_seqhash.stop_codon[:with_stop_codon]
322
325
  # stop_codon_seqhash.dna_hash
323
326
  # => {">seq2"=>"ATATGAACG", ">seq4"=>"TATTAGACG", ">seq5"=>"CGCTGAACG"}
324
327
  # stop_codon_seqhash.aa_hash
325
328
  # => {">seq2"=>"I*T", ">seq4"=>"Y*T", ">seq5"=>"R*T"}
326
329
  # stop_codon_seqhash.title
327
330
  # => "my_fasta_file_stop"
328
- # filtered_seqhash = my_seqhash.stop_codon[1]
331
+ # filtered_seqhash = my_seqhash.stop_codon[:without_stop_codon]
329
332
  # filtered_seqhash.aa_hash
330
333
  # {">seq1"=>"IRT", ">seq3"=>"MRT"}
331
334
 
332
335
  def stop_codon(codon_position = 0)
333
336
  self.translate(codon_position)
334
337
  keys = []
335
- self.aa_hash.each do |k,v|
336
- keys << k if v.include?('*')
338
+ aa_seqs = self.aa_hash
339
+ aa_seqs.uniq_hash.each do |seq,array_of_name|
340
+ keys += array_of_name if seq.include?('*')
337
341
  end
338
342
  seqhash1 = self.sub(keys)
339
343
  seqhash1.title = self.title + "_stop"
340
- keys2 = self.aa_hash.keys - keys
344
+ keys2 = aa_seqs.keys - keys
341
345
  seqhash2 = self.sub(keys2)
342
- return [seqhash1, seqhash2]
346
+ return {
347
+ with_stop_codon: seqhash1,
348
+ without_stop_codon: seqhash2
349
+ }
343
350
  end #end of #stop_codon
344
351
 
345
352
 
346
353
  # create one consensus sequence from @dna_hash with an optional majority cut-off for mixed bases.
347
- # @param cutoff [Float] majority cut-off for calling consensus bases. defult at simple majority (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off.
354
+ # @param cutoff [Float] majority cut-off for calling consensus bases. default at (0.5), position with 15% "A" and 85% "G" will be called as "G" with 20% cut-off and as "R" with 10% cut-off. Using (0) will use the simple majority rule (no cutoff)
348
355
  # @return [String] consensus sequence
349
356
  # @example consensus sequence from an array of sequences.
350
357
  # seq_array = %w{ ATTTTTTTTT
@@ -376,11 +383,18 @@ module ViralSeq
376
383
  base_count = all_base.count_freq
377
384
  max_base_list = []
378
385
 
379
- base_count.each do |k,v|
380
- if v/seq_size.to_f >= cutoff
381
- max_base_list << k
386
+ if cutoff.zero?
387
+ max_count = base_count.values.max
388
+ max_base_hash = base_count.select {|_k,v| v == max_count}
389
+ max_base_list = max_base_hash.keys
390
+ else
391
+ base_count.each do |k,v|
392
+ if v/seq_size.to_f >= cutoff
393
+ max_base_list << k
394
+ end
382
395
  end
383
396
  end
397
+
384
398
  consensus_seq += call_consensus_base(max_base_list)
385
399
  end
386
400
  return consensus_seq
@@ -391,14 +405,14 @@ module ViralSeq
391
405
  # # control pattern: G[YN|RC] -> A[YN|RC]
392
406
  # # use the sample consensus to determine potential a3g sites
393
407
  # # Two criteria to identify hypermutation
394
- # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positons vs. non-A3G positions
408
+ # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
395
409
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
396
410
  # # note: criteria 2 only applies on a sequence file containing more than 20 sequences,
397
411
  # # b/c Poisson model does not do well on small sample size.
398
- # @return [Array] three values.
399
- # first value, `array[0]`: a ViralSeq:SeqHash object for sequences with hypermutations
400
- # second value, `array[1]`: a ViralSeq:SeqHash object for sequences without hypermutations
401
- # third value, `array[2]`: a two-demensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
412
+ # @return [Hash] three pairs.
413
+ # :a3g_seq: a ViralSeq::SeqHash object for sequences with hypermutations
414
+ # :filtered_seq : a ViralSeq::SeqHash object for sequences without hypermutations
415
+ # :stats : a two-dimensional array `[[a,b], [c,d]]` for statistic_info, including the following information,
402
416
  # # sequence tag
403
417
  # # G to A mutation numbers at potential a3g positions
404
418
  # # total potential a3g G positions
@@ -409,17 +423,17 @@ module ViralSeq
409
423
  # @example identify apobec3gf mutations from a sequence fasta file
410
424
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence1.fasta')
411
425
  # hypermut = my_seqhash.a3g
412
- # hypermut[0].dna_hash.keys
426
+ # hypermut[:a3g_seq].dna_hash.keys
413
427
  # => [">Seq7", ">Seq14"]
414
- # hypermut[1].dna_hash.keys
428
+ # hypermut[:filtered_seq].dna_hash.keys
415
429
  # => [">Seq1", ">Seq2", ">Seq5"]
416
- # hypermut[2]
430
+ # hypermut[:stats]
417
431
  # => [[">Seq7", 23, 68, 1, 54, 18.26, 4.308329383112348e-06], [">Seq14", 45, 68, 9, 54, 3.97, 5.2143571971582974e-08]]
418
432
  #
419
433
  # @example identify apobec3gf mutations from another sequence fasta file
420
434
  # my_seqhash = ViralSeq::SeqHash.fa('spec/sample_files/sample_a3g_sequence2.fasta')
421
435
  # hypermut = my_seqhash.a3g
422
- # hypermut[2]
436
+ # hypermut[:stats]
423
437
  # => [[">CTAACACTCA_134_a3g-sample2", 4, 35, 0, 51, Infinity, 0.02465676660128911], [">ATAGTGCCCA_60_a3g-sample2", 4, 35, 1, 51, 5.83, 0.1534487353839561]]
424
438
  # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05,
425
439
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
@@ -512,7 +526,10 @@ module ViralSeq
512
526
  hm_seq_hash.title = self.title + "_hypermut"
513
527
  hm_seq_hash.file = self.file
514
528
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
515
- return [hm_seq_hash, filtered_seq_hash, hm_hash.values]
529
+ return { a3g_seq: hm_seq_hash,
530
+ filtered_seq: filtered_seq_hash,
531
+ stats: hm_hash.values
532
+ }
516
533
  end #end of #a3g_hypermut
517
534
 
518
535
  alias_method :a3g, :a3g_hypermut
@@ -532,7 +549,7 @@ module ViralSeq
532
549
  if sequences.size == 0
533
550
  return 0
534
551
  else
535
- cut_off = 1
552
+ cut_off = Float::INFINITY
536
553
  l = sequences[0].size
537
554
  rate = sequences.size * error_rate
538
555
  count_mut = variant_for_poisson(sequences)
@@ -541,7 +558,7 @@ module ViralSeq
541
558
 
542
559
  poisson_hash.each do |k,v|
543
560
  cal = l * v
544
- obs = count_mut[k] ? count_mut[k] : 0
561
+ obs = count_mut[k] ? count_mut[k] : 1
545
562
  if obs >= fold_cutoff * cal
546
563
  cut_off = k
547
564
  break
@@ -726,6 +743,7 @@ module ViralSeq
726
743
 
727
744
  seq_hash_unique.each do |seq|
728
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
+ next unless loc # if locator tool fails, skip this seq.
729
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
730
748
  if indel
731
749
  seq_hash_unique_pass << seq
@@ -904,11 +922,11 @@ module ViralSeq
904
922
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
905
923
  # @example gap strip for an array of sequences
906
924
  # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
907
- # array = { AACCGGTT
908
- # A-CCGGTT
909
- # AAC-GGTT
910
- # AACCG-TT
911
- # AACCGGT- }
925
+ # array = %w{ AACCGGTT
926
+ # A-CCGGTT
927
+ # AAC-GGTT
928
+ # AACCG-TT
929
+ # AACCGGT- }
912
930
  # my_seqhash = ViralSeq::SeqHash.array(array)
913
931
  # puts my_seqhash.gap_strip.dna_hash.values
914
932
  # ACGT
@@ -963,12 +981,11 @@ module ViralSeq
963
981
  # @param (see #gap_strip)
964
982
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
965
983
  # @example gap strip for an array of sequences only at the ends
966
- # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
967
- # array = { AACCGGTT
968
- # A-CCGGTT
969
- # AAC-GGTT
970
- # AACCG-TT
971
- # AACCGGT- }
984
+ # array = %w{ AACCGGTT
985
+ # A-CCGGTT
986
+ # AAC-GGTT
987
+ # AACCG-TT
988
+ # AACCGGT- }
972
989
  # my_seqhash = ViralSeq::SeqHash.array(array)
973
990
  # puts my_seqhash.gap_strip_ends.dna_hash.values
974
991
  # AACCGGT
@@ -1048,7 +1065,121 @@ module ViralSeq
1048
1065
  return new_seqhash
1049
1066
  end
1050
1067
 
1068
+ # return a table of frequencies of nucleotides at each position.
1069
+ # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
1070
+ # @param head [Boolean] if the head of table is included.
1071
+ # @return [Array] a two-dimension array of the frequency table,
1072
+ # including the following info:
1073
+ # position on the sequence (starting from 1)
1074
+ # consensus nucleotide
1075
+ # total sequence numbers
1076
+ # percentage of A, shows "-" if agrees with consensus
1077
+ # percentage of C, shows "-" if agrees with consensus
1078
+ # percentage of G, shows "-" if agrees with consensus
1079
+ # percentage of T, shows "-" if agrees with consensus
1080
+ #
1081
+ # @example error table for an array of sequences
1082
+ # array = %w{ AACCGGTT
1083
+ # AGCCGGTT
1084
+ # AACTGCTT
1085
+ # AACCGTTA
1086
+ # AACCGGTA }
1087
+ # my_seqhash = ViralSeq::SeqHash.array(array)
1088
+ # my_seqhash.error_table.each {|r| puts r.join(',')}
1089
+ # position,consensus,total_seq_number,A,C,G,T
1090
+ # 1,A,5,-,,,
1091
+ # 2,A,5,-,,0.2,
1092
+ # 3,C,5,,-,,
1093
+ # 4,C,5,,-,,0.2
1094
+ # 5,G,5,,,-,
1095
+ # 6,G,5,,0.2,-,0.2
1096
+ # 7,T,5,,,,-
1097
+ # 8,T,5,0.4,,,-
1098
+
1099
+ def error_table(ref = self.consensus, head = true)
1100
+
1101
+ table = []
1102
+ if head
1103
+ table << %w{
1104
+ position
1105
+ consensus
1106
+ total_seq_number
1107
+ A
1108
+ C
1109
+ G
1110
+ T
1111
+ }
1112
+ end
1113
+ ref_size = ref.size
1114
+
1115
+ (0..(ref_size - 1)).each do |position|
1116
+ ref_base = ref[position]
1117
+ nts = []
1118
+
1119
+ self.dna_hash.each do |_k,v|
1120
+ nts << v[position]
1121
+ end
1122
+
1123
+ freq = nts.count_freq
1124
+ freq2 = {}
1125
+
1126
+ freq.each do |nt,c|
1127
+ if nt == ref_base
1128
+ freq2[nt] = '-'
1129
+ else
1130
+ freq2[nt] = (c/(self.size).to_f)
1131
+ end
1132
+ end
1133
+
1134
+ table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
1135
+ end
1136
+
1137
+ return table
1051
1138
 
1139
+ end # end of error_table
1140
+
1141
+ # randomly select n number of sequences from the original SeqHash object
1142
+ # @param n [Integer] number of sequences to randomly select
1143
+ # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
1144
+
1145
+ def random_select(n = 100)
1146
+ new_sh = ViralSeq::SeqHash.new
1147
+ dna_hash = self.dna_hash
1148
+ aa_hash = self.aa_hash
1149
+ qc_hash = self.qc_hash
1150
+
1151
+ keys = dna_hash.keys.sample(n)
1152
+
1153
+ keys.each do |k|
1154
+ new_sh.dna_hash[k] = dna_hash[k]
1155
+ new_sh.aa_hash[k] = aa_hash[k]
1156
+ new_sh.qc_hash[k] = qc_hash[k]
1157
+ end
1158
+ new_sh.title = self.title + "_" + n.to_s
1159
+ return new_sh
1160
+ end
1161
+
1162
+ # trim dna sequences based on the provided reference coordinates.
1163
+ # @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1164
+ # @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be single number (Integer) or a range of Integers (Range), or an Array
1165
+ # @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
1166
+ # @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
1167
+ # @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with trimmed sequences
1168
+
1169
+ def trim(start_nt, end_nt, ref_option = :HXB2, path_to_muscle = false)
1170
+ seq_hash = self.dna_hash.dup
1171
+ seq_hash_unique = seq_hash.uniq_hash
1172
+ trimmed_seq_hash = {}
1173
+ seq_hash_unique.each do |seq, names|
1174
+ trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option, path_to_muscle).dna
1175
+ names.each do |name|
1176
+ trimmed_seq_hash[name] = trimmed_seq
1177
+ end
1178
+ end
1179
+ return_seq_hash = self.dup
1180
+ return_seq_hash.dna_hash = trimmed_seq_hash
1181
+ return return_seq_hash
1182
+ end
1052
1183
 
1053
1184
  # start of private functions
1054
1185
  private