viral_seq 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df8d50f2dfbf0f2e7e1efcf66c8a91c286c7b5029533b725a4a33219882748bb
4
- data.tar.gz: 4061c3875d4629025d1ccc216a54fdb7a011d397408a3ecb15125475e9f262e9
3
+ metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
4
+ data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
5
5
  SHA512:
6
- metadata.gz: a52087ced9fe258ef5bab4449b90e964ff9a557292dc1ce679aae03a56bd2570fdf1221e7026fec2b1ccb49ad2a9ff076338a397982e47c46877e2cdfb4e6d2e
7
- data.tar.gz: 792cb9424fd46d536d0b95cfc90914a8548ee5ea6d1c3efe45cccd1d01c6dbd6b7a7ee0ba1be010bd6cf7a3ea201f4850803c28f16889b5286e1e458a774c8f1
6
+ metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
7
+ data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85
@@ -1,12 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.0.5)
4
+ viral_seq (1.0.7)
5
+ colorize (~> 0.1)
5
6
  muscle_bio (~> 0.4)
6
7
 
7
8
  GEM
8
9
  remote: https://rubygems.org/
9
10
  specs:
11
+ colorize (0.8.1)
10
12
  diff-lcs (1.3)
11
13
  muscle_bio (0.4.0)
12
14
  rake (10.5.0)
@@ -34,4 +36,4 @@ DEPENDENCIES
34
36
  viral_seq!
35
37
 
36
38
  BUNDLED WITH
37
- 2.0.2
39
+ 2.1.4
data/README.md CHANGED
@@ -14,7 +14,7 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
14
14
 
15
15
  #!/usr/bin/env ruby
16
16
  require 'viral_seq'
17
-
17
+
18
18
  #### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
19
19
 
20
20
  $ locator -i sequence.fasta -o sequence.fasta.csv
@@ -51,6 +51,13 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
51
51
 
52
52
  ## Updates
53
53
 
54
+ Version 1.0.7-01282020:
55
+
56
+ 1. Several methods added, including
57
+ ViralSeq::SeqHash#error_table
58
+ ViralSeq::SeqHash#random_select
59
+ 2. Improved performance for several functions.
60
+
54
61
  Version 1.0.6-07232019:
55
62
 
56
63
  1. Several methods added to ViralSeq::SeqHash, including
@@ -58,7 +65,7 @@ Version 1.0.6-07232019:
58
65
  ViralSeq::SeqHash#+
59
66
  ViralSeq::SeqHash#write_nt_fa
60
67
  ViralSeq::SeqHash#mutation
61
- 2. Update documentations and rspec samples.
68
+ 2. Update documentations and rspec samples.
62
69
 
63
70
  Version 1.0.5-07112019:
64
71
 
@@ -3,13 +3,14 @@
3
3
  require 'viral_seq'
4
4
  require 'csv'
5
5
  require 'optparse'
6
+ require 'colorize'
6
7
 
7
8
  def myparser
8
9
  options = {}
9
10
  OptionParser.new do |opts|
10
- opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
11
+ opts.banner = "#{"Usage:".red.bold} locator #{"-i".blue.bold} [nt_sequence_fasta_file] #{"-o".blue.bold} [locator_info_csv_file] #{"-r".blue.bold} [reference_genome_option]"
11
12
 
12
- opts.on('-i', '--infile FASTA_FILE', 'nt sequence file in FASTA format') do |i|
13
+ opts.on('-i', '--infile FASTA_FILE', "#{"nt sequence".blue.bold} file in FASTA format") do |i|
13
14
  options[:infile] = i
14
15
  end
15
16
 
@@ -17,7 +18,7 @@ def myparser
17
18
  options[:outfile] = o
18
19
  end
19
20
 
20
- opts.on('-r', '--ref_option OPTION', 'reference genome option, choose from `HXB2` (default), `NL43`, `MAC239`') do |o|
21
+ opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `NL43`, `MAC239`".blue.bold}") do |o|
21
22
  options[:ref_option] = o.to_sym
22
23
  end
23
24
 
@@ -35,9 +36,9 @@ def myparser
35
36
  return options
36
37
  end
37
38
 
38
- puts "\nSequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION}) by Shuntai Zhou"
39
- puts "See details at https://github.com/ViralSeq/viral_seq\n"
40
- puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n\n"
39
+ puts "\n" + "Sequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION})".red.bold + " by " + "Shuntai Zhou".blue.bold
40
+ puts "See details at " + "https://github.com/ViralSeq/viral_seq\n".blue
41
+ puts "Resembling" + " Sequence Locator ".magenta.bold + "from LANL" + " (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n".blue
41
42
 
42
43
  ARGV << '-h' if ARGV.size == 0
43
44
 
@@ -47,7 +48,7 @@ begin
47
48
  if options[:infile]
48
49
  seq_file = options[:infile]
49
50
  else
50
- raise StandardError.new("Input file sequence file not found")
51
+ raise StandardError.new("Input file sequence file not found".red.bold)
51
52
  end
52
53
 
53
54
  if options[:outfile]
@@ -57,14 +58,14 @@ begin
57
58
  end
58
59
 
59
60
  unless File.exist?(seq_file)
60
- raise StandardError.new("Input file sequence file not found")
61
+ raise StandardError.new("Input file sequence file not found".red.bold)
61
62
  end
62
63
 
63
64
  seqs = ViralSeq::SeqHash.fa(seq_file)
64
65
  opt = options[:ref_option] ? options[:ref_option] : :HXB2
65
66
 
66
67
  unless [:HXB2, :NL43, :MAC239].include? opt
67
- puts "Reference option #{opt} not recognized, using `:HXB2` as the reference genome."
68
+ puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
68
69
  opt = :HXB2
69
70
  end
70
71
 
@@ -76,6 +77,7 @@ begin
76
77
  end
77
78
 
78
79
  File.write(csv_file, data)
80
+ puts "Output file found at #{csv_file.green.bold}"
79
81
  rescue StandardError => e
80
82
  puts e.message
81
83
  puts "\n"
@@ -39,8 +39,8 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = temp_dir + "/_temp_muscle_in"
43
- temp_aln = temp_dir + "/_temp_muscle_aln"
42
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
43
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
44
44
  name = ">test"
45
45
  temp_in = File.open(temp_file,"w")
46
46
  temp_in.puts ">ref"
@@ -248,10 +248,12 @@ module ViralSeq
248
248
  def translate(codon_position = 0)
249
249
  seqs = self.dna_hash
250
250
  @aa_hash = {}
251
- seqs.each do |name, seq|
252
- s = ViralSeq::Sequence.new(name, seq)
251
+ seqs.uniq_hash.each do |seq, array_of_name|
252
+ s = ViralSeq::Sequence.new('name', seq)
253
253
  s.translate(codon_position)
254
- @aa_hash[name] = s.aa_string
254
+ array_of_name.each do |name|
255
+ @aa_hash[name] = s.aa_string
256
+ end
255
257
  end
256
258
  return nil
257
259
  end # end of #translate
@@ -332,12 +334,13 @@ module ViralSeq
332
334
  def stop_codon(codon_position = 0)
333
335
  self.translate(codon_position)
334
336
  keys = []
335
- self.aa_hash.each do |k,v|
336
- keys << k if v.include?('*')
337
+ aa_seqs = self.aa_hash
338
+ aa_seqs.uniq_hash.each do |seq,array_of_name|
339
+ keys += array_of_name if seq.include?('*')
337
340
  end
338
341
  seqhash1 = self.sub(keys)
339
342
  seqhash1.title = self.title + "_stop"
340
- keys2 = self.aa_hash.keys - keys
343
+ keys2 = aa_seqs.keys - keys
341
344
  seqhash2 = self.sub(keys2)
342
345
  return [seqhash1, seqhash2]
343
346
  end #end of #stop_codon
@@ -904,11 +907,11 @@ module ViralSeq
904
907
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
905
908
  # @example gap strip for an array of sequences
906
909
  # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
907
- # array = { AACCGGTT
908
- # A-CCGGTT
909
- # AAC-GGTT
910
- # AACCG-TT
911
- # AACCGGT- }
910
+ # array = %w{ AACCGGTT
911
+ # A-CCGGTT
912
+ # AAC-GGTT
913
+ # AACCG-TT
914
+ # AACCGGT- }
912
915
  # my_seqhash = ViralSeq::SeqHash.array(array)
913
916
  # puts my_seqhash.gap_strip.dna_hash.values
914
917
  # ACGT
@@ -963,12 +966,11 @@ module ViralSeq
963
966
  # @param (see #gap_strip)
964
967
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
965
968
  # @example gap strip for an array of sequences only at the ends
966
- # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
967
- # array = { AACCGGTT
968
- # A-CCGGTT
969
- # AAC-GGTT
970
- # AACCG-TT
971
- # AACCGGT- }
969
+ # array = %w{ AACCGGTT
970
+ # A-CCGGTT
971
+ # AAC-GGTT
972
+ # AACCG-TT
973
+ # AACCGGT- }
972
974
  # my_seqhash = ViralSeq::SeqHash.array(array)
973
975
  # puts my_seqhash.gap_strip_ends.dna_hash.values
974
976
  # AACCGGT
@@ -1048,6 +1050,99 @@ module ViralSeq
1048
1050
  return new_seqhash
1049
1051
  end
1050
1052
 
1053
+ # return a table of frequencies of nucleotides at each position.
1054
+ # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
1055
+ # @param head [Boolean] whether the header row of the table is included.
1056
+ # @return [Array] a two-dimensional array of the frequency table,
1057
+ # including the following info:
1058
+ # position on the sequence (starting from 1)
1059
+ # consensus nucleotide
1060
+ # total sequence numbers
1061
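+ # fraction of A (e.g. 0.2), shows "-" if A is the reference (by default, consensus) base
1062
+ # fraction of C (e.g. 0.2), shows "-" if C is the reference (by default, consensus) base
1063
+ # fraction of G (e.g. 0.2), shows "-" if G is the reference (by default, consensus) base
1064
+ # fraction of T (e.g. 0.2), shows "-" if T is the reference (by default, consensus) base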
1065
+ #
1066
+ # @example error table for an array of sequences
1067
+ # array = %w{ AACCGGTT
1068
+ # AGCCGGTT
1069
+ # AACTGCTT
1070
+ # AACCGTTA
1071
+ # AACCGGTA }
1072
+ # my_seqhash = ViralSeq::SeqHash.array(array)
1073
+ # my_seqhash.error_table.each {|r| puts r.join(',')}
1074
+ # position,consensus,total_seq_number,A,C,G,T
1075
+ # 1,A,5,-,,,
1076
+ # 2,A,5,-,,0.2,
1077
+ # 3,C,5,,-,,
1078
+ # 4,C,5,,-,,0.2
1079
+ # 5,G,5,,,-,
1080
+ # 6,G,5,,0.2,-,0.2
1081
+ # 7,T,5,,,,-
1082
+ # 8,T,5,0.4,,,-
1083
+
1084
+ def error_table(ref = self.consensus, head = true)
1085
+
1086
+ table = []
1087
+ if head
1088
+ table << %w{
1089
+ position
1090
+ consensus
1091
+ total_seq_number
1092
+ A
1093
+ C
1094
+ G
1095
+ T
1096
+ }
1097
+ end
1098
+ ref_size = ref.size
1099
+
1100
+ (0..(ref_size - 1)).each do |position|
1101
+ ref_base = ref[position]
1102
+ nts = []
1103
+
1104
+ self.dna_hash.each do |_k,v|
1105
+ nts << v[position]
1106
+ end
1107
+
1108
+ freq = nts.count_freq
1109
+ freq2 = {}
1110
+
1111
+ freq.each do |nt,c|
1112
+ if nt == ref_base
1113
+ freq2[nt] = '-'
1114
+ else
1115
+ freq2[nt] = (c/(self.size).to_f)
1116
+ end
1117
+ end
1118
+
1119
+ table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
1120
+ end
1121
+
1122
+ return table
1123
+
1124
+ end # end of error_table
1125
+
1126
+ # randomly select n number of sequences from the original SeqHash object
1127
+ # @param n [Integer] number of sequences to randomly select
1128
+ # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
1129
+
1130
+ def random_select(n = 100)
1131
+ new_sh = ViralSeq::SeqHash.new
1132
+ dna_hash = self.dna_hash
1133
+ aa_hash = self.aa_hash
1134
+ qc_hash = self.qc_hash
1135
+
1136
+ keys = dna_hash.keys.sample(n)
1137
+
1138
+ keys.each do |k|
1139
+ new_sh.dna_hash[k] = dna_hash[k]
1140
+ new_sh.aa_hash[k] = aa_hash[k]
1141
+ new_sh.qc_hash[k] = qc_hash[k]
1142
+ end
1143
+ new_sh.title = self.title + "_" + n.to_s
1144
+ return new_sh
1145
+ end
1051
1146
 
1052
1147
 
1053
1148
  # start of private functions
@@ -7,7 +7,7 @@ module ViralSeq
7
7
  # @example join the paired-end sequences with an overlap of 100 bp
8
8
  # my_seqhashpair.join1(100)
9
9
  # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
- # my_seqhashpair.join1(:indiv)
10
+ # my_seqhashpair.join2(model: :indiv)
11
11
 
12
12
  class SeqHashPair
13
13
 
@@ -104,17 +104,21 @@ module ViralSeq
104
104
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
105
105
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
106
106
  joined_seq = {}
107
- seq_pair_hash.each do |seq_name, seq_pair|
107
+ seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
108
108
  r1_seq = seq_pair[0]
109
109
  r2_seq = seq_pair[1]
110
110
  if overlap.zero?
111
- joined_seq[seq_name] = r1_seq + r2_seq
111
+ joined_sequence = r1_seq + r2_seq
112
112
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
113
- joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
113
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
114
114
  else
115
115
  next
116
116
  end
117
+ seq_names.each do |seq_name|
118
+ joined_seq[seq_name] = joined_sequence
119
+ end
117
120
  end
121
+
118
122
  joined_seq_hash = ViralSeq::SeqHash.new
119
123
  joined_seq_hash.dna_hash = joined_seq
120
124
  joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +143,7 @@ module ViralSeq
139
143
  # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
140
144
  # my_seqhashpair.join2.dna_hash
141
145
  # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
142
- # my_seqhashpair.join2(model :indiv).dna_hash
146
+ # my_seqhashpair.join2(model: :indiv).dna_hash
143
147
  # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
144
148
 
145
149
  def join2(model: :con, diff: 0.0)
@@ -2,5 +2,5 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.0.6"
5
+ VERSION = "1.0.7"
6
6
  end
@@ -31,5 +31,9 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  # muscle_bio gem required
33
33
  spec.add_runtime_dependency "muscle_bio", "~> 0.4"
34
+
35
+ # colorize gem required
36
+ spec.add_runtime_dependency "colorize", "~> 0.1"
37
+
34
38
  spec.requirements << 'R required for some functions'
35
39
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.6
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-07-23 00:00:00.000000000 Z
12
+ date: 2020-01-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0.4'
70
+ - !ruby/object:Gem::Dependency
71
+ name: colorize
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '0.1'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '0.1'
70
84
  description: |-
71
85
  A Ruby Gem with bioinformatics tools for processing viral NGS data.
72
86
  Specifically for Primer-ID sequencing and HIV drug resistance analysis.
@@ -124,7 +138,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
124
138
  version: '0'
125
139
  requirements:
126
140
  - R required for some functions
127
- rubygems_version: 3.0.3
141
+ rubygems_version: 3.1.2
128
142
  signing_key:
129
143
  specification_version: 4
130
144
  summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.