viral_seq 1.0.6 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: df8d50f2dfbf0f2e7e1efcf66c8a91c286c7b5029533b725a4a33219882748bb
4
- data.tar.gz: 4061c3875d4629025d1ccc216a54fdb7a011d397408a3ecb15125475e9f262e9
3
+ metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
4
+ data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
5
5
  SHA512:
6
- metadata.gz: a52087ced9fe258ef5bab4449b90e964ff9a557292dc1ce679aae03a56bd2570fdf1221e7026fec2b1ccb49ad2a9ff076338a397982e47c46877e2cdfb4e6d2e
7
- data.tar.gz: 792cb9424fd46d536d0b95cfc90914a8548ee5ea6d1c3efe45cccd1d01c6dbd6b7a7ee0ba1be010bd6cf7a3ea201f4850803c28f16889b5286e1e458a774c8f1
6
+ metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
7
+ data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85
@@ -1,12 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.0.5)
4
+ viral_seq (1.0.7)
5
+ colorize (~> 0.1)
5
6
  muscle_bio (~> 0.4)
6
7
 
7
8
  GEM
8
9
  remote: https://rubygems.org/
9
10
  specs:
11
+ colorize (0.8.1)
10
12
  diff-lcs (1.3)
11
13
  muscle_bio (0.4.0)
12
14
  rake (10.5.0)
@@ -34,4 +36,4 @@ DEPENDENCIES
34
36
  viral_seq!
35
37
 
36
38
  BUNDLED WITH
37
- 2.0.2
39
+ 2.1.4
data/README.md CHANGED
@@ -14,7 +14,7 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
14
14
 
15
15
  #!/usr/bin/env ruby
16
16
  require 'viral_seq'
17
-
17
+
18
18
  #### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
19
19
 
20
20
  $ locator -i sequence.fasta -o sequence.fasta.csv
@@ -51,6 +51,13 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
51
51
 
52
52
  ## Updates
53
53
 
54
+ Version 1.0.7-01282020:
55
+
56
+ 1. Several methods added, including
57
+ ViralSeq::SeqHash#error_table
58
+ ViralSeq::SeqHash#random_select
59
+ 2. Improved performance for several functions.
60
+
54
61
  Version 1.0.6-07232019:
55
62
 
56
63
  1. Several methods added to ViralSeq::SeqHash, including
@@ -58,7 +65,7 @@ Version 1.0.6-07232019:
58
65
  ViralSeq::SeqHash#+
59
66
  ViralSeq::SeqHash#write_nt_fa
60
67
  ViralSeq::SeqHash#mutation
61
- 2. Update documentations and rspec samples.
68
+ 2. Update documentations and rspec samples.
62
69
 
63
70
  Version 1.0.5-07112019:
64
71
 
@@ -3,13 +3,14 @@
3
3
  require 'viral_seq'
4
4
  require 'csv'
5
5
  require 'optparse'
6
+ require 'colorize'
6
7
 
7
8
  def myparser
8
9
  options = {}
9
10
  OptionParser.new do |opts|
10
- opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
11
+ opts.banner = "#{"Usage:".red.bold} locator #{"-i".blue.bold} [nt_sequence_fasta_file] #{"-o".blue.bold} [locator_info_csv_file] #{"-r".blue.bold} [reference_genome_option]"
11
12
 
12
- opts.on('-i', '--infile FASTA_FILE', 'nt sequence file in FASTA format') do |i|
13
+ opts.on('-i', '--infile FASTA_FILE', "#{"nt sequence".blue.bold} file in FASTA format") do |i|
13
14
  options[:infile] = i
14
15
  end
15
16
 
@@ -17,7 +18,7 @@ def myparser
17
18
  options[:outfile] = o
18
19
  end
19
20
 
20
- opts.on('-r', '--ref_option OPTION', 'reference genome option, choose from `HXB2` (default), `NL43`, `MAC239`') do |o|
21
+ opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `NL43`, `MAC239`".blue.bold}") do |o|
21
22
  options[:ref_option] = o.to_sym
22
23
  end
23
24
 
@@ -35,9 +36,9 @@ def myparser
35
36
  return options
36
37
  end
37
38
 
38
- puts "\nSequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION}) by Shuntai Zhou"
39
- puts "See details at https://github.com/ViralSeq/viral_seq\n"
40
- puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n\n"
39
+ puts "\n" + "Sequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION})".red.bold + " by " + "Shuntai Zhou".blue.bold
40
+ puts "See details at " + "https://github.com/ViralSeq/viral_seq\n".blue
41
+ puts "Resembling" + " Sequence Locator ".magenta.bold + "from LANL" + " (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n".blue
41
42
 
42
43
  ARGV << '-h' if ARGV.size == 0
43
44
 
@@ -47,7 +48,7 @@ begin
47
48
  if options[:infile]
48
49
  seq_file = options[:infile]
49
50
  else
50
- raise StandardError.new("Input file sequence file not found")
51
+ raise StandardError.new("Input file sequence file not found".red.bold)
51
52
  end
52
53
 
53
54
  if options[:outfile]
@@ -57,14 +58,14 @@ begin
57
58
  end
58
59
 
59
60
  unless File.exist?(seq_file)
60
- raise StandardError.new("Input file sequence file not found")
61
+ raise StandardError.new("Input file sequence file not found".red.bold)
61
62
  end
62
63
 
63
64
  seqs = ViralSeq::SeqHash.fa(seq_file)
64
65
  opt = options[:ref_option] ? options[:ref_option] : :HXB2
65
66
 
66
67
  unless [:HXB2, :NL43, :MAC239].include? opt
67
- puts "Reference option #{opt} not recognized, using `:HXB2` as the reference genome."
68
+ puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
68
69
  opt = :HXB2
69
70
  end
70
71
 
@@ -76,6 +77,7 @@ begin
76
77
  end
77
78
 
78
79
  File.write(csv_file, data)
80
+ puts "Output file found at #{csv_file.green.bold}"
79
81
  rescue StandardError => e
80
82
  puts e.message
81
83
  puts "\n"
@@ -39,8 +39,8 @@ module ViralSeq
39
39
 
40
40
  def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
41
41
  temp_dir = Dir.home
42
- temp_file = temp_dir + "/_temp_muscle_in"
43
- temp_aln = temp_dir + "/_temp_muscle_aln"
42
+ temp_file = File.join(temp_dir, "_temp_muscle_in")
43
+ temp_aln = File.join(temp_dir, "_temp_muscle_aln")
44
44
  name = ">test"
45
45
  temp_in = File.open(temp_file,"w")
46
46
  temp_in.puts ">ref"
@@ -248,10 +248,12 @@ module ViralSeq
248
248
  def translate(codon_position = 0)
249
249
  seqs = self.dna_hash
250
250
  @aa_hash = {}
251
- seqs.each do |name, seq|
252
- s = ViralSeq::Sequence.new(name, seq)
251
+ seqs.uniq_hash.each do |seq, array_of_name|
252
+ s = ViralSeq::Sequence.new('name', seq)
253
253
  s.translate(codon_position)
254
- @aa_hash[name] = s.aa_string
254
+ array_of_name.each do |name|
255
+ @aa_hash[name] = s.aa_string
256
+ end
255
257
  end
256
258
  return nil
257
259
  end # end of #translate
@@ -332,12 +334,13 @@ module ViralSeq
332
334
  def stop_codon(codon_position = 0)
333
335
  self.translate(codon_position)
334
336
  keys = []
335
- self.aa_hash.each do |k,v|
336
- keys << k if v.include?('*')
337
+ aa_seqs = self.aa_hash
338
+ aa_seqs.uniq_hash.each do |seq,array_of_name|
339
+ keys += array_of_name if seq.include?('*')
337
340
  end
338
341
  seqhash1 = self.sub(keys)
339
342
  seqhash1.title = self.title + "_stop"
340
- keys2 = self.aa_hash.keys - keys
343
+ keys2 = aa_seqs.keys - keys
341
344
  seqhash2 = self.sub(keys2)
342
345
  return [seqhash1, seqhash2]
343
346
  end #end of #stop_codon
@@ -904,11 +907,11 @@ module ViralSeq
904
907
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
905
908
  # @example gap strip for an array of sequences
906
909
  # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
907
- # array = { AACCGGTT
908
- # A-CCGGTT
909
- # AAC-GGTT
910
- # AACCG-TT
911
- # AACCGGT- }
910
+ # array = %w{ AACCGGTT
911
+ # A-CCGGTT
912
+ # AAC-GGTT
913
+ # AACCG-TT
914
+ # AACCGGT- }
912
915
  # my_seqhash = ViralSeq::SeqHash.array(array)
913
916
  # puts my_seqhash.gap_strip.dna_hash.values
914
917
  # ACGT
@@ -963,12 +966,11 @@ module ViralSeq
963
966
  # @param (see #gap_strip)
964
967
  # @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
965
968
  # @example gap strip for an array of sequences only at the ends
966
- # array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
967
- # array = { AACCGGTT
968
- # A-CCGGTT
969
- # AAC-GGTT
970
- # AACCG-TT
971
- # AACCGGT- }
969
+ # array = %w{ AACCGGTT
970
+ # A-CCGGTT
971
+ # AAC-GGTT
972
+ # AACCG-TT
973
+ # AACCGGT- }
972
974
  # my_seqhash = ViralSeq::SeqHash.array(array)
973
975
  # puts my_seqhash.gap_strip_ends.dna_hash.values
974
976
  # AACCGGT
@@ -1048,6 +1050,99 @@ module ViralSeq
1048
1050
  return new_seqhash
1049
1051
  end
1050
1052
 
1053
+ # return a table of frequencies of nucleotides at each position.
1054
+ # @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
1055
+ # @param head [Boolean] if the head of table is included.
1056
+ # @return [Array] a two-dimension array of the frequency table,
1057
+ # including the following info:
1058
+ # position on the sequence (starting from 1)
1059
+ # consensus nucleotide
1060
+ # total sequence numbers
1061
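+ # frequency of A as a fraction of total sequence number, shows "-" if A is the reference base (consensus by default)
1062
+ # frequency of C as a fraction of total sequence number, shows "-" if C is the reference base (consensus by default)
1063
+ # frequency of G as a fraction of total sequence number, shows "-" if G is the reference base (consensus by default)
1064
+ # frequency of T as a fraction of total sequence number, shows "-" if T is the reference base (consensus by default)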
1065
+ #
1066
+ # @example error table for an array of sequences
1067
+ # array = %w{ AACCGGTT
1068
+ # AGCCGGTT
1069
+ # AACTGCTT
1070
+ # AACCGTTA
1071
+ # AACCGGTA }
1072
+ # my_seqhash = ViralSeq::SeqHash.array(array)
1073
+ # my_seqhash.error_table.each {|r| puts r.join(',')}
1074
+ # position,consensus,total_seq_number,A,C,G,T
1075
+ # 1,A,5,-,,,
1076
+ # 2,A,5,-,,0.2,
1077
+ # 3,C,5,,-,,
1078
+ # 4,C,5,,-,,0.2
1079
+ # 5,G,5,,,-,
1080
+ # 6,G,5,,0.2,-,0.2
1081
+ # 7,T,5,,,,-
1082
+ # 8,T,5,0.4,,,-
1083
+
1084
+ def error_table(ref = self.consensus, head = true)
1085
+
1086
+ table = []
1087
+ if head
1088
+ table << %w{
1089
+ position
1090
+ consensus
1091
+ total_seq_number
1092
+ A
1093
+ C
1094
+ G
1095
+ T
1096
+ }
1097
+ end
1098
+ ref_size = ref.size
1099
+
1100
+ (0..(ref_size - 1)).each do |position|
1101
+ ref_base = ref[position]
1102
+ nts = []
1103
+
1104
+ self.dna_hash.each do |_k,v|
1105
+ nts << v[position]
1106
+ end
1107
+
1108
+ freq = nts.count_freq
1109
+ freq2 = {}
1110
+
1111
+ freq.each do |nt,c|
1112
+ if nt == ref_base
1113
+ freq2[nt] = '-'
1114
+ else
1115
+ freq2[nt] = (c/(self.size).to_f)
1116
+ end
1117
+ end
1118
+
1119
+ table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
1120
+ end
1121
+
1122
+ return table
1123
+
1124
+ end # end of error_table
1125
+
1126
+ # randomly select n number of sequences from the original SeqHash object
1127
+ # @param n [Integer] number of sequences to randomly select
1128
+ # @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
1129
+
1130
+ def random_select(n = 100)
1131
+ new_sh = ViralSeq::SeqHash.new
1132
+ dna_hash = self.dna_hash
1133
+ aa_hash = self.aa_hash
1134
+ qc_hash = self.qc_hash
1135
+
1136
+ keys = dna_hash.keys.sample(n)
1137
+
1138
+ keys.each do |k|
1139
+ new_sh.dna_hash[k] = dna_hash[k]
1140
+ new_sh.aa_hash[k] = aa_hash[k]
1141
+ new_sh.qc_hash[k] = qc_hash[k]
1142
+ end
1143
+ new_sh.title = self.title + "_" + n.to_s
1144
+ return new_sh
1145
+ end
1051
1146
 
1052
1147
 
1053
1148
  # start of private functions
@@ -7,7 +7,7 @@ module ViralSeq
7
7
  # @example join the paired-end sequences with an overlap of 100 bp
8
8
  # my_seqhashpair.join1(100)
9
9
  # @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
10
- # my_seqhashpair.join1(:indiv)
10
+ # my_seqhashpair.join2(model: :indiv)
11
11
 
12
12
  class SeqHashPair
13
13
 
@@ -104,17 +104,21 @@ module ViralSeq
104
104
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
105
105
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
106
106
  joined_seq = {}
107
- seq_pair_hash.each do |seq_name, seq_pair|
107
+ seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
108
108
  r1_seq = seq_pair[0]
109
109
  r2_seq = seq_pair[1]
110
110
  if overlap.zero?
111
- joined_seq[seq_name] = r1_seq + r2_seq
111
+ joined_sequence = r1_seq + r2_seq
112
112
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
113
- joined_seq[seq_name] = r1_seq + r2_seq[overlap..-1]
113
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
114
114
  else
115
115
  next
116
116
  end
117
+ seq_names.each do |seq_name|
118
+ joined_seq[seq_name] = joined_sequence
119
+ end
117
120
  end
121
+
118
122
  joined_seq_hash = ViralSeq::SeqHash.new
119
123
  joined_seq_hash.dna_hash = joined_seq
120
124
  joined_seq_hash.title = self.title + "_joined"
@@ -139,7 +143,7 @@ module ViralSeq
139
143
  # my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
140
144
  # my_seqhashpair.join2.dna_hash
141
145
  # => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
142
- # my_seqhashpair.join2(model :indiv).dna_hash
146
+ # my_seqhashpair.join2(model: :indiv).dna_hash
143
147
  # => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
144
148
 
145
149
  def join2(model: :con, diff: 0.0)
@@ -2,5 +2,5 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.0.6"
5
+ VERSION = "1.0.7"
6
6
  end
@@ -31,5 +31,9 @@ Gem::Specification.new do |spec|
31
31
 
32
32
  # muscle_bio gem required
33
33
  spec.add_runtime_dependency "muscle_bio", "~> 0.4"
34
+
35
+ # colorize gem required
36
+ spec.add_runtime_dependency "colorize", "~> 0.1"
37
+
34
38
  spec.requirements << 'R required for some functions'
35
39
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.6
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-07-23 00:00:00.000000000 Z
12
+ date: 2020-01-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0.4'
70
+ - !ruby/object:Gem::Dependency
71
+ name: colorize
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '0.1'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '0.1'
70
84
  description: |-
71
85
  A Ruby Gem with bioinformatics tools for processing viral NGS data.
72
86
  Specifically for Primer-ID sequencing and HIV drug resistance analysis.
@@ -124,7 +138,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
124
138
  version: '0'
125
139
  requirements:
126
140
  - R required for some functions
127
- rubygems_version: 3.0.3
141
+ rubygems_version: 3.1.2
128
142
  signing_key:
129
143
  specification_version: 4
130
144
  summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.