viral_seq 1.0.6 → 1.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +9 -2
- data/bin/locator +11 -9
- data/lib/viral_seq/muscle.rb +2 -2
- data/lib/viral_seq/seq_hash.rb +112 -17
- data/lib/viral_seq/seq_hash_pair.rb +9 -5
- data/lib/viral_seq/version.rb +1 -1
- data/viral_seq.gemspec +4 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
|
4
|
+
data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
|
7
|
+
data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.0.6)
|
4
|
+
viral_seq (1.0.7)
|
5
|
+
colorize (~> 0.1)
|
5
6
|
muscle_bio (~> 0.4)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
9
10
|
specs:
|
11
|
+
colorize (0.8.1)
|
10
12
|
diff-lcs (1.3)
|
11
13
|
muscle_bio (0.4.0)
|
12
14
|
rake (10.5.0)
|
@@ -34,4 +36,4 @@ DEPENDENCIES
|
|
34
36
|
viral_seq!
|
35
37
|
|
36
38
|
BUNDLED WITH
|
37
|
-
2.
|
39
|
+
2.1.4
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
14
14
|
|
15
15
|
#!/usr/bin/env ruby
|
16
16
|
require 'viral_seq'
|
17
|
-
|
17
|
+
|
18
18
|
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
19
19
|
|
20
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
@@ -51,6 +51,13 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
51
51
|
|
52
52
|
## Updates
|
53
53
|
|
54
|
+
Version 1.0.7-01282020:
|
55
|
+
|
56
|
+
1. Several methods added, including
|
57
|
+
ViralSeq::SeqHash#error_table
|
58
|
+
ViralSeq::SeqHash#random_select
|
59
|
+
2. Improved performance for several functions.
|
60
|
+
|
54
61
|
Version 1.0.6-07232019:
|
55
62
|
|
56
63
|
1. Several methods added to ViralSeq::SeqHash, including
|
@@ -58,7 +65,7 @@ Version 1.0.6-07232019:
|
|
58
65
|
ViralSeq::SeqHash#+
|
59
66
|
ViralSeq::SeqHash#write_nt_fa
|
60
67
|
ViralSeq::SeqHash#mutation
|
61
|
-
2. Update documentations and rspec samples.
|
68
|
+
2. Update documentations and rspec samples.
|
62
69
|
|
63
70
|
Version 1.0.5-07112019:
|
64
71
|
|
data/bin/locator
CHANGED
@@ -3,13 +3,14 @@
|
|
3
3
|
require 'viral_seq'
|
4
4
|
require 'csv'
|
5
5
|
require 'optparse'
|
6
|
+
require 'colorize'
|
6
7
|
|
7
8
|
def myparser
|
8
9
|
options = {}
|
9
10
|
OptionParser.new do |opts|
|
10
|
-
opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
|
11
|
+
opts.banner = "#{"Usage:".red.bold} locator #{"-i".blue.bold} [nt_sequence_fasta_file] #{"-o".blue.bold} [locator_info_csv_file] #{"-r".blue.bold} [reference_genome_option]"
|
11
12
|
|
12
|
-
opts.on('-i', '--infile FASTA_FILE',
|
13
|
+
opts.on('-i', '--infile FASTA_FILE', "#{"nt sequence".blue.bold} file in FASTA format") do |i|
|
13
14
|
options[:infile] = i
|
14
15
|
end
|
15
16
|
|
@@ -17,7 +18,7 @@ def myparser
|
|
17
18
|
options[:outfile] = o
|
18
19
|
end
|
19
20
|
|
20
|
-
opts.on('-r', '--ref_option OPTION',
|
21
|
+
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `NL43`, `MAC239`".blue.bold}") do |o|
|
21
22
|
options[:ref_option] = o.to_sym
|
22
23
|
end
|
23
24
|
|
@@ -35,9 +36,9 @@ def myparser
|
|
35
36
|
return options
|
36
37
|
end
|
37
38
|
|
38
|
-
puts "\
|
39
|
-
puts "See details at https://github.com/ViralSeq/viral_seq\n"
|
40
|
-
puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n
|
39
|
+
puts "\n" + "Sequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION})".red.bold + " by " + "Shuntai Zhou".blue.bold
|
40
|
+
puts "See details at " + "https://github.com/ViralSeq/viral_seq\n".blue
|
41
|
+
puts "Resembling" + " Sequence Locator ".magenta.bold + "from LANL" + " (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n".blue
|
41
42
|
|
42
43
|
ARGV << '-h' if ARGV.size == 0
|
43
44
|
|
@@ -47,7 +48,7 @@ begin
|
|
47
48
|
if options[:infile]
|
48
49
|
seq_file = options[:infile]
|
49
50
|
else
|
50
|
-
raise StandardError.new("Input file sequence file not found")
|
51
|
+
raise StandardError.new("Input file sequence file not found".red.bold)
|
51
52
|
end
|
52
53
|
|
53
54
|
if options[:outfile]
|
@@ -57,14 +58,14 @@ begin
|
|
57
58
|
end
|
58
59
|
|
59
60
|
unless File.exist?(seq_file)
|
60
|
-
raise StandardError.new("Input file sequence file not found")
|
61
|
+
raise StandardError.new("Input file sequence file not found".red.bold)
|
61
62
|
end
|
62
63
|
|
63
64
|
seqs = ViralSeq::SeqHash.fa(seq_file)
|
64
65
|
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
65
66
|
|
66
67
|
unless [:HXB2, :NL43, :MAC239].include? opt
|
67
|
-
puts "Reference option
|
68
|
+
puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
|
68
69
|
opt = :HXB2
|
69
70
|
end
|
70
71
|
|
@@ -76,6 +77,7 @@ begin
|
|
76
77
|
end
|
77
78
|
|
78
79
|
File.write(csv_file, data)
|
80
|
+
puts "Output file found at #{csv_file.green.bold}"
|
79
81
|
rescue StandardError => e
|
80
82
|
puts e.message
|
81
83
|
puts "\n"
|
data/lib/viral_seq/muscle.rb
CHANGED
@@ -39,8 +39,8 @@ module ViralSeq
|
|
39
39
|
|
40
40
|
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
41
|
temp_dir = Dir.home
|
42
|
-
temp_file = temp_dir
|
43
|
-
temp_aln = temp_dir
|
42
|
+
temp_file = File.join(temp_dir, "_temp_muscle_in")
|
43
|
+
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
44
44
|
name = ">test"
|
45
45
|
temp_in = File.open(temp_file,"w")
|
46
46
|
temp_in.puts ">ref"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -248,10 +248,12 @@ module ViralSeq
|
|
248
248
|
def translate(codon_position = 0)
|
249
249
|
seqs = self.dna_hash
|
250
250
|
@aa_hash = {}
|
251
|
-
seqs.each do |
|
252
|
-
s = ViralSeq::Sequence.new(name, seq)
|
251
|
+
seqs.uniq_hash.each do |seq, array_of_name|
|
252
|
+
s = ViralSeq::Sequence.new('name', seq)
|
253
253
|
s.translate(codon_position)
|
254
|
-
|
254
|
+
array_of_name.each do |name|
|
255
|
+
@aa_hash[name] = s.aa_string
|
256
|
+
end
|
255
257
|
end
|
256
258
|
return nil
|
257
259
|
end # end of #translate
|
@@ -332,12 +334,13 @@ module ViralSeq
|
|
332
334
|
def stop_codon(codon_position = 0)
|
333
335
|
self.translate(codon_position)
|
334
336
|
keys = []
|
335
|
-
self.aa_hash
|
336
|
-
|
337
|
+
aa_seqs = self.aa_hash
|
338
|
+
aa_seqs.uniq_hash.each do |seq,array_of_name|
|
339
|
+
keys += array_of_name if seq.include?('*')
|
337
340
|
end
|
338
341
|
seqhash1 = self.sub(keys)
|
339
342
|
seqhash1.title = self.title + "_stop"
|
340
|
-
keys2 =
|
343
|
+
keys2 = aa_seqs.keys - keys
|
341
344
|
seqhash2 = self.sub(keys2)
|
342
345
|
return [seqhash1, seqhash2]
|
343
346
|
end #end of #stop_codon
|
@@ -904,11 +907,11 @@ module ViralSeq
|
|
904
907
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
905
908
|
# @example gap strip for an array of sequences
|
906
909
|
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
907
|
-
# array = { AACCGGTT
|
908
|
-
#
|
909
|
-
#
|
910
|
-
#
|
911
|
-
#
|
910
|
+
# array = %w{ AACCGGTT
|
911
|
+
# A-CCGGTT
|
912
|
+
# AAC-GGTT
|
913
|
+
# AACCG-TT
|
914
|
+
# AACCGGT- }
|
912
915
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
913
916
|
# puts my_seqhash.gap_strip.dna_hash.values
|
914
917
|
# ACGT
|
@@ -963,12 +966,11 @@ module ViralSeq
|
|
963
966
|
# @param (see #gap_strip)
|
964
967
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
965
968
|
# @example gap strip for an array of sequences only at the ends
|
966
|
-
# array =
|
967
|
-
#
|
968
|
-
#
|
969
|
-
#
|
970
|
-
#
|
971
|
-
# AACCGGT- }
|
969
|
+
# array = %w{ AACCGGTT
|
970
|
+
# A-CCGGTT
|
971
|
+
# AAC-GGTT
|
972
|
+
# AACCG-TT
|
973
|
+
# AACCGGT- }
|
972
974
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
973
975
|
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
974
976
|
# AACCGGT
|
@@ -1048,6 +1050,99 @@ module ViralSeq
|
|
1048
1050
|
return new_seqhash
|
1049
1051
|
end
|
1050
1052
|
|
1053
|
+
# return a table of frequencies of nucleotides at each position.
|
1054
|
+
# @param ref [String] a reference sequence to compare with, default as the sample consensus sequence
|
1055
|
+
# @param head [Boolean] whether to include the header row in the table.
|
1056
|
+
# @return [Array] a two-dimension array of the frequency table,
|
1057
|
+
# including the following info:
|
1058
|
+
# position on the sequence (starting from 1)
|
1059
|
+
# consensus nucleotide
|
1060
|
+
# total sequence numbers
|
1061
|
+
# percentage of A, shows "-" if agrees with consensus
|
1062
|
+
# percentage of C, shows "-" if agrees with consensus
|
1063
|
+
# percentage of G, shows "-" if agrees with consensus
|
1064
|
+
# percentage of T, shows "-" if agrees with consensus
|
1065
|
+
#
|
1066
|
+
# @example error table for an array of sequences
|
1067
|
+
# array = %w{ AACCGGTT
|
1068
|
+
# AGCCGGTT
|
1069
|
+
# AACTGCTT
|
1070
|
+
# AACCGTTA
|
1071
|
+
# AACCGGTA }
|
1072
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
1073
|
+
# my_seqhash.error_table.each {|r| puts r.join(',')}
|
1074
|
+
# position,consensus,total_seq_number,A,C,G,T
|
1075
|
+
# 1,A,5,-,,,
|
1076
|
+
# 2,A,5,-,,0.2,
|
1077
|
+
# 3,C,5,,-,,
|
1078
|
+
# 4,C,5,,-,,0.2
|
1079
|
+
# 5,G,5,,,-,
|
1080
|
+
# 6,G,5,,0.2,-,0.2
|
1081
|
+
# 7,T,5,,,,-
|
1082
|
+
# 8,T,5,0.4,,,-
|
1083
|
+
|
1084
|
+
def error_table(ref = self.consensus, head = true)
|
1085
|
+
|
1086
|
+
table = []
|
1087
|
+
if head
|
1088
|
+
table << %w{
|
1089
|
+
position
|
1090
|
+
consensus
|
1091
|
+
total_seq_number
|
1092
|
+
A
|
1093
|
+
C
|
1094
|
+
G
|
1095
|
+
T
|
1096
|
+
}
|
1097
|
+
end
|
1098
|
+
ref_size = ref.size
|
1099
|
+
|
1100
|
+
(0..(ref_size - 1)).each do |position|
|
1101
|
+
ref_base = ref[position]
|
1102
|
+
nts = []
|
1103
|
+
|
1104
|
+
self.dna_hash.each do |_k,v|
|
1105
|
+
nts << v[position]
|
1106
|
+
end
|
1107
|
+
|
1108
|
+
freq = nts.count_freq
|
1109
|
+
freq2 = {}
|
1110
|
+
|
1111
|
+
freq.each do |nt,c|
|
1112
|
+
if nt == ref_base
|
1113
|
+
freq2[nt] = '-'
|
1114
|
+
else
|
1115
|
+
freq2[nt] = (c/(self.size).to_f)
|
1116
|
+
end
|
1117
|
+
end
|
1118
|
+
|
1119
|
+
table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
|
1120
|
+
end
|
1121
|
+
|
1122
|
+
return table
|
1123
|
+
|
1124
|
+
end # end of error_table
|
1125
|
+
|
1126
|
+
# randomly select n number of sequences from the original SeqHash object
|
1127
|
+
# @param n [Integer] number of sequences to randomly select
|
1128
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
|
1129
|
+
|
1130
|
+
def random_select(n = 100)
|
1131
|
+
new_sh = ViralSeq::SeqHash.new
|
1132
|
+
dna_hash = self.dna_hash
|
1133
|
+
aa_hash = self.aa_hash
|
1134
|
+
qc_hash = self.qc_hash
|
1135
|
+
|
1136
|
+
keys = dna_hash.keys.sample(n)
|
1137
|
+
|
1138
|
+
keys.each do |k|
|
1139
|
+
new_sh.dna_hash[k] = dna_hash[k]
|
1140
|
+
new_sh.aa_hash[k] = aa_hash[k]
|
1141
|
+
new_sh.qc_hash[k] = qc_hash[k]
|
1142
|
+
end
|
1143
|
+
new_sh.title = self.title + "_" + n.to_s
|
1144
|
+
return new_sh
|
1145
|
+
end
|
1051
1146
|
|
1052
1147
|
|
1053
1148
|
# start of private functions
|
@@ -7,7 +7,7 @@ module ViralSeq
|
|
7
7
|
# @example join the paired-end sequences with an overlap of 100 bp
|
8
8
|
# my_seqhashpair.join1(100)
|
9
9
|
# @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
|
10
|
-
# my_seqhashpair.
|
10
|
+
# my_seqhashpair.join2(model: :indiv)
|
11
11
|
|
12
12
|
class SeqHashPair
|
13
13
|
|
@@ -104,17 +104,21 @@ module ViralSeq
|
|
104
104
|
raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
|
105
105
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
106
106
|
joined_seq = {}
|
107
|
-
seq_pair_hash.each do |
|
107
|
+
seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
|
108
108
|
r1_seq = seq_pair[0]
|
109
109
|
r2_seq = seq_pair[1]
|
110
110
|
if overlap.zero?
|
111
|
-
|
111
|
+
joined_sequence = r1_seq + r2_seq
|
112
112
|
elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
|
113
|
-
|
113
|
+
joined_sequence= r1_seq + r2_seq[overlap..-1]
|
114
114
|
else
|
115
115
|
next
|
116
116
|
end
|
117
|
+
seq_names.each do |seq_name|
|
118
|
+
joined_seq[seq_name] = joined_sequence
|
119
|
+
end
|
117
120
|
end
|
121
|
+
|
118
122
|
joined_seq_hash = ViralSeq::SeqHash.new
|
119
123
|
joined_seq_hash.dna_hash = joined_seq
|
120
124
|
joined_seq_hash.title = self.title + "_joined"
|
@@ -139,7 +143,7 @@ module ViralSeq
|
|
139
143
|
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
144
|
# my_seqhashpair.join2.dna_hash
|
141
145
|
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
-
# my_seqhashpair.join2(model :indiv).dna_hash
|
146
|
+
# my_seqhashpair.join2(model: :indiv).dna_hash
|
143
147
|
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
148
|
|
145
149
|
def join2(model: :con, diff: 0.0)
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -31,5 +31,9 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
33
33
|
spec.add_runtime_dependency "muscle_bio", "~> 0.4"
|
34
|
+
|
35
|
+
# colorize gem required
|
36
|
+
spec.add_runtime_dependency "colorize", "~> 0.1"
|
37
|
+
|
34
38
|
spec.requirements << 'R required for some functions'
|
35
39
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.6
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2020-01-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -67,6 +67,20 @@ dependencies:
|
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0.4'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: colorize
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0.1'
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0.1'
|
70
84
|
description: |-
|
71
85
|
A Ruby Gem with bioinformatics tools for processing viral NGS data.
|
72
86
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
@@ -124,7 +138,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
138
|
version: '0'
|
125
139
|
requirements:
|
126
140
|
- R required for some functions
|
127
|
-
rubygems_version: 3.
|
141
|
+
rubygems_version: 3.1.2
|
128
142
|
signing_key:
|
129
143
|
specification_version: 4
|
130
144
|
summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|