viral_seq 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +9 -2
- data/bin/locator +11 -9
- data/lib/viral_seq/muscle.rb +2 -2
- data/lib/viral_seq/seq_hash.rb +112 -17
- data/lib/viral_seq/seq_hash_pair.rb +9 -5
- data/lib/viral_seq/version.rb +1 -1
- data/viral_seq.gemspec +4 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb326c97b25326286a51ec63583983a20dfebee2513fd8811bc855ec21ac0b5d
|
4
|
+
data.tar.gz: e9870bbaa8c17ba51d53e790ca8189e2dd362911e1b5cfcd4806a3bc68ccf369
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff6e5727484687db04180a1ef9d3204e9ed02d9b1a98862bdb8796255680aca1e830667429a57db116702793dc55eeb7cc84800c39b27f8e2773186e1a638988
|
7
|
+
data.tar.gz: 86d0b03af6335cc91e38bc54a8c1fa7e2c84d430dc0adb02e4dc3819ebb188a0e8ae1e4c76c71e5066cac51675e0a45f9ee5a9b0bbd2de8b26da4fa04fe95d85
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.0.
|
4
|
+
viral_seq (1.0.7)
|
5
|
+
colorize (~> 0.1)
|
5
6
|
muscle_bio (~> 0.4)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
9
10
|
specs:
|
11
|
+
colorize (0.8.1)
|
10
12
|
diff-lcs (1.3)
|
11
13
|
muscle_bio (0.4.0)
|
12
14
|
rake (10.5.0)
|
@@ -34,4 +36,4 @@ DEPENDENCIES
|
|
34
36
|
viral_seq!
|
35
37
|
|
36
38
|
BUNDLED WITH
|
37
|
-
2.
|
39
|
+
2.1.4
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
14
14
|
|
15
15
|
#!/usr/bin/env ruby
|
16
16
|
require 'viral_seq'
|
17
|
-
|
17
|
+
|
18
18
|
#### Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
19
19
|
|
20
20
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
@@ -51,6 +51,13 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
|
51
51
|
|
52
52
|
## Updates
|
53
53
|
|
54
|
+
Version 1.0.7-01282020:
|
55
|
+
|
56
|
+
1. Several methods added, including
|
57
|
+
ViralSeq::SeqHash#error_table
|
58
|
+
ViralSeq::SeqHash#random_select
|
59
|
+
2. Improved performance for several functions.
|
60
|
+
|
54
61
|
Version 1.0.6-07232019:
|
55
62
|
|
56
63
|
1. Several methods added to ViralSeq::SeqHash, including
|
@@ -58,7 +65,7 @@ Version 1.0.6-07232019:
|
|
58
65
|
ViralSeq::SeqHash#+
|
59
66
|
ViralSeq::SeqHash#write_nt_fa
|
60
67
|
ViralSeq::SeqHash#mutation
|
61
|
-
2. Update documentations and rspec samples.
|
68
|
+
2. Update documentations and rspec samples.
|
62
69
|
|
63
70
|
Version 1.0.5-07112019:
|
64
71
|
|
data/bin/locator
CHANGED
@@ -3,13 +3,14 @@
|
|
3
3
|
require 'viral_seq'
|
4
4
|
require 'csv'
|
5
5
|
require 'optparse'
|
6
|
+
require 'colorize'
|
6
7
|
|
7
8
|
def myparser
|
8
9
|
options = {}
|
9
10
|
OptionParser.new do |opts|
|
10
|
-
opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
|
11
|
+
opts.banner = "#{"Usage:".red.bold} locator #{"-i".blue.bold} [nt_sequence_fasta_file] #{"-o".blue.bold} [locator_info_csv_file] #{"-r".blue.bold} [reference_genome_option]"
|
11
12
|
|
12
|
-
opts.on('-i', '--infile FASTA_FILE',
|
13
|
+
opts.on('-i', '--infile FASTA_FILE', "#{"nt sequence".blue.bold} file in FASTA format") do |i|
|
13
14
|
options[:infile] = i
|
14
15
|
end
|
15
16
|
|
@@ -17,7 +18,7 @@ def myparser
|
|
17
18
|
options[:outfile] = o
|
18
19
|
end
|
19
20
|
|
20
|
-
opts.on('-r', '--ref_option OPTION',
|
21
|
+
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `NL43`, `MAC239`".blue.bold}") do |o|
|
21
22
|
options[:ref_option] = o.to_sym
|
22
23
|
end
|
23
24
|
|
@@ -35,9 +36,9 @@ def myparser
|
|
35
36
|
return options
|
36
37
|
end
|
37
38
|
|
38
|
-
puts "\
|
39
|
-
puts "See details at https://github.com/ViralSeq/viral_seq\n"
|
40
|
-
puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n
|
39
|
+
puts "\n" + "Sequence Locator (RubyGem::ViralSeq Version #{ViralSeq::VERSION})".red.bold + " by " + "Shuntai Zhou".blue.bold
|
40
|
+
puts "See details at " + "https://github.com/ViralSeq/viral_seq\n".blue
|
41
|
+
puts "Resembling" + " Sequence Locator ".magenta.bold + "from LANL" + " (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n".blue
|
41
42
|
|
42
43
|
ARGV << '-h' if ARGV.size == 0
|
43
44
|
|
@@ -47,7 +48,7 @@ begin
|
|
47
48
|
if options[:infile]
|
48
49
|
seq_file = options[:infile]
|
49
50
|
else
|
50
|
-
raise StandardError.new("Input file sequence file not found")
|
51
|
+
raise StandardError.new("Input file sequence file not found".red.bold)
|
51
52
|
end
|
52
53
|
|
53
54
|
if options[:outfile]
|
@@ -57,14 +58,14 @@ begin
|
|
57
58
|
end
|
58
59
|
|
59
60
|
unless File.exist?(seq_file)
|
60
|
-
raise StandardError.new("Input file sequence file not found")
|
61
|
+
raise StandardError.new("Input file sequence file not found".red.bold)
|
61
62
|
end
|
62
63
|
|
63
64
|
seqs = ViralSeq::SeqHash.fa(seq_file)
|
64
65
|
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
65
66
|
|
66
67
|
unless [:HXB2, :NL43, :MAC239].include? opt
|
67
|
-
puts "Reference option
|
68
|
+
puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
|
68
69
|
opt = :HXB2
|
69
70
|
end
|
70
71
|
|
@@ -76,6 +77,7 @@ begin
|
|
76
77
|
end
|
77
78
|
|
78
79
|
File.write(csv_file, data)
|
80
|
+
puts "Output file found at #{csv_file.green.bold}"
|
79
81
|
rescue StandardError => e
|
80
82
|
puts e.message
|
81
83
|
puts "\n"
|
data/lib/viral_seq/muscle.rb
CHANGED
@@ -39,8 +39,8 @@ module ViralSeq
|
|
39
39
|
|
40
40
|
def self.align(ref_seq = "", test_seq = "", path_to_muscle = false)
|
41
41
|
temp_dir = Dir.home
|
42
|
-
temp_file = temp_dir
|
43
|
-
temp_aln = temp_dir
|
42
|
+
temp_file = File.join(temp_dir, "_temp_muscle_in")
|
43
|
+
temp_aln = File.join(temp_dir, "_temp_muscle_aln")
|
44
44
|
name = ">test"
|
45
45
|
temp_in = File.open(temp_file,"w")
|
46
46
|
temp_in.puts ">ref"
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -248,10 +248,12 @@ module ViralSeq
|
|
248
248
|
def translate(codon_position = 0)
|
249
249
|
seqs = self.dna_hash
|
250
250
|
@aa_hash = {}
|
251
|
-
seqs.each do |
|
252
|
-
s = ViralSeq::Sequence.new(name, seq)
|
251
|
+
seqs.uniq_hash.each do |seq, array_of_name|
|
252
|
+
s = ViralSeq::Sequence.new('name', seq)
|
253
253
|
s.translate(codon_position)
|
254
|
-
|
254
|
+
array_of_name.each do |name|
|
255
|
+
@aa_hash[name] = s.aa_string
|
256
|
+
end
|
255
257
|
end
|
256
258
|
return nil
|
257
259
|
end # end of #translate
|
@@ -332,12 +334,13 @@ module ViralSeq
|
|
332
334
|
def stop_codon(codon_position = 0)
|
333
335
|
self.translate(codon_position)
|
334
336
|
keys = []
|
335
|
-
self.aa_hash
|
336
|
-
|
337
|
+
aa_seqs = self.aa_hash
|
338
|
+
aa_seqs.uniq_hash.each do |seq,array_of_name|
|
339
|
+
keys += array_of_name if seq.include?('*')
|
337
340
|
end
|
338
341
|
seqhash1 = self.sub(keys)
|
339
342
|
seqhash1.title = self.title + "_stop"
|
340
|
-
keys2 =
|
343
|
+
keys2 = aa_seqs.keys - keys
|
341
344
|
seqhash2 = self.sub(keys2)
|
342
345
|
return [seqhash1, seqhash2]
|
343
346
|
end #end of #stop_codon
|
@@ -904,11 +907,11 @@ module ViralSeq
|
|
904
907
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps
|
905
908
|
# @example gap strip for an array of sequences
|
906
909
|
# array = ["AACCGGTT", "A-CCGGTT", "AAC-GGTT", "AACCG-TT", "AACCGGT-"]
|
907
|
-
# array = { AACCGGTT
|
908
|
-
#
|
909
|
-
#
|
910
|
-
#
|
911
|
-
#
|
910
|
+
# array = %w{ AACCGGTT
|
911
|
+
# A-CCGGTT
|
912
|
+
# AAC-GGTT
|
913
|
+
# AACCG-TT
|
914
|
+
# AACCGGT- }
|
912
915
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
913
916
|
# puts my_seqhash.gap_strip.dna_hash.values
|
914
917
|
# ACGT
|
@@ -963,12 +966,11 @@ module ViralSeq
|
|
963
966
|
# @param (see #gap_strip)
|
964
967
|
# @return [ViralSeq::SeqHash] a new SeqHash object containing nt or aa sequences without gaps at the ends
|
965
968
|
# @example gap strip for an array of sequences only at the ends
|
966
|
-
# array =
|
967
|
-
#
|
968
|
-
#
|
969
|
-
#
|
970
|
-
#
|
971
|
-
# AACCGGT- }
|
969
|
+
# array = %w{ AACCGGTT
|
970
|
+
# A-CCGGTT
|
971
|
+
# AAC-GGTT
|
972
|
+
# AACCG-TT
|
973
|
+
# AACCGGT- }
|
972
974
|
# my_seqhash = ViralSeq::SeqHash.array(array)
|
973
975
|
# puts my_seqhash.gap_strip_ends.dna_hash.values
|
974
976
|
# AACCGGT
|
@@ -1048,6 +1050,99 @@ module ViralSeq
|
|
1048
1050
|
return new_seqhash
|
1049
1051
|
end
|
1050
1052
|
|
1053
|
+
# return a table of frequencies of nucleotides at each position.
|
1054
|
+
# @param ref [String] a reference sequence to compare with, defaults to the sample consensus sequence
|
1055
|
+
# @param head [Boolean] if the head of table is included.
|
1056
|
+
# @return [Array] a two-dimensional array of the frequency table,
|
1057
|
+
# including the following info:
|
1058
|
+
# position on the sequence (starting from 1)
|
1059
|
+
# consensus nucleotide
|
1060
|
+
# total sequence numbers
|
1061
|
+
# percentage of A, shows "-" if agrees with consensus
|
1062
|
+
# percentage of C, shows "-" if agrees with consensus
|
1063
|
+
# percentage of G, shows "-" if agrees with consensus
|
1064
|
+
# percentage of T, shows "-" if agrees with consensus
|
1065
|
+
#
|
1066
|
+
# @example error table for an array of sequences
|
1067
|
+
# array = %w{ AACCGGTT
|
1068
|
+
# AGCCGGTT
|
1069
|
+
# AACTGCTT
|
1070
|
+
# AACCGTTA
|
1071
|
+
# AACCGGTA }
|
1072
|
+
# my_seqhash = ViralSeq::SeqHash.array(array)
|
1073
|
+
# my_seqhash.error_table.each {|r| puts r.join(',')}
|
1074
|
+
# position,consensus,total_seq_number,A,C,G,T
|
1075
|
+
# 1,A,5,-,,,
|
1076
|
+
# 2,A,5,-,,0.2,
|
1077
|
+
# 3,C,5,,-,,
|
1078
|
+
# 4,C,5,,-,,0.2
|
1079
|
+
# 5,G,5,,,-,
|
1080
|
+
# 6,G,5,,0.2,-,0.2
|
1081
|
+
# 7,T,5,,,,-
|
1082
|
+
# 8,T,5,0.4,,,-
|
1083
|
+
|
1084
|
+
def error_table(ref = self.consensus, head = true)
|
1085
|
+
|
1086
|
+
table = []
|
1087
|
+
if head
|
1088
|
+
table << %w{
|
1089
|
+
position
|
1090
|
+
consensus
|
1091
|
+
total_seq_number
|
1092
|
+
A
|
1093
|
+
C
|
1094
|
+
G
|
1095
|
+
T
|
1096
|
+
}
|
1097
|
+
end
|
1098
|
+
ref_size = ref.size
|
1099
|
+
|
1100
|
+
(0..(ref_size - 1)).each do |position|
|
1101
|
+
ref_base = ref[position]
|
1102
|
+
nts = []
|
1103
|
+
|
1104
|
+
self.dna_hash.each do |_k,v|
|
1105
|
+
nts << v[position]
|
1106
|
+
end
|
1107
|
+
|
1108
|
+
freq = nts.count_freq
|
1109
|
+
freq2 = {}
|
1110
|
+
|
1111
|
+
freq.each do |nt,c|
|
1112
|
+
if nt == ref_base
|
1113
|
+
freq2[nt] = '-'
|
1114
|
+
else
|
1115
|
+
freq2[nt] = (c/(self.size).to_f)
|
1116
|
+
end
|
1117
|
+
end
|
1118
|
+
|
1119
|
+
table << [(position + 1),ref_base,self.size,freq2['A'],freq2['C'],freq2['G'],freq2['T']]
|
1120
|
+
end
|
1121
|
+
|
1122
|
+
return table
|
1123
|
+
|
1124
|
+
end # end of error_table
|
1125
|
+
|
1126
|
+
# randomly select n number of sequences from the original SeqHash object
|
1127
|
+
# @param n [Integer] number of sequences to randomly select
|
1128
|
+
# @return [ViralSeq::SeqHash] a new SeqHash object with randomly selected sequences
|
1129
|
+
|
1130
|
+
def random_select(n = 100)
|
1131
|
+
new_sh = ViralSeq::SeqHash.new
|
1132
|
+
dna_hash = self.dna_hash
|
1133
|
+
aa_hash = self.aa_hash
|
1134
|
+
qc_hash = self.qc_hash
|
1135
|
+
|
1136
|
+
keys = dna_hash.keys.sample(n)
|
1137
|
+
|
1138
|
+
keys.each do |k|
|
1139
|
+
new_sh.dna_hash[k] = dna_hash[k]
|
1140
|
+
new_sh.aa_hash[k] = aa_hash[k]
|
1141
|
+
new_sh.qc_hash[k] = qc_hash[k]
|
1142
|
+
end
|
1143
|
+
new_sh.title = self.title + "_" + n.to_s
|
1144
|
+
return new_sh
|
1145
|
+
end
|
1051
1146
|
|
1052
1147
|
|
1053
1148
|
# start of private functions
|
@@ -7,7 +7,7 @@ module ViralSeq
|
|
7
7
|
# @example join the paired-end sequences with an overlap of 100 bp
|
8
8
|
# my_seqhashpair.join1(100)
|
9
9
|
# @example join the paired-end sequences with unknown overlap, each pair of sequences has its own overlap size
|
10
|
-
# my_seqhashpair.
|
10
|
+
# my_seqhashpair.join2(model: :indiv)
|
11
11
|
|
12
12
|
class SeqHashPair
|
13
13
|
|
@@ -104,17 +104,21 @@ module ViralSeq
|
|
104
104
|
raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
|
105
105
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
106
106
|
joined_seq = {}
|
107
|
-
seq_pair_hash.each do |
|
107
|
+
seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
|
108
108
|
r1_seq = seq_pair[0]
|
109
109
|
r2_seq = seq_pair[1]
|
110
110
|
if overlap.zero?
|
111
|
-
|
111
|
+
joined_sequence = r1_seq + r2_seq
|
112
112
|
elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
|
113
|
-
|
113
|
+
joined_sequence= r1_seq + r2_seq[overlap..-1]
|
114
114
|
else
|
115
115
|
next
|
116
116
|
end
|
117
|
+
seq_names.each do |seq_name|
|
118
|
+
joined_seq[seq_name] = joined_sequence
|
119
|
+
end
|
117
120
|
end
|
121
|
+
|
118
122
|
joined_seq_hash = ViralSeq::SeqHash.new
|
119
123
|
joined_seq_hash.dna_hash = joined_seq
|
120
124
|
joined_seq_hash.title = self.title + "_joined"
|
@@ -139,7 +143,7 @@ module ViralSeq
|
|
139
143
|
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
144
|
# my_seqhashpair.join2.dna_hash
|
141
145
|
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
-
# my_seqhashpair.join2(model :indiv).dna_hash
|
146
|
+
# my_seqhashpair.join2(model: :indiv).dna_hash
|
143
147
|
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
148
|
|
145
149
|
def join2(model: :con, diff: 0.0)
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -31,5 +31,9 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
33
33
|
spec.add_runtime_dependency "muscle_bio", "~> 0.4"
|
34
|
+
|
35
|
+
# colorize gem required
|
36
|
+
spec.add_runtime_dependency "colorize", "~> 0.1"
|
37
|
+
|
34
38
|
spec.requirements << 'R required for some functions'
|
35
39
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2020-01-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -67,6 +67,20 @@ dependencies:
|
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0.4'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: colorize
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0.1'
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0.1'
|
70
84
|
description: |-
|
71
85
|
A Ruby Gem with bioinformatics tools for processing viral NGS data.
|
72
86
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
@@ -124,7 +138,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
138
|
version: '0'
|
125
139
|
requirements:
|
126
140
|
- R required for some functions
|
127
|
-
rubygems_version: 3.
|
141
|
+
rubygems_version: 3.1.2
|
128
142
|
signing_key:
|
129
143
|
specification_version: 4
|
130
144
|
summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|