viral_seq 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +37 -0
- data/bin/locator +102 -0
- data/lib/viral_seq/seq_hash.rb +37 -0
- data/lib/viral_seq/seq_hash_pair.rb +2 -2
- data/lib/viral_seq/sequence.rb +15 -8
- data/lib/viral_seq/version.rb +1 -1
- data/viral_seq.gemspec +2 -5
- metadata +6 -7
- data/bin/console +0 -14
- data/bin/setup +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c704f9231123785d82a46e3bb8eb797564ffce24e23f91817b634b502438ac3
|
4
|
+
data.tar.gz: 2cfa762e094166be0510b5b2e344dbac2913b7da431c922def1802f594d96559
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f5d9b0bd5acada5be9bc20d9264917b7317ef0abbfa70395527483fb2e7e0d256c5423f7a498b3c7add3416aadd2806a797b297360175f6257fae19abfd3122
|
7
|
+
data.tar.gz: f64e177e00a642a090cc07b6ef7150ae6047e58a3f84b6d4186300ca60925bd212e0561edf4046d2467ebc297701c1706cec23f877a4f9724799361c3fa9765e
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -15,8 +15,45 @@ Load all ViralSeq classes by requiring 'viral_seq.rb'
|
|
15
15
|
#!/usr/bin/env ruby
|
16
16
|
require 'viral_seq'
|
17
17
|
|
18
|
+
## Some Examples
|
19
|
+
|
20
|
+
Load nucleotide sequences from a FASTA format sequence file
|
21
|
+
|
22
|
+
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
23
|
+
|
24
|
+
Make an alignment (using MUSCLE)
|
25
|
+
|
26
|
+
aligned_seqhash = my_seqhash.align
|
27
|
+
|
28
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
29
|
+
|
30
|
+
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
31
|
+
|
32
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
33
|
+
|
34
|
+
qc_seqhash = qc_seqhash.a3g
|
35
|
+
|
36
|
+
Calculate nucleotide diversity π
|
37
|
+
|
38
|
+
qc_seqhash.pi
|
39
|
+
|
40
|
+
Calculate cut-off for minority variants based on Poisson model
|
41
|
+
|
42
|
+
cut_off = qc_seqhash.pm
|
43
|
+
|
44
|
+
Examine for drug resistance mutations for HIV PR region
|
45
|
+
|
46
|
+
qc_seqhash.sdrm_hiv_pr(cut_off)
|
47
|
+
|
18
48
|
## Updates
|
19
49
|
|
50
|
+
Version 1.0.1-07102019:
|
51
|
+
|
52
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
53
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, similar to the HIV Sequence Locator from LANL.
|
54
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
55
|
+
4. Update documentation.
|
56
|
+
|
20
57
|
Version 1.0.0-07092019:
|
21
58
|
|
22
59
|
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
data/bin/locator
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'viral_seq'
|
4
|
+
require 'csv'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
module ViralSeq
|
8
|
+
class SeqHash
|
9
|
+
|
10
|
+
def sequence_locator(ref_option = :HXB2)
|
11
|
+
out_array = []
|
12
|
+
dna_seq = self.dna_hash
|
13
|
+
title = self.title
|
14
|
+
|
15
|
+
uniq_dna = dna_seq.uniq_hash
|
16
|
+
|
17
|
+
uniq_dna.each do |seq,names|
|
18
|
+
s = ViralSeq::Sequence.new('',seq)
|
19
|
+
loc = s.locator(ref_option)
|
20
|
+
names.each do |name|
|
21
|
+
out_array << ([title, name, ref_option.to_s] + loc)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
return out_array
|
25
|
+
end # end of locator
|
26
|
+
alias_method :loc, :sequence_locator
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def myparser
|
31
|
+
options = {}
|
32
|
+
OptionParser.new do |opts|
|
33
|
+
opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
|
34
|
+
|
35
|
+
opts.on('-i', '--infile FASTA_FILE', 'nt sequence file in FASTA format') do |i|
|
36
|
+
options[:infile] = i
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on('-o', '--outfile CSV_FILE', 'output .csv file for locator info') do |o|
|
40
|
+
options[:outfile] = o
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-r', '--ref_option OPTION', 'reference genome option, choose from `HXB2` (default), `NL43`, `MAC239`') do |o|
|
44
|
+
options[:ref_option] = o.to_sym
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on("-h", "--help", "Prints this help") do
|
48
|
+
puts opts
|
49
|
+
exit
|
50
|
+
end
|
51
|
+
end.parse!
|
52
|
+
return options
|
53
|
+
end
|
54
|
+
|
55
|
+
puts "\nSequence Locator (RubyGem::ViralSeq) v1.0.1 by Shuntai Zhou"
|
56
|
+
puts "See details at https://github.com/ViralSeq/viral_seq\n"
|
57
|
+
puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n\n"
|
58
|
+
|
59
|
+
ARGV << '-h' if ARGV.size == 0
|
60
|
+
|
61
|
+
options = myparser
|
62
|
+
|
63
|
+
begin
|
64
|
+
if options[:infile]
|
65
|
+
seq_file = options[:infile]
|
66
|
+
else
|
67
|
+
raise StandardError.new("Input file sequence file not found")
|
68
|
+
end
|
69
|
+
|
70
|
+
if options[:outfile]
|
71
|
+
csv_file = options[:outfile]
|
72
|
+
else
|
73
|
+
raise StandardError.new("Please provide path to output csv file")
|
74
|
+
end
|
75
|
+
|
76
|
+
unless File.exist?(seq_file)
|
77
|
+
raise StandardError.new("Input file sequence file not found")
|
78
|
+
end
|
79
|
+
|
80
|
+
seqs = ViralSeq::SeqHash.fa(seq_file)
|
81
|
+
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
82
|
+
|
83
|
+
unless [:HXB2, :NL43, :MAC239].include? opt
|
84
|
+
puts "Reference option #{opt} not recognized, using `:HXB2` as the reference genome."
|
85
|
+
opt = :HXB2
|
86
|
+
end
|
87
|
+
|
88
|
+
locs = seqs.loc(opt)
|
89
|
+
head = ["title", "sequence", "ref", "start", "end", "similarity", "indel", "aligned_input", "aligned_ref"]
|
90
|
+
locs.unshift(head)
|
91
|
+
data = CSV.generate do |csv|
|
92
|
+
locs.each {|loc| csv << loc}
|
93
|
+
end
|
94
|
+
|
95
|
+
File.write(csv_file, data)
|
96
|
+
rescue StandardError => e
|
97
|
+
puts e.message
|
98
|
+
puts "\n"
|
99
|
+
ARGV.clear
|
100
|
+
ARGV << '-h'
|
101
|
+
myparser
|
102
|
+
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -713,6 +713,43 @@ module ViralSeq
|
|
713
713
|
self.sub(seq_pass)
|
714
714
|
end # end of #hiv_seq_qc
|
715
715
|
|
716
|
+
# sequence locator for SeqHash object, resembling HIV Sequence Locator from LANL
|
717
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
718
|
+
# @return [Array] two dimensional array `[[],[],[],...]` for each sequence, including the following information:
|
719
|
+
#
|
720
|
+
# title of the SeqHash object (String)
|
721
|
+
#
|
722
|
+
# sequence taxa (String)
|
723
|
+
#
|
724
|
+
# start_location (Integer)
|
725
|
+
#
|
726
|
+
# end_location (Integer)
|
727
|
+
#
|
728
|
+
# percentage_of_similarity_to_reference_sequence (Float)
|
729
|
+
#
|
730
|
+
# containing_indel? (Boolean)
|
731
|
+
#
|
732
|
+
# aligned_input_sequence (String)
|
733
|
+
#
|
734
|
+
# aligned_reference_sequence (String)
|
735
|
+
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
736
|
+
def sequence_locator(ref_option = :HXB2)
|
737
|
+
out_array = []
|
738
|
+
dna_seq = self.dna_hash
|
739
|
+
title = self.title
|
740
|
+
|
741
|
+
uniq_dna = dna_seq.uniq_hash
|
742
|
+
|
743
|
+
uniq_dna.each do |seq,names|
|
744
|
+
s = ViralSeq::Sequence.new('',seq)
|
745
|
+
loc = s.locator(ref_option)
|
746
|
+
names.each do |name|
|
747
|
+
out_array << ([title, name, ref_option.to_s] + loc)
|
748
|
+
end
|
749
|
+
end
|
750
|
+
return out_array
|
751
|
+
end # end of locator
|
752
|
+
alias_method :loc, :sequence_locator
|
716
753
|
|
717
754
|
# Remove sequences with residual offspring Primer IDs.
|
718
755
|
# Compare PID with sequences which have identical sequences.
|
@@ -139,10 +139,10 @@ module ViralSeq
|
|
139
139
|
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
140
|
# my_seqhashpair.join2.dna_hash
|
141
141
|
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
-
# my_seqhashpair.join2(:indiv).dna_hash
|
142
|
+
# my_seqhashpair.join2(model: :indiv).dna_hash
|
143
143
|
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
144
|
|
145
|
-
def join2(model
|
145
|
+
def join2(model: :con, diff: 0.0)
|
146
146
|
seq_pair_hash = self.dna_hash
|
147
147
|
begin
|
148
148
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -142,13 +142,20 @@ module ViralSeq
|
|
142
142
|
# # current version only supports nucleotide sequence, not for amino acid sequence.
|
143
143
|
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
144
144
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
145
|
-
# @return [Array] an array of the following info
|
146
|
-
#
|
147
|
-
#
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
145
|
+
# @return [Array] an array of the following info:
|
146
|
+
#
|
147
|
+
# start_location (Integer)
|
148
|
+
#
|
149
|
+
# end_location (Integer)
|
150
|
+
#
|
151
|
+
# percentage_of_similarity_to_reference_sequence (Float)
|
152
|
+
#
|
153
|
+
# containing_indel? (Boolean)
|
154
|
+
#
|
155
|
+
# aligned_input_sequence (String)
|
156
|
+
#
|
157
|
+
# aligned_reference_sequence (String)
|
158
|
+
#
|
152
159
|
# @example identify the location of the input sequence on the NL43 genome
|
153
160
|
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
154
161
|
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
@@ -349,7 +356,7 @@ module ViralSeq
|
|
349
356
|
# s = ViralSeq::Sequence.new('my_seq', seq)
|
350
357
|
# s.sequence_clip(2333, 2433, :HXB2).dna
|
351
358
|
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
352
|
-
|
359
|
+
|
353
360
|
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
|
354
361
|
loc = self.locator(ref_option, path_to_muscle)
|
355
362
|
l1 = loc[0]
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -20,8 +20,8 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
21
21
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
22
|
end
|
23
|
-
spec.bindir = "
|
24
|
-
|
23
|
+
spec.bindir = "bin"
|
24
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
25
25
|
spec.require_paths = ["lib"]
|
26
26
|
spec.post_install_message = "Thanks for installing!"
|
27
27
|
|
@@ -31,8 +31,5 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
33
33
|
spec.add_runtime_dependency "muscle_bio", "~> 0.4"
|
34
|
-
|
35
|
-
|
36
|
-
spec.requirements << 'MUSCLE (http://www.drive5.com/muscle) required for some functions'
|
37
34
|
spec.requirements << 'R required for some functions'
|
38
35
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
8
8
|
- Michael Clark
|
9
9
|
autorequire:
|
10
|
-
bindir:
|
10
|
+
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-07-
|
12
|
+
date: 2019-07-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -73,7 +73,8 @@ description: |-
|
|
73
73
|
email:
|
74
74
|
- shuntai.zhou@gmail.com
|
75
75
|
- clarkmu@gmail.com
|
76
|
-
executables:
|
76
|
+
executables:
|
77
|
+
- locator
|
77
78
|
extensions: []
|
78
79
|
extra_rdoc_files: []
|
79
80
|
files:
|
@@ -86,8 +87,7 @@ files:
|
|
86
87
|
- LICENSE.txt
|
87
88
|
- README.md
|
88
89
|
- Rakefile
|
89
|
-
- bin/console
|
90
|
-
- bin/setup
|
90
|
+
- bin/locator
|
91
91
|
- lib/viral_seq.rb
|
92
92
|
- lib/viral_seq/Integer.rb
|
93
93
|
- lib/viral_seq/constant.rb
|
@@ -124,7 +124,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
126
|
requirements:
|
127
|
-
- MUSCLE (http://www.drive5.com/muscle) required for some functions
|
128
127
|
- R required for some functions
|
129
128
|
rubygems_version: 3.0.3
|
130
129
|
signing_key:
|
data/bin/console
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require "bundler/setup"
|
4
|
-
require "viral_seq"
|
5
|
-
|
6
|
-
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
-
# with your gem easier. You can also use a different console, if you like.
|
8
|
-
|
9
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
# require "pry"
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require "irb"
|
14
|
-
IRB.start(__FILE__)
|