viral_seq 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +37 -0
- data/bin/locator +102 -0
- data/lib/viral_seq/seq_hash.rb +37 -0
- data/lib/viral_seq/seq_hash_pair.rb +2 -2
- data/lib/viral_seq/sequence.rb +15 -8
- data/lib/viral_seq/version.rb +1 -1
- data/viral_seq.gemspec +2 -5
- metadata +6 -7
- data/bin/console +0 -14
- data/bin/setup +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c704f9231123785d82a46e3bb8eb797564ffce24e23f91817b634b502438ac3
|
4
|
+
data.tar.gz: 2cfa762e094166be0510b5b2e344dbac2913b7da431c922def1802f594d96559
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f5d9b0bd5acada5be9bc20d9264917b7317ef0abbfa70395527483fb2e7e0d256c5423f7a498b3c7add3416aadd2806a797b297360175f6257fae19abfd3122
|
7
|
+
data.tar.gz: f64e177e00a642a090cc07b6ef7150ae6047e58a3f84b6d4186300ca60925bd212e0561edf4046d2467ebc297701c1706cec23f877a4f9724799361c3fa9765e
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -15,8 +15,45 @@ Load all ViralSeq classes by requiring 'viral_seq.rb'
|
|
15
15
|
#!/usr/bin/env ruby
|
16
16
|
require 'viral_seq'
|
17
17
|
|
18
|
+
## Some Examples
|
19
|
+
|
20
|
+
Load nucleotide sequences from a FASTA format sequence file
|
21
|
+
|
22
|
+
my_seqhash = ViralSeq::SeqHash.fa('my_seq_file.fasta')
|
23
|
+
|
24
|
+
Make an alignment (using MUSCLE)
|
25
|
+
|
26
|
+
aligned_seqhash = my_seqhash.align
|
27
|
+
|
28
|
+
Filter nucleotide sequences with the reference coordinates (HIV Protease)
|
29
|
+
|
30
|
+
qc_seqhash = aligned_seqhash.hiv_seq_qc(2253, 2549, false, :HXB2)
|
31
|
+
|
32
|
+
Further filter out sequences with Apobec3g/f hypermutations
|
33
|
+
|
34
|
+
qc_seqhash = qc_seqhash.a3g
|
35
|
+
|
36
|
+
Calculate nucleotide diversity π
|
37
|
+
|
38
|
+
qc_seqhash.pi
|
39
|
+
|
40
|
+
Calculate cut-off for minority variants based on Poisson model
|
41
|
+
|
42
|
+
cut_off = qc_seqhash.pm
|
43
|
+
|
44
|
+
Examine for drug resistance mutations for HIV PR region
|
45
|
+
|
46
|
+
qc_seqhash.sdrm_hiv_pr(cut_off)
|
47
|
+
|
18
48
|
## Updates
|
19
49
|
|
50
|
+
Version 1.0.1-07102019:
|
51
|
+
|
52
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
53
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, resembling the HIV Sequence Locator from LANL.
|
54
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
55
|
+
4. Update documentation.
|
56
|
+
|
20
57
|
Version 1.0.0-07092019:
|
21
58
|
|
22
59
|
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
data/bin/locator
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'viral_seq'
|
4
|
+
require 'csv'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
module ViralSeq
|
8
|
+
class SeqHash
|
9
|
+
|
10
|
+
def sequence_locator(ref_option = :HXB2)
|
11
|
+
out_array = []
|
12
|
+
dna_seq = self.dna_hash
|
13
|
+
title = self.title
|
14
|
+
|
15
|
+
uniq_dna = dna_seq.uniq_hash
|
16
|
+
|
17
|
+
uniq_dna.each do |seq,names|
|
18
|
+
s = ViralSeq::Sequence.new('',seq)
|
19
|
+
loc = s.locator(ref_option)
|
20
|
+
names.each do |name|
|
21
|
+
out_array << ([title, name, ref_option.to_s] + loc)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
return out_array
|
25
|
+
end # end of locator
|
26
|
+
alias_method :loc, :sequence_locator
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def myparser
|
31
|
+
options = {}
|
32
|
+
OptionParser.new do |opts|
|
33
|
+
opts.banner = "Usage: locator -i [nt_sequence_fasta_file] -o [locator_info_csv_file] -r [reference_genome_option]"
|
34
|
+
|
35
|
+
opts.on('-i', '--infile FASTA_FILE', 'nt sequence file in FASTA format') do |i|
|
36
|
+
options[:infile] = i
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on('-o', '--outfile CSV_FILE', 'output .csv file for locator info') do |o|
|
40
|
+
options[:outfile] = o
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-r', '--ref_option OPTION', 'reference genome option, choose from `HXB2` (default), `NL43`, `MAC239`') do |o|
|
44
|
+
options[:ref_option] = o.to_sym
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on("-h", "--help", "Prints this help") do
|
48
|
+
puts opts
|
49
|
+
exit
|
50
|
+
end
|
51
|
+
end.parse!
|
52
|
+
return options
|
53
|
+
end
|
54
|
+
|
55
|
+
puts "\nSequence Locator (RubyGem::ViralSeq) v1.0.1 by Shuntai Zhou"
|
56
|
+
puts "See details at https://github.com/ViralSeq/viral_seq\n"
|
57
|
+
puts "Resembling Sequence Locator from LANL (https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html)\n\n"
|
58
|
+
|
59
|
+
ARGV << '-h' if ARGV.size == 0
|
60
|
+
|
61
|
+
options = myparser
|
62
|
+
|
63
|
+
begin
|
64
|
+
if options[:infile]
|
65
|
+
seq_file = options[:infile]
|
66
|
+
else
|
67
|
+
raise StandardError.new("Input file sequence file not found")
|
68
|
+
end
|
69
|
+
|
70
|
+
if options[:outfile]
|
71
|
+
csv_file = options[:outfile]
|
72
|
+
else
|
73
|
+
raise StandardError.new("Please provide path to output csv file")
|
74
|
+
end
|
75
|
+
|
76
|
+
unless File.exist?(seq_file)
|
77
|
+
raise StandardError.new("Input file sequence file not found")
|
78
|
+
end
|
79
|
+
|
80
|
+
seqs = ViralSeq::SeqHash.fa(seq_file)
|
81
|
+
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
82
|
+
|
83
|
+
unless [:HXB2, :NL43, :MAC239].include? opt
|
84
|
+
puts "Reference option #{opt} not recognized, using `:HXB2` as the reference genome."
|
85
|
+
opt = :HXB2
|
86
|
+
end
|
87
|
+
|
88
|
+
locs = seqs.loc(opt)
|
89
|
+
head = ["title", "sequence", "ref", "start", "end", "similarity", "indel", "aligned_input", "aligned_ref"]
|
90
|
+
locs.unshift(head)
|
91
|
+
data = CSV.generate do |csv|
|
92
|
+
locs.each {|loc| csv << loc}
|
93
|
+
end
|
94
|
+
|
95
|
+
File.write(csv_file, data)
|
96
|
+
rescue StandardError => e
|
97
|
+
puts e.message
|
98
|
+
puts "\n"
|
99
|
+
ARGV.clear
|
100
|
+
ARGV << '-h'
|
101
|
+
myparser
|
102
|
+
end
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -713,6 +713,43 @@ module ViralSeq
|
|
713
713
|
self.sub(seq_pass)
|
714
714
|
end # end of #hiv_seq_qc
|
715
715
|
|
716
|
+
# sequence locator for SeqHash object, resembling HIV Sequence Locator from LANL
|
717
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
718
|
+
# @return [Array] two dimensional array `[[],[],[],...]` for each sequence, including the following information:
|
719
|
+
#
|
720
|
+
# title of the SeqHash object (String)
|
721
|
+
#
|
722
|
+
# sequence taxa (String)
|
723
|
+
#
|
724
|
+
# start_location (Integer)
|
725
|
+
#
|
726
|
+
# end_location (Integer)
|
727
|
+
#
|
728
|
+
# percentage_of_similarity_to_reference_sequence (Float)
|
729
|
+
#
|
730
|
+
# containing_indel? (Boolean)
|
731
|
+
#
|
732
|
+
# aligned_input_sequence (String)
|
733
|
+
#
|
734
|
+
# aligned_reference_sequence (String)
|
735
|
+
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
736
|
+
def sequence_locator(ref_option = :HXB2)
|
737
|
+
out_array = []
|
738
|
+
dna_seq = self.dna_hash
|
739
|
+
title = self.title
|
740
|
+
|
741
|
+
uniq_dna = dna_seq.uniq_hash
|
742
|
+
|
743
|
+
uniq_dna.each do |seq,names|
|
744
|
+
s = ViralSeq::Sequence.new('',seq)
|
745
|
+
loc = s.locator(ref_option)
|
746
|
+
names.each do |name|
|
747
|
+
out_array << ([title, name, ref_option.to_s] + loc)
|
748
|
+
end
|
749
|
+
end
|
750
|
+
return out_array
|
751
|
+
end # end of locator
|
752
|
+
alias_method :loc, :sequence_locator
|
716
753
|
|
717
754
|
# Remove sequences with residual offspring Primer IDs.
|
718
755
|
# Compare PID with sequences which have identical sequences.
|
@@ -139,10 +139,10 @@ module ViralSeq
|
|
139
139
|
# my_seqhashpair = ViralSeq::SeqHashPair.new(paired_seq2)
|
140
140
|
# my_seqhashpair.join2.dna_hash
|
141
141
|
# => {">pair4"=>"AAAGGGGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
142
|
-
# my_seqhashpair.join2(:indiv).dna_hash
|
142
|
+
# my_seqhashpair.join2(model: :indiv).dna_hash
|
143
143
|
# => {">pair4"=>"AAAGGGGGGGTT", ">pair5"=>"AAAAAAGGGGTTTTT", ">pair6"=>"AAACAAGGGGTTTTT"}
|
144
144
|
|
145
|
-
def join2(model
|
145
|
+
def join2(model: :con, diff: 0.0)
|
146
146
|
seq_pair_hash = self.dna_hash
|
147
147
|
begin
|
148
148
|
raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -142,13 +142,20 @@ module ViralSeq
|
|
142
142
|
# # current version only supports nucleotide sequence, not for amino acid sequence.
|
143
143
|
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:NL43`, `:MAC239`
|
144
144
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
145
|
-
# @return [Array] an array of the following info
|
146
|
-
#
|
147
|
-
#
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
145
|
+
# @return [Array] an array of the following info:
|
146
|
+
#
|
147
|
+
# start_location (Integer)
|
148
|
+
#
|
149
|
+
# end_location (Integer)
|
150
|
+
#
|
151
|
+
# percentage_of_similarity_to_reference_sequence (Float)
|
152
|
+
#
|
153
|
+
# containing_indel? (Boolean)
|
154
|
+
#
|
155
|
+
# aligned_input_sequence (String)
|
156
|
+
#
|
157
|
+
# aligned_reference_sequence (String)
|
158
|
+
#
|
152
159
|
# @example identify the location of the input sequence on the NL43 genome
|
153
160
|
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
154
161
|
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
@@ -349,7 +356,7 @@ module ViralSeq
|
|
349
356
|
# s = ViralSeq::Sequence.new('my_seq', seq)
|
350
357
|
# s.sequence_clip(2333, 2433, :HXB2).dna
|
351
358
|
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
352
|
-
|
359
|
+
|
353
360
|
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2, path_to_muscle = false)
|
354
361
|
loc = self.locator(ref_option, path_to_muscle)
|
355
362
|
l1 = loc[0]
|
data/lib/viral_seq/version.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -20,8 +20,8 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
21
21
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
22
|
end
|
23
|
-
spec.bindir = "
|
24
|
-
|
23
|
+
spec.bindir = "bin"
|
24
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
25
25
|
spec.require_paths = ["lib"]
|
26
26
|
spec.post_install_message = "Thanks for installing!"
|
27
27
|
|
@@ -31,8 +31,5 @@ Gem::Specification.new do |spec|
|
|
31
31
|
|
32
32
|
# muscle_bio gem required
|
33
33
|
spec.add_runtime_dependency "muscle_bio", "~> 0.4"
|
34
|
-
|
35
|
-
|
36
|
-
spec.requirements << 'MUSCLE (http://www.drive5.com/muscle) required for some functions'
|
37
34
|
spec.requirements << 'R required for some functions'
|
38
35
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
8
8
|
- Michael Clark
|
9
9
|
autorequire:
|
10
|
-
bindir:
|
10
|
+
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-07-
|
12
|
+
date: 2019-07-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -73,7 +73,8 @@ description: |-
|
|
73
73
|
email:
|
74
74
|
- shuntai.zhou@gmail.com
|
75
75
|
- clarkmu@gmail.com
|
76
|
-
executables:
|
76
|
+
executables:
|
77
|
+
- locator
|
77
78
|
extensions: []
|
78
79
|
extra_rdoc_files: []
|
79
80
|
files:
|
@@ -86,8 +87,7 @@ files:
|
|
86
87
|
- LICENSE.txt
|
87
88
|
- README.md
|
88
89
|
- Rakefile
|
89
|
-
- bin/console
|
90
|
-
- bin/setup
|
90
|
+
- bin/locator
|
91
91
|
- lib/viral_seq.rb
|
92
92
|
- lib/viral_seq/Integer.rb
|
93
93
|
- lib/viral_seq/constant.rb
|
@@ -124,7 +124,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
126
|
requirements:
|
127
|
-
- MUSCLE (http://www.drive5.com/muscle) required for some functions
|
128
127
|
- R required for some functions
|
129
128
|
rubygems_version: 3.0.3
|
130
129
|
signing_key:
|
data/bin/console
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require "bundler/setup"
|
4
|
-
require "viral_seq"
|
5
|
-
|
6
|
-
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
-
# with your gem easier. You can also use a different console, if you like.
|
8
|
-
|
9
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
# require "pry"
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require "irb"
|
14
|
-
IRB.start(__FILE__)
|