viral_seq 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,119 @@
1
+ # functions added to Class::String for direct operation on sequence as a String object
2
+
3
class String

  # reverse complement
  # Only upper-case A, C, G and T are complemented; every other character
  # is kept as it is (only its position is reversed).
  # @return [String] reverse complement sequence
  # @example Reverse complement
  #   "ACAGA".rc
  #   => "TCTGT"

  def rc
    reverse.tr("ACTG", "TGAC")
  end

  # mutate a nt sequence (String class) randomly
  # Each position is replaced, with probability `error_rate`, by one of the
  # bases A/C/T/G other than the base currently at that position.
  # @param error_rate [Float] define an error rate for mutation, default to `0.01`
  # @return [String] mutated sequence as a new String, the receiver is not changed
  # @example mutate a sequence at an error rate of 0.05
  #   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
  #   seq.mutation(0.05)
  #   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"

  def mutation(error_rate = 0.01)
    # the rate is compared against a uniform draw from 0...10000,
    # i.e. it is resolved in steps of 0.0001
    threshold = error_rate * 10000
    each_char.with_object(+"") do |nt, mutated|
      mutated << (rand(10000) < threshold ? (%w[A C T G] - [nt]).sample : nt)
    end
  end

  # parse the nucleotide sequences as a String object
  # and return a Regexp object for possible matches
  # Unambiguous bases are copied literally; an ambiguity code becomes a
  # character class of the bases it stands for. The class is written without
  # `|` separators: inside `[...]` a `|` is a literal character, so `[A|G]`
  # would also match the text "|".
  # @return [Regexp] as possible matches
  # @raise [RegexpError] if the sequence contains a character that {#to_list}
  #   cannot resolve (it yields an empty character class)
  # @example parse a sequence with ambiguities
  #   "ATRWCG".nt_parser
  #   => /AT[AG][AT]CG/

  def nt_parser
    pattern = each_char.map do |base|
      bases = base.to_list
      bases.size == 1 ? bases[0] : "[#{bases.join}]"
    end.join
    Regexp.new(pattern)
  end

  # parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
  # The lookup is case-insensitive. A, T, C and G return the receiver itself
  # (case preserved); ambiguity codes return upper-case bases. Anything else,
  # including strings longer than one character, returns an empty Array.
  # @return [Array] parsed nt bases
  # @example parse IUPAC `R`
  #   'R'.to_list
  #   => ["A", "G"]

  def to_list
    case upcase
    when "A", "T", "C", "G" then [self]
    when "W" then %w[A T]
    when "S" then %w[C G]
    when "M" then %w[A C]
    when "K" then %w[G T] # keto bases are G and T (G/C is already covered by S)
    when "R" then %w[A G]
    when "Y" then %w[C T]
    when "B" then %w[C G T]
    when "D" then %w[A G T]
    when "H" then %w[A C T]
    when "V" then %w[A C G]
    when "N" then %w[A T C G]
    else []
    end
  end

  # compare two sequences as String objects, two sequence strings need to be aligned first
  # The comparison walks the positions of the receiver only: positions missing
  # from a shorter `seq2` count as differences, extra positions of a longer
  # `seq2` are ignored.
  # @param seq2 [String] the sequence string to compare with
  # @return [Integer] the total number of differences as integer
  # @example compare two sequence strings, without alignment and with alignment
  #   seq1 = 'AAGGCGTAGGAC'
  #   seq2 = 'AAGCTTAGGACG'
  #   seq1.compare_with(seq2) # no alignment
  #   => 8
  #   aligned_seqs = ViralSeq::Muscle.align(seq1,seq2) # align using MUSCLE
  #   aligned_seqs[0].compare_with(aligned_seqs[1])
  #   => 4

  def compare_with(seq2)
    each_char.with_index.count { |nt, position| nt != seq2[position] }
  end
end
@@ -2,5 +2,5 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "0.3.2"
5
+ VERSION = "1.0.0"
6
6
  end
data/lib/viral_seq.rb CHANGED
@@ -18,24 +18,23 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
- # viral_seq main
22
21
  module ViralSeq; end
23
22
 
24
- # load all modules
25
-
26
- require "viral_seq/version"
27
- require "viral_seq/sequence"
23
+ # load all classes
24
+ require "viral_seq/constant"
25
+ require "viral_seq/enumerable"
26
+ require "viral_seq/hash"
27
+ require "viral_seq/hivdr"
28
+ require "viral_seq/integer"
28
29
  require "viral_seq/math"
29
- require "viral_seq/fasta"
30
- require "viral_seq/misc"
31
- require "viral_seq/refseq"
32
- require "viral_seq/locator"
33
30
  require "viral_seq/muscle"
34
- require "viral_seq/tcs_core.rb"
35
- require "viral_seq/poisson_cutoff"
36
- require "viral_seq/a3g"
37
- require "viral_seq/sdrm_core"
38
- require "viral_seq/hcv_dr"
39
- require "viral_seq/nt_variation"
31
+ require "viral_seq/pid"
32
+ require "viral_seq/ref_seq"
33
+ require "viral_seq/rubystats"
34
+ require "viral_seq/seq_hash"
35
+ require "viral_seq/seq_hash_pair"
36
+ require "viral_seq/sequence"
37
+ require "viral_seq/string"
38
+ require "viral_seq/version"
40
39
 
41
40
  require "muscle_bio"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2019-06-21 00:00:00.000000000 Z
12
+ date: 2019-07-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -89,19 +89,20 @@ files:
89
89
  - bin/console
90
90
  - bin/setup
91
91
  - lib/viral_seq.rb
92
- - lib/viral_seq/a3g.rb
93
- - lib/viral_seq/fasta.rb
94
- - lib/viral_seq/hcv_dr.rb
95
- - lib/viral_seq/locator.rb
92
+ - lib/viral_seq/Integer.rb
93
+ - lib/viral_seq/constant.rb
94
+ - lib/viral_seq/enumerable.rb
95
+ - lib/viral_seq/hash.rb
96
+ - lib/viral_seq/hivdr.rb
96
97
  - lib/viral_seq/math.rb
97
- - lib/viral_seq/misc.rb
98
98
  - lib/viral_seq/muscle.rb
99
- - lib/viral_seq/nt_variation.rb
100
- - lib/viral_seq/poisson_cutoff.rb
101
- - lib/viral_seq/refseq.rb
102
- - lib/viral_seq/sdrm_core.rb
99
+ - lib/viral_seq/pid.rb
100
+ - lib/viral_seq/ref_seq.rb
101
+ - lib/viral_seq/rubystats.rb
102
+ - lib/viral_seq/seq_hash.rb
103
+ - lib/viral_seq/seq_hash_pair.rb
103
104
  - lib/viral_seq/sequence.rb
104
- - lib/viral_seq/tcs_core.rb
105
+ - lib/viral_seq/string.rb
105
106
  - lib/viral_seq/version.rb
106
107
  - viral_seq.gemspec
107
108
  homepage: https://github.com/ViralSeq/viral_seq
data/lib/viral_seq/a3g.rb DELETED
@@ -1,172 +0,0 @@
1
- # viral_seq/a3g
2
- # APOBEC3g/f hypermutation function including
3
- # ViralSeq::a3g_hypermut_seq_hash
4
- # ViralSeq::apobec3gf
5
-
6
- # APOBEC3g/f G to A hypermutation
7
- # APOBEC3G/F pattern: GRD -> ARD
8
- # control pattern: G[YN|RC] -> A[YN|RC]
9
- # use the sample consensus to determine potential a3g sites
10
-
11
- # Two criteria to identify hypermutation
12
- # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
13
- # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
14
- # note: criteria 2 only applies on a sequence file containing more than 20 sequences
15
- # b/c Poisson model does not do well on small sample size.
16
-
17
- # ViralSeq.a3g_hypermut_seq_hash(sequence_hash)
18
- # sequence_hash is a Hash object for sequences. {:name => :sequence, ...}
19
- # return array [hypermutation_hash, statistic_info]
20
- # hypermutation_hash is a Hash object for sequences
21
- # statistic_info is a hash object of [sequence_name, stats],
22
- # in which stats String object in csv format (separated by ',') containing
23
- # sequence tag
24
- # G to A mutation numbers at potential a3g positions
25
- # total potential a3g G positions
26
- # G to A mutation numbers at non a3g positions
27
- # total non a3g G positions
28
- # a3g G to A mutation rate / non-a3g G to A mutation rate
29
- # Fishers Exact P-value
30
- #
31
- # =USAGE
32
- # # example 1
33
- # sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence1.fasta')
34
- # hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
35
- # hypermut[0].keys
36
- # => [">Seq7", ">Seq14"]
37
- # stats = hypermut[1]
38
- # stats.values
39
- # => [">Seq7,23,68,1,54,18.26,4.308329383112348e-06", ">Seq14,45,68,9,54,3.97,5.2143571971582974e-08"]
40
- #
41
- # # example 2
42
- # sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence2.fasta')
43
- # hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
44
- # stats = hypermut[1]
45
- # stats.values
46
- # => [">CTAACACTCA_134_a3g-sample2,4,35,0,51,Infinity,0.02465676660128911", ">ATAGTGCCCA_60_a3g-sample2,4,35,1,51,5.83,0.1534487353839561"]
47
- # # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05, but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
48
-
49
-
50
- # ViralSeq.apobec3gf(sequence)
51
- # APOBEC3G/F pattern: GRD -> ARD
52
- # control pattern: G[YN|RC] -> A[YN|RC]
53
- # input a sequence String object
54
- # return two arrays of position numbers of
55
- # a3g G positions (a3g)
56
- # non-a3g G positions (control)
57
-
58
-
59
module ViralSeq

  # Identify APOBEC3G/F hypermutated sequences in a sequence hash.
  #
  # A consensus of all input sequences is scanned for potential A3G sites
  # and control G sites (see {ViralSeq.apobec3gf}). For every sequence the
  # G to A changes at both kinds of sites are counted and compared with a
  # Fisher's exact test; sequences with a two-tailed p value below 0.05 are
  # reported. When more than 20 sequences are given, sequences whose number
  # of A3G-site mutations exceeds a Poisson-derived cut-off are reported too.
  #
  # @param seq_hash [Hash] sequences, {sequence_name => sequence_string}
  # @return [Array] `[hypermut_seq_hash, stats_hash]`; the first maps the
  #   names of the called sequences to their sequence, the second maps the
  #   same names to a csv String
  #   "name,a3g_muts,a3g_sites,control_muts,control_sites,rate_ratio,p_value"
  def self.a3g_hypermut_seq_hash(seq_hash)
    mut_hash = {} # sequence name => number of G to A mutations at A3G sites
    hm_hash = {}  # sequence name => stats String, hypermutated sequences only
    out_hash = {} # sequence name => stats String, every sequence

    # total G->A mutations at apobec3g/f positions.
    total = 0

    # make consensus sequence for the input sequence hash
    ref = ViralSeq.consensus(seq_hash.values)

    # obtain apobec3g positions and control positions
    mut, control = ViralSeq.apobec3gf(ref)

    seq_hash.each do |k, v|
      a = 0 # muts
      b = 0 # potential mut sites
      c = 0 # control muts
      d = 0 # potential controls

      mut.each do |n|
        next if v[n] == "-" # gaps are neither a site nor a mutation
        b += 1
        a += 1 if v[n] == "A"
      end
      mut_hash[k] = a
      total += a

      control.each do |n|
        next if v[n] == "-"
        d += 1
        c += 1 if v[n] == "A"
      end

      # ratio of the two mutation rates; Float division, so a sequence
      # without control mutations yields Infinity instead of raising
      rr = (a / b.to_f) / (c / d.to_f)

      t1 = b - a # unmutated A3G sites
      t2 = d - c # unmutated control sites

      fet = Rubystats::FishersExactTest.new
      fisher = fet.calculate(t1, t2, a, c)
      perc = fisher[:twotail]
      info = "#{k},#{a},#{b},#{c},#{d},#{rr.round(2)},#{perc}"
      out_hash[k] = info
      hm_hash[k] = info if perc < 0.05
    end

    # the Poisson outlier criterion is only applied to more than 20 sequences
    if seq_hash.size > 20
      rate = total.to_f / seq_hash.size

      count_mut = ViralSeq.count(mut_hash.values)
      # NOTE(review): this takes the largest *value* of count_mut; if
      # ViralSeq.count returns {mutation_number => frequency} the largest key
      # may have been intended - confirm against ViralSeq.count and
      # ViralSeq.poisson_distribution before changing.
      maxi_count = count_mut.values.max

      poisson_hash = ViralSeq.poisson_distribution(rate, maxi_count)

      cut_off = 0
      poisson_hash.each do |k, v|
        cal = seq_hash.size * v # expected number of sequences
        obs = count_mut[k]      # observed number of sequences
        if obs >= 20 * cal
          cut_off = k
          break
        elsif k == maxi_count
          cut_off = maxi_count
        end
      end

      mut_hash.each do |k, v|
        hm_hash[k] = out_hash[k] if v > cut_off
      end
    end

    hm_seq_hash = {}
    hm_hash.each_key { |k| hm_seq_hash[k] = seq_hash[k] }
    [hm_seq_hash, hm_hash]
  end

  # APOBEC3G/F mutation position identification
  # APOBEC3G/F pattern: GRD -> ARD
  # control pattern: G[YN|RC] -> A[YN|RC]
  #
  # Gaps ("-") are removed from a copy of the sequence first, so the caller's
  # String is left untouched (frozen Strings are accepted) and the returned
  # positions are 0-based indexes into the gap-free sequence. A "G" in the
  # last two positions has no complete trinucleotide context and is reported
  # in neither list.
  #
  # @param seq [String] nucleotide sequence
  # @return [Array] `[a3g_positions, control_positions]`, two Arrays of Integer
  def self.apobec3gf(seq = "")
    seq = seq.tr("-", "")
    apobec_position = []
    control_position = []
    (0..(seq.size - 3)).each do |n|
      # G followed by R (A/G) and then D (A/G/T)
      if seq[n, 3] =~ /G[AG][AGT]/
        apobec_position << n
      elsif seq[n] == "G"
        control_position << n
      end
    end
    [apobec_position, control_position]
  end

end
@@ -1,154 +0,0 @@
1
- # fasta.rb
2
- # methods for converting sequence formats, including
3
- # ViralSeq::fasta_to_hash
4
- # ViralSeq::fastq_to_fasta
5
- # ViralSeq::fastq_to_hash
6
- # ViralSeq::fasta_hash_to_rsphylip
7
- # ViralSeq::pair_fasta_to_hash
8
-
9
- # =USAGE
10
- # sequence_fasta_hash = ViralSeq.fasta_to_hash(input_fasta_file)
11
- # # input a sequence file in fasta format, read as a sequence hash
12
- # # {:sequence_name1 => sequence1, ...}
13
-
14
- # sequence_fasta_hash = ViralSeq.fastq_to_fasta(input_fastq_file)
15
- # # input a sequence file in fastq format, read as a sequence hash
16
- # # discard sequence quality score
17
-
18
- # sequence_fastq_hash = ViralSeq.fastq_to_hash(input_fastq_file)
19
- # # input a sequence file in fastq format, read as a sequence hash
20
- # # keep sequence quality score
21
- # # {:sequence_name1 => [sequence1, quality1], ...}
22
-
23
- # phylip_hash = ViralSeq.fasta_hash_to_rsphylip(sequence_fasta_hash)
24
- # # convert an aligned fasta sequence hash into relaxed sequential phylip format
25
-
26
- # paired_sequence_hash = ViralSeq.pair_fasta_to_hash(directory_of_paired_fasta)
27
- # # input a directory containing paired sequence files in the fasta format
28
- # # ├───lib1
29
- # │ lib1_r1.txt
30
- # │ lib1_r2.txt
31
- # # paired sequence files need to have "r1" and "r2" in their file names
32
- # # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
33
- # # return a paired sequence hash :seq_name => [r1_seq, r2_seq]
34
-
35
module ViralSeq

  # Read a sequence file in fasta format into a sequence hash.
  # NUL characters are stripped, blank lines and lines starting with "=" are
  # skipped, and sequence lines are upper-cased and concatenated under the
  # most recent header line (kept with its leading ">").
  # @param infile [String] path of the fasta file
  # @return [Hash] {sequence_name => sequence}
  def self.fasta_to_hash(infile)
    return_hash = {}
    name = ""
    # block form: the file is closed even if a line fails to parse
    File.foreach(infile) do |line|
      line.tr!("\u0000", "")
      next if line == "\n"
      next if line =~ /^\=/
      if line =~ /^\>/
        name = line.chomp
        return_hash[name] = +""
      else
        return_hash[name] << line.chomp.upcase
      end
    end
    return_hash
  end

  # fastq file to fasta, discard quality, return a sequence hash
  # Records are taken as consecutive groups of four lines; every "@" in the
  # header line is replaced by ">".
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {sequence_name => sequence}
  # @raise [ArgumentError] if the last record has a header but no sequence line
  def self.fastq_to_fasta(fastq_file)
    sequence_a = []
    File.foreach(fastq_file).with_index(1) do |line, count|
      case count % 4
      when 1 then sequence_a << line.tr('@', '>').chomp
      when 2 then sequence_a << line.chomp
      end
    end
    # pair up [name, sequence] without splatting the whole file into an
    # argument list, which can exhaust the VM stack on large inputs
    sequence_a.each_slice(2).to_h
  end

  # fastq file to hash, including quality. {:seq_name => [seq,quality]}
  # Records are taken as consecutive groups of four lines; every "@" in the
  # header line is replaced by ">".
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {sequence_name => [sequence, quality]}
  # @raise [ArgumentError] if the last record is truncated
  def self.fastq_to_hash(fastq_file)
    sequence_a = []
    quality_a = []
    File.foreach(fastq_file).with_index(1) do |line, count|
      case count % 4
      when 1
        name = line.tr('@', '>').chomp
        sequence_a << name
        quality_a << name
      when 2
        sequence_a << line.chomp
      when 0
        quality_a << line.chomp
      end
    end
    sequence_hash = sequence_a.each_slice(2).to_h
    quality_hash = quality_a.each_slice(2).to_h
    sequence_hash.each_with_object({}) do |(k, v), return_hash|
      return_hash[k] = [v, quality_hash[k]]
    end
  end

  # fasta sequence hash to relaxed sequential phylip format
  # The header line holds the number of sequences and the length of the first
  # sequence. Names are written without their leading ">" and padded to the
  # longest name (at least 10 characters) plus one space; sequences are
  # written in blocks of 10 characters.
  # @param seqs [Hash] aligned sequences, {">name" => sequence}
  # @return [String] the alignment in relaxed sequential phylip format
  def self.fasta_hash_to_rsphylip(seqs)
    outline = "\s" + seqs.size.to_s + "\s" + seqs.values[0].size.to_s + "\n"
    # width of the longest name without its ">"; this must be the longest
    # key, not the lexicographically greatest one, or longer names get a
    # negative padding
    max_name_l = seqs.keys.map(&:size).max - 1
    name_block_l = [max_name_l, 10].max
    seqs.each do |k, v|
      padding = "\s" * (name_block_l - k.size + 2)
      outline << k[1..-1] << padding << v.scan(/.{1,10}/).join("\s") << "\n"
    end
    outline
  end

  # input a directory with r1 and r2 sequences, return a hash :seq_name => [r1_seq, r2_seq]
  # r1 and r2 file names should contain "r1" and "r2" respectively
  # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
  # @param indir [String] directory holding the paired fasta files
  # @return [Hash] {sequence_name => [r1_seq, r2_seq]}, names are given
  #   without their last 3 characters; r2_seq is nil when r2 has no such name
  def self.pair_fasta_to_hash(indir)
    r1_file = ""
    r2_file = ""
    Dir[indir + "/*"].each do |f|
      if File.basename(f) =~ /r1/i
        r1_file = f
      elsif File.basename(f) =~ /r2/i
        r2_file = f
      end
    end

    seq1 = ViralSeq.fasta_to_hash(r1_file)
    seq2 = ViralSeq.fasta_to_hash(r2_file)

    # drop the last 3 characters of every name so r1 and r2 names match
    new_seq1 = seq1.each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }
    new_seq2 = seq2.each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }

    new_seq1.each_with_object({}) do |(seq_name, seq), seq_pair_hash|
      seq_pair_hash[seq_name] = [seq, new_seq2[seq_name]]
    end
  end
end