viral_seq 0.3.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
@@ -0,0 +1,119 @@
|
|
1
|
+
# functions added to Class::String for direct operation on sequence as a String object
|
2
|
+
|
3
|
+
class String

  # Reverse complement of a nucleotide sequence.
  # Only upper-case A, C, G and T are complemented; any other character
  # (lower case, gaps, ambiguity codes) is kept unchanged, in reversed order.
  # @return [String] reverse complement sequence
  # @example Reverse complement
  #   "ACAGA".rc
  #   => "TCTGT"

  def rc
    reverse.tr("ACTG", "TGAC")
  end

  # Mutate a nt sequence (String class) randomly.
  # Every position is independently replaced, with probability `error_rate`,
  # by one of A/C/T/G that differs from the current character.
  # @param error_rate [Float] define an error rate for mutation, default to `0.01`
  # @return [String] mutated sequence as String
  # @example mutate a sequence at an error rate of 0.05
  #   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
  #   seq.mutation(0.05)
  #   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"

  def mutation(error_rate = 0.01)
    bases = %w[A C T G]
    each_char.map do |nt|
      # A uniform float in [0, 1) is compared with the rate directly, so the
      # rate is not quantized to steps of 1/10000 (rates below 0.0001 work).
      if rand < error_rate
        (bases - [nt]).sample
      else
        nt
      end
    end.join
  end

  # Parse the nucleotide sequence as a String object
  # and return a Regexp object for possible matches.
  # Ambiguity codes become character classes; characters that `to_list`
  # does not recognise (e.g. gaps) are matched literally.
  # @return [Regexp] as possible matches
  # @example parse a sequence with ambiguities
  #   "ATRWCG".nt_parser
  #   => /AT[AG][AT]CG/

  def nt_parser
    pattern = each_char.map do |base|
      candidates = base.to_list
      case candidates.size
      when 0
        # not an IUPAC code: match the character itself instead of emitting
        # an empty character class, which Regexp.new rejects
        Regexp.escape(base)
      when 1
        candidates.first
      else
        # no "|" separator: inside [...] it would be matched as a literal pipe
        "[" + candidates.join + "]"
      end
    end.join
    Regexp.new(pattern)
  end

  # Parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1.
  # A, C, G and T (either case) are returned as they are; ambiguity codes are
  # matched case-insensitively and expanded to upper-case bases.
  # @return [Array] parsed nt bases, empty when the string is not a single IUPAC code
  # @example parse IUPAC `R`
  #   'R'.to_list
  #   => ["A", "G"]

  def to_list
    case upcase
    when /\A[ATCG]\z/ then [self]
    when "W" then ['A','T']
    when "S" then ['C','G']
    when "M" then ['A','C']
    when "K" then ['G','T'] # keto bases are G/T; G/C is already covered by "S"
    when "R" then ['A','G']
    when "Y" then ['C','T']
    when "B" then ['C','G','T']
    when "D" then ['A','G','T']
    when "H" then ['A','C','T']
    when "V" then ['A','C','G']
    when "N" then ['A','T','C','G']
    else []
    end
  end

  # Compare two sequences as String objects; the two sequence strings need to be aligned first.
  # Positions are compared over the length of the receiver only: a position
  # missing from `seq2` counts as a difference, extra positions in `seq2` are ignored.
  # @param seq2 [String] the sequence string to compare with
  # @return [Integer] the total number of differences as integer
  # @example compare two sequence strings, without alignment and with alignment
  #   seq1 = 'AAGGCGTAGGAC'
  #   seq2 = 'AAGCTTAGGACG'
  #   seq1.compare_with(seq2) # no alignment
  #   => 8
  #   aligned_seqs = ViralSeq::Muscle.align(seq1,seq2) # align using MUSCLE
  #   aligned_seqs[0].compare_with(aligned_seqs[1])
  #   => 4

  def compare_with(seq2)
    each_char.with_index.count { |nt, position| nt != seq2[position] }
  end

end
|
data/lib/viral_seq/version.rb
CHANGED
data/lib/viral_seq.rb
CHANGED
@@ -18,24 +18,23 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
# viral_seq main
|
22
21
|
# Top-level namespace of the viral_seq gem; declared empty here, ahead of the
# `require` lines that follow in this file.
module ViralSeq; end
|
23
22
|
|
24
|
-
# load all
|
25
|
-
|
26
|
-
require "viral_seq/
|
27
|
-
require "viral_seq/
|
23
|
+
# load all classes
|
24
|
+
require "viral_seq/constant"
|
25
|
+
require "viral_seq/enumerable"
|
26
|
+
require "viral_seq/hash"
|
27
|
+
require "viral_seq/hivdr"
|
28
|
+
require "viral_seq/integer"
|
28
29
|
require "viral_seq/math"
|
29
|
-
require "viral_seq/fasta"
|
30
|
-
require "viral_seq/misc"
|
31
|
-
require "viral_seq/refseq"
|
32
|
-
require "viral_seq/locator"
|
33
30
|
require "viral_seq/muscle"
|
34
|
-
require "viral_seq/
|
35
|
-
require "viral_seq/
|
36
|
-
require "viral_seq/
|
37
|
-
require "viral_seq/
|
38
|
-
require "viral_seq/
|
39
|
-
require "viral_seq/
|
31
|
+
require "viral_seq/pid"
|
32
|
+
require "viral_seq/ref_seq"
|
33
|
+
require "viral_seq/rubystats"
|
34
|
+
require "viral_seq/seq_hash"
|
35
|
+
require "viral_seq/seq_hash_pair"
|
36
|
+
require "viral_seq/sequence"
|
37
|
+
require "viral_seq/string"
|
38
|
+
require "viral_seq/version"
|
40
39
|
|
41
40
|
require "muscle_bio"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -89,19 +89,20 @@ files:
|
|
89
89
|
- bin/console
|
90
90
|
- bin/setup
|
91
91
|
- lib/viral_seq.rb
|
92
|
-
- lib/viral_seq/
|
93
|
-
- lib/viral_seq/
|
94
|
-
- lib/viral_seq/
|
95
|
-
- lib/viral_seq/
|
92
|
+
- lib/viral_seq/Integer.rb
|
93
|
+
- lib/viral_seq/constant.rb
|
94
|
+
- lib/viral_seq/enumerable.rb
|
95
|
+
- lib/viral_seq/hash.rb
|
96
|
+
- lib/viral_seq/hivdr.rb
|
96
97
|
- lib/viral_seq/math.rb
|
97
|
-
- lib/viral_seq/misc.rb
|
98
98
|
- lib/viral_seq/muscle.rb
|
99
|
-
- lib/viral_seq/
|
100
|
-
- lib/viral_seq/
|
101
|
-
- lib/viral_seq/
|
102
|
-
- lib/viral_seq/
|
99
|
+
- lib/viral_seq/pid.rb
|
100
|
+
- lib/viral_seq/ref_seq.rb
|
101
|
+
- lib/viral_seq/rubystats.rb
|
102
|
+
- lib/viral_seq/seq_hash.rb
|
103
|
+
- lib/viral_seq/seq_hash_pair.rb
|
103
104
|
- lib/viral_seq/sequence.rb
|
104
|
-
- lib/viral_seq/
|
105
|
+
- lib/viral_seq/string.rb
|
105
106
|
- lib/viral_seq/version.rb
|
106
107
|
- viral_seq.gemspec
|
107
108
|
homepage: https://github.com/ViralSeq/viral_seq
|
data/lib/viral_seq/a3g.rb
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
# viral_seq/a3g
|
2
|
-
# APOBEC3g/f hypermutation function including
|
3
|
-
# ViralSeq::a3g_hypermut_seq_hash
|
4
|
-
# ViralSeq::apobec3gf
|
5
|
-
|
6
|
-
# APOBEC3g/f G to A hypermutation
|
7
|
-
# APOBEC3G/F pattern: GRD -> ARD
|
8
|
-
# control pattern: G[YN|RC] -> A[YN|RC]
|
9
|
-
# use the sample consensus to determine potential a3g sites
|
10
|
-
|
11
|
-
# Two criteria to identify hypermutation
|
12
|
-
# 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
13
|
-
# 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
14
|
-
# note: criteria 2 only applies on a sequence file containing more than 20 sequences
|
15
|
-
# b/c Poisson model does not do well on small sample size.
|
16
|
-
|
17
|
-
# ViralSeq.a3g_hypermut_seq_hash(sequence_hash)
|
18
|
-
# sequence_hash is a Hash object for sequences. {:name => :sequence, ...}
|
19
|
-
# return array [hypermutation_hash, statistic_info]
|
20
|
-
# hypermutation_hash is a Hash object for sequences
|
21
|
-
# statistic_info is a hash object of [sequence_name, stats],
|
22
|
-
# in which stats String object in csv format (separated by ',') containing
|
23
|
-
# sequence tag
|
24
|
-
# G to A mutation numbers at potential a3g positions
|
25
|
-
# total potential a3g G positions
|
26
|
-
# G to A mutation numbers at non a3g positions
|
27
|
-
# total non a3g G positions
|
28
|
-
# a3g G to A mutation rate / non-a3g G to A mutation rate
|
29
|
-
# Fishers Exact P-value
|
30
|
-
#
|
31
|
-
# =USAGE
|
32
|
-
# # example 1
|
33
|
-
# sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence1.fasta')
|
34
|
-
# hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
|
35
|
-
# hypermut[0].keys
|
36
|
-
# => [">Seq7", ">Seq14"]
|
37
|
-
# stats = hypermut[1]
|
38
|
-
# stats.values
|
39
|
-
# => [">Seq7,23,68,1,54,18.26,4.308329383112348e-06", ">Seq14,45,68,9,54,3.97,5.2143571971582974e-08"]
|
40
|
-
#
|
41
|
-
# # example 2
|
42
|
-
# sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence2.fasta')
|
43
|
-
# hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
|
44
|
-
# stats = hypermut[1]
|
45
|
-
# stats.values
|
46
|
-
# => [">CTAACACTCA_134_a3g-sample2,4,35,0,51,Infinity,0.02465676660128911", ">ATAGTGCCCA_60_a3g-sample2,4,35,1,51,5.83,0.1534487353839561"]
|
47
|
-
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05, but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
48
|
-
|
49
|
-
|
50
|
-
# ViralSeq.apobec3gf(sequence)
|
51
|
-
# APOBEC3G/F pattern: GRD -> ARD
|
52
|
-
# control pattern: G[YN|RC] -> A[YN|RC]
|
53
|
-
# input a sequence String object
|
54
|
-
# return two arrays of position numbers:
|
55
|
-
# a3g G positions (a3g)
|
56
|
-
# non-a3g G positions (control)
|
57
|
-
|
58
|
-
|
59
|
-
module ViralSeq

  # Identify APOBEC3G/F hypermutated sequences in a sequence hash.
  #
  # A sequence is reported when Fisher's exact test (two-tailed) on its
  # G-to-A changes at A3G/F positions vs. control positions gives p < 0.05.
  # When the hash holds more than 20 sequences, it is also reported when its
  # number of A3G/F-position changes exceeds a cut-off derived from a Poisson
  # distribution.
  #
  # @param seq_hash [Hash] sequences, {name => sequence_string, ...}
  # @return [Array] `[hypermut_seq_hash, stats_hash]`; the first maps the names
  #   of the hypermutated sequences to their sequences, the second maps the
  #   same names to a csv line
  #   "name,a3g_muts,a3g_sites,control_muts,control_sites,rate_ratio,p_value"
  def self.a3g_hypermut_seq_hash(seq_hash)
    mut_hash = {}  # name => number of G->A changes at A3G/F positions
    hm_hash = {}   # name => stats line, hypermutated sequences only
    out_hash = {}  # name => stats line, every sequence

    # total G->A mutations at apobec3g/f positions over all sequences
    total = 0

    # the sample consensus defines the potential A3G/F and control positions
    ref = ViralSeq.consensus(seq_hash.values)
    mut, control = ViralSeq.apobec3gf(ref)

    seq_hash.each do |name, seq|
      # gaps are excluded from the potential sites; "A" at a site is a mutation
      a = mut.count { |n| seq[n] == "A" }      # muts
      b = mut.count { |n| seq[n] != "-" }      # potential mut sites
      c = control.count { |n| seq[n] == "A" }  # control muts
      d = control.count { |n| seq[n] != "-" }  # potential controls

      mut_hash[name] = a
      total += a

      # ratio of the two mutation rates; Infinity/NaN when a denominator is 0
      rr = (a / b.to_f) / (c / d.to_f)

      fet = Rubystats::FishersExactTest.new
      fisher = fet.calculate(b - a, d - c, a, c)
      perc = fisher[:twotail]

      info = name + "," + [a, b, c, d, rr.round(2), perc].join(",")
      out_hash[name] = info
      hm_hash[name] = info if perc < 0.05
    end

    if seq_hash.size > 20
      rate = total.to_f / seq_hash.size

      count_mut = ViralSeq.count(mut_hash.values)
      maxi_count = count_mut.values.max

      poisson_hash = ViralSeq.poisson_distribution(rate, maxi_count)

      cut_off = 0
      poisson_hash.each do |k, probability|
        expected = seq_hash.size * probability
        # NOTE(review): count_mut presumably has no entry for mutation numbers
        # that no sequence shows; treat those as 0 observations instead of
        # comparing nil -- confirm against ViralSeq.count
        observed = count_mut[k] || 0
        if observed >= 20 * expected
          cut_off = k
          break
        elsif k == maxi_count
          cut_off = maxi_count
        end
      end

      mut_hash.each do |name, muts|
        hm_hash[name] = out_hash[name] if muts > cut_off
      end
    end

    hm_seq_hash = hm_hash.keys.each_with_object({}) do |name, h|
      h[name] = seq_hash[name]
    end
    [hm_seq_hash, hm_hash]
  end

  # APOBEC3G/F mutation position identification.
  # APOBEC3G/F pattern: GRD -> ARD
  # control pattern: G[YN|RC] -> A[YN|RC]
  #
  # Gaps ("-") are removed before scanning, so the returned positions are
  # 0-based indexes into the ungapped sequence. The input string itself is
  # left untouched. G's in the last two positions have no trinucleotide
  # context and are not reported.
  #
  # @param seq [String] nucleotide sequence
  # @return [Array] `[a3g_positions, control_positions]`, two Arrays of Integer
  def self.apobec3gf(seq = "")
    seq = seq.delete("-") # work on a copy, never on the caller's string
    apobec_position = []
    control_position = []
    (0..(seq.size - 3)).each do |n|
      # plain character classes: "[A|G]" would also accept a literal "|"
      if seq[n, 3] =~ /G[AG][AGT]/
        apobec_position << n
      elsif seq[n] == "G"
        control_position << n
      end
    end
    [apobec_position, control_position]
  end

end
|
data/lib/viral_seq/fasta.rb
DELETED
@@ -1,154 +0,0 @@
|
|
1
|
-
# fasta.rb
|
2
|
-
# methods for converting sequence formats, including
|
3
|
-
# ViralSeq::fasta_to_hash
|
4
|
-
# ViralSeq::fastq_to_fasta
|
5
|
-
# ViralSeq::fastq_to_hash
|
6
|
-
# ViralSeq::fasta_hash_to_rsphylip
|
7
|
-
# ViralSeq::pair_fasta_to_hash
|
8
|
-
|
9
|
-
# =USAGE
|
10
|
-
# sequence_fasta_hash = ViralSeq.fasta_to_hash(input_fasta_file)
|
11
|
-
# # input a sequence file in fasta format, read as a sequence hash
|
12
|
-
# # {:sequence_name1 => sequence1, ...}
|
13
|
-
|
14
|
-
# sequence_fasta_hash = ViralSeq.fastq_to_fasta(input_fastq_file)
|
15
|
-
# # input a sequence file in fastq format, read as a sequence hash
|
16
|
-
# # discard sequence quality score
|
17
|
-
|
18
|
-
# sequence_fastq_hash = ViralSeq.fastq_to_hash(input_fastq_file)
|
19
|
-
# # input a sequence file in fastq format, read as a sequence hash
|
20
|
-
# # keep sequence quality score
|
21
|
-
# # {:sequence_name1 => [sequence1, quality1], ...}
|
22
|
-
|
23
|
-
# phylip_hash = ViralSeq.fasta_hash_to_rsphylip(sequence_fasta_hash)
|
24
|
-
# # convert an aligned fasta sequence hash into relaxed sequential phylip format
|
25
|
-
|
26
|
-
# paired_sequence_hash = ViralSeq.pair_fasta_to_hash(directory_of_paired_fasta)
|
27
|
-
# # input a directory containing paired sequence files in the fasta format
|
28
|
-
# # ├───lib1
|
29
|
-
# │ lib1_r1.txt
|
30
|
-
# │ lib1_r2.txt
|
31
|
-
# # paired sequence files need to have "r1" and "r2" in their file names
|
32
|
-
# # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
|
33
|
-
# # return a paired sequence hash :seq_name => [r1_seq, r2_seq]
|
34
|
-
|
35
|
-
module ViralSeq

  # Read a fasta file into a sequence hash.
  # Null characters are stripped, empty lines and lines starting with "="
  # are skipped, and sequence lines are upper-cased and concatenated.
  # @param infile [String] path of the fasta file
  # @return [Hash] {">name" => sequence, ...}; names keep their leading ">"
  def self.fasta_to_hash(infile)
    return_hash = {}
    name = ""
    # block form closes the file even when a line fails to parse
    File.open(infile, "r") do |f|
      f.each_line do |raw|
        line = raw.delete("\u0000")
        next if line == "\n"
        next if line.start_with?("=")
        if line.start_with?(">")
          name = line.chomp
          return_hash[name] = "".dup
        else
          return_hash[name] << line.chomp.upcase
        end
      end
    end
    return_hash
  end

  # fastq file to fasta, discard quality, return a sequence hash
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {">name" => sequence, ...}
  # @raise [ArgumentError] when the last record has a header but no sequence line
  def self.fastq_to_fasta(fastq_file)
    sequences = {}
    each_fastq_record(fastq_file) do |name, seq, _quality|
      sequences[name] = seq
    end
    sequences
  end

  # fastq file to hash, including quality. {:seq_name => [seq,quality]}
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {">name" => [sequence, quality], ...}
  # @raise [ArgumentError] when the last record lacks its sequence or quality line
  def self.fastq_to_hash(fastq_file)
    records = {}
    each_fastq_record(fastq_file) do |name, seq, quality|
      raise ArgumentError, "missing quality line for FASTQ record #{name}" if quality.nil?
      records[name] = [seq, quality]
    end
    records
  end

  # fasta sequence hash to relaxed sequential phylip format
  # @param seqs [Hash] aligned sequences, {">name" => sequence, ...}
  # @return [String] phylip text: a " <count> <length>" header line, then one
  #   line per sequence with the name (without ">"), padding, and the sequence
  #   in space-separated chunks of 10
  def self.fasta_hash_to_rsphylip(seqs)
    alignment_length = seqs.values.first.to_s.size
    # width of the name column: the longest name without its ">", at least 10.
    # The longest name is needed here, not the lexicographically greatest one,
    # otherwise the padding below can become negative.
    longest_name = seqs.keys.map(&:size).max.to_i - 1
    name_block_l = [longest_name, 10].max

    lines = [" #{seqs.size} #{alignment_length}"]
    seqs.each do |k, v|
      lines << k[1..-1] + " " * (name_block_l - k.size + 2) + v.scan(/.{1,10}/).join(" ")
    end
    lines.join("\n") + "\n"
  end

  # input a directory with r1 and r2 sequences, return a hash :seq_name => [r1_seq, r2_seq]
  # r1 and r2 file names should contain "r1" and "r2" respectively
  # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
  # @param indir [String] directory holding the two fasta files
  # @return [Hash] {name_without_last_3_chars => [r1_seq, r2_seq], ...};
  #   r2_seq is nil when the r2 file has no sequence of that name
  def self.pair_fasta_to_hash(indir)
    r1_file = ""
    r2_file = ""
    Dir[indir + "/*"].each do |f|
      case File.basename(f)
      when /r1/i then r1_file = f
      when /r2/i then r2_file = f
      end
    end

    # drop the last 3 characters of every name so r1 and r2 names line up
    r1_seqs, r2_seqs = [r1_file, r2_file].map do |file|
      fasta_to_hash(file).each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }
    end

    r1_seqs.each_with_object({}) do |(seq_name, seq), pairs|
      pairs[seq_name] = [seq, r2_seqs[seq_name]]
    end
  end

  # Stream a fastq file four lines at a time and yield
  # (name, sequence, quality) per record, without loading the whole file.
  # Only the leading "@" of the header is turned into ">"; quality is nil
  # when the file ends before the record's fourth line.
  def self.each_fastq_record(fastq_file)
    File.foreach(fastq_file).each_slice(4) do |header, seq, _separator, quality|
      raise ArgumentError, "truncated FASTQ record #{header.chomp}" if seq.nil?
      yield header.chomp.sub(/\A@/, ">"), seq.chomp, quality && quality.chomp
    end
  end
  private_class_method :each_fastq_record

end
|