viral_seq 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/lib/viral_seq/Integer.rb +16 -0
- data/lib/viral_seq/constant.rb +7 -0
- data/lib/viral_seq/enumerable.rb +132 -0
- data/lib/viral_seq/hash.rb +45 -0
- data/lib/viral_seq/hivdr.rb +454 -0
- data/lib/viral_seq/math.rb +128 -380
- data/lib/viral_seq/muscle.rb +60 -82
- data/lib/viral_seq/pid.rb +26 -0
- data/lib/viral_seq/ref_seq.rb +35 -0
- data/lib/viral_seq/rubystats.rb +172 -0
- data/lib/viral_seq/seq_hash.rb +1043 -0
- data/lib/viral_seq/seq_hash_pair.rb +219 -0
- data/lib/viral_seq/sequence.rb +571 -348
- data/lib/viral_seq/string.rb +119 -0
- data/lib/viral_seq/version.rb +1 -1
- data/lib/viral_seq.rb +14 -15
- metadata +13 -12
- data/lib/viral_seq/a3g.rb +0 -172
- data/lib/viral_seq/fasta.rb +0 -154
- data/lib/viral_seq/hcv_dr.rb +0 -54
- data/lib/viral_seq/locator.rb +0 -299
- data/lib/viral_seq/misc.rb +0 -103
- data/lib/viral_seq/nt_variation.rb +0 -148
- data/lib/viral_seq/poisson_cutoff.rb +0 -68
- data/lib/viral_seq/refseq.rb +0 -45
- data/lib/viral_seq/sdrm_core.rb +0 -652
- data/lib/viral_seq/tcs_core.rb +0 -556
@@ -0,0 +1,119 @@
|
|
1
|
+
# functions added to Class::String for direct operation on sequence as a String object
|
2
|
+
|
3
|
+
class String

  # Reverse complement of a nucleotide sequence.
  #
  # Upper- and lower-case A/C/G/T and the IUPAC ambiguity codes that have a
  # distinct complement (R<->Y, K<->M, B<->V, D<->H) are complemented.
  # All other characters (the self-complementary codes W, S and N, gaps, ...)
  # are kept unchanged; only their order is reversed.
  # @return [String] reverse complement sequence
  # @example Reverse complement
  #   "ACAGA".rc
  #   => "TCTGT"

  def rc
    reverse.tr("ACGTRYKMBVDHacgtrykmbvdh", "TGCAYRMKVBHDtgcayrmkvbhd")
  end

  # mutate a nt sequence (String class) randomly
  #
  # Each position is independently replaced, with probability `error_rate`,
  # by one of A/C/T/G other than the character currently at that position.
  # @param error_rate [Float] define an error rate for mutation, default to `0.01`
  # @return [String] mutated sequence as String (same length as the receiver)
  # @example mutate a sequence at an error rate of 0.05
  #   seq = "TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTG"
  #   seq.mutation(0.05)
  #   => "TGGAAGGGCTAATGCACTCCCAACGAAGACACGATATCCTTGATCTGTGGATCTACGACACACAAGGCTGCTTCCCTG"

  def mutation(error_rate = 0.01)
    bases = %w[A C T G]
    each_char.map do |nt|
      # Kernel#rand without arguments returns a Float in [0, 1), so this is
      # true with probability error_rate (no rounding to 1/10000 steps).
      rand < error_rate ? (bases - [nt]).sample : nt
    end.join
  end

  # parse the nucleotide sequences as a String object
  # and return a Regexp object for possible matches
  #
  # Plain bases are copied as they are, ambiguity codes become a character
  # class of the bases returned by {#to_list}. Characters {#to_list} does not
  # know (gaps, for instance) are matched literally instead of producing an
  # invalid, empty character class.
  # @return [Regexp] as possible matches
  # @example parse a sequence with ambiguities
  #   "ATRWCG".nt_parser
  #   => /AT[A|G][A|T]CG/

  def nt_parser
    pattern = each_char.map do |base|
      bases = base.to_list
      case bases.size
      when 0 then Regexp.escape(base)
      when 1 then bases[0]
      else
        # NOTE: "|" is a literal inside a character class, so the class also
        # accepts a "|" character. The separator is kept so that the source of
        # the returned Regexp stays the same as in the documented example.
        "[" + bases.join("|") + "]"
      end
    end.join
    Regexp.new(pattern)
  end

  # parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
  #
  # The code is looked up case-insensitively. A plain base (A, T, C or G) is
  # returned as the receiver itself; anything that is not a single base or
  # ambiguity code gives an empty Array.
  # @return [Array] parsed nt bases
  # @example parse IUPAC `R`
  #   'R'.to_list
  #   => ["A", "G"]

  def to_list
    case upcase
    when /\A[ATCG]\z/ then [self]
    when "W" then %w[A T]
    when "S" then %w[C G]
    when "M" then %w[A C]
    when "K" then %w[G T] # keto bases are G and T
    when "R" then %w[A G]
    when "Y" then %w[C T]
    when "B" then %w[C G T]
    when "D" then %w[A G T]
    when "H" then %w[A C T]
    when "V" then %w[A C G]
    when "N" then %w[A T C G]
    else []
    end
  end

  # compare two sequences as String objects, two sequence strings need to aligned first
  #
  # Positions are compared one by one over the length of the receiver; a
  # position missing from a shorter `seq2` counts as a difference, extra
  # positions of a longer `seq2` are ignored.
  # @param seq2 [String] the sequence string to compare with
  # @return [Integer] the total number of differences as integer
  # @example compare two sequence strings, without alignment and with alignment
  #   seq1 = 'AAGGCGTAGGAC'
  #   seq2 = 'AAGCTTAGGACG'
  #   seq1.compare_with(seq2) # no alignment
  #   => 8
  #   aligned_seqs = ViralSeq::Muscle.align(seq1,seq2) # align using MUSCLE
  #   aligned_seqs[0].compare_with(aligned_seqs[1])
  #   => 4

  def compare_with(seq2)
    each_char.with_index.count { |nt, position| nt != seq2[position] }
  end
end
|
data/lib/viral_seq/version.rb
CHANGED
data/lib/viral_seq.rb
CHANGED
@@ -18,24 +18,23 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
# viral_seq main
|
22
21
|
module ViralSeq; end
|
23
22
|
|
24
|
-
# load all
|
25
|
-
|
26
|
-
require "viral_seq/
|
27
|
-
require "viral_seq/
|
23
|
+
# load all classes
|
24
|
+
require "viral_seq/constant"
|
25
|
+
require "viral_seq/enumerable"
|
26
|
+
require "viral_seq/hash"
|
27
|
+
require "viral_seq/hivdr"
|
28
|
+
require "viral_seq/integer"
|
28
29
|
require "viral_seq/math"
|
29
|
-
require "viral_seq/fasta"
|
30
|
-
require "viral_seq/misc"
|
31
|
-
require "viral_seq/refseq"
|
32
|
-
require "viral_seq/locator"
|
33
30
|
require "viral_seq/muscle"
|
34
|
-
require "viral_seq/
|
35
|
-
require "viral_seq/
|
36
|
-
require "viral_seq/
|
37
|
-
require "viral_seq/
|
38
|
-
require "viral_seq/
|
39
|
-
require "viral_seq/
|
31
|
+
require "viral_seq/pid"
|
32
|
+
require "viral_seq/ref_seq"
|
33
|
+
require "viral_seq/rubystats"
|
34
|
+
require "viral_seq/seq_hash"
|
35
|
+
require "viral_seq/seq_hash_pair"
|
36
|
+
require "viral_seq/sequence"
|
37
|
+
require "viral_seq/string"
|
38
|
+
require "viral_seq/version"
|
40
39
|
|
41
40
|
require "muscle_bio"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -89,19 +89,20 @@ files:
|
|
89
89
|
- bin/console
|
90
90
|
- bin/setup
|
91
91
|
- lib/viral_seq.rb
|
92
|
-
- lib/viral_seq/
|
93
|
-
- lib/viral_seq/
|
94
|
-
- lib/viral_seq/
|
95
|
-
- lib/viral_seq/
|
92
|
+
- lib/viral_seq/Integer.rb
|
93
|
+
- lib/viral_seq/constant.rb
|
94
|
+
- lib/viral_seq/enumerable.rb
|
95
|
+
- lib/viral_seq/hash.rb
|
96
|
+
- lib/viral_seq/hivdr.rb
|
96
97
|
- lib/viral_seq/math.rb
|
97
|
-
- lib/viral_seq/misc.rb
|
98
98
|
- lib/viral_seq/muscle.rb
|
99
|
-
- lib/viral_seq/
|
100
|
-
- lib/viral_seq/
|
101
|
-
- lib/viral_seq/
|
102
|
-
- lib/viral_seq/
|
99
|
+
- lib/viral_seq/pid.rb
|
100
|
+
- lib/viral_seq/ref_seq.rb
|
101
|
+
- lib/viral_seq/rubystats.rb
|
102
|
+
- lib/viral_seq/seq_hash.rb
|
103
|
+
- lib/viral_seq/seq_hash_pair.rb
|
103
104
|
- lib/viral_seq/sequence.rb
|
104
|
-
- lib/viral_seq/
|
105
|
+
- lib/viral_seq/string.rb
|
105
106
|
- lib/viral_seq/version.rb
|
106
107
|
- viral_seq.gemspec
|
107
108
|
homepage: https://github.com/ViralSeq/viral_seq
|
data/lib/viral_seq/a3g.rb
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
# viral_seq/a3g
|
2
|
-
# APOBEC3g/f hypermutation function including
|
3
|
-
# ViralSeq::a3g_hypermut_seq_hash
|
4
|
-
# ViralSeq::apobec3gf
|
5
|
-
|
6
|
-
# APOBEC3g/f G to A hypermutation
|
7
|
-
# APOBEC3G/F pattern: GRD -> ARD
|
8
|
-
# control pattern: G[YN|RC] -> A[YN|RC]
|
9
|
-
# use the sample consensus to determine potential a3g sites
|
10
|
-
|
11
|
-
# Two criteria to identify hypermutation
|
12
|
-
# 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
|
13
|
-
# 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
|
14
|
-
# note: criteria 2 only applies on a sequence file containing more than 20 sequences
|
15
|
-
# b/c Poisson model does not do well on small sample size.
|
16
|
-
|
17
|
-
# ViralSeq.a3g_hypermut_seq_hash(sequence_hash)
|
18
|
-
# sequence_hash is a Hash object for sequences. {:name => :sequence, ...}
|
19
|
-
# return array [hypermutation_hash, statistic_info]
|
20
|
-
# hypermutation_hash is a Hash object for sequences
|
21
|
-
# statistic_info is a hash object of [sequence_name, stats],
|
22
|
-
# in which stats String object in csv format (separated by ',') containing
|
23
|
-
# sequence tag
|
24
|
-
# G to A mutation numbers at potential a3g positions
|
25
|
-
# total potential a3g G positions
|
26
|
-
# G to A mutation numbers at non a3g positions
|
27
|
-
# total non a3g G positions
|
28
|
-
# a3g G to A mutation rate / non-a3g G to A mutation rate
|
29
|
-
# Fishers Exact P-value
|
30
|
-
#
|
31
|
-
# =USAGE
|
32
|
-
# # example 1
|
33
|
-
# sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence1.fasta')
|
34
|
-
# hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
|
35
|
-
# hypermut[0].keys
|
36
|
-
# => [">Seq7", ">Seq14"]
|
37
|
-
# stats = hypermut[1]
|
38
|
-
# stats.values
|
39
|
-
# => [">Seq7,23,68,1,54,18.26,4.308329383112348e-06", ">Seq14,45,68,9,54,3.97,5.2143571971582974e-08"]
|
40
|
-
#
|
41
|
-
# # example 2
|
42
|
-
# sequences = ViralSeq.fasta_to_hash('spec/sample_files/sample_a3g_sequence2.fasta')
|
43
|
-
# hypermut = ViralSeq.a3g_hypermut_seq_hash(sequences)
|
44
|
-
# stats = hypermut[1]
|
45
|
-
# stats.values
|
46
|
-
# => [">CTAACACTCA_134_a3g-sample2,4,35,0,51,Infinity,0.02465676660128911", ">ATAGTGCCCA_60_a3g-sample2,4,35,1,51,5.83,0.1534487353839561"]
|
47
|
-
# # notice sequence ">ATAGTGCCCA_60_a3g-sample2" has a p value at 0.15, greater than 0.05, but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
|
48
|
-
|
49
|
-
|
50
|
-
# ViralSeq.apobec3gf(sequence)
|
51
|
-
# APOBEC3G/F pattern: GRD -> ARD
|
52
|
-
# control pattern: G[YN|RC] -> A[YN|RC]
|
53
|
-
# input a sequence String object
|
54
|
-
# return two arrays of position numbers of
|
55
|
-
# a3g G positions (a3g)
|
56
|
-
# non-a3g G positions (control)
|
57
|
-
|
58
|
-
|
59
|
-
module ViralSeq

  # Find APOBEC3G/F hypermutated sequences in a sequence hash.
  #
  # A consensus of all sequences is used to locate potential a3g positions
  # and control positions (see ViralSeq.apobec3gf). For every sequence the
  # G to A changes ("A" found at such a position) are counted, gaps ("-")
  # being skipped. A sequence is reported when
  # 1. the two-tailed Fisher's exact test p-value is below 0.05, or
  # 2. (only for more than 20 sequences) its number of a3g mutations is above
  #    the cut-off derived from the Poisson distribution.
  #
  # @param seq_hash [Hash] sequences, {sequence_name => sequence_string}
  # @return [Array] [hypermutation_hash, statistic_info]; the first Hash holds
  #   the hypermutated sequences, the second one the csv stats line of each of
  #   them: name, a3g mutations, a3g sites, control mutations, control sites,
  #   rate ratio (rounded to 2 digits), Fisher's exact p-value
  def ViralSeq.a3g_hypermut_seq_hash(seq_hash)
    mut_hash = {} # sequence name => number of G->A changes at a3g positions
    hm_hash = {}  # sequence name => stats line, hypermutated sequences only
    out_hash = {} # sequence name => stats line, all sequences

    # total G->A mutations at apobec3g/f positions.
    total = 0

    # make consensus sequence for the input sequence hash
    ref = ViralSeq.consensus(seq_hash.values)

    # obtain apobec3g positions and control positions
    # NOTE(review): apobec3gf removes gaps from the consensus before indexing,
    # so these positions only line up with the sequences when the consensus
    # has no "-" -- confirm against ViralSeq.consensus.
    mut, control = ViralSeq.apobec3gf(ref)

    seq_hash.each do |name, seq|
      a, b = g_to_a_counts(seq, mut)     # muts, potential mut sites
      c, d = g_to_a_counts(seq, control) # control muts, potential controls
      mut_hash[name] = a
      total += a

      # Float division: yields Infinity or NaN instead of raising when a
      # sequence has no control mutation or no usable site.
      rr = (a / b.to_f) / (c / d.to_f)

      fet = Rubystats::FishersExactTest.new
      fisher = fet.calculate(b - a, d - c, a, c)
      perc = fisher[:twotail]
      info = [name, a, b, c, d, rr.round(2), perc].join(",")
      out_hash[name] = info
      hm_hash[name] = info if perc < 0.05
    end

    if seq_hash.size > 20
      rate = total.to_f / seq_hash.size

      count_mut = ViralSeq.count(mut_hash.values)
      # NOTE(review): this is the largest value of count_mut, not its largest
      # key -- presumably what poisson_distribution expects; verify there.
      maxi_count = count_mut.values.max

      poisson_hash = ViralSeq.poisson_distribution(rate, maxi_count)

      cut_off = 0
      poisson_hash.each do |n_mut, probability|
        cal = seq_hash.size * probability
        obs = count_mut[n_mut]
        if obs >= 20 * cal
          cut_off = n_mut
          break
        elsif n_mut == maxi_count
          cut_off = maxi_count
        end
      end

      mut_hash.each do |name, n_mut|
        hm_hash[name] = out_hash[name] if n_mut > cut_off
      end
    end

    hm_seq_hash = hm_hash.keys.each_with_object({}) do |name, picked|
      picked[name] = seq_hash[name]
    end
    [hm_seq_hash, hm_hash]
  end

  # APOBEC3G/F mutation position identification
  # APOBEC3G/F pattern: GRD -> ARD
  #
  # Gaps ("-") are removed from a copy of the sequence first; the caller's
  # String is not modified. Every "G" followed by A/G and then A/G/T is an
  # a3g position, every other "G" is a control position. The last two
  # positions of the sequence are not examined because they have no complete
  # three-base context.
  #
  # @param seq [String] nucleotide sequence
  # @return [Array] [a3g_positions, control_positions], 0-based positions in
  #   the gap-free sequence
  def self.apobec3gf(seq = "")
    seq = seq.delete("-")
    apobec_position = []
    control_position = []
    (0..(seq.size - 3)).each do |n|
      if seq[n, 3] =~ /G[AG][AGT]/
        apobec_position << n
      elsif seq[n] == "G"
        control_position << n
      end
    end
    [apobec_position, control_position]
  end

  # Count, over the given positions, how many are "A" in the sequence and how
  # many are usable at all; positions holding a gap ("-") are not counted.
  # Returns [mutations, sites].
  def self.g_to_a_counts(seq, positions)
    sites = positions.reject { |n| seq[n] == "-" }
    [sites.count { |n| seq[n] == "A" }, sites.size]
  end
  private_class_method :g_to_a_counts

end
|
data/lib/viral_seq/fasta.rb
DELETED
@@ -1,154 +0,0 @@
|
|
1
|
-
# fasta.rb
|
2
|
-
# methods for converting sequence formats, including
|
3
|
-
# ViralSeq::fasta_to_hash
|
4
|
-
# ViralSeq::fastq_to_fasta
|
5
|
-
# ViralSeq::fastq_to_hash
|
6
|
-
# ViralSeq::fasta_hash_to_rsphylip
|
7
|
-
# ViralSeq::pair_fasta_to_hash
|
8
|
-
|
9
|
-
# =USAGE
|
10
|
-
# sequence_fasta_hash = ViralSeq.fasta_to_hash(input_fasta_file)
|
11
|
-
# # input a sequence file in fasta format, read as a sequence hash
|
12
|
-
# # {:sequence_name1 => sequence1, ...}
|
13
|
-
|
14
|
-
# sequence_fasta_hash = ViralSeq.fastq_to_fasta(input_fastq_file)
|
15
|
-
# # input a sequence file in fastq format, read as a sequence hash
|
16
|
-
# # discard sequence quality score
|
17
|
-
|
18
|
-
# sequence_fastq_hash = ViralSeq.fastq_to_hash(input_fastq_file)
|
19
|
-
# # input a sequence file in fastq format, read as a sequence hash
|
20
|
-
# # keep sequence quality score
|
21
|
-
# # {:sequence_name1 => [sequence1, quality1], ...}
|
22
|
-
|
23
|
-
# phylip_hash = ViralSeq.fasta_hash_to_rsphylip(sequence_fasta_hash)
|
24
|
-
# # convert an aligned fasta sequence hash into relaxed sequential phylip format
|
25
|
-
|
26
|
-
# paired_sequence_hash = ViralSeq.pair_fasta_to_hash(directory_of_paired_fasta)
|
27
|
-
# # input a directory containing paired sequence files in the fasta format
|
28
|
-
# # ├───lib1
|
29
|
-
# │ lib1_r1.txt
|
30
|
-
# │ lib1_r2.txt
|
31
|
-
# # paired sequence files need to have "r1" and "r2" in their file names
|
32
|
-
# # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
|
33
|
-
# # return a paired sequence hash :seq_name => [r1_seq, r2_seq]
|
34
|
-
|
35
|
-
module ViralSeq

  # Read a fasta file into a sequence hash.
  #
  # NUL characters are removed, empty lines and lines starting with "=" are
  # skipped. A line starting with ">" opens a new record and is used (with
  # the ">") as its name; all other lines are upcased and appended to the
  # current record. The file is closed even when reading fails.
  #
  # @param infile [String] path of the fasta file
  # @return [Hash] {sequence_name => sequence}
  def self.fasta_to_hash(infile)
    return_hash = {}
    name = ""
    File.foreach(infile) do |raw|
      line = raw.delete("\u0000")
      next if line == "\n" || line.start_with?("=")
      if line.start_with?(">")
        name = line.chomp
        return_hash[name] = "".dup
      else
        return_hash[name] << line.chomp.upcase
      end
    end
    return_hash
  end

  # fastq file to fasta, discard quality, return a sequence hash
  #
  # The file is read as records of four lines. Only the leading "@" of the
  # header line is turned into ">", other "@" characters in the name are kept.
  #
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {sequence_name => sequence}
  # @raise [ArgumentError] when the last record has no sequence line
  def self.fastq_to_fasta(fastq_file)
    sequences = {}
    each_fastq_record(fastq_file) do |name, seq, _quality|
      raise ArgumentError, "incomplete fastq record: #{name}" if seq.nil?
      sequences[name] = seq
    end
    sequences
  end

  # fastq file to hash, including quality. {:seq_name => [seq,quality]}
  #
  # The file is read as records of four lines; the fourth line of a record is
  # its quality string and is kept unchanged.
  #
  # @param fastq_file [String] path of the fastq file
  # @return [Hash] {sequence_name => [sequence, quality]}
  # @raise [ArgumentError] when the last record is incomplete
  def self.fastq_to_hash(fastq_file)
    records = {}
    each_fastq_record(fastq_file) do |name, seq, quality|
      raise ArgumentError, "incomplete fastq record: #{name}" if seq.nil? || quality.nil?
      records[name] = [seq, quality]
    end
    records
  end

  # fasta sequence hash to relaxed sequential phylip format
  #
  # The first line holds the number of sequences and the length of the first
  # sequence. Every following line is a name (without its leading character,
  # the ">"), padded with spaces to a common column, then the sequence in
  # groups of 10 characters. The name column is as wide as the longest name,
  # at least 10 characters.
  #
  # @param seqs [Hash] aligned sequences, {">name" => sequence}
  # @return [String] alignment in relaxed sequential phylip format
  def self.fasta_hash_to_rsphylip(seqs)
    outline = " #{seqs.size} #{seqs.values[0].size}\n"
    # length of the longest name, not of the alphabetically last one
    max_name_l = seqs.keys.map(&:size).max - 1
    name_block_l = [max_name_l, 10].max
    seqs.each do |name, seq|
      outline << name[1..-1] << " " * (name_block_l - name.size + 2) << seq.scan(/.{1,10}/).join(" ") << "\n"
    end
    outline
  end

  # input a directory with r1 and r2 sequences, return a hash :seq_name => [r1_seq, r2_seq]
  # r1 and r2 file names should contain "r1" and "r2" respectively
  # the sequence taxa should only differ by last 3 characters to distinguish r1 and r2 sequence.
  #
  # When several files match, the last one listed wins. Sequences are paired
  # by their names without the last three characters; an r1 sequence without
  # a partner is paired with nil.
  #
  # @param indir [String] directory containing the paired fasta files
  # @return [Hash] {sequence_name => [r1_sequence, r2_sequence]}
  def self.pair_fasta_to_hash(indir)
    r1_file = ""
    r2_file = ""
    Dir[indir + "/*"].each do |f|
      if File.basename(f) =~ /r1/i
        r1_file = f
      elsif File.basename(f) =~ /r2/i
        r2_file = f
      end
    end

    seq1 = ViralSeq.fasta_to_hash(r1_file)
    seq2 = ViralSeq.fasta_to_hash(r2_file)

    # drop the last 3 characters of every name so r1 and r2 names agree
    new_seq1 = seq1.each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }
    new_seq2 = seq2.each_with_object({}) { |(k, v), h| h[k[0..-4]] = v }

    new_seq1.each_with_object({}) do |(seq_name, seq), pairs|
      pairs[seq_name] = [seq, new_seq2[seq_name]]
    end
  end

  # Yield name, sequence and quality of every four-line fastq record in the
  # file. The leading "@" of the name is replaced by ">"; sequence and quality
  # are nil when the last record is cut short.
  def self.each_fastq_record(fastq_file)
    File.foreach(fastq_file).each_slice(4) do |header, seq, _separator, quality|
      yield header.chomp.sub(/\A@/, ">"), seq && seq.chomp, quality && quality.chomp
    end
  end
  private_class_method :each_fastq_record
end
|