parse_fasta 1.9.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
@@ -1,232 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'zlib'
|
20
|
-
|
21
|
-
# Provides simple interface for parsing fasta format files. Gzipped
|
22
|
-
# files are no problem.
|
23
|
-
class FastaFile < File
|
24
|
-
|
25
|
-
# Use it like IO::open
|
26
|
-
#
|
27
|
-
# @param fname [String] the name of the file to open
|
28
|
-
#
|
29
|
-
# @return [FastaFile] a FastaFile
|
30
|
-
def self.open(fname, *args)
|
31
|
-
begin
|
32
|
-
handle = Zlib::GzipReader.open(fname)
|
33
|
-
rescue Zlib::GzipFile::Error => e
|
34
|
-
handle = File.open(fname)
|
35
|
-
end
|
36
|
-
|
37
|
-
unless handle.each_char.peek[0] == '>'
|
38
|
-
raise ParseFasta::DataFormatError
|
39
|
-
end
|
40
|
-
|
41
|
-
handle.close
|
42
|
-
|
43
|
-
super
|
44
|
-
end
|
45
|
-
|
46
|
-
# Returns the records in the fasta file as a hash map with the
|
47
|
-
# headers as keys and the Sequences as values.
|
48
|
-
#
|
49
|
-
# @example Read a fastA into a hash table.
|
50
|
-
# seqs = FastaFile.open('reads.fa').to_hash
|
51
|
-
#
|
52
|
-
# @return [Hash] A hash with headers as keys, sequences as the
|
53
|
-
# values (Sequence objects)
|
54
|
-
#
|
55
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
56
|
-
def to_hash
|
57
|
-
hash = {}
|
58
|
-
self.each_record do |head, seq|
|
59
|
-
hash[head] = seq
|
60
|
-
end
|
61
|
-
|
62
|
-
hash
|
63
|
-
end
|
64
|
-
|
65
|
-
# Analogous to IO#each_line, #each_record is used to go through a
|
66
|
-
# fasta file record by record. It will accept gzipped files as well.
|
67
|
-
#
|
68
|
-
# @param separate_lines [Object] If truthy, separate lines of record
|
69
|
-
# into an array of Sequences, but if falsy, yield a Sequence
|
70
|
-
# object for the sequence instead.
|
71
|
-
#
|
72
|
-
# @example Parsing a fasta file (default behavior, gzip files are fine)
|
73
|
-
# FastaFile.open('reads.fna.gz').each_record do |header, sequence|
|
74
|
-
# puts [header, sequence.gc].join("\t")
|
75
|
-
# end
|
76
|
-
#
|
77
|
-
# @example Parsing a fasta file (with truthy value param)
|
78
|
-
# FastaFile.open('reads.fna').each_record(1) do |header, sequence|
|
79
|
-
# # header => 'sequence_1'
|
80
|
-
# # sequence => ['AACTG', 'AGTCGT', ... ]
|
81
|
-
# end
|
82
|
-
#
|
83
|
-
# @yield The header and sequence for each record in the fasta
|
84
|
-
# file to the block
|
85
|
-
#
|
86
|
-
# @yieldparam header [String] The header of the fasta record without
|
87
|
-
# the leading '>'
|
88
|
-
#
|
89
|
-
# @yieldparam sequence [Sequence, Array<Sequence>] The sequence of the
|
90
|
-
# fasta record. If `separate_lines` is falsy (the default
|
91
|
-
# behavior), will be Sequence, but if truthy will be
|
92
|
-
# Array<Sequence>.
|
93
|
-
#
|
94
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
95
|
-
def each_record(separate_lines=nil)
|
96
|
-
begin
|
97
|
-
f = Zlib::GzipReader.open(self)
|
98
|
-
rescue Zlib::GzipFile::Error => e
|
99
|
-
f = self
|
100
|
-
end
|
101
|
-
|
102
|
-
if separate_lines
|
103
|
-
f.each("\n>") do |line|
|
104
|
-
header, sequence = parse_line_separately(line)
|
105
|
-
yield(header.strip, sequence)
|
106
|
-
end
|
107
|
-
|
108
|
-
# f.each_with_index(">") do |line, idx|
|
109
|
-
# if idx.zero?
|
110
|
-
# if line != ">"
|
111
|
-
# raise ParseFasta::DataFormatError
|
112
|
-
# end
|
113
|
-
# else
|
114
|
-
# header, sequence = parse_line_separately(line)
|
115
|
-
# yield(header.strip, sequence)
|
116
|
-
# end
|
117
|
-
# end
|
118
|
-
else
|
119
|
-
header = ""
|
120
|
-
sequence = ""
|
121
|
-
f.each_line do |line|
|
122
|
-
line.chomp!
|
123
|
-
len = line.length
|
124
|
-
if header.empty? && line.start_with?(">")
|
125
|
-
header = line[1, len]
|
126
|
-
elsif line.start_with?(">")
|
127
|
-
yield(header.strip, Sequence.new(sequence || ""))
|
128
|
-
header = line[1, len]
|
129
|
-
sequence = ""
|
130
|
-
else
|
131
|
-
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
132
|
-
sequence << line
|
133
|
-
end
|
134
|
-
end
|
135
|
-
yield(header, Sequence.new(sequence || ""))
|
136
|
-
|
137
|
-
# f.each("\n>") do |line|
|
138
|
-
# header, sequence = parse_line(line)
|
139
|
-
# yield(header.strip, Sequence.new(sequence || ""))
|
140
|
-
# end
|
141
|
-
|
142
|
-
# f.each_with_index(sep=/^>/) do |line, idx|
|
143
|
-
# if idx.zero?
|
144
|
-
# if line != ">"
|
145
|
-
# raise ParseFasta::DataFormatError
|
146
|
-
# end
|
147
|
-
# else
|
148
|
-
# header, sequence = parse_line(line)
|
149
|
-
# yield(header.strip, Sequence.new(sequence || ""))
|
150
|
-
# end
|
151
|
-
# end
|
152
|
-
end
|
153
|
-
|
154
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
155
|
-
return f
|
156
|
-
end
|
157
|
-
|
158
|
-
# Fast version of #each_record
|
159
|
-
#
|
160
|
-
# Yields the sequence as a String, not Sequence. No separate lines
|
161
|
-
# option.
|
162
|
-
#
|
163
|
-
# @note If the fastA file has spaces in the sequence, they will be
|
164
|
-
# retained. If this is a problem, use #each_record instead.
|
165
|
-
#
|
166
|
-
# @yield The header and sequence for each record in the fasta
|
167
|
-
# file to the block
|
168
|
-
#
|
169
|
-
# @yieldparam header [String] The header of the fasta record without
|
170
|
-
# the leading '>'
|
171
|
-
#
|
172
|
-
# @yieldparam sequence [String] The sequence of the fasta record
|
173
|
-
#
|
174
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
175
|
-
def each_record_fast
|
176
|
-
begin
|
177
|
-
f = Zlib::GzipReader.open(self)
|
178
|
-
rescue Zlib::GzipFile::Error => e
|
179
|
-
f = self
|
180
|
-
end
|
181
|
-
|
182
|
-
header = ""
|
183
|
-
sequence = ""
|
184
|
-
f.each_line do |line|
|
185
|
-
line.chomp!
|
186
|
-
len = line.length
|
187
|
-
if header.empty? && line.start_with?(">")
|
188
|
-
header = line[1, len]
|
189
|
-
elsif line.start_with?(">")
|
190
|
-
yield(header.strip, sequence)
|
191
|
-
header = line[1, len]
|
192
|
-
sequence = ""
|
193
|
-
else
|
194
|
-
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
195
|
-
sequence << line
|
196
|
-
end
|
197
|
-
end
|
198
|
-
yield(header, sequence)
|
199
|
-
|
200
|
-
# f.each("\n>") do |line|
|
201
|
-
# header, sequence = parse_line(line)
|
202
|
-
|
203
|
-
# raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
204
|
-
|
205
|
-
# yield(header.strip, sequence)
|
206
|
-
# end
|
207
|
-
|
208
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
209
|
-
return f
|
210
|
-
end
|
211
|
-
|
212
|
-
private
|
213
|
-
|
214
|
-
def parse_line(line)
|
215
|
-
line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
|
216
|
-
end
|
217
|
-
|
218
|
-
def parse_line_separately(line)
|
219
|
-
header, sequence =
|
220
|
-
line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
|
221
|
-
|
222
|
-
if sequence.nil?
|
223
|
-
sequences = []
|
224
|
-
else
|
225
|
-
sequences = sequence.split("\n")
|
226
|
-
.reject { |s| s.empty? }
|
227
|
-
.map { |s| Sequence.new(s) }
|
228
|
-
end
|
229
|
-
|
230
|
-
[header, sequences]
|
231
|
-
end
|
232
|
-
end
|
@@ -1,160 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'zlib'
|
20
|
-
|
21
|
-
# Provides simple interface for parsing four-line-per-record fastq
|
22
|
-
# format files. Gzipped files are no problem.
|
23
|
-
class FastqFile < File
|
24
|
-
|
25
|
-
# Returns the records in the fastq file as a hash map with the
|
26
|
-
# headers as keys pointing to a hash map like so
|
27
|
-
# { "seq1" => { head: "seq1", seq: "ACTG", desc: "", qual: "II3*"} }
|
28
|
-
#
|
29
|
-
# @example Read a fastQ into a hash table.
|
30
|
-
# seqs = FastqFile.open('reads.fq.gz').to_hash
|
31
|
-
#
|
32
|
-
# @return [Hash] A hash with headers as keys, and a hash map as the
|
33
|
-
# value with keys :head, :seq, :desc, :qual, for header, sequence,
|
34
|
-
# description, and quality.
|
35
|
-
def to_hash
|
36
|
-
hash = {}
|
37
|
-
self.each_record do |head, seq, desc, qual|
|
38
|
-
hash[head] = { head: head, seq: seq, desc: desc, qual: qual }
|
39
|
-
end
|
40
|
-
|
41
|
-
hash
|
42
|
-
end
|
43
|
-
|
44
|
-
# Analogous to IO#each_line, #each_record is used to go through a
|
45
|
-
# fastq file record by record. It will accept gzipped files as well.
|
46
|
-
#
|
47
|
-
# @example Parsing a fastq file
|
48
|
-
# FastqFile.open('reads.fq').each_record do |head, seq, desc, qual|
|
49
|
-
# # do some fun stuff here!
|
50
|
-
# end
|
51
|
-
# @example Use the same syntax for gzipped files!
|
52
|
-
# FastqFile.open('reads.fq.gz').each_record do |head, seq, desc, qual|
|
53
|
-
# # do some fun stuff here!
|
54
|
-
# end
|
55
|
-
#
|
56
|
-
# @yield The header, sequence, description and quality string for
|
57
|
-
# each record in the fastq file to the block
|
58
|
-
# @yieldparam header [String] The header of the fastq record without
|
59
|
-
# the leading '@'
|
60
|
-
# @yieldparam sequence [Sequence] The sequence of the fastq record
|
61
|
-
# @yieldparam description [String] The description line of the fastq
|
62
|
-
# record without the leading '+'
|
63
|
-
# @yieldparam quality_string [Quality] The quality string of the
|
64
|
-
# fastq record
|
65
|
-
def each_record
|
66
|
-
count = 0
|
67
|
-
header = ''
|
68
|
-
sequence = ''
|
69
|
-
description = ''
|
70
|
-
quality = ''
|
71
|
-
|
72
|
-
begin
|
73
|
-
f = Zlib::GzipReader.open(self)
|
74
|
-
rescue Zlib::GzipFile::Error => e
|
75
|
-
f = self
|
76
|
-
end
|
77
|
-
|
78
|
-
f.each_line do |line|
|
79
|
-
line.chomp!
|
80
|
-
|
81
|
-
case count
|
82
|
-
when 0
|
83
|
-
header = line[1..-1]
|
84
|
-
when 1
|
85
|
-
sequence = Sequence.new(line)
|
86
|
-
when 2
|
87
|
-
description = line[1..-1]
|
88
|
-
when 3
|
89
|
-
count = -1
|
90
|
-
quality = Quality.new(line)
|
91
|
-
yield(header, sequence, description, quality)
|
92
|
-
end
|
93
|
-
|
94
|
-
count += 1
|
95
|
-
end
|
96
|
-
|
97
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
98
|
-
return f
|
99
|
-
end
|
100
|
-
|
101
|
-
# Fast version of #each_record
|
102
|
-
#
|
103
|
-
# @note If the fastQ file has spaces in the sequence, they will be
|
104
|
-
# retained. If this is a problem, use #each_record instead.
|
105
|
-
#
|
106
|
-
# @example Parsing a fastq file
|
107
|
-
# FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
|
108
|
-
# # do some fun stuff here!
|
109
|
-
# end
|
110
|
-
# @example Use the same syntax for gzipped files!
|
111
|
-
# FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
|
112
|
-
# # do some fun stuff here!
|
113
|
-
# end
|
114
|
-
#
|
115
|
-
# @yield The header, sequence, description and quality string for
|
116
|
-
# each record in the fastq file to the block
|
117
|
-
#
|
118
|
-
# @yieldparam header [String] The header of the fastq record without
|
119
|
-
# the leading '@'
|
120
|
-
# @yieldparam sequence [String] The sequence of the fastq record
|
121
|
-
# @yieldparam description [String] The description line of the fastq
|
122
|
-
# record without the leading '+'
|
123
|
-
# @yieldparam quality_string [String] The quality string of the
|
124
|
-
# fastq record
|
125
|
-
def each_record_fast
|
126
|
-
count = 0
|
127
|
-
header = ''
|
128
|
-
sequence = ''
|
129
|
-
description = ''
|
130
|
-
quality = ''
|
131
|
-
|
132
|
-
begin
|
133
|
-
f = Zlib::GzipReader.open(self)
|
134
|
-
rescue Zlib::GzipFile::Error => e
|
135
|
-
f = self
|
136
|
-
end
|
137
|
-
|
138
|
-
f.each_line do |line|
|
139
|
-
line.chomp!
|
140
|
-
|
141
|
-
case count
|
142
|
-
when 0
|
143
|
-
header = line[1..-1]
|
144
|
-
when 1
|
145
|
-
sequence = line
|
146
|
-
when 2
|
147
|
-
description = line[1..-1]
|
148
|
-
when 3
|
149
|
-
count = -1
|
150
|
-
quality = line
|
151
|
-
yield(header, sequence, description, quality)
|
152
|
-
end
|
153
|
-
|
154
|
-
count += 1
|
155
|
-
end
|
156
|
-
|
157
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
158
|
-
return f
|
159
|
-
end
|
160
|
-
end
|
data/lib/parse_fasta/quality.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
# Provide some methods for dealing with common tasks regarding
|
20
|
-
# quality strings.
|
21
|
-
class Quality < String
|
22
|
-
|
23
|
-
# Strips whitespace from the str argument before calling super
|
24
|
-
#
|
25
|
-
# @return [Quality] A Quality string
|
26
|
-
#
|
27
|
-
# @example Removes whitespace
|
28
|
-
# Quality.new "I I 2 ! " #=> "II2!"
|
29
|
-
def initialize(str)
|
30
|
-
super(str.gsub(/ +/, ""))
|
31
|
-
end
|
32
|
-
|
33
|
-
# Returns the mean quality for the record. This will be a good deal
|
34
|
-
# faster than getting the average with `qual_scores` and reduce.
|
35
|
-
#
|
36
|
-
# @example Get mean quality score for a record
|
37
|
-
# Quality.new("!+5?I").mean_qual #=> 20.0
|
38
|
-
#
|
39
|
-
# @return [Float] Mean quality score for record
|
40
|
-
def mean_qual
|
41
|
-
(self.sum - (self.length * 33)) / self.length.to_f
|
42
|
-
end
|
43
|
-
|
44
|
-
# Returns an array of illumina style quality scores. The quality
|
45
|
-
# scores generated will be Phred+33 (i.e., new Illumina).
|
46
|
-
#
|
47
|
-
# @example Get quality score array of a Quality
|
48
|
-
# Quality.new("!+5?I").qual_scores #=> [0, 10, 20, 30, 40]
|
49
|
-
#
|
50
|
-
# @return [Array<Fixnum>] the quality scores
|
51
|
-
def qual_scores
|
52
|
-
self.each_byte.map { |b| b - 33 }
|
53
|
-
end
|
54
|
-
end
|
data/lib/parse_fasta/sequence.rb
DELETED
@@ -1,174 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
# Provide some methods for dealing with common tasks regarding
|
20
|
-
# nucleotide sequences.
|
21
|
-
class Sequence < String
|
22
|
-
|
23
|
-
# # Error raised if both T and U are present
|
24
|
-
# #
|
25
|
-
# # @note This is NOT checked on every call to Sequence.new
|
26
|
-
# class AmbiguousSequenceError < StandardError
|
27
|
-
# def message
|
28
|
-
# "Sequence is ambiguous -- both T and U present"
|
29
|
-
# end
|
30
|
-
# end
|
31
|
-
|
32
|
-
# Strips whitespace from the str argument before calling super
|
33
|
-
#
|
34
|
-
# @return [Sequence] A Sequence string
|
35
|
-
#
|
36
|
-
# @example Removes whitespace
|
37
|
-
# Sequence.new "AA CC TT" #=> "AACCTT"
|
38
|
-
#
|
39
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
40
|
-
def initialize(str)
|
41
|
-
if str.match(/>/)
|
42
|
-
raise ParseFasta::SequenceFormatError
|
43
|
-
end
|
44
|
-
|
45
|
-
super(str.gsub(/ +/, ""))
|
46
|
-
end
|
47
|
-
|
48
|
-
# Calculates GC content
|
49
|
-
#
|
50
|
-
# Calculates GC content by dividing the count of G + C by the count
|
51
|
-
# of G + C + T + A + U. If there are both T's and U's in the
|
52
|
-
# Sequence, things will get weird, but then again, that wouldn't
|
53
|
-
# happen, now would it! Ambiguous bases are ignored similar to
|
54
|
-
# BioRuby.
|
55
|
-
#
|
56
|
-
# @example Get GC of a Sequence
|
57
|
-
# Sequence.new('ACTg').gc #=> 0.5
|
58
|
-
# @example Using with FastaFile#each_record
|
59
|
-
# FastaFile.open('reads.fna', 'r').each_record do |header, sequence|
|
60
|
-
# puts [header, sequence.gc].join("\t")
|
61
|
-
# end
|
62
|
-
#
|
63
|
-
# @return [0] if the Sequence is empty or there are no A, C, T, G or U
|
64
|
-
# present
|
65
|
-
# @return [Float] if the GC content is defined for the Sequence
|
66
|
-
def gc
|
67
|
-
s = self.downcase
|
68
|
-
c = s.count('c')
|
69
|
-
g = s.count('g')
|
70
|
-
t = s.count('t')
|
71
|
-
a = s.count('a')
|
72
|
-
u = s.count('u')
|
73
|
-
|
74
|
-
return 0 if c + g + t + a + u == 0
|
75
|
-
return (c + g) / (c + g + t + a + u).to_f
|
76
|
-
end
|
77
|
-
|
78
|
-
# Returns a map of base counts
|
79
|
-
#
|
80
|
-
# This method will check if the sequence is DNA or RNA and return a
|
81
|
-
# count map appropriate for each. If a truthy argument is given, the
|
82
|
-
# count of ambiguous bases will be returned as well.
|
83
|
-
#
|
84
|
-
# If a sequence has both T and U present, will warn the user and
|
85
|
-
# keep going. Will return a map with counts of both, however.
|
86
|
-
#
|
87
|
-
# @example Get base counts of DNA sequence without ambiguous bases
|
88
|
-
# Sequence.new('AcTGn').base_counts
|
89
|
-
# #=> { a: 1, c: 1, t: 1, g: 1 }
|
90
|
-
# @example Get base counts of DNA sequence with ambiguous bases
|
91
|
-
# Sequence.new('AcTGn').base_counts(true)
|
92
|
-
# #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
|
93
|
-
# @example Get base counts of RNA sequence without ambiguous bases
|
94
|
-
# Sequence.new('AcUGn').base_counts
|
95
|
-
# #=> { a: 1, c: 1, u: 1, g: 1 }
|
96
|
-
# @example Get base counts of RNA sequence with ambiguous bases
|
97
|
-
# Sequence.new('AcUGn').base_counts(true)
|
98
|
-
# #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
|
99
|
-
#
|
100
|
-
# @return [Hash] A hash with base as key, count as value
|
101
|
-
def base_counts(count_ambiguous_bases=nil)
|
102
|
-
s = self.downcase
|
103
|
-
t = s.count('t')
|
104
|
-
u = s.count('u')
|
105
|
-
counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
|
106
|
-
|
107
|
-
if t > 0 && u == 0
|
108
|
-
counts[:t] = t
|
109
|
-
elsif t == 0 && u > 0
|
110
|
-
counts[:u] = u
|
111
|
-
elsif t > 0 && u > 0
|
112
|
-
warn('ERROR: A sequence contains both T and U')
|
113
|
-
counts[:t], counts[:u] = t, u
|
114
|
-
end
|
115
|
-
|
116
|
-
counts[:n] = s.count('n') if count_ambiguous_bases
|
117
|
-
|
118
|
-
counts
|
119
|
-
end
|
120
|
-
|
121
|
-
# Returns a map of base frequencies
|
122
|
-
#
|
123
|
-
# Counts bases with the `base_counts` method, then divides each
|
124
|
-
# count by the total bases counted to give frequency for each
|
125
|
-
# base. If a truthy argument is given, ambiguous bases will be
|
126
|
-
# included in the total and their frequency reported. Can discern
|
127
|
-
# between DNA and RNA.
|
128
|
-
#
|
129
|
-
# If default or falsy argument is given, ambiguous bases will not be
|
130
|
-
# counted in the total base count and their frequency will not be
|
131
|
-
# given.
|
132
|
-
#
|
133
|
-
# @example Get base frequencies of DNA sequence without ambiguous bases
|
134
|
-
# Sequence.new('AcTGn').base_frequencies
|
135
|
-
# #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
|
136
|
-
# @example Get base frequencies of DNA sequence with ambiguous bases
|
137
|
-
# Sequence.new('AcTGn').base_frequencies(true)
|
138
|
-
# #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
|
139
|
-
#
|
140
|
-
# @return [Hash] A hash with base as key, frequency as value
|
141
|
-
def base_frequencies(count_ambiguous_bases=nil)
|
142
|
-
base_counts = self.base_counts(count_ambiguous_bases)
|
143
|
-
total_bases = base_counts.values.reduce(:+).to_f
|
144
|
-
base_freqs =
|
145
|
-
base_counts.map { |base, count| [base, count/total_bases] }.flatten
|
146
|
-
Hash[*base_freqs]
|
147
|
-
end
|
148
|
-
|
149
|
-
# Returns a reverse complement of self
|
150
|
-
#
|
151
|
-
# @return [Sequence] a Sequence that is the reverse complement of
|
152
|
-
# self
|
153
|
-
#
|
154
|
-
# @example Handles any IUPAC character and capitalization properly
|
155
|
-
# Sequence.new("gARKbdctymvhu").rev_comp #=> "adbkraghvMYTc"
|
156
|
-
#
|
157
|
-
# @example Leaves non IUPAC characters
|
158
|
-
# Sequence.new("cccc--CCCcccga").rev_comp #=> "tcgggGGG--gggg"
|
159
|
-
#
|
160
|
-
# @note If Sequence contains non-IUPAC characters, these are not
|
161
|
-
# complemented
|
162
|
-
def rev_comp
|
163
|
-
# if self.match(/T/i) && self.match(/U/i)
|
164
|
-
# raise Sequence::AmbiguousSequenceError
|
165
|
-
# end
|
166
|
-
|
167
|
-
# if self.match(/[^ATUGCYRSWKMBDHVN]/i)
|
168
|
-
# warn "WARNING: Sequence contains non IUPAC characters"
|
169
|
-
# end
|
170
|
-
|
171
|
-
self.reverse.tr("ATUGCYRSWKMBDHVNatugcyrswkmbdhvn",
|
172
|
-
"TAACGRYSWMKVHDBNtaacgryswmkvhdbn")
|
173
|
-
end
|
174
|
-
end
|