parse_fasta 1.9.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
@@ -1,232 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'zlib'
|
20
|
-
|
21
|
-
# Provides simple interface for parsing fasta format files. Gzipped
|
22
|
-
# files are no problem.
|
23
|
-
class FastaFile < File
|
24
|
-
|
25
|
-
# Use it like IO::open
|
26
|
-
#
|
27
|
-
# @param fname [String] the name of the file to open
|
28
|
-
#
|
29
|
-
# @return [FastaFile] a FastaFile
|
30
|
-
def self.open(fname, *args)
|
31
|
-
begin
|
32
|
-
handle = Zlib::GzipReader.open(fname)
|
33
|
-
rescue Zlib::GzipFile::Error => e
|
34
|
-
handle = File.open(fname)
|
35
|
-
end
|
36
|
-
|
37
|
-
unless handle.each_char.peek[0] == '>'
|
38
|
-
raise ParseFasta::DataFormatError
|
39
|
-
end
|
40
|
-
|
41
|
-
handle.close
|
42
|
-
|
43
|
-
super
|
44
|
-
end
|
45
|
-
|
46
|
-
# Returns the records in the fasta file as a hash map with the
|
47
|
-
# headers as keys and the Sequences as values.
|
48
|
-
#
|
49
|
-
# @example Read a fastA into a hash table.
|
50
|
-
# seqs = FastaFile.open('reads.fa').to_hash
|
51
|
-
#
|
52
|
-
# @return [Hash] A hash with headers as keys, sequences as the
|
53
|
-
# values (Sequence objects)
|
54
|
-
#
|
55
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
56
|
-
def to_hash
|
57
|
-
hash = {}
|
58
|
-
self.each_record do |head, seq|
|
59
|
-
hash[head] = seq
|
60
|
-
end
|
61
|
-
|
62
|
-
hash
|
63
|
-
end
|
64
|
-
|
65
|
-
# Analogous to IO#each_line, #each_record is used to go through a
|
66
|
-
# fasta file record by record. It will accept gzipped files as well.
|
67
|
-
#
|
68
|
-
# @param separate_lines [Object] If truthy, separate lines of record
|
69
|
-
# into an array of Sequences, but if falsy, yield a Sequence
|
70
|
-
# object for the sequence instead.
|
71
|
-
#
|
72
|
-
# @example Parsing a fasta file (default behavior, gzip files are fine)
|
73
|
-
# FastaFile.open('reads.fna.gz').each_record do |header, sequence|
|
74
|
-
# puts [header, sequence.gc].join("\t")
|
75
|
-
# end
|
76
|
-
#
|
77
|
-
# @example Parsing a fasta file (with truthy value param)
|
78
|
-
# FastaFile.open('reads.fna').each_record(1) do |header, sequence|
|
79
|
-
# # header => 'sequence_1'
|
80
|
-
# # sequence => ['AACTG', 'AGTCGT', ... ]
|
81
|
-
# end
|
82
|
-
#
|
83
|
-
# @yield The header and sequence for each record in the fasta
|
84
|
-
# file to the block
|
85
|
-
#
|
86
|
-
# @yieldparam header [String] The header of the fasta record without
|
87
|
-
# the leading '>'
|
88
|
-
#
|
89
|
-
# @yieldparam sequence [Sequence, Array<Sequence>] The sequence of the
|
90
|
-
# fasta record. If `separate_lines` is falsy (the default
|
91
|
-
# behavior), will be Sequence, but if truthy will be
|
92
|
-
# Array<Sequence>.
|
93
|
-
#
|
94
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
95
|
-
def each_record(separate_lines=nil)
|
96
|
-
begin
|
97
|
-
f = Zlib::GzipReader.open(self)
|
98
|
-
rescue Zlib::GzipFile::Error => e
|
99
|
-
f = self
|
100
|
-
end
|
101
|
-
|
102
|
-
if separate_lines
|
103
|
-
f.each("\n>") do |line|
|
104
|
-
header, sequence = parse_line_separately(line)
|
105
|
-
yield(header.strip, sequence)
|
106
|
-
end
|
107
|
-
|
108
|
-
# f.each_with_index(">") do |line, idx|
|
109
|
-
# if idx.zero?
|
110
|
-
# if line != ">"
|
111
|
-
# raise ParseFasta::DataFormatError
|
112
|
-
# end
|
113
|
-
# else
|
114
|
-
# header, sequence = parse_line_separately(line)
|
115
|
-
# yield(header.strip, sequence)
|
116
|
-
# end
|
117
|
-
# end
|
118
|
-
else
|
119
|
-
header = ""
|
120
|
-
sequence = ""
|
121
|
-
f.each_line do |line|
|
122
|
-
line.chomp!
|
123
|
-
len = line.length
|
124
|
-
if header.empty? && line.start_with?(">")
|
125
|
-
header = line[1, len]
|
126
|
-
elsif line.start_with?(">")
|
127
|
-
yield(header.strip, Sequence.new(sequence || ""))
|
128
|
-
header = line[1, len]
|
129
|
-
sequence = ""
|
130
|
-
else
|
131
|
-
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
132
|
-
sequence << line
|
133
|
-
end
|
134
|
-
end
|
135
|
-
yield(header, Sequence.new(sequence || ""))
|
136
|
-
|
137
|
-
# f.each("\n>") do |line|
|
138
|
-
# header, sequence = parse_line(line)
|
139
|
-
# yield(header.strip, Sequence.new(sequence || ""))
|
140
|
-
# end
|
141
|
-
|
142
|
-
# f.each_with_index(sep=/^>/) do |line, idx|
|
143
|
-
# if idx.zero?
|
144
|
-
# if line != ">"
|
145
|
-
# raise ParseFasta::DataFormatError
|
146
|
-
# end
|
147
|
-
# else
|
148
|
-
# header, sequence = parse_line(line)
|
149
|
-
# yield(header.strip, Sequence.new(sequence || ""))
|
150
|
-
# end
|
151
|
-
# end
|
152
|
-
end
|
153
|
-
|
154
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
155
|
-
return f
|
156
|
-
end
|
157
|
-
|
158
|
-
# Fast version of #each_record
|
159
|
-
#
|
160
|
-
# Yields the sequence as a String, not Sequence. No separate lines
|
161
|
-
# option.
|
162
|
-
#
|
163
|
-
# @note If the fastA file has spaces in the sequence, they will be
|
164
|
-
# retained. If this is a problem, use #each_record instead.
|
165
|
-
#
|
166
|
-
# @yield The header and sequence for each record in the fasta
|
167
|
-
# file to the block
|
168
|
-
#
|
169
|
-
# @yieldparam header [String] The header of the fasta record without
|
170
|
-
# the leading '>'
|
171
|
-
#
|
172
|
-
# @yieldparam sequence [String] The sequence of the fasta record
|
173
|
-
#
|
174
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
175
|
-
def each_record_fast
|
176
|
-
begin
|
177
|
-
f = Zlib::GzipReader.open(self)
|
178
|
-
rescue Zlib::GzipFile::Error => e
|
179
|
-
f = self
|
180
|
-
end
|
181
|
-
|
182
|
-
header = ""
|
183
|
-
sequence = ""
|
184
|
-
f.each_line do |line|
|
185
|
-
line.chomp!
|
186
|
-
len = line.length
|
187
|
-
if header.empty? && line.start_with?(">")
|
188
|
-
header = line[1, len]
|
189
|
-
elsif line.start_with?(">")
|
190
|
-
yield(header.strip, sequence)
|
191
|
-
header = line[1, len]
|
192
|
-
sequence = ""
|
193
|
-
else
|
194
|
-
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
195
|
-
sequence << line
|
196
|
-
end
|
197
|
-
end
|
198
|
-
yield(header, sequence)
|
199
|
-
|
200
|
-
# f.each("\n>") do |line|
|
201
|
-
# header, sequence = parse_line(line)
|
202
|
-
|
203
|
-
# raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
204
|
-
|
205
|
-
# yield(header.strip, sequence)
|
206
|
-
# end
|
207
|
-
|
208
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
209
|
-
return f
|
210
|
-
end
|
211
|
-
|
212
|
-
private
|
213
|
-
|
214
|
-
def parse_line(line)
|
215
|
-
line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
|
216
|
-
end
|
217
|
-
|
218
|
-
def parse_line_separately(line)
|
219
|
-
header, sequence =
|
220
|
-
line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
|
221
|
-
|
222
|
-
if sequence.nil?
|
223
|
-
sequences = []
|
224
|
-
else
|
225
|
-
sequences = sequence.split("\n")
|
226
|
-
.reject { |s| s.empty? }
|
227
|
-
.map { |s| Sequence.new(s) }
|
228
|
-
end
|
229
|
-
|
230
|
-
[header, sequences]
|
231
|
-
end
|
232
|
-
end
|
@@ -1,160 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'zlib'
|
20
|
-
|
21
|
-
# Provides simple interface for parsing four-line-per-record fastq
|
22
|
-
# format files. Gzipped files are no problem.
|
23
|
-
class FastqFile < File
|
24
|
-
|
25
|
-
# Returns the records in the fastq file as a hash map with the
|
26
|
-
# headers as keys pointing to a hash map like so
|
27
|
-
# { "seq1" => { head: "seq1", seq: "ACTG", desc: "", qual: "II3*"} }
|
28
|
-
#
|
29
|
-
# @example Read a fastQ into a hash table.
|
30
|
-
# seqs = FastqFile.open('reads.fq.gz').to_hash
|
31
|
-
#
|
32
|
-
# @return [Hash] A hash with headers as keys, and a hash map as the
|
33
|
-
# value with keys :head, :seq, :desc, :qual, for header, sequence,
|
34
|
-
# description, and quality.
|
35
|
-
def to_hash
|
36
|
-
hash = {}
|
37
|
-
self.each_record do |head, seq, desc, qual|
|
38
|
-
hash[head] = { head: head, seq: seq, desc: desc, qual: qual }
|
39
|
-
end
|
40
|
-
|
41
|
-
hash
|
42
|
-
end
|
43
|
-
|
44
|
-
# Analogous to IO#each_line, #each_record is used to go through a
|
45
|
-
# fastq file record by record. It will accept gzipped files as well.
|
46
|
-
#
|
47
|
-
# @example Parsing a fastq file
|
48
|
-
# FastqFile.open('reads.fq').each_record do |head, seq, desc, qual|
|
49
|
-
# # do some fun stuff here!
|
50
|
-
# end
|
51
|
-
# @example Use the same syntax for gzipped files!
|
52
|
-
# FastqFile.open('reads.fq.gz').each_record do |head, seq, desc, qual|
|
53
|
-
# # do some fun stuff here!
|
54
|
-
# end
|
55
|
-
#
|
56
|
-
# @yield The header, sequence, description and quality string for
|
57
|
-
# each record in the fastq file to the block
|
58
|
-
# @yieldparam header [String] The header of the fastq record without
|
59
|
-
# the leading '@'
|
60
|
-
# @yieldparam sequence [Sequence] The sequence of the fastq record
|
61
|
-
# @yieldparam description [String] The description line of the fastq
|
62
|
-
# record without the leading '+'
|
63
|
-
# @yieldparam quality_string [Quality] The quality string of the
|
64
|
-
# fastq record
|
65
|
-
def each_record
|
66
|
-
count = 0
|
67
|
-
header = ''
|
68
|
-
sequence = ''
|
69
|
-
description = ''
|
70
|
-
quality = ''
|
71
|
-
|
72
|
-
begin
|
73
|
-
f = Zlib::GzipReader.open(self)
|
74
|
-
rescue Zlib::GzipFile::Error => e
|
75
|
-
f = self
|
76
|
-
end
|
77
|
-
|
78
|
-
f.each_line do |line|
|
79
|
-
line.chomp!
|
80
|
-
|
81
|
-
case count
|
82
|
-
when 0
|
83
|
-
header = line[1..-1]
|
84
|
-
when 1
|
85
|
-
sequence = Sequence.new(line)
|
86
|
-
when 2
|
87
|
-
description = line[1..-1]
|
88
|
-
when 3
|
89
|
-
count = -1
|
90
|
-
quality = Quality.new(line)
|
91
|
-
yield(header, sequence, description, quality)
|
92
|
-
end
|
93
|
-
|
94
|
-
count += 1
|
95
|
-
end
|
96
|
-
|
97
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
98
|
-
return f
|
99
|
-
end
|
100
|
-
|
101
|
-
# Fast version of #each_record
|
102
|
-
#
|
103
|
-
# @note If the fastQ file has spaces in the sequence, they will be
|
104
|
-
# retained. If this is a problem, use #each_record instead.
|
105
|
-
#
|
106
|
-
# @example Parsing a fastq file
|
107
|
-
# FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
|
108
|
-
# # do some fun stuff here!
|
109
|
-
# end
|
110
|
-
# @example Use the same syntax for gzipped files!
|
111
|
-
# FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
|
112
|
-
# # do some fun stuff here!
|
113
|
-
# end
|
114
|
-
#
|
115
|
-
# @yield The header, sequence, description and quality string for
|
116
|
-
# each record in the fastq file to the block
|
117
|
-
#
|
118
|
-
# @yieldparam header [String] The header of the fastq record without
|
119
|
-
# the leading '@'
|
120
|
-
# @yieldparam sequence [String] The sequence of the fastq record
|
121
|
-
# @yieldparam description [String] The description line of the fastq
|
122
|
-
# record without the leading '+'
|
123
|
-
# @yieldparam quality_string [String] The quality string of the
|
124
|
-
# fastq record
|
125
|
-
def each_record_fast
|
126
|
-
count = 0
|
127
|
-
header = ''
|
128
|
-
sequence = ''
|
129
|
-
description = ''
|
130
|
-
quality = ''
|
131
|
-
|
132
|
-
begin
|
133
|
-
f = Zlib::GzipReader.open(self)
|
134
|
-
rescue Zlib::GzipFile::Error => e
|
135
|
-
f = self
|
136
|
-
end
|
137
|
-
|
138
|
-
f.each_line do |line|
|
139
|
-
line.chomp!
|
140
|
-
|
141
|
-
case count
|
142
|
-
when 0
|
143
|
-
header = line[1..-1]
|
144
|
-
when 1
|
145
|
-
sequence = line
|
146
|
-
when 2
|
147
|
-
description = line[1..-1]
|
148
|
-
when 3
|
149
|
-
count = -1
|
150
|
-
quality = line
|
151
|
-
yield(header, sequence, description, quality)
|
152
|
-
end
|
153
|
-
|
154
|
-
count += 1
|
155
|
-
end
|
156
|
-
|
157
|
-
f.close if f.instance_of?(Zlib::GzipReader)
|
158
|
-
return f
|
159
|
-
end
|
160
|
-
end
|
data/lib/parse_fasta/quality.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
# Provide some methods for dealing with common tasks regarding
|
20
|
-
# quality strings.
|
21
|
-
class Quality < String
|
22
|
-
|
23
|
-
# Strips spaces from the str argument before calling super
|
24
|
-
#
|
25
|
-
# @return [Quality] A Quality string
|
26
|
-
#
|
27
|
-
# @example Removes spaces
|
28
|
-
# Quality.new "I I 2 ! " #=> "II2!"
|
29
|
-
def initialize(str)
|
30
|
-
super(str.gsub(/ +/, ""))
|
31
|
-
end
|
32
|
-
|
33
|
-
# Returns the mean quality for the record. This will be a good deal
|
34
|
-
# faster than getting the average with `qual_scores` and reduce.
|
35
|
-
#
|
36
|
-
# @example Get mean quality score for a record
|
37
|
-
# Quality.new("!+5?I").mean_qual #=> 20.0
|
38
|
-
#
|
39
|
-
# @return [Float] Mean quality score for record
|
40
|
-
def mean_qual
|
41
|
-
(self.sum - (self.length * 33)) / self.length.to_f
|
42
|
-
end
|
43
|
-
|
44
|
-
# Returns an array of illumina style quality scores. The quality
|
45
|
-
# scores generated will be Phred+33 (i.e., new Illumina).
|
46
|
-
#
|
47
|
-
# @example Get quality score array of a Quality
|
48
|
-
# Quality.new("!+5?I").qual_scores #=> [0, 10, 20, 30, 40]
|
49
|
-
#
|
50
|
-
# @return [Array<Fixnum>] the quality scores
|
51
|
-
def qual_scores
|
52
|
-
self.each_byte.map { |b| b - 33 }
|
53
|
-
end
|
54
|
-
end
|
data/lib/parse_fasta/sequence.rb
DELETED
@@ -1,174 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
# Provide some methods for dealing with common tasks regarding
|
20
|
-
# nucleotide sequences.
|
21
|
-
class Sequence < String
|
22
|
-
|
23
|
-
# # Error raised if both T and U are present
|
24
|
-
# #
|
25
|
-
# # @note This is NOT checked on every call to Sequence.new
|
26
|
-
# class AmbiguousSequenceError < StandardError
|
27
|
-
# def message
|
28
|
-
# "Sequence is ambiguous -- both T and U present"
|
29
|
-
# end
|
30
|
-
# end
|
31
|
-
|
32
|
-
# Strips spaces from the str argument before calling super
|
33
|
-
#
|
34
|
-
# @return [Sequence] A Sequence string
|
35
|
-
#
|
36
|
-
# @example Removes spaces
|
37
|
-
# Sequence.new "AA CC TT" #=> "AACCTT"
|
38
|
-
#
|
39
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
40
|
-
def initialize(str)
|
41
|
-
if str.match(/>/)
|
42
|
-
raise ParseFasta::SequenceFormatError
|
43
|
-
end
|
44
|
-
|
45
|
-
super(str.gsub(/ +/, ""))
|
46
|
-
end
|
47
|
-
|
48
|
-
# Calculates GC content
|
49
|
-
#
|
50
|
-
# Calculates GC content by dividing the count of G + C by the count
|
51
|
-
# of G + C + T + A + U. If there are both T's and U's in the
|
52
|
-
# Sequence, things will get weird, but then again, that wouldn't
|
53
|
-
# happen, now would it! Ambiguous bases are ignored similar to
|
54
|
-
# BioRuby.
|
55
|
-
#
|
56
|
-
# @example Get GC of a Sequence
|
57
|
-
# Sequence.new('ACTg').gc #=> 0.5
|
58
|
-
# @example Using with FastaFile#each_record
|
59
|
-
# FastaFile.open('reads.fna', 'r').each_record do |header, sequence|
|
60
|
-
# puts [header, sequence.gc].join("\t")
|
61
|
-
# end
|
62
|
-
#
|
63
|
-
# @return [0] if the Sequence is empty or there are no A, C, T, G or U
|
64
|
-
# present
|
65
|
-
# @return [Float] if the GC content is defined for the Sequence
|
66
|
-
def gc
|
67
|
-
s = self.downcase
|
68
|
-
c = s.count('c')
|
69
|
-
g = s.count('g')
|
70
|
-
t = s.count('t')
|
71
|
-
a = s.count('a')
|
72
|
-
u = s.count('u')
|
73
|
-
|
74
|
-
return 0 if c + g + t + a + u == 0
|
75
|
-
return (c + g) / (c + g + t + a + u).to_f
|
76
|
-
end
|
77
|
-
|
78
|
-
# Returns a map of base counts
|
79
|
-
#
|
80
|
-
# This method will check if the sequence is DNA or RNA and return a
|
81
|
-
# count map appropriate for each. If a truthy argument is given, the
|
82
|
-
# count of ambiguous bases will be returned as well.
|
83
|
-
#
|
84
|
-
# If a sequence has both T and U present, will warn the user and
|
85
|
-
# keep going. Will return a map with counts of both, however.
|
86
|
-
#
|
87
|
-
# @example Get base counts of DNA sequence without ambiguous bases
|
88
|
-
# Sequence.new('AcTGn').base_counts
|
89
|
-
# #=> { a: 1, c: 1, t: 1, g: 1 }
|
90
|
-
# @example Get base counts of DNA sequence with ambiguous bases
|
91
|
-
# Sequence.new('AcTGn').base_counts(true)
|
92
|
-
# #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
|
93
|
-
# @example Get base counts of RNA sequence without ambiguous bases
|
94
|
-
# Sequence.new('AcUGn').base_counts
|
95
|
-
# #=> { a: 1, c: 1, u: 1, g: 1 }
|
96
|
-
# @example Get base counts of RNA sequence with ambiguous bases
|
97
|
-
# Sequence.new('AcUGn').base_counts(true)
|
98
|
-
# #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
|
99
|
-
#
|
100
|
-
# @return [Hash] A hash with base as key, count as value
|
101
|
-
def base_counts(count_ambiguous_bases=nil)
|
102
|
-
s = self.downcase
|
103
|
-
t = s.count('t')
|
104
|
-
u = s.count('u')
|
105
|
-
counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
|
106
|
-
|
107
|
-
if t > 0 && u == 0
|
108
|
-
counts[:t] = t
|
109
|
-
elsif t == 0 && u > 0
|
110
|
-
counts[:u] = u
|
111
|
-
elsif t > 0 && u > 0
|
112
|
-
warn('ERROR: A sequence contains both T and U')
|
113
|
-
counts[:t], counts[:u] = t, u
|
114
|
-
end
|
115
|
-
|
116
|
-
counts[:n] = s.count('n') if count_ambiguous_bases
|
117
|
-
|
118
|
-
counts
|
119
|
-
end
|
120
|
-
|
121
|
-
# Returns a map of base frequencies
|
122
|
-
#
|
123
|
-
# Counts bases with the `base_counts` method, then divides each
|
124
|
-
# count by the total bases counted to give frequency for each
|
125
|
-
# base. If a truthy argument is given, ambiguous bases will be
|
126
|
-
# included in the total and their frequency reported. Can discern
|
127
|
-
# between DNA and RNA.
|
128
|
-
#
|
129
|
-
# If default or falsy argument is given, ambiguous bases will not be
|
130
|
-
# counted in the total base count and their frequency will not be
|
131
|
-
# given.
|
132
|
-
#
|
133
|
-
# @example Get base frequencies of DNA sequence without ambiguous bases
|
134
|
-
# Sequence.new('AcTGn').base_frequencies
|
135
|
-
# #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
|
136
|
-
# @example Get base frequencies of DNA sequence with ambiguous bases
|
137
|
-
# Sequence.new('AcTGn').base_frequencies(true)
|
138
|
-
# #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
|
139
|
-
#
|
140
|
-
# @return [Hash] A hash with base as key, frequency as value
|
141
|
-
def base_frequencies(count_ambiguous_bases=nil)
|
142
|
-
base_counts = self.base_counts(count_ambiguous_bases)
|
143
|
-
total_bases = base_counts.values.reduce(:+).to_f
|
144
|
-
base_freqs =
|
145
|
-
base_counts.map { |base, count| [base, count/total_bases] }.flatten
|
146
|
-
Hash[*base_freqs]
|
147
|
-
end
|
148
|
-
|
149
|
-
# Returns a reverse complement of self
|
150
|
-
#
|
151
|
-
# @return [Sequence] a Sequence that is the reverse complement of
|
152
|
-
# self
|
153
|
-
#
|
154
|
-
# @example Handles any IUPAC character and capitalization properly
|
155
|
-
# Sequence.new("gARKbdctymvhu").rev_comp #=> "adbkraghvMYTc"
|
156
|
-
#
|
157
|
-
# @example Leaves non IUPAC characters
|
158
|
-
# Sequence.new("cccc--CCCcccga").rev_comp #=> "tcgggGGG--gggg"
|
159
|
-
#
|
160
|
-
# @note If Sequence contains non-IUPAC characters, these are not
|
161
|
-
# complemented
|
162
|
-
def rev_comp
|
163
|
-
# if self.match(/T/i) && self.match(/U/i)
|
164
|
-
# raise Sequence::AmbiguousSequenceError
|
165
|
-
# end
|
166
|
-
|
167
|
-
# if self.match(/[^ATUGCYRSWKMBDHVN]/i)
|
168
|
-
# warn "WARNING: Sequence contains non IUPAC characters"
|
169
|
-
# end
|
170
|
-
|
171
|
-
self.reverse.tr("ATUGCYRSWKMBDHVNatugcyrswkmbdhvn",
|
172
|
-
"TAACGRYSWMKVHDBNtaacgryswmkvhdbn")
|
173
|
-
end
|
174
|
-
end
|