parse_fasta 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/README.md +22 -1
- data/lib/parse_fasta/fasta_file.rb +28 -2
- data/lib/parse_fasta/seq_file.rb +6 -0
- data/lib/parse_fasta/sequence.rb +6 -0
- data/lib/parse_fasta/version.rb +1 -1
- data/lib/parse_fasta.rb +7 -0
- data/spec/lib/fasta_file_spec.rb +18 -0
- data/spec/lib/seq_file_spec.rb +18 -0
- data/spec/lib/sequence_spec.rb +8 -0
- data/spec/spec_helper.rb +4 -1
- data/test_files/bad.fa +5 -0
- data/test_files/test.fa +3 -0
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq.gz +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YzlkNTQ5NGQ5YTFlNzVkOTJjOTJkMTM2YmUwN2FlMjhmOTg2ZDZlMQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OThhYTU5NTAzYzlkMTg2N2IxOWNjYTExMWEyODRiY2Q2OGFhMzQ4MQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NzMyYTNmZmQ0YThlMThkZmE3ZjZhZjAzNDM2MGQ4ZTcwODhkODY3NzI2NzU1
|
10
|
+
NDQzYmU1ZDBiZjljYzVhZmNlMDIzZDMxMDc4Zjk3N2E1YTAxOTUzZTIyOGNj
|
11
|
+
NzdjOWJiODA2ZDA0NGNmMjFkOGI1ZjgxZWY3NTRmMTQ1MDc5MTU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ODU2NzMwZTk3ZmE0ZTIxYzMwOWVkMWUyY2U4MTE3YzAzMzI5MzU1ZDAzNWE3
|
14
|
+
OGQ3ODk2ZjQwYTNjNTJlZTVjYzg3MGU5YzliZjAyYjQ4ZDNmNjRlNzE2YmJk
|
15
|
+
MmY2OTRhYjI3NTM3ODFmYWYwNDk2ZjQ0YzI3YjIxMzI3MGU3MmE=
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -27,7 +27,7 @@ lightweight than BioRuby. And more fun! ;)
|
|
27
27
|
## Documentation ##
|
28
28
|
|
29
29
|
Checkout
|
30
|
-
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta
|
30
|
+
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta)
|
31
31
|
for the full api documentation.
|
32
32
|
|
33
33
|
## Usage ##
|
@@ -73,6 +73,27 @@ Add `Sequence#rev_comp`. It can handle IUPAC characters. Since
|
|
73
73
|
an amino acid string, things will get weird as it will complement the
|
74
74
|
IUPAC characters in the AA string and leave others.
|
75
75
|
|
76
|
+
#### 1.8.1 ####
|
77
|
+
|
78
|
+
An error will be raised if a fasta file has a `>` in the
|
79
|
+
sequence. Sometimes files are not terminated with a newline
|
80
|
+
character. If this is the case, then catting two fasta files will
|
81
|
+
smush the first header of the second file right in with the last
|
82
|
+
sequence of the first file. This is bad, raise an error! ;)
|
83
|
+
|
84
|
+
Example
|
85
|
+
|
86
|
+
>seq1
|
87
|
+
ACTG>seq2
|
88
|
+
ACTG
|
89
|
+
>seq3
|
90
|
+
ACTG
|
91
|
+
|
92
|
+
This will raise `ParseFasta::SequenceFormatError`.
|
93
|
+
|
94
|
+
Also, headers with lots of `>` within are fine now.
|
95
|
+
|
96
|
+
|
76
97
|
### 1.7 ###
|
77
98
|
|
78
99
|
Add `SeqFile#to_hash`, `FastaFile#to_hash` and `FastqFile#to_hash`.
|
@@ -51,6 +51,8 @@ class FastaFile < File
|
|
51
51
|
#
|
52
52
|
# @return [Hash] A hash with headers as keys, sequences as the
|
53
53
|
# values (Sequence objects)
|
54
|
+
#
|
55
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
54
56
|
def to_hash
|
55
57
|
hash = {}
|
56
58
|
self.each_record do |head, seq|
|
@@ -88,6 +90,8 @@ class FastaFile < File
|
|
88
90
|
# fasta record. If `separate_lines` is falsy (the default
|
89
91
|
# behavior), will be Sequence, but if truthy will be
|
90
92
|
# Array<String>.
|
93
|
+
#
|
94
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
91
95
|
def each_record(separate_lines=nil)
|
92
96
|
begin
|
93
97
|
f = Zlib::GzipReader.open(self)
|
@@ -100,11 +104,33 @@ class FastaFile < File
|
|
100
104
|
header, sequence = parse_line_separately(line)
|
101
105
|
yield(header.strip, sequence)
|
102
106
|
end
|
107
|
+
|
108
|
+
# f.each_with_index(">") do |line, idx|
|
109
|
+
# if idx.zero?
|
110
|
+
# if line != ">"
|
111
|
+
# raise ParseFasta::DataFormatError
|
112
|
+
# end
|
113
|
+
# else
|
114
|
+
# header, sequence = parse_line_separately(line)
|
115
|
+
# yield(header.strip, sequence)
|
116
|
+
# end
|
117
|
+
# end
|
103
118
|
else
|
104
119
|
f.each("\n>") do |line|
|
105
120
|
header, sequence = parse_line(line)
|
106
121
|
yield(header.strip, Sequence.new(sequence || ""))
|
107
122
|
end
|
123
|
+
|
124
|
+
# f.each_with_index(sep=/^>/) do |line, idx|
|
125
|
+
# if idx.zero?
|
126
|
+
# if line != ">"
|
127
|
+
# raise ParseFasta::DataFormatError
|
128
|
+
# end
|
129
|
+
# else
|
130
|
+
# header, sequence = parse_line(line)
|
131
|
+
# yield(header.strip, Sequence.new(sequence || ""))
|
132
|
+
# end
|
133
|
+
# end
|
108
134
|
end
|
109
135
|
|
110
136
|
f.close if f.instance_of?(Zlib::GzipReader)
|
@@ -114,12 +140,12 @@ class FastaFile < File
|
|
114
140
|
private
|
115
141
|
|
116
142
|
def parse_line(line)
|
117
|
-
line.split("\n", 2).map { |s| s.gsub(/\n
|
143
|
+
line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
|
118
144
|
end
|
119
145
|
|
120
146
|
def parse_line_separately(line)
|
121
147
|
header, sequence =
|
122
|
-
line.split("\n", 2).map { |s| s.gsub(
|
148
|
+
line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
|
123
149
|
|
124
150
|
if sequence.nil?
|
125
151
|
sequences = []
|
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -29,6 +29,9 @@ class SeqFile < File
|
|
29
29
|
#
|
30
30
|
# @return [Hash] A hash with headers as keys, sequences as the
|
31
31
|
# values (Sequence objects)
|
32
|
+
#
|
33
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
34
|
+
# and file is a fastA file
|
32
35
|
def to_hash
|
33
36
|
first_char = get_first_char(self)
|
34
37
|
|
@@ -73,6 +76,9 @@ class SeqFile < File
|
|
73
76
|
# leading '>' or '@'
|
74
77
|
#
|
75
78
|
# @yieldparam sequence [Sequence] The sequence of the record.
|
79
|
+
#
|
80
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
81
|
+
# and file is a fastA file
|
76
82
|
def each_record
|
77
83
|
first_char = get_first_char(self)
|
78
84
|
|
data/lib/parse_fasta/sequence.rb
CHANGED
@@ -35,7 +35,13 @@ class Sequence < String
|
|
35
35
|
#
|
36
36
|
# @example Removes whitespace
|
37
37
|
# Sequence.new "AA CC TT" #=> "AACCTT"
|
38
|
+
#
|
39
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
38
40
|
def initialize(str)
|
41
|
+
if str.match(/>/)
|
42
|
+
raise ParseFasta::SequenceFormatError
|
43
|
+
end
|
44
|
+
|
39
45
|
super(str.gsub(/ +/, ""))
|
40
46
|
end
|
41
47
|
|
data/lib/parse_fasta/version.rb
CHANGED
data/lib/parse_fasta.rb
CHANGED
@@ -24,10 +24,17 @@ require 'parse_fasta/sequence'
|
|
24
24
|
require 'parse_fasta/quality'
|
25
25
|
|
26
26
|
module ParseFasta
|
27
|
+
|
28
|
+
class Error < StandardError
|
29
|
+
end
|
30
|
+
|
27
31
|
# Error raised when FASTA file is malformed
|
28
32
|
class DataFormatError < IOError
|
29
33
|
def message
|
30
34
|
"Data format error -- check input file"
|
31
35
|
end
|
32
36
|
end
|
37
|
+
|
38
|
+
class SequenceFormatError < Error
|
39
|
+
end
|
33
40
|
end
|
data/spec/lib/fasta_file_spec.rb
CHANGED
@@ -49,6 +49,15 @@ describe FastaFile do
|
|
49
49
|
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
|
50
50
|
let(:fasta) { FastaFile.open(fname) }
|
51
51
|
|
52
|
+
context "with badly catted fasta" do
|
53
|
+
it "raises ParseFasta::SequenceFormatError" do
|
54
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
55
|
+
|
56
|
+
expect { FastaFile.open(fname).to_hash }.
|
57
|
+
to raise_error ParseFasta::SequenceFormatError
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
52
61
|
it "reads the records into a hash: header as key and seq as val" do
|
53
62
|
expect(fasta.to_hash).to eq records
|
54
63
|
end
|
@@ -66,6 +75,15 @@ describe FastaFile do
|
|
66
75
|
let(:truthy_records) { Helpers::TRUTHY_RECORDS }
|
67
76
|
let(:f_handle) { FastaFile.open(@fname).each_record { |s| } }
|
68
77
|
|
78
|
+
context "with badly catted fasta" do
|
79
|
+
it "raises ParseFasta::SequenceFormatError" do
|
80
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
81
|
+
|
82
|
+
expect { FastaFile.open(fname).each_record {} }.
|
83
|
+
to raise_error ParseFasta::SequenceFormatError
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
69
87
|
shared_examples_for "any FastaFile" do
|
70
88
|
context "with no arguments" do
|
71
89
|
it "yields proper header and sequence for each record" do
|
data/spec/lib/seq_file_spec.rb
CHANGED
@@ -26,6 +26,15 @@ describe SeqFile do
|
|
26
26
|
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
|
27
27
|
let(:fasta) { SeqFile.open(fname) }
|
28
28
|
|
29
|
+
context "with badly catted fasta" do
|
30
|
+
it "raises ParseFasta::SequenceFormatError" do
|
31
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
32
|
+
|
33
|
+
expect { FastaFile.open(fname).to_hash }.
|
34
|
+
to raise_error ParseFasta::SequenceFormatError
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
29
38
|
it "reads the records into a hash: header as key and seq as val" do
|
30
39
|
expect(fasta.to_hash).to eq records
|
31
40
|
end
|
@@ -77,6 +86,15 @@ describe SeqFile do
|
|
77
86
|
|
78
87
|
let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
|
79
88
|
|
89
|
+
context "with badly catted fasta" do
|
90
|
+
it "raises ParseFasta::SequenceFormatError" do
|
91
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
92
|
+
|
93
|
+
expect { FastaFile.open(fname).to_hash }.
|
94
|
+
to raise_error ParseFasta::SequenceFormatError
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
80
98
|
shared_examples_for "parsing a fasta file" do
|
81
99
|
it "yields proper header and sequence for each record" do
|
82
100
|
expect { |b|
|
data/spec/lib/sequence_spec.rb
CHANGED
@@ -35,6 +35,14 @@ describe Sequence do
|
|
35
35
|
s_no_spaces = "ACTACTACTGCT"
|
36
36
|
expect(Sequence.new(s)).to eq s_no_spaces
|
37
37
|
end
|
38
|
+
|
39
|
+
context "when sequence has a '>' in it" do
|
40
|
+
it "raises SequenceFormatError" do
|
41
|
+
s = "actg>sequence 3"
|
42
|
+
expect { Sequence.new(s) }.
|
43
|
+
to raise_error ParseFasta::SequenceFormatError
|
44
|
+
end
|
45
|
+
end
|
38
46
|
end
|
39
47
|
|
40
48
|
describe "#gc" do
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ module Helpers
|
|
29
29
|
["empty seq 1", ""],
|
30
30
|
["empty seq 2", ""],
|
31
31
|
["seq3", "yyyyyyyyyyyyyyyNNN"],
|
32
|
+
["seq 4 > has many '>' in header", "ACTGactg"],
|
32
33
|
["empty seq at end", ""]]
|
33
34
|
|
34
35
|
RECORDS_MAP = {
|
@@ -38,16 +39,18 @@ module Helpers
|
|
38
39
|
"empty seq 1" => "",
|
39
40
|
"empty seq 2" => "",
|
40
41
|
"seq3" => "yyyyyyyyyyyyyyyNNN",
|
42
|
+
"seq 4 > has many '>' in header" => "ACTGactg",
|
41
43
|
"empty seq at end" => ""
|
42
44
|
}
|
43
45
|
|
44
|
-
|
45
46
|
TRUTHY_RECORDS = [["empty seq at beginning", []],
|
46
47
|
["seq1 is fun", ["AACTGGNNN"]],
|
47
48
|
["seq2", ["AAT", "CCTGNNN"]],
|
48
49
|
["empty seq 1", []],
|
49
50
|
["empty seq 2", []],
|
50
51
|
["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
|
52
|
+
["seq 4 > has many '>' in header", ["ACTG" ,
|
53
|
+
"actg"]],
|
51
54
|
["empty seq at end", []]]
|
52
55
|
|
53
56
|
end
|
data/test_files/test.fa
CHANGED
data/test_files/test.fa.gz
CHANGED
Binary file
|
data/test_files/test.fq.gz
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -138,6 +138,7 @@ files:
|
|
138
138
|
- spec/lib/seq_file_spec.rb
|
139
139
|
- spec/lib/sequence_spec.rb
|
140
140
|
- spec/spec_helper.rb
|
141
|
+
- test_files/bad.fa
|
141
142
|
- test_files/benchmark.rb
|
142
143
|
- test_files/bogus.txt
|
143
144
|
- test_files/test.fa
|