parse_fasta 1.8.0 → 1.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/README.md +22 -1
- data/lib/parse_fasta/fasta_file.rb +28 -2
- data/lib/parse_fasta/seq_file.rb +6 -0
- data/lib/parse_fasta/sequence.rb +6 -0
- data/lib/parse_fasta/version.rb +1 -1
- data/lib/parse_fasta.rb +7 -0
- data/spec/lib/fasta_file_spec.rb +18 -0
- data/spec/lib/seq_file_spec.rb +18 -0
- data/spec/lib/sequence_spec.rb +8 -0
- data/spec/spec_helper.rb +4 -1
- data/test_files/bad.fa +5 -0
- data/test_files/test.fa +3 -0
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq.gz +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YzlkNTQ5NGQ5YTFlNzVkOTJjOTJkMTM2YmUwN2FlMjhmOTg2ZDZlMQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OThhYTU5NTAzYzlkMTg2N2IxOWNjYTExMWEyODRiY2Q2OGFhMzQ4MQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NzMyYTNmZmQ0YThlMThkZmE3ZjZhZjAzNDM2MGQ4ZTcwODhkODY3NzI2NzU1
|
10
|
+
NDQzYmU1ZDBiZjljYzVhZmNlMDIzZDMxMDc4Zjk3N2E1YTAxOTUzZTIyOGNj
|
11
|
+
NzdjOWJiODA2ZDA0NGNmMjFkOGI1ZjgxZWY3NTRmMTQ1MDc5MTU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ODU2NzMwZTk3ZmE0ZTIxYzMwOWVkMWUyY2U4MTE3YzAzMzI5MzU1ZDAzNWE3
|
14
|
+
OGQ3ODk2ZjQwYTNjNTJlZTVjYzg3MGU5YzliZjAyYjQ4ZDNmNjRlNzE2YmJk
|
15
|
+
MmY2OTRhYjI3NTM3ODFmYWYwNDk2ZjQ0YzI3YjIxMzI3MGU3MmE=
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -27,7 +27,7 @@ lightweight than BioRuby. And more fun! ;)
|
|
27
27
|
## Documentation ##
|
28
28
|
|
29
29
|
Check out
|
30
|
-
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta
|
30
|
+
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta)
|
31
31
|
for the full api documentation.
|
32
32
|
|
33
33
|
## Usage ##
|
@@ -73,6 +73,27 @@ Add `Sequence#rev_comp`. It can handle IUPAC characters. Since
|
|
73
73
|
an amino acid string, things will get weird as it will complement the
|
74
74
|
IUPAC characters in the AA string and leave others.
|
75
75
|
|
76
|
+
#### 1.8.1 ####
|
77
|
+
|
78
|
+
An error will be raised if a fasta file has a `>` in the
|
79
|
+
sequence. Sometimes files are not terminated with a newline
|
80
|
+
character. If this is the case, then catting two fasta files will
|
81
|
+
smush the first header of the second file right in with the last
|
82
|
+
sequence of the first file. This is bad, so an error is raised! ;)
|
83
|
+
|
84
|
+
Example
|
85
|
+
|
86
|
+
>seq1
|
87
|
+
ACTG>seq2
|
88
|
+
ACTG
|
89
|
+
>seq3
|
90
|
+
ACTG
|
91
|
+
|
92
|
+
This will raise `ParseFasta::SequenceFormatError`.
|
93
|
+
|
94
|
+
Also, headers that themselves contain `>` characters are now handled correctly.
|
95
|
+
|
96
|
+
|
76
97
|
### 1.7 ###
|
77
98
|
|
78
99
|
Add `SeqFile#to_hash`, `FastaFile#to_hash` and `FastqFile#to_hash`.
|
@@ -51,6 +51,8 @@ class FastaFile < File
|
|
51
51
|
#
|
52
52
|
# @return [Hash] A hash with headers as keys, sequences as the
|
53
53
|
# values (Sequence objects)
|
54
|
+
#
|
55
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
54
56
|
def to_hash
|
55
57
|
hash = {}
|
56
58
|
self.each_record do |head, seq|
|
@@ -88,6 +90,8 @@ class FastaFile < File
|
|
88
90
|
# fasta record. If `separate_lines` is falsy (the default
|
89
91
|
# behavior), will be Sequence, but if truthy will be
|
90
92
|
# Array<String>.
|
93
|
+
#
|
94
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
91
95
|
def each_record(separate_lines=nil)
|
92
96
|
begin
|
93
97
|
f = Zlib::GzipReader.open(self)
|
@@ -100,11 +104,33 @@ class FastaFile < File
|
|
100
104
|
header, sequence = parse_line_separately(line)
|
101
105
|
yield(header.strip, sequence)
|
102
106
|
end
|
107
|
+
|
108
|
+
# f.each_with_index(">") do |line, idx|
|
109
|
+
# if idx.zero?
|
110
|
+
# if line != ">"
|
111
|
+
# raise ParseFasta::DataFormatError
|
112
|
+
# end
|
113
|
+
# else
|
114
|
+
# header, sequence = parse_line_separately(line)
|
115
|
+
# yield(header.strip, sequence)
|
116
|
+
# end
|
117
|
+
# end
|
103
118
|
else
|
104
119
|
f.each("\n>") do |line|
|
105
120
|
header, sequence = parse_line(line)
|
106
121
|
yield(header.strip, Sequence.new(sequence || ""))
|
107
122
|
end
|
123
|
+
|
124
|
+
# f.each_with_index(sep=/^>/) do |line, idx|
|
125
|
+
# if idx.zero?
|
126
|
+
# if line != ">"
|
127
|
+
# raise ParseFasta::DataFormatError
|
128
|
+
# end
|
129
|
+
# else
|
130
|
+
# header, sequence = parse_line(line)
|
131
|
+
# yield(header.strip, Sequence.new(sequence || ""))
|
132
|
+
# end
|
133
|
+
# end
|
108
134
|
end
|
109
135
|
|
110
136
|
f.close if f.instance_of?(Zlib::GzipReader)
|
@@ -114,12 +140,12 @@ class FastaFile < File
|
|
114
140
|
private
|
115
141
|
|
116
142
|
def parse_line(line)
|
117
|
-
line.split("\n", 2).map { |s| s.gsub(/\n
|
143
|
+
line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
|
118
144
|
end
|
119
145
|
|
120
146
|
def parse_line_separately(line)
|
121
147
|
header, sequence =
|
122
|
-
line.split("\n", 2).map { |s| s.gsub(
|
148
|
+
line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
|
123
149
|
|
124
150
|
if sequence.nil?
|
125
151
|
sequences = []
|
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -29,6 +29,9 @@ class SeqFile < File
|
|
29
29
|
#
|
30
30
|
# @return [Hash] A hash with headers as keys, sequences as the
|
31
31
|
# values (Sequence objects)
|
32
|
+
#
|
33
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
34
|
+
# and file is a fastA file
|
32
35
|
def to_hash
|
33
36
|
first_char = get_first_char(self)
|
34
37
|
|
@@ -73,6 +76,9 @@ class SeqFile < File
|
|
73
76
|
# leading '>' or '@'
|
74
77
|
#
|
75
78
|
# @yieldparam sequence [Sequence] The sequence of the record.
|
79
|
+
#
|
80
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
81
|
+
# and file is a fastA file
|
76
82
|
def each_record
|
77
83
|
first_char = get_first_char(self)
|
78
84
|
|
data/lib/parse_fasta/sequence.rb
CHANGED
@@ -35,7 +35,13 @@ class Sequence < String
|
|
35
35
|
#
|
36
36
|
# @example Removes whitespace
|
37
37
|
# Sequence.new "AA CC TT" #=> "AACCTT"
|
38
|
+
#
|
39
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
38
40
|
def initialize(str)
|
41
|
+
if str.match(/>/)
|
42
|
+
raise ParseFasta::SequenceFormatError
|
43
|
+
end
|
44
|
+
|
39
45
|
super(str.gsub(/ +/, ""))
|
40
46
|
end
|
41
47
|
|
data/lib/parse_fasta/version.rb
CHANGED
data/lib/parse_fasta.rb
CHANGED
@@ -24,10 +24,17 @@ require 'parse_fasta/sequence'
|
|
24
24
|
require 'parse_fasta/quality'
|
25
25
|
|
26
26
|
module ParseFasta
|
27
|
+
|
28
|
+
class Error < StandardError
|
29
|
+
end
|
30
|
+
|
27
31
|
# Error raised when FASTA file is malformed
|
28
32
|
class DataFormatError < IOError
|
29
33
|
def message
|
30
34
|
"Data format error -- check input file"
|
31
35
|
end
|
32
36
|
end
|
37
|
+
|
38
|
+
class SequenceFormatError < Error
|
39
|
+
end
|
33
40
|
end
|
data/spec/lib/fasta_file_spec.rb
CHANGED
@@ -49,6 +49,15 @@ describe FastaFile do
|
|
49
49
|
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
|
50
50
|
let(:fasta) { FastaFile.open(fname) }
|
51
51
|
|
52
|
+
context "with badly catted fasta" do
|
53
|
+
it "raises ParseFasta::SequenceFormatError" do
|
54
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
55
|
+
|
56
|
+
expect { FastaFile.open(fname).to_hash }.
|
57
|
+
to raise_error ParseFasta::SequenceFormatError
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
52
61
|
it "reads the records into a hash: header as key and seq as val" do
|
53
62
|
expect(fasta.to_hash).to eq records
|
54
63
|
end
|
@@ -66,6 +75,15 @@ describe FastaFile do
|
|
66
75
|
let(:truthy_records) { Helpers::TRUTHY_RECORDS }
|
67
76
|
let(:f_handle) { FastaFile.open(@fname).each_record { |s| } }
|
68
77
|
|
78
|
+
context "with badly catted fasta" do
|
79
|
+
it "raises ParseFasta::SequenceFormatError" do
|
80
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
81
|
+
|
82
|
+
expect { FastaFile.open(fname).each_record {} }.
|
83
|
+
to raise_error ParseFasta::SequenceFormatError
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
69
87
|
shared_examples_for "any FastaFile" do
|
70
88
|
context "with no arguments" do
|
71
89
|
it "yields proper header and sequence for each record" do
|
data/spec/lib/seq_file_spec.rb
CHANGED
@@ -26,6 +26,15 @@ describe SeqFile do
|
|
26
26
|
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
|
27
27
|
let(:fasta) { SeqFile.open(fname) }
|
28
28
|
|
29
|
+
context "with badly catted fasta" do
|
30
|
+
it "raises ParseFasta::SequenceFormatError" do
|
31
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
32
|
+
|
33
|
+
expect { FastaFile.open(fname).to_hash }.
|
34
|
+
to raise_error ParseFasta::SequenceFormatError
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
29
38
|
it "reads the records into a hash: header as key and seq as val" do
|
30
39
|
expect(fasta.to_hash).to eq records
|
31
40
|
end
|
@@ -77,6 +86,15 @@ describe SeqFile do
|
|
77
86
|
|
78
87
|
let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
|
79
88
|
|
89
|
+
context "with badly catted fasta" do
|
90
|
+
it "raises ParseFasta::SequenceFormatError" do
|
91
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
92
|
+
|
93
|
+
expect { FastaFile.open(fname).to_hash }.
|
94
|
+
to raise_error ParseFasta::SequenceFormatError
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
80
98
|
shared_examples_for "parsing a fasta file" do
|
81
99
|
it "yields proper header and sequence for each record" do
|
82
100
|
expect { |b|
|
data/spec/lib/sequence_spec.rb
CHANGED
@@ -35,6 +35,14 @@ describe Sequence do
|
|
35
35
|
s_no_spaces = "ACTACTACTGCT"
|
36
36
|
expect(Sequence.new(s)).to eq s_no_spaces
|
37
37
|
end
|
38
|
+
|
39
|
+
context "when sequence has a '>' in it" do
|
40
|
+
it "raises SequenceFormatError" do
|
41
|
+
s = "actg>sequence 3"
|
42
|
+
expect { Sequence.new(s) }.
|
43
|
+
to raise_error ParseFasta::SequenceFormatError
|
44
|
+
end
|
45
|
+
end
|
38
46
|
end
|
39
47
|
|
40
48
|
describe "#gc" do
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ module Helpers
|
|
29
29
|
["empty seq 1", ""],
|
30
30
|
["empty seq 2", ""],
|
31
31
|
["seq3", "yyyyyyyyyyyyyyyNNN"],
|
32
|
+
["seq 4 > has many '>' in header", "ACTGactg"],
|
32
33
|
["empty seq at end", ""]]
|
33
34
|
|
34
35
|
RECORDS_MAP = {
|
@@ -38,16 +39,18 @@ module Helpers
|
|
38
39
|
"empty seq 1" => "",
|
39
40
|
"empty seq 2" => "",
|
40
41
|
"seq3" => "yyyyyyyyyyyyyyyNNN",
|
42
|
+
"seq 4 > has many '>' in header" => "ACTGactg",
|
41
43
|
"empty seq at end" => ""
|
42
44
|
}
|
43
45
|
|
44
|
-
|
45
46
|
TRUTHY_RECORDS = [["empty seq at beginning", []],
|
46
47
|
["seq1 is fun", ["AACTGGNNN"]],
|
47
48
|
["seq2", ["AAT", "CCTGNNN"]],
|
48
49
|
["empty seq 1", []],
|
49
50
|
["empty seq 2", []],
|
50
51
|
["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
|
52
|
+
["seq 4 > has many '>' in header", ["ACTG" ,
|
53
|
+
"actg"]],
|
51
54
|
["empty seq at end", []]]
|
52
55
|
|
53
56
|
end
|
data/test_files/test.fa
CHANGED
data/test_files/test.fa.gz
CHANGED
Binary file
|
data/test_files/test.fq.gz
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -138,6 +138,7 @@ files:
|
|
138
138
|
- spec/lib/seq_file_spec.rb
|
139
139
|
- spec/lib/sequence_spec.rb
|
140
140
|
- spec/spec_helper.rb
|
141
|
+
- test_files/bad.fa
|
141
142
|
- test_files/benchmark.rb
|
142
143
|
- test_files/bogus.txt
|
143
144
|
- test_files/test.fa
|