parse_fasta 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MmIxNzgzM2E2MzkzYTc1ZjM2MWE0NzIxN2ZmZmRkMGUxMDQ0Y2MzYw==
4
+ YzlkNTQ5NGQ5YTFlNzVkOTJjOTJkMTM2YmUwN2FlMjhmOTg2ZDZlMQ==
5
5
  data.tar.gz: !binary |-
6
- Njc4ZTVlN2EwMTVjZDkwNWZmYTM2MzcwY2FkY2NlNjVmNmJiNWFmYw==
6
+ OThhYTU5NTAzYzlkMTg2N2IxOWNjYTExMWEyODRiY2Q2OGFhMzQ4MQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZjNiOTc2NTM0OTc0M2U4NDNkYzhlNTBlNjM3MzI4Mjc4YWY4YmE4MjRhMDdj
10
- MGRiNWNiMmExZTZjYmRhZDk0YzcxMTQ1OGQ0NzdkYTVmMmQ0Mzg1YmU0NmFl
11
- NmNjYWM1MmY4ZWRhNTY5MDAzNTk1YzcyN2IzYWE2MTNkMzAwNDg=
9
+ NzMyYTNmZmQ0YThlMThkZmE3ZjZhZjAzNDM2MGQ4ZTcwODhkODY3NzI2NzU1
10
+ NDQzYmU1ZDBiZjljYzVhZmNlMDIzZDMxMDc4Zjk3N2E1YTAxOTUzZTIyOGNj
11
+ NzdjOWJiODA2ZDA0NGNmMjFkOGI1ZjgxZWY3NTRmMTQ1MDc5MTU=
12
12
  data.tar.gz: !binary |-
13
- Mjk5ZjU3YWI3YTJlN2Q4NWJjMWY3NDczOTBhNzI3NzlkMDViZjRhZGFkN2I4
14
- NmU5YzFhYTI4ZDc2N2RhYTE5ODdkODE5NTQ5ZjJmNzNmNjEyNzY3NzJiZTk3
15
- ODA2MWI3YjUzZDdmMTE5MGM1MDA3ZTk1NmMyNGU3NjFmOTIyMWY=
13
+ ODU2NzMwZTk3ZmE0ZTIxYzMwOWVkMWUyY2U4MTE3YzAzMzI5MzU1ZDAzNWE3
14
+ OGQ3ODk2ZjQwYTNjNTJlZTVjYzg3MGU5YzliZjAyYjQ4ZDNmNjRlNzE2YmJk
15
+ MmY2OTRhYjI3NTM3ODFmYWYwNDk2ZjQ0YzI3YjIxMzI3MGU3MmE=
data/.gitignore CHANGED
@@ -20,3 +20,4 @@ tmp
20
20
  *.o
21
21
  *.a
22
22
  mkmf.log
23
+ .ruby-*
data/README.md CHANGED
@@ -27,7 +27,7 @@ lightweight than BioRuby. And more fun! ;)
27
27
  ## Documentation ##
28
28
 
29
29
  Check out
30
- [parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.8.0/frames)
30
+ [parse_fasta docs](http://rubydoc.info/gems/parse_fasta)
31
31
  for the full api documentation.
32
32
 
33
33
  ## Usage ##
@@ -73,6 +73,27 @@ Add `Sequence#rev_comp`. It can handle IUPAC characters. Since
73
73
  an amino acid string, things will get weird as it will complement the
74
74
  IUPAC characters in the AA string and leave others.
75
75
 
76
+ #### 1.8.1 ####
77
+
78
+ An error will be raised if a fasta file has a `>` in the
79
+ sequence. Sometimes files are not terminated with a newline
80
+ character. If this is the case, then concatenating two fasta files
81
+ (e.g., with `cat`) will merge the first header of the second file
82
+ into the last sequence of the first file. This is bad, so an error is raised! ;)
83
+
84
+ Example
85
+
86
+ >seq1
87
+ ACTG>seq2
88
+ ACTG
89
+ >seq3
90
+ ACTG
91
+
92
+ This will raise `ParseFasta::SequenceFormatError`.
93
+
94
+ Also, headers that contain `>` characters are now parsed correctly.
95
+
96
+
76
97
  ### 1.7 ###
77
98
 
78
99
  Add `SeqFile#to_hash`, `FastaFile#to_hash` and `FastqFile#to_hash`.
@@ -51,6 +51,8 @@ class FastaFile < File
51
51
  #
52
52
  # @return [Hash] A hash with headers as keys, sequences as the
53
53
  # values (Sequence objects)
54
+ #
55
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
54
56
  def to_hash
55
57
  hash = {}
56
58
  self.each_record do |head, seq|
@@ -88,6 +90,8 @@ class FastaFile < File
88
90
  # fasta record. If `separate_lines` is falsy (the default
89
91
  # behavior), will be Sequence, but if truthy will be
90
92
  # Array<String>.
93
+ #
94
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
91
95
  def each_record(separate_lines=nil)
92
96
  begin
93
97
  f = Zlib::GzipReader.open(self)
@@ -100,11 +104,33 @@ class FastaFile < File
100
104
  header, sequence = parse_line_separately(line)
101
105
  yield(header.strip, sequence)
102
106
  end
107
+
108
+ # f.each_with_index(">") do |line, idx|
109
+ # if idx.zero?
110
+ # if line != ">"
111
+ # raise ParseFasta::DataFormatError
112
+ # end
113
+ # else
114
+ # header, sequence = parse_line_separately(line)
115
+ # yield(header.strip, sequence)
116
+ # end
117
+ # end
103
118
  else
104
119
  f.each("\n>") do |line|
105
120
  header, sequence = parse_line(line)
106
121
  yield(header.strip, Sequence.new(sequence || ""))
107
122
  end
123
+
124
+ # f.each_with_index(sep=/^>/) do |line, idx|
125
+ # if idx.zero?
126
+ # if line != ">"
127
+ # raise ParseFasta::DataFormatError
128
+ # end
129
+ # else
130
+ # header, sequence = parse_line(line)
131
+ # yield(header.strip, Sequence.new(sequence || ""))
132
+ # end
133
+ # end
108
134
  end
109
135
 
110
136
  f.close if f.instance_of?(Zlib::GzipReader)
@@ -114,12 +140,12 @@ class FastaFile < File
114
140
  private
115
141
 
116
142
  def parse_line(line)
117
- line.split("\n", 2).map { |s| s.gsub(/\n|>/, '') }
143
+ line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
118
144
  end
119
145
 
120
146
  def parse_line_separately(line)
121
147
  header, sequence =
122
- line.split("\n", 2).map { |s| s.gsub(/>/, '') }
148
+ line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
123
149
 
124
150
  if sequence.nil?
125
151
  sequences = []
@@ -29,6 +29,9 @@ class SeqFile < File
29
29
  #
30
30
  # @return [Hash] A hash with headers as keys, sequences as the
31
31
  # values (Sequence objects)
32
+ #
33
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
34
+ # and file is a fastA file
32
35
  def to_hash
33
36
  first_char = get_first_char(self)
34
37
 
@@ -73,6 +76,9 @@ class SeqFile < File
73
76
  # leading '>' or '@'
74
77
  #
75
78
  # @yieldparam sequence [Sequence] The sequence of the record.
79
+ #
80
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
81
+ # and file is a fastA file
76
82
  def each_record
77
83
  first_char = get_first_char(self)
78
84
 
@@ -35,7 +35,13 @@ class Sequence < String
35
35
  #
36
36
  # @example Removes whitespace
37
37
  # Sequence.new "AA CC TT" #=> "AACCTT"
38
+ #
39
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
38
40
  def initialize(str)
41
+ if str.match(/>/)
42
+ raise ParseFasta::SequenceFormatError
43
+ end
44
+
39
45
  super(str.gsub(/ +/, ""))
40
46
  end
41
47
 
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.8.0"
20
+ VERSION = "1.8.1"
21
21
  end
data/lib/parse_fasta.rb CHANGED
@@ -24,10 +24,17 @@ require 'parse_fasta/sequence'
24
24
  require 'parse_fasta/quality'
25
25
 
26
26
  module ParseFasta
27
+
28
+ class Error < StandardError
29
+ end
30
+
27
31
  # Error raised when FASTA file is malformed
28
32
  class DataFormatError < IOError
29
33
  def message
30
34
  "Data format error -- check input file"
31
35
  end
32
36
  end
37
+
38
+ class SequenceFormatError < Error
39
+ end
33
40
  end
@@ -49,6 +49,15 @@ describe FastaFile do
49
49
  let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
50
50
  let(:fasta) { FastaFile.open(fname) }
51
51
 
52
+ context "with badly catted fasta" do
53
+ it "raises ParseFasta::SequenceFormatError" do
54
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
55
+
56
+ expect { FastaFile.open(fname).to_hash }.
57
+ to raise_error ParseFasta::SequenceFormatError
58
+ end
59
+ end
60
+
52
61
  it "reads the records into a hash: header as key and seq as val" do
53
62
  expect(fasta.to_hash).to eq records
54
63
  end
@@ -66,6 +75,15 @@ describe FastaFile do
66
75
  let(:truthy_records) { Helpers::TRUTHY_RECORDS }
67
76
  let(:f_handle) { FastaFile.open(@fname).each_record { |s| } }
68
77
 
78
+ context "with badly catted fasta" do
79
+ it "raises ParseFasta::SequenceFormatError" do
80
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
81
+
82
+ expect { FastaFile.open(fname).each_record {} }.
83
+ to raise_error ParseFasta::SequenceFormatError
84
+ end
85
+ end
86
+
69
87
  shared_examples_for "any FastaFile" do
70
88
  context "with no arguments" do
71
89
  it "yields proper header and sequence for each record" do
@@ -26,6 +26,15 @@ describe SeqFile do
26
26
  let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
27
27
  let(:fasta) { SeqFile.open(fname) }
28
28
 
29
+ context "with badly catted fasta" do
30
+ it "raises ParseFasta::SequenceFormatError" do
31
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
32
+
33
+ expect { FastaFile.open(fname).to_hash }.
34
+ to raise_error ParseFasta::SequenceFormatError
35
+ end
36
+ end
37
+
29
38
  it "reads the records into a hash: header as key and seq as val" do
30
39
  expect(fasta.to_hash).to eq records
31
40
  end
@@ -77,6 +86,15 @@ describe SeqFile do
77
86
 
78
87
  let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
79
88
 
89
+ context "with badly catted fasta" do
90
+ it "raises ParseFasta::SequenceFormatError" do
91
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
92
+
93
+ expect { FastaFile.open(fname).to_hash }.
94
+ to raise_error ParseFasta::SequenceFormatError
95
+ end
96
+ end
97
+
80
98
  shared_examples_for "parsing a fasta file" do
81
99
  it "yields proper header and sequence for each record" do
82
100
  expect { |b|
@@ -35,6 +35,14 @@ describe Sequence do
35
35
  s_no_spaces = "ACTACTACTGCT"
36
36
  expect(Sequence.new(s)).to eq s_no_spaces
37
37
  end
38
+
39
+ context "when sequence has a '>' in it" do
40
+ it "raises SequenceFormatError" do
41
+ s = "actg>sequence 3"
42
+ expect { Sequence.new(s) }.
43
+ to raise_error ParseFasta::SequenceFormatError
44
+ end
45
+ end
38
46
  end
39
47
 
40
48
  describe "#gc" do
data/spec/spec_helper.rb CHANGED
@@ -29,6 +29,7 @@ module Helpers
29
29
  ["empty seq 1", ""],
30
30
  ["empty seq 2", ""],
31
31
  ["seq3", "yyyyyyyyyyyyyyyNNN"],
32
+ ["seq 4 > has many '>' in header", "ACTGactg"],
32
33
  ["empty seq at end", ""]]
33
34
 
34
35
  RECORDS_MAP = {
@@ -38,16 +39,18 @@ module Helpers
38
39
  "empty seq 1" => "",
39
40
  "empty seq 2" => "",
40
41
  "seq3" => "yyyyyyyyyyyyyyyNNN",
42
+ "seq 4 > has many '>' in header" => "ACTGactg",
41
43
  "empty seq at end" => ""
42
44
  }
43
45
 
44
-
45
46
  TRUTHY_RECORDS = [["empty seq at beginning", []],
46
47
  ["seq1 is fun", ["AACTGGNNN"]],
47
48
  ["seq2", ["AAT", "CCTGNNN"]],
48
49
  ["empty seq 1", []],
49
50
  ["empty seq 2", []],
50
51
  ["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
52
+ ["seq 4 > has many '>' in header", ["ACTG" ,
53
+ "actg"]],
51
54
  ["empty seq at end", []]]
52
55
 
53
56
  end
data/test_files/bad.fa ADDED
@@ -0,0 +1,5 @@
1
+ >seq1
2
+ ACTG>seq2
3
+ ACTG
4
+ >seq3
5
+ ACTG
data/test_files/test.fa CHANGED
@@ -15,4 +15,7 @@ yyyyyyyyyy
15
15
 
16
16
  yyyyy
17
17
  NNN
18
+ >seq 4 > has many '>' in header
19
+ ACTG
20
+ actg
18
21
  >empty seq at end
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-17 00:00:00.000000000 Z
11
+ date: 2016-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -138,6 +138,7 @@ files:
138
138
  - spec/lib/seq_file_spec.rb
139
139
  - spec/lib/sequence_spec.rb
140
140
  - spec/spec_helper.rb
141
+ - test_files/bad.fa
141
142
  - test_files/benchmark.rb
142
143
  - test_files/bogus.txt
143
144
  - test_files/test.fa