parse_fasta 1.7.1 → 1.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +11 -1
- data/lib/parse_fasta/quality.rb +11 -0
- data/lib/parse_fasta/sequence.rb +14 -4
- data/lib/parse_fasta/version.rb +1 -1
- data/spec/lib/quality_spec.rb +10 -2
- data/spec/lib/sequence_spec.rb +9 -1
- data/test_files/test.fa +2 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +3 -3
- data/test_files/test.fq.gz +0 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NzY3MWVlYjVmYzM4NzhjZjIzNDUyYjFlNWZhMjMxMGE4NWQ3YTI4Yw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTVjMThlYzg0NzUwZTdmMzIyZmJlYjc5NjBiNDUzMmE2NjJjMGVkOA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZTVkOTQ4MGMzMTQ4ZDg5MzczMDllMzgwNDY3Y2NmNzU4NDc5MzMwOGIwYzA3
|
10
|
+
Y2FjMzQ1YzA4NDhiMjY0OWMyYjMyMWFhOWM5MTI5OGFiMTJiMDFhYmJlNTI3
|
11
|
+
NjBkZDU5MzMzOTJlOGY0NzdiZWRiYWI5ZWE2YTBiZjJkYTk1ZTU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MWJkZjY5ZWY5MmYyNTJjMjIyODE0ZmQ4YWU4OTZlMTY5MzIyNGNlZGJkNjU2
|
14
|
+
YjBjNmI5YTIyYTdmMGJhZmVlNTExZWJkN2U4OWRlMDE0NzM3MGI4ZGE4M2Rj
|
15
|
+
MzNjODU3YzYzMDFmZDg1MjU1YzQ2MDgxODIwNmQ3ZDJkZDU3MWQ=
|
data/README.md
CHANGED
@@ -27,7 +27,7 @@ lightweight than BioRuby. And more fun! ;)
|
|
27
27
|
## Documentation ##
|
28
28
|
|
29
29
|
Checkout
|
30
|
-
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.7.1/frames)
|
30
|
+
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.7.2/frames)
|
31
31
|
for the full api documentation.
|
32
32
|
|
33
33
|
## Usage ##
|
@@ -70,6 +70,16 @@ Read fasta file into a hash.
|
|
70
70
|
|
71
71
|
Add `SeqFile#to_hash`, `FastaFile#to_hash` and `FastqFile#to_hash`.
|
72
72
|
|
73
|
+
#### 1.7.2 ####
|
74
|
+
|
75
|
+
Strip spaces (not all whitespace) from `Sequence` and `Quality` strings.
|
76
|
+
|
77
|
+
Some alignment fastas have spaces for easier reading. Strip these
|
78
|
+
out. For consistency, also strips spaces from `Quality` strings. If
|
79
|
+
there are spaces that don't match in the quality and sequence in a
|
80
|
+
fastQ file, then things will get messed up in the FastQ file. FastQ
|
81
|
+
shouldn't have spaces though.
|
82
|
+
|
73
83
|
### 1.6 ###
|
74
84
|
|
75
85
|
Added `SeqFile` class, which accepts either fastA or fastQ files. It
|
data/lib/parse_fasta/quality.rb
CHANGED
@@ -19,6 +19,17 @@
|
|
19
19
|
# Provide some methods for dealing with common tasks regarding
|
20
20
|
# quality strings.
|
21
21
|
class Quality < String
|
22
|
+
|
23
|
+
# Strips spaces (not all whitespace) from the str argument before calling super
|
24
|
+
#
|
25
|
+
# @return [Quality] A Quality string
|
26
|
+
#
|
27
|
+
# @example Removes spaces
|
28
|
+
# Quality.new "I I 2 ! " #=> "II2!"
|
29
|
+
def initialize(str)
|
30
|
+
super(str.gsub(/ +/, ""))
|
31
|
+
end
|
32
|
+
|
22
33
|
# Returns the mean quality for the record. This will be a good deal
|
23
34
|
# faster than getting the average with `qual_scores` and reduce.
|
24
35
|
#
|
data/lib/parse_fasta/sequence.rb
CHANGED
@@ -20,6 +20,16 @@
|
|
20
20
|
# nucleotide sequences.
|
21
21
|
class Sequence < String
|
22
22
|
|
23
|
+
# Strips spaces (not all whitespace) from the str argument before calling super
|
24
|
+
#
|
25
|
+
# @return [Sequence] A Sequence string
|
26
|
+
#
|
27
|
+
# @example Removes spaces
|
28
|
+
# Sequence.new "AA CC TT" #=> "AACCTT"
|
29
|
+
def initialize(str)
|
30
|
+
super(str.gsub(/ +/, ""))
|
31
|
+
end
|
32
|
+
|
23
33
|
# Calculates GC content
|
24
34
|
#
|
25
35
|
# Calculates GC content by dividing count of G + C divided by count
|
@@ -45,7 +55,7 @@ class Sequence < String
|
|
45
55
|
t = s.count('t')
|
46
56
|
a = s.count('a')
|
47
57
|
u = s.count('u')
|
48
|
-
|
58
|
+
|
49
59
|
return 0 if c + g + t + a + u == 0
|
50
60
|
return (c + g) / (c + g + t + a + u).to_f
|
51
61
|
end
|
@@ -87,9 +97,9 @@ class Sequence < String
|
|
87
97
|
warn('ERROR: A sequence contains both T and U')
|
88
98
|
counts[:t], counts[:u] = t, u
|
89
99
|
end
|
90
|
-
|
100
|
+
|
91
101
|
counts[:n] = s.count('n') if count_ambiguous_bases
|
92
|
-
|
102
|
+
|
93
103
|
counts
|
94
104
|
end
|
95
105
|
|
@@ -116,7 +126,7 @@ class Sequence < String
|
|
116
126
|
def base_frequencies(count_ambiguous_bases=nil)
|
117
127
|
base_counts = self.base_counts(count_ambiguous_bases)
|
118
128
|
total_bases = base_counts.values.reduce(:+).to_f
|
119
|
-
base_freqs =
|
129
|
+
base_freqs =
|
120
130
|
base_counts.map { |base, count| [base, count/total_bases] }.flatten
|
121
131
|
Hash[*base_freqs]
|
122
132
|
end
|
data/lib/parse_fasta/version.rb
CHANGED
data/spec/lib/quality_spec.rb
CHANGED
@@ -21,10 +21,18 @@ require 'bio'
|
|
21
21
|
|
22
22
|
describe Quality do
|
23
23
|
let(:qual_string) { qual_string = Quality.new('ab%63:K') }
|
24
|
-
let(:bioruby_qual_scores) do
|
24
|
+
let(:bioruby_qual_scores) do
|
25
25
|
Bio::Fastq.new("@seq1\nACTGACT\n+\n#{qual_string}").quality_scores
|
26
26
|
end
|
27
27
|
|
28
|
+
describe "::new" do
|
29
|
+
it "removes any spaces in the quality string" do
|
30
|
+
q = " ab # :m, ! "
|
31
|
+
q_no_spaces = "ab#:m,!"
|
32
|
+
expect(Quality.new(q)).to eq q_no_spaces
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
28
36
|
describe "#qual_scores" do
|
29
37
|
context "with illumina style quality scores" do
|
30
38
|
it "returns an array of quality scores" do
|
@@ -39,5 +47,5 @@ describe Quality do
|
|
39
47
|
mean_quality = qual_string.qual_scores.reduce(:+) / len
|
40
48
|
expect(qual_string.mean_qual).to eq mean_quality
|
41
49
|
end
|
42
|
-
end
|
50
|
+
end
|
43
51
|
end
|
data/spec/lib/sequence_spec.rb
CHANGED
@@ -25,6 +25,14 @@ describe Sequence do
|
|
25
25
|
expect(Sequence.new('ACTG')).to be_a String
|
26
26
|
end
|
27
27
|
|
28
|
+
describe "::new" do
|
29
|
+
it "removes any spaces in the sequence" do
|
30
|
+
s = "ACT ACT ACT GCT "
|
31
|
+
s_no_spaces = "ACTACTACTGCT"
|
32
|
+
expect(Sequence.new(s)).to eq s_no_spaces
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
28
36
|
describe "#gc" do
|
29
37
|
it "gives the same answer as BioRuby" do
|
30
38
|
s = 'ACtgcGAtcgCgAaTtGgCcnNuU'
|
@@ -75,7 +83,7 @@ describe Sequence do
|
|
75
83
|
expect(s.base_counts).to eq({ a: 2, c: 2, u: 2, g: 2 })
|
76
84
|
end
|
77
85
|
end
|
78
|
-
|
86
|
+
|
79
87
|
context "for an RNA sequence with truthy argument" do
|
80
88
|
it "returns a map of A, C, U, G and N counts" do
|
81
89
|
s = Sequence.new('ACUGNacugn')
|
data/test_files/test.fa
CHANGED
data/test_files/test.fa.gz
CHANGED
Binary file
|
data/test_files/test.fq
CHANGED
data/test_files/test.fq.gz
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.7.1
|
4
|
+
version: 1.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|