parse_fasta 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/lib/parse_fasta/sequence.rb +73 -2
- data/lib/parse_fasta/version.rb +1 -1
- data/parse_fasta.gemspec +1 -1
- data/spec/lib/sequence_spec.rb +67 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 689bddd57bc87d882ad8e8711acc84f0f1c33825
|
4
|
+
data.tar.gz: 85d056f1581a23cc25c1a60224fd2c177266ab23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f68c1541107d1881177606398c63d09b7da2b04d7e0f30b3165ebd51cc7310caddbcba7788f72db71f21c778c5c1f022382a17e4b450cd172946534665f360a1
|
7
|
+
data.tar.gz: eb8a9bf47345d0ee4b164ef053da5ddb4cb6785d0c0a91424b2e4549d9231d2f55ab79f9355e0df8f924d51be7be738a95f5887b6333aa99c4a5a945941f34f3
|
data/README.md
CHANGED
@@ -29,7 +29,7 @@ and over.
|
|
29
29
|
## Documentation ##
|
30
30
|
|
31
31
|
Check out
|
32
|
-
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.
|
32
|
+
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.4.0/frames)
|
33
33
|
to see the full documentation.
|
34
34
|
|
35
35
|
## Usage ##
|
@@ -58,6 +58,13 @@ Now we can parse fastq files as well!
|
|
58
58
|
|
59
59
|
## Versions ##
|
60
60
|
|
61
|
+
### 1.4.0 ###
|
62
|
+
|
63
|
+
Added methods:
|
64
|
+
|
65
|
+
Sequence.base_counts
|
66
|
+
Sequence.base_frequencies
|
67
|
+
|
61
68
|
### 1.3.0 ###
|
62
69
|
|
63
70
|
Add additional functionality to `each_record` method.
|
data/lib/parse_fasta/sequence.rb
CHANGED
@@ -25,7 +25,8 @@ class Sequence < String
|
|
25
25
|
# Calculates GC content by dividing the count of G + C by the count
|
26
26
|
# of G + C + T + A + U. If there are both T's and U's in the
|
27
27
|
# Sequence, things will get weird, but then again, that wouldn't
|
28
|
-
# happen, now would it!
|
28
|
+
# happen, now would it! Ambiguous bases are ignored similar to
|
29
|
+
# BioRuby.
|
29
30
|
#
|
30
31
|
# @example Get GC of a Sequence
|
31
32
|
# Sequence.new('ACTg').gc #=> 0.5
|
@@ -46,7 +47,77 @@ class Sequence < String
|
|
46
47
|
u = s.count('u')
|
47
48
|
|
48
49
|
return 0 if c + g + t + a + u == 0
|
49
|
-
return (c + g)
|
50
|
+
return (c + g) / (c + g + t + a + u).to_f
|
50
51
|
end
|
51
52
|
|
53
|
+
# Returns a map of base counts
|
54
|
+
#
|
55
|
+
# This method will check if the sequence is DNA or RNA and return a
|
56
|
+
# count map appropriate for each. If a truthy argument is given, the
|
57
|
+
# count of ambiguous bases will be returned as well.
|
58
|
+
#
|
59
|
+
# If a sequence has both T and U present, will warn the user and
|
60
|
+
# keep going. Will return a map with counts of both, however.
|
61
|
+
#
|
62
|
+
# @example Get base counts of DNA sequence without ambiguous bases
|
63
|
+
# Sequence.new('AcTGn').base_counts
|
64
|
+
# #=> { a: 1, c: 1, t: 1, g: 1 }
|
65
|
+
# @example Get base counts of DNA sequence with ambiguous bases
|
66
|
+
# Sequence.new('AcTGn').base_counts(true)
|
67
|
+
# #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
|
68
|
+
# @example Get base counts of RNA sequence without ambiguous bases
|
69
|
+
# Sequence.new('AcUGn').base_counts
|
70
|
+
# #=> { a: 1, c: 1, u: 1, g: 1 }
|
71
|
+
# @example Get base counts of RNA sequence with ambiguous bases
|
72
|
+
# Sequence.new('AcUGn').base_counts(true)
|
73
|
+
# #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
|
74
|
+
#
|
75
|
+
# @return [Hash] A hash with base as key, count as value
|
76
|
+
def base_counts(count_ambiguous_bases=nil)
|
77
|
+
s = self.downcase
|
78
|
+
t = s.count('t')
|
79
|
+
u = s.count('u')
|
80
|
+
counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
|
81
|
+
|
82
|
+
if t > 0 && u == 0
|
83
|
+
counts[:t] = t
|
84
|
+
elsif t == 0 && u > 0
|
85
|
+
counts[:u] = u
|
86
|
+
elsif t > 0 && u > 0
|
87
|
+
warn('ERROR: A sequence contains both T and U')
|
88
|
+
counts[:t], counts[:u] = t, u
|
89
|
+
end
|
90
|
+
|
91
|
+
counts[:n] = s.count('n') if count_ambiguous_bases
|
92
|
+
|
93
|
+
counts
|
94
|
+
end
|
95
|
+
|
96
|
+
# Returns a map of base frequencies
|
97
|
+
#
|
98
|
+
# Counts bases with the `base_counts` method, then divides each
|
99
|
+
# count by the total bases counted to give frequency for each
|
100
|
+
# base. If a truthy argument is given, ambiguous bases will be
|
101
|
+
# included in the total and their frequency reported. Can discern
|
102
|
+
# between DNA and RNA.
|
103
|
+
#
|
104
|
+
# If default or falsy argument is given, ambiguous bases will not be
|
105
|
+
# counted in the total base count and their frequency will not be
|
106
|
+
# given.
|
107
|
+
#
|
108
|
+
# @example Get base frequencies of DNA sequence without ambiguous bases
|
109
|
+
# Sequence.new('AcTGn').base_frequencies
|
110
|
+
# #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
|
111
|
+
# @example Get base frequencies of DNA sequence with ambiguous bases
|
112
|
+
# Sequence.new('AcTGn').base_frequencies(true)
|
113
|
+
# #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
|
114
|
+
#
|
115
|
+
# @return [Hash] A hash with base as key, frequency as value
|
116
|
+
def base_frequencies(count_ambiguous_bases=nil)
|
117
|
+
base_counts = self.base_counts(count_ambiguous_bases)
|
118
|
+
total_bases = base_counts.values.reduce(:+).to_f
|
119
|
+
base_freqs =
|
120
|
+
base_counts.map { |base, count| [base, count/total_bases] }.flatten
|
121
|
+
Hash[*base_freqs]
|
122
|
+
end
|
52
123
|
end
|
data/lib/parse_fasta/version.rb
CHANGED
data/parse_fasta.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Ryan Moore"]
|
10
10
|
spec.email = ["moorer@udel.edu"]
|
11
11
|
spec.summary = %q{Easy-peasy parsing of fasta files}
|
12
|
-
spec.description = %q{So you want to parse a fasta file...}
|
12
|
+
spec.description = %q{So you want to parse a fasta or fastq file...}
|
13
13
|
spec.homepage = "https://github.com/mooreryan/parse_fasta"
|
14
14
|
spec.license = "GPLv3: http://www.gnu.org/licenses/gpl.txt"
|
15
15
|
|
data/spec/lib/sequence_spec.rb
CHANGED
@@ -53,4 +53,71 @@ describe Sequence do
|
|
53
53
|
end
|
54
54
|
end
|
55
55
|
end
|
56
|
+
|
57
|
+
describe "#base_counts" do
|
58
|
+
context "for a DNA sequence with default or falsy argument" do
|
59
|
+
it "returns a map of A, C, T, and G counts" do
|
60
|
+
s = Sequence.new('ACTGactg')
|
61
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, g: 2 })
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
context "for a DNA sequence with truthy argument" do
|
66
|
+
it "returns a map of A, C, T, G and N counts" do
|
67
|
+
s = Sequence.new('ACTGNactgn')
|
68
|
+
expect(s.base_counts(1)).to eq({ a: 2, c: 2, t: 2, g: 2, n: 2 })
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "for an RNA sequence with falsy or default argument" do
|
73
|
+
it "returns a map of A, C, U, G counts" do
|
74
|
+
s = Sequence.new('ACUGacug')
|
75
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, u: 2, g: 2 })
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
context "for an RNA sequence with truthy argument" do
|
80
|
+
it "returns a map of A, C, U, G and N counts" do
|
81
|
+
s = Sequence.new('ACUGNacugn')
|
82
|
+
expect(s.base_counts(1)).to eq({ a: 2, c: 2, u: 2, g: 2, n: 2 })
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context "for a sequence with both U and T present" do
|
87
|
+
s = Sequence.new('AaCcTtGgNnUu')
|
88
|
+
err_message = 'ERROR: A sequence contains both T and U'
|
89
|
+
|
90
|
+
it "warns the user about having both U and T present" do
|
91
|
+
expect(s).to receive(:warn).with(err_message)
|
92
|
+
s.base_counts
|
93
|
+
end
|
94
|
+
|
95
|
+
it "returns a map that counts both U's and T's" do
|
96
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, u: 2, g: 2 })
|
97
|
+
end
|
98
|
+
|
99
|
+
it "returns a map with T, U and N if truthy argument given" do
|
100
|
+
base_counts = { a: 2, c: 2, t: 2, u: 2, g: 2, n: 2 }
|
101
|
+
expect(s.base_counts(1)).to eq(base_counts)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe "#base_frequencies" do
|
107
|
+
context "with falsy argument" do
|
108
|
+
it "doesn't count ambiguous bases in total bases" do
|
109
|
+
s = Sequence.new('ACTTn')
|
110
|
+
base_freqs = { a: 0.25, c: 0.25, t: 0.5, g: 0.0 }
|
111
|
+
expect(s.base_frequencies).to eq(base_freqs)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
context "when counting ambiguous bases" do
|
116
|
+
it "does count ambiguous bases in total bases" do
|
117
|
+
s = Sequence.new('ACTTn')
|
118
|
+
base_freqs = { a: 0.2, c: 0.2, t: 0.4, g: 0.0, n: 0.2 }
|
119
|
+
expect(s.base_frequencies(1)).to eq(base_freqs)
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
56
123
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,7 +80,7 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.8'
|
83
|
-
description: So you want to parse a fasta file...
|
83
|
+
description: So you want to parse a fasta or fastq file...
|
84
84
|
email:
|
85
85
|
- moorer@udel.edu
|
86
86
|
executables: []
|