parse_fasta 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/lib/parse_fasta/sequence.rb +73 -2
- data/lib/parse_fasta/version.rb +1 -1
- data/parse_fasta.gemspec +1 -1
- data/spec/lib/sequence_spec.rb +67 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 689bddd57bc87d882ad8e8711acc84f0f1c33825
|
4
|
+
data.tar.gz: 85d056f1581a23cc25c1a60224fd2c177266ab23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f68c1541107d1881177606398c63d09b7da2b04d7e0f30b3165ebd51cc7310caddbcba7788f72db71f21c778c5c1f022382a17e4b450cd172946534665f360a1
|
7
|
+
data.tar.gz: eb8a9bf47345d0ee4b164ef053da5ddb4cb6785d0c0a91424b2e4549d9231d2f55ab79f9355e0df8f924d51be7be738a95f5887b6333aa99c4a5a945941f34f3
|
data/README.md
CHANGED
@@ -29,7 +29,7 @@ and over.
|
|
29
29
|
## Documentation ##
|
30
30
|
|
31
31
|
Check out
|
32
|
-
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.
|
32
|
+
[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.4.0/frames)
|
33
33
|
to see the full documentation.
|
34
34
|
|
35
35
|
## Usage ##
|
@@ -58,6 +58,13 @@ Now we can parse fastq files as well!
|
|
58
58
|
|
59
59
|
## Versions ##
|
60
60
|
|
61
|
+
### 1.4.0 ###
|
62
|
+
|
63
|
+
Added methods:
|
64
|
+
|
65
|
+
Sequence.base_counts
|
66
|
+
Sequence.base_frequencies
|
67
|
+
|
61
68
|
### 1.3.0 ###
|
62
69
|
|
63
70
|
Add additional functionality to `each_record` method.
|
data/lib/parse_fasta/sequence.rb
CHANGED
@@ -25,7 +25,8 @@ class Sequence < String
|
|
25
25
|
# Calculates GC content by dividing the count of G + C by the count
|
26
26
|
# of G + C + T + A + U. If there are both T's and U's in the
|
27
27
|
# Sequence, things will get weird, but then again, that wouldn't
|
28
|
-
# happen, now would it!
|
28
|
+
# happen, now would it! Ambiguous bases are ignored similar to
|
29
|
+
# BioRuby.
|
29
30
|
#
|
30
31
|
# @example Get GC of a Sequence
|
31
32
|
# Sequence.new('ACTg').gc #=> 0.5
|
@@ -46,7 +47,77 @@ class Sequence < String
|
|
46
47
|
u = s.count('u')
|
47
48
|
|
48
49
|
return 0 if c + g + t + a + u == 0
|
49
|
-
return (c + g)
|
50
|
+
return (c + g) / (c + g + t + a + u).to_f
|
50
51
|
end
|
51
52
|
|
53
|
+
# Returns a map of base counts
|
54
|
+
#
|
55
|
+
# This method will check if the sequence is DNA or RNA and return a
|
56
|
+
# count map appropriate for each. If a truthy argument is given, the
|
57
|
+
# count of ambiguous bases will be returned as well.
|
58
|
+
#
|
59
|
+
# If a sequence has both T and U present, will warn the user and
|
60
|
+
# keep going. Will return a map with counts of both, however.
|
61
|
+
#
|
62
|
+
# @example Get base counts of DNA sequence without ambiguous bases
|
63
|
+
# Sequence.new('AcTGn').base_counts
|
64
|
+
# #=> { a: 1, c: 1, t: 1, g: 1 }
|
65
|
+
# @example Get base counts of DNA sequence with ambiguous bases
|
66
|
+
# Sequence.new('AcTGn').base_counts(true)
|
67
|
+
# #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
|
68
|
+
# @example Get base counts of RNA sequence without ambiguous bases
|
69
|
+
# Sequence.new('AcUGn').base_counts
|
70
|
+
# #=> { a: 1, c: 1, u: 1, g: 1 }
|
71
|
+
# @example Get base counts of RNA sequence with ambiguous bases
|
72
|
+
# Sequence.new('AcUGn').base_counts(true)
|
73
|
+
# #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
|
74
|
+
#
|
75
|
+
# @return [Hash] A hash with base as key, count as value
|
76
|
+
def base_counts(count_ambiguous_bases=nil)
  # Tallies A, C, G and T/U case-insensitively and returns them as a
  # Hash keyed by lowercase symbol. Each base is counted in both cases
  # directly, so no lowercased copy of the (possibly very large)
  # sequence has to be allocated.
  t = count('tT')
  u = count('uU')
  counts = { a: count('aA'), c: count('cC'), g: count('gG') }

  # A sequence holding both T and U is reported to the user but still
  # counted; both keys end up in the result.
  warn('ERROR: A sequence contains both T and U') if t > 0 && u > 0

  # :t and :u are only added when the base actually occurs, so a DNA
  # sequence has no :u key and an RNA sequence has no :t key.
  counts[:t] = t if t > 0
  counts[:u] = u if u > 0

  # Only 'n' is tallied as an ambiguous base, and only on request.
  counts[:n] = count('nN') if count_ambiguous_bases

  counts
end
|
95
|
+
|
96
|
+
# Returns a map of base frequencies
|
97
|
+
#
|
98
|
+
# Counts bases with the `base_counts` method, then divides each
|
99
|
+
# count by the total bases counted to give frequency for each
|
100
|
+
# base. If a truthy argument is given, ambiguous bases will be
|
101
|
+
# included in the total and their frequency reported. Can discern
|
102
|
+
# between DNA and RNA.
|
103
|
+
#
|
104
|
+
# If default or falsy argument is given, ambiguous bases will not be
|
105
|
+
# counted in the total base count and their frequency will not be
|
106
|
+
# given.
|
107
|
+
#
|
108
|
+
# @example Get base frequencies of DNA sequence without ambiguous bases
|
109
|
+
# Sequence.new('AcTGn').base_frequencies
|
110
|
+
# #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
|
111
|
+
# @example Get base frequencies of DNA sequence with ambiguous bases
|
112
|
+
# Sequence.new('AcTGn').base_frequencies(true)
|
113
|
+
# #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
|
114
|
+
#
|
115
|
+
# @return [Hash] A hash with base as key, frequency as value
|
116
|
+
def base_frequencies(count_ambiguous_bases=nil)
  # Converts the Hash produced by base_counts into a Hash of
  # base => frequency (Float), where each frequency is that base's
  # count divided by the sum of all counts returned. The argument is
  # passed straight through to base_counts, so it decides whether the
  # ambiguous-base count is part of both the result and the total.
  counts = base_counts(count_ambiguous_bases)
  total = counts.values.reduce(0, :+)

  counts.each_with_object({}) do |(base, count), freqs|
    # With nothing counted (e.g. an empty sequence) the division would
    # be 0 / 0.0 and yield NaN; report 0.0 instead, in line with gc
    # returning 0 when there are no bases to count.
    freqs[base] = total.zero? ? 0.0 : count / total.to_f
  end
end
|
52
123
|
end
|
data/lib/parse_fasta/version.rb
CHANGED
data/parse_fasta.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Ryan Moore"]
|
10
10
|
spec.email = ["moorer@udel.edu"]
|
11
11
|
spec.summary = %q{Easy-peasy parsing of fasta files}
|
12
|
-
spec.description = %q{So you want to parse a fasta file...}
|
12
|
+
spec.description = %q{So you want to parse a fasta or fastq file...}
|
13
13
|
spec.homepage = "https://github.com/mooreryan/parse_fasta"
|
14
14
|
spec.license = "GPLv3: http://www.gnu.org/licenses/gpl.txt"
|
15
15
|
|
data/spec/lib/sequence_spec.rb
CHANGED
@@ -53,4 +53,71 @@ describe Sequence do
|
|
53
53
|
end
|
54
54
|
end
|
55
55
|
end
|
56
|
+
|
57
|
+
describe "#base_counts" do
|
58
|
+
context "for a DNA sequence with default or falsy argument" do
|
59
|
+
it "returns a map of A, C, T, and G counts" do
|
60
|
+
s = Sequence.new('ACTGactg')
|
61
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, g: 2 })
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
context "for a DNA sequence with truthy argument" do
|
66
|
+
it "returns a map of A, C, T, G and N counts" do
|
67
|
+
s = Sequence.new('ACTGNactgn')
|
68
|
+
expect(s.base_counts(1)).to eq({ a: 2, c: 2, t: 2, g: 2, n: 2 })
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "for an RNA sequence with falsy or default argument" do
|
73
|
+
it "returns a map of A, C, U, G counts" do
|
74
|
+
s = Sequence.new('ACUGacug')
|
75
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, u: 2, g: 2 })
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
context "for an RNA sequence with truthy argument" do
|
80
|
+
it "returns a map of A, C, U, G and N counts" do
|
81
|
+
s = Sequence.new('ACUGNacugn')
|
82
|
+
expect(s.base_counts(1)).to eq({ a: 2, c: 2, u: 2, g: 2, n: 2 })
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context "for a sequence with both U and T present" do
|
87
|
+
s = Sequence.new('AaCcTtGgNnUu')
|
88
|
+
err_message = 'ERROR: A sequence contains both T and U'
|
89
|
+
|
90
|
+
it "warns the user about having both U and T present" do
|
91
|
+
expect(s).to receive(:warn).with(err_message)
|
92
|
+
s.base_counts
|
93
|
+
end
|
94
|
+
|
95
|
+
it "returns a map that counts both U's and T's" do
|
96
|
+
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, u: 2, g: 2 })
|
97
|
+
end
|
98
|
+
|
99
|
+
it "returns a map with T, U and N if truthy argument given" do
|
100
|
+
base_counts = { a: 2, c: 2, t: 2, u: 2, g: 2, n: 2 }
|
101
|
+
expect(s.base_counts(1)).to eq(base_counts)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe "#base_frequencies" do
|
107
|
+
context "with falsy argument" do
|
108
|
+
it "doesn't count ambiguous bases in total bases" do
|
109
|
+
s = Sequence.new('ACTTn')
|
110
|
+
base_freqs = { a: 0.25, c: 0.25, t: 0.5, g: 0.0 }
|
111
|
+
expect(s.base_frequencies).to eq(base_freqs)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
context "when counting ambiguous bases" do
|
116
|
+
it "does count ambiguous bases in total bases" do
|
117
|
+
s = Sequence.new('ACTTn')
|
118
|
+
base_freqs = { a: 0.2, c: 0.2, t: 0.4, g: 0.0, n: 0.2 }
|
119
|
+
expect(s.base_frequencies(1)).to eq(base_freqs)
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
56
123
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,7 +80,7 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.8'
|
83
|
-
description: So you want to parse a fasta file...
|
83
|
+
description: So you want to parse a fasta or fastq file...
|
84
84
|
email:
|
85
85
|
- moorer@udel.edu
|
86
86
|
executables: []
|