parse_fasta 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ae03847d56379d572d6118a176876a1e11a21a49
4
- data.tar.gz: ffb84a464f8f7057f3a363b2b872f4b60fcca9c2
3
+ metadata.gz: 689bddd57bc87d882ad8e8711acc84f0f1c33825
4
+ data.tar.gz: 85d056f1581a23cc25c1a60224fd2c177266ab23
5
5
  SHA512:
6
- metadata.gz: 0b6b3694de307b868df3b1d5b38b08e539e545827d8d5f47a8b41a550f60d6354164b0e563b6eacf39ab63bd844c2ab68578fc0bfbc5fd5a1f3a63b31a09cfc9
7
- data.tar.gz: 719f6fb5c112b06ecf969662617de3bdc02156a09fcb542ff0a9553de55f2d7d6400fc2d6384be27c780517b1ab95887991a8a22c9d05ab3070e7d0e2b05962f
6
+ metadata.gz: f68c1541107d1881177606398c63d09b7da2b04d7e0f30b3165ebd51cc7310caddbcba7788f72db71f21c778c5c1f022382a17e4b450cd172946534665f360a1
7
+ data.tar.gz: eb8a9bf47345d0ee4b164ef053da5ddb4cb6785d0c0a91424b2e4549d9231d2f55ab79f9355e0df8f924d51be7be738a95f5887b6333aa99c4a5a945941f34f3
data/README.md CHANGED
@@ -29,7 +29,7 @@ and over.
29
29
  ## Documentation ##
30
30
 
31
31
  Check out
32
- [parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.3.0/frames)
32
+ [parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.4.0/frames)
33
33
  to see the full documentation.
34
34
 
35
35
  ## Usage ##
@@ -58,6 +58,13 @@ Now we can parse fastq files as well!
58
58
 
59
59
  ## Versions ##
60
60
 
61
+ ### 1.4.0 ###
62
+
63
+ Added methods:
64
+
65
+ Sequence#base_counts
66
+ Sequence#base_frequencies
67
+
61
68
  ### 1.3.0 ###
62
69
 
63
70
  Add additional functionality to `each_record` method.
@@ -25,7 +25,8 @@ class Sequence < String
25
25
  # Calculates GC content as the count of G + C divided by the count
26
26
  # of G + C + T + A + U. If there are both T's and U's in the
27
27
  # Sequence, things will get weird, but then again, that wouldn't
28
- # happen, now would it!
28
+ # happen, now would it! Ambiguous bases are ignored similar to
29
+ # BioRuby.
29
30
  #
30
31
  # @example Get GC of a Sequence
31
32
  # Sequence.new('ACTg').gc #=> 0.5
@@ -46,7 +47,77 @@ class Sequence < String
46
47
  u = s.count('u')
47
48
 
48
49
  return 0 if c + g + t + a + u == 0
49
- return (c + g).quo(c + g + t + a + u).to_f
50
+ return (c + g) / (c + g + t + a + u).to_f
50
51
  end
51
52
 
53
+ # Returns a map of base counts
54
+ #
55
+ # This method will check if the sequence is DNA or RNA and return a
56
+ # count map appropriate for each. If a truthy argument is given, the
57
+ # count of ambiguous bases will be returned as well.
58
+ #
59
+ # If a sequence has both T and U present, will warn the user and
60
+ # keep going. Will return a map with counts of both, however.
61
+ #
62
+ # @example Get base counts of DNA sequence without ambiguous bases
63
+ # Sequence.new('AcTGn').base_counts
64
+ # #=> { a: 1, c: 1, t: 1, g: 1 }
65
+ # @example Get base counts of DNA sequence with ambiguous bases
66
+ # Sequence.new('AcTGn').base_counts(true)
67
+ # #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
68
+ # @example Get base counts of RNA sequence without ambiguous bases
69
+ # Sequence.new('AcUGn').base_counts
70
+ # #=> { a: 1, c: 1, u: 1, g: 1 }
71
+ # @example Get base counts of RNA sequence with ambiguous bases
72
+ # Sequence.new('AcUGn').base_counts(true)
73
+ # #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
74
+ #
75
+ # @return [Hash] A hash with base as key, count as value
76
+ def base_counts(count_ambiguous_bases=nil)
77
+ s = self.downcase
78
+ t = s.count('t')
79
+ u = s.count('u')
80
+ counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
81
+
82
+ if t > 0 && u == 0
83
+ counts[:t] = t
84
+ elsif t == 0 && u > 0
85
+ counts[:u] = u
86
+ elsif t > 0 && u > 0
87
+ warn('ERROR: A sequence contains both T and U')
88
+ counts[:t], counts[:u] = t, u
89
+ end
90
+
91
+ counts[:n] = s.count('n') if count_ambiguous_bases
92
+
93
+ counts
94
+ end
95
+
96
+ # Returns a map of base frequencies
97
+ #
98
+ # Counts bases with the `base_counts` method, then divides each
99
+ # count by the total bases counted to give frequency for each
100
+ # base. If a truthy argument is given, ambiguous bases will be
101
+ # included in the total and their frequency reported. Can discern
102
+ # between DNA and RNA.
103
+ #
104
+ # If default or falsy argument is given, ambiguous bases will not be
105
+ # counted in the total base count and their frequency will not be
106
+ # given.
107
+ #
108
+ # @example Get base frequencies of DNA sequence without ambiguous bases
109
+ # Sequence.new('AcTGn').base_frequencies
110
+ # #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
111
+ # @example Get base frequencies of DNA sequence with ambiguous bases
112
+ # Sequence.new('AcTGn').base_frequencies(true)
113
+ # #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
114
+ #
115
+ # @return [Hash] A hash with base as key, frequency as value
116
+ def base_frequencies(count_ambiguous_bases=nil)
117
+ base_counts = self.base_counts(count_ambiguous_bases)
118
+ total_bases = base_counts.values.reduce(:+).to_f
119
+ base_freqs =
120
+ base_counts.map { |base, count| [base, count/total_bases] }.flatten
121
+ Hash[*base_freqs]
122
+ end
52
123
  end
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.3.0"
20
+ VERSION = "1.4.0"
21
21
  end
data/parse_fasta.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Ryan Moore"]
10
10
  spec.email = ["moorer@udel.edu"]
11
11
  spec.summary = %q{Easy-peasy parsing of fasta files}
12
- spec.description = %q{So you want to parse a fasta file...}
12
+ spec.description = %q{So you want to parse a fasta or fastq file...}
13
13
  spec.homepage = "https://github.com/mooreryan/parse_fasta"
14
14
  spec.license = "GPLv3: http://www.gnu.org/licenses/gpl.txt"
15
15
 
@@ -53,4 +53,71 @@ describe Sequence do
53
53
  end
54
54
  end
55
55
  end
56
+
57
+ describe "#base_counts" do
58
+ context "for a DNA sequence with default or falsy argument" do
59
+ it "returns a map of A, C, T, and G counts" do
60
+ s = Sequence.new('ACTGactg')
61
+ expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, g: 2 })
62
+ end
63
+ end
64
+
65
+ context "for a DNA sequence with truthy argument" do
66
+ it "returns a map of A, C, T, G and N counts" do
67
+ s = Sequence.new('ACTGNactgn')
68
+ expect(s.base_counts(1)).to eq({ a: 2, c: 2, t: 2, g: 2, n: 2 })
69
+ end
70
+ end
71
+
72
+ context "for an RNA sequence with falsy or default argument" do
73
+ it "returns a map of A, C, U, G counts" do
74
+ s = Sequence.new('ACUGacug')
75
+ expect(s.base_counts).to eq({ a: 2, c: 2, u: 2, g: 2 })
76
+ end
77
+ end
78
+
79
+ context "for an RNA sequence with truthy argument" do
80
+ it "returns a map of A, C, U, G and N counts" do
81
+ s = Sequence.new('ACUGNacugn')
82
+ expect(s.base_counts(1)).to eq({ a: 2, c: 2, u: 2, g: 2, n: 2 })
83
+ end
84
+ end
85
+
86
+ context "for a sequence with both U and T present" do
87
+ s = Sequence.new('AaCcTtGgNnUu')
88
+ err_message = 'ERROR: A sequence contains both T and U'
89
+
90
+ it "warns the user about having both U and T present" do
91
+ expect(s).to receive(:warn).with(err_message)
92
+ s.base_counts
93
+ end
94
+
95
+ it "returns a map that counts both U's and T's" do
96
+ expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, u: 2, g: 2 })
97
+ end
98
+
99
+ it "returns a map with T, U and N if truthy argument given" do
100
+ base_counts = { a: 2, c: 2, t: 2, u: 2, g: 2, n: 2 }
101
+ expect(s.base_counts(1)).to eq(base_counts)
102
+ end
103
+ end
104
+ end
105
+
106
+ describe "#base_frequencies" do
107
+ context "with falsy argument" do
108
+ it "doesn't count ambiguous bases in total bases" do
109
+ s = Sequence.new('ACTTn')
110
+ base_freqs = { a: 0.25, c: 0.25, t: 0.5, g: 0.0 }
111
+ expect(s.base_frequencies).to eq(base_freqs)
112
+ end
113
+ end
114
+
115
+ context "when counting ambiguous bases" do
116
+ it "does count ambiguous bases in total bases" do
117
+ s = Sequence.new('ACTTn')
118
+ base_freqs = { a: 0.2, c: 0.2, t: 0.4, g: 0.0, n: 0.2 }
119
+ expect(s.base_frequencies(1)).to eq(base_freqs)
120
+ end
121
+ end
122
+ end
56
123
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-24 00:00:00.000000000 Z
11
+ date: 2014-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,7 +80,7 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0.8'
83
- description: So you want to parse a fasta file...
83
+ description: So you want to parse a fasta or fastq file...
84
84
  email:
85
85
  - moorer@udel.edu
86
86
  executables: []