parse_fasta 1.9.1 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDAyZjJjNDJlMGZkM2Y2YmZmOTNmZTIwOTE2ZmVlMTRlYjhiNThkNA==
4
+ NmM5ZWYwOGM5YWIxMzU2YjBmZTk4Y2I5YzI0NjY0MzUwM2YwMjgyOA==
5
5
  data.tar.gz: !binary |-
6
- ZmQxMDI5MDI4MjEyN2UyMjQ3ZGQwNDU2YzZhYmI0ZTNjMGQ2ZWFiOQ==
6
+ NzI2NDY1MWZmYmUwNDUxMTk2MmI4YjgwYWVlYjcyZDI4MDUzMzk4NA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NDZjNTk2OTUzYmM1MzE3MmJmOWJlMGM0NjQxY2RlODRiY2RiOTNlNWRiNmM1
10
- NTA3OWNmMzdiZTRlNmVlZWZiOTU1ZjhkZjE4YTA3ZjQ5OTFhOWQ3NjViOTM3
11
- N2IwY2I5YjI1MDIwMDczY2M2MzNmZTE4OTdiMzI0NWQ4ODcwMGE=
9
+ ODY1ZTQ1MzU4MTc2MDhhMjA0OThiYzM4Yzk4YjJiZjU4ZGY4MGM5NTRjYTE5
10
+ OWZkODk0M2ZmODE5ODY1MjE3NTQ5MzgyNTFjMTk2NzU2NGVjN2NkNGUzYzA3
11
+ ODliNjRlOGJjOGJhNjhlMWZmMmU1NjkyMjgwNzAyODQ1MDExOTI=
12
12
  data.tar.gz: !binary |-
13
- NmNkZTI0NzE3NjU5M2VmNzgzYTk0MDM1NGM2NTdmNDA2YWY0ZmY2OWRhNTE4
14
- ODE1MzIxYTcyNzhhMTJmMTc5NmE5MjI3OWEyYjUwMzM5NzJhYzAxZjJkYTE1
15
- YTNmMTcyNWE1ZDg1NTNhMTg0ZDU3N2I0Y2E0YzlmOWZhOWRmNmE=
13
+ YWY4NWU3NDFiYTVmMmE1Y2MxMDI3ZjE3NTIyY2Q1N2Q2ZDQxM2ZlZjI4NjUy
14
+ MWM5OTZhNzEzZWNmMGVlYTQ1MDc1MzViMDBkOTQ0YzQyY2IxYjlmOGQwNzRh
15
+ YmIyOTg2Yjk0OTFlNWVhOGU3MTMzM2I1ZGY0ZjlkMzExZGNkZDk=
data/README.md CHANGED
@@ -66,6 +66,10 @@ Read fasta file into a hash.
66
66
 
67
67
  ## Versions ##
68
68
 
69
+ ### 1.9.2 ###
70
+
71
+ Speed up fastA `each_record` and `each_record_fast`.
72
+
69
73
  ### 1.9.1 ###
70
74
 
71
75
  Speed up fastQ `each_record` and `each_record_fast`. Courtesy of
@@ -221,60 +225,19 @@ Last version with File monkey patch.
221
225
 
222
226
  ## Benchmark ##
223
227
 
224
- **NOTE**: These benchmarks are against an older version of
225
- `parse_fasta`.
226
-
227
228
  Some quick and dirty benchmarks against `BioRuby`.
228
229
 
229
230
  ### FastaFile#each_record ###
230
231
 
231
- Calculating sequence length length for each fasta record with both the
232
- `each_record` method from this gem and using the `FastaFormat` class
233
- from BioRuby. You can see the test script in `benchmark.rb`.
234
-
235
- The test file contained 2,009,897 illumina reads and the file size
236
- was 1.1 gigabytes. Here are the results from Ruby's `Benchmark` class:
232
+ You can see the test script in `benchmark.rb`.
237
233
 
238
- user system total real
239
- parse_fasta 64.530000 1.740000 66.270000 ( 67.081502)
240
- bioruby 116.250000 2.260000 118.510000 (120.223710)
234
+ user system total real
235
+ parse_fasta 1.920000 0.160000 2.080000 ( 2.145932)
236
+ parse_fasta fast 1.210000 0.160000 1.370000 ( 1.377770)
237
+ bioruby 4.330000 0.290000 4.620000 ( 4.655567)
241
238
 
242
239
  Hot dog! It's faster :)
243
240
 
244
- ### FastqFile#each_record ###
245
-
246
- The same sequence length test as above, but this time with a fastq
247
- file containing 4,000,000 illumina reads.
248
-
249
- user system total real
250
- this_fastq 62.610000 1.660000 64.270000 ( 64.389408)
251
- bioruby_fastq 165.500000 2.100000 167.600000 (167.969636)
252
-
253
- ### Sequence#gc ###
254
-
255
- The test is done on random strings matcing `/[AaCcTtGgUu]/`. `this_gc`
256
- is `Sequence.new(str).gc`, and `bioruby_gc` is
257
- `Bio::Sequence::NA.new(str).gc_content`.
258
-
259
- To see how the methods scales, the test 1 string was 2,000,000 bases,
260
- test 2 was 4,000,000 and test 3 was 8,000,000 bases.
261
-
262
- user system total real
263
- this_gc 1 0.030000 0.000000 0.030000 ( 0.029145)
264
- bioruby_gc 1 2.030000 0.010000 2.040000 ( 2.157512)
265
-
266
- this_gc 2 0.060000 0.000000 0.060000 ( 0.059408)
267
- bioruby_gc 2 4.060000 0.020000 4.080000 ( 4.334159)
268
-
269
- this_gc 3 0.120000 0.000000 0.120000 ( 0.185434)
270
- bioruby_gc 3 8.060000 0.020000 8.080000 ( 8.659071)
271
-
272
- Nice!
273
-
274
- Troll: "When will you find the GC of an 8,000,000 base sequence?"
275
-
276
- Me: "Step off, troll!"
277
-
278
241
  ## Notes ##
279
242
 
280
243
  Only the `SeqFile` class actually checks to make sure that you passed
@@ -116,10 +116,28 @@ class FastaFile < File
116
116
  # end
117
117
  # end
118
118
  else
119
- f.each("\n>") do |line|
120
- header, sequence = parse_line(line)
121
- yield(header.strip, Sequence.new(sequence || ""))
119
+ header = ""
120
+ sequence = ""
121
+ f.each_line do |line|
122
+ line.chomp!
123
+ len = line.length
124
+ if header.empty? && line.start_with?(">")
125
+ header = line[1, len]
126
+ elsif line.start_with?(">")
127
+ yield(header.strip, Sequence.new(sequence || ""))
128
+ header = line[1, len]
129
+ sequence = ""
130
+ else
131
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
132
+ sequence << line
133
+ end
122
134
  end
135
+ yield(header, Sequence.new(sequence || ""))
136
+
137
+ # f.each("\n>") do |line|
138
+ # header, sequence = parse_line(line)
139
+ # yield(header.strip, Sequence.new(sequence || ""))
140
+ # end
123
141
 
124
142
  # f.each_with_index(sep=/^>/) do |line, idx|
125
143
  # if idx.zero?
@@ -161,13 +179,31 @@ class FastaFile < File
161
179
  f = self
162
180
  end
163
181
 
164
- f.each("\n>") do |line|
165
- header, sequence = parse_line(line)
182
+ header = ""
183
+ sequence = ""
184
+ f.each_line do |line|
185
+ line.chomp!
186
+ len = line.length
187
+ if header.empty? && line.start_with?(">")
188
+ header = line[1, len]
189
+ elsif line.start_with?(">")
190
+ yield(header.strip, sequence)
191
+ header = line[1, len]
192
+ sequence = ""
193
+ else
194
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
195
+ sequence << line
196
+ end
197
+ end
198
+ yield(header, sequence)
166
199
 
167
- raise ParseFasta::SequenceFormatError if sequence.include? ">"
200
+ # f.each("\n>") do |line|
201
+ # header, sequence = parse_line(line)
168
202
 
169
- yield(header.strip, sequence)
170
- end
203
+ # raise ParseFasta::SequenceFormatError if sequence.include? ">"
204
+
205
+ # yield(header.strip, sequence)
206
+ # end
171
207
 
172
208
  f.close if f.instance_of?(Zlib::GzipReader)
173
209
  return f
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.9.1"
20
+ VERSION = "1.9.2"
21
21
  end
@@ -28,16 +28,23 @@ def this_parse_fasta fname
28
28
  end
29
29
  end
30
30
 
31
+ def this_parse_fasta_fast fname
32
+ FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
33
+ [header, sequence.length].join("\t")
34
+ end
35
+ end
36
+
31
37
  def bioruby_parse_fasta fname
32
38
  Bio::FastaFormat.open(fname).each do |entry|
33
39
  [entry.definition, entry.seq.length].join("\t")
34
40
  end
35
41
  end
36
42
 
37
- # Benchmark.bmbm do |x|
38
- # x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
39
- # x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
40
- # end
43
+ Benchmark.bmbm do |x|
44
+ x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
45
+ x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
46
+ x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
47
+ end
41
48
 
42
49
  ####
43
50
 
@@ -72,8 +79,8 @@ end
72
79
  # fastq = ARGV.first
73
80
 
74
81
  def bioruby_fastq(fastq)
75
- Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
76
- fq.each do |entry|
82
+ Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
83
+ fq.each do |entry|
77
84
  [entry.definition, entry.seq.length].join("\t")
78
85
  end
79
86
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.1
4
+ version: 1.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-20 00:00:00.000000000 Z
11
+ date: 2016-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler