parse_fasta 1.9.1 → 1.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDAyZjJjNDJlMGZkM2Y2YmZmOTNmZTIwOTE2ZmVlMTRlYjhiNThkNA==
4
+ NmM5ZWYwOGM5YWIxMzU2YjBmZTk4Y2I5YzI0NjY0MzUwM2YwMjgyOA==
5
5
  data.tar.gz: !binary |-
6
- ZmQxMDI5MDI4MjEyN2UyMjQ3ZGQwNDU2YzZhYmI0ZTNjMGQ2ZWFiOQ==
6
+ NzI2NDY1MWZmYmUwNDUxMTk2MmI4YjgwYWVlYjcyZDI4MDUzMzk4NA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NDZjNTk2OTUzYmM1MzE3MmJmOWJlMGM0NjQxY2RlODRiY2RiOTNlNWRiNmM1
10
- NTA3OWNmMzdiZTRlNmVlZWZiOTU1ZjhkZjE4YTA3ZjQ5OTFhOWQ3NjViOTM3
11
- N2IwY2I5YjI1MDIwMDczY2M2MzNmZTE4OTdiMzI0NWQ4ODcwMGE=
9
+ ODY1ZTQ1MzU4MTc2MDhhMjA0OThiYzM4Yzk4YjJiZjU4ZGY4MGM5NTRjYTE5
10
+ OWZkODk0M2ZmODE5ODY1MjE3NTQ5MzgyNTFjMTk2NzU2NGVjN2NkNGUzYzA3
11
+ ODliNjRlOGJjOGJhNjhlMWZmMmU1NjkyMjgwNzAyODQ1MDExOTI=
12
12
  data.tar.gz: !binary |-
13
- NmNkZTI0NzE3NjU5M2VmNzgzYTk0MDM1NGM2NTdmNDA2YWY0ZmY2OWRhNTE4
14
- ODE1MzIxYTcyNzhhMTJmMTc5NmE5MjI3OWEyYjUwMzM5NzJhYzAxZjJkYTE1
15
- YTNmMTcyNWE1ZDg1NTNhMTg0ZDU3N2I0Y2E0YzlmOWZhOWRmNmE=
13
+ YWY4NWU3NDFiYTVmMmE1Y2MxMDI3ZjE3NTIyY2Q1N2Q2ZDQxM2ZlZjI4NjUy
14
+ MWM5OTZhNzEzZWNmMGVlYTQ1MDc1MzViMDBkOTQ0YzQyY2IxYjlmOGQwNzRh
15
+ YmIyOTg2Yjk0OTFlNWVhOGU3MTMzM2I1ZGY0ZjlkMzExZGNkZDk=
data/README.md CHANGED
@@ -66,6 +66,10 @@ Read fasta file into a hash.
66
66
 
67
67
  ## Versions ##
68
68
 
69
+ ### 1.9.2 ###
70
+
71
+ Speed up fastA `each_record` and `each_record_fast`.
72
+
69
73
  ### 1.9.1 ###
70
74
 
71
75
  Speed up fastQ `each_record` and `each_record_fast`. Courtesy of
@@ -221,60 +225,19 @@ Last version with File monkey patch.
221
225
 
222
226
  ## Benchmark ##
223
227
 
224
- **NOTE**: These benchmarks are against an older version of
225
- `parse_fasta`.
226
-
227
228
  Some quick and dirty benchmarks against `BioRuby`.
228
229
 
229
230
  ### FastaFile#each_record ###
230
231
 
231
- Calculating sequence length for each fasta record with both the
232
- `each_record` method from this gem and using the `FastaFormat` class
233
- from BioRuby. You can see the test script in `benchmark.rb`.
234
-
235
- The test file contained 2,009,897 illumina reads and the file size
236
- was 1.1 gigabytes. Here are the results from Ruby's `Benchmark` class:
232
+ You can see the test script in `benchmark.rb`.
237
233
 
238
- user system total real
239
- parse_fasta 64.530000 1.740000 66.270000 ( 67.081502)
240
- bioruby 116.250000 2.260000 118.510000 (120.223710)
234
+ user system total real
235
+ parse_fasta 1.920000 0.160000 2.080000 ( 2.145932)
236
+ parse_fasta fast 1.210000 0.160000 1.370000 ( 1.377770)
237
+ bioruby 4.330000 0.290000 4.620000 ( 4.655567)
241
238
 
242
239
  Hot dog! It's faster :)
243
240
 
244
- ### FastqFile#each_record ###
245
-
246
- The same sequence length test as above, but this time with a fastq
247
- file containing 4,000,000 illumina reads.
248
-
249
- user system total real
250
- this_fastq 62.610000 1.660000 64.270000 ( 64.389408)
251
- bioruby_fastq 165.500000 2.100000 167.600000 (167.969636)
252
-
253
- ### Sequence#gc ###
254
-
255
- The test is done on random strings matching `/[AaCcTtGgUu]/`. `this_gc`
256
- is `Sequence.new(str).gc`, and `bioruby_gc` is
257
- `Bio::Sequence::NA.new(str).gc_content`.
258
-
259
- To see how the methods scale, the test 1 string was 2,000,000 bases,
260
- test 2 was 4,000,000 and test 3 was 8,000,000 bases.
261
-
262
- user system total real
263
- this_gc 1 0.030000 0.000000 0.030000 ( 0.029145)
264
- bioruby_gc 1 2.030000 0.010000 2.040000 ( 2.157512)
265
-
266
- this_gc 2 0.060000 0.000000 0.060000 ( 0.059408)
267
- bioruby_gc 2 4.060000 0.020000 4.080000 ( 4.334159)
268
-
269
- this_gc 3 0.120000 0.000000 0.120000 ( 0.185434)
270
- bioruby_gc 3 8.060000 0.020000 8.080000 ( 8.659071)
271
-
272
- Nice!
273
-
274
- Troll: "When will you find the GC of an 8,000,000 base sequence?"
275
-
276
- Me: "Step off, troll!"
277
-
278
241
  ## Notes ##
279
242
 
280
243
  Only the `SeqFile` class actually checks to make sure that you passed
@@ -116,10 +116,28 @@ class FastaFile < File
116
116
  # end
117
117
  # end
118
118
  else
119
- f.each("\n>") do |line|
120
- header, sequence = parse_line(line)
121
- yield(header.strip, Sequence.new(sequence || ""))
119
+ header = ""
120
+ sequence = ""
121
+ f.each_line do |line|
122
+ line.chomp!
123
+ len = line.length
124
+ if header.empty? && line.start_with?(">")
125
+ header = line[1, len]
126
+ elsif line.start_with?(">")
127
+ yield(header.strip, Sequence.new(sequence || ""))
128
+ header = line[1, len]
129
+ sequence = ""
130
+ else
131
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
132
+ sequence << line
133
+ end
122
134
  end
135
+ yield(header, Sequence.new(sequence || ""))
136
+
137
+ # f.each("\n>") do |line|
138
+ # header, sequence = parse_line(line)
139
+ # yield(header.strip, Sequence.new(sequence || ""))
140
+ # end
123
141
 
124
142
  # f.each_with_index(sep=/^>/) do |line, idx|
125
143
  # if idx.zero?
@@ -161,13 +179,31 @@ class FastaFile < File
161
179
  f = self
162
180
  end
163
181
 
164
- f.each("\n>") do |line|
165
- header, sequence = parse_line(line)
182
+ header = ""
183
+ sequence = ""
184
+ f.each_line do |line|
185
+ line.chomp!
186
+ len = line.length
187
+ if header.empty? && line.start_with?(">")
188
+ header = line[1, len]
189
+ elsif line.start_with?(">")
190
+ yield(header.strip, sequence)
191
+ header = line[1, len]
192
+ sequence = ""
193
+ else
194
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
195
+ sequence << line
196
+ end
197
+ end
198
+ yield(header, sequence)
166
199
 
167
- raise ParseFasta::SequenceFormatError if sequence.include? ">"
200
+ # f.each("\n>") do |line|
201
+ # header, sequence = parse_line(line)
168
202
 
169
- yield(header.strip, sequence)
170
- end
203
+ # raise ParseFasta::SequenceFormatError if sequence.include? ">"
204
+
205
+ # yield(header.strip, sequence)
206
+ # end
171
207
 
172
208
  f.close if f.instance_of?(Zlib::GzipReader)
173
209
  return f
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.9.1"
20
+ VERSION = "1.9.2"
21
21
  end
@@ -28,16 +28,23 @@ def this_parse_fasta fname
28
28
  end
29
29
  end
30
30
 
31
+ def this_parse_fasta_fast fname
32
+ FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
33
+ [header, sequence.length].join("\t")
34
+ end
35
+ end
36
+
31
37
  def bioruby_parse_fasta fname
32
38
  Bio::FastaFormat.open(fname).each do |entry|
33
39
  [entry.definition, entry.seq.length].join("\t")
34
40
  end
35
41
  end
36
42
 
37
- # Benchmark.bmbm do |x|
38
- # x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
39
- # x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
40
- # end
43
+ Benchmark.bmbm do |x|
44
+ x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
45
+ x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
46
+ x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
47
+ end
41
48
 
42
49
  ####
43
50
 
@@ -72,8 +79,8 @@ end
72
79
  # fastq = ARGV.first
73
80
 
74
81
  def bioruby_fastq(fastq)
75
- Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
76
- fq.each do |entry|
82
+ Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
83
+ fq.each do |entry|
77
84
  [entry.definition, entry.seq.length].join("\t")
78
85
  end
79
86
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.1
4
+ version: 1.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-20 00:00:00.000000000 Z
11
+ date: 2016-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler