parse_fasta 1.9.1 → 1.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +9 -46
- data/lib/parse_fasta/fasta_file.rb +44 -8
- data/lib/parse_fasta/version.rb +1 -1
- data/test_files/benchmark.rb +13 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NmM5ZWYwOGM5YWIxMzU2YjBmZTk4Y2I5YzI0NjY0MzUwM2YwMjgyOA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NzI2NDY1MWZmYmUwNDUxMTk2MmI4YjgwYWVlYjcyZDI4MDUzMzk4NA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODY1ZTQ1MzU4MTc2MDhhMjA0OThiYzM4Yzk4YjJiZjU4ZGY4MGM5NTRjYTE5
|
10
|
+
OWZkODk0M2ZmODE5ODY1MjE3NTQ5MzgyNTFjMTk2NzU2NGVjN2NkNGUzYzA3
|
11
|
+
ODliNjRlOGJjOGJhNjhlMWZmMmU1NjkyMjgwNzAyODQ1MDExOTI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YWY4NWU3NDFiYTVmMmE1Y2MxMDI3ZjE3NTIyY2Q1N2Q2ZDQxM2ZlZjI4NjUy
|
14
|
+
MWM5OTZhNzEzZWNmMGVlYTQ1MDc1MzViMDBkOTQ0YzQyY2IxYjlmOGQwNzRh
|
15
|
+
YmIyOTg2Yjk0OTFlNWVhOGU3MTMzM2I1ZGY0ZjlkMzExZGNkZDk=
|
data/README.md
CHANGED
@@ -66,6 +66,10 @@ Read fasta file into a hash.
|
|
66
66
|
|
67
67
|
## Versions ##
|
68
68
|
|
69
|
+
### 1.9.2 ###
|
70
|
+
|
71
|
+
Speed up fastA `each_record` and `each_record_fast`.
|
72
|
+
|
69
73
|
### 1.9.1 ###
|
70
74
|
|
71
75
|
Speed up fastQ `each_record` and `each_record_fast`. Courtesy of
|
@@ -221,60 +225,19 @@ Last version with File monkey patch.
|
|
221
225
|
|
222
226
|
## Benchmark ##
|
223
227
|
|
224
|
-
**NOTE**: These benchmarks are against an older version of
|
225
|
-
`parse_fasta`.
|
226
|
-
|
227
228
|
Some quick and dirty benchmarks against `BioRuby`.
|
228
229
|
|
229
230
|
### FastaFile#each_record ###
|
230
231
|
|
231
|
-
|
232
|
-
`each_record` method from this gem and using the `FastaFormat` class
|
233
|
-
from BioRuby. You can see the test script in `benchmark.rb`.
|
234
|
-
|
235
|
-
The test file contained 2,009,897 illumina reads and the file size
|
236
|
-
was 1.1 gigabytes. Here are the results from Ruby's `Benchmark` class:
|
232
|
+
You can see the test script in `benchmark.rb`.
|
237
233
|
|
238
|
-
|
239
|
-
parse_fasta
|
240
|
-
|
234
|
+
user system total real
|
235
|
+
parse_fasta 1.920000 0.160000 2.080000 ( 2.145932)
|
236
|
+
parse_fasta fast 1.210000 0.160000 1.370000 ( 1.377770)
|
237
|
+
bioruby 4.330000 0.290000 4.620000 ( 4.655567)
|
241
238
|
|
242
239
|
Hot dog! It's faster :)
|
243
240
|
|
244
|
-
### FastqFile#each_record ###
|
245
|
-
|
246
|
-
The same sequence length test as above, but this time with a fastq
|
247
|
-
file containing 4,000,000 illumina reads.
|
248
|
-
|
249
|
-
user system total real
|
250
|
-
this_fastq 62.610000 1.660000 64.270000 ( 64.389408)
|
251
|
-
bioruby_fastq 165.500000 2.100000 167.600000 (167.969636)
|
252
|
-
|
253
|
-
### Sequence#gc ###
|
254
|
-
|
255
|
-
The test is done on random strings matcing `/[AaCcTtGgUu]/`. `this_gc`
|
256
|
-
is `Sequence.new(str).gc`, and `bioruby_gc` is
|
257
|
-
`Bio::Sequence::NA.new(str).gc_content`.
|
258
|
-
|
259
|
-
To see how the methods scales, the test 1 string was 2,000,000 bases,
|
260
|
-
test 2 was 4,000,000 and test 3 was 8,000,000 bases.
|
261
|
-
|
262
|
-
user system total real
|
263
|
-
this_gc 1 0.030000 0.000000 0.030000 ( 0.029145)
|
264
|
-
bioruby_gc 1 2.030000 0.010000 2.040000 ( 2.157512)
|
265
|
-
|
266
|
-
this_gc 2 0.060000 0.000000 0.060000 ( 0.059408)
|
267
|
-
bioruby_gc 2 4.060000 0.020000 4.080000 ( 4.334159)
|
268
|
-
|
269
|
-
this_gc 3 0.120000 0.000000 0.120000 ( 0.185434)
|
270
|
-
bioruby_gc 3 8.060000 0.020000 8.080000 ( 8.659071)
|
271
|
-
|
272
|
-
Nice!
|
273
|
-
|
274
|
-
Troll: "When will you find the GC of an 8,000,000 base sequence?"
|
275
|
-
|
276
|
-
Me: "Step off, troll!"
|
277
|
-
|
278
241
|
## Notes ##
|
279
242
|
|
280
243
|
Only the `SeqFile` class actually checks to make sure that you passed
|
data/lib/parse_fasta/fasta_file.rb
CHANGED
@@ -116,10 +116,28 @@ class FastaFile < File
|
|
116
116
|
# end
|
117
117
|
# end
|
118
118
|
else
|
119
|
-
|
120
|
-
|
121
|
-
|
119
|
+
header = ""
|
120
|
+
sequence = ""
|
121
|
+
f.each_line do |line|
|
122
|
+
line.chomp!
|
123
|
+
len = line.length
|
124
|
+
if header.empty? && line.start_with?(">")
|
125
|
+
header = line[1, len]
|
126
|
+
elsif line.start_with?(">")
|
127
|
+
yield(header.strip, Sequence.new(sequence || ""))
|
128
|
+
header = line[1, len]
|
129
|
+
sequence = ""
|
130
|
+
else
|
131
|
+
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
132
|
+
sequence << line
|
133
|
+
end
|
122
134
|
end
|
135
|
+
yield(header, Sequence.new(sequence || ""))
|
136
|
+
|
137
|
+
# f.each("\n>") do |line|
|
138
|
+
# header, sequence = parse_line(line)
|
139
|
+
# yield(header.strip, Sequence.new(sequence || ""))
|
140
|
+
# end
|
123
141
|
|
124
142
|
# f.each_with_index(sep=/^>/) do |line, idx|
|
125
143
|
# if idx.zero?
|
@@ -161,13 +179,31 @@ class FastaFile < File
|
|
161
179
|
f = self
|
162
180
|
end
|
163
181
|
|
164
|
-
|
165
|
-
|
182
|
+
header = ""
|
183
|
+
sequence = ""
|
184
|
+
f.each_line do |line|
|
185
|
+
line.chomp!
|
186
|
+
len = line.length
|
187
|
+
if header.empty? && line.start_with?(">")
|
188
|
+
header = line[1, len]
|
189
|
+
elsif line.start_with?(">")
|
190
|
+
yield(header.strip, sequence)
|
191
|
+
header = line[1, len]
|
192
|
+
sequence = ""
|
193
|
+
else
|
194
|
+
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
195
|
+
sequence << line
|
196
|
+
end
|
197
|
+
end
|
198
|
+
yield(header, sequence)
|
166
199
|
|
167
|
-
|
200
|
+
# f.each("\n>") do |line|
|
201
|
+
# header, sequence = parse_line(line)
|
168
202
|
|
169
|
-
|
170
|
-
|
203
|
+
# raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
204
|
+
|
205
|
+
# yield(header.strip, sequence)
|
206
|
+
# end
|
171
207
|
|
172
208
|
f.close if f.instance_of?(Zlib::GzipReader)
|
173
209
|
return f
|
data/lib/parse_fasta/version.rb
CHANGED
data/test_files/benchmark.rb
CHANGED
@@ -28,16 +28,23 @@ def this_parse_fasta fname
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
+
def this_parse_fasta_fast fname
|
32
|
+
FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
|
33
|
+
[header, sequence.length].join("\t")
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
31
37
|
def bioruby_parse_fasta fname
|
32
38
|
Bio::FastaFormat.open(fname).each do |entry|
|
33
39
|
[entry.definition, entry.seq.length].join("\t")
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
43
|
+
Benchmark.bmbm do |x|
|
44
|
+
x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
|
45
|
+
x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
|
46
|
+
x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
|
47
|
+
end
|
41
48
|
|
42
49
|
####
|
43
50
|
|
@@ -72,8 +79,8 @@ end
|
|
72
79
|
# fastq = ARGV.first
|
73
80
|
|
74
81
|
def bioruby_fastq(fastq)
|
75
|
-
Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
|
76
|
-
fq.each do |entry|
|
82
|
+
Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
|
83
|
+
fq.each do |entry|
|
77
84
|
[entry.definition, entry.seq.length].join("\t")
|
78
85
|
end
|
79
86
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.1
|
4
|
+
version: 1.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|