parse_fasta 1.9.1 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +9 -46
- data/lib/parse_fasta/fasta_file.rb +44 -8
- data/lib/parse_fasta/version.rb +1 -1
- data/test_files/benchmark.rb +13 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NmM5ZWYwOGM5YWIxMzU2YjBmZTk4Y2I5YzI0NjY0MzUwM2YwMjgyOA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NzI2NDY1MWZmYmUwNDUxMTk2MmI4YjgwYWVlYjcyZDI4MDUzMzk4NA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODY1ZTQ1MzU4MTc2MDhhMjA0OThiYzM4Yzk4YjJiZjU4ZGY4MGM5NTRjYTE5
|
10
|
+
OWZkODk0M2ZmODE5ODY1MjE3NTQ5MzgyNTFjMTk2NzU2NGVjN2NkNGUzYzA3
|
11
|
+
ODliNjRlOGJjOGJhNjhlMWZmMmU1NjkyMjgwNzAyODQ1MDExOTI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YWY4NWU3NDFiYTVmMmE1Y2MxMDI3ZjE3NTIyY2Q1N2Q2ZDQxM2ZlZjI4NjUy
|
14
|
+
MWM5OTZhNzEzZWNmMGVlYTQ1MDc1MzViMDBkOTQ0YzQyY2IxYjlmOGQwNzRh
|
15
|
+
YmIyOTg2Yjk0OTFlNWVhOGU3MTMzM2I1ZGY0ZjlkMzExZGNkZDk=
|
data/README.md
CHANGED
@@ -66,6 +66,10 @@ Read fasta file into a hash.
|
|
66
66
|
|
67
67
|
## Versions ##
|
68
68
|
|
69
|
+
### 1.9.2 ###
|
70
|
+
|
71
|
+
Speed up fastA `each_record` and `each_record_fast`.
|
72
|
+
|
69
73
|
### 1.9.1 ###
|
70
74
|
|
71
75
|
Speed up fastQ `each_record` and `each_record_fast`. Courtesy of
|
@@ -221,60 +225,19 @@ Last version with File monkey patch.
|
|
221
225
|
|
222
226
|
## Benchmark ##
|
223
227
|
|
224
|
-
**NOTE**: These benchmarks are against an older version of
|
225
|
-
`parse_fasta`.
|
226
|
-
|
227
228
|
Some quick and dirty benchmarks against `BioRuby`.
|
228
229
|
|
229
230
|
### FastaFile#each_record ###
|
230
231
|
|
231
|
-
|
232
|
-
`each_record` method from this gem and using the `FastaFormat` class
|
233
|
-
from BioRuby. You can see the test script in `benchmark.rb`.
|
234
|
-
|
235
|
-
The test file contained 2,009,897 illumina reads and the file size
|
236
|
-
was 1.1 gigabytes. Here are the results from Ruby's `Benchmark` class:
|
232
|
+
You can see the test script in `benchmark.rb`.
|
237
233
|
|
238
|
-
|
239
|
-
parse_fasta
|
240
|
-
|
234
|
+
user system total real
|
235
|
+
parse_fasta 1.920000 0.160000 2.080000 ( 2.145932)
|
236
|
+
parse_fasta fast 1.210000 0.160000 1.370000 ( 1.377770)
|
237
|
+
bioruby 4.330000 0.290000 4.620000 ( 4.655567)
|
241
238
|
|
242
239
|
Hot dog! It's faster :)
|
243
240
|
|
244
|
-
### FastqFile#each_record ###
|
245
|
-
|
246
|
-
The same sequence length test as above, but this time with a fastq
|
247
|
-
file containing 4,000,000 illumina reads.
|
248
|
-
|
249
|
-
user system total real
|
250
|
-
this_fastq 62.610000 1.660000 64.270000 ( 64.389408)
|
251
|
-
bioruby_fastq 165.500000 2.100000 167.600000 (167.969636)
|
252
|
-
|
253
|
-
### Sequence#gc ###
|
254
|
-
|
255
|
-
The test is done on random strings matcing `/[AaCcTtGgUu]/`. `this_gc`
|
256
|
-
is `Sequence.new(str).gc`, and `bioruby_gc` is
|
257
|
-
`Bio::Sequence::NA.new(str).gc_content`.
|
258
|
-
|
259
|
-
To see how the methods scales, the test 1 string was 2,000,000 bases,
|
260
|
-
test 2 was 4,000,000 and test 3 was 8,000,000 bases.
|
261
|
-
|
262
|
-
user system total real
|
263
|
-
this_gc 1 0.030000 0.000000 0.030000 ( 0.029145)
|
264
|
-
bioruby_gc 1 2.030000 0.010000 2.040000 ( 2.157512)
|
265
|
-
|
266
|
-
this_gc 2 0.060000 0.000000 0.060000 ( 0.059408)
|
267
|
-
bioruby_gc 2 4.060000 0.020000 4.080000 ( 4.334159)
|
268
|
-
|
269
|
-
this_gc 3 0.120000 0.000000 0.120000 ( 0.185434)
|
270
|
-
bioruby_gc 3 8.060000 0.020000 8.080000 ( 8.659071)
|
271
|
-
|
272
|
-
Nice!
|
273
|
-
|
274
|
-
Troll: "When will you find the GC of an 8,000,000 base sequence?"
|
275
|
-
|
276
|
-
Me: "Step off, troll!"
|
277
|
-
|
278
241
|
## Notes ##
|
279
242
|
|
280
243
|
Only the `SeqFile` class actually checks to make sure that you passed
|
data/lib/parse_fasta/fasta_file.rb
CHANGED
@@ -116,10 +116,28 @@ class FastaFile < File
|
|
116
116
|
# end
|
117
117
|
# end
|
118
118
|
else
|
119
|
-
|
120
|
-
|
121
|
-
|
119
|
+
header = ""
|
120
|
+
sequence = ""
|
121
|
+
f.each_line do |line|
|
122
|
+
line.chomp!
|
123
|
+
len = line.length
|
124
|
+
if header.empty? && line.start_with?(">")
|
125
|
+
header = line[1, len]
|
126
|
+
elsif line.start_with?(">")
|
127
|
+
yield(header.strip, Sequence.new(sequence || ""))
|
128
|
+
header = line[1, len]
|
129
|
+
sequence = ""
|
130
|
+
else
|
131
|
+
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
132
|
+
sequence << line
|
133
|
+
end
|
122
134
|
end
|
135
|
+
yield(header, Sequence.new(sequence || ""))
|
136
|
+
|
137
|
+
# f.each("\n>") do |line|
|
138
|
+
# header, sequence = parse_line(line)
|
139
|
+
# yield(header.strip, Sequence.new(sequence || ""))
|
140
|
+
# end
|
123
141
|
|
124
142
|
# f.each_with_index(sep=/^>/) do |line, idx|
|
125
143
|
# if idx.zero?
|
@@ -161,13 +179,31 @@ class FastaFile < File
|
|
161
179
|
f = self
|
162
180
|
end
|
163
181
|
|
164
|
-
|
165
|
-
|
182
|
+
header = ""
|
183
|
+
sequence = ""
|
184
|
+
f.each_line do |line|
|
185
|
+
line.chomp!
|
186
|
+
len = line.length
|
187
|
+
if header.empty? && line.start_with?(">")
|
188
|
+
header = line[1, len]
|
189
|
+
elsif line.start_with?(">")
|
190
|
+
yield(header.strip, sequence)
|
191
|
+
header = line[1, len]
|
192
|
+
sequence = ""
|
193
|
+
else
|
194
|
+
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
195
|
+
sequence << line
|
196
|
+
end
|
197
|
+
end
|
198
|
+
yield(header, sequence)
|
166
199
|
|
167
|
-
|
200
|
+
# f.each("\n>") do |line|
|
201
|
+
# header, sequence = parse_line(line)
|
168
202
|
|
169
|
-
|
170
|
-
|
203
|
+
# raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
204
|
+
|
205
|
+
# yield(header.strip, sequence)
|
206
|
+
# end
|
171
207
|
|
172
208
|
f.close if f.instance_of?(Zlib::GzipReader)
|
173
209
|
return f
|
data/lib/parse_fasta/version.rb
CHANGED
data/test_files/benchmark.rb
CHANGED
@@ -28,16 +28,23 @@ def this_parse_fasta fname
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
+
def this_parse_fasta_fast fname
|
32
|
+
FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
|
33
|
+
[header, sequence.length].join("\t")
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
31
37
|
def bioruby_parse_fasta fname
|
32
38
|
Bio::FastaFormat.open(fname).each do |entry|
|
33
39
|
[entry.definition, entry.seq.length].join("\t")
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
43
|
+
Benchmark.bmbm do |x|
|
44
|
+
x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
|
45
|
+
x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
|
46
|
+
x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
|
47
|
+
end
|
41
48
|
|
42
49
|
####
|
43
50
|
|
@@ -72,8 +79,8 @@ end
|
|
72
79
|
# fastq = ARGV.first
|
73
80
|
|
74
81
|
def bioruby_fastq(fastq)
|
75
|
-
Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
|
76
|
-
fq.each do |entry|
|
82
|
+
Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
|
83
|
+
fq.each do |entry|
|
77
84
|
[entry.definition, entry.seq.length].join("\t")
|
78
85
|
end
|
79
86
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.1
|
4
|
+
version: 1.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|