bio-kseq 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +65 -0
- data/Rakefile +9 -0
- data/bio-kseq.gemspec +30 -0
- data/ext/seqtk_bindings/extconf.rb +6 -0
- data/ext/seqtk_bindings/seqtk/kseq.h +235 -0
- data/ext/seqtk_bindings/seqtk_bindings.c +100 -0
- data/lib/bio/kseq.rb +24 -0
- data/lib/bio/kseq/version.rb +5 -0
- data/spec/kseq_spec.rb +100 -0
- data/spec/spec_helper.rb +7 -0
- metadata +120 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 887f0a14561673eb7226ee2e01372b637d780c63
|
4
|
+
data.tar.gz: c4fd1b6e2648d0bb3fb54855fb33cb7cb2fdfb68
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0224e47c38a03c2d2e468c8aaef333494a254774bd0261f6f7a32efd59b0e93888187897e6cb3b9ec4df45b53b56cb6fc3702ce833fe3b162ffd2d193e406569
|
7
|
+
data.tar.gz: 73d65945910d96940304586733c8a78c9b6cf8338408596be33fafbbc755542c4af07a50c4f84d48c763765d8c4a84d2a7c970f80e597173d8d168ec680eb354
|
data/.gitignore
ADDED
data/.gitmodules
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Gusev Fedor
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# Bio::Kseq [![Build Status](https://travis-ci.org/gusevfe/bio-kseq.svg?branch=master)](https://travis-ci.org/gusevfe/bio-kseq)
|
2
|
+
|
3
|
+
Ruby bindings for a very fast FASTA/Q parser [kseq.h](https://github.com/lh3/seqtk/blob/master/kseq.h) by Heng Li.
|
4
|
+
|
5
|
+
The default FASTA/Q parser from [BioRuby](http://bioruby.org) is extremely slow. One alternative is to use [bio-faster](https://github.com/fstrozzi/bioruby-faster) but that lacks support for FASTA files. However, `bio-faster` does parse qualities, unlike `bio-kseq`.
|
6
|
+
|
7
|
+
## Timings
|
8
|
+
```
|
9
|
+
user system total real
|
10
|
+
BioRuby 2.130000 0.270000 2.400000 ( 2.403145)
|
11
|
+
Bio::Faster 0.420000 0.070000 0.490000 ( 0.486809)
|
12
|
+
Bio::Kseq 0.030000 0.010000 0.040000 ( 0.037176)
|
13
|
+
```
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
Add this line to your application's Gemfile:
|
18
|
+
|
19
|
+
gem 'bio-kseq'
|
20
|
+
|
21
|
+
And then execute:
|
22
|
+
|
23
|
+
$ bundle
|
24
|
+
|
25
|
+
Or install it yourself as:
|
26
|
+
|
27
|
+
$ gem install bio-kseq
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
require 'bio/kseq'
|
33
|
+
|
34
|
+
# Convert FASTQ to FASTA
|
35
|
+
kseq = Bio::Kseq.new("test.fastq")
|
36
|
+
while kseq.read! # returns truthy values when there is an entry
|
37
|
+
puts ">" + kseq.name
|
38
|
+
puts kseq.seq
|
39
|
+
end
|
40
|
+
|
41
|
+
kseq = Bio::Kseq.new("test.fastq.gz") # Gzip-compressed files are opened transparently
|
42
|
+
kseq.read! or throw("Failed to read test.fastq.gz")
|
43
|
+
|
44
|
+
# Suppose entry is like this:
|
45
|
+
# @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
|
46
|
+
# GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
|
47
|
+
# +SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
|
48
|
+
# IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
|
49
|
+
kseq.name # = "SRR001666.1"
|
50
|
+
kseq.comment # = "071112_SLXA-EAS1_s_7:5:1:817:345 length=36", may be nil
|
51
|
+
kseq.seq # = "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
|
52
|
+
kseq.qual # = "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC", may be nil
|
53
|
+
|
54
|
+
kseq = Bio::Kseq.new(IO.popen("zcat test.fastq.gz")) # You can also process Ruby IO objects
|
55
|
+
kseq.read! or throw("Failed to read test.fastq.gz")
|
56
|
+
puts kseq # Outputs a valid FASTQ entry
|
57
|
+
```
|
58
|
+
|
59
|
+
## Contributing
|
60
|
+
|
61
|
+
1. Fork it ( https://github.com/gusevfe/bio-kseq/fork )
|
62
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
63
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
64
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
65
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bio-kseq.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'bio/kseq/version'
|
5
|
+
|
6
|
+
# Gem specification for bio-kseq: a native (C extension) FASTA/FASTQ reader
# built on top of kseq.h.
Gem::Specification.new do |spec|
  spec.name          = "bio-kseq"
  spec.version       = Bio::Kseq::VERSION
  spec.authors       = ["Fedor Gusev"]
  spec.email         = ["gusevfe@gmail.com"]
  spec.summary       = %q{Ruby interface for kseq.h by Heng Li for fast FASTA/Q reading}
  spec.description   = %q{A fast FASTA/FASTQ parser based on kseq.h by Heng Li}
  spec.homepage      = "https://github.com/gusevfe/bio-kseq"
  spec.license       = "MIT"

  # Package every git-tracked file except the benchmarking script.
  spec.files         = `git ls-files -z`.split("\x0") - ["timing.rb"]
  # kseq.h is listed explicitly. NOTE(review): presumably because it comes
  # from a git submodule (.gitmodules is shipped) that `git ls-files` does
  # not expand -- confirm.
  spec.files         << "ext/seqtk_bindings/seqtk/kseq.h"
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  # "ext" is on the load path so that `require "seqtk_bindings"` can find the
  # compiled extension.
  spec.require_paths = ["lib", "ext"]
  spec.extensions    = Dir['ext/**/extconf.rb']
  spec.platform      = Gem::Platform::RUBY

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", ">= 3.0.0"
  spec.add_development_dependency "rake-compiler"
  #spec.add_development_dependency "bio" # For timing script
  #spec.add_development_dependency "bio-faster" # For timing script
end
|
@@ -0,0 +1,235 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Last Modified: 05MAR2012 */
|
27
|
+
|
28
|
+
#ifndef AC_KSEQ_H
|
29
|
+
#define AC_KSEQ_H
|
30
|
+
|
31
|
+
#include <ctype.h>
|
32
|
+
#include <string.h>
|
33
|
+
#include <stdlib.h>
|
34
|
+
|
35
|
+
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
|
36
|
+
#define KS_SEP_TAB 1 // isspace() && !' '
|
37
|
+
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
|
38
|
+
#define KS_SEP_MAX 2
|
39
|
+
|
40
|
+
#define __KS_TYPE(type_t) \
|
41
|
+
typedef struct __kstream_t { \
|
42
|
+
unsigned char *buf; \
|
43
|
+
int begin, end, is_eof; \
|
44
|
+
type_t f; \
|
45
|
+
} kstream_t;
|
46
|
+
|
47
|
+
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
|
48
|
+
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
|
49
|
+
|
50
|
+
#define __KS_BASIC(type_t, __bufsize) \
|
51
|
+
static inline kstream_t *ks_init(type_t f) \
|
52
|
+
{ \
|
53
|
+
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
|
54
|
+
ks->f = f; \
|
55
|
+
ks->buf = (unsigned char*)malloc(__bufsize); \
|
56
|
+
return ks; \
|
57
|
+
} \
|
58
|
+
static inline void ks_destroy(kstream_t *ks) \
|
59
|
+
{ \
|
60
|
+
if (ks) { \
|
61
|
+
free(ks->buf); \
|
62
|
+
free(ks); \
|
63
|
+
} \
|
64
|
+
}
|
65
|
+
|
66
|
+
#define __KS_GETC(__read, __bufsize) \
|
67
|
+
static inline int ks_getc(kstream_t *ks) \
|
68
|
+
{ \
|
69
|
+
if (ks->is_eof && ks->begin >= ks->end) return -1; \
|
70
|
+
if (ks->begin >= ks->end) { \
|
71
|
+
ks->begin = 0; \
|
72
|
+
ks->end = __read(ks->f, ks->buf, __bufsize); \
|
73
|
+
if (ks->end < __bufsize) ks->is_eof = 1; \
|
74
|
+
if (ks->end == 0) return -1; \
|
75
|
+
} \
|
76
|
+
return (int)ks->buf[ks->begin++]; \
|
77
|
+
}
|
78
|
+
|
79
|
+
#ifndef KSTRING_T
|
80
|
+
#define KSTRING_T kstring_t
|
81
|
+
typedef struct __kstring_t {
|
82
|
+
size_t l, m;
|
83
|
+
char *s;
|
84
|
+
} kstring_t;
|
85
|
+
#endif
|
86
|
+
|
87
|
+
#ifndef kroundup32
|
88
|
+
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
89
|
+
#endif
|
90
|
+
|
91
|
+
#define __KS_GETUNTIL(__read, __bufsize) \
|
92
|
+
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
|
93
|
+
{ \
|
94
|
+
if (dret) *dret = 0; \
|
95
|
+
str->l = append? str->l : 0; \
|
96
|
+
if (ks->begin >= ks->end && ks->is_eof) return -1; \
|
97
|
+
for (;;) { \
|
98
|
+
int i; \
|
99
|
+
if (ks->begin >= ks->end) { \
|
100
|
+
if (!ks->is_eof) { \
|
101
|
+
ks->begin = 0; \
|
102
|
+
ks->end = __read(ks->f, ks->buf, __bufsize); \
|
103
|
+
if (ks->end < __bufsize) ks->is_eof = 1; \
|
104
|
+
if (ks->end == 0) break; \
|
105
|
+
} else break; \
|
106
|
+
} \
|
107
|
+
if (delimiter == KS_SEP_LINE) { \
|
108
|
+
for (i = ks->begin; i < ks->end; ++i) \
|
109
|
+
if (ks->buf[i] == '\n') break; \
|
110
|
+
} else if (delimiter > KS_SEP_MAX) { \
|
111
|
+
for (i = ks->begin; i < ks->end; ++i) \
|
112
|
+
if (ks->buf[i] == delimiter) break; \
|
113
|
+
} else if (delimiter == KS_SEP_SPACE) { \
|
114
|
+
for (i = ks->begin; i < ks->end; ++i) \
|
115
|
+
if (isspace(ks->buf[i])) break; \
|
116
|
+
} else if (delimiter == KS_SEP_TAB) { \
|
117
|
+
for (i = ks->begin; i < ks->end; ++i) \
|
118
|
+
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
|
119
|
+
} else i = 0; /* never come to here! */ \
|
120
|
+
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
|
121
|
+
str->m = str->l + (i - ks->begin) + 1; \
|
122
|
+
kroundup32(str->m); \
|
123
|
+
str->s = (char*)realloc(str->s, str->m); \
|
124
|
+
} \
|
125
|
+
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
|
126
|
+
str->l = str->l + (i - ks->begin); \
|
127
|
+
ks->begin = i + 1; \
|
128
|
+
if (i < ks->end) { \
|
129
|
+
if (dret) *dret = ks->buf[i]; \
|
130
|
+
break; \
|
131
|
+
} \
|
132
|
+
} \
|
133
|
+
if (str->s == 0) { \
|
134
|
+
str->m = 1; \
|
135
|
+
str->s = (char*)calloc(1, 1); \
|
136
|
+
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
|
137
|
+
str->s[str->l] = '\0'; \
|
138
|
+
return str->l; \
|
139
|
+
} \
|
140
|
+
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
|
141
|
+
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
|
142
|
+
|
143
|
+
#define KSTREAM_INIT(type_t, __read, __bufsize) \
|
144
|
+
__KS_TYPE(type_t) \
|
145
|
+
__KS_BASIC(type_t, __bufsize) \
|
146
|
+
__KS_GETC(__read, __bufsize) \
|
147
|
+
__KS_GETUNTIL(__read, __bufsize)
|
148
|
+
|
149
|
+
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
|
150
|
+
|
151
|
+
#define __KSEQ_BASIC(SCOPE, type_t) \
|
152
|
+
SCOPE kseq_t *kseq_init(type_t fd) \
|
153
|
+
{ \
|
154
|
+
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
|
155
|
+
s->f = ks_init(fd); \
|
156
|
+
return s; \
|
157
|
+
} \
|
158
|
+
SCOPE void kseq_destroy(kseq_t *ks) \
|
159
|
+
{ \
|
160
|
+
if (!ks) return; \
|
161
|
+
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
|
162
|
+
ks_destroy(ks->f); \
|
163
|
+
free(ks); \
|
164
|
+
}
|
165
|
+
|
166
|
+
/* Return value:
|
167
|
+
>=0 length of the sequence (normal)
|
168
|
+
-1 end-of-file
|
169
|
+
-2 truncated quality string
|
170
|
+
*/
|
171
|
+
#define __KSEQ_READ(SCOPE) \
|
172
|
+
SCOPE int kseq_read(kseq_t *seq) \
|
173
|
+
{ \
|
174
|
+
int c; \
|
175
|
+
kstream_t *ks = seq->f; \
|
176
|
+
if (seq->last_char == 0) { /* then jump to the next header line */ \
|
177
|
+
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
|
178
|
+
if (c == -1) return -1; /* end of file */ \
|
179
|
+
seq->last_char = c; \
|
180
|
+
} /* else: the first header char has been read in the previous call */ \
|
181
|
+
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
|
182
|
+
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
|
183
|
+
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
|
184
|
+
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
|
185
|
+
seq->seq.m = 256; \
|
186
|
+
seq->seq.s = (char*)malloc(seq->seq.m); \
|
187
|
+
} \
|
188
|
+
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
|
189
|
+
if (c == '\n') continue; /* skip empty lines */ \
|
190
|
+
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
|
191
|
+
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
|
192
|
+
} \
|
193
|
+
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
|
194
|
+
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
|
195
|
+
seq->seq.m = seq->seq.l + 2; \
|
196
|
+
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
|
197
|
+
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
|
198
|
+
} \
|
199
|
+
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
|
200
|
+
if (c != '+') return seq->seq.l; /* FASTA */ \
|
201
|
+
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
|
202
|
+
seq->qual.m = seq->seq.m; \
|
203
|
+
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
|
204
|
+
} \
|
205
|
+
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
|
206
|
+
if (c == -1) return -2; /* error: no quality string */ \
|
207
|
+
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
|
208
|
+
seq->last_char = 0; /* we have not come to the next header line */ \
|
209
|
+
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
|
210
|
+
return seq->seq.l; \
|
211
|
+
}
|
212
|
+
|
213
|
+
#define __KSEQ_TYPE(type_t) \
|
214
|
+
typedef struct { \
|
215
|
+
kstring_t name, comment, seq, qual; \
|
216
|
+
int last_char; \
|
217
|
+
kstream_t *f; \
|
218
|
+
} kseq_t;
|
219
|
+
|
220
|
+
#define KSEQ_INIT2(SCOPE, type_t, __read) \
|
221
|
+
KSTREAM_INIT(type_t, __read, 16384) \
|
222
|
+
__KSEQ_TYPE(type_t) \
|
223
|
+
__KSEQ_BASIC(SCOPE, type_t) \
|
224
|
+
__KSEQ_READ(SCOPE)
|
225
|
+
|
226
|
+
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
|
227
|
+
|
228
|
+
#define KSEQ_DECLARE(type_t) \
|
229
|
+
__KS_TYPE(type_t) \
|
230
|
+
__KSEQ_TYPE(type_t) \
|
231
|
+
extern kseq_t *kseq_init(type_t fd); \
|
232
|
+
void kseq_destroy(kseq_t *ks); \
|
233
|
+
int kseq_read(kseq_t *seq);
|
234
|
+
|
235
|
+
#endif
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/io.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <seqtk/kseq.h>
|
6
|
+
#include <zlib.h>
|
7
|
+
|
8
|
+
KSEQ_INIT(gzFile, gzread);
|
9
|
+
|
10
|
+
void Init_seqtk_bindings();
|
11
|
+
static VALUE kseq_wrapper_allocate(VALUE klass);
|
12
|
+
static VALUE kseq_wrapper_initialize(VALUE klass, VALUE rb_filename);
|
13
|
+
static void kseq_wrapper_deallocate(void *seq);
|
14
|
+
static VALUE kseq_wrapper_read(VALUE self);
|
15
|
+
|
16
|
+
VALUE mBio;
|
17
|
+
VALUE cKseq;
|
18
|
+
|
19
|
+
typedef struct {
|
20
|
+
char from_io;
|
21
|
+
kseq_t *seq;
|
22
|
+
gzFile fp;
|
23
|
+
} Kseq_Wrapper;
|
24
|
+
|
25
|
+
#define kseq_wrapper_field(NAME) \
|
26
|
+
static VALUE kseq_wrapper_ ## NAME(VALUE self) { \
|
27
|
+
Kseq_Wrapper *w; \
|
28
|
+
Data_Get_Struct(self, Kseq_Wrapper, w);\
|
29
|
+
if (w->seq->NAME.l) \
|
30
|
+
return rb_str_new2(w->seq->NAME.s);\
|
31
|
+
else \
|
32
|
+
return Qnil;\
|
33
|
+
}\
|
34
|
+
|
35
|
+
kseq_wrapper_field(name);
|
36
|
+
kseq_wrapper_field(comment);
|
37
|
+
kseq_wrapper_field(seq);
|
38
|
+
kseq_wrapper_field(qual);
|
39
|
+
|
40
|
+
void Init_seqtk_bindings() {
|
41
|
+
mBio = rb_define_module("Bio");
|
42
|
+
cKseq = rb_define_class_under(mBio, "Kseq", rb_cObject);
|
43
|
+
rb_define_alloc_func(cKseq, kseq_wrapper_allocate);
|
44
|
+
rb_define_method(cKseq, "initialize", kseq_wrapper_initialize, 1);
|
45
|
+
|
46
|
+
rb_define_method(cKseq, "read!", kseq_wrapper_read, 0);
|
47
|
+
rb_define_method(cKseq, "name", kseq_wrapper_name, 0);
|
48
|
+
rb_define_method(cKseq, "comment", kseq_wrapper_comment, 0);
|
49
|
+
rb_define_method(cKseq, "seq", kseq_wrapper_seq, 0);
|
50
|
+
rb_define_method(cKseq, "qual", kseq_wrapper_qual, 0);
|
51
|
+
}
|
52
|
+
|
53
|
+
/*
 * Allocation function for Bio::Kseq: creates the native wrapper that
 * #initialize later fills in.
 *
 * The wrapper is zero-initialised so that `seq` and `fp` start out NULL.
 * kseq_wrapper_deallocate runs for every allocated object, including one
 * whose #initialize raised (e.g. the TypeError for unsupported arguments)
 * before those fields were assigned. With plain malloc() the finalizer would
 * then hand indeterminate pointers to kseq_destroy() and gzclose().
 */
static VALUE kseq_wrapper_allocate(VALUE klass) {
  Kseq_Wrapper *w = calloc(1, sizeof(Kseq_Wrapper));

  if (w == NULL)
    rb_raise(rb_eNoMemError, "failed to allocate Kseq_Wrapper");

  return Data_Wrap_Struct(klass, NULL, kseq_wrapper_deallocate, w);
}
|
58
|
+
|
59
|
+
static void kseq_wrapper_deallocate(void *p)
|
60
|
+
{
|
61
|
+
Kseq_Wrapper *w = p;
|
62
|
+
kseq_destroy(w->seq);
|
63
|
+
if (!(w->from_io))
|
64
|
+
gzclose(w->fp);
|
65
|
+
free(w);
|
66
|
+
}
|
67
|
+
|
68
|
+
static VALUE kseq_wrapper_read(VALUE self) {
|
69
|
+
int r;
|
70
|
+
Kseq_Wrapper *w;
|
71
|
+
|
72
|
+
Data_Get_Struct(self, Kseq_Wrapper, w);
|
73
|
+
r = kseq_read(w->seq);
|
74
|
+
|
75
|
+
return r >= 0 ? Qtrue : Qfalse;
|
76
|
+
}
|
77
|
+
|
78
|
+
static VALUE kseq_wrapper_initialize(VALUE self, VALUE value) {
|
79
|
+
Kseq_Wrapper *w;
|
80
|
+
|
81
|
+
Data_Get_Struct(self, Kseq_Wrapper, w);
|
82
|
+
w->from_io = 0;
|
83
|
+
|
84
|
+
switch (TYPE(value)) {
|
85
|
+
case T_STRING:
|
86
|
+
w->fp = gzopen(StringValuePtr(value), "r");
|
87
|
+
break;
|
88
|
+
case T_FILE:
|
89
|
+
w->fp = gzdopen(fileno(rb_io_stdio_file(RFILE(value)->fptr)), "r");
|
90
|
+
w->from_io = 1;
|
91
|
+
break;
|
92
|
+
default:
|
93
|
+
rb_raise(rb_eTypeError, "Only strings and IOs are supported");
|
94
|
+
break;
|
95
|
+
}
|
96
|
+
|
97
|
+
w->seq = kseq_init(w->fp);
|
98
|
+
|
99
|
+
return self;
|
100
|
+
}
|
data/lib/bio/kseq.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require "seqtk_bindings"
|
2
|
+
require "bio/kseq/version"
|
3
|
+
|
4
|
+
module Bio
  # Pure-Ruby additions to the parser class that the native extension
  # registers as Bio::Kseq.
  class Kseq
    # Renders the current record as text.
    #
    # A record with no quality string is written as FASTA (">" header);
    # otherwise it is written as FASTQ ("@" header, "+" separator line).
    # The comment, when present, follows the name after a single space.
    # Fields that are nil (the accessors return nil for empty values) are
    # rendered as empty strings instead of raising TypeError.
    #
    # @return [String] the record, without a trailing newline
    def to_s
      header = comment.nil? ? name.to_s : "#{name} #{comment}"

      if qual.nil?
        ">#{header}\n#{seq}"
      else
        "@#{header}\n#{seq}\n+\n#{qual}"
      end
    end
  end

  # Compatibility namespace: #to_s used to be defined on Bio::SeqTK::Kseq,
  # a class the parser never instantiated. Keep the constant resolvable.
  module SeqTK
    Kseq = ::Bio::Kseq
  end
end
|
data/spec/kseq_spec.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'bio/kseq'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
include Bio
|
5
|
+
|
6
|
+
describe Kseq do
|
7
|
+
it 'should parse simple FASTA files' do
|
8
|
+
tmp = Tempfile.new 'fasta'
|
9
|
+
tmp.puts ">A"
|
10
|
+
tmp.puts "AAAATTTTCCCCGGGG"
|
11
|
+
tmp.puts ">B comment"
|
12
|
+
tmp.puts "GGGGTTTTCCCCAAAA"
|
13
|
+
tmp.close
|
14
|
+
|
15
|
+
kseq = Kseq.new tmp.path
|
16
|
+
|
17
|
+
expect(kseq.read!).to be_truthy
|
18
|
+
expect(kseq.name).to eq("A")
|
19
|
+
expect(kseq.comment).to be_nil
|
20
|
+
expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
|
21
|
+
expect(kseq.qual).to be_nil
|
22
|
+
|
23
|
+
expect(kseq.read!).to be_truthy
|
24
|
+
expect(kseq.name).to eq("B")
|
25
|
+
expect(kseq.comment).to eq("comment")
|
26
|
+
expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
|
27
|
+
expect(kseq.qual).to be_nil
|
28
|
+
|
29
|
+
expect(kseq.read!).to be_falsey
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should parse simple FASTQ files' do
|
33
|
+
tmp = Tempfile.new 'fasta'
|
34
|
+
tmp.puts "@A"
|
35
|
+
tmp.puts "AAAATTTTCCCCGGGG"
|
36
|
+
tmp.puts "+"
|
37
|
+
tmp.puts "AAAAAAAAAAAAAAAA"
|
38
|
+
tmp.puts "@B comment"
|
39
|
+
tmp.puts "GGGGTTTTCCCCAAAA"
|
40
|
+
tmp.puts "+"
|
41
|
+
tmp.puts "IIIIIIIIIIIIIIII"
|
42
|
+
tmp.close
|
43
|
+
|
44
|
+
kseq = Kseq.new tmp.path
|
45
|
+
|
46
|
+
expect(kseq.read!).to be_truthy
|
47
|
+
expect(kseq.name).to eq("A")
|
48
|
+
expect(kseq.comment).to be_nil
|
49
|
+
expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
|
50
|
+
expect(kseq.qual).to eq("AAAAAAAAAAAAAAAA")
|
51
|
+
|
52
|
+
expect(kseq.read!).to be_truthy
|
53
|
+
expect(kseq.name).to eq("B")
|
54
|
+
expect(kseq.comment).to eq("comment")
|
55
|
+
expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
|
56
|
+
expect(kseq.qual).to eq("IIIIIIIIIIIIIIII")
|
57
|
+
|
58
|
+
expect(kseq.read!).to be_falsey
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should read from IO' do
|
62
|
+
tmp = Tempfile.new 'fasta'
|
63
|
+
tmp.puts ">A"
|
64
|
+
tmp.puts "AAAATTTTCCCCGGGG"
|
65
|
+
tmp.puts ">B comment"
|
66
|
+
tmp.puts "GGGGTTTTCCCCAAAA"
|
67
|
+
tmp.close
|
68
|
+
|
69
|
+
io = File.open(tmp.path)
|
70
|
+
|
71
|
+
kseq = Kseq.new io
|
72
|
+
|
73
|
+
expect(kseq.read!).to be_truthy
|
74
|
+
expect(kseq.name).to eq("A")
|
75
|
+
expect(kseq.comment).to be_nil
|
76
|
+
expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
|
77
|
+
expect(kseq.qual).to be_nil
|
78
|
+
|
79
|
+
expect(kseq.read!).to be_truthy
|
80
|
+
expect(kseq.name).to eq("B")
|
81
|
+
expect(kseq.comment).to eq("comment")
|
82
|
+
expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
|
83
|
+
expect(kseq.qual).to be_nil
|
84
|
+
|
85
|
+
expect(kseq.read!).to be_falsey
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should read comment' do
|
89
|
+
tmp = Tempfile.new 'fasta'
|
90
|
+
tmp.puts "@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36"
|
91
|
+
tmp.puts "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
|
92
|
+
tmp.puts "+SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36"
|
93
|
+
tmp.puts "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC"
|
94
|
+
tmp.close
|
95
|
+
|
96
|
+
kseq = Kseq.new tmp.path
|
97
|
+
expect(kseq.read!).to be_truthy
|
98
|
+
expect(kseq.comment).to eq("071112_SLXA-EAS1_s_7:5:1:817:345 length=36")
|
99
|
+
end
|
100
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-kseq
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Fedor Gusev
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 3.0.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 3.0.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake-compiler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: A fast FASTA/FASTQ parser based on kseq.h by Heng Li
|
70
|
+
email:
|
71
|
+
- gusevfe@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions:
|
74
|
+
- ext/seqtk_bindings/extconf.rb
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- ".gitmodules"
|
79
|
+
- ".rspec"
|
80
|
+
- ".travis.yml"
|
81
|
+
- Gemfile
|
82
|
+
- LICENSE.txt
|
83
|
+
- README.md
|
84
|
+
- Rakefile
|
85
|
+
- bio-kseq.gemspec
|
86
|
+
- ext/seqtk_bindings/extconf.rb
|
87
|
+
- ext/seqtk_bindings/seqtk/kseq.h
|
88
|
+
- ext/seqtk_bindings/seqtk_bindings.c
|
89
|
+
- lib/bio/kseq.rb
|
90
|
+
- lib/bio/kseq/version.rb
|
91
|
+
- spec/kseq_spec.rb
|
92
|
+
- spec/spec_helper.rb
|
93
|
+
homepage: https://github.com/gusevfe/bio-kseq
|
94
|
+
licenses:
|
95
|
+
- MIT
|
96
|
+
metadata: {}
|
97
|
+
post_install_message:
|
98
|
+
rdoc_options: []
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
- ext
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
requirements: []
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 2.2.2
|
115
|
+
signing_key:
|
116
|
+
specification_version: 4
|
117
|
+
summary: Ruby interface for kseq.h by Heng Li for fast FASTA/Q reading
|
118
|
+
test_files:
|
119
|
+
- spec/kseq_spec.rb
|
120
|
+
- spec/spec_helper.rb
|