bio-kseq 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 887f0a14561673eb7226ee2e01372b637d780c63
4
+ data.tar.gz: c4fd1b6e2648d0bb3fb54855fb33cb7cb2fdfb68
5
+ SHA512:
6
+ metadata.gz: 0224e47c38a03c2d2e468c8aaef333494a254774bd0261f6f7a32efd59b0e93888187897e6cb3b9ec4df45b53b56cb6fc3702ce833fe3b162ffd2d193e406569
7
+ data.tar.gz: 73d65945910d96940304586733c8a78c9b6cf8338408596be33fafbbc755542c4af07a50c4f84d48c763765d8c4a84d2a7c970f80e597173d8d168ec680eb354
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
@@ -0,0 +1,3 @@
1
+ [submodule "ext/seqtk_bindings/seqtk"]
2
+ path = ext/seqtk_bindings/seqtk
3
+ url = https://github.com/lh3/seqtk.git
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.2
5
+ - 2.1.1
6
+ - 2.0.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in bio-kseq.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Gusev Fedor
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # Bio::Kseq [![Build Status](https://travis-ci.org/gusevfe/bio-kseq.svg?branch=master)](https://travis-ci.org/gusevfe/bio-kseq)
2
+
3
+ Ruby bindings for a very fast FASTA/Q parser [kseq.h](https://github.com/lh3/seqtk/blob/master/kseq.h) by Heng Li.
4
+
5
+ The default FASTA/Q parser from [BioRuby](http://bioruby.org) is extremely slow. One alternative is [bio-faster](https://github.com/fstrozzi/bioruby-faster), but it lacks support for FASTA files. However, `bio-faster` does parse qualities, unlike `bio-kseq`.
6
+
7
+ ## Timings
8
+ ```
9
+ user system total real
10
+ BioRuby 2.130000 0.270000 2.400000 ( 2.403145)
11
+ Bio::Faster 0.420000 0.070000 0.490000 ( 0.486809)
12
+ Bio::Kseq 0.030000 0.010000 0.040000 ( 0.037176)
13
+ ```
14
+
15
+ ## Installation
16
+
17
+ Add this line to your application's Gemfile:
18
+
19
+ gem 'bio-kseq'
20
+
21
+ And then execute:
22
+
23
+ $ bundle
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install bio-kseq
28
+
29
+ ## Usage
30
+
31
+ ```ruby
32
+ require 'bio/kseq'
33
+
34
+ # Convert FASTQ to FASTA
35
+ kseq = Bio::Kseq.new("test.fastq")
36
+ while kseq.read! # returns truthy values when there is an entry
37
+ puts ">" + kseq.name
38
+ puts kseq.seq
39
+ end
40
+
41
+ kseq = Bio::Kseq.new("test.fastq.gz") # Gzipped files are opened transparently
42
+ kseq.read! or raise("Failed to read test.fastq.gz")
43
+
44
+ # Suppose entry is like this:
45
+ # @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
46
+ # GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
47
+ # +SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
48
+ # IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
49
+ kseq.name # = "SRR001666.1"
50
+ kseq.comment # = "071112_SLXA-EAS1_s_7:5:1:817:345 length=36", may be nil
51
+ kseq.seq # = "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
52
+ kseq.qual # = "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC", may be nil
53
+
54
+ kseq = Bio::Kseq.new(IO.popen("zcat test.fastq.gz")) # You can also process Ruby IO objects
55
+ kseq.read! or raise("Failed to read test.fastq.gz")
56
+ puts kseq # Outputs a valid FASTQ entry
57
+ ```
58
+
59
+ ## Contributing
60
+
61
+ 1. Fork it ( https://github.com/gusevfe/bio-kseq/fork )
62
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
63
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
64
+ 4. Push to the branch (`git push origin my-new-feature`)
65
+ 5. Create new Pull Request
@@ -0,0 +1,9 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+ require "rake/extensiontask"
4
+
5
+ RSpec::Core::RakeTask.new('spec')
6
+ Rake::ExtensionTask.new "seqtk_bindings"
7
+
8
+ task :spec => :compile
9
+ task :default => :spec
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'bio/kseq/version'
5
+
6
# Gem specification for bio-kseq: a C extension wrapping kseq.h plus a thin
# Ruby layer under lib/.
Gem::Specification.new do |spec|
  spec.name          = "bio-kseq"
  spec.version       = Bio::Kseq::VERSION
  spec.authors       = ["Fedor Gusev"]
  spec.email         = ["gusevfe@gmail.com"]
  spec.summary       = %q{Ruby interface for kseq.h by Heng Li for fast FASTA/Q reading}
  spec.description   = %q{A fast FASTA/FASTQ parser based on kseq.h by Heng Li}
  spec.homepage      = "https://github.com/gusevfe/bio-kseq"
  spec.license       = "MIT"

  # Ship everything tracked by git except the timing script.
  spec.files         = `git ls-files -z`.split("\x0") - ["timing.rb"]
  # kseq.h comes from the seqtk git submodule, so `git ls-files` does not
  # list it; add it explicitly so the extension can be compiled from the gem.
  spec.files         << "ext/seqtk_bindings/seqtk/kseq.h"
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]
  spec.extensions    = Dir['ext/**/extconf.rb']
  spec.platform      = Gem::Platform::RUBY

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", ">= 3.0.0"
  spec.add_development_dependency "rake-compiler"
  #spec.add_development_dependency "bio" # For timing script
  #spec.add_development_dependency "bio-faster" # For timing script
end
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ dir_config('seqtk_bindings')
4
+ find_header('zlib.h')
5
+ find_library('z', 'gzopen')
6
+ create_makefile('seqtk_bindings')
@@ -0,0 +1,235 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Last Modified: 05MAR2012 */
27
+
28
+ #ifndef AC_KSEQ_H
29
+ #define AC_KSEQ_H
30
+
31
+ #include <ctype.h>
32
+ #include <string.h>
33
+ #include <stdlib.h>
34
+
35
+ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
36
+ #define KS_SEP_TAB 1 // isspace() && !' '
37
+ #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
38
+ #define KS_SEP_MAX 2
39
+
40
+ #define __KS_TYPE(type_t) \
41
+ typedef struct __kstream_t { \
42
+ unsigned char *buf; \
43
+ int begin, end, is_eof; \
44
+ type_t f; \
45
+ } kstream_t;
46
+
47
+ #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
48
+ #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
49
+
50
+ #define __KS_BASIC(type_t, __bufsize) \
51
+ static inline kstream_t *ks_init(type_t f) \
52
+ { \
53
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
54
+ ks->f = f; \
55
+ ks->buf = (unsigned char*)malloc(__bufsize); \
56
+ return ks; \
57
+ } \
58
+ static inline void ks_destroy(kstream_t *ks) \
59
+ { \
60
+ if (ks) { \
61
+ free(ks->buf); \
62
+ free(ks); \
63
+ } \
64
+ }
65
+
66
+ #define __KS_GETC(__read, __bufsize) \
67
+ static inline int ks_getc(kstream_t *ks) \
68
+ { \
69
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
70
+ if (ks->begin >= ks->end) { \
71
+ ks->begin = 0; \
72
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
73
+ if (ks->end < __bufsize) ks->is_eof = 1; \
74
+ if (ks->end == 0) return -1; \
75
+ } \
76
+ return (int)ks->buf[ks->begin++]; \
77
+ }
78
+
79
+ #ifndef KSTRING_T
80
+ #define KSTRING_T kstring_t
81
+ typedef struct __kstring_t {
82
+ size_t l, m;
83
+ char *s;
84
+ } kstring_t;
85
+ #endif
86
+
87
+ #ifndef kroundup32
88
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
89
+ #endif
90
+
91
+ #define __KS_GETUNTIL(__read, __bufsize) \
92
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
93
+ { \
94
+ if (dret) *dret = 0; \
95
+ str->l = append? str->l : 0; \
96
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
97
+ for (;;) { \
98
+ int i; \
99
+ if (ks->begin >= ks->end) { \
100
+ if (!ks->is_eof) { \
101
+ ks->begin = 0; \
102
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
103
+ if (ks->end < __bufsize) ks->is_eof = 1; \
104
+ if (ks->end == 0) break; \
105
+ } else break; \
106
+ } \
107
+ if (delimiter == KS_SEP_LINE) { \
108
+ for (i = ks->begin; i < ks->end; ++i) \
109
+ if (ks->buf[i] == '\n') break; \
110
+ } else if (delimiter > KS_SEP_MAX) { \
111
+ for (i = ks->begin; i < ks->end; ++i) \
112
+ if (ks->buf[i] == delimiter) break; \
113
+ } else if (delimiter == KS_SEP_SPACE) { \
114
+ for (i = ks->begin; i < ks->end; ++i) \
115
+ if (isspace(ks->buf[i])) break; \
116
+ } else if (delimiter == KS_SEP_TAB) { \
117
+ for (i = ks->begin; i < ks->end; ++i) \
118
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
119
+ } else i = 0; /* never come to here! */ \
120
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
121
+ str->m = str->l + (i - ks->begin) + 1; \
122
+ kroundup32(str->m); \
123
+ str->s = (char*)realloc(str->s, str->m); \
124
+ } \
125
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
126
+ str->l = str->l + (i - ks->begin); \
127
+ ks->begin = i + 1; \
128
+ if (i < ks->end) { \
129
+ if (dret) *dret = ks->buf[i]; \
130
+ break; \
131
+ } \
132
+ } \
133
+ if (str->s == 0) { \
134
+ str->m = 1; \
135
+ str->s = (char*)calloc(1, 1); \
136
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
137
+ str->s[str->l] = '\0'; \
138
+ return str->l; \
139
+ } \
140
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
141
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
142
+
143
+ #define KSTREAM_INIT(type_t, __read, __bufsize) \
144
+ __KS_TYPE(type_t) \
145
+ __KS_BASIC(type_t, __bufsize) \
146
+ __KS_GETC(__read, __bufsize) \
147
+ __KS_GETUNTIL(__read, __bufsize)
148
+
149
+ #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
150
+
151
+ #define __KSEQ_BASIC(SCOPE, type_t) \
152
+ SCOPE kseq_t *kseq_init(type_t fd) \
153
+ { \
154
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
155
+ s->f = ks_init(fd); \
156
+ return s; \
157
+ } \
158
+ SCOPE void kseq_destroy(kseq_t *ks) \
159
+ { \
160
+ if (!ks) return; \
161
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
162
+ ks_destroy(ks->f); \
163
+ free(ks); \
164
+ }
165
+
166
+ /* Return value:
167
+ >=0 length of the sequence (normal)
168
+ -1 end-of-file
169
+ -2 truncated quality string
170
+ */
171
+ #define __KSEQ_READ(SCOPE) \
172
+ SCOPE int kseq_read(kseq_t *seq) \
173
+ { \
174
+ int c; \
175
+ kstream_t *ks = seq->f; \
176
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
177
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
178
+ if (c == -1) return -1; /* end of file */ \
179
+ seq->last_char = c; \
180
+ } /* else: the first header char has been read in the previous call */ \
181
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
182
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
183
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
184
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
185
+ seq->seq.m = 256; \
186
+ seq->seq.s = (char*)malloc(seq->seq.m); \
187
+ } \
188
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
189
+ if (c == '\n') continue; /* skip empty lines */ \
190
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
191
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
192
+ } \
193
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
194
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
195
+ seq->seq.m = seq->seq.l + 2; \
196
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
197
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
198
+ } \
199
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
200
+ if (c != '+') return seq->seq.l; /* FASTA */ \
201
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
202
+ seq->qual.m = seq->seq.m; \
203
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
204
+ } \
205
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
206
+ if (c == -1) return -2; /* error: no quality string */ \
207
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
208
+ seq->last_char = 0; /* we have not come to the next header line */ \
209
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
210
+ return seq->seq.l; \
211
+ }
212
+
213
+ #define __KSEQ_TYPE(type_t) \
214
+ typedef struct { \
215
+ kstring_t name, comment, seq, qual; \
216
+ int last_char; \
217
+ kstream_t *f; \
218
+ } kseq_t;
219
+
220
+ #define KSEQ_INIT2(SCOPE, type_t, __read) \
221
+ KSTREAM_INIT(type_t, __read, 16384) \
222
+ __KSEQ_TYPE(type_t) \
223
+ __KSEQ_BASIC(SCOPE, type_t) \
224
+ __KSEQ_READ(SCOPE)
225
+
226
+ #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
227
+
228
+ #define KSEQ_DECLARE(type_t) \
229
+ __KS_TYPE(type_t) \
230
+ __KSEQ_TYPE(type_t) \
231
+ extern kseq_t *kseq_init(type_t fd); \
232
+ void kseq_destroy(kseq_t *ks); \
233
+ int kseq_read(kseq_t *seq);
234
+
235
+ #endif
@@ -0,0 +1,100 @@
1
+ #include <ruby.h>
2
+ #include <ruby/io.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <seqtk/kseq.h>
6
+ #include <zlib.h>
7
+
8
+ KSEQ_INIT(gzFile, gzread);
9
+
10
+ void Init_seqtk_bindings();
11
+ static VALUE kseq_wrapper_allocate(VALUE klass);
12
+ static VALUE kseq_wrapper_initialize(VALUE klass, VALUE rb_filename);
13
+ static void kseq_wrapper_deallocate(void *seq);
14
+ static VALUE kseq_wrapper_read(VALUE self);
15
+
16
+ VALUE mBio;
17
+ VALUE cKseq;
18
+
19
+ typedef struct {
20
+ char from_io;
21
+ kseq_t *seq;
22
+ gzFile fp;
23
+ } Kseq_Wrapper;
24
+
25
+ #define kseq_wrapper_field(NAME) \
26
+ static VALUE kseq_wrapper_ ## NAME(VALUE self) { \
27
+ Kseq_Wrapper *w; \
28
+ Data_Get_Struct(self, Kseq_Wrapper, w);\
29
+ if (w->seq->NAME.l) \
30
+ return rb_str_new2(w->seq->NAME.s);\
31
+ else \
32
+ return Qnil;\
33
+ }\
34
+
35
+ kseq_wrapper_field(name);
36
+ kseq_wrapper_field(comment);
37
+ kseq_wrapper_field(seq);
38
+ kseq_wrapper_field(qual);
39
+
40
+ void Init_seqtk_bindings() {
41
+ mBio = rb_define_module("Bio");
42
+ cKseq = rb_define_class_under(mBio, "Kseq", rb_cObject);
43
+ rb_define_alloc_func(cKseq, kseq_wrapper_allocate);
44
+ rb_define_method(cKseq, "initialize", kseq_wrapper_initialize, 1);
45
+
46
+ rb_define_method(cKseq, "read!", kseq_wrapper_read, 0);
47
+ rb_define_method(cKseq, "name", kseq_wrapper_name, 0);
48
+ rb_define_method(cKseq, "comment", kseq_wrapper_comment, 0);
49
+ rb_define_method(cKseq, "seq", kseq_wrapper_seq, 0);
50
+ rb_define_method(cKseq, "qual", kseq_wrapper_qual, 0);
51
+ }
52
+
53
+ static VALUE kseq_wrapper_allocate(VALUE klass) {
54
+ Kseq_Wrapper *w = malloc(sizeof(Kseq_Wrapper));
55
+
56
+ return Data_Wrap_Struct(klass, NULL, kseq_wrapper_deallocate, w);
57
+ }
58
+
59
+ static void kseq_wrapper_deallocate(void *p)
60
+ {
61
+ Kseq_Wrapper *w = p;
62
+ kseq_destroy(w->seq);
63
+ if (!(w->from_io))
64
+ gzclose(w->fp);
65
+ free(w);
66
+ }
67
+
68
+ static VALUE kseq_wrapper_read(VALUE self) {
69
+ int r;
70
+ Kseq_Wrapper *w;
71
+
72
+ Data_Get_Struct(self, Kseq_Wrapper, w);
73
+ r = kseq_read(w->seq);
74
+
75
+ return r >= 0 ? Qtrue : Qfalse;
76
+ }
77
+
78
+ static VALUE kseq_wrapper_initialize(VALUE self, VALUE value) {
79
+ Kseq_Wrapper *w;
80
+
81
+ Data_Get_Struct(self, Kseq_Wrapper, w);
82
+ w->from_io = 0;
83
+
84
+ switch (TYPE(value)) {
85
+ case T_STRING:
86
+ w->fp = gzopen(StringValuePtr(value), "r");
87
+ break;
88
+ case T_FILE:
89
+ w->fp = gzdopen(fileno(rb_io_stdio_file(RFILE(value)->fptr)), "r");
90
+ w->from_io = 1;
91
+ break;
92
+ default:
93
+ rb_raise(rb_eTypeError, "Only strings and IOs are supported");
94
+ break;
95
+ }
96
+
97
+ w->seq = kseq_init(w->fp);
98
+
99
+ return self;
100
+ }
@@ -0,0 +1,24 @@
1
+ require "seqtk_bindings"
2
+ require "bio/kseq/version"
3
+
4
module Bio
  # Pure-Ruby additions to Bio::Kseq. The class itself and its readers
  # (name, comment, seq, qual) are defined by the seqtk_bindings extension,
  # so this must reopen Bio::Kseq directly rather than a nested namespace.
  class Kseq
    # Formats the current record as text.
    #
    # Produces a FASTQ entry ("@" header, sequence, "+", qualities) when a
    # quality string is present and a FASTA entry (">" header, sequence)
    # otherwise. The comment, when present, follows the name after a single
    # space. No trailing newline is appended.
    #
    # @return [String]
    def to_s
      note = comment
      header = note.nil? ? name.to_s : "#{name} #{note}"
      scores = qual

      if scores.nil?
        ">#{header}\n#{seq}"
      else
        "@#{header}\n#{seq}\n+\n#{scores}"
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ module Bio
2
+ class Kseq
3
+ VERSION = "0.0.2"
4
+ end
5
+ end
@@ -0,0 +1,100 @@
1
+ require 'bio/kseq'
2
+ require 'tempfile'
3
+
4
+ include Bio
5
+
6
+ describe Kseq do
7
+ it 'should parse simple FASTA files' do
8
+ tmp = Tempfile.new 'fasta'
9
+ tmp.puts ">A"
10
+ tmp.puts "AAAATTTTCCCCGGGG"
11
+ tmp.puts ">B comment"
12
+ tmp.puts "GGGGTTTTCCCCAAAA"
13
+ tmp.close
14
+
15
+ kseq = Kseq.new tmp.path
16
+
17
+ expect(kseq.read!).to be_truthy
18
+ expect(kseq.name).to eq("A")
19
+ expect(kseq.comment).to be_nil
20
+ expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
21
+ expect(kseq.qual).to be_nil
22
+
23
+ expect(kseq.read!).to be_truthy
24
+ expect(kseq.name).to eq("B")
25
+ expect(kseq.comment).to eq("comment")
26
+ expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
27
+ expect(kseq.qual).to be_nil
28
+
29
+ expect(kseq.read!).to be_falsey
30
+ end
31
+
32
+ it 'should parse simple FASTQ files' do
33
+ tmp = Tempfile.new 'fasta'
34
+ tmp.puts "@A"
35
+ tmp.puts "AAAATTTTCCCCGGGG"
36
+ tmp.puts "+"
37
+ tmp.puts "AAAAAAAAAAAAAAAA"
38
+ tmp.puts "@B comment"
39
+ tmp.puts "GGGGTTTTCCCCAAAA"
40
+ tmp.puts "+"
41
+ tmp.puts "IIIIIIIIIIIIIIII"
42
+ tmp.close
43
+
44
+ kseq = Kseq.new tmp.path
45
+
46
+ expect(kseq.read!).to be_truthy
47
+ expect(kseq.name).to eq("A")
48
+ expect(kseq.comment).to be_nil
49
+ expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
50
+ expect(kseq.qual).to eq("AAAAAAAAAAAAAAAA")
51
+
52
+ expect(kseq.read!).to be_truthy
53
+ expect(kseq.name).to eq("B")
54
+ expect(kseq.comment).to eq("comment")
55
+ expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
56
+ expect(kseq.qual).to eq("IIIIIIIIIIIIIIII")
57
+
58
+ expect(kseq.read!).to be_falsey
59
+ end
60
+
61
+ it 'should read from IO' do
62
+ tmp = Tempfile.new 'fasta'
63
+ tmp.puts ">A"
64
+ tmp.puts "AAAATTTTCCCCGGGG"
65
+ tmp.puts ">B comment"
66
+ tmp.puts "GGGGTTTTCCCCAAAA"
67
+ tmp.close
68
+
69
+ io = File.open(tmp.path)
70
+
71
+ kseq = Kseq.new io
72
+
73
+ expect(kseq.read!).to be_truthy
74
+ expect(kseq.name).to eq("A")
75
+ expect(kseq.comment).to be_nil
76
+ expect(kseq.seq).to eq("AAAATTTTCCCCGGGG")
77
+ expect(kseq.qual).to be_nil
78
+
79
+ expect(kseq.read!).to be_truthy
80
+ expect(kseq.name).to eq("B")
81
+ expect(kseq.comment).to eq("comment")
82
+ expect(kseq.seq).to eq("GGGGTTTTCCCCAAAA")
83
+ expect(kseq.qual).to be_nil
84
+
85
+ expect(kseq.read!).to be_falsey
86
+ end
87
+
88
+ it 'should read comment' do
89
+ tmp = Tempfile.new 'fasta'
90
+ tmp.puts "@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36"
91
+ tmp.puts "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
92
+ tmp.puts "+SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36"
93
+ tmp.puts "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC"
94
+ tmp.close
95
+
96
+ kseq = Kseq.new tmp.path
97
+ expect(kseq.read!).to be_truthy
98
+ expect(kseq.comment).to eq("071112_SLXA-EAS1_s_7:5:1:817:345 length=36")
99
+ end
100
+ end
@@ -0,0 +1,7 @@
1
+ RSpec.configure do |config|
2
+ config.treat_symbols_as_metadata_keys_with_true_values = true
3
+ config.run_all_when_everything_filtered = true
4
+ config.filter_run :focus
5
+
6
+ config.order = 'random'
7
+ end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-kseq
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Fedor Gusev
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 3.0.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 3.0.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: A fast FASTA/FASTQ parser based on kseq.h by Heng Li
70
+ email:
71
+ - gusevfe@gmail.com
72
+ executables: []
73
+ extensions:
74
+ - ext/seqtk_bindings/extconf.rb
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - ".gitmodules"
79
+ - ".rspec"
80
+ - ".travis.yml"
81
+ - Gemfile
82
+ - LICENSE.txt
83
+ - README.md
84
+ - Rakefile
85
+ - bio-kseq.gemspec
86
+ - ext/seqtk_bindings/extconf.rb
87
+ - ext/seqtk_bindings/seqtk/kseq.h
88
+ - ext/seqtk_bindings/seqtk_bindings.c
89
+ - lib/bio/kseq.rb
90
+ - lib/bio/kseq/version.rb
91
+ - spec/kseq_spec.rb
92
+ - spec/spec_helper.rb
93
+ homepage: https://github.com/gusevfe/bio-kseq
94
+ licenses:
95
+ - MIT
96
+ metadata: {}
97
+ post_install_message:
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ - ext
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project:
114
+ rubygems_version: 2.2.2
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Ruby interface for kseq.h by Heng Li for fast FASTA/Q reading
118
+ test_files:
119
+ - spec/kseq_spec.rb
120
+ - spec/spec_helper.rb