bio-kseq 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 887f0a14561673eb7226ee2e01372b637d780c63
4
+ data.tar.gz: c4fd1b6e2648d0bb3fb54855fb33cb7cb2fdfb68
5
+ SHA512:
6
+ metadata.gz: 0224e47c38a03c2d2e468c8aaef333494a254774bd0261f6f7a32efd59b0e93888187897e6cb3b9ec4df45b53b56cb6fc3702ce833fe3b162ffd2d193e406569
7
+ data.tar.gz: 73d65945910d96940304586733c8a78c9b6cf8338408596be33fafbbc755542c4af07a50c4f84d48c763765d8c4a84d2a7c970f80e597173d8d168ec680eb354
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
@@ -0,0 +1,3 @@
1
+ [submodule "ext/seqtk_bindings/seqtk"]
2
+ path = ext/seqtk_bindings/seqtk
3
+ url = https://github.com/lh3/seqtk.git
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.2
5
+ - 2.1.1
6
+ - 2.0.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in bio-kseq.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Gusev Fedor
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # Bio::Kseq [![Build Status](https://travis-ci.org/gusevfe/bio-kseq.svg?branch=master)](https://travis-ci.org/gusevfe/bio-kseq)
2
+
3
+ Ruby bindings for a very fast FASTA/Q parser [kseq.h](https://github.com/lh3/seqtk/blob/master/kseq.h) by Heng Li.
4
+
5
+ The default FASTA/Q parser from [BioRuby](http://bioruby.org) is extremely slow. One alternative is to use [bio-faster](https://github.com/fstrozzi/bioruby-faster), but it lacks support for FASTA files. However, `bio-faster` does parse qualities, unlike `bio-kseq`.
6
+
7
+ ## Timings
8
+ ```
9
+ user system total real
10
+ BioRuby 2.130000 0.270000 2.400000 ( 2.403145)
11
+ Bio::Faster 0.420000 0.070000 0.490000 ( 0.486809)
12
+ Bio::Kseq 0.030000 0.010000 0.040000 ( 0.037176)
13
+ ```
14
+
15
+ ## Installation
16
+
17
+ Add this line to your application's Gemfile:
18
+
19
+ gem 'bio-kseq'
20
+
21
+ And then execute:
22
+
23
+ $ bundle
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install bio-kseq
28
+
29
+ ## Usage
30
+
31
+ ```ruby
32
+ require 'bio/kseq'
33
+
34
+ # Convert FASTQ to FASTA
35
+ kseq = Bio::Kseq.new("test.fastq")
36
+ while kseq.read! # returns truthy values when there is an entry
37
+ puts ">" + kseq.name
38
+ puts kseq.seq
39
+ end
40
+
41
+ kseq = Bio::Kseq.new("test.fastq.gz") # You can open GZIPed files flawlessly
42
+ kseq.read! or raise("Failed to read test.fastq.gz")
43
+
44
+ # Suppose entry is like this:
45
+ # @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
46
+ # GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
47
+ # +SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
48
+ # IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
49
+ kseq.name # = "SRR001666.1"
50
+ kseq.comment # = "071112_SLXA-EAS1_s_7:5:1:817:345 length=36", may be nil
51
+ kseq.seq # = "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
52
+ kseq.qual # = "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC", may be nil
53
+
54
+ kseq = Bio::Kseq.new(IO.popen("zcat test.fastq.gz")) # You can also process Ruby IO objects
55
+ kseq.read! or raise("Failed to read test.fastq.gz")
56
+ puts kseq # Outputs a valid FASTQ entry
57
+ ```
58
+
59
+ ## Contributing
60
+
61
+ 1. Fork it ( https://github.com/gusevfe/bio-kseq/fork )
62
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
63
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
64
+ 4. Push to the branch (`git push origin my-new-feature`)
65
+ 5. Create new Pull Request
@@ -0,0 +1,9 @@
1
require "bundler/gem_tasks"
require 'rspec/core/rake_task'
require "rake/extensiontask"

# `rake compile` builds the native extension found in ext/seqtk_bindings.
Rake::ExtensionTask.new("seqtk_bindings")

# `rake spec` runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# The specs load the compiled extension, so it has to be built first.
task(:spec => [:compile])

# A bare `rake` runs the specs.
task(:default => [:spec])
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'bio/kseq/version'
5
+
6
# Package definition for the bio-kseq gem (native extension plus a thin Ruby layer).
Gem::Specification.new do |spec|
  spec.name          = "bio-kseq"
  spec.version       = Bio::Kseq::VERSION
  spec.authors       = ["Fedor Gusev"]
  spec.email         = ["gusevfe@gmail.com"]
  spec.summary       = %q{Ruby interface for kseq.h by Heng Li for fast FASTA/Q reading}
  spec.description   = %q{A fast FASTA/FASTQ parser based on kseq.h by Heng Li}
  spec.homepage      = "https://github.com/gusevfe/bio-kseq"
  spec.license       = "MIT"

  # Ship everything tracked by git except the benchmarking script.
  spec.files         = `git ls-files -z`.split("\x0") - ["timing.rb"]
  # kseq.h is added by hand on top of the `git ls-files` listing; it sits under
  # ext/seqtk_bindings/seqtk, the path .gitmodules registers as a submodule.
  spec.files << "ext/seqtk_bindings/seqtk/kseq.h"
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]
  # Every extconf.rb below ext/ is built as a native extension at install time.
  spec.extensions    = Dir['ext/**/extconf.rb']
  spec.platform      = Gem::Platform::RUBY

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", ">= 3.0.0"
  spec.add_development_dependency "rake-compiler"
  #spec.add_development_dependency "bio" # For timing script
  #spec.add_development_dependency "bio-faster" # For timing script
end
@@ -0,0 +1,6 @@
1
require 'mkmf'

# Allow --with-seqtk_bindings-dir / -include / -lib overrides on the command line.
dir_config('seqtk_bindings')

# The extension reads its input through zlib (gzopen/gzread), so stop with a
# clear message here instead of failing later with a compiler or linker error.
abort 'zlib.h not found: please install the zlib development headers' unless find_header('zlib.h')
abort 'libz not found: please install zlib' unless find_library('z', 'gzopen')

create_makefile('seqtk_bindings')
@@ -0,0 +1,235 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Last Modified: 05MAR2012 */
27
+
28
+ #ifndef AC_KSEQ_H
29
+ #define AC_KSEQ_H
30
+
31
+ #include <ctype.h>
32
+ #include <string.h>
33
+ #include <stdlib.h>
34
+
35
+ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
36
+ #define KS_SEP_TAB 1 // isspace() && !' '
37
+ #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
38
+ #define KS_SEP_MAX 2
39
+
40
+ #define __KS_TYPE(type_t) \
41
+ typedef struct __kstream_t { \
42
+ unsigned char *buf; \
43
+ int begin, end, is_eof; \
44
+ type_t f; \
45
+ } kstream_t;
46
+
47
+ #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
48
+ #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
49
+
50
+ #define __KS_BASIC(type_t, __bufsize) \
51
+ static inline kstream_t *ks_init(type_t f) \
52
+ { \
53
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
54
+ ks->f = f; \
55
+ ks->buf = (unsigned char*)malloc(__bufsize); \
56
+ return ks; \
57
+ } \
58
+ static inline void ks_destroy(kstream_t *ks) \
59
+ { \
60
+ if (ks) { \
61
+ free(ks->buf); \
62
+ free(ks); \
63
+ } \
64
+ }
65
+
66
+ #define __KS_GETC(__read, __bufsize) \
67
+ static inline int ks_getc(kstream_t *ks) \
68
+ { \
69
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
70
+ if (ks->begin >= ks->end) { \
71
+ ks->begin = 0; \
72
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
73
+ if (ks->end < __bufsize) ks->is_eof = 1; \
74
+ if (ks->end == 0) return -1; \
75
+ } \
76
+ return (int)ks->buf[ks->begin++]; \
77
+ }
78
+
79
+ #ifndef KSTRING_T
80
+ #define KSTRING_T kstring_t
81
+ typedef struct __kstring_t {
82
+ size_t l, m;
83
+ char *s;
84
+ } kstring_t;
85
+ #endif
86
+
87
+ #ifndef kroundup32
88
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
89
+ #endif
90
+
91
+ #define __KS_GETUNTIL(__read, __bufsize) \
92
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
93
+ { \
94
+ if (dret) *dret = 0; \
95
+ str->l = append? str->l : 0; \
96
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
97
+ for (;;) { \
98
+ int i; \
99
+ if (ks->begin >= ks->end) { \
100
+ if (!ks->is_eof) { \
101
+ ks->begin = 0; \
102
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
103
+ if (ks->end < __bufsize) ks->is_eof = 1; \
104
+ if (ks->end == 0) break; \
105
+ } else break; \
106
+ } \
107
+ if (delimiter == KS_SEP_LINE) { \
108
+ for (i = ks->begin; i < ks->end; ++i) \
109
+ if (ks->buf[i] == '\n') break; \
110
+ } else if (delimiter > KS_SEP_MAX) { \
111
+ for (i = ks->begin; i < ks->end; ++i) \
112
+ if (ks->buf[i] == delimiter) break; \
113
+ } else if (delimiter == KS_SEP_SPACE) { \
114
+ for (i = ks->begin; i < ks->end; ++i) \
115
+ if (isspace(ks->buf[i])) break; \
116
+ } else if (delimiter == KS_SEP_TAB) { \
117
+ for (i = ks->begin; i < ks->end; ++i) \
118
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
119
+ } else i = 0; /* never come to here! */ \
120
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
121
+ str->m = str->l + (i - ks->begin) + 1; \
122
+ kroundup32(str->m); \
123
+ str->s = (char*)realloc(str->s, str->m); \
124
+ } \
125
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
126
+ str->l = str->l + (i - ks->begin); \
127
+ ks->begin = i + 1; \
128
+ if (i < ks->end) { \
129
+ if (dret) *dret = ks->buf[i]; \
130
+ break; \
131
+ } \
132
+ } \
133
+ if (str->s == 0) { \
134
+ str->m = 1; \
135
+ str->s = (char*)calloc(1, 1); \
136
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
137
+ str->s[str->l] = '\0'; \
138
+ return str->l; \
139
+ } \
140
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
141
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
142
+
143
+ #define KSTREAM_INIT(type_t, __read, __bufsize) \
144
+ __KS_TYPE(type_t) \
145
+ __KS_BASIC(type_t, __bufsize) \
146
+ __KS_GETC(__read, __bufsize) \
147
+ __KS_GETUNTIL(__read, __bufsize)
148
+
149
+ #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
150
+
151
+ #define __KSEQ_BASIC(SCOPE, type_t) \
152
+ SCOPE kseq_t *kseq_init(type_t fd) \
153
+ { \
154
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
155
+ s->f = ks_init(fd); \
156
+ return s; \
157
+ } \
158
+ SCOPE void kseq_destroy(kseq_t *ks) \
159
+ { \
160
+ if (!ks) return; \
161
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
162
+ ks_destroy(ks->f); \
163
+ free(ks); \
164
+ }
165
+
166
+ /* Return value:
167
+ >=0 length of the sequence (normal)
168
+ -1 end-of-file
169
+ -2 truncated quality string
170
+ */
171
+ #define __KSEQ_READ(SCOPE) \
172
+ SCOPE int kseq_read(kseq_t *seq) \
173
+ { \
174
+ int c; \
175
+ kstream_t *ks = seq->f; \
176
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
177
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
178
+ if (c == -1) return -1; /* end of file */ \
179
+ seq->last_char = c; \
180
+ } /* else: the first header char has been read in the previous call */ \
181
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
182
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
183
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
184
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
185
+ seq->seq.m = 256; \
186
+ seq->seq.s = (char*)malloc(seq->seq.m); \
187
+ } \
188
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
189
+ if (c == '\n') continue; /* skip empty lines */ \
190
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
191
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
192
+ } \
193
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
194
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
195
+ seq->seq.m = seq->seq.l + 2; \
196
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
197
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
198
+ } \
199
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
200
+ if (c != '+') return seq->seq.l; /* FASTA */ \
201
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
202
+ seq->qual.m = seq->seq.m; \
203
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
204
+ } \
205
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
206
+ if (c == -1) return -2; /* error: no quality string */ \
207
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
208
+ seq->last_char = 0; /* we have not come to the next header line */ \
209
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
210
+ return seq->seq.l; \
211
+ }
212
+
213
+ #define __KSEQ_TYPE(type_t) \
214
+ typedef struct { \
215
+ kstring_t name, comment, seq, qual; \
216
+ int last_char; \
217
+ kstream_t *f; \
218
+ } kseq_t;
219
+
220
+ #define KSEQ_INIT2(SCOPE, type_t, __read) \
221
+ KSTREAM_INIT(type_t, __read, 16384) \
222
+ __KSEQ_TYPE(type_t) \
223
+ __KSEQ_BASIC(SCOPE, type_t) \
224
+ __KSEQ_READ(SCOPE)
225
+
226
+ #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
227
+
228
+ #define KSEQ_DECLARE(type_t) \
229
+ __KS_TYPE(type_t) \
230
+ __KSEQ_TYPE(type_t) \
231
+ extern kseq_t *kseq_init(type_t fd); \
232
+ void kseq_destroy(kseq_t *ks); \
233
+ int kseq_read(kseq_t *seq);
234
+
235
+ #endif
@@ -0,0 +1,100 @@
1
+ #include <ruby.h>
2
+ #include <ruby/io.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <seqtk/kseq.h>
6
+ #include <zlib.h>
7
+
8
+ KSEQ_INIT(gzFile, gzread);
9
+
10
+ void Init_seqtk_bindings();
11
+ static VALUE kseq_wrapper_allocate(VALUE klass);
12
+ static VALUE kseq_wrapper_initialize(VALUE klass, VALUE rb_filename);
13
+ static void kseq_wrapper_deallocate(void *seq);
14
+ static VALUE kseq_wrapper_read(VALUE self);
15
+
16
+ VALUE mBio;
17
+ VALUE cKseq;
18
+
19
+ typedef struct {
20
+ char from_io;
21
+ kseq_t *seq;
22
+ gzFile fp;
23
+ } Kseq_Wrapper;
24
+
25
/*
 * Generates the Ruby reader kseq_wrapper_<NAME>(self) for one kstring_t field
 * of the current record (name, comment, seq or qual).
 *
 * Returns nil when the parser has not been set up or the field is empty,
 * otherwise a new Ruby String holding the field. The string is built from the
 * length kseq already tracks, so no extra strlen() pass is needed.
 */
#define kseq_wrapper_field(NAME) \
  static VALUE kseq_wrapper_ ## NAME(VALUE self) { \
    Kseq_Wrapper *w; \
    Data_Get_Struct(self, Kseq_Wrapper, w); \
    if (w->seq == NULL || w->seq->NAME.l == 0) \
      return Qnil; \
    return rb_str_new(w->seq->NAME.s, (long)w->seq->NAME.l); \
  }
34
+
35
+ kseq_wrapper_field(name);
36
+ kseq_wrapper_field(comment);
37
+ kseq_wrapper_field(seq);
38
+ kseq_wrapper_field(qual);
39
+
40
+ void Init_seqtk_bindings() {
41
+ mBio = rb_define_module("Bio");
42
+ cKseq = rb_define_class_under(mBio, "Kseq", rb_cObject);
43
+ rb_define_alloc_func(cKseq, kseq_wrapper_allocate);
44
+ rb_define_method(cKseq, "initialize", kseq_wrapper_initialize, 1);
45
+
46
+ rb_define_method(cKseq, "read!", kseq_wrapper_read, 0);
47
+ rb_define_method(cKseq, "name", kseq_wrapper_name, 0);
48
+ rb_define_method(cKseq, "comment", kseq_wrapper_comment, 0);
49
+ rb_define_method(cKseq, "seq", kseq_wrapper_seq, 0);
50
+ rb_define_method(cKseq, "qual", kseq_wrapper_qual, 0);
51
+ }
52
+
53
+ static VALUE kseq_wrapper_allocate(VALUE klass) {
54
+ Kseq_Wrapper *w = malloc(sizeof(Kseq_Wrapper));
55
+
56
+ return Data_Wrap_Struct(klass, NULL, kseq_wrapper_deallocate, w);
57
+ }
58
+
59
+ static void kseq_wrapper_deallocate(void *p)
60
+ {
61
+ Kseq_Wrapper *w = p;
62
+ kseq_destroy(w->seq);
63
+ if (!(w->from_io))
64
+ gzclose(w->fp);
65
+ free(w);
66
+ }
67
+
68
+ static VALUE kseq_wrapper_read(VALUE self) {
69
+ int r;
70
+ Kseq_Wrapper *w;
71
+
72
+ Data_Get_Struct(self, Kseq_Wrapper, w);
73
+ r = kseq_read(w->seq);
74
+
75
+ return r >= 0 ? Qtrue : Qfalse;
76
+ }
77
+
78
+ static VALUE kseq_wrapper_initialize(VALUE self, VALUE value) {
79
+ Kseq_Wrapper *w;
80
+
81
+ Data_Get_Struct(self, Kseq_Wrapper, w);
82
+ w->from_io = 0;
83
+
84
+ switch (TYPE(value)) {
85
+ case T_STRING:
86
+ w->fp = gzopen(StringValuePtr(value), "r");
87
+ break;
88
+ case T_FILE:
89
+ w->fp = gzdopen(fileno(rb_io_stdio_file(RFILE(value)->fptr)), "r");
90
+ w->from_io = 1;
91
+ break;
92
+ default:
93
+ rb_raise(rb_eTypeError, "Only strings and IOs are supported");
94
+ break;
95
+ }
96
+
97
+ w->seq = kseq_init(w->fp);
98
+
99
+ return self;
100
+ }
@@ -0,0 +1,24 @@
1
+ require "seqtk_bindings"
2
+ require "bio/kseq/version"
3
+
4
module Bio
  # Ruby-side additions to Bio::Kseq, the reader implemented by the
  # seqtk_bindings native extension.
  class Kseq
    # Renders the current record as text.
    #
    # Records without a quality string are written as FASTA (">name comment",
    # then the sequence); records with one are written as FASTQ ("@name comment",
    # sequence, "+", quality). The comment is left out when there is none.
    # A missing field (nil) is rendered as an empty string.
    #
    # @return [String] the record without a trailing newline
    def to_s
      header = comment.nil? ? name.to_s : "#{name} #{comment}"

      if qual.nil?
        ">#{header}\n#{seq}"
      else
        "@#{header}\n#{seq}\n+\n#{qual}"
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Bio
  class Kseq
    # Version of the bio-kseq gem; read by the gemspec. Frozen so the
    # constant cannot be mutated in place.
    VERSION = "0.0.2".freeze
  end
end
@@ -0,0 +1,100 @@
1
+ require 'bio/kseq'
2
+ require 'tempfile'
3
+
4
RSpec.describe Bio::Kseq do
  # Two-record FASTA input shared by the path-based and the IO-based examples.
  fasta_lines = [
    ">A",
    "AAAATTTTCCCCGGGG",
    ">B comment",
    "GGGGTTTTCCCCAAAA"
  ]

  # Writes +lines+ to a temporary file, yields its path and removes the file
  # afterwards, even when an expectation inside the block fails.
  def with_fixture(lines)
    tmp = Tempfile.new 'fasta'
    lines.each { |line| tmp.puts line }
    tmp.close
    yield tmp.path
  ensure
    tmp.close! if tmp
  end

  # Advances +kseq+ by one record and checks every field of that record.
  # Pass nil for +comment+ or +qual+ when the record is expected to have none.
  def expect_record(kseq, name, comment, seq, qual)
    expect(kseq.read!).to be_truthy
    expect(kseq.name).to eq(name)
    expect(kseq.comment).to eq(comment)
    expect(kseq.seq).to eq(seq)
    expect(kseq.qual).to eq(qual)
  end

  # Checks that +kseq+ yields exactly the two records of the FASTA fixture.
  def expect_fasta_fixture(kseq)
    expect_record(kseq, "A", nil, "AAAATTTTCCCCGGGG", nil)
    expect_record(kseq, "B", "comment", "GGGGTTTTCCCCAAAA", nil)
    expect(kseq.read!).to be_falsey
  end

  it 'should parse simple FASTA files' do
    with_fixture(fasta_lines) do |path|
      expect_fasta_fixture(Bio::Kseq.new(path))
    end
  end

  it 'should parse simple FASTQ files' do
    fastq_lines = [
      "@A",
      "AAAATTTTCCCCGGGG",
      "+",
      "AAAAAAAAAAAAAAAA",
      "@B comment",
      "GGGGTTTTCCCCAAAA",
      "+",
      "IIIIIIIIIIIIIIII"
    ]

    with_fixture(fastq_lines) do |path|
      kseq = Bio::Kseq.new path

      expect_record(kseq, "A", nil, "AAAATTTTCCCCGGGG", "AAAAAAAAAAAAAAAA")
      expect_record(kseq, "B", "comment", "GGGGTTTTCCCCAAAA", "IIIIIIIIIIIIIIII")
      expect(kseq.read!).to be_falsey
    end
  end

  it 'should read from IO' do
    with_fixture(fasta_lines) do |path|
      # Block form so the file handle is closed once the example is done.
      File.open(path) do |io|
        expect_fasta_fixture(Bio::Kseq.new(io))
      end
    end
  end

  it 'should read comment' do
    fastq_lines = [
      "@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36",
      "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC",
      "+SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36",
      "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC"
    ]

    with_fixture(fastq_lines) do |path|
      kseq = Bio::Kseq.new path
      expect(kseq.read!).to be_truthy
      expect(kseq.comment).to eq("071112_SLXA-EAS1_s_7:5:1:817:345 length=36")
    end
  end
end
@@ -0,0 +1,7 @@
1
# Shared RSpec settings for the suite.
# `treat_symbols_as_metadata_keys_with_true_values` is intentionally not set:
# the gemspec requires RSpec >= 3.0.0, where it is deprecated and has no effect.
RSpec.configure do |config|
  # Run only examples tagged :focus, falling back to the whole suite when
  # nothing carries that tag.
  config.filter_run :focus
  config.run_all_when_everything_filtered = true

  # Randomise example order.
  config.order = 'random'
end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-kseq
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Fedor Gusev
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 3.0.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 3.0.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: A fast FASTA/FASTQ parser based on kseq.h by Heng Li
70
+ email:
71
+ - gusevfe@gmail.com
72
+ executables: []
73
+ extensions:
74
+ - ext/seqtk_bindings/extconf.rb
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - ".gitmodules"
79
+ - ".rspec"
80
+ - ".travis.yml"
81
+ - Gemfile
82
+ - LICENSE.txt
83
+ - README.md
84
+ - Rakefile
85
+ - bio-kseq.gemspec
86
+ - ext/seqtk_bindings/extconf.rb
87
+ - ext/seqtk_bindings/seqtk/kseq.h
88
+ - ext/seqtk_bindings/seqtk_bindings.c
89
+ - lib/bio/kseq.rb
90
+ - lib/bio/kseq/version.rb
91
+ - spec/kseq_spec.rb
92
+ - spec/spec_helper.rb
93
+ homepage: https://github.com/gusevfe/bio-kseq
94
+ licenses:
95
+ - MIT
96
+ metadata: {}
97
+ post_install_message:
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ - ext
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project:
114
+ rubygems_version: 2.2.2
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Ruby inferface for kseq.h by Heng Li for fast FASTA/Q reading
118
+ test_files:
119
+ - spec/kseq_spec.rb
120
+ - spec/spec_helper.rb