bio-faster 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "shoulda", ">= 0"
10
+ gem "bundler", "~> 1.0.0"
11
+ gem "jeweler", "~> 1.6.4"
12
+ gem "rcov", ">= 0"
13
+ gem "bio", ">= 1.4.2"
14
+ gem "rspec"
15
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ bio (1.4.2)
5
+ diff-lcs (1.1.3)
6
+ git (1.2.5)
7
+ jeweler (1.6.4)
8
+ bundler (~> 1.0)
9
+ git (>= 1.2.5)
10
+ rake
11
+ rake (0.9.2.2)
12
+ rcov (0.9.11)
13
+ rcov (0.9.11-java)
14
+ rspec (2.7.0)
15
+ rspec-core (~> 2.7.0)
16
+ rspec-expectations (~> 2.7.0)
17
+ rspec-mocks (~> 2.7.0)
18
+ rspec-core (2.7.1)
19
+ rspec-expectations (2.7.0)
20
+ diff-lcs (~> 1.1.2)
21
+ rspec-mocks (2.7.0)
22
+ shoulda (2.11.3)
23
+
24
+ PLATFORMS
25
+ java
26
+ ruby
27
+
28
+ DEPENDENCIES
29
+ bio (>= 1.4.2)
30
+ bundler (~> 1.0.0)
31
+ jeweler (~> 1.6.4)
32
+ rcov
33
+ rspec
34
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Francesco Strozzi
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = bio-faster
2
+
3
+ Fast and simple parser for FastA / FastQ files, based on Heng Li's kseq library, which is written in C.
4
+ http://lh3lh3.users.sourceforge.net/parsefastq.shtml
5
+
6
+ = Examples
7
+
8
+ See the wiki page.
9
+
10
+
11
+ == Contributing to bio-faster
12
+
13
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
14
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
15
+ * Fork the project
16
+ * Start a feature/bugfix branch
17
+ * Commit and push until you are happy with your contribution
18
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
19
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
20
+
21
+ == Copyright
22
+
23
+ Copyright (c) 2011 Francesco Strozzi. See LICENSE.txt for
24
+ further details.
25
+
data/Rakefile ADDED
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "bio-faster"
18
+ gem.homepage = "http://github.com/fstrozzi/bioruby-faster"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{A fast parser for Fasta and FastQ files}
21
+ gem.description = %Q{A fast parser for Fasta and FastQ files}
22
+ gem.email = "francesco.strozzi@gmail.com"
23
+ gem.authors = ["Francesco Strozzi"]
24
+ gem.required_ruby_version = '>= 1.9'
25
+ # dependencies defined in Gemfile
26
+ end
27
+ Jeweler::RubygemsDotOrgTasks.new
28
+
29
+ require 'rake/testtask'
30
+ Rake::TestTask.new(:test) do |test|
31
+ test.libs << 'lib' << 'test'
32
+ test.pattern = 'test/**/test_*.rb'
33
+ test.verbose = true
34
+ end
35
+
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/test_*.rb'
40
+ test.verbose = true
41
+ test.rcov_opts << '--exclude "gems/*"'
42
+ end
43
+
44
+ desc "Run all specs"
45
+ task :spec do
46
+ FileList['spec/**/*_spec.rb'].each do |spec|
47
+ sh "rspec #{spec}"
48
+ end
49
+ end
50
+
51
+ task :default => :test
52
+
53
+ require 'rake/rdoctask'
54
+ Rake::RDocTask.new do |rdoc|
55
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
56
+
57
+ rdoc.rdoc_dir = 'rdoc'
58
+ rdoc.title = "bio-faster #{version}"
59
+ rdoc.rdoc_files.include('README*')
60
+ rdoc.rdoc_files.include('lib/**/*.rb')
61
+ end
62
+
63
+ namespace :ext do
64
+ desc "Compile extension"
65
+ task :build do
66
+ puts "Building extension"
67
+ cd File.join(File.dirname(__FILE__),"ext")
68
+ sh "ruby "+File.join(File.dirname(__FILE__),"ext","extconf.rb")
69
+ sh "make"
70
+ FileList["*.log"].each do |file|
71
+ rm file
72
+ end
73
+ FileList["*.o"].each do |file|
74
+ rm file
75
+ end
76
+
77
+ end
78
+ end
79
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1,74 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "bio-faster"
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Francesco Strozzi"]
12
+ s.date = "2012-01-04"
13
+ s.description = "A fast parser for Fasta and FastQ files"
14
+ s.email = "francesco.strozzi@gmail.com"
15
+ s.extensions = ["ext/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE.txt",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".document",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.rdoc",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "bio-faster.gemspec",
29
+ "ext/extconf.rb",
30
+ "ext/faster.c",
31
+ "ext/kseq.h",
32
+ "lib/bio-faster.rb",
33
+ "spec/helper.rb",
34
+ "spec/parser_spec.rb",
35
+ "test/data/sample.fasta",
36
+ "test/data/sample.fastq",
37
+ "test/data/sample.fastq.gz",
38
+ "test/data/sff_sample.fastq"
39
+ ]
40
+ s.homepage = "http://github.com/fstrozzi/bioruby-faster"
41
+ s.licenses = ["MIT"]
42
+ s.require_paths = ["lib"]
43
+ s.required_ruby_version = Gem::Requirement.new(">= 1.9")
44
+ s.rubygems_version = "1.8.12"
45
+ s.summary = "A fast parser for Fasta and FastQ files"
46
+
47
+ if s.respond_to? :specification_version then
48
+ s.specification_version = 3
49
+
50
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
51
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
52
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
53
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
54
+ s.add_development_dependency(%q<rcov>, [">= 0"])
55
+ s.add_development_dependency(%q<bio>, [">= 1.4.2"])
56
+ s.add_development_dependency(%q<rspec>, [">= 0"])
57
+ else
58
+ s.add_dependency(%q<shoulda>, [">= 0"])
59
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
60
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
61
+ s.add_dependency(%q<rcov>, [">= 0"])
62
+ s.add_dependency(%q<bio>, [">= 1.4.2"])
63
+ s.add_dependency(%q<rspec>, [">= 0"])
64
+ end
65
+ else
66
+ s.add_dependency(%q<shoulda>, [">= 0"])
67
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
68
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
69
+ s.add_dependency(%q<rcov>, [">= 0"])
70
+ s.add_dependency(%q<bio>, [">= 1.4.2"])
71
+ s.add_dependency(%q<rspec>, [">= 0"])
72
+ end
73
+ end
74
+
data/ext/extconf.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'mkmf'
2
+ extension_name = "faster"
3
+ have_library("z")
4
+ create_makefile(extension_name)
5
+
6
+
7
+
data/ext/faster.c ADDED
@@ -0,0 +1,52 @@
1
+ /*
2
+ Copyright(C) 2011 Francesco Strozzi <francesco.strozzi@gmail.com>
3
+ */
4
+
5
+ #include <zlib.h>
6
+ #include <stdio.h>
7
+ #include "ruby.h"
8
+ #include "kseq.h"
9
+
10
+ KSEQ_INIT(gzFile, gzread)
11
+
12
+ /* Open gzip handle and kseq reader, shared by the parsing loop and its cleanup. */
13
+ typedef struct {
14
+ gzFile fp;
15
+ kseq_t *seq;
16
+ } faster_state_t;
17
+
18
+ /* Reads every record and yields it to the block as an array:
19
+ [name, comment (only when present), sequence, qualities (FastQ only)]. */
20
+ static VALUE parse_records(VALUE arg) {
21
+ faster_state_t *state = (faster_state_t *)arg;
22
+ kseq_t *seq = state->seq;
23
+ size_t i;
24
+ /* kseq_read returns a negative value at end of file or on a truncated quality string; either ends the loop. */
25
+ while (kseq_read(seq) >= 0) {
26
+ VALUE arr = rb_ary_new();
27
+ rb_ary_push(arr, rb_str_new2(seq->name.s));
28
+ if (seq->comment.l) rb_ary_push(arr, rb_str_new2(seq->comment.s));
29
+ rb_ary_push(arr, rb_str_new2(seq->seq.s));
30
+ if (seq->qual.l) {
31
+ VALUE rb_quality = rb_ary_new();
32
+ for (i = 0; i < seq->qual.l; i++) {
33
+ rb_ary_push(rb_quality,INT2FIX(seq->qual.s[i] - 33)); // quality conversion (Sanger/Phred only)
34
+ }
35
+ rb_ary_push(arr,rb_quality);
36
+ }
37
+ rb_yield(arr);
38
+ }
39
+ return Qtrue;
40
+ }
41
+
42
+ /* Frees the reader and closes the file; rb_ensure runs this even when the block raises or breaks. */
43
+ static VALUE release_state(VALUE arg) {
44
+ faster_state_t *state = (faster_state_t *)arg;
45
+ kseq_destroy(state->seq);
46
+ gzclose(state->fp);
47
+ return Qnil;
48
+ }
49
+
50
+ /* Bio::Faster.parse(file) { |record| ... }
51
+ Opens the file with gzopen and yields one array per FastA/FastQ record.
52
+ Returns true; raises ArgumentError without a block or when the file cannot be opened. */
53
+ static VALUE method_parse(VALUE self, VALUE file) {
54
+ faster_state_t state;
55
+ const char *path;
56
+
57
+ // check if a block is passed to the method
58
+ if (!(rb_block_given_p())) {
59
+ rb_raise(rb_eArgError,"You must pass a valid block!");
60
+ }
61
+
62
+ /* StringValueCStr type-checks the argument (TypeError for non-strings) instead of
63
+ reading an arbitrary VALUE as a char pointer. */
64
+ path = StringValueCStr(file);
65
+ state.fp = gzopen(path, "r");
66
+ if (!state.fp) {
67
+ rb_raise(rb_eArgError,"File %s not found!", path);
68
+ }
69
+ state.seq = kseq_init(state.fp);
70
+ return rb_ensure(parse_records, (VALUE)&state, release_state, (VALUE)&state);
71
+ }
72
+
73
+ void Init_faster() {
74
+ VALUE Bio = rb_define_module("Bio");
75
+ VALUE Faster = rb_define_module_under(Bio,"Faster"); // it is defined as a sub-module of Bio
76
+ rb_define_singleton_method(Faster,"parse",method_parse,1);
77
+ }
data/ext/kseq.h ADDED
@@ -0,0 +1,223 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ /* Last Modified: 12APR2009 */
29
+
30
+ #ifndef AC_KSEQ_H
31
+ #define AC_KSEQ_H
32
+
33
+ #include <ctype.h>
34
+ #include <string.h>
35
+ #include <stdlib.h>
36
+
37
+ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
38
+ #define KS_SEP_TAB 1 // isspace() && !' '
39
+ #define KS_SEP_MAX 1
40
+
41
+ #define __KS_TYPE(type_t) \
42
+ typedef struct __kstream_t { \
43
+ char *buf; \
44
+ int begin, end, is_eof; \
45
+ type_t f; \
46
+ } kstream_t;
47
+
48
+ #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
49
+ #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
50
+
51
+ #define __KS_BASIC(type_t, __bufsize) \
52
+ static inline kstream_t *ks_init(type_t f) \
53
+ { \
54
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
55
+ ks->f = f; \
56
+ ks->buf = (char*)malloc(__bufsize); \
57
+ return ks; \
58
+ } \
59
+ static inline void ks_destroy(kstream_t *ks) \
60
+ { \
61
+ if (ks) { \
62
+ free(ks->buf); \
63
+ free(ks); \
64
+ } \
65
+ }
66
+
67
+ #define __KS_GETC(__read, __bufsize) \
68
+ static inline int ks_getc(kstream_t *ks) \
69
+ { \
70
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
71
+ if (ks->begin >= ks->end) { /* buffer exhausted: refill it */ \
72
+ ks->begin = 0; \
73
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
74
+ if (ks->end < __bufsize) ks->is_eof = 1; /* short read: flag end of input */ \
75
+ if (ks->end <= 0) return -1; /* 0 = nothing left, negative = reader reported an error */ \
76
+ } \
77
+ return (int)(unsigned char)ks->buf[ks->begin++]; /* never negative, so a 0xFF byte is not mistaken for the -1 end marker */ \
78
+ }
79
+
80
+ #ifndef KSTRING_T
81
+ #define KSTRING_T kstring_t
82
+ typedef struct __kstring_t {
83
+ size_t l, m;
84
+ char *s;
85
+ } kstring_t;
86
+ #endif
87
+
88
+ #ifndef kroundup32
89
+ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
90
+ #endif
91
+
92
+ #define __KS_GETUNTIL(__read, __bufsize) \
93
+ static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
94
+ { \
95
+ if (dret) *dret = 0; \
96
+ str->l = 0; /* the token is rebuilt from scratch; the buffer in str->s is reused */ \
97
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
98
+ for (;;) { \
99
+ int i; \
100
+ if (ks->begin >= ks->end) { \
101
+ if (!ks->is_eof) { \
102
+ ks->begin = 0; \
103
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
104
+ if (ks->end < __bufsize) ks->is_eof = 1; \
105
+ if (ks->end <= 0) break; /* nothing read, or the reader reported an error */ \
106
+ } else break; \
107
+ } \
108
+ if (delimiter > KS_SEP_MAX) { \
109
+ for (i = ks->begin; i < ks->end; ++i) \
110
+ if (ks->buf[i] == delimiter) break; \
111
+ } else if (delimiter == KS_SEP_SPACE) { \
112
+ for (i = ks->begin; i < ks->end; ++i) \
113
+ if (isspace((unsigned char)ks->buf[i])) break; /* <ctype.h> functions need an unsigned char value */ \
114
+ } else if (delimiter == KS_SEP_TAB) { \
115
+ for (i = ks->begin; i < ks->end; ++i) \
116
+ if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
117
+ } else i = 0; /* never come to here! */ \
118
+ if (str->m - str->l < i - ks->begin + 1) { \
119
+ str->m = str->l + (i - ks->begin) + 1; \
120
+ kroundup32(str->m); \
121
+ str->s = (char*)realloc(str->s, str->m); \
122
+ } \
123
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
124
+ str->l = str->l + (i - ks->begin); \
125
+ ks->begin = i + 1; \
126
+ if (i < ks->end) { \
127
+ if (dret) *dret = ks->buf[i]; \
128
+ break; \
129
+ } \
130
+ } \
131
+ if (str->s == 0) { /* allocate only when no buffer exists yet, so an existing one is not leaked */ \
132
+ str->m = 1; \
133
+ str->s = (char*)calloc(1, 1); \
134
+ } \
135
+ str->s[str->l] = '\0'; \
136
+ return str->l; \
137
+ }
138
+
139
+ #define KSTREAM_INIT(type_t, __read, __bufsize) \
140
+ __KS_TYPE(type_t) \
141
+ __KS_BASIC(type_t, __bufsize) \
142
+ __KS_GETC(__read, __bufsize) \
143
+ __KS_GETUNTIL(__read, __bufsize)
144
+
145
+ #define __KSEQ_BASIC(type_t) \
146
+ static inline kseq_t *kseq_init(type_t fd) \
147
+ { \
148
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
149
+ s->f = ks_init(fd); \
150
+ return s; \
151
+ } \
152
+ static inline void kseq_rewind(kseq_t *ks) \
153
+ { \
154
+ ks->last_char = 0; \
155
+ ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
156
+ } \
157
+ static inline void kseq_destroy(kseq_t *ks) \
158
+ { \
159
+ if (!ks) return; \
160
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
161
+ ks_destroy(ks->f); \
162
+ free(ks); \
163
+ }
164
+
165
+ /* Return value:
166
+ >=0 length of the sequence (normal)
167
+ -1 end-of-file
168
+ -2 truncated quality string
169
+ */
170
+ #define __KSEQ_READ \
171
+ static int kseq_read(kseq_t *seq) \
172
+ { \
173
+ int c; \
174
+ kstream_t *ks = seq->f; \
175
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
176
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
177
+ if (c == -1) return -1; /* end of file */ \
178
+ seq->last_char = c; \
179
+ } /* the first header char has been read */ \
180
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; \
181
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
182
+ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
183
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
184
+ if (isgraph(c)) { /* printable non-space character */ \
185
+ if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
186
+ seq->seq.m = seq->seq.l + 2; \
187
+ kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
188
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
189
+ } \
190
+ seq->seq.s[seq->seq.l++] = (char)c; \
191
+ } \
192
+ } \
193
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
194
+ if (seq->seq.s == 0) { seq->seq.m = 1; seq->seq.s = (char*)calloc(1, 1); } else seq->seq.s[seq->seq.l] = 0; /* null terminated string; a record without sequence has no buffer yet, so allocate an empty one */ \
195
+ if (c != '+') return seq->seq.l; /* FASTA */ \
196
+ if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
197
+ seq->qual.m = seq->seq.m; \
198
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
199
+ } \
200
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
201
+ if (c == -1) return -2; /* we should not stop here */ \
202
+ while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
203
+ if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
204
+ seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
205
+ seq->last_char = 0; /* we have not come to the next header line */ \
206
+ if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
207
+ return seq->seq.l; \
208
+ }
209
+
210
+ #define __KSEQ_TYPE(type_t) \
211
+ typedef struct { \
212
+ kstring_t name, comment, seq, qual; \
213
+ int last_char; \
214
+ kstream_t *f; \
215
+ } kseq_t;
216
+
217
+ #define KSEQ_INIT(type_t, __read) \
218
+ KSTREAM_INIT(type_t, __read, 4096) \
219
+ __KSEQ_TYPE(type_t) \
220
+ __KSEQ_BASIC(type_t) \
221
+ __KSEQ_READ
222
+
223
+ #endif
data/lib/bio-faster.rb ADDED
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright(C) 2011 Francesco Strozzi <francesco.strozzi@gmail.com>
3
+ #
4
+
5
+ require File.expand_path('../ext/faster', File.dirname(__FILE__)) # compiled extension lives in ext/, next to lib/
6
+
data/spec/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+
11
+ require 'rspec'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ TEST_DATA = File.join(File.dirname(File.dirname(__FILE__)),"test","data")
16
+ require 'bio-faster'
17
+ require 'bio'
18
+
@@ -0,0 +1,117 @@
1
+ require 'helper'
2
+
3
+ describe Bio::Faster do
4
+
5
+ describe "#parser" do
6
+
7
+ it "reads both Fasta and FastQ format files" do
8
+ res = Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) {|seq|}
9
+ res.should == true
10
+ res = Bio::Faster.parse(File.join(TEST_DATA,"sample.fasta")) {|seq|}
11
+ res.should == true
12
+ end
13
+
14
+ it "needs a valid block to parse files" do
15
+ expect { Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) }.to raise_error(ArgumentError)
16
+ end
17
+
18
+ it "throws an error if file does not exists" do
19
+ expect { Bio::Faster.parse(File.join(TEST_DATA,"dummy.fastq")) {|seq|} }.to raise_error(ArgumentError) # a block is passed so the error comes from the missing file, not from the missing block
20
+ end
21
+
22
+ it "returns an array with id, comment, sequence and an array with qualities (FastQ only)" do
23
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) do |seq|
24
+ seq.class.should == Array
25
+ seq.size.should == 4
26
+ seq[-1].class.should == Array
27
+ end
28
+
29
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fasta")) do |seq|
30
+ seq.class.should == Array
31
+ seq.size.should == 3
32
+ end
33
+
34
+ end
35
+
36
+ it "reads gzipped files" do
37
+ res = Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) {|seq|}
38
+ res.should == true
39
+ end
40
+
41
+ it "parse correctly FastQ files" do
42
+ faster_res = []
43
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) {|seq| faster_res << seq}
44
+ faster_res[0][0].should == "HISEQ1:86:D0306ACXX:2:1101:20970:17588"
45
+ faster_res[0][1].should == "1:N:0:CTTGTA"
46
+ faster_res[0][2].should == "CGGTGCTGTTGTTATGCTGATGCTTATTAGTGCAAGTGTAGCTCCTCCGATTAGATGAATTAACAGGTGTCCTGCAGTAATGTTGGCTGTTAGTCGTAC"
47
+
48
+ faster_res[-1][0].should == "HISEQ1:86:D0306ACXX:2:1101:1411:17830"
49
+ faster_res[-1][1].should == "1:Y:0:CTTGTA"
50
+ faster_res[-1][2].should == "CGGCGGGCGTGGGGAGAGAGCTATGAAGGCCTCAGGGAAGCTTCGAGAGTATAAGGTGTTGGGGTGCTGCCTGCCAACCCCCAAATTCCACACACCACC"
51
+ end
52
+
53
+ it "parse correctly compressed FastQ files" do
54
+ faster_res = []
55
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq.gz")) {|seq| faster_res << seq}
56
+ faster_res[0][0].should == "HISEQ1:86:D0306ACXX:2:1101:20970:17588"
57
+ faster_res[0][1].should == "1:N:0:CTTGTA"
58
+ faster_res[0][2].should == "CGGTGCTGTTGTTATGCTGATGCTTATTAGTGCAAGTGTAGCTCCTCCGATTAGATGAATTAACAGGTGTCCTGCAGTAATGTTGGCTGTTAGTCGTAC"
59
+
60
+ faster_res[-1][0].should == "HISEQ1:86:D0306ACXX:2:1101:1411:17830"
61
+ faster_res[-1][1].should == "1:Y:0:CTTGTA"
62
+ faster_res[-1][2].should == "CGGCGGGCGTGGGGAGAGAGCTATGAAGGCCTCAGGGAAGCTTCGAGAGTATAAGGTGTTGGGGTGCTGCCTGCCAACCCCCAAATTCCACACACCACC"
63
+ end
64
+
65
+
66
+ it "parse correctly Fasta files" do
67
+ faster_res = []
68
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fasta")) {|seq| faster_res << seq}
69
+ faster_res[0][0].should == "seq1"
70
+ faster_res[0][1].should == "comment1"
71
+ faster_res[0][2].should == "AGCAATTTCCCTTTTCCTGTCCTTTTTATAACATTGTGGAGGAAGACGGCAGCATAAAAAGGACAGTATTTGATTAAAAAATGATAAAAATTTTCAAAC"
72
+
73
+ faster_res[-1][0].should == "seq4"
74
+ faster_res[-1][1].should == "comment4"
75
+ faster_res[-1][2].should == "mgltrrealssiaavggekalkdalavlggps"
76
+ end
77
+
78
+
79
+ describe "quality conversion for FastQ files (Sanger/Phred only)" do
80
+
81
+ it "converts directly quality scores for Illumina 1.8+ FastQ files" do
82
+
83
+ bioruby_quals = []
84
+ # standard Quality conversion as done in BioRuby Bio::FastQ
85
+ Bio::FlatFile.open(Bio::Fastq,File.open(File.join(TEST_DATA,"sample.fastq"))).each_entry do |seq|
86
+ bioruby_quals << seq.qualities
87
+ end
88
+
89
+ faster_quals = []
90
+ Bio::Faster.parse(File.join(TEST_DATA,"sample.fastq")) do |seq|
91
+ faster_quals << seq[-1]
92
+ end
93
+ faster_quals.should == bioruby_quals
94
+
95
+ end
96
+
97
+ it "converts directly quality scores for SFF 454 FastQ files" do
98
+
99
+ bioruby_quals = []
100
+ # standard Quality conversion as done in BioRuby Bio::FastQ
101
+ Bio::FlatFile.open(Bio::Fastq,File.open(File.join(TEST_DATA,"sff_sample.fastq"))).each_entry do |seq|
102
+ bioruby_quals << seq.qualities
103
+ end
104
+
105
+ faster_quals = []
106
+ Bio::Faster.parse(File.join(TEST_DATA,"sff_sample.fastq")) do |seq|
107
+ faster_quals << seq[-1]
108
+ end
109
+ faster_quals.should == bioruby_quals
110
+
111
+ end
112
+
113
+ end
114
+
115
+ end
116
+
117
+ end
@@ -0,0 +1,10 @@
1
+ >seq1 comment1
2
+ AGCAATTTCCCTTTTCCTGTCCTTTTTATAACATTGTGGAGGAAGACGGCAGCATAAAAAGGACAGTATTTGATTAAAAAATGATAAAAATTTTCAAAC
3
+ >seq2 comment 2
4
+ GTGGGGCCAAAGGGGTTTGGAGGTGCCTTGTTCTTAGTCCCCAGAAGACTAGAGAGACTGCGTTTCAGGGAGGAGGAGATAAGACGAGCAGGAGACTTC
5
+
6
+ >seq3 comment3
7
+ CTCATAGACACGGTCCGAGGAGCCAAACACCAAGCTGTTGGGGAAGACTCGGCTGAGGAACTGCAGGGGCCCAAGCCACGACTGGATGAGGAGCAGTGA
8
+
9
+ >seq4 comment4
10
+ mgltrrealssiaavggekalkdalavlggps
@@ -0,0 +1,24 @@
1
+ @HISEQ1:86:D0306ACXX:2:1101:20970:17588 1:N:0:CTTGTA
2
+ CGGTGCTGTTGTTATGCTGATGCTTATTAGTGCAAGTGTAGCTCCTCCGATTAGATGAATTAACAGGTGTCCTGCAGTAATGTTGGCTGTTAGTCGTAC
3
+ +
4
+ @C@:DDFFHGHHHIJJJJIGIJIJJCHIGGHGIIJJGGHGIJIGJJIJJGGIIBGGCFIAFGF4CGI;AHIFGIEHEEH>EDFFEFFD@AECCDCBDD<
5
+ @HISEQ1:86:D0306ACXX:2:1101:20839:17705 1:N:0:CTTGTA
6
+ AGCAATTTCCCTTTTCCTGTCCTTTTTATAACATTGTGGAGGAAGACGGCAGCATAAAAAGGACAGTATTTGATTAAAAAATGATAAAAATTTTCAAAC
7
+ +
8
+ @@<DFFFFHBDHHJEAFHIJJEHIIIJGHGIGEHGEEGGCGGDG>@GB:@DBGGIIGBGEHHIGFE:@A?B>?>;>>>@;>BCA@>ACCC?:A>;>>@3
9
+ @HISEQ1:86:D0306ACXX:2:1101:21047:17715 1:Y:0:CTTGTA
10
+ GTTAAGAATCTGCCTGCCAATGCAGGAGATGCAAGAGATGCAAGAGACGTGGGTTCCATAGCTGGGTCAGGAAAATTACCTGAAGGAGGACATATCAAA
11
+ +
12
+ =7+2?@++?4?7A7+++<+++3+3<AA<;3A@A9AA###############################################################
13
+ @HISEQ1:86:D0306ACXX:2:1101:21299:17513 1:N:0:CTTGTA
14
+ GTAGCACACTTTCACTATGTCCTATCAATAGGAGCTGTGTTCGCCATTATAGGAGGATTTGTAAATTGATTTCCACTGTTCTCATACTATAACCTAAAA
15
+ +
16
+ 8:?;=;B?;B>+A,<AEACEDE@AFF,A:99CEF@E*1??:CDII:?@*9?@*9D:BCB@.88>AAAA)==4==7=C;==);@################
17
+ @HISEQ1:86:D0306ACXX:2:1101:1352:17782 1:N:0:CTTGTA
18
+ AATAATTGTTTGCATTGCCTTTTATATATATTTATATATATATATAAAACATGGGTCTTGGTTTTTTGATTTATTAGTGTGAAGAAATAACTACATTCT
19
+ +
20
+ @<?DABADHGFA<,CECFIHGIIDEGG@DHGDGI<<CHEBCEGIIIGCD<CFGC=09?<F@GEHIDHI=A:C;EH>3?EE;?C@@DDFA>@>@>C3;AC
21
+ @HISEQ1:86:D0306ACXX:2:1101:1411:17830 1:Y:0:CTTGTA
22
+ CGGCGGGCGTGGGGAGAGAGCTATGAAGGCCTCAGGGAAGCTTCGAGAGTATAAGGTGTTGGGGTGCTGCCTGCCAACCCCCAAATTCCACACACCACC
23
+ +
24
+ ###################################################################################################
Binary file
@@ -0,0 +1,16 @@
1
+ @SRR014849.1 EIXKN4201CFU84/1
2
+ GGGGGGGGGGGGGGGGCTTTTTTTGTTTGGAACCGAAAGGGTTTTGAATTTCAAACCCTTTTCGGTTTCCAACCTTCCAAAGCAATGCCAATACTGAGCGGGCTGGCAAGGCNNNNNNNNNNNN
3
+ +
4
+ 3+&$#"""""""""""7F@71,'";C?,B;?6B;:EA1EA1EA5'9B:?:#9EA0D@2EA5':>5?:%A;A8A;?9B;D@/=<?7=9<2A8==<=5<6:?:$::9.;:0:/3!!!!!!!!!!!!
5
+ @SRR014849.2 EIXKN4201AKDUH/1
6
+ TCAAGTGGTGAACGGCAGAAA
7
+ +
8
+ <=B:==B:=<?6=B;<;=B=)
9
+ @SRR014849.3 EIXKN4201D4ZBL/1
10
+ GGGGGGGGGCTGTTGGCCGAGGTTGGAGTAGCCAGGGGGAAGGCATGGCCAGCCGTTGAGAAATGCTTGTTGAAGTTTTCGATAATAATGGATTTATCGGTGGTGACCGTGTTACCTAGCC
11
+ +
12
+ ;3.*(&$"";<=A9@8A9;<B;B;B;8=<==B;<FB8/'@8B:==<B;A9<<A8=B;==;A=)=<<B;=A9<@7<FB5(<<=<B;<B;:A9=EA0;<;B:<A8=<<@8<<<B;<A99=<B:
13
+ @SRR014849.4 EIXKN4201AHFLR/1
14
+ GGGGTTCAAGAATATGCCCC
15
+ +
16
+ B8/&?9<B;=B;=<9<FB5(
metadata ADDED
@@ -0,0 +1,132 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-faster
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Francesco Strozzi
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: shoulda
16
+ requirement: &2154318860 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *2154318860
25
+ - !ruby/object:Gem::Dependency
26
+ name: bundler
27
+ requirement: &2154318380 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.0.0
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *2154318380
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &2154317900 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 1.6.4
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *2154317900
47
+ - !ruby/object:Gem::Dependency
48
+ name: rcov
49
+ requirement: &2154317360 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *2154317360
58
+ - !ruby/object:Gem::Dependency
59
+ name: bio
60
+ requirement: &2154316860 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: 1.4.2
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *2154316860
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &2154316360 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *2154316360
80
+ description: A fast parser for Fasta and FastQ files
81
+ email: francesco.strozzi@gmail.com
82
+ executables: []
83
+ extensions:
84
+ - ext/extconf.rb
85
+ extra_rdoc_files:
86
+ - LICENSE.txt
87
+ - README.rdoc
88
+ files:
89
+ - .document
90
+ - Gemfile
91
+ - Gemfile.lock
92
+ - LICENSE.txt
93
+ - README.rdoc
94
+ - Rakefile
95
+ - VERSION
96
+ - bio-faster.gemspec
97
+ - ext/extconf.rb
98
+ - ext/faster.c
99
+ - ext/kseq.h
100
+ - lib/bio-faster.rb
101
+ - spec/helper.rb
102
+ - spec/parser_spec.rb
103
+ - test/data/sample.fasta
104
+ - test/data/sample.fastq
105
+ - test/data/sample.fastq.gz
106
+ - test/data/sff_sample.fastq
107
+ homepage: http://github.com/fstrozzi/bioruby-faster
108
+ licenses:
109
+ - MIT
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '1.9'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ requirements: []
127
+ rubyforge_project:
128
+ rubygems_version: 1.8.12
129
+ signing_key:
130
+ specification_version: 3
131
+ summary: A fast parser for Fasta and FastQ files
132
+ test_files: []