bio-faster 0.2.2 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +2 -3
- data/Gemfile +2 -0
- data/Gemfile.lock +2 -0
- data/LICENSE.txt +1 -1
- data/README.md +2 -3
- data/Rakefile +9 -15
- data/VERSION +1 -1
- data/bio-faster.gemspec +56 -12
- data/ext/faster.c +115 -52
- data/ext/mkrf_conf.rb +40 -0
- data/lib/bio-faster.rb +5 -4
- data/lib/bio/faster.rb +57 -0
- data/lib/bio/faster/library.rb +26 -0
- data/spec/fastq_error_spec.rb +55 -0
- data/spec/parser_spec.rb +28 -116
- data/test/data/errors/error_header.fastq +20 -0
- data/test/data/errors/error_long_qual.fastq +20 -0
- data/test/data/errors/error_qual_del.fastq +20 -0
- data/test/data/errors/error_qual_escape.fastq +20 -0
- data/test/data/errors/error_qual_null.fastq +0 -0
- data/test/data/errors/error_qual_space.fastq +20 -0
- data/test/data/errors/error_qual_tab.fastq +20 -0
- data/test/data/errors/error_qual_unit_sep.fastq +20 -0
- data/test/data/errors/error_qual_vtab.fastq +20 -0
- data/test/data/errors/error_spaces.fastq +20 -0
- data/test/data/errors/error_tabs.fastq +21 -0
- data/test/data/errors/error_trunc_at_qual.fastq +19 -0
- data/test/data/errors/error_trunc_at_seq.fastq +18 -0
- data/test/data/errors/error_trunc_in_qual.fastq +20 -0
- data/test/data/errors/error_trunc_in_seq.fastq +18 -0
- data/test/data/formats/illumina_full_range_as_illumina.fastq +8 -0
- data/test/data/formats/illumina_full_range_as_sanger.fastq +8 -0
- data/test/data/formats/illumina_full_range_as_solexa.fastq +8 -0
- data/test/data/formats/illumina_full_range_original_illumina.fastq +8 -0
- data/test/data/formats/longreads_as_illumina.fastq +40 -0
- data/test/data/formats/longreads_as_sanger.fastq +40 -0
- data/test/data/formats/longreads_as_solexa.fastq +40 -0
- data/test/data/formats/misc_dna_as_illumina.fastq +16 -0
- data/test/data/formats/misc_dna_as_sanger.fastq +16 -0
- data/test/data/formats/misc_dna_as_solexa.fastq +16 -0
- data/test/data/formats/misc_dna_original_sanger.fastq +16 -0
- data/test/data/formats/misc_rna_as_illumina.fastq +16 -0
- data/test/data/formats/misc_rna_as_sanger.fastq +16 -0
- data/test/data/formats/misc_rna_as_solexa.fastq +16 -0
- data/test/data/formats/misc_rna_original_sanger.fastq +16 -0
- data/test/data/formats/sanger_full_range_as_illumina.fastq +8 -0
- data/test/data/formats/sanger_full_range_as_sanger.fastq +8 -0
- data/test/data/formats/sanger_full_range_as_solexa.fastq +8 -0
- data/test/data/formats/sanger_full_range_original_sanger.fastq +8 -0
- data/test/data/formats/solexa_full_range_as_illumina.fastq +8 -0
- data/test/data/formats/solexa_full_range_as_sanger.fastq +8 -0
- data/test/data/formats/solexa_full_range_as_solexa.fastq +8 -0
- data/test/data/formats/solexa_full_range_original_solexa.fastq +8 -0
- data/test/data/formats/wrapping_as_illumina.fastq +12 -0
- data/test/data/formats/wrapping_as_sanger.fastq +12 -0
- data/test/data/formats/wrapping_as_solexa.fastq +12 -0
- metadata +88 -24
- data/ext/extconf.rb +0 -7
- data/ext/kseq.h +0 -223
- data/test/data/sample.fasta +0 -10
- data/test/data/sample.fastq +0 -24
- data/test/data/sample.fastq.gz +0 -0
- data/test/data/sff_sample.fastq +0 -16
data/.travis.yml
CHANGED
@@ -3,10 +3,9 @@ rvm:
|
|
3
3
|
- 1.8.7
|
4
4
|
- 1.9.2
|
5
5
|
- 1.9.3
|
6
|
-
|
7
|
-
|
6
|
+
- jruby-18mode # JRuby in 1.8 mode
|
7
|
+
- jruby-19mode # JRuby in 1.9 mode
|
8
8
|
# - rbx-18mode
|
9
9
|
# - rbx-19mode
|
10
10
|
# uncomment this line if your project needs to run something other than `rake`:
|
11
|
-
#script: bundle exec rake ext:build
|
12
11
|
#script: bundle exec rake
|
data/Gemfile
CHANGED
@@ -2,6 +2,7 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
+
gem "ffi"
|
5
6
|
|
6
7
|
# Add dependencies to develop your gem here.
|
7
8
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -12,4 +13,5 @@ group :development do
|
|
12
13
|
gem "rcov", ">= 0"
|
13
14
|
gem "bio", ">= 1.4.2"
|
14
15
|
gem "rspec"
|
16
|
+
gem "ffi"
|
15
17
|
end
|
data/Gemfile.lock
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -3,8 +3,7 @@
|
|
3
3
|
Bio::Faster
|
4
4
|
==========
|
5
5
|
|
6
|
-
Fast and simple parser for
|
7
|
-
http://lh3lh3.users.sourceforge.net/parsefastq.shtml
|
6
|
+
Fast and simple parser for FastQ files
|
8
7
|
|
9
8
|
Examples
|
10
9
|
========
|
@@ -28,6 +27,6 @@ Contributing to bio-faster
|
|
28
27
|
Copyright
|
29
28
|
=========
|
30
29
|
|
31
|
-
Copyright (c)
|
30
|
+
Copyright (c) 2012 Francesco Strozzi. See LICENSE.txt for
|
32
31
|
further details.
|
33
32
|
|
data/Rakefile
CHANGED
@@ -17,11 +17,11 @@ Jeweler::Tasks.new do |gem|
|
|
17
17
|
gem.name = "bio-faster"
|
18
18
|
gem.homepage = "http://github.com/fstrozzi/bioruby-faster"
|
19
19
|
gem.license = "MIT"
|
20
|
-
gem.summary = %Q{A fast parser for
|
21
|
-
gem.description = %Q{A fast parser for
|
20
|
+
gem.summary = %Q{A fast parser for FastQ files}
|
21
|
+
gem.description = %Q{A fast parser for FastQ files}
|
22
22
|
gem.email = "francesco.strozzi@gmail.com"
|
23
23
|
gem.authors = ["Francesco Strozzi"]
|
24
|
-
gem.
|
24
|
+
gem.files << "lib/*/**"
|
25
25
|
# dependencies defined in Gemfile
|
26
26
|
end
|
27
27
|
Jeweler::RubygemsDotOrgTasks.new
|
@@ -58,20 +58,14 @@ Rake::RDocTask.new do |rdoc|
|
|
58
58
|
end
|
59
59
|
|
60
60
|
namespace :ext do
|
61
|
-
desc "
|
61
|
+
desc "Build native extension"
|
62
62
|
task :build do
|
63
|
-
|
64
|
-
|
65
|
-
sh "
|
66
|
-
|
67
|
-
FileList["*.log"].each do |file|
|
68
|
-
rm file
|
69
|
-
end
|
70
|
-
FileList["*.o"].each do |file|
|
71
|
-
rm file
|
72
|
-
end
|
73
|
-
cd ".."
|
63
|
+
cd "ext"
|
64
|
+
ruby "mkrf_conf.rb"
|
65
|
+
sh "rake"
|
66
|
+
cd ".."
|
74
67
|
end
|
68
|
+
|
75
69
|
end
|
76
70
|
|
77
71
|
task :default => ["ext:build",:spec]
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.2
|
data/bio-faster.gemspec
CHANGED
@@ -5,14 +5,14 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-faster"
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.4.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Francesco Strozzi"]
|
12
|
-
s.date = "2012-04-
|
13
|
-
s.description = "A fast parser for
|
12
|
+
s.date = "2012-04-27"
|
13
|
+
s.description = "A fast parser for FastQ files"
|
14
14
|
s.email = "francesco.strozzi@gmail.com"
|
15
|
-
s.extensions = ["ext/
|
15
|
+
s.extensions = ["ext/mkrf_conf.rb"]
|
16
16
|
s.extra_rdoc_files = [
|
17
17
|
"LICENSE.txt",
|
18
18
|
"README.md"
|
@@ -27,49 +27,93 @@ Gem::Specification.new do |s|
|
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"bio-faster.gemspec",
|
30
|
-
"ext/extconf.rb",
|
31
30
|
"ext/faster.c",
|
32
|
-
"ext/
|
31
|
+
"ext/mkrf_conf.rb",
|
33
32
|
"lib/bio-faster.rb",
|
33
|
+
"lib/bio/faster.rb",
|
34
|
+
"lib/bio/faster/library.rb",
|
35
|
+
"spec/fastq_error_spec.rb",
|
34
36
|
"spec/helper.rb",
|
35
37
|
"spec/parser_spec.rb",
|
36
|
-
"test/data/
|
37
|
-
"test/data/
|
38
|
-
"test/data/
|
39
|
-
"test/data/
|
38
|
+
"test/data/errors/error_header.fastq",
|
39
|
+
"test/data/errors/error_long_qual.fastq",
|
40
|
+
"test/data/errors/error_qual_del.fastq",
|
41
|
+
"test/data/errors/error_qual_escape.fastq",
|
42
|
+
"test/data/errors/error_qual_null.fastq",
|
43
|
+
"test/data/errors/error_qual_space.fastq",
|
44
|
+
"test/data/errors/error_qual_tab.fastq",
|
45
|
+
"test/data/errors/error_qual_unit_sep.fastq",
|
46
|
+
"test/data/errors/error_qual_vtab.fastq",
|
47
|
+
"test/data/errors/error_spaces.fastq",
|
48
|
+
"test/data/errors/error_tabs.fastq",
|
49
|
+
"test/data/errors/error_trunc_at_qual.fastq",
|
50
|
+
"test/data/errors/error_trunc_at_seq.fastq",
|
51
|
+
"test/data/errors/error_trunc_in_qual.fastq",
|
52
|
+
"test/data/errors/error_trunc_in_seq.fastq",
|
53
|
+
"test/data/formats/illumina_full_range_as_illumina.fastq",
|
54
|
+
"test/data/formats/illumina_full_range_as_sanger.fastq",
|
55
|
+
"test/data/formats/illumina_full_range_as_solexa.fastq",
|
56
|
+
"test/data/formats/illumina_full_range_original_illumina.fastq",
|
57
|
+
"test/data/formats/longreads_as_illumina.fastq",
|
58
|
+
"test/data/formats/longreads_as_sanger.fastq",
|
59
|
+
"test/data/formats/longreads_as_solexa.fastq",
|
60
|
+
"test/data/formats/misc_dna_as_illumina.fastq",
|
61
|
+
"test/data/formats/misc_dna_as_sanger.fastq",
|
62
|
+
"test/data/formats/misc_dna_as_solexa.fastq",
|
63
|
+
"test/data/formats/misc_dna_original_sanger.fastq",
|
64
|
+
"test/data/formats/misc_rna_as_illumina.fastq",
|
65
|
+
"test/data/formats/misc_rna_as_sanger.fastq",
|
66
|
+
"test/data/formats/misc_rna_as_solexa.fastq",
|
67
|
+
"test/data/formats/misc_rna_original_sanger.fastq",
|
68
|
+
"test/data/formats/sanger_full_range_as_illumina.fastq",
|
69
|
+
"test/data/formats/sanger_full_range_as_sanger.fastq",
|
70
|
+
"test/data/formats/sanger_full_range_as_solexa.fastq",
|
71
|
+
"test/data/formats/sanger_full_range_original_sanger.fastq",
|
72
|
+
"test/data/formats/solexa_full_range_as_illumina.fastq",
|
73
|
+
"test/data/formats/solexa_full_range_as_sanger.fastq",
|
74
|
+
"test/data/formats/solexa_full_range_as_solexa.fastq",
|
75
|
+
"test/data/formats/solexa_full_range_original_solexa.fastq",
|
76
|
+
"test/data/formats/wrapping_as_illumina.fastq",
|
77
|
+
"test/data/formats/wrapping_as_sanger.fastq",
|
78
|
+
"test/data/formats/wrapping_as_solexa.fastq"
|
40
79
|
]
|
41
80
|
s.homepage = "http://github.com/fstrozzi/bioruby-faster"
|
42
81
|
s.licenses = ["MIT"]
|
43
82
|
s.require_paths = ["lib"]
|
44
|
-
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
45
83
|
s.rubygems_version = "1.8.15"
|
46
|
-
s.summary = "A fast parser for
|
84
|
+
s.summary = "A fast parser for FastQ files"
|
47
85
|
|
48
86
|
if s.respond_to? :specification_version then
|
49
87
|
s.specification_version = 3
|
50
88
|
|
51
89
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
90
|
+
s.add_runtime_dependency(%q<ffi>, [">= 0"])
|
52
91
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
53
92
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
54
93
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
55
94
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
56
95
|
s.add_development_dependency(%q<bio>, [">= 1.4.2"])
|
57
96
|
s.add_development_dependency(%q<rspec>, [">= 0"])
|
97
|
+
s.add_development_dependency(%q<ffi>, [">= 0"])
|
58
98
|
else
|
99
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
59
100
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
60
101
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
61
102
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
62
103
|
s.add_dependency(%q<rcov>, [">= 0"])
|
63
104
|
s.add_dependency(%q<bio>, [">= 1.4.2"])
|
64
105
|
s.add_dependency(%q<rspec>, [">= 0"])
|
106
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
65
107
|
end
|
66
108
|
else
|
109
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
67
110
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
68
111
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
69
112
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
70
113
|
s.add_dependency(%q<rcov>, [">= 0"])
|
71
114
|
s.add_dependency(%q<bio>, [">= 1.4.2"])
|
72
115
|
s.add_dependency(%q<rspec>, [">= 0"])
|
116
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
73
117
|
end
|
74
118
|
end
|
75
119
|
|
data/ext/faster.c
CHANGED
@@ -1,57 +1,120 @@
|
|
1
|
+
|
1
2
|
/*
|
2
|
-
Copyright(C)
|
3
|
-
*/
|
3
|
+
Copyright(C) 2012 Francesco Strozzi <francesco.strozzi@gmail.com>
|
4
4
|
|
5
|
-
|
5
|
+
*/
|
6
6
|
#include <stdio.h>
|
7
|
-
#include
|
8
|
-
#include
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
return Qtrue;
|
50
|
-
}
|
7
|
+
#include <string.h>
|
8
|
+
#include <stdlib.h>
|
9
|
+
|
10
|
+
#define _BSIZE 100000
|
11
|
+
|
12
|
+
typedef struct {
|
13
|
+
char *id;
|
14
|
+
char *seq;
|
15
|
+
int *quality;
|
16
|
+
char *raw_quality;
|
17
|
+
char *filename;
|
18
|
+
char *line;
|
19
|
+
char *bad_chars;
|
20
|
+
FILE *stream;
|
21
|
+
|
22
|
+
}FastQRecord;
|
23
|
+
|
24
|
+
|
25
|
+
static char* alloc_and_copy(char *dst, char *src) {
|
26
|
+
if (dst==NULL || strlen(dst)<strlen(src)) {
|
27
|
+
if (dst!=NULL)
|
28
|
+
free(dst);
|
29
|
+
dst= malloc(sizeof (char)*(strlen(src)+1));
|
30
|
+
}
|
31
|
+
strcpy(dst, src);
|
32
|
+
int len;
|
33
|
+
len = strlen(dst);
|
34
|
+
if (dst[len-1] == '\n') dst[len-1] = '\0';
|
35
|
+
return dst;
|
36
|
+
}
|
37
|
+
|
38
|
+
|
39
|
+
const char* check_bad_chars(char *invalid_chars, char *string_to_check) {
|
40
|
+
return strpbrk(string_to_check, invalid_chars);
|
41
|
+
}
|
42
|
+
|
43
|
+
static char* initialize(char *ptr) {
|
44
|
+
if(ptr!=NULL){
|
45
|
+
free(ptr);
|
46
|
+
ptr = NULL;
|
47
|
+
}
|
48
|
+
return ptr;
|
51
49
|
}
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
50
|
+
|
51
|
+
int* initialize_int(int *ptr) {
|
52
|
+
if(ptr!=NULL){
|
53
|
+
free(ptr);
|
54
|
+
ptr = NULL;
|
55
|
+
}
|
56
|
+
return ptr;
|
57
|
+
}
|
58
|
+
|
59
|
+
int check_header(char *header, char *firstline) {
|
60
|
+
if (*header == *firstline)
|
61
|
+
return 1;
|
62
|
+
else {
|
63
|
+
return 0;
|
64
|
+
}
|
57
65
|
}
|
66
|
+
|
67
|
+
int fastQ_iterator(FastQRecord *seq, int scale_factor) {
|
68
|
+
// initialization of structure elements.
|
69
|
+
char *header = "@"; // FastQ header
|
70
|
+
if (!seq->stream)
|
71
|
+
seq->stream = fopen(seq->filename,"r");
|
72
|
+
if (!seq->line)
|
73
|
+
seq->line = malloc(sizeof (char)* _BSIZE);
|
74
|
+
if (!seq->bad_chars)
|
75
|
+
seq->bad_chars = " \x1F\x7F\t\v\e";
|
76
|
+
|
77
|
+
// this is done to wipe out data from previous iteration
|
78
|
+
seq->id = initialize(seq->id);
|
79
|
+
seq->seq = initialize(seq->seq);
|
80
|
+
seq->raw_quality = initialize(seq->raw_quality);
|
81
|
+
for (int i = 0; i < 4; i++)
|
82
|
+
{
|
83
|
+
if (fgets(seq->line, _BSIZE, seq->stream) == NULL) {
|
84
|
+
// if either sequence or quality is missing the record is truncated
|
85
|
+
if((seq->seq != NULL && seq->raw_quality == NULL) || (seq->raw_quality != NULL && seq->seq == NULL)) return -2;
|
86
|
+
else return 0;
|
87
|
+
}
|
88
|
+
// getting seq ID
|
89
|
+
if (i==0) {
|
90
|
+
if (!check_header(header,seq->line)) return -1; // check if the header format is correct
|
91
|
+
// removing the @
|
92
|
+
seq->id = alloc_and_copy(seq->id, seq->line+1);
|
93
|
+
|
94
|
+
}
|
95
|
+
else {
|
96
|
+
if (check_bad_chars(seq->bad_chars,seq->line)) return -1; // check if quality or sequence includes bad characters
|
97
|
+
if (i==1) seq->seq = alloc_and_copy(seq->seq, seq->line);
|
98
|
+
if (i==3) {
|
99
|
+
seq->raw_quality = alloc_and_copy(seq->raw_quality, seq->line);
|
100
|
+
int quality_length = strlen(seq->raw_quality);
|
101
|
+
if(strlen(seq->seq) != strlen(seq->raw_quality)) return -2; // if sequence and quality are of different length the record is truncated
|
102
|
+
int c = 0;
|
103
|
+
seq->quality = initialize_int(seq->quality);
|
104
|
+
seq->quality = malloc(sizeof (int)* quality_length);
|
105
|
+
while(c < quality_length) {
|
106
|
+
seq->quality[c] = *(seq->line + c) - scale_factor; // quality conversion
|
107
|
+
c++;
|
108
|
+
}
|
109
|
+
|
110
|
+
}
|
111
|
+
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
return 1;
|
116
|
+
|
117
|
+
}
|
118
|
+
|
119
|
+
|
120
|
+
#undef _BSIZE
|
data/ext/mkrf_conf.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# create Rakefile for shared library compilation
|
2
|
+
|
3
|
+
require File.join("..",File.dirname(__FILE__),"lib/bio/faster/library")
|
4
|
+
|
5
|
+
path = File.expand_path(File.dirname(__FILE__))
|
6
|
+
ext = Bio::Faster::Library.lib_extension
|
7
|
+
|
8
|
+
flags = ""
|
9
|
+
compile = ""
|
10
|
+
if ext == "so" then
|
11
|
+
flags = "-shared -Wl,-soname,libfaster.so"
|
12
|
+
compile = " -fPIC"
|
13
|
+
elsif ext == "dylib" then
|
14
|
+
flags = "-bundle -undefined dynamic_lookup -flat_namespace"
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
File.open(File.join(path,"Rakefile"),"w") do |rakefile|
|
19
|
+
rakefile.write <<-RAKE
|
20
|
+
require 'rake/clean'
|
21
|
+
|
22
|
+
source = %w(faster.c)
|
23
|
+
|
24
|
+
CLEAN.include('*.o')
|
25
|
+
SRC = FileList.new(source)
|
26
|
+
OBJ_SRC = SRC.ext('o')
|
27
|
+
|
28
|
+
rule '.o' => '.c' do |t|
|
29
|
+
sh "gcc#{compile} -std=c99 -c -g -Wall -O2 "+t.source+" -o "+t.name
|
30
|
+
end
|
31
|
+
|
32
|
+
task :compile_lib => OBJ_SRC do
|
33
|
+
sh "gcc #{flags} -std=c99 "+OBJ_SRC.join(" ")+" -o libfaster.#{ext}"
|
34
|
+
end
|
35
|
+
|
36
|
+
task :default => [:compile_lib, :clean]
|
37
|
+
|
38
|
+
RAKE
|
39
|
+
|
40
|
+
end
|