bio-faster 0.2.2 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.travis.yml +2 -3
  2. data/Gemfile +2 -0
  3. data/Gemfile.lock +2 -0
  4. data/LICENSE.txt +1 -1
  5. data/README.md +2 -3
  6. data/Rakefile +9 -15
  7. data/VERSION +1 -1
  8. data/bio-faster.gemspec +56 -12
  9. data/ext/faster.c +115 -52
  10. data/ext/mkrf_conf.rb +40 -0
  11. data/lib/bio-faster.rb +5 -4
  12. data/lib/bio/faster.rb +57 -0
  13. data/lib/bio/faster/library.rb +26 -0
  14. data/spec/fastq_error_spec.rb +55 -0
  15. data/spec/parser_spec.rb +28 -116
  16. data/test/data/errors/error_header.fastq +20 -0
  17. data/test/data/errors/error_long_qual.fastq +20 -0
  18. data/test/data/errors/error_qual_del.fastq +20 -0
  19. data/test/data/errors/error_qual_escape.fastq +20 -0
  20. data/test/data/errors/error_qual_null.fastq +0 -0
  21. data/test/data/errors/error_qual_space.fastq +20 -0
  22. data/test/data/errors/error_qual_tab.fastq +20 -0
  23. data/test/data/errors/error_qual_unit_sep.fastq +20 -0
  24. data/test/data/errors/error_qual_vtab.fastq +20 -0
  25. data/test/data/errors/error_spaces.fastq +20 -0
  26. data/test/data/errors/error_tabs.fastq +21 -0
  27. data/test/data/errors/error_trunc_at_qual.fastq +19 -0
  28. data/test/data/errors/error_trunc_at_seq.fastq +18 -0
  29. data/test/data/errors/error_trunc_in_qual.fastq +20 -0
  30. data/test/data/errors/error_trunc_in_seq.fastq +18 -0
  31. data/test/data/formats/illumina_full_range_as_illumina.fastq +8 -0
  32. data/test/data/formats/illumina_full_range_as_sanger.fastq +8 -0
  33. data/test/data/formats/illumina_full_range_as_solexa.fastq +8 -0
  34. data/test/data/formats/illumina_full_range_original_illumina.fastq +8 -0
  35. data/test/data/formats/longreads_as_illumina.fastq +40 -0
  36. data/test/data/formats/longreads_as_sanger.fastq +40 -0
  37. data/test/data/formats/longreads_as_solexa.fastq +40 -0
  38. data/test/data/formats/misc_dna_as_illumina.fastq +16 -0
  39. data/test/data/formats/misc_dna_as_sanger.fastq +16 -0
  40. data/test/data/formats/misc_dna_as_solexa.fastq +16 -0
  41. data/test/data/formats/misc_dna_original_sanger.fastq +16 -0
  42. data/test/data/formats/misc_rna_as_illumina.fastq +16 -0
  43. data/test/data/formats/misc_rna_as_sanger.fastq +16 -0
  44. data/test/data/formats/misc_rna_as_solexa.fastq +16 -0
  45. data/test/data/formats/misc_rna_original_sanger.fastq +16 -0
  46. data/test/data/formats/sanger_full_range_as_illumina.fastq +8 -0
  47. data/test/data/formats/sanger_full_range_as_sanger.fastq +8 -0
  48. data/test/data/formats/sanger_full_range_as_solexa.fastq +8 -0
  49. data/test/data/formats/sanger_full_range_original_sanger.fastq +8 -0
  50. data/test/data/formats/solexa_full_range_as_illumina.fastq +8 -0
  51. data/test/data/formats/solexa_full_range_as_sanger.fastq +8 -0
  52. data/test/data/formats/solexa_full_range_as_solexa.fastq +8 -0
  53. data/test/data/formats/solexa_full_range_original_solexa.fastq +8 -0
  54. data/test/data/formats/wrapping_as_illumina.fastq +12 -0
  55. data/test/data/formats/wrapping_as_sanger.fastq +12 -0
  56. data/test/data/formats/wrapping_as_solexa.fastq +12 -0
  57. metadata +88 -24
  58. data/ext/extconf.rb +0 -7
  59. data/ext/kseq.h +0 -223
  60. data/test/data/sample.fasta +0 -10
  61. data/test/data/sample.fastq +0 -24
  62. data/test/data/sample.fastq.gz +0 -0
  63. data/test/data/sff_sample.fastq +0 -16
@@ -3,10 +3,9 @@ rvm:
3
3
  - 1.8.7
4
4
  - 1.9.2
5
5
  - 1.9.3
6
- # - jruby-18mode # JRuby in 1.8 mode
7
- # - jruby-19mode # JRuby in 1.9 mode
6
+ - jruby-18mode # JRuby in 1.8 mode
7
+ - jruby-19mode # JRuby in 1.9 mode
8
8
  # - rbx-18mode
9
9
  # - rbx-19mode
10
10
  # uncomment this line if your project needs to run something other than `rake`:
11
- #script: bundle exec rake ext:build
12
11
  #script: bundle exec rake
data/Gemfile CHANGED
@@ -2,6 +2,7 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
+ gem "ffi"
5
6
 
6
7
  # Add dependencies to develop your gem here.
7
8
  # Include everything needed to run rake, tests, features, etc.
@@ -12,4 +13,5 @@ group :development do
12
13
  gem "rcov", ">= 0"
13
14
  gem "bio", ">= 1.4.2"
14
15
  gem "rspec"
16
+ gem "ffi"
15
17
  end
@@ -3,6 +3,7 @@ GEM
3
3
  specs:
4
4
  bio (1.4.2)
5
5
  diff-lcs (1.1.3)
6
+ ffi (1.0.11)
6
7
  git (1.2.5)
7
8
  jeweler (1.6.4)
8
9
  bundler (~> 1.0)
@@ -28,6 +29,7 @@ PLATFORMS
28
29
  DEPENDENCIES
29
30
  bio (>= 1.4.2)
30
31
  bundler (~> 1.0.0)
32
+ ffi
31
33
  jeweler (~> 1.6.4)
32
34
  rcov
33
35
  rspec
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011 Francesco Strozzi
1
+ Copyright (c) 2012 Francesco Strozzi
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -3,8 +3,7 @@
3
3
  Bio::Faster
4
4
  ==========
5
5
 
6
- Fast and simple parser for FastA / FastQ files, based on Heng Li Kseq library written in C.
7
- http://lh3lh3.users.sourceforge.net/parsefastq.shtml
6
+ Fast and simple parser for FastQ files
8
7
 
9
8
  Examples
10
9
  ========
@@ -28,6 +27,6 @@ Contributing to bio-faster
28
27
  Copyright
29
28
  =========
30
29
 
31
- Copyright (c) 2011 Francesco Strozzi. See LICENSE.txt for
30
+ Copyright (c) 2012 Francesco Strozzi. See LICENSE.txt for
32
31
  further details.
33
32
 
data/Rakefile CHANGED
@@ -17,11 +17,11 @@ Jeweler::Tasks.new do |gem|
17
17
  gem.name = "bio-faster"
18
18
  gem.homepage = "http://github.com/fstrozzi/bioruby-faster"
19
19
  gem.license = "MIT"
20
- gem.summary = %Q{A fast parser for Fasta and FastQ files}
21
- gem.description = %Q{A fast parser for Fasta and FastQ files}
20
+ gem.summary = %Q{A fast parser for FastQ files}
21
+ gem.description = %Q{A fast parser for FastQ files}
22
22
  gem.email = "francesco.strozzi@gmail.com"
23
23
  gem.authors = ["Francesco Strozzi"]
24
- gem.required_ruby_version = '>= 1.9'
24
+ gem.files << "lib/*/**"
25
25
  # dependencies defined in Gemfile
26
26
  end
27
27
  Jeweler::RubygemsDotOrgTasks.new
@@ -58,20 +58,14 @@ Rake::RDocTask.new do |rdoc|
58
58
  end
59
59
 
60
60
  namespace :ext do
61
- desc "Compile extension"
61
+ desc "Build native extension"
62
62
  task :build do
63
- puts "Building extension"
64
- cd File.join(File.dirname(__FILE__),"ext")
65
- sh "ruby "+File.join(File.dirname(__FILE__),"ext","extconf.rb")
66
- sh "make"
67
- FileList["*.log"].each do |file|
68
- rm file
69
- end
70
- FileList["*.o"].each do |file|
71
- rm file
72
- end
73
- cd ".."
63
+ cd "ext"
64
+ ruby "mkrf_conf.rb"
65
+ sh "rake"
66
+ cd ".."
74
67
  end
68
+
75
69
  end
76
70
 
77
71
  task :default => ["ext:build",:spec]
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.4.2
@@ -5,14 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-faster"
8
- s.version = "0.2.2"
8
+ s.version = "0.4.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Francesco Strozzi"]
12
- s.date = "2012-04-02"
13
- s.description = "A fast parser for Fasta and FastQ files"
12
+ s.date = "2012-04-27"
13
+ s.description = "A fast parser for FastQ files"
14
14
  s.email = "francesco.strozzi@gmail.com"
15
- s.extensions = ["ext/extconf.rb"]
15
+ s.extensions = ["ext/mkrf_conf.rb"]
16
16
  s.extra_rdoc_files = [
17
17
  "LICENSE.txt",
18
18
  "README.md"
@@ -27,49 +27,93 @@ Gem::Specification.new do |s|
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "bio-faster.gemspec",
30
- "ext/extconf.rb",
31
30
  "ext/faster.c",
32
- "ext/kseq.h",
31
+ "ext/mkrf_conf.rb",
33
32
  "lib/bio-faster.rb",
33
+ "lib/bio/faster.rb",
34
+ "lib/bio/faster/library.rb",
35
+ "spec/fastq_error_spec.rb",
34
36
  "spec/helper.rb",
35
37
  "spec/parser_spec.rb",
36
- "test/data/sample.fasta",
37
- "test/data/sample.fastq",
38
- "test/data/sample.fastq.gz",
39
- "test/data/sff_sample.fastq"
38
+ "test/data/errors/error_header.fastq",
39
+ "test/data/errors/error_long_qual.fastq",
40
+ "test/data/errors/error_qual_del.fastq",
41
+ "test/data/errors/error_qual_escape.fastq",
42
+ "test/data/errors/error_qual_null.fastq",
43
+ "test/data/errors/error_qual_space.fastq",
44
+ "test/data/errors/error_qual_tab.fastq",
45
+ "test/data/errors/error_qual_unit_sep.fastq",
46
+ "test/data/errors/error_qual_vtab.fastq",
47
+ "test/data/errors/error_spaces.fastq",
48
+ "test/data/errors/error_tabs.fastq",
49
+ "test/data/errors/error_trunc_at_qual.fastq",
50
+ "test/data/errors/error_trunc_at_seq.fastq",
51
+ "test/data/errors/error_trunc_in_qual.fastq",
52
+ "test/data/errors/error_trunc_in_seq.fastq",
53
+ "test/data/formats/illumina_full_range_as_illumina.fastq",
54
+ "test/data/formats/illumina_full_range_as_sanger.fastq",
55
+ "test/data/formats/illumina_full_range_as_solexa.fastq",
56
+ "test/data/formats/illumina_full_range_original_illumina.fastq",
57
+ "test/data/formats/longreads_as_illumina.fastq",
58
+ "test/data/formats/longreads_as_sanger.fastq",
59
+ "test/data/formats/longreads_as_solexa.fastq",
60
+ "test/data/formats/misc_dna_as_illumina.fastq",
61
+ "test/data/formats/misc_dna_as_sanger.fastq",
62
+ "test/data/formats/misc_dna_as_solexa.fastq",
63
+ "test/data/formats/misc_dna_original_sanger.fastq",
64
+ "test/data/formats/misc_rna_as_illumina.fastq",
65
+ "test/data/formats/misc_rna_as_sanger.fastq",
66
+ "test/data/formats/misc_rna_as_solexa.fastq",
67
+ "test/data/formats/misc_rna_original_sanger.fastq",
68
+ "test/data/formats/sanger_full_range_as_illumina.fastq",
69
+ "test/data/formats/sanger_full_range_as_sanger.fastq",
70
+ "test/data/formats/sanger_full_range_as_solexa.fastq",
71
+ "test/data/formats/sanger_full_range_original_sanger.fastq",
72
+ "test/data/formats/solexa_full_range_as_illumina.fastq",
73
+ "test/data/formats/solexa_full_range_as_sanger.fastq",
74
+ "test/data/formats/solexa_full_range_as_solexa.fastq",
75
+ "test/data/formats/solexa_full_range_original_solexa.fastq",
76
+ "test/data/formats/wrapping_as_illumina.fastq",
77
+ "test/data/formats/wrapping_as_sanger.fastq",
78
+ "test/data/formats/wrapping_as_solexa.fastq"
40
79
  ]
41
80
  s.homepage = "http://github.com/fstrozzi/bioruby-faster"
42
81
  s.licenses = ["MIT"]
43
82
  s.require_paths = ["lib"]
44
- s.required_ruby_version = Gem::Requirement.new(">= 1.9")
45
83
  s.rubygems_version = "1.8.15"
46
- s.summary = "A fast parser for Fasta and FastQ files"
84
+ s.summary = "A fast parser for FastQ files"
47
85
 
48
86
  if s.respond_to? :specification_version then
49
87
  s.specification_version = 3
50
88
 
51
89
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
90
+ s.add_runtime_dependency(%q<ffi>, [">= 0"])
52
91
  s.add_development_dependency(%q<shoulda>, [">= 0"])
53
92
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
54
93
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
55
94
  s.add_development_dependency(%q<rcov>, [">= 0"])
56
95
  s.add_development_dependency(%q<bio>, [">= 1.4.2"])
57
96
  s.add_development_dependency(%q<rspec>, [">= 0"])
97
+ s.add_development_dependency(%q<ffi>, [">= 0"])
58
98
  else
99
+ s.add_dependency(%q<ffi>, [">= 0"])
59
100
  s.add_dependency(%q<shoulda>, [">= 0"])
60
101
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
61
102
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
62
103
  s.add_dependency(%q<rcov>, [">= 0"])
63
104
  s.add_dependency(%q<bio>, [">= 1.4.2"])
64
105
  s.add_dependency(%q<rspec>, [">= 0"])
106
+ s.add_dependency(%q<ffi>, [">= 0"])
65
107
  end
66
108
  else
109
+ s.add_dependency(%q<ffi>, [">= 0"])
67
110
  s.add_dependency(%q<shoulda>, [">= 0"])
68
111
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
112
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
70
113
  s.add_dependency(%q<rcov>, [">= 0"])
71
114
  s.add_dependency(%q<bio>, [">= 1.4.2"])
72
115
  s.add_dependency(%q<rspec>, [">= 0"])
116
+ s.add_dependency(%q<ffi>, [">= 0"])
73
117
  end
74
118
  end
75
119
 
@@ -1,57 +1,120 @@
1
+
1
2
  /*
2
- Copyright(C) 2011 Francesco Strozzi <francesco.strozzi@gmail.com>
3
- */
3
+ Copyright(C) 2012 Francesco Strozzi <francesco.strozzi@gmail.com>
4
4
 
5
- #include <zlib.h>
5
+ */
6
6
  #include <stdio.h>
7
- #include "ruby.h"
8
- #include "kseq.h"
9
-
10
- KSEQ_INIT(gzFile, gzread)
11
-
12
- static VALUE method_parse(VALUE self, VALUE file) {
13
-
14
- // check if a block is passed to the method
15
- if (!(rb_block_given_p())) {
16
- rb_raise(rb_eArgError,"You must pass a valid block!");
17
- }
18
-
19
- gzFile fp;
20
- if (!(fp = gzopen(RSTRING_PTR(file), "r"))) {
21
- rb_raise(rb_eArgError,"File %s not found!", RSTRING_PTR(file));
22
- }
23
- else {
24
- kseq_t *seq;
25
- seq = kseq_init(fp);
26
- while (kseq_read(seq) >= 0) {
27
- VALUE arr = rb_ary_new();
28
- rb_ary_push(arr, rb_str_new2(seq->name.s));
29
- if (seq->comment.l) {
30
- rb_ary_push(arr, rb_str_new2(seq->comment.s));
31
- }
32
- else {
33
- rb_ary_push(arr, Qnil);
34
- }
35
- rb_ary_push(arr, rb_str_new2(seq->seq.s));
36
- if (seq->qual.l) {
37
- VALUE rb_quality = rb_ary_new();
38
- int unsigned i = 0;
39
- while(i < seq->qual.l) {
40
- rb_ary_push(rb_quality,INT2FIX(*(seq->qual.s + i) - 33)); // quality conversion (Sanger/Phred only)
41
- i++;
42
- }
43
- rb_ary_push(arr,rb_quality);
44
- }
45
- rb_yield(arr);
46
- }
47
- kseq_destroy(seq);
48
- gzclose(fp);
49
- return Qtrue;
50
- }
7
+ #include <string.h>
8
+ #include <stdlib.h>
9
+
10
+ #define _BSIZE 100000
11
+
12
+ typedef struct {
13
+ char *id;
14
+ char *seq;
15
+ int *quality;
16
+ char *raw_quality;
17
+ char *filename;
18
+ char *line;
19
+ char *bad_chars;
20
+ FILE *stream;
21
+
22
+ }FastQRecord;
23
+
24
+
25
+ static char* alloc_and_copy(char *dst, char *src) {
26
+ if (dst==NULL || strlen(dst)<strlen(src)) {
27
+ if (dst!=NULL)
28
+ free(dst);
29
+ dst= malloc(sizeof (char)*(strlen(src)+1));
30
+ }
31
+ strcpy(dst, src);
32
+ int len;
33
+ len = strlen(dst);
34
+ if (dst[len-1] == '\n') dst[len-1] = '\0';
35
+ return dst;
36
+ }
37
+
38
+
39
+ const char* check_bad_chars(char *invalid_chars, char *string_to_check) {
40
+ return strpbrk(string_to_check, invalid_chars);
41
+ }
42
+
43
+ static char* initialize(char *ptr) {
44
+ if(ptr!=NULL){
45
+ free(ptr);
46
+ ptr = NULL;
47
+ }
48
+ return ptr;
51
49
  }
52
-
53
- void Init_faster() {
54
- VALUE Bio = rb_define_module("Bio");
55
- VALUE Faster = rb_define_module_under(Bio,"Faster"); // it is defined as a sub-module of Bio
56
- rb_define_singleton_method(Faster,"parse",method_parse,1);
50
+
51
+ int* initialize_int(int *ptr) {
52
+ if(ptr!=NULL){
53
+ free(ptr);
54
+ ptr = NULL;
55
+ }
56
+ return ptr;
57
+ }
58
+
59
+ int check_header(char *header, char *firstline) {
60
+ if (*header == *firstline)
61
+ return 1;
62
+ else {
63
+ return 0;
64
+ }
57
65
  }
66
+
67
+ int fastQ_iterator(FastQRecord *seq, int scale_factor) {
68
+ // initialization of structure elements.
69
+ char *header = "@"; // FastQ header
70
+ if (!seq->stream)
71
+ seq->stream = fopen(seq->filename,"r");
72
+ if (!seq->line)
73
+ seq->line = malloc(sizeof (char)* _BSIZE);
74
+ if (!seq->bad_chars)
75
+ seq->bad_chars = " \x1F\x7F\t\v\e";
76
+
77
+ // this is done to wipe out data from previous iteration
78
+ seq->id = initialize(seq->id);
79
+ seq->seq = initialize(seq->seq);
80
+ seq->raw_quality = initialize(seq->raw_quality);
81
+ for (int i = 0; i < 4; i++)
82
+ {
83
+ if (fgets(seq->line, _BSIZE, seq->stream) == NULL) {
84
+ // if either sequence or quality is missing the record is truncated
85
+ if((seq->seq != NULL && seq->raw_quality == NULL) || (seq->raw_quality != NULL && seq->seq == NULL)) return -2;
86
+ else return 0;
87
+ }
88
+ // getting seq ID
89
+ if (i==0) {
90
+ if (!check_header(header,seq->line)) return -1; // check if the header format is correct
91
+ // removing the @
92
+ seq->id = alloc_and_copy(seq->id, seq->line+1);
93
+
94
+ }
95
+ else {
96
+ if (check_bad_chars(seq->bad_chars,seq->line)) return -1; // check if quality or sequence includes bad characters
97
+ if (i==1) seq->seq = alloc_and_copy(seq->seq, seq->line);
98
+ if (i==3) {
99
+ seq->raw_quality = alloc_and_copy(seq->raw_quality, seq->line);
100
+ int quality_length = strlen(seq->raw_quality);
101
+ if(strlen(seq->seq) != strlen(seq->raw_quality)) return -2; // if sequence and quality are of different length the record is truncated
102
+ int c = 0;
103
+ seq->quality = initialize_int(seq->quality);
104
+ seq->quality = malloc(sizeof (int)* quality_length);
105
+ while(c < quality_length) {
106
+ seq->quality[c] = *(seq->line + c) - scale_factor; // quality conversion
107
+ c++;
108
+ }
109
+
110
+ }
111
+
112
+ }
113
+ }
114
+
115
+ return 1;
116
+
117
+ }
118
+
119
+
120
+ #undef _BSIZE
@@ -0,0 +1,40 @@
1
+ # create Rakefile for shared library compilation
2
+
3
+ require File.join("..",File.dirname(__FILE__),"lib/bio/faster/library")
4
+
5
+ path = File.expand_path(File.dirname(__FILE__))
6
+ ext = Bio::Faster::Library.lib_extension
7
+
8
+ flags = ""
9
+ compile = ""
10
+ if ext == "so" then
11
+ flags = "-shared -Wl,-soname,libfaster.so"
12
+ compile = " -fPIC"
13
+ elsif ext == "dylib" then
14
+ flags = "-bundle -undefined dynamic_lookup -flat_namespace"
15
+ end
16
+
17
+
18
+ File.open(File.join(path,"Rakefile"),"w") do |rakefile|
19
+ rakefile.write <<-RAKE
20
+ require 'rake/clean'
21
+
22
+ source = %w(faster.c)
23
+
24
+ CLEAN.include('*.o')
25
+ SRC = FileList.new(source)
26
+ OBJ_SRC = SRC.ext('o')
27
+
28
+ rule '.o' => '.c' do |t|
29
+ sh "gcc#{compile} -std=c99 -c -g -Wall -O2 "+t.source+" -o "+t.name
30
+ end
31
+
32
+ task :compile_lib => OBJ_SRC do
33
+ sh "gcc #{flags} -std=c99 "+OBJ_SRC.join(" ")+" -o libfaster.#{ext}"
34
+ end
35
+
36
+ task :default => [:compile_lib, :clean]
37
+
38
+ RAKE
39
+
40
+ end