bio-faster 0.2.2 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/.travis.yml +2 -3
  2. data/Gemfile +2 -0
  3. data/Gemfile.lock +2 -0
  4. data/LICENSE.txt +1 -1
  5. data/README.md +2 -3
  6. data/Rakefile +9 -15
  7. data/VERSION +1 -1
  8. data/bio-faster.gemspec +56 -12
  9. data/ext/faster.c +115 -52
  10. data/ext/mkrf_conf.rb +40 -0
  11. data/lib/bio-faster.rb +5 -4
  12. data/lib/bio/faster.rb +57 -0
  13. data/lib/bio/faster/library.rb +26 -0
  14. data/spec/fastq_error_spec.rb +55 -0
  15. data/spec/parser_spec.rb +28 -116
  16. data/test/data/errors/error_header.fastq +20 -0
  17. data/test/data/errors/error_long_qual.fastq +20 -0
  18. data/test/data/errors/error_qual_del.fastq +20 -0
  19. data/test/data/errors/error_qual_escape.fastq +20 -0
  20. data/test/data/errors/error_qual_null.fastq +0 -0
  21. data/test/data/errors/error_qual_space.fastq +20 -0
  22. data/test/data/errors/error_qual_tab.fastq +20 -0
  23. data/test/data/errors/error_qual_unit_sep.fastq +20 -0
  24. data/test/data/errors/error_qual_vtab.fastq +20 -0
  25. data/test/data/errors/error_spaces.fastq +20 -0
  26. data/test/data/errors/error_tabs.fastq +21 -0
  27. data/test/data/errors/error_trunc_at_qual.fastq +19 -0
  28. data/test/data/errors/error_trunc_at_seq.fastq +18 -0
  29. data/test/data/errors/error_trunc_in_qual.fastq +20 -0
  30. data/test/data/errors/error_trunc_in_seq.fastq +18 -0
  31. data/test/data/formats/illumina_full_range_as_illumina.fastq +8 -0
  32. data/test/data/formats/illumina_full_range_as_sanger.fastq +8 -0
  33. data/test/data/formats/illumina_full_range_as_solexa.fastq +8 -0
  34. data/test/data/formats/illumina_full_range_original_illumina.fastq +8 -0
  35. data/test/data/formats/longreads_as_illumina.fastq +40 -0
  36. data/test/data/formats/longreads_as_sanger.fastq +40 -0
  37. data/test/data/formats/longreads_as_solexa.fastq +40 -0
  38. data/test/data/formats/misc_dna_as_illumina.fastq +16 -0
  39. data/test/data/formats/misc_dna_as_sanger.fastq +16 -0
  40. data/test/data/formats/misc_dna_as_solexa.fastq +16 -0
  41. data/test/data/formats/misc_dna_original_sanger.fastq +16 -0
  42. data/test/data/formats/misc_rna_as_illumina.fastq +16 -0
  43. data/test/data/formats/misc_rna_as_sanger.fastq +16 -0
  44. data/test/data/formats/misc_rna_as_solexa.fastq +16 -0
  45. data/test/data/formats/misc_rna_original_sanger.fastq +16 -0
  46. data/test/data/formats/sanger_full_range_as_illumina.fastq +8 -0
  47. data/test/data/formats/sanger_full_range_as_sanger.fastq +8 -0
  48. data/test/data/formats/sanger_full_range_as_solexa.fastq +8 -0
  49. data/test/data/formats/sanger_full_range_original_sanger.fastq +8 -0
  50. data/test/data/formats/solexa_full_range_as_illumina.fastq +8 -0
  51. data/test/data/formats/solexa_full_range_as_sanger.fastq +8 -0
  52. data/test/data/formats/solexa_full_range_as_solexa.fastq +8 -0
  53. data/test/data/formats/solexa_full_range_original_solexa.fastq +8 -0
  54. data/test/data/formats/wrapping_as_illumina.fastq +12 -0
  55. data/test/data/formats/wrapping_as_sanger.fastq +12 -0
  56. data/test/data/formats/wrapping_as_solexa.fastq +12 -0
  57. metadata +88 -24
  58. data/ext/extconf.rb +0 -7
  59. data/ext/kseq.h +0 -223
  60. data/test/data/sample.fasta +0 -10
  61. data/test/data/sample.fastq +0 -24
  62. data/test/data/sample.fastq.gz +0 -0
  63. data/test/data/sff_sample.fastq +0 -16
data/.travis.yml CHANGED
@@ -3,10 +3,9 @@ rvm:
3
3
  - 1.8.7
4
4
  - 1.9.2
5
5
  - 1.9.3
6
- # - jruby-18mode # JRuby in 1.8 mode
7
- # - jruby-19mode # JRuby in 1.9 mode
6
+ - jruby-18mode # JRuby in 1.8 mode
7
+ - jruby-19mode # JRuby in 1.9 mode
8
8
  # - rbx-18mode
9
9
  # - rbx-19mode
10
10
  # uncomment this line if your project needs to run something other than `rake`:
11
- #script: bundle exec rake ext:build
12
11
  #script: bundle exec rake
data/Gemfile CHANGED
@@ -2,6 +2,7 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
+ gem "ffi"
5
6
 
6
7
  # Add dependencies to develop your gem here.
7
8
  # Include everything needed to run rake, tests, features, etc.
@@ -12,4 +13,5 @@ group :development do
12
13
  gem "rcov", ">= 0"
13
14
  gem "bio", ">= 1.4.2"
14
15
  gem "rspec"
16
+ gem "ffi"
15
17
  end
data/Gemfile.lock CHANGED
@@ -3,6 +3,7 @@ GEM
3
3
  specs:
4
4
  bio (1.4.2)
5
5
  diff-lcs (1.1.3)
6
+ ffi (1.0.11)
6
7
  git (1.2.5)
7
8
  jeweler (1.6.4)
8
9
  bundler (~> 1.0)
@@ -28,6 +29,7 @@ PLATFORMS
28
29
  DEPENDENCIES
29
30
  bio (>= 1.4.2)
30
31
  bundler (~> 1.0.0)
32
+ ffi
31
33
  jeweler (~> 1.6.4)
32
34
  rcov
33
35
  rspec
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011 Francesco Strozzi
1
+ Copyright (c) 2012 Francesco Strozzi
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -3,8 +3,7 @@
3
3
  Bio::Faster
4
4
  ==========
5
5
 
6
- Fast and simple parser for FastA / FastQ files, based on Heng Li Kseq library written in C.
7
- http://lh3lh3.users.sourceforge.net/parsefastq.shtml
6
+ Fast and simple parser for FastQ files
8
7
 
9
8
  Examples
10
9
  ========
@@ -28,6 +27,6 @@ Contributing to bio-faster
28
27
  Copyright
29
28
  =========
30
29
 
31
- Copyright (c) 2011 Francesco Strozzi. See LICENSE.txt for
30
+ Copyright (c) 2012 Francesco Strozzi. See LICENSE.txt for
32
31
  further details.
33
32
 
data/Rakefile CHANGED
@@ -17,11 +17,11 @@ Jeweler::Tasks.new do |gem|
17
17
  gem.name = "bio-faster"
18
18
  gem.homepage = "http://github.com/fstrozzi/bioruby-faster"
19
19
  gem.license = "MIT"
20
- gem.summary = %Q{A fast parser for Fasta and FastQ files}
21
- gem.description = %Q{A fast parser for Fasta and FastQ files}
20
+ gem.summary = %Q{A fast parser for FastQ files}
21
+ gem.description = %Q{A fast parser for FastQ files}
22
22
  gem.email = "francesco.strozzi@gmail.com"
23
23
  gem.authors = ["Francesco Strozzi"]
24
- gem.required_ruby_version = '>= 1.9'
24
+ gem.files << "lib/*/**"
25
25
  # dependencies defined in Gemfile
26
26
  end
27
27
  Jeweler::RubygemsDotOrgTasks.new
@@ -58,20 +58,14 @@ Rake::RDocTask.new do |rdoc|
58
58
  end
59
59
 
60
60
  namespace :ext do
61
- desc "Compile extension"
61
+ desc "Build native extension"
62
62
  task :build do
63
- puts "Building extension"
64
- cd File.join(File.dirname(__FILE__),"ext")
65
- sh "ruby "+File.join(File.dirname(__FILE__),"ext","extconf.rb")
66
- sh "make"
67
- FileList["*.log"].each do |file|
68
- rm file
69
- end
70
- FileList["*.o"].each do |file|
71
- rm file
72
- end
73
- cd ".."
63
+ cd "ext"
64
+ ruby "mkrf_conf.rb"
65
+ sh "rake"
66
+ cd ".."
74
67
  end
68
+
75
69
  end
76
70
 
77
71
  task :default => ["ext:build",:spec]
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.4.2
data/bio-faster.gemspec CHANGED
@@ -5,14 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bio-faster"
8
- s.version = "0.2.2"
8
+ s.version = "0.4.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Francesco Strozzi"]
12
- s.date = "2012-04-02"
13
- s.description = "A fast parser for Fasta and FastQ files"
12
+ s.date = "2012-04-27"
13
+ s.description = "A fast parser for FastQ files"
14
14
  s.email = "francesco.strozzi@gmail.com"
15
- s.extensions = ["ext/extconf.rb"]
15
+ s.extensions = ["ext/mkrf_conf.rb"]
16
16
  s.extra_rdoc_files = [
17
17
  "LICENSE.txt",
18
18
  "README.md"
@@ -27,49 +27,93 @@ Gem::Specification.new do |s|
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "bio-faster.gemspec",
30
- "ext/extconf.rb",
31
30
  "ext/faster.c",
32
- "ext/kseq.h",
31
+ "ext/mkrf_conf.rb",
33
32
  "lib/bio-faster.rb",
33
+ "lib/bio/faster.rb",
34
+ "lib/bio/faster/library.rb",
35
+ "spec/fastq_error_spec.rb",
34
36
  "spec/helper.rb",
35
37
  "spec/parser_spec.rb",
36
- "test/data/sample.fasta",
37
- "test/data/sample.fastq",
38
- "test/data/sample.fastq.gz",
39
- "test/data/sff_sample.fastq"
38
+ "test/data/errors/error_header.fastq",
39
+ "test/data/errors/error_long_qual.fastq",
40
+ "test/data/errors/error_qual_del.fastq",
41
+ "test/data/errors/error_qual_escape.fastq",
42
+ "test/data/errors/error_qual_null.fastq",
43
+ "test/data/errors/error_qual_space.fastq",
44
+ "test/data/errors/error_qual_tab.fastq",
45
+ "test/data/errors/error_qual_unit_sep.fastq",
46
+ "test/data/errors/error_qual_vtab.fastq",
47
+ "test/data/errors/error_spaces.fastq",
48
+ "test/data/errors/error_tabs.fastq",
49
+ "test/data/errors/error_trunc_at_qual.fastq",
50
+ "test/data/errors/error_trunc_at_seq.fastq",
51
+ "test/data/errors/error_trunc_in_qual.fastq",
52
+ "test/data/errors/error_trunc_in_seq.fastq",
53
+ "test/data/formats/illumina_full_range_as_illumina.fastq",
54
+ "test/data/formats/illumina_full_range_as_sanger.fastq",
55
+ "test/data/formats/illumina_full_range_as_solexa.fastq",
56
+ "test/data/formats/illumina_full_range_original_illumina.fastq",
57
+ "test/data/formats/longreads_as_illumina.fastq",
58
+ "test/data/formats/longreads_as_sanger.fastq",
59
+ "test/data/formats/longreads_as_solexa.fastq",
60
+ "test/data/formats/misc_dna_as_illumina.fastq",
61
+ "test/data/formats/misc_dna_as_sanger.fastq",
62
+ "test/data/formats/misc_dna_as_solexa.fastq",
63
+ "test/data/formats/misc_dna_original_sanger.fastq",
64
+ "test/data/formats/misc_rna_as_illumina.fastq",
65
+ "test/data/formats/misc_rna_as_sanger.fastq",
66
+ "test/data/formats/misc_rna_as_solexa.fastq",
67
+ "test/data/formats/misc_rna_original_sanger.fastq",
68
+ "test/data/formats/sanger_full_range_as_illumina.fastq",
69
+ "test/data/formats/sanger_full_range_as_sanger.fastq",
70
+ "test/data/formats/sanger_full_range_as_solexa.fastq",
71
+ "test/data/formats/sanger_full_range_original_sanger.fastq",
72
+ "test/data/formats/solexa_full_range_as_illumina.fastq",
73
+ "test/data/formats/solexa_full_range_as_sanger.fastq",
74
+ "test/data/formats/solexa_full_range_as_solexa.fastq",
75
+ "test/data/formats/solexa_full_range_original_solexa.fastq",
76
+ "test/data/formats/wrapping_as_illumina.fastq",
77
+ "test/data/formats/wrapping_as_sanger.fastq",
78
+ "test/data/formats/wrapping_as_solexa.fastq"
40
79
  ]
41
80
  s.homepage = "http://github.com/fstrozzi/bioruby-faster"
42
81
  s.licenses = ["MIT"]
43
82
  s.require_paths = ["lib"]
44
- s.required_ruby_version = Gem::Requirement.new(">= 1.9")
45
83
  s.rubygems_version = "1.8.15"
46
- s.summary = "A fast parser for Fasta and FastQ files"
84
+ s.summary = "A fast parser for FastQ files"
47
85
 
48
86
  if s.respond_to? :specification_version then
49
87
  s.specification_version = 3
50
88
 
51
89
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
90
+ s.add_runtime_dependency(%q<ffi>, [">= 0"])
52
91
  s.add_development_dependency(%q<shoulda>, [">= 0"])
53
92
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
54
93
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
55
94
  s.add_development_dependency(%q<rcov>, [">= 0"])
56
95
  s.add_development_dependency(%q<bio>, [">= 1.4.2"])
57
96
  s.add_development_dependency(%q<rspec>, [">= 0"])
97
+ s.add_development_dependency(%q<ffi>, [">= 0"])
58
98
  else
99
+ s.add_dependency(%q<ffi>, [">= 0"])
59
100
  s.add_dependency(%q<shoulda>, [">= 0"])
60
101
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
61
102
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
62
103
  s.add_dependency(%q<rcov>, [">= 0"])
63
104
  s.add_dependency(%q<bio>, [">= 1.4.2"])
64
105
  s.add_dependency(%q<rspec>, [">= 0"])
106
+ s.add_dependency(%q<ffi>, [">= 0"])
65
107
  end
66
108
  else
109
+ s.add_dependency(%q<ffi>, [">= 0"])
67
110
  s.add_dependency(%q<shoulda>, [">= 0"])
68
111
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
112
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
70
113
  s.add_dependency(%q<rcov>, [">= 0"])
71
114
  s.add_dependency(%q<bio>, [">= 1.4.2"])
72
115
  s.add_dependency(%q<rspec>, [">= 0"])
116
+ s.add_dependency(%q<ffi>, [">= 0"])
73
117
  end
74
118
  end
75
119
 
data/ext/faster.c CHANGED
@@ -1,57 +1,120 @@
1
+
1
2
  /*
2
- Copyright(C) 2011 Francesco Strozzi <francesco.strozzi@gmail.com>
3
- */
3
+ Copyright(C) 2012 Francesco Strozzi <francesco.strozzi@gmail.com>
4
4
 
5
- #include <zlib.h>
5
+ */
6
6
  #include <stdio.h>
7
- #include "ruby.h"
8
- #include "kseq.h"
9
-
10
- KSEQ_INIT(gzFile, gzread)
11
-
12
- static VALUE method_parse(VALUE self, VALUE file) {
13
-
14
- // check if a block is passed to the method
15
- if (!(rb_block_given_p())) {
16
- rb_raise(rb_eArgError,"You must pass a valid block!");
17
- }
18
-
19
- gzFile fp;
20
- if (!(fp = gzopen(RSTRING_PTR(file), "r"))) {
21
- rb_raise(rb_eArgError,"File %s not found!", RSTRING_PTR(file));
22
- }
23
- else {
24
- kseq_t *seq;
25
- seq = kseq_init(fp);
26
- while (kseq_read(seq) >= 0) {
27
- VALUE arr = rb_ary_new();
28
- rb_ary_push(arr, rb_str_new2(seq->name.s));
29
- if (seq->comment.l) {
30
- rb_ary_push(arr, rb_str_new2(seq->comment.s));
31
- }
32
- else {
33
- rb_ary_push(arr, Qnil);
34
- }
35
- rb_ary_push(arr, rb_str_new2(seq->seq.s));
36
- if (seq->qual.l) {
37
- VALUE rb_quality = rb_ary_new();
38
- int unsigned i = 0;
39
- while(i < seq->qual.l) {
40
- rb_ary_push(rb_quality,INT2FIX(*(seq->qual.s + i) - 33)); // quality conversion (Sanger/Phred only)
41
- i++;
42
- }
43
- rb_ary_push(arr,rb_quality);
44
- }
45
- rb_yield(arr);
46
- }
47
- kseq_destroy(seq);
48
- gzclose(fp);
49
- return Qtrue;
50
- }
7
+ #include <string.h>
8
+ #include <stdlib.h>
9
+
10
+ #define _BSIZE 100000
11
+
12
+ typedef struct {
13
+ char *id;
14
+ char *seq;
15
+ int *quality;
16
+ char *raw_quality;
17
+ char *filename;
18
+ char *line;
19
+ char *bad_chars;
20
+ FILE *stream;
21
+
22
+ }FastQRecord;
23
+
24
+
25
+ static char* alloc_and_copy(char *dst, char *src) {
26
+ if (dst==NULL || strlen(dst)<strlen(src)) {
27
+ if (dst!=NULL)
28
+ free(dst);
29
+ dst= malloc(sizeof (char)*(strlen(src)+1));
30
+ }
31
+ strcpy(dst, src);
32
+ int len;
33
+ len = strlen(dst);
34
+ if (dst[len-1] == '\n') dst[len-1] = '\0';
35
+ return dst;
36
+ }
37
+
38
+
39
+ const char* check_bad_chars(char *invalid_chars, char *string_to_check) {
40
+ return strpbrk(string_to_check, invalid_chars);
41
+ }
42
+
43
+ static char* initialize(char *ptr) {
44
+ if(ptr!=NULL){
45
+ free(ptr);
46
+ ptr = NULL;
47
+ }
48
+ return ptr;
51
49
  }
52
-
53
- void Init_faster() {
54
- VALUE Bio = rb_define_module("Bio");
55
- VALUE Faster = rb_define_module_under(Bio,"Faster"); // it is defined as a sub-module of Bio
56
- rb_define_singleton_method(Faster,"parse",method_parse,1);
50
+
51
+ int* initialize_int(int *ptr) {
52
+ if(ptr!=NULL){
53
+ free(ptr);
54
+ ptr = NULL;
55
+ }
56
+ return ptr;
57
+ }
58
+
59
+ int check_header(char *header, char *firstline) {
60
+ if (*header == *firstline)
61
+ return 1;
62
+ else {
63
+ return 0;
64
+ }
57
65
  }
66
+
67
+ int fastQ_iterator(FastQRecord *seq, int scale_factor) {
68
+ // initialization of structure elements.
69
+ char *header = "@"; // FastQ header
70
+ if (!seq->stream)
71
+ seq->stream = fopen(seq->filename,"r");
72
+ if (!seq->line)
73
+ seq->line = malloc(sizeof (char)* _BSIZE);
74
+ if (!seq->bad_chars)
75
+ seq->bad_chars = " \x1F\x7F\t\v\e";
76
+
77
+ // this is done to wipe out data from previous iteration
78
+ seq->id = initialize(seq->id);
79
+ seq->seq = initialize(seq->seq);
80
+ seq->raw_quality = initialize(seq->raw_quality);
81
+ for (int i = 0; i < 4; i++)
82
+ {
83
+ if (fgets(seq->line, _BSIZE, seq->stream) == NULL) {
84
+ // if either sequence or quality is missing the record is truncated
85
+ if((seq->seq != NULL && seq->raw_quality == NULL) || (seq->raw_quality != NULL && seq->seq == NULL)) return -2;
86
+ else return 0;
87
+ }
88
+ // getting seq ID
89
+ if (i==0) {
90
+ if (!check_header(header,seq->line)) return -1; // check if the header format is correct
91
+ // removing the @
92
+ seq->id = alloc_and_copy(seq->id, seq->line+1);
93
+
94
+ }
95
+ else {
96
+ if (check_bad_chars(seq->bad_chars,seq->line)) return -1; // check if quality or sequence includes bad characters
97
+ if (i==1) seq->seq = alloc_and_copy(seq->seq, seq->line);
98
+ if (i==3) {
99
+ seq->raw_quality = alloc_and_copy(seq->raw_quality, seq->line);
100
+ int quality_length = strlen(seq->raw_quality);
101
+ if(strlen(seq->seq) != strlen(seq->raw_quality)) return -2; // if sequence and quality are of different length the record is truncated
102
+ int c = 0;
103
+ seq->quality = initialize_int(seq->quality);
104
+ seq->quality = malloc(sizeof (int)* quality_length);
105
+ while(c < quality_length) {
106
+ seq->quality[c] = *(seq->line + c) - scale_factor; // quality conversion
107
+ c++;
108
+ }
109
+
110
+ }
111
+
112
+ }
113
+ }
114
+
115
+ return 1;
116
+
117
+ }
118
+
119
+
120
+ #undef _BSIZE
data/ext/mkrf_conf.rb ADDED
@@ -0,0 +1,40 @@
1
+ # create Rakefile for shared library compilation
2
+
3
+ require File.join("..",File.dirname(__FILE__),"lib/bio/faster/library")
4
+
5
+ path = File.expand_path(File.dirname(__FILE__))
6
+ ext = Bio::Faster::Library.lib_extension
7
+
8
+ flags = ""
9
+ compile = ""
10
+ if ext == "so" then
11
+ flags = "-shared -Wl,-soname,libfaster.so"
12
+ compile = " -fPIC"
13
+ elsif ext == "dylib" then
14
+ flags = "-bundle -undefined dynamic_lookup -flat_namespace"
15
+ end
16
+
17
+
18
+ File.open(File.join(path,"Rakefile"),"w") do |rakefile|
19
+ rakefile.write <<-RAKE
20
+ require 'rake/clean'
21
+
22
+ source = %w(faster.c)
23
+
24
+ CLEAN.include('*.o')
25
+ SRC = FileList.new(source)
26
+ OBJ_SRC = SRC.ext('o')
27
+
28
+ rule '.o' => '.c' do |t|
29
+ sh "gcc#{compile} -std=c99 -c -g -Wall -O2 "+t.source+" -o "+t.name
30
+ end
31
+
32
+ task :compile_lib => OBJ_SRC do
33
+ sh "gcc #{flags} -std=c99 "+OBJ_SRC.join(" ")+" -o libfaster.#{ext}"
34
+ end
35
+
36
+ task :default => [:compile_lib, :clean]
37
+
38
+ RAKE
39
+
40
+ end