bio-faster 0.4.4 → 0.4.5
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bio-faster.gemspec +4 -3
- data/ext/faster.c +10 -10
- data/lib/bio/faster.rb +32 -13
- data/spec/parser_spec.rb +21 -3
- data/test/data/formats/issue_2.fastq +4 -0
- metadata +61 -20
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.4
|
1
|
+
0.4.5
|
data/bio-faster.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-faster"
|
8
|
-
s.version = "0.4.4"
|
8
|
+
s.version = "0.4.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Francesco Strozzi"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-06-13"
|
13
13
|
s.description = "A fast parser for FastQ files"
|
14
14
|
s.email = "francesco.strozzi@gmail.com"
|
15
15
|
s.extensions = ["ext/mkrf_conf.rb"]
|
@@ -54,6 +54,7 @@ Gem::Specification.new do |s|
|
|
54
54
|
"test/data/formats/illumina_full_range_as_sanger.fastq",
|
55
55
|
"test/data/formats/illumina_full_range_as_solexa.fastq",
|
56
56
|
"test/data/formats/illumina_full_range_original_illumina.fastq",
|
57
|
+
"test/data/formats/issue_2.fastq",
|
57
58
|
"test/data/formats/longreads_as_illumina.fastq",
|
58
59
|
"test/data/formats/longreads_as_sanger.fastq",
|
59
60
|
"test/data/formats/longreads_as_solexa.fastq",
|
@@ -81,7 +82,7 @@ Gem::Specification.new do |s|
|
|
81
82
|
s.homepage = "http://github.com/fstrozzi/bioruby-faster"
|
82
83
|
s.licenses = ["MIT"]
|
83
84
|
s.require_paths = ["lib"]
|
84
|
-
s.rubygems_version = "1.8.
|
85
|
+
s.rubygems_version = "1.8.24"
|
85
86
|
s.summary = "A fast parser for FastQ files"
|
86
87
|
|
87
88
|
if s.respond_to? :specification_version then
|
data/ext/faster.c
CHANGED
@@ -95,23 +95,23 @@ int fastQ_iterator(FastQRecord *seq, int scale_factor) {
|
|
95
95
|
if (!check_header(header,seq->line)) return -1; // check if the header format is correct
|
96
96
|
// removing the @
|
97
97
|
seq->id = alloc_and_copy(seq->id, seq->line+1);
|
98
|
-
|
99
98
|
}
|
100
99
|
else {
|
101
|
-
if (check_bad_chars(seq->bad_chars,seq->line)) return -1; // check if quality or sequence includes bad characters
|
100
|
+
if ((i==1 || i==3) && (check_bad_chars(seq->bad_chars,seq->line))) return -1; // check if quality or sequence includes bad characters
|
102
101
|
if (i==1) seq->seq = alloc_and_copy(seq->seq, seq->line);
|
103
102
|
if (i==3) {
|
104
103
|
seq->raw_quality = alloc_and_copy(seq->raw_quality, seq->line);
|
105
104
|
int quality_length = strlen(seq->raw_quality);
|
106
105
|
if(strlen(seq->seq) != strlen(seq->raw_quality)) return -2; // if sequence and quality are of different length the record is truncated
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
106
|
+
if (scale_factor != 0) {
|
107
|
+
int c = 0;
|
108
|
+
seq->quality = initialize_int(seq->quality);
|
109
|
+
seq->quality = malloc(sizeof (int)* quality_length);
|
110
|
+
while(c < quality_length) {
|
111
|
+
seq->quality[c] = *(seq->line + c) - scale_factor; // quality conversion
|
112
|
+
c++;
|
113
|
+
}
|
114
|
+
}
|
115
115
|
}
|
116
116
|
|
117
117
|
}
|
data/lib/bio/faster.rb
CHANGED
@@ -13,9 +13,8 @@ module Bio
|
|
13
13
|
|
14
14
|
attr_accessor :file
|
15
15
|
attr_accessor :encoding
|
16
|
-
def initialize(file
|
16
|
+
def initialize(file)
|
17
17
|
self.file = file
|
18
|
-
self.encoding = encoding
|
19
18
|
end
|
20
19
|
|
21
20
|
class FastQRecord < FFI::Struct
|
@@ -32,23 +31,28 @@ module Bio
|
|
32
31
|
|
33
32
|
attach_function :fastQ_iterator, [FastQRecord, :int], :int
|
34
33
|
|
35
|
-
def each_record
|
34
|
+
def each_record(args = {:quality => :sanger}, &block)
|
36
35
|
if self.file == :stdin
|
37
36
|
self.file = "stdin"
|
38
37
|
elsif !File.exists? self.file
|
39
38
|
raise ArgumentError, "File #{self.file} does not exist"
|
40
39
|
end
|
41
40
|
record = FastQRecord.new
|
42
|
-
scale_factor = nil
|
43
|
-
case self.encoding
|
44
|
-
when :sanger then scale_factor = 33
|
45
|
-
when :solexa then scale_factor = 64
|
46
|
-
end
|
47
41
|
record[:filename] = FFI::MemoryPointer.from_string self.file
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
42
|
+
result = nil
|
43
|
+
case args[:quality]
|
44
|
+
when :sanger
|
45
|
+
scale_factor = 33
|
46
|
+
result = parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
47
|
+
when :solexa
|
48
|
+
scale_factor = 64
|
49
|
+
result = parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
50
|
+
when :raw
|
51
|
+
scale_factor = 0
|
52
|
+
result = parse_fastq(record, scale_factor, &block)
|
53
|
+
end
|
54
|
+
|
55
|
+
case result
|
52
56
|
when -1 then raise RuntimeError, "Bad formatted FastQ file!"
|
53
57
|
when -2 then raise RuntimeError, "Sequence or quality is truncated!"
|
54
58
|
end
|
@@ -56,6 +60,21 @@ module Bio
|
|
56
60
|
|
57
61
|
end
|
58
62
|
|
63
|
+
private
|
64
|
+
|
65
|
+
def parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
66
|
+
while (result = Bio::Faster.fastQ_iterator(record,scale_factor)) == 1
|
67
|
+
yield [record[:id].read_string,record[:seq].read_string,record[:quality].read_array_of_int(record[:raw_quality].read_string.length)]
|
68
|
+
end
|
69
|
+
result
|
70
|
+
end
|
71
|
+
|
72
|
+
def parse_fastq(record, scale_factor, &block)
|
73
|
+
while (result = Bio::Faster.fastQ_iterator(record,scale_factor)) == 1
|
74
|
+
yield [record[:id].read_string,record[:seq].read_string,record[:raw_quality].read_string]
|
75
|
+
end
|
76
|
+
result
|
77
|
+
end
|
59
78
|
|
60
79
|
end
|
61
|
-
end
|
80
|
+
end
|
data/spec/parser_spec.rb
CHANGED
@@ -18,14 +18,14 @@ describe Bio::Faster do
|
|
18
18
|
bioruby_data << [seq.entry_id,seq.seq,seq.qualities]
|
19
19
|
end
|
20
20
|
faster_data = []
|
21
|
-
Bio::Faster.new(file
|
21
|
+
Bio::Faster.new(file).each_record(:quality => :solexa) do |seq|
|
22
22
|
seq[0] = seq[0].split(" ").first
|
23
23
|
faster_data << seq
|
24
24
|
end
|
25
25
|
faster_data.should == bioruby_data
|
26
26
|
end
|
27
27
|
|
28
|
-
it "should read different FastQ formats" do
|
28
|
+
it "should read different FastQ formats and convert quality scores" do
|
29
29
|
files = Dir.glob(TEST_DATA+"/formats/*.fastq")
|
30
30
|
files.each do |file|
|
31
31
|
bioruby_data = []
|
@@ -42,6 +42,24 @@ describe Bio::Faster do
|
|
42
42
|
|
43
43
|
end
|
44
44
|
|
45
|
+
it "should read different FastQ formats without converting quality scores" do
|
46
|
+
|
47
|
+
files = Dir.glob(TEST_DATA+"/formats/*.fastq")
|
48
|
+
files.each do |file|
|
49
|
+
bioruby_data = []
|
50
|
+
Bio::FlatFile.open(Bio::Fastq,File.open(file)).each_entry do |seq|
|
51
|
+
bioruby_data << [seq.entry_id,seq.seq,seq.quality_string]
|
52
|
+
end
|
53
|
+
faster_data = []
|
54
|
+
Bio::Faster.new(file).each_record(:quality => :raw) do |seq|
|
55
|
+
seq[0] = seq[0].split(" ").first
|
56
|
+
faster_data << seq
|
57
|
+
end
|
58
|
+
faster_data.should == bioruby_data
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
45
63
|
|
46
64
|
it "can read from the standard input" do
|
47
65
|
require 'digest/md5'
|
@@ -63,4 +81,4 @@ describe Bio::Faster do
|
|
63
81
|
end
|
64
82
|
|
65
83
|
|
66
|
-
end
|
84
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-faster
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.4
|
4
|
+
version: 0.4.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ffi
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: shoulda
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :development
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: bundler
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ~>
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: 1.0.0
|
44
54
|
type: :development
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.0
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: jeweler
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ~>
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: 1.6.4
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.6.4
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rcov
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :development
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: bio
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: 1.4.2
|
77
102
|
type: :development
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.4.2
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: rspec
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :development
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: ffi
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,7 +133,12 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :development
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
description: A fast parser for FastQ files
|
103
143
|
email: francesco.strozzi@gmail.com
|
104
144
|
executables: []
|
@@ -144,6 +184,7 @@ files:
|
|
144
184
|
- test/data/formats/illumina_full_range_as_sanger.fastq
|
145
185
|
- test/data/formats/illumina_full_range_as_solexa.fastq
|
146
186
|
- test/data/formats/illumina_full_range_original_illumina.fastq
|
187
|
+
- test/data/formats/issue_2.fastq
|
147
188
|
- test/data/formats/longreads_as_illumina.fastq
|
148
189
|
- test/data/formats/longreads_as_sanger.fastq
|
149
190
|
- test/data/formats/longreads_as_solexa.fastq
|
@@ -182,7 +223,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
182
223
|
version: '0'
|
183
224
|
segments:
|
184
225
|
- 0
|
185
|
-
hash: -
|
226
|
+
hash: -1805779141213914087
|
186
227
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
228
|
none: false
|
188
229
|
requirements:
|
@@ -191,7 +232,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
232
|
version: '0'
|
192
233
|
requirements: []
|
193
234
|
rubyforge_project:
|
194
|
-
rubygems_version: 1.8.
|
235
|
+
rubygems_version: 1.8.24
|
195
236
|
signing_key:
|
196
237
|
specification_version: 3
|
197
238
|
summary: A fast parser for FastQ files
|