bio-faster 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bio-faster.gemspec +4 -3
- data/ext/faster.c +10 -10
- data/lib/bio/faster.rb +32 -13
- data/spec/parser_spec.rb +21 -3
- data/test/data/formats/issue_2.fastq +4 -0
- metadata +61 -20
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.4
|
1
|
+
0.4.5
|
data/bio-faster.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "bio-faster"
|
8
|
-
s.version = "0.4.4"
|
8
|
+
s.version = "0.4.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Francesco Strozzi"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-06-13"
|
13
13
|
s.description = "A fast parser for FastQ files"
|
14
14
|
s.email = "francesco.strozzi@gmail.com"
|
15
15
|
s.extensions = ["ext/mkrf_conf.rb"]
|
@@ -54,6 +54,7 @@ Gem::Specification.new do |s|
|
|
54
54
|
"test/data/formats/illumina_full_range_as_sanger.fastq",
|
55
55
|
"test/data/formats/illumina_full_range_as_solexa.fastq",
|
56
56
|
"test/data/formats/illumina_full_range_original_illumina.fastq",
|
57
|
+
"test/data/formats/issue_2.fastq",
|
57
58
|
"test/data/formats/longreads_as_illumina.fastq",
|
58
59
|
"test/data/formats/longreads_as_sanger.fastq",
|
59
60
|
"test/data/formats/longreads_as_solexa.fastq",
|
@@ -81,7 +82,7 @@ Gem::Specification.new do |s|
|
|
81
82
|
s.homepage = "http://github.com/fstrozzi/bioruby-faster"
|
82
83
|
s.licenses = ["MIT"]
|
83
84
|
s.require_paths = ["lib"]
|
84
|
-
s.rubygems_version = "1.8.
|
85
|
+
s.rubygems_version = "1.8.24"
|
85
86
|
s.summary = "A fast parser for FastQ files"
|
86
87
|
|
87
88
|
if s.respond_to? :specification_version then
|
data/ext/faster.c
CHANGED
@@ -95,23 +95,23 @@ int fastQ_iterator(FastQRecord *seq, int scale_factor) {
|
|
95
95
|
if (!check_header(header,seq->line)) return -1; // check if the header format is correct
|
96
96
|
// removing the @
|
97
97
|
seq->id = alloc_and_copy(seq->id, seq->line+1);
|
98
|
-
|
99
98
|
}
|
100
99
|
else {
|
101
|
-
if (check_bad_chars(seq->bad_chars,seq->line)) return -1; // check if quality or sequence includes bad characters
|
100
|
+
if ((i==1 || i==3) && (check_bad_chars(seq->bad_chars,seq->line))) return -1; // check if quality or sequence includes bad characters
|
102
101
|
if (i==1) seq->seq = alloc_and_copy(seq->seq, seq->line);
|
103
102
|
if (i==3) {
|
104
103
|
seq->raw_quality = alloc_and_copy(seq->raw_quality, seq->line);
|
105
104
|
int quality_length = strlen(seq->raw_quality);
|
106
105
|
if(strlen(seq->seq) != strlen(seq->raw_quality)) return -2; // if sequence and quality are of different length the record is truncated
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
106
|
+
if (scale_factor != 0) {
|
107
|
+
int c = 0;
|
108
|
+
seq->quality = initialize_int(seq->quality);
|
109
|
+
seq->quality = malloc(sizeof (int)* quality_length);
|
110
|
+
while(c < quality_length) {
|
111
|
+
seq->quality[c] = *(seq->line + c) - scale_factor; // quality conversion
|
112
|
+
c++;
|
113
|
+
}
|
114
|
+
}
|
115
115
|
}
|
116
116
|
|
117
117
|
}
|
data/lib/bio/faster.rb
CHANGED
@@ -13,9 +13,8 @@ module Bio
|
|
13
13
|
|
14
14
|
attr_accessor :file
|
15
15
|
attr_accessor :encoding
|
16
|
-
def initialize(file
|
16
|
+
def initialize(file)
|
17
17
|
self.file = file
|
18
|
-
self.encoding = encoding
|
19
18
|
end
|
20
19
|
|
21
20
|
class FastQRecord < FFI::Struct
|
@@ -32,23 +31,28 @@ module Bio
|
|
32
31
|
|
33
32
|
attach_function :fastQ_iterator, [FastQRecord, :int], :int
|
34
33
|
|
35
|
-
def each_record
|
34
|
+
def each_record(args = {:quality => :sanger}, &block)
|
36
35
|
if self.file == :stdin
|
37
36
|
self.file = "stdin"
|
38
37
|
elsif !File.exists? self.file
|
39
38
|
raise ArgumentError, "File #{self.file} does not exist"
|
40
39
|
end
|
41
40
|
record = FastQRecord.new
|
42
|
-
scale_factor = nil
|
43
|
-
case self.encoding
|
44
|
-
when :sanger then scale_factor = 33
|
45
|
-
when :solexa then scale_factor = 64
|
46
|
-
end
|
47
41
|
record[:filename] = FFI::MemoryPointer.from_string self.file
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
42
|
+
result = nil
|
43
|
+
case args[:quality]
|
44
|
+
when :sanger
|
45
|
+
scale_factor = 33
|
46
|
+
result = parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
47
|
+
when :solexa
|
48
|
+
scale_factor = 64
|
49
|
+
result = parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
50
|
+
when :raw
|
51
|
+
scale_factor = 0
|
52
|
+
result = parse_fastq(record, scale_factor, &block)
|
53
|
+
end
|
54
|
+
|
55
|
+
case result
|
52
56
|
when -1 then raise RuntimeError, "Bad formatted FastQ file!"
|
53
57
|
when -2 then raise RuntimeError, "Sequence or quality is truncated!"
|
54
58
|
end
|
@@ -56,6 +60,21 @@ module Bio
|
|
56
60
|
|
57
61
|
end
|
58
62
|
|
63
|
+
private
|
64
|
+
|
65
|
+
def parse_fastq_with_quality_conversion(record, scale_factor, &block)
|
66
|
+
while (result = Bio::Faster.fastQ_iterator(record,scale_factor)) == 1
|
67
|
+
yield [record[:id].read_string,record[:seq].read_string,record[:quality].read_array_of_int(record[:raw_quality].read_string.length)]
|
68
|
+
end
|
69
|
+
result
|
70
|
+
end
|
71
|
+
|
72
|
+
def parse_fastq(record, scale_factor, &block)
|
73
|
+
while (result = Bio::Faster.fastQ_iterator(record,scale_factor)) == 1
|
74
|
+
yield [record[:id].read_string,record[:seq].read_string,record[:raw_quality].read_string]
|
75
|
+
end
|
76
|
+
result
|
77
|
+
end
|
59
78
|
|
60
79
|
end
|
61
|
-
end
|
80
|
+
end
|
data/spec/parser_spec.rb
CHANGED
@@ -18,14 +18,14 @@ describe Bio::Faster do
|
|
18
18
|
bioruby_data << [seq.entry_id,seq.seq,seq.qualities]
|
19
19
|
end
|
20
20
|
faster_data = []
|
21
|
-
Bio::Faster.new(file
|
21
|
+
Bio::Faster.new(file).each_record(:quality => :solexa) do |seq|
|
22
22
|
seq[0] = seq[0].split(" ").first
|
23
23
|
faster_data << seq
|
24
24
|
end
|
25
25
|
faster_data.should == bioruby_data
|
26
26
|
end
|
27
27
|
|
28
|
-
it "should read different FastQ formats" do
|
28
|
+
it "should read different FastQ formats and convert quality scores" do
|
29
29
|
files = Dir.glob(TEST_DATA+"/formats/*.fastq")
|
30
30
|
files.each do |file|
|
31
31
|
bioruby_data = []
|
@@ -42,6 +42,24 @@ describe Bio::Faster do
|
|
42
42
|
|
43
43
|
end
|
44
44
|
|
45
|
+
it "should read different FastQ formats without converting quality scores" do
|
46
|
+
|
47
|
+
files = Dir.glob(TEST_DATA+"/formats/*.fastq")
|
48
|
+
files.each do |file|
|
49
|
+
bioruby_data = []
|
50
|
+
Bio::FlatFile.open(Bio::Fastq,File.open(file)).each_entry do |seq|
|
51
|
+
bioruby_data << [seq.entry_id,seq.seq,seq.quality_string]
|
52
|
+
end
|
53
|
+
faster_data = []
|
54
|
+
Bio::Faster.new(file).each_record(:quality => :raw) do |seq|
|
55
|
+
seq[0] = seq[0].split(" ").first
|
56
|
+
faster_data << seq
|
57
|
+
end
|
58
|
+
faster_data.should == bioruby_data
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
45
63
|
|
46
64
|
it "can read from the standard input" do
|
47
65
|
require 'digest/md5'
|
@@ -63,4 +81,4 @@ describe Bio::Faster do
|
|
63
81
|
end
|
64
82
|
|
65
83
|
|
66
|
-
end
|
84
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-faster
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.4
|
4
|
+
version: 0.4.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ffi
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: shoulda
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :development
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: bundler
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ~>
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: 1.0.0
|
44
54
|
type: :development
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.0
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: jeweler
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ~>
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: 1.6.4
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.6.4
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rcov
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :development
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: bio
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: 1.4.2
|
77
102
|
type: :development
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.4.2
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: rspec
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :development
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: ffi
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,7 +133,12 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :development
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
description: A fast parser for FastQ files
|
103
143
|
email: francesco.strozzi@gmail.com
|
104
144
|
executables: []
|
@@ -144,6 +184,7 @@ files:
|
|
144
184
|
- test/data/formats/illumina_full_range_as_sanger.fastq
|
145
185
|
- test/data/formats/illumina_full_range_as_solexa.fastq
|
146
186
|
- test/data/formats/illumina_full_range_original_illumina.fastq
|
187
|
+
- test/data/formats/issue_2.fastq
|
147
188
|
- test/data/formats/longreads_as_illumina.fastq
|
148
189
|
- test/data/formats/longreads_as_sanger.fastq
|
149
190
|
- test/data/formats/longreads_as_solexa.fastq
|
@@ -182,7 +223,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
182
223
|
version: '0'
|
183
224
|
segments:
|
184
225
|
- 0
|
185
|
-
hash: -
|
226
|
+
hash: -1805779141213914087
|
186
227
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
228
|
none: false
|
188
229
|
requirements:
|
@@ -191,7 +232,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
232
|
version: '0'
|
192
233
|
requirements: []
|
193
234
|
rubyforge_project:
|
194
|
-
rubygems_version: 1.8.
|
235
|
+
rubygems_version: 1.8.24
|
195
236
|
signing_key:
|
196
237
|
specification_version: 3
|
197
238
|
summary: A fast parser for FastQ files
|