bio-fastqc 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/bio/fastqc/converter.rb +23 -23
- data/lib/bio/fastqc/io.rb +6 -6
- data/lib/bio/fastqc/parser.rb +154 -170
- data/lib/bio/fastqc/semantics.rb +57 -42
- data/spec/bio-fastqc_spec.rb +117 -9
- data/spec/example_fastqc_454.zip +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1a0b30382d2c41b0fc5327cc1e18ce63b7f190e
|
4
|
+
data.tar.gz: 479c6f71276f0360f15cc3286bafe8f12b7404d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56c3e7b739a99e6ee39ffc958a8f59f61438ebfd44a7dd3ae9dcb1ea0ea481fb40c927673f6f8ae052e4422cfb4b8e38b6da542ace153b24d7e4f439284f68b3
|
7
|
+
data.tar.gz: 41ef0bc02eb028d9f4de4661cd4cc67159159d1e17ff4f5c06d8779ee3b0ddaf1b348f6aea1d06222873443820aba50133cb6732cfe2a8f875e87c6f12f2573f
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
data/lib/bio/fastqc/converter.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
module Bio
|
4
4
|
module FastQC
|
5
5
|
class Converter
|
6
|
-
def initialize(
|
6
|
+
def initialize(fastqc_object, id: nil)
|
7
7
|
@id = id
|
8
|
-
@
|
8
|
+
@fastqc_object = fastqc_object
|
9
9
|
end
|
10
10
|
|
11
11
|
def convert_to(format)
|
@@ -23,20 +23,20 @@ module Bio
|
|
23
23
|
|
24
24
|
def to_json
|
25
25
|
json = if @id
|
26
|
-
{ @id => @
|
26
|
+
{ @id => @fastqc_object }
|
27
27
|
else
|
28
|
-
@
|
28
|
+
@fastqc_object
|
29
29
|
end
|
30
30
|
JSON.dump(json)
|
31
31
|
end
|
32
32
|
|
33
33
|
def to_jsonld
|
34
|
-
json_ld_object = Semantics.new(@
|
34
|
+
json_ld_object = Semantics.new(@fastqc_object, id: @id).json_ld_object
|
35
35
|
JSON.dump(json_ld_object)
|
36
36
|
end
|
37
37
|
|
38
38
|
def to_turtle
|
39
|
-
Semantics.new(@
|
39
|
+
Semantics.new(@fastqc_object, id: @id).turtle
|
40
40
|
end
|
41
41
|
|
42
42
|
def to_ttl
|
@@ -47,28 +47,28 @@ module Bio
|
|
47
47
|
identifier = if @id
|
48
48
|
@id
|
49
49
|
else
|
50
|
-
@
|
50
|
+
@fastqc_object[:filename].split(".").first
|
51
51
|
end
|
52
52
|
|
53
53
|
# return one-line tab separated value
|
54
54
|
[
|
55
55
|
identifier,
|
56
|
-
@
|
57
|
-
@
|
58
|
-
@
|
59
|
-
@
|
60
|
-
@
|
61
|
-
@
|
62
|
-
@
|
63
|
-
@
|
64
|
-
@
|
65
|
-
@
|
66
|
-
@
|
67
|
-
@
|
68
|
-
@
|
69
|
-
@
|
70
|
-
@
|
71
|
-
@
|
56
|
+
@fastqc_object[:fastqc_version],
|
57
|
+
@fastqc_object[:filename],
|
58
|
+
@fastqc_object[:file_type],
|
59
|
+
@fastqc_object[:encoding],
|
60
|
+
@fastqc_object[:total_sequences],
|
61
|
+
@fastqc_object[:filtered_sequences],
|
62
|
+
@fastqc_object[:sequence_length],
|
63
|
+
@fastqc_object[:min_length],
|
64
|
+
@fastqc_object[:max_length],
|
65
|
+
@fastqc_object[:mean_sequence_length],
|
66
|
+
@fastqc_object[:median_sequence_length],
|
67
|
+
@fastqc_object[:percent_gc],
|
68
|
+
@fastqc_object[:total_duplicate_percentage],
|
69
|
+
@fastqc_object[:overall_mean_quality_score],
|
70
|
+
@fastqc_object[:overall_median_quality_score],
|
71
|
+
@fastqc_object[:overall_n_content],
|
72
72
|
].join("\t")
|
73
73
|
end
|
74
74
|
end
|
data/lib/bio/fastqc/io.rb
CHANGED
@@ -5,8 +5,8 @@ require 'rdf/turtle'
|
|
5
5
|
module Bio
|
6
6
|
module FastQC
|
7
7
|
class IO
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
def initialize(fastqc_object, id: nil)
|
9
|
+
@fastqc_object = fastqc_object
|
10
10
|
@id = id
|
11
11
|
end
|
12
12
|
|
@@ -24,17 +24,17 @@ module Bio
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def write_json(output_file)
|
27
|
-
json = Converter.new(@
|
27
|
+
json = Converter.new(@fastqc_object, id: @id).to_json
|
28
28
|
open(output_file, 'w'){|file| file.puts(json) }
|
29
29
|
end
|
30
30
|
|
31
31
|
def write_jsonld(output_file)
|
32
|
-
jsonld = Converter.new(@
|
32
|
+
jsonld = Converter.new(@fastqc_object, id: @id).to_jsonld
|
33
33
|
open(output_file, 'w'){|file| file.puts(jsonld) }
|
34
34
|
end
|
35
35
|
|
36
36
|
def write_ttl(output_file)
|
37
|
-
semantics = Semantics.new(@
|
37
|
+
semantics = Semantics.new(@fastqc_object, id: @id)
|
38
38
|
graph = semantics.turtle_graph
|
39
39
|
prefixes = semantics.turtle_prefixes
|
40
40
|
RDF::Turtle::Writer.open(output_file, prefixes: prefixes) do |writer|
|
@@ -43,7 +43,7 @@ module Bio
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def write_tsv(output_file)
|
46
|
-
tsv = Converter.new(@
|
46
|
+
tsv = Converter.new(@fastqc_object, id: @id).to_tsv
|
47
47
|
open(output_file, 'w'){|file| file.puts(tsv) }
|
48
48
|
end
|
49
49
|
end
|
data/lib/bio/fastqc/parser.rb
CHANGED
@@ -5,246 +5,230 @@ module Bio
|
|
5
5
|
class Parser
|
6
6
|
def initialize(fastqc_data_txt)
|
7
7
|
@data = fastqc_data_txt
|
8
|
-
@
|
9
|
-
@
|
8
|
+
@module_results = parse_modules
|
9
|
+
@basic_statistics = basic_statistics
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
13
|
-
|
14
|
-
|
15
|
-
lines = node.split("\n")
|
16
|
-
rm_header = lines.map do |line|
|
17
|
-
if line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
|
18
|
-
line.split("\t")
|
19
|
-
end
|
20
|
-
end
|
21
|
-
rm_header.compact
|
12
|
+
def parse_modules
|
13
|
+
@data.split(">>END_MODULE\n").map do |mod|
|
14
|
+
mod.split("\n").map{|line| line.split("\t") }
|
22
15
|
end
|
23
16
|
end
|
24
17
|
|
25
|
-
|
26
|
-
|
27
|
-
|
18
|
+
#
|
19
|
+
# Basic Statistics module
|
20
|
+
#
|
28
21
|
|
29
22
|
def basic_statistics
|
30
|
-
Hash[*@
|
23
|
+
Hash[*@module_results[0].flatten]
|
31
24
|
end
|
32
25
|
|
33
|
-
def
|
34
|
-
@
|
26
|
+
def fastqc_version # software version of FastQC
|
27
|
+
@basic_statistics["##FastQC"]
|
35
28
|
end
|
36
29
|
|
37
|
-
def
|
38
|
-
@
|
30
|
+
def filename # input filename for FastQC program
|
31
|
+
@basic_statistics["Filename"]
|
39
32
|
end
|
40
33
|
|
41
|
-
def
|
42
|
-
@
|
34
|
+
def file_type # input file type
|
35
|
+
@basic_statistics["File type"]
|
43
36
|
end
|
44
37
|
|
45
|
-
def
|
46
|
-
@
|
38
|
+
def encoding # quality encoding method for input file type
|
39
|
+
@basic_statistics["Encoding"]
|
47
40
|
end
|
48
41
|
|
49
|
-
def
|
50
|
-
@
|
42
|
+
def total_sequences # total number of sequence reads
|
43
|
+
@basic_statistics["Total Sequences"].to_i
|
51
44
|
end
|
52
45
|
|
53
|
-
def
|
54
|
-
@
|
46
|
+
def sequences_flagged_as_poor_quality # number of sequence reads flagged as poor quality
|
47
|
+
@basic_statistics["Sequences flagged as poor quality"].to_i
|
55
48
|
end
|
56
49
|
|
57
|
-
def
|
58
|
-
|
59
|
-
if l =~ /\d-\d/
|
60
|
-
l.sub(/-\d+$/,"").to_i
|
61
|
-
else
|
62
|
-
l.to_i
|
63
|
-
end
|
50
|
+
def filtered_sequences # number of sequence reads filtered out
|
51
|
+
@basic_statistics["Filtered Sequences"].to_i
|
64
52
|
end
|
65
53
|
|
66
|
-
def
|
67
|
-
|
68
|
-
if l =~ /\d-\d/
|
69
|
-
l.sub(/^\d+-/,"").to_i
|
70
|
-
else
|
71
|
-
l.to_i
|
72
|
-
end
|
54
|
+
def sequence_length # store as string: can be range
|
55
|
+
@basic_statistics["Sequence length"]
|
73
56
|
end
|
74
57
|
|
75
|
-
def percent_gc
|
76
|
-
@
|
58
|
+
def percent_gc # overall percentage of GC content
|
59
|
+
@basic_statistics["%GC"].to_f
|
77
60
|
end
|
78
61
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
62
|
+
#
|
63
|
+
# Other modules
|
64
|
+
#
|
83
65
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
if per_base
|
88
|
-
v = per_base.map{|c| (10**(c[1].to_f/-10)).to_f }
|
89
|
-
-10 * Math.log10(v.reduce(:+) / v.size)
|
90
|
-
end
|
66
|
+
def get_module_matrix(module_name, num_of_header_rows)
|
67
|
+
mod = @module_results.select{|m| m[0][0] == ">>#{module_name}" }[0]
|
68
|
+
mod.drop(num_of_header_rows) if mod
|
91
69
|
end
|
92
70
|
|
93
|
-
|
94
|
-
|
95
|
-
per_base = self.per_base_sequence_quality
|
96
|
-
if per_base
|
97
|
-
v = per_base.map{|c| (10**(c[2].to_f/-10)).to_f }
|
98
|
-
-10 * Math.log10(v.reduce(:+) / v.size)
|
99
|
-
end
|
71
|
+
def per_base_sequence_quality
|
72
|
+
get_module_matrix("Per base sequence quality", 1)
|
100
73
|
end
|
101
74
|
|
102
75
|
def per_tile_sequence_quality
|
103
|
-
|
104
|
-
node.select{|n| n.first != ">>Per tile sequence quality" } if node
|
76
|
+
get_module_matrix("Per tile sequence quality", 1)
|
105
77
|
end
|
106
78
|
|
107
79
|
def per_sequence_quality_scores
|
108
|
-
|
109
|
-
node.select{|n| n.first != ">>Per sequence quality scores" } if node
|
80
|
+
get_module_matrix("Per sequence quality scores", 1)
|
110
81
|
end
|
111
82
|
|
112
83
|
def per_base_sequence_content
|
113
|
-
|
114
|
-
node.select{|n| n.first != ">>Per base sequence content" } if node
|
84
|
+
get_module_matrix("Per base sequence content", 1)
|
115
85
|
end
|
116
86
|
|
117
87
|
def per_sequence_gc_content
|
118
|
-
|
119
|
-
node.select{|n| n.first != ">>Per sequence GC content" } if node
|
88
|
+
get_module_matrix("Per sequence GC content", 1)
|
120
89
|
end
|
121
90
|
|
122
|
-
def
|
123
|
-
|
124
|
-
node.select{|n| n.first != ">>Per sequence GC content" } if node
|
91
|
+
def per_base_n_content
|
92
|
+
get_module_matrix("Per base N content", 1)
|
125
93
|
end
|
126
94
|
|
127
|
-
def
|
128
|
-
|
129
|
-
node.select{|n| n.first != ">>Per base N content" } if node
|
95
|
+
def sequence_length_distribution
|
96
|
+
get_module_matrix("Sequence Length Distribution", 1)
|
130
97
|
end
|
131
98
|
|
132
|
-
|
133
|
-
|
134
|
-
per_base = self.per_base_n_content
|
135
|
-
if per_base
|
136
|
-
v = per_base.map{|c| c[1].to_f }
|
137
|
-
v.reduce(:+) / v.size
|
138
|
-
end
|
99
|
+
def total_duplicate_percentage
|
100
|
+
get_module_matrix("Sequence Duplication Levels", 1)[0][1].to_f
|
139
101
|
end
|
140
102
|
|
141
|
-
def
|
142
|
-
|
143
|
-
node.select{|n| n.first != ">>Sequence Length Distribution" } if node
|
103
|
+
def sequence_duplication_levels
|
104
|
+
get_module_matrix("Sequence Duplication Levels", 2)
|
144
105
|
end
|
145
106
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
107
|
+
def overrepresented_sequences
|
108
|
+
get_module_matrix("Overrepresented sequences", 1)
|
109
|
+
end
|
110
|
+
|
111
|
+
def adapter_content
|
112
|
+
get_module_matrix("Adapter Content", 1)
|
113
|
+
end
|
114
|
+
|
115
|
+
def kmer_content
|
116
|
+
get_module_matrix("Kmer Content", 1)
|
117
|
+
end
|
118
|
+
|
119
|
+
#
|
120
|
+
# Custom modules
|
121
|
+
#
|
122
|
+
|
123
|
+
def min_length
|
124
|
+
sequence_length.sub(/-\d+$/,"").to_i
|
125
|
+
end
|
126
|
+
|
127
|
+
def max_length
|
128
|
+
sequence_length.sub(/^\d+-/,"").to_i
|
129
|
+
end
|
130
|
+
|
131
|
+
def per_base_quality_column(mean_or_median)
|
132
|
+
case mean_or_median
|
133
|
+
when :mean
|
134
|
+
1
|
135
|
+
when :median
|
136
|
+
2
|
163
137
|
end
|
164
138
|
end
|
165
139
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
length = length_count[0]
|
172
|
-
count = length_count[1].to_i
|
173
|
-
if length =~ /\d-\d/
|
174
|
-
f = length.sub(/-\d+$/,"").to_i
|
175
|
-
b = length.sub(/^\d+-/,"").to_i
|
176
|
-
mean = (f + b) / 2
|
177
|
-
[mean.to_f] * count
|
178
|
-
else
|
179
|
-
[length.to_f] * count
|
180
|
-
end
|
181
|
-
end
|
182
|
-
sorted = array.flatten.sort
|
183
|
-
quot = sorted.size / 2
|
184
|
-
if !sorted.size.even?
|
185
|
-
sorted[quot]
|
186
|
-
else
|
187
|
-
f = sorted[quot]
|
188
|
-
b = sorted[quot - 1]
|
189
|
-
(f + b) / 2
|
190
|
-
end
|
140
|
+
def overall_quality_score(mean_or_median)
|
141
|
+
per_base = per_base_sequence_quality.drop(1) # drop header
|
142
|
+
column = per_base_quality_column(mean_or_median)
|
143
|
+
v = per_base.map do |row|
|
144
|
+
(10**(row[column].to_f / -10)).to_f
|
191
145
|
end
|
146
|
+
-10 * Math.log10(v.reduce(:+) / v.size)
|
192
147
|
end
|
193
148
|
|
194
|
-
def
|
195
|
-
|
196
|
-
node.select{|n| n.first != ">>Sequence Duplication Levels" && n.first != "\#Total Duplicate Percentage" } if node
|
149
|
+
def overall_mean_quality_score
|
150
|
+
overall_quality_score(:mean)
|
197
151
|
end
|
198
152
|
|
199
|
-
def
|
200
|
-
|
201
|
-
node.select{|n| n.first == "\#Total Duplicate Percentage" }.flatten[1].to_f if node
|
153
|
+
def overall_median_quality_score
|
154
|
+
overall_quality_score(:median)
|
202
155
|
end
|
203
156
|
|
204
|
-
def
|
205
|
-
|
206
|
-
|
157
|
+
def overall_n_content
|
158
|
+
per_base = per_base_n_content
|
159
|
+
v = per_base.map{|c| c[1].to_f }
|
160
|
+
v.reduce(:+) / v.size
|
207
161
|
end
|
208
162
|
|
209
|
-
def
|
210
|
-
|
211
|
-
|
163
|
+
def mean_sequence_length
|
164
|
+
dist = sequence_length_distribution.drop(1) # drop column header
|
165
|
+
if dist.size == 1
|
166
|
+
dist[0][0].to_f
|
167
|
+
else
|
168
|
+
sum = dist.map do |length_count|
|
169
|
+
l = length_count[0]
|
170
|
+
c = length_count[1].to_f
|
171
|
+
((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2) * c
|
172
|
+
end
|
173
|
+
sum.reduce(:+) / dist.map{|l_c| l_c[1].to_f }.reduce(:+)
|
174
|
+
end
|
212
175
|
end
|
213
176
|
|
214
|
-
def
|
215
|
-
|
216
|
-
|
177
|
+
def median_sequence_length
|
178
|
+
dist = sequence_length_distribution.drop(1) # drop column header
|
179
|
+
if dist.size == 1
|
180
|
+
dist[0][0].to_f
|
181
|
+
else
|
182
|
+
k = dist.map{|l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
|
183
|
+
median = 0
|
184
|
+
dist.each do |l_c|
|
185
|
+
c = l_c[1].to_f # count of reads in this length range
|
186
|
+
if k > c
|
187
|
+
k -= c
|
188
|
+
else
|
189
|
+
l = l_c[0]
|
190
|
+
median = ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2)
|
191
|
+
break
|
192
|
+
end
|
193
|
+
end
|
194
|
+
median
|
195
|
+
end
|
217
196
|
end
|
218
197
|
|
219
198
|
def summary
|
199
|
+
parse
|
200
|
+
end
|
201
|
+
|
202
|
+
def parse
|
220
203
|
{
|
221
|
-
fastqc_version:
|
222
|
-
filename:
|
223
|
-
file_type:
|
224
|
-
encoding:
|
225
|
-
total_sequences:
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
204
|
+
fastqc_version: fastqc_version,
|
205
|
+
filename: filename,
|
206
|
+
file_type: file_type,
|
207
|
+
encoding: encoding,
|
208
|
+
total_sequences: total_sequences,
|
209
|
+
sequences_flagged_as_poor_quality: sequences_flagged_as_poor_quality,
|
210
|
+
filtered_sequences: filtered_sequences,
|
211
|
+
sequence_length: sequence_length,
|
212
|
+
percent_gc: percent_gc,
|
213
|
+
per_base_sequence_quality: per_base_sequence_quality,
|
214
|
+
per_tile_sequence_quality: per_tile_sequence_quality,
|
215
|
+
per_sequence_quality_scores: per_sequence_quality_scores,
|
216
|
+
per_base_sequence_content: per_base_sequence_content,
|
217
|
+
per_sequence_gc_content: per_sequence_gc_content,
|
218
|
+
per_base_n_content: per_base_n_content,
|
219
|
+
sequence_length_distribution: sequence_length_distribution,
|
220
|
+
total_duplicate_percentage: total_duplicate_percentage,
|
221
|
+
sequence_duplication_levels: sequence_duplication_levels,
|
222
|
+
overrepresented_sequences: overrepresented_sequences,
|
223
|
+
adapter_content: adapter_content,
|
224
|
+
kmer_content: kmer_content,
|
225
|
+
min_length: min_length,
|
226
|
+
max_length: max_length,
|
227
|
+
overall_mean_quality_score: overall_mean_quality_score,
|
228
|
+
overall_median_quality_score: overall_median_quality_score,
|
229
|
+
overall_n_content: overall_n_content,
|
230
|
+
mean_sequence_length: mean_sequence_length,
|
231
|
+
median_sequence_length: median_sequence_length,
|
248
232
|
}
|
249
233
|
end
|
250
234
|
end
|
data/lib/bio/fastqc/semantics.rb
CHANGED
@@ -6,9 +6,13 @@ require 'rdf/turtle'
|
|
6
6
|
module Bio
|
7
7
|
module FastQC
|
8
8
|
class Semantics
|
9
|
-
def initialize(
|
9
|
+
def initialize(fastqc_object, id: nil)
|
10
10
|
@id = id
|
11
|
-
@
|
11
|
+
@fastqc_object = fastqc_object
|
12
|
+
end
|
13
|
+
|
14
|
+
def rdf_version
|
15
|
+
"0.1.0"
|
12
16
|
end
|
13
17
|
|
14
18
|
def turtle
|
@@ -23,6 +27,9 @@ module Bio
|
|
23
27
|
{
|
24
28
|
"uo" => "http://purl.obolibrary.org/obo/",
|
25
29
|
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
30
|
+
"dcterms" => "http://purl.org/dc/terms/",
|
31
|
+
"pav" => "http://purl.org/pav/",
|
32
|
+
"foaf" => "http://xmlns.com/foaf/0.1/",
|
26
33
|
}
|
27
34
|
end
|
28
35
|
|
@@ -32,18 +39,30 @@ module Bio
|
|
32
39
|
object
|
33
40
|
end
|
34
41
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
42
|
+
def uri_base
|
43
|
+
"http://purl.jp/bio/01/quanto"
|
44
|
+
end
|
45
|
+
|
46
|
+
def identifier_literal
|
47
|
+
@id ? @id : "QNT" + @fastqc_object[:filename].split(".")[0]
|
48
|
+
end
|
49
|
+
|
50
|
+
def identifier_uri
|
51
|
+
uri_base + "/resource/" + identifier_literal
|
41
52
|
end
|
42
53
|
|
43
54
|
def object_core
|
44
55
|
{
|
45
56
|
"@context" => jsonld_context,
|
46
|
-
"@id" =>
|
57
|
+
"@id" => identifier_uri,
|
58
|
+
"@type" => "SequenceStatisticsReport",
|
59
|
+
"dcterms:identifier" => identifier_literal,
|
60
|
+
"dcterms:contributor" => ["Tazro Ohta", "Shuichi Kawashima"],
|
61
|
+
"dcterms:created" => Time.now.strftime("%Y-%m-%d"),
|
62
|
+
"dcterms:license" => "http://creativecommons.org/licenses/by-sa/2.1/jp/deed.en",
|
63
|
+
"dcterms:publisher" => "http://dbcls.rois.ac.jp/",
|
64
|
+
"pav:version" => rdf_version,
|
65
|
+
"foaf:page" => "http://quanto.dbcls.jp",
|
47
66
|
}
|
48
67
|
end
|
49
68
|
|
@@ -94,24 +113,26 @@ module Bio
|
|
94
113
|
end
|
95
114
|
|
96
115
|
def fastqc_version
|
97
|
-
{
|
116
|
+
{
|
117
|
+
"fastqcVersion" => @fastqc_object[:fastqc_version],
|
118
|
+
}
|
98
119
|
end
|
99
120
|
|
100
121
|
def filename
|
101
122
|
{
|
102
|
-
"filename" => @
|
123
|
+
"filename" => @fastqc_object[:filename],
|
103
124
|
}
|
104
125
|
end
|
105
126
|
|
106
127
|
def file_type
|
107
128
|
{
|
108
|
-
"fileType" => @
|
129
|
+
"fileType" => @fastqc_object[:file_type],
|
109
130
|
}
|
110
131
|
end
|
111
132
|
|
112
133
|
def encoding
|
113
134
|
{
|
114
|
-
"encoding" => @
|
135
|
+
"encoding" => @fastqc_object[:encoding],
|
115
136
|
}
|
116
137
|
end
|
117
138
|
|
@@ -120,7 +141,7 @@ module Bio
|
|
120
141
|
"totalSequences" => {
|
121
142
|
"@type" => "SequenceReadContent",
|
122
143
|
"hasUnit" => "uo:CountUnit",
|
123
|
-
"rdf:value" => @
|
144
|
+
"rdf:value" => @fastqc_object[:total_sequences],
|
124
145
|
}
|
125
146
|
}
|
126
147
|
end
|
@@ -130,7 +151,7 @@ module Bio
|
|
130
151
|
"filteredSequences" => {
|
131
152
|
"@type" => "SequenceReadContent",
|
132
153
|
"hasUnit" => "uo:CountUnit",
|
133
|
-
"rdf:value" => @
|
154
|
+
"rdf:value" => @fastqc_object[:filtered_sequences],
|
134
155
|
}
|
135
156
|
}
|
136
157
|
end
|
@@ -140,7 +161,7 @@ module Bio
|
|
140
161
|
"sequenceLength" => {
|
141
162
|
"@type" => "SequenceReadLength",
|
142
163
|
"hasUnit" => "uo:CountUnit",
|
143
|
-
"rdf:value" => @
|
164
|
+
"rdf:value" => @fastqc_object[:sequence_length],
|
144
165
|
}
|
145
166
|
}
|
146
167
|
end
|
@@ -150,7 +171,7 @@ module Bio
|
|
150
171
|
"percentGC" => {
|
151
172
|
"@type" => "NucleotideBaseContent",
|
152
173
|
"hasUnit" => "uo:CountUnit",
|
153
|
-
"rdf:value" => @
|
174
|
+
"rdf:value" => @fastqc_object[:percent_gc],
|
154
175
|
}
|
155
176
|
}
|
156
177
|
end
|
@@ -158,7 +179,7 @@ module Bio
|
|
158
179
|
def per_base_sequence_quality
|
159
180
|
{
|
160
181
|
"@type" => "PerBaseSequenceQuality",
|
161
|
-
"hasRow" => per_base_sequence_quality_rows(@
|
182
|
+
"hasRow" => per_base_sequence_quality_rows(@fastqc_object[:per_base_sequence_quality]),
|
162
183
|
}
|
163
184
|
end
|
164
185
|
|
@@ -220,7 +241,7 @@ module Bio
|
|
220
241
|
def per_sequence_quality_scores
|
221
242
|
{
|
222
243
|
"@type" => "PerSequnceQualityScores",
|
223
|
-
"hasRow" => per_sequence_quality_scores_rows(@
|
244
|
+
"hasRow" => per_sequence_quality_scores_rows(@fastqc_object[:per_sequence_quality_scores]),
|
224
245
|
}
|
225
246
|
end
|
226
247
|
|
@@ -248,7 +269,7 @@ module Bio
|
|
248
269
|
def per_base_sequence_content
|
249
270
|
{
|
250
271
|
"@type" => "PerBaseSequenceContent",
|
251
|
-
"hasRow" => per_base_sequence_content_rows(@
|
272
|
+
"hasRow" => per_base_sequence_content_rows(@fastqc_object[:per_base_sequence_content]),
|
252
273
|
}
|
253
274
|
end
|
254
275
|
|
@@ -293,7 +314,7 @@ module Bio
|
|
293
314
|
def per_sequence_gc_content
|
294
315
|
{
|
295
316
|
"@type" => "PerSequenceGCContent",
|
296
|
-
"hasRow" => per_sequence_gc_content_rows(@
|
317
|
+
"hasRow" => per_sequence_gc_content_rows(@fastqc_object[:per_sequence_gc_content]),
|
297
318
|
}
|
298
319
|
end
|
299
320
|
|
@@ -321,7 +342,7 @@ module Bio
|
|
321
342
|
def per_base_n_content
|
322
343
|
{
|
323
344
|
"@type" => "PerBaseNContent",
|
324
|
-
"hasRow" => per_base_n_content_rows(@
|
345
|
+
"hasRow" => per_base_n_content_rows(@fastqc_object[:per_base_n_content]),
|
325
346
|
}
|
326
347
|
end
|
327
348
|
|
@@ -348,7 +369,7 @@ module Bio
|
|
348
369
|
def sequence_length_distribution
|
349
370
|
{
|
350
371
|
"@type" => "SequenceLengthDistribution",
|
351
|
-
"hasRow" => sequence_length_distribution_rows(@
|
372
|
+
"hasRow" => sequence_length_distribution_rows(@fastqc_object[:sequence_length_distribution]),
|
352
373
|
}
|
353
374
|
end
|
354
375
|
|
@@ -381,7 +402,7 @@ module Bio
|
|
381
402
|
def sequence_duplication_levels
|
382
403
|
{
|
383
404
|
"@type" => "SequenceDuplicationLevels",
|
384
|
-
"hasRow" => sequence_duplication_levels_rows(@
|
405
|
+
"hasRow" => sequence_duplication_levels_rows(@fastqc_object[:sequence_duplication_levels]),
|
385
406
|
}
|
386
407
|
end
|
387
408
|
|
@@ -410,7 +431,7 @@ module Bio
|
|
410
431
|
def overrepresented_sequences
|
411
432
|
{
|
412
433
|
"@type" => "OverrepresentedSequences",
|
413
|
-
"hasRow" => overrepresented_sequences_rows(@
|
434
|
+
"hasRow" => overrepresented_sequences_rows(@fastqc_object[:overrepresented_sequences]),
|
414
435
|
}
|
415
436
|
end
|
416
437
|
|
@@ -446,7 +467,7 @@ module Bio
|
|
446
467
|
def kmer_content
|
447
468
|
{
|
448
469
|
"@type" => "KmerContent",
|
449
|
-
"hasRow" => kmer_content_rows(@
|
470
|
+
"hasRow" => kmer_content_rows(@fastqc_object[:kmer_content]),
|
450
471
|
}
|
451
472
|
end
|
452
473
|
|
@@ -486,7 +507,7 @@ module Bio
|
|
486
507
|
"minSequenceLength" => {
|
487
508
|
"@type" => "SequenceReadLength",
|
488
509
|
"hasUnit" => "uo:CountUnit",
|
489
|
-
"rdf:value" => @
|
510
|
+
"rdf:value" => @fastqc_object[:min_length],
|
490
511
|
}
|
491
512
|
}
|
492
513
|
end
|
@@ -496,7 +517,7 @@ module Bio
|
|
496
517
|
"maxSequenceLength" => {
|
497
518
|
"@type" => "SequenceReadLength",
|
498
519
|
"hasUnit" => "uo:CountUnit",
|
499
|
-
"rdf:value" => @
|
520
|
+
"rdf:value" => @fastqc_object[:max_length],
|
500
521
|
}
|
501
522
|
}
|
502
523
|
end
|
@@ -506,7 +527,7 @@ module Bio
|
|
506
527
|
"meanSequenceLength" => {
|
507
528
|
"@type" => "SequenceReadLength",
|
508
529
|
"hasUnit" => "uo:CountUnit",
|
509
|
-
"rdf:value" => @
|
530
|
+
"rdf:value" => @fastqc_object[:mean_sequence_length],
|
510
531
|
}
|
511
532
|
}
|
512
533
|
end
|
@@ -516,7 +537,7 @@ module Bio
|
|
516
537
|
"medianSequenceLength" => {
|
517
538
|
"@type" => "SequenceReadLength",
|
518
539
|
"hasUnit" => "uo:CountUnit",
|
519
|
-
"rdf:value" => @
|
540
|
+
"rdf:value" => @fastqc_object[:median_sequence_length],
|
520
541
|
}
|
521
542
|
}
|
522
543
|
end
|
@@ -526,7 +547,7 @@ module Bio
|
|
526
547
|
"overallMeanBaseCallQuality" => {
|
527
548
|
"@type" => "PhredQualityScore",
|
528
549
|
"hasUnit" => "uo:CountUnit",
|
529
|
-
"rdf:value" => @
|
550
|
+
"rdf:value" => @fastqc_object[:overall_mean_quality_score],
|
530
551
|
}
|
531
552
|
}
|
532
553
|
end
|
@@ -536,7 +557,7 @@ module Bio
|
|
536
557
|
"overallMedianBaseCallQuality" => {
|
537
558
|
"@type" => "PhredQualityScore",
|
538
559
|
"hasUnit" => "uo:CountUnit",
|
539
|
-
"rdf:value" => @
|
560
|
+
"rdf:value" => @fastqc_object[:overall_median_quality_score],
|
540
561
|
}
|
541
562
|
}
|
542
563
|
end
|
@@ -546,7 +567,7 @@ module Bio
|
|
546
567
|
"overallNContent" => {
|
547
568
|
"@type" => "NContent",
|
548
569
|
"hasUnit" => "uo:Percentage",
|
549
|
-
"rdf:value" => @
|
570
|
+
"rdf:value" => @fastqc_object[:overall_n_content],
|
550
571
|
}
|
551
572
|
}
|
552
573
|
end
|
@@ -557,10 +578,10 @@ module Bio
|
|
557
578
|
|
558
579
|
def jsonld_context
|
559
580
|
# definition of imported terms in @context
|
560
|
-
object =
|
581
|
+
object = turtle_prefixes
|
561
582
|
|
562
583
|
# definition of local ontology terms
|
563
|
-
domain = "
|
584
|
+
domain = uri_base + "/ontology/sos#"
|
564
585
|
|
565
586
|
# definition of class in @context
|
566
587
|
sos_class.each do |term|
|
@@ -597,13 +618,6 @@ module Bio
|
|
597
618
|
object
|
598
619
|
end
|
599
620
|
|
600
|
-
def imported_keywords
|
601
|
-
{
|
602
|
-
"uo" => "http://purl.obolibrary.org/obo/",
|
603
|
-
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
604
|
-
}
|
605
|
-
end
|
606
|
-
|
607
621
|
#
|
608
622
|
# definition of classes
|
609
623
|
#
|
@@ -690,6 +704,7 @@ module Bio
|
|
690
704
|
|
691
705
|
def sos_data_properties_string
|
692
706
|
[
|
707
|
+
"fastqcVersion",
|
693
708
|
"filename",
|
694
709
|
"fileType",
|
695
710
|
"encoding",
|
data/spec/bio-fastqc_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe Bio::FastQC do
|
|
14
14
|
describe '#read' do
|
15
15
|
it 'returns parsed data from zipfile' do
|
16
16
|
expect(@data).not_to be_empty
|
17
|
+
expect(@data).not_to be_nil
|
17
18
|
end
|
18
19
|
end
|
19
20
|
end
|
@@ -25,55 +26,112 @@ describe Bio::FastQC do
|
|
25
26
|
end
|
26
27
|
|
27
28
|
describe '#fastqc_version' do
|
28
|
-
it 'returns fastqc version as String
|
29
|
+
it 'returns fastqc version as String' do
|
29
30
|
expect(@parser.fastqc_version).to be_instance_of(String)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'does not return empty string' do
|
30
34
|
expect(@parser.fastqc_version).not_to be_empty
|
31
35
|
end
|
36
|
+
|
37
|
+
it 'does not return nil' do
|
38
|
+
expect(@parser.fastqc_version).not_to be_nil
|
39
|
+
end
|
32
40
|
end
|
33
41
|
|
34
42
|
describe '#filename' do
|
35
|
-
it 'returns filename as String
|
43
|
+
it 'returns filename as String' do
|
36
44
|
expect(@parser.filename).to be_instance_of(String)
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'does not return empty string' do
|
37
48
|
expect(@parser.filename).not_to be_empty
|
38
49
|
end
|
50
|
+
|
51
|
+
it 'does not return nil' do
|
52
|
+
expect(@parser.filename).not_to be_nil
|
53
|
+
end
|
39
54
|
end
|
40
55
|
|
41
56
|
describe '#file_type' do
|
42
|
-
it 'returns file type as String
|
57
|
+
it 'returns file type as String' do
|
43
58
|
expect(@parser.file_type).to be_instance_of(String)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'does not return empty string' do
|
44
62
|
expect(@parser.file_type).not_to be_empty
|
45
63
|
end
|
64
|
+
|
65
|
+
it 'does not return nil' do
|
66
|
+
expect(@parser.file_type).not_to be_nil
|
67
|
+
end
|
46
68
|
end
|
47
69
|
|
48
70
|
describe '#encoding' do
|
49
|
-
it 'returns encoding type as String
|
71
|
+
it 'returns encoding type as String' do
|
50
72
|
expect(@parser.encoding).to be_instance_of(String)
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'does not return empty string' do
|
51
76
|
expect(@parser.encoding).not_to be_empty
|
52
77
|
end
|
78
|
+
|
79
|
+
it 'does not return nil' do
|
80
|
+
expect(@parser.encoding).not_to be_nil
|
81
|
+
end
|
53
82
|
end
|
54
83
|
|
55
84
|
describe '#total_sequences' do
|
56
85
|
it 'returns total number of sequences as Fixnum' do
|
57
86
|
expect(@parser.total_sequences).to be_instance_of(Fixnum)
|
58
87
|
end
|
88
|
+
|
89
|
+
it 'returns integer larger than zero' do
|
90
|
+
expect(@parser.total_sequences).to be > 0
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'does not return nil' do
|
94
|
+
expect(@parser.total_sequences).not_to be_nil
|
95
|
+
end
|
59
96
|
end
|
60
97
|
|
61
98
|
describe '#filtered_sequences' do
|
62
|
-
it 'returns number of filtered sequence as Fixnum
|
63
|
-
|
99
|
+
it 'returns number of filtered sequence as Fixnum, can be nil' do
|
100
|
+
if @parser.filtered_sequences
|
101
|
+
expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe '#sequences_flagged_as_poor_quality' do
|
107
|
+
it 'returns number of sequences flagged as poor quality as Fixnum, can be nil' do
|
108
|
+
if @parser.sequences_flagged_as_poor_quality
|
109
|
+
expect(@parser.sequences_flagged_as_poor_quality).to be_instance_of(Fixnum)
|
110
|
+
end
|
64
111
|
end
|
65
112
|
end
|
66
113
|
|
67
114
|
describe '#sequence_length' do
|
68
|
-
it 'returns length of sequence as String
|
115
|
+
it 'returns length of sequence as String' do
|
69
116
|
expect(@parser.sequence_length).to be_instance_of(String)
|
117
|
+
end
|
118
|
+
|
119
|
+
it 'does not return empty string' do
|
70
120
|
expect(@parser.sequence_length).not_to be_empty
|
71
121
|
end
|
122
|
+
|
123
|
+
it 'does not return nil' do
|
124
|
+
expect(@parser.sequence_length).not_to be_nil
|
125
|
+
end
|
72
126
|
end
|
73
127
|
|
74
128
|
describe '#percent_gc' do
|
75
|
-
it 'returns percentage of GC content as
|
76
|
-
expect(@parser.percent_gc).to be_instance_of(
|
129
|
+
it 'returns percentage of GC content as Float' do
|
130
|
+
expect(@parser.percent_gc).to be_instance_of(Float)
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'does not return nil' do
|
134
|
+
expect(@parser.percent_gc).not_to be_nil
|
77
135
|
end
|
78
136
|
end
|
79
137
|
|
@@ -190,6 +248,10 @@ describe Bio::FastQC do
|
|
190
248
|
it 'returns duplicate percentage as Float and not empty' do
|
191
249
|
expect(@parser.total_duplicate_percentage).to be_instance_of(Float)
|
192
250
|
end
|
251
|
+
|
252
|
+
it 'does not returns nil' do
|
253
|
+
expect(@parser.total_duplicate_percentage).not_to be_nil
|
254
|
+
end
|
193
255
|
end
|
194
256
|
|
195
257
|
describe '#sequence_duplication_levels' do
|
@@ -256,42 +318,88 @@ describe Bio::FastQC do
|
|
256
318
|
it 'returns minimum read length as Fixnum and not empty' do
|
257
319
|
expect(@parser.min_length).to be_instance_of(Fixnum)
|
258
320
|
end
|
321
|
+
|
322
|
+
it 'returns integer larger than zero' do
|
323
|
+
expect(@parser.min_length).to be > 0
|
324
|
+
end
|
325
|
+
|
326
|
+
it 'does not return nil' do
|
327
|
+
expect(@parser.min_length).not_to be_nil
|
328
|
+
end
|
259
329
|
end
|
260
330
|
|
261
331
|
describe '#max_length' do
|
262
332
|
it 'returns maximum read length as Fixnum and not empty' do
|
263
333
|
expect(@parser.max_length).to be_instance_of(Fixnum)
|
264
334
|
end
|
335
|
+
|
336
|
+
it 'returns integer larger than zero' do
|
337
|
+
expect(@parser.max_length).to be > 0
|
338
|
+
end
|
339
|
+
|
340
|
+
it 'does not return nil' do
|
341
|
+
expect(@parser.max_length).not_to be_nil
|
342
|
+
end
|
265
343
|
end
|
266
344
|
|
267
345
|
describe '#overall_mean_quality_score' do
|
268
346
|
it 'returns overall mean quality score as Float and not empty' do
|
269
347
|
expect(@parser.overall_mean_quality_score).to be_instance_of(Float)
|
270
348
|
end
|
349
|
+
|
350
|
+
it 'does not return nil' do
|
351
|
+
expect(@parser.overall_mean_quality_score).not_to be_nil
|
352
|
+
end
|
271
353
|
end
|
272
354
|
|
273
355
|
describe '#overall_median_quality_score' do
|
274
356
|
it 'returns overall median quality score as Float and not empty' do
|
275
357
|
expect(@parser.overall_median_quality_score).to be_instance_of(Float)
|
276
358
|
end
|
359
|
+
|
360
|
+
it 'does not return nil' do
|
361
|
+
expect(@parser.overall_median_quality_score).not_to be_nil
|
362
|
+
end
|
277
363
|
end
|
278
364
|
|
279
365
|
describe '#overall_n_content' do
|
280
366
|
it 'returns overall N content as Float and not empty' do
|
281
367
|
expect(@parser.overall_n_content).to be_instance_of(Float)
|
282
368
|
end
|
369
|
+
|
370
|
+
it 'does not return nil' do
|
371
|
+
expect(@parser.overall_n_content).not_to be_nil
|
372
|
+
end
|
283
373
|
end
|
284
374
|
|
285
375
|
describe '#mean_sequence_length' do
|
286
376
|
it 'returns mean sequence length from read length distribution as Float and not empty' do
|
287
377
|
expect(@parser.mean_sequence_length).to be_instance_of(Float)
|
288
378
|
end
|
379
|
+
|
380
|
+
it 'does not return nil' do
|
381
|
+
expect(@parser.mean_sequence_length).not_to be_nil
|
382
|
+
end
|
289
383
|
end
|
290
384
|
|
291
385
|
describe '#median_sequence_length' do
|
292
386
|
it 'returns median sequence length from read length distribution as Float and not empty' do
|
293
387
|
expect(@parser.median_sequence_length).to be_instance_of(Float)
|
294
388
|
end
|
389
|
+
|
390
|
+
it 'does not return nil' do
|
391
|
+
expect(@parser.median_sequence_length).not_to be_nil
|
392
|
+
end
|
393
|
+
end
|
394
|
+
|
395
|
+
describe '#parse' do
|
396
|
+
it 'does not return nil' do
|
397
|
+
expect(@parser.parse).not_to be_nil
|
398
|
+
end
|
399
|
+
|
400
|
+
it 'returns hash' do
|
401
|
+
expect(@parser.parse).to be_instance_of(Hash)
|
402
|
+
end
|
295
403
|
end
|
296
404
|
end
|
297
405
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-fastqc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tazro Inutano Ohta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -229,6 +229,7 @@ files:
|
|
229
229
|
- lib/bio/fastqc/semantics.rb
|
230
230
|
- spec/bio-fastqc_spec.rb
|
231
231
|
- spec/example_fastqc.zip
|
232
|
+
- spec/example_fastqc_454.zip
|
232
233
|
- spec/spec_helper.rb
|
233
234
|
homepage: http://github.com/inutano/bioruby-fastqc
|
234
235
|
licenses:
|