bio-fastqc 0.5.2 → 0.6.0
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/bio/fastqc/converter.rb +23 -23
- data/lib/bio/fastqc/io.rb +6 -6
- data/lib/bio/fastqc/parser.rb +154 -170
- data/lib/bio/fastqc/semantics.rb +57 -42
- data/spec/bio-fastqc_spec.rb +117 -9
- data/spec/example_fastqc_454.zip +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1a0b30382d2c41b0fc5327cc1e18ce63b7f190e
|
4
|
+
data.tar.gz: 479c6f71276f0360f15cc3286bafe8f12b7404d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56c3e7b739a99e6ee39ffc958a8f59f61438ebfd44a7dd3ae9dcb1ea0ea481fb40c927673f6f8ae052e4422cfb4b8e38b6da542ace153b24d7e4f439284f68b3
|
7
|
+
data.tar.gz: 41ef0bc02eb028d9f4de4661cd4cc67159159d1e17ff4f5c06d8779ee3b0ddaf1b348f6aea1d06222873443820aba50133cb6732cfe2a8f875e87c6f12f2573f
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
data/lib/bio/fastqc/converter.rb
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
module Bio
|
4
4
|
module FastQC
|
5
5
|
class Converter
|
6
|
-
def initialize(
|
6
|
+
def initialize(fastqc_object, id: nil)
|
7
7
|
@id = id
|
8
|
-
@
|
8
|
+
@fastqc_object = fastqc_object
|
9
9
|
end
|
10
10
|
|
11
11
|
def convert_to(format)
|
@@ -23,20 +23,20 @@ module Bio
|
|
23
23
|
|
24
24
|
def to_json
|
25
25
|
json = if @id
|
26
|
-
{ @id => @
|
26
|
+
{ @id => @fastqc_object }
|
27
27
|
else
|
28
|
-
@
|
28
|
+
@fastqc_object
|
29
29
|
end
|
30
30
|
JSON.dump(json)
|
31
31
|
end
|
32
32
|
|
33
33
|
def to_jsonld
|
34
|
-
json_ld_object = Semantics.new(@
|
34
|
+
json_ld_object = Semantics.new(@fastqc_object, id: @id).json_ld_object
|
35
35
|
JSON.dump(json_ld_object)
|
36
36
|
end
|
37
37
|
|
38
38
|
def to_turtle
|
39
|
-
Semantics.new(@
|
39
|
+
Semantics.new(@fastqc_object, id: @id).turtle
|
40
40
|
end
|
41
41
|
|
42
42
|
def to_ttl
|
@@ -47,28 +47,28 @@ module Bio
|
|
47
47
|
identifier = if @id
|
48
48
|
@id
|
49
49
|
else
|
50
|
-
@
|
50
|
+
@fastqc_object[:filename].split(".").first
|
51
51
|
end
|
52
52
|
|
53
53
|
# return one-line tab separated value
|
54
54
|
[
|
55
55
|
identifier,
|
56
|
-
@
|
57
|
-
@
|
58
|
-
@
|
59
|
-
@
|
60
|
-
@
|
61
|
-
@
|
62
|
-
@
|
63
|
-
@
|
64
|
-
@
|
65
|
-
@
|
66
|
-
@
|
67
|
-
@
|
68
|
-
@
|
69
|
-
@
|
70
|
-
@
|
71
|
-
@
|
56
|
+
@fastqc_object[:fastqc_version],
|
57
|
+
@fastqc_object[:filename],
|
58
|
+
@fastqc_object[:file_type],
|
59
|
+
@fastqc_object[:encoding],
|
60
|
+
@fastqc_object[:total_sequences],
|
61
|
+
@fastqc_object[:filtered_sequences],
|
62
|
+
@fastqc_object[:sequence_length],
|
63
|
+
@fastqc_object[:min_length],
|
64
|
+
@fastqc_object[:max_length],
|
65
|
+
@fastqc_object[:mean_sequence_length],
|
66
|
+
@fastqc_object[:median_sequence_length],
|
67
|
+
@fastqc_object[:percent_gc],
|
68
|
+
@fastqc_object[:total_duplicate_percentage],
|
69
|
+
@fastqc_object[:overall_mean_quality_score],
|
70
|
+
@fastqc_object[:overall_median_quality_score],
|
71
|
+
@fastqc_object[:overall_n_content],
|
72
72
|
].join("\t")
|
73
73
|
end
|
74
74
|
end
|
data/lib/bio/fastqc/io.rb
CHANGED
@@ -5,8 +5,8 @@ require 'rdf/turtle'
|
|
5
5
|
module Bio
|
6
6
|
module FastQC
|
7
7
|
class IO
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
def initialize(fastqc_object, id: nil)
|
9
|
+
@fastqc_object = fastqc_object
|
10
10
|
@id = id
|
11
11
|
end
|
12
12
|
|
@@ -24,17 +24,17 @@ module Bio
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def write_json(output_file)
|
27
|
-
json = Converter.new(@
|
27
|
+
json = Converter.new(@fastqc_object, id: @id).to_json
|
28
28
|
open(output_file, 'w'){|file| file.puts(json) }
|
29
29
|
end
|
30
30
|
|
31
31
|
def write_jsonld(output_file)
|
32
|
-
jsonld = Converter.new(@
|
32
|
+
jsonld = Converter.new(@fastqc_object, id: @id).to_jsonld
|
33
33
|
open(output_file, 'w'){|file| file.puts(jsonld) }
|
34
34
|
end
|
35
35
|
|
36
36
|
def write_ttl(output_file)
|
37
|
-
semantics = Semantics.new(@
|
37
|
+
semantics = Semantics.new(@fastqc_object, id: @id)
|
38
38
|
graph = semantics.turtle_graph
|
39
39
|
prefixes = semantics.turtle_prefixes
|
40
40
|
RDF::Turtle::Writer.open(output_file, prefixes: prefixes) do |writer|
|
@@ -43,7 +43,7 @@ module Bio
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def write_tsv(output_file)
|
46
|
-
tsv = Converter.new(@
|
46
|
+
tsv = Converter.new(@fastqc_object, id: @id).to_tsv
|
47
47
|
open(output_file, 'w'){|file| file.puts(tsv) }
|
48
48
|
end
|
49
49
|
end
|
data/lib/bio/fastqc/parser.rb
CHANGED
@@ -5,246 +5,230 @@ module Bio
|
|
5
5
|
class Parser
|
6
6
|
def initialize(fastqc_data_txt)
|
7
7
|
@data = fastqc_data_txt
|
8
|
-
@
|
9
|
-
@
|
8
|
+
@module_results = parse_modules
|
9
|
+
@basic_statistics = basic_statistics
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
13
|
-
|
14
|
-
|
15
|
-
lines = node.split("\n")
|
16
|
-
rm_header = lines.map do |line|
|
17
|
-
if line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
|
18
|
-
line.split("\t")
|
19
|
-
end
|
20
|
-
end
|
21
|
-
rm_header.compact
|
12
|
+
def parse_modules
|
13
|
+
@data.split(">>END_MODULE\n").map do |mod|
|
14
|
+
mod.split("\n").map{|line| line.split("\t") }
|
22
15
|
end
|
23
16
|
end
|
24
17
|
|
25
|
-
|
26
|
-
|
27
|
-
|
18
|
+
#
|
19
|
+
# Basic Statistics module
|
20
|
+
#
|
28
21
|
|
29
22
|
def basic_statistics
|
30
|
-
Hash[*@
|
23
|
+
Hash[*@module_results[0].flatten]
|
31
24
|
end
|
32
25
|
|
33
|
-
def
|
34
|
-
@
|
26
|
+
def fastqc_version # software version of FastQC
|
27
|
+
@basic_statistics["##FastQC"]
|
35
28
|
end
|
36
29
|
|
37
|
-
def
|
38
|
-
@
|
30
|
+
def filename # input filename for FastQC program
|
31
|
+
@basic_statistics["Filename"]
|
39
32
|
end
|
40
33
|
|
41
|
-
def
|
42
|
-
@
|
34
|
+
def file_type # input file type
|
35
|
+
@basic_statistics["File type"]
|
43
36
|
end
|
44
37
|
|
45
|
-
def
|
46
|
-
@
|
38
|
+
def encoding # quality encoding method for input file type
|
39
|
+
@basic_statistics["Encoding"]
|
47
40
|
end
|
48
41
|
|
49
|
-
def
|
50
|
-
@
|
42
|
+
def total_sequences # total number of sequence reads
|
43
|
+
@basic_statistics["Total Sequences"].to_i
|
51
44
|
end
|
52
45
|
|
53
|
-
def
|
54
|
-
@
|
46
|
+
def sequences_flagged_as_poor_quality # number of sequence reads flagged as poor quality
|
47
|
+
@basic_statistics["Sequences flagged as poor quality"].to_i
|
55
48
|
end
|
56
49
|
|
57
|
-
def
|
58
|
-
|
59
|
-
if l =~ /\d-\d/
|
60
|
-
l.sub(/-\d+$/,"").to_i
|
61
|
-
else
|
62
|
-
l.to_i
|
63
|
-
end
|
50
|
+
def filtered_sequences # number of sequence reads filtered out
|
51
|
+
@basic_statistics["Filtered Sequences"].to_i
|
64
52
|
end
|
65
53
|
|
66
|
-
def
|
67
|
-
|
68
|
-
if l =~ /\d-\d/
|
69
|
-
l.sub(/^\d+-/,"").to_i
|
70
|
-
else
|
71
|
-
l.to_i
|
72
|
-
end
|
54
|
+
def sequence_length # store as string: can be range
|
55
|
+
@basic_statistics["Sequence length"]
|
73
56
|
end
|
74
57
|
|
75
|
-
def percent_gc
|
76
|
-
@
|
58
|
+
def percent_gc # overall percentage of GC content
|
59
|
+
@basic_statistics["%GC"].to_f
|
77
60
|
end
|
78
61
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
62
|
+
#
|
63
|
+
# Other modules
|
64
|
+
#
|
83
65
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
if per_base
|
88
|
-
v = per_base.map{|c| (10**(c[1].to_f/-10)).to_f }
|
89
|
-
-10 * Math.log10(v.reduce(:+) / v.size)
|
90
|
-
end
|
66
|
+
def get_module_matrix(module_name, num_of_header_rows)
|
67
|
+
mod = @module_results.select{|m| m[0][0] == ">>#{module_name}" }[0]
|
68
|
+
mod.drop(num_of_header_rows) if mod
|
91
69
|
end
|
92
70
|
|
93
|
-
|
94
|
-
|
95
|
-
per_base = self.per_base_sequence_quality
|
96
|
-
if per_base
|
97
|
-
v = per_base.map{|c| (10**(c[2].to_f/-10)).to_f }
|
98
|
-
-10 * Math.log10(v.reduce(:+) / v.size)
|
99
|
-
end
|
71
|
+
def per_base_sequence_quality
|
72
|
+
get_module_matrix("Per base sequence quality", 1)
|
100
73
|
end
|
101
74
|
|
102
75
|
def per_tile_sequence_quality
|
103
|
-
|
104
|
-
node.select{|n| n.first != ">>Per tile sequence quality" } if node
|
76
|
+
get_module_matrix("Per tile sequence quality", 1)
|
105
77
|
end
|
106
78
|
|
107
79
|
def per_sequence_quality_scores
|
108
|
-
|
109
|
-
node.select{|n| n.first != ">>Per sequence quality scores" } if node
|
80
|
+
get_module_matrix("Per sequence quality scores", 1)
|
110
81
|
end
|
111
82
|
|
112
83
|
def per_base_sequence_content
|
113
|
-
|
114
|
-
node.select{|n| n.first != ">>Per base sequence content" } if node
|
84
|
+
get_module_matrix("Per base sequence content", 1)
|
115
85
|
end
|
116
86
|
|
117
87
|
def per_sequence_gc_content
|
118
|
-
|
119
|
-
node.select{|n| n.first != ">>Per sequence GC content" } if node
|
88
|
+
get_module_matrix("Per sequence GC content", 1)
|
120
89
|
end
|
121
90
|
|
122
|
-
def
|
123
|
-
|
124
|
-
node.select{|n| n.first != ">>Per sequence GC content" } if node
|
91
|
+
def per_base_n_content
|
92
|
+
get_module_matrix("Per base N content", 1)
|
125
93
|
end
|
126
94
|
|
127
|
-
def
|
128
|
-
|
129
|
-
node.select{|n| n.first != ">>Per base N content" } if node
|
95
|
+
def sequence_length_distribution
|
96
|
+
get_module_matrix("Sequence Length Distribution", 1)
|
130
97
|
end
|
131
98
|
|
132
|
-
|
133
|
-
|
134
|
-
per_base = self.per_base_n_content
|
135
|
-
if per_base
|
136
|
-
v = per_base.map{|c| c[1].to_f }
|
137
|
-
v.reduce(:+) / v.size
|
138
|
-
end
|
99
|
+
def total_duplicate_percentage
|
100
|
+
get_module_matrix("Sequence Duplication Levels", 1)[0][1].to_f
|
139
101
|
end
|
140
102
|
|
141
|
-
def
|
142
|
-
|
143
|
-
node.select{|n| n.first != ">>Sequence Length Distribution" } if node
|
103
|
+
def sequence_duplication_levels
|
104
|
+
get_module_matrix("Sequence Duplication Levels", 2)
|
144
105
|
end
|
145
106
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
107
|
+
def overrepresented_sequences
|
108
|
+
get_module_matrix("Overrepresented sequences", 1)
|
109
|
+
end
|
110
|
+
|
111
|
+
def adapter_content
|
112
|
+
get_module_matrix("Adapter Content", 1)
|
113
|
+
end
|
114
|
+
|
115
|
+
def kmer_content
|
116
|
+
get_module_matrix("Kmer Content", 1)
|
117
|
+
end
|
118
|
+
|
119
|
+
#
|
120
|
+
# Custom modules
|
121
|
+
#
|
122
|
+
|
123
|
+
def min_length
|
124
|
+
sequence_length.sub(/-\d+$/,"").to_i
|
125
|
+
end
|
126
|
+
|
127
|
+
def max_length
|
128
|
+
sequence_length.sub(/^\d+-/,"").to_i
|
129
|
+
end
|
130
|
+
|
131
|
+
def per_base_quality_column(mean_or_median)
|
132
|
+
case mean_or_median
|
133
|
+
when :mean
|
134
|
+
1
|
135
|
+
when :median
|
136
|
+
2
|
163
137
|
end
|
164
138
|
end
|
165
139
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
length = length_count[0]
|
172
|
-
count = length_count[1].to_i
|
173
|
-
if length =~ /\d-\d/
|
174
|
-
f = length.sub(/-\d+$/,"").to_i
|
175
|
-
b = length.sub(/^\d+-/,"").to_i
|
176
|
-
mean = (f + b) / 2
|
177
|
-
[mean.to_f] * count
|
178
|
-
else
|
179
|
-
[length.to_f] * count
|
180
|
-
end
|
181
|
-
end
|
182
|
-
sorted = array.flatten.sort
|
183
|
-
quot = sorted.size / 2
|
184
|
-
if !sorted.size.even?
|
185
|
-
sorted[quot]
|
186
|
-
else
|
187
|
-
f = sorted[quot]
|
188
|
-
b = sorted[quot - 1]
|
189
|
-
(f + b) / 2
|
190
|
-
end
|
140
|
+
def overall_quality_score(mean_or_median)
|
141
|
+
per_base = per_base_sequence_quality.drop(1) # drop header
|
142
|
+
column = per_base_quality_column(mean_or_median)
|
143
|
+
v = per_base.map do |row|
|
144
|
+
(10**(row[column].to_f / -10)).to_f
|
191
145
|
end
|
146
|
+
-10 * Math.log10(v.reduce(:+) / v.size)
|
192
147
|
end
|
193
148
|
|
194
|
-
def
|
195
|
-
|
196
|
-
node.select{|n| n.first != ">>Sequence Duplication Levels" && n.first != "\#Total Duplicate Percentage" } if node
|
149
|
+
def overall_mean_quality_score
|
150
|
+
overall_quality_score(:mean)
|
197
151
|
end
|
198
152
|
|
199
|
-
def
|
200
|
-
|
201
|
-
node.select{|n| n.first == "\#Total Duplicate Percentage" }.flatten[1].to_f if node
|
153
|
+
def overall_median_quality_score
|
154
|
+
overall_quality_score(:median)
|
202
155
|
end
|
203
156
|
|
204
|
-
def
|
205
|
-
|
206
|
-
|
157
|
+
def overall_n_content
|
158
|
+
per_base = per_base_n_content
|
159
|
+
v = per_base.map{|c| c[1].to_f }
|
160
|
+
v.reduce(:+) / v.size
|
207
161
|
end
|
208
162
|
|
209
|
-
def
|
210
|
-
|
211
|
-
|
163
|
+
def mean_sequence_length
|
164
|
+
dist = sequence_length_distribution.drop(1) # drop column header
|
165
|
+
if dist.size == 1
|
166
|
+
dist[0][0].to_f
|
167
|
+
else
|
168
|
+
sum = dist.map do |length_count|
|
169
|
+
l = length_count[0]
|
170
|
+
c = length_count[1].to_f
|
171
|
+
((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2) * c
|
172
|
+
end
|
173
|
+
sum.reduce(:+) / dist.map{|l_c| l_c[1].to_f }.reduce(:+)
|
174
|
+
end
|
212
175
|
end
|
213
176
|
|
214
|
-
def
|
215
|
-
|
216
|
-
|
177
|
+
def median_sequence_length
|
178
|
+
dist = sequence_length_distribution.drop(1) # drop column header
|
179
|
+
if dist.size == 1
|
180
|
+
dist[0][0].to_f
|
181
|
+
else
|
182
|
+
k = dist.map{|l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
|
183
|
+
median = 0
|
184
|
+
dist.each do |l_c|
|
185
|
+
c = l_c[1].to_f # count of reads in this length range
|
186
|
+
if k > c
|
187
|
+
k -= c
|
188
|
+
else
|
189
|
+
l = l_c[0]
|
190
|
+
median = ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2)
|
191
|
+
break
|
192
|
+
end
|
193
|
+
end
|
194
|
+
median
|
195
|
+
end
|
217
196
|
end
|
218
197
|
|
219
198
|
def summary
|
199
|
+
parse
|
200
|
+
end
|
201
|
+
|
202
|
+
def parse
|
220
203
|
{
|
221
|
-
fastqc_version:
|
222
|
-
filename:
|
223
|
-
file_type:
|
224
|
-
encoding:
|
225
|
-
total_sequences:
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
204
|
+
fastqc_version: fastqc_version,
|
205
|
+
filename: filename,
|
206
|
+
file_type: file_type,
|
207
|
+
encoding: encoding,
|
208
|
+
total_sequences: total_sequences,
|
209
|
+
sequences_flagged_as_poor_quality: sequences_flagged_as_poor_quality,
|
210
|
+
filtered_sequences: filtered_sequences,
|
211
|
+
sequence_length: sequence_length,
|
212
|
+
percent_gc: percent_gc,
|
213
|
+
per_base_sequence_quality: per_base_sequence_quality,
|
214
|
+
per_tile_sequence_quality: per_tile_sequence_quality,
|
215
|
+
per_sequence_quality_scores: per_sequence_quality_scores,
|
216
|
+
per_base_sequence_content: per_base_sequence_content,
|
217
|
+
per_sequence_gc_content: per_sequence_gc_content,
|
218
|
+
per_base_n_content: per_base_n_content,
|
219
|
+
sequence_length_distribution: sequence_length_distribution,
|
220
|
+
total_duplicate_percentage: total_duplicate_percentage,
|
221
|
+
sequence_duplication_levels: sequence_duplication_levels,
|
222
|
+
overrepresented_sequences: overrepresented_sequences,
|
223
|
+
adapter_content: adapter_content,
|
224
|
+
kmer_content: kmer_content,
|
225
|
+
min_length: min_length,
|
226
|
+
max_length: max_length,
|
227
|
+
overall_mean_quality_score: overall_mean_quality_score,
|
228
|
+
overall_median_quality_score: overall_median_quality_score,
|
229
|
+
overall_n_content: overall_n_content,
|
230
|
+
mean_sequence_length: mean_sequence_length,
|
231
|
+
median_sequence_length: median_sequence_length,
|
248
232
|
}
|
249
233
|
end
|
250
234
|
end
|
data/lib/bio/fastqc/semantics.rb
CHANGED
@@ -6,9 +6,13 @@ require 'rdf/turtle'
|
|
6
6
|
module Bio
|
7
7
|
module FastQC
|
8
8
|
class Semantics
|
9
|
-
def initialize(
|
9
|
+
def initialize(fastqc_object, id: nil)
|
10
10
|
@id = id
|
11
|
-
@
|
11
|
+
@fastqc_object = fastqc_object
|
12
|
+
end
|
13
|
+
|
14
|
+
def rdf_version
|
15
|
+
"0.1.0"
|
12
16
|
end
|
13
17
|
|
14
18
|
def turtle
|
@@ -23,6 +27,9 @@ module Bio
|
|
23
27
|
{
|
24
28
|
"uo" => "http://purl.obolibrary.org/obo/",
|
25
29
|
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
30
|
+
"dcterms" => "http://purl.org/dc/terms/",
|
31
|
+
"pav" => "http://purl.org/pav/",
|
32
|
+
"foaf" => "http://xmlns.com/foaf/0.1/",
|
26
33
|
}
|
27
34
|
end
|
28
35
|
|
@@ -32,18 +39,30 @@ module Bio
|
|
32
39
|
object
|
33
40
|
end
|
34
41
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
42
|
+
def uri_base
|
43
|
+
"http://purl.jp/bio/01/quanto"
|
44
|
+
end
|
45
|
+
|
46
|
+
def identifier_literal
|
47
|
+
@id ? @id : "QNT" + @fastqc_object[:filename].split(".")[0]
|
48
|
+
end
|
49
|
+
|
50
|
+
def identifier_uri
|
51
|
+
uri_base + "/resource/" + identifier_literal
|
41
52
|
end
|
42
53
|
|
43
54
|
def object_core
|
44
55
|
{
|
45
56
|
"@context" => jsonld_context,
|
46
|
-
"@id" =>
|
57
|
+
"@id" => identifier_uri,
|
58
|
+
"@type" => "SequenceStatisticsReport",
|
59
|
+
"dcterms:identifier" => identifier_literal,
|
60
|
+
"dcterms:contributor" => ["Tazro Ohta", "Shuichi Kawashima"],
|
61
|
+
"dcterms:created" => Time.now.strftime("%Y-%m-%d"),
|
62
|
+
"dcterms:license" => "http://creativecommons.org/licenses/by-sa/2.1/jp/deed.en",
|
63
|
+
"dcterms:publisher" => "http://dbcls.rois.ac.jp/",
|
64
|
+
"pav:version" => rdf_version,
|
65
|
+
"foaf:page" => "http://quanto.dbcls.jp",
|
47
66
|
}
|
48
67
|
end
|
49
68
|
|
@@ -94,24 +113,26 @@ module Bio
|
|
94
113
|
end
|
95
114
|
|
96
115
|
def fastqc_version
|
97
|
-
{
|
116
|
+
{
|
117
|
+
"fastqcVersion" => @fastqc_object[:fastqc_version],
|
118
|
+
}
|
98
119
|
end
|
99
120
|
|
100
121
|
def filename
|
101
122
|
{
|
102
|
-
"filename" => @
|
123
|
+
"filename" => @fastqc_object[:filename],
|
103
124
|
}
|
104
125
|
end
|
105
126
|
|
106
127
|
def file_type
|
107
128
|
{
|
108
|
-
"fileType" => @
|
129
|
+
"fileType" => @fastqc_object[:file_type],
|
109
130
|
}
|
110
131
|
end
|
111
132
|
|
112
133
|
def encoding
|
113
134
|
{
|
114
|
-
"encoding" => @
|
135
|
+
"encoding" => @fastqc_object[:encoding],
|
115
136
|
}
|
116
137
|
end
|
117
138
|
|
@@ -120,7 +141,7 @@ module Bio
|
|
120
141
|
"totalSequences" => {
|
121
142
|
"@type" => "SequenceReadContent",
|
122
143
|
"hasUnit" => "uo:CountUnit",
|
123
|
-
"rdf:value" => @
|
144
|
+
"rdf:value" => @fastqc_object[:total_sequences],
|
124
145
|
}
|
125
146
|
}
|
126
147
|
end
|
@@ -130,7 +151,7 @@ module Bio
|
|
130
151
|
"filteredSequences" => {
|
131
152
|
"@type" => "SequenceReadContent",
|
132
153
|
"hasUnit" => "uo:CountUnit",
|
133
|
-
"rdf:value" => @
|
154
|
+
"rdf:value" => @fastqc_object[:filtered_sequences],
|
134
155
|
}
|
135
156
|
}
|
136
157
|
end
|
@@ -140,7 +161,7 @@ module Bio
|
|
140
161
|
"sequenceLength" => {
|
141
162
|
"@type" => "SequenceReadLength",
|
142
163
|
"hasUnit" => "uo:CountUnit",
|
143
|
-
"rdf:value" => @
|
164
|
+
"rdf:value" => @fastqc_object[:sequence_length],
|
144
165
|
}
|
145
166
|
}
|
146
167
|
end
|
@@ -150,7 +171,7 @@ module Bio
|
|
150
171
|
"percentGC" => {
|
151
172
|
"@type" => "NucleotideBaseContent",
|
152
173
|
"hasUnit" => "uo:CountUnit",
|
153
|
-
"rdf:value" => @
|
174
|
+
"rdf:value" => @fastqc_object[:percent_gc],
|
154
175
|
}
|
155
176
|
}
|
156
177
|
end
|
@@ -158,7 +179,7 @@ module Bio
|
|
158
179
|
def per_base_sequence_quality
|
159
180
|
{
|
160
181
|
"@type" => "PerBaseSequenceQuality",
|
161
|
-
"hasRow" => per_base_sequence_quality_rows(@
|
182
|
+
"hasRow" => per_base_sequence_quality_rows(@fastqc_object[:per_base_sequence_quality]),
|
162
183
|
}
|
163
184
|
end
|
164
185
|
|
@@ -220,7 +241,7 @@ module Bio
|
|
220
241
|
def per_sequence_quality_scores
|
221
242
|
{
|
222
243
|
"@type" => "PerSequnceQualityScores",
|
223
|
-
"hasRow" => per_sequence_quality_scores_rows(@
|
244
|
+
"hasRow" => per_sequence_quality_scores_rows(@fastqc_object[:per_sequence_quality_scores]),
|
224
245
|
}
|
225
246
|
end
|
226
247
|
|
@@ -248,7 +269,7 @@ module Bio
|
|
248
269
|
def per_base_sequence_content
|
249
270
|
{
|
250
271
|
"@type" => "PerBaseSequenceContent",
|
251
|
-
"hasRow" => per_base_sequence_content_rows(@
|
272
|
+
"hasRow" => per_base_sequence_content_rows(@fastqc_object[:per_base_sequence_content]),
|
252
273
|
}
|
253
274
|
end
|
254
275
|
|
@@ -293,7 +314,7 @@ module Bio
|
|
293
314
|
def per_sequence_gc_content
|
294
315
|
{
|
295
316
|
"@type" => "PerSequenceGCContent",
|
296
|
-
"hasRow" => per_sequence_gc_content_rows(@
|
317
|
+
"hasRow" => per_sequence_gc_content_rows(@fastqc_object[:per_sequence_gc_content]),
|
297
318
|
}
|
298
319
|
end
|
299
320
|
|
@@ -321,7 +342,7 @@ module Bio
|
|
321
342
|
def per_base_n_content
|
322
343
|
{
|
323
344
|
"@type" => "PerBaseNContent",
|
324
|
-
"hasRow" => per_base_n_content_rows(@
|
345
|
+
"hasRow" => per_base_n_content_rows(@fastqc_object[:per_base_n_content]),
|
325
346
|
}
|
326
347
|
end
|
327
348
|
|
@@ -348,7 +369,7 @@ module Bio
|
|
348
369
|
def sequence_length_distribution
|
349
370
|
{
|
350
371
|
"@type" => "SequenceLengthDistribution",
|
351
|
-
"hasRow" => sequence_length_distribution_rows(@
|
372
|
+
"hasRow" => sequence_length_distribution_rows(@fastqc_object[:sequence_length_distribution]),
|
352
373
|
}
|
353
374
|
end
|
354
375
|
|
@@ -381,7 +402,7 @@ module Bio
|
|
381
402
|
def sequence_duplication_levels
|
382
403
|
{
|
383
404
|
"@type" => "SequenceDuplicationLevels",
|
384
|
-
"hasRow" => sequence_duplication_levels_rows(@
|
405
|
+
"hasRow" => sequence_duplication_levels_rows(@fastqc_object[:sequence_duplication_levels]),
|
385
406
|
}
|
386
407
|
end
|
387
408
|
|
@@ -410,7 +431,7 @@ module Bio
|
|
410
431
|
def overrepresented_sequences
|
411
432
|
{
|
412
433
|
"@type" => "OverrepresentedSequences",
|
413
|
-
"hasRow" => overrepresented_sequences_rows(@
|
434
|
+
"hasRow" => overrepresented_sequences_rows(@fastqc_object[:overrepresented_sequences]),
|
414
435
|
}
|
415
436
|
end
|
416
437
|
|
@@ -446,7 +467,7 @@ module Bio
|
|
446
467
|
def kmer_content
|
447
468
|
{
|
448
469
|
"@type" => "KmerContent",
|
449
|
-
"hasRow" => kmer_content_rows(@
|
470
|
+
"hasRow" => kmer_content_rows(@fastqc_object[:kmer_content]),
|
450
471
|
}
|
451
472
|
end
|
452
473
|
|
@@ -486,7 +507,7 @@ module Bio
|
|
486
507
|
"minSequenceLength" => {
|
487
508
|
"@type" => "SequenceReadLength",
|
488
509
|
"hasUnit" => "uo:CountUnit",
|
489
|
-
"rdf:value" => @
|
510
|
+
"rdf:value" => @fastqc_object[:min_length],
|
490
511
|
}
|
491
512
|
}
|
492
513
|
end
|
@@ -496,7 +517,7 @@ module Bio
|
|
496
517
|
"maxSequenceLength" => {
|
497
518
|
"@type" => "SequenceReadLength",
|
498
519
|
"hasUnit" => "uo:CountUnit",
|
499
|
-
"rdf:value" => @
|
520
|
+
"rdf:value" => @fastqc_object[:max_length],
|
500
521
|
}
|
501
522
|
}
|
502
523
|
end
|
@@ -506,7 +527,7 @@ module Bio
|
|
506
527
|
"meanSequenceLength" => {
|
507
528
|
"@type" => "SequenceReadLength",
|
508
529
|
"hasUnit" => "uo:CountUnit",
|
509
|
-
"rdf:value" => @
|
530
|
+
"rdf:value" => @fastqc_object[:mean_sequence_length],
|
510
531
|
}
|
511
532
|
}
|
512
533
|
end
|
@@ -516,7 +537,7 @@ module Bio
|
|
516
537
|
"medianSequenceLength" => {
|
517
538
|
"@type" => "SequenceReadLength",
|
518
539
|
"hasUnit" => "uo:CountUnit",
|
519
|
-
"rdf:value" => @
|
540
|
+
"rdf:value" => @fastqc_object[:median_sequence_length],
|
520
541
|
}
|
521
542
|
}
|
522
543
|
end
|
@@ -526,7 +547,7 @@ module Bio
|
|
526
547
|
"overallMeanBaseCallQuality" => {
|
527
548
|
"@type" => "PhredQualityScore",
|
528
549
|
"hasUnit" => "uo:CountUnit",
|
529
|
-
"rdf:value" => @
|
550
|
+
"rdf:value" => @fastqc_object[:overall_mean_quality_score],
|
530
551
|
}
|
531
552
|
}
|
532
553
|
end
|
@@ -536,7 +557,7 @@ module Bio
|
|
536
557
|
"overallMedianBaseCallQuality" => {
|
537
558
|
"@type" => "PhredQualityScore",
|
538
559
|
"hasUnit" => "uo:CountUnit",
|
539
|
-
"rdf:value" => @
|
560
|
+
"rdf:value" => @fastqc_object[:overall_median_quality_score],
|
540
561
|
}
|
541
562
|
}
|
542
563
|
end
|
@@ -546,7 +567,7 @@ module Bio
|
|
546
567
|
"overallNContent" => {
|
547
568
|
"@type" => "NContent",
|
548
569
|
"hasUnit" => "uo:Percentage",
|
549
|
-
"rdf:value" => @
|
570
|
+
"rdf:value" => @fastqc_object[:overall_n_content],
|
550
571
|
}
|
551
572
|
}
|
552
573
|
end
|
@@ -557,10 +578,10 @@ module Bio
|
|
557
578
|
|
558
579
|
def jsonld_context
|
559
580
|
# definition of imported terms in @context
|
560
|
-
object =
|
581
|
+
object = turtle_prefixes
|
561
582
|
|
562
583
|
# definition of local ontology terms
|
563
|
-
domain = "
|
584
|
+
domain = uri_base + "/ontology/sos#"
|
564
585
|
|
565
586
|
# definition of class in @context
|
566
587
|
sos_class.each do |term|
|
@@ -597,13 +618,6 @@ module Bio
|
|
597
618
|
object
|
598
619
|
end
|
599
620
|
|
600
|
-
def imported_keywords
|
601
|
-
{
|
602
|
-
"uo" => "http://purl.obolibrary.org/obo/",
|
603
|
-
"rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
604
|
-
}
|
605
|
-
end
|
606
|
-
|
607
621
|
#
|
608
622
|
# definition of classes
|
609
623
|
#
|
@@ -690,6 +704,7 @@ module Bio
|
|
690
704
|
|
691
705
|
def sos_data_properties_string
|
692
706
|
[
|
707
|
+
"fastqcVersion",
|
693
708
|
"filename",
|
694
709
|
"fileType",
|
695
710
|
"encoding",
|
data/spec/bio-fastqc_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe Bio::FastQC do
|
|
14
14
|
describe '#read' do
|
15
15
|
it 'returns parsed data from zipfile' do
|
16
16
|
expect(@data).not_to be_empty
|
17
|
+
expect(@data).not_to be_nil
|
17
18
|
end
|
18
19
|
end
|
19
20
|
end
|
@@ -25,55 +26,112 @@ describe Bio::FastQC do
|
|
25
26
|
end
|
26
27
|
|
27
28
|
describe '#fastqc_version' do
|
28
|
-
it 'returns fastqc version as String
|
29
|
+
it 'returns fastqc version as String' do
|
29
30
|
expect(@parser.fastqc_version).to be_instance_of(String)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'does not return empty string' do
|
30
34
|
expect(@parser.fastqc_version).not_to be_empty
|
31
35
|
end
|
36
|
+
|
37
|
+
it 'does not return nil' do
|
38
|
+
expect(@parser.fastqc_version).not_to be_nil
|
39
|
+
end
|
32
40
|
end
|
33
41
|
|
34
42
|
describe '#filename' do
|
35
|
-
it 'returns filename as String
|
43
|
+
it 'returns filename as String' do
|
36
44
|
expect(@parser.filename).to be_instance_of(String)
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'does not return empty string' do
|
37
48
|
expect(@parser.filename).not_to be_empty
|
38
49
|
end
|
50
|
+
|
51
|
+
it 'does not return nil' do
|
52
|
+
expect(@parser.filename).not_to be_nil
|
53
|
+
end
|
39
54
|
end
|
40
55
|
|
41
56
|
describe '#file_type' do
|
42
|
-
it 'returns file type as String
|
57
|
+
it 'returns file type as String' do
|
43
58
|
expect(@parser.file_type).to be_instance_of(String)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'does not return empty string' do
|
44
62
|
expect(@parser.file_type).not_to be_empty
|
45
63
|
end
|
64
|
+
|
65
|
+
it 'does not return nil' do
|
66
|
+
expect(@parser.file_type).not_to be_nil
|
67
|
+
end
|
46
68
|
end
|
47
69
|
|
48
70
|
describe '#encoding' do
|
49
|
-
it 'returns encoding type as String
|
71
|
+
it 'returns encoding type as String' do
|
50
72
|
expect(@parser.encoding).to be_instance_of(String)
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'does not return empty string' do
|
51
76
|
expect(@parser.encoding).not_to be_empty
|
52
77
|
end
|
78
|
+
|
79
|
+
it 'does not return nil' do
|
80
|
+
expect(@parser.encoding).not_to be_nil
|
81
|
+
end
|
53
82
|
end
|
54
83
|
|
55
84
|
describe '#total_sequences' do
|
56
85
|
it 'returns total number of sequences as Fixnum' do
|
57
86
|
expect(@parser.total_sequences).to be_instance_of(Fixnum)
|
58
87
|
end
|
88
|
+
|
89
|
+
it 'returns integer larger than zero' do
|
90
|
+
expect(@parser.total_sequences).to be > 0
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'does not return nil' do
|
94
|
+
expect(@parser.total_sequences).not_to be_nil
|
95
|
+
end
|
59
96
|
end
|
60
97
|
|
61
98
|
describe '#filtered_sequences' do
|
62
|
-
it 'returns number of filtered sequence as Fixnum
|
63
|
-
|
99
|
+
it 'returns number of filtered sequence as Fixnum, can be nil' do
|
100
|
+
if @parser.filtered_sequences
|
101
|
+
expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe '#sequences_flagged_as_poor_quality' do
|
107
|
+
it 'returns number of sequences flagged as poor quality as Fixnum, can be nil' do
|
108
|
+
if @parser.sequences_flagged_as_poor_quality
|
109
|
+
expect(@parser.sequences_flagged_as_poor_quality).to be_instance_of(Fixnum)
|
110
|
+
end
|
64
111
|
end
|
65
112
|
end
|
66
113
|
|
67
114
|
describe '#sequence_length' do
|
68
|
-
it 'returns length of sequence as String
|
115
|
+
it 'returns length of sequence as String' do
|
69
116
|
expect(@parser.sequence_length).to be_instance_of(String)
|
117
|
+
end
|
118
|
+
|
119
|
+
it 'does not return empty string' do
|
70
120
|
expect(@parser.sequence_length).not_to be_empty
|
71
121
|
end
|
122
|
+
|
123
|
+
it 'does not return nil' do
|
124
|
+
expect(@parser.sequence_length).not_to be_nil
|
125
|
+
end
|
72
126
|
end
|
73
127
|
|
74
128
|
describe '#percent_gc' do
|
75
|
-
it 'returns percentage of GC content as
|
76
|
-
expect(@parser.percent_gc).to be_instance_of(
|
129
|
+
it 'returns percentage of GC content as Float' do
|
130
|
+
expect(@parser.percent_gc).to be_instance_of(Float)
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'does not return nil' do
|
134
|
+
expect(@parser.percent_gc).not_to be_nil
|
77
135
|
end
|
78
136
|
end
|
79
137
|
|
@@ -190,6 +248,10 @@ describe Bio::FastQC do
|
|
190
248
|
it 'returns duplicate percentage as Float and not empty' do
|
191
249
|
expect(@parser.total_duplicate_percentage).to be_instance_of(Float)
|
192
250
|
end
|
251
|
+
|
252
|
+
it 'does not returns nil' do
|
253
|
+
expect(@parser.total_duplicate_percentage).not_to be_nil
|
254
|
+
end
|
193
255
|
end
|
194
256
|
|
195
257
|
describe '#sequence_duplication_levels' do
|
@@ -256,42 +318,88 @@ describe Bio::FastQC do
|
|
256
318
|
it 'returns minimum read length as Fixnum and not empty' do
|
257
319
|
expect(@parser.min_length).to be_instance_of(Fixnum)
|
258
320
|
end
|
321
|
+
|
322
|
+
it 'returns integer larger than zero' do
|
323
|
+
expect(@parser.min_length).to be > 0
|
324
|
+
end
|
325
|
+
|
326
|
+
it 'does not return nil' do
|
327
|
+
expect(@parser.min_length).not_to be_nil
|
328
|
+
end
|
259
329
|
end
|
260
330
|
|
261
331
|
describe '#max_length' do
|
262
332
|
it 'returns maximum read length as Fixnum and not empty' do
|
263
333
|
expect(@parser.max_length).to be_instance_of(Fixnum)
|
264
334
|
end
|
335
|
+
|
336
|
+
it 'returns integer larger than zero' do
|
337
|
+
expect(@parser.max_length).to be > 0
|
338
|
+
end
|
339
|
+
|
340
|
+
it 'does not return nil' do
|
341
|
+
expect(@parser.max_length).not_to be_nil
|
342
|
+
end
|
265
343
|
end
|
266
344
|
|
267
345
|
describe '#overall_mean_quality_score' do
|
268
346
|
it 'returns overall mean quality score as Float and not empty' do
|
269
347
|
expect(@parser.overall_mean_quality_score).to be_instance_of(Float)
|
270
348
|
end
|
349
|
+
|
350
|
+
it 'does not return nil' do
|
351
|
+
expect(@parser.overall_mean_quality_score).not_to be_nil
|
352
|
+
end
|
271
353
|
end
|
272
354
|
|
273
355
|
describe '#overall_median_quality_score' do
|
274
356
|
it 'returns overall median quality score as Float and not empty' do
|
275
357
|
expect(@parser.overall_median_quality_score).to be_instance_of(Float)
|
276
358
|
end
|
359
|
+
|
360
|
+
it 'does not return nil' do
|
361
|
+
expect(@parser.overall_median_quality_score).not_to be_nil
|
362
|
+
end
|
277
363
|
end
|
278
364
|
|
279
365
|
describe '#overall_n_content' do
|
280
366
|
it 'returns overall N content as Float and not empty' do
|
281
367
|
expect(@parser.overall_n_content).to be_instance_of(Float)
|
282
368
|
end
|
369
|
+
|
370
|
+
it 'does not return nil' do
|
371
|
+
expect(@parser.overall_n_content).not_to be_nil
|
372
|
+
end
|
283
373
|
end
|
284
374
|
|
285
375
|
describe '#mean_sequence_length' do
|
286
376
|
it 'returns mean sequence length from read length distribution as Float and not empty' do
|
287
377
|
expect(@parser.mean_sequence_length).to be_instance_of(Float)
|
288
378
|
end
|
379
|
+
|
380
|
+
it 'does not return nil' do
|
381
|
+
expect(@parser.mean_sequence_length).not_to be_nil
|
382
|
+
end
|
289
383
|
end
|
290
384
|
|
291
385
|
describe '#median_sequence_length' do
|
292
386
|
it 'returns median sequence length from read length distribution as Float and not empty' do
|
293
387
|
expect(@parser.median_sequence_length).to be_instance_of(Float)
|
294
388
|
end
|
389
|
+
|
390
|
+
it 'does not return nil' do
|
391
|
+
expect(@parser.median_sequence_length).not_to be_nil
|
392
|
+
end
|
393
|
+
end
|
394
|
+
|
395
|
+
describe '#parse' do
|
396
|
+
it 'does not return nil' do
|
397
|
+
expect(@parser.parse).not_to be_nil
|
398
|
+
end
|
399
|
+
|
400
|
+
it 'returns hash' do
|
401
|
+
expect(@parser.parse).to be_instance_of(Hash)
|
402
|
+
end
|
295
403
|
end
|
296
404
|
end
|
297
405
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-fastqc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tazro Inutano Ohta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -229,6 +229,7 @@ files:
|
|
229
229
|
- lib/bio/fastqc/semantics.rb
|
230
230
|
- spec/bio-fastqc_spec.rb
|
231
231
|
- spec/example_fastqc.zip
|
232
|
+
- spec/example_fastqc_454.zip
|
232
233
|
- spec/spec_helper.rb
|
233
234
|
homepage: http://github.com/inutano/bioruby-fastqc
|
234
235
|
licenses:
|