bio-fastqc 0.2.1 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -1
- data/VERSION +1 -1
- data/lib/bio-fastqc.rb +2 -0
- data/lib/bio/fastqc/cli.rb +8 -7
- data/lib/bio/fastqc/converter.rb +35 -0
- data/lib/bio/fastqc/parser.rb +3 -2
- data/lib/bio/fastqc/semantics.rb +715 -0
- metadata +36 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e21d6a0e0f6ec91c058b8a574e79e6fa9dad431
|
4
|
+
data.tar.gz: 93ed09413b3fdb49564c43e68a49bed3d1937d88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48097e944bd5c7e76a804f7d0623f869dddb17fceb4da878878489f2ad745dab9b0c925cadccce12bee332ff3b3c8db5dea1ce22085acf1be914bddec2f04998
|
7
|
+
data.tar.gz: fc4ac83bb59b47e5bf554884efbdc7e7f179fece8bd94fa70529fe7858727cdf6826ec87ac6081c68d91c3c82f0c908ab46a8798f9b180d9974839aa02daa77c
|
data/Gemfile
CHANGED
@@ -5,6 +5,8 @@ source "http://rubygems.org"
|
|
5
5
|
|
6
6
|
gem 'rubyzip', '~> 1.1', '>= 1.1.0'
|
7
7
|
gem 'thor', "~> 0.19.1"
|
8
|
+
gem 'json-ld', '~> 1.99'
|
9
|
+
gem 'rdf-turtle', '~> 1.99'
|
8
10
|
|
9
11
|
# Add dependencies to develop your gem here.
|
10
12
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -19,5 +21,5 @@ group :development do
|
|
19
21
|
gem 'simplecov', '~> 0.10'
|
20
22
|
|
21
23
|
gem 'pry', '~> 0.10'
|
22
|
-
gem 'bio-fastqc'
|
24
|
+
gem 'bio-fastqc', '0.3.0'
|
23
25
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.1
|
data/lib/bio-fastqc.rb
CHANGED
data/lib/bio/fastqc/cli.rb
CHANGED
@@ -6,13 +6,14 @@ require 'json'
|
|
6
6
|
module Bio
|
7
7
|
module FastQC
|
8
8
|
class CLI < Thor
|
9
|
-
desc "parse [filename]
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
9
|
+
desc "parse [--format format] [filename]", "parse fastqc data in fastqc directory or zipfile, output in json, json-ld, or turtle (RDF/Turtle) format."
# The accepted values mirror the branches of Converter#convert_to; listing
# them as :enum makes Thor reject anything else instead of printing nothing.
option :format, :default => "json", :enum => %w[json json-ld turtle], :desc => "output format: json, json-ld, or turtle"
# Reads FastQC data from +file+, builds its summary with Parser, and prints
# the summary converted to the format selected by --format.
def parse(file)
  data = Data.read(file)
  summary = Parser.new(data).summary
  puts Converter.new(summary).convert_to(options[:format])
  # TODO: error handling for unreadable input is still disabled:
  # rescue
  #   puts "Wrong input file type: specify fastqc result data, directory or zipfile"
end
|
17
18
|
end
|
18
19
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Bio
  module FastQC
    # Serializes a FastQC summary object into one of the supported text formats.
    class Converter
      # Format names understood by #convert_to.
      SUPPORTED_FORMATS = %w[json json-ld turtle].freeze

      # @param summary_json [Hash] the summary object to serialize
      def initialize(summary_json)
        @summary_json = summary_json
      end

      # Renders the summary in the requested format.
      #
      # @param format [String, Symbol] one of "json", "json-ld" or "turtle"
      # @return [String] the serialized summary
      # @raise [ArgumentError] when +format+ is not a supported format name
      def convert_to(format)
        case format.to_s
        when "json"
          to_json
        when "json-ld"
          to_jsonld
        when "turtle"
          to_turtle
        else
          # Previously an unknown format fell through and returned nil, so the
          # caller printed an empty line with no hint about what went wrong.
          raise ArgumentError,
                "unsupported format: #{format.inspect} (expected one of: #{SUPPORTED_FORMATS.join(', ')})"
        end
      end

      # @return [String] the summary dumped as plain JSON
      def to_json
        JSON.dump(@summary_json)
      end

      # @return [String] the JSON-LD object built by Semantics, dumped as JSON
      def to_jsonld
        json_ld_object = Semantics.new(@summary_json).json_ld_object
        JSON.dump(json_ld_object)
      end

      # @return [String] the Turtle serialization produced by Semantics#turtle
      def to_turtle
        Semantics.new(@summary_json).turtle
      end
    end
  end
end
|
data/lib/bio/fastqc/parser.rb
CHANGED
@@ -220,12 +220,13 @@ module Bio
|
|
220
220
|
percent_gc: self.percent_gc,
|
221
221
|
per_base_sequence_quality: self.per_base_sequence_quality,
|
222
222
|
per_tile_sequence_quality: self.per_tile_sequence_quality,
|
223
|
-
|
223
|
+
per_sequence_quality_scores: self.per_sequence_quality_scores,
|
224
224
|
per_base_sequence_content: self.per_base_sequence_content,
|
225
225
|
per_sequence_gc_content: self.per_sequence_gc_content,
|
226
226
|
per_base_n_content: self.per_base_n_content,
|
227
227
|
sequence_length_distribution: self.sequence_length_distribution,
|
228
|
-
total_duplicate_percentage: self.total_duplicate_percentage,
|
228
|
+
total_duplicate_percentage: self.total_duplicate_percentage,
|
229
|
+
sequence_duplication_levels: self.sequence_duplication_levels,
|
229
230
|
overrepresented_sequences: self.overrepresented_sequences,
|
230
231
|
adapter_content: self.adapter_content,
|
231
232
|
kmer_content: self.kmer_content,
|
@@ -0,0 +1,715 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'json/ld'
|
4
|
+
require 'rdf/turtle'
|
5
|
+
|
6
|
+
module Bio
  module FastQC
    # Builds a JSON-LD object from a FastQC summary hash and serializes that
    # object to RDF/Turtle.
    #
    # The summary is read through symbol keys (:filename, :total_sequences,
    # :per_base_sequence_quality, ...). Scalar statistics become properties of
    # the top-level node; tabular modules become entries of "hasMatrix", each
    # holding its rows under "hasRow".
    class Semantics
      # Namespace under which the local ontology terms are declared.
      SOS_DOMAIN = "http://me.com/sos#".freeze
      # XML Schema datatype namespace used for literal coercion in @context.
      XSD = "http://www.w3.org/2001/XMLSchema#".freeze

      COUNT_UNIT = "uo:CountUnit".freeze
      PERCENTAGE = "uo:Percentage".freeze
      RATIO = "uo:Ratio".freeze

      # @param summary_json [Hash] summary object, accessed with symbol keys
      def initialize(summary_json)
        @summary = summary_json
      end

      # @return [String] the JSON-LD object converted to RDF and dumped as Turtle
      def turtle
        graph = RDF::Graph.new << JSON::LD::API.toRdf(json_ld_object)
        graph.dump(:ttl, prefixes: turtle_prefixes)
      end

      # @return [Hash] prefix => namespace map handed to the Turtle writer;
      #   identical to the prefixes imported into the JSON-LD context
      def turtle_prefixes
        imported_keywords
      end

      # @return [Hash] the complete JSON-LD object: @context, @id, every scalar
      #   statistic, and the list of matrices under "hasMatrix"
      def json_ld_object
        object = static_value_modules.inject(object_core) { |acc, mod| acc.merge(mod) }
        object["hasMatrix"] = matrix_modules
        object
      end

      # @return [String] node identifier derived from the part of the filename
      #   before its first "."
      def identifier
        "http://me.com/data/QNT#{@summary[:filename].split('.').first}"
      end

      # @return [Hash] the "@context" and "@id" entries of the top-level node
      def object_core
        {
          "@context" => jsonld_context,
          "@id" => identifier,
        }
      end

      # @return [Array<Hash>] one single-entry (or empty) hash per scalar statistic
      def static_value_modules
        [
          fastqc_version,
          filename,
          file_type,
          encoding,
          total_sequences,
          filtered_sequences,
          sequence_length,
          percent_gc,
          total_duplicate_percentage,
          min_length,
          max_length,
          overall_mean_quality_score,
          overall_median_quality_score,
          overall_n_content,
          mean_sequence_length,
          median_sequence_length,
        ]
      end

      # @return [Array<Hash>] one node per tabular module, in a fixed order;
      #   modules that are not exported yet contribute an empty hash
      def matrix_modules
        [
          per_base_sequence_quality,
          per_tile_sequence_quality,
          per_sequence_quality_scores,
          per_base_sequence_content,
          per_sequence_gc_content,
          per_base_n_content,
          sequence_length_distribution,
          sequence_duplication_levels,
          overrepresented_sequences,
          adapter_content,
          kmer_content,
        ]
      end

      # Chooses the row class for a base position.
      #
      # @param base [Object] base position label
      # @return [String] "BaseRangeStatistics" when the label contains "-"
      #   (a range such as "50-100"), otherwise "ExactBaseStatistics"
      def base_stat_class(base)
        case base
        when /-/
          "BaseRangeStatistics"
        else
          "ExactBaseStatistics"
        end
      end

      # Not exported yet; contributes nothing to the object.
      def fastqc_version
        {}
      end

      def filename
        { "filename" => @summary[:filename] }
      end

      def file_type
        { "fileType" => @summary[:file_type] }
      end

      def encoding
        { "encoding" => @summary[:encoding] }
      end

      def total_sequences
        { "totalSequences" => read_count(@summary[:total_sequences]) }
      end

      def filtered_sequences
        { "filteredSequences" => read_count(@summary[:filtered_sequences]) }
      end

      def sequence_length
        { "sequenceLength" => read_length(@summary[:sequence_length]) }
      end

      # The unit is a percentage, matching the per-row "percentGC" values
      # (this node was previously tagged with the count unit).
      def percent_gc
        { "percentGC" => base_percentage(@summary[:percent_gc]) }
      end

      def per_base_sequence_quality
        matrix_node("PerBaseSequenceQuality",
                    per_base_sequence_quality_rows(@summary[:per_base_sequence_quality]))
      end

      # @param matrix [Array<Array>, nil] rows of
      #   [base, mean, median, lower quartile, upper quartile, 10th pct, 90th pct]
      # @return [Array<Hash>] one row node per input row (empty when matrix is nil)
      def per_base_sequence_quality_rows(matrix)
        each_row(matrix) do |row, i|
          base, mean, median, lower_quartile, upper_quartile, tenth, ninetieth = row
          base_row(i, base,
                   "meanBaseCallQuality" => phred(mean),
                   "medianBaseCallQuality" => phred(median),
                   "baseCallQualityLowerQuartile" => phred(lower_quartile),
                   "baseCallQualityUpperQuartile" => phred(upper_quartile),
                   "baseCallQuality10thPercentile" => phred(tenth),
                   "baseCallQuality90thPercentile" => phred(ninetieth))
        end
      end

      # Not exported yet; contributes an empty node to "hasMatrix".
      def per_tile_sequence_quality
        {}
      end

      # NOTE(review): "PerSequnceQualityScores" looks like a misspelling of
      # "PerSequenceQualityScores". It is kept because it is part of the emitted
      # term IRI -- confirm against the ontology before renaming.
      def per_sequence_quality_scores
        matrix_node("PerSequnceQualityScores",
                    per_sequence_quality_scores_rows(@summary[:per_sequence_quality_scores]))
      end

      # @param matrix [Array<Array>, nil] rows of [quality, count]
      def per_sequence_quality_scores_rows(matrix)
        each_row(matrix) do |row, i|
          quality, count = row
          plain_row(i,
                    "baseCallQuality" => phred(quality),
                    "sequenceReadCount" => read_count(count))
        end
      end

      def per_base_sequence_content
        matrix_node("PerBaseSequenceContent",
                    per_base_sequence_content_rows(@summary[:per_base_sequence_content]))
      end

      # @param matrix [Array<Array>, nil] rows of [base, G, A, T, C]
      def per_base_sequence_content_rows(matrix)
        each_row(matrix) do |row, i|
          base, guanine, adenine, thymine, cytosine = row
          base_row(i, base,
                   "percentGuanine" => base_percentage(guanine),
                   "percentAdenine" => base_percentage(adenine),
                   "percentThymine" => base_percentage(thymine),
                   "percentCytosine" => base_percentage(cytosine))
        end
      end

      def per_sequence_gc_content
        matrix_node("PerSequenceGCContent",
                    per_sequence_gc_content_rows(@summary[:per_sequence_gc_content]))
      end

      # @param matrix [Array<Array>, nil] rows of [gc content, count]
      def per_sequence_gc_content_rows(matrix)
        each_row(matrix) do |row, i|
          gc_content, count = row
          plain_row(i,
                    # Uses the declared "hasUnit" key and the same percentage
                    # unit as every other percentage (was "hasunit"/"uo:Percent",
                    # a key that the @context does not define).
                    "percentGC" => base_percentage(gc_content),
                    "sequenceReadCount" => read_count(count))
        end
      end

      def per_base_n_content
        matrix_node("PerBaseNContent",
                    per_base_n_content_rows(@summary[:per_base_n_content]))
      end

      # @param matrix [Array<Array>, nil] rows of [base, n count]
      def per_base_n_content_rows(matrix)
        each_row(matrix) do |row, i|
          base, n_count = row
          base_row(i, base, "nCount" => n_content(n_count))
        end
      end

      def sequence_length_distribution
        matrix_node("SequenceLengthDistribution",
                    sequence_length_distribution_rows(@summary[:sequence_length_distribution]))
      end

      # @param matrix [Array<Array>, nil] rows of [length, count]
      def sequence_length_distribution_rows(matrix)
        each_row(matrix) do |row, i|
          length, count = row
          plain_row(i,
                    "sequenceReadLength" => read_length(length),
                    "sequenceReadCount" => read_count(count))
        end
      end

      # Not exported yet; contributes nothing to the object.
      def total_duplicate_percentage
        {}
      end

      def sequence_duplication_levels
        matrix_node("SequenceDuplicationLevels",
                    sequence_duplication_levels_rows(@summary[:sequence_duplication_levels]))
      end

      # @param matrix [Array<Array>, nil] rows of [duplication level, relative count]
      def sequence_duplication_levels_rows(matrix)
        each_row(matrix) do |row, i|
          duplication_level, relative_count = row
          plain_row(i,
                    "sequenceDuplicationLevel" =>
                      measurement("SequenceDuplicationLevel", COUNT_UNIT, duplication_level),
                    "sequenceReadRelativeCount" => read_count(relative_count))
        end
      end

      def overrepresented_sequences
        matrix_node("OverrepresentedSequences",
                    overrepresented_sequences_rows(@summary[:overrepresented_sequences]))
      end

      # @param matrix [Array<Array>, nil] rows of
      #   [sequence, count, percentage, possible source]
      def overrepresented_sequences_rows(matrix)
        each_row(matrix) do |row, i|
          sequence, count, percentage, possible_source = row
          plain_row(i,
                    "overrepresentedSequence" => sequence,
                    "sequenceReadCount" => read_count(count),
                    "sequenceReadPercentage" =>
                      measurement("SequenceReadContent", PERCENTAGE, percentage),
                    "possibleSourceOfSequence" => possible_source)
        end
      end

      # Not exported yet; contributes an empty node to "hasMatrix".
      def adapter_content
        {}
      end

      def kmer_content
        matrix_node("KmerContent", kmer_content_rows(@summary[:kmer_content]))
      end

      # @param matrix [Array<Array>, nil] rows of
      #   [sequence, count, obs/exp overall, obs/exp max, max obs/exp position]
      def kmer_content_rows(matrix)
        each_row(matrix) do |row, i|
          sequence, count, ratio_overall, ratio_max, ratio_max_position = row
          plain_row(i,
                    "kmerSequence" => sequence,
                    "sequenceReadCount" => read_count(count),
                    "observedPerExpectedOverall" =>
                      measurement("SequenceReadContent", RATIO, ratio_overall),
                    "observedPerExpectedMax" =>
                      measurement("SequenceReadContent", RATIO, ratio_max),
                    "observedPerExpectedMaxPosition" => ratio_max_position)
        end
      end

      def min_length
        { "minSequenceLength" => read_length(@summary[:min_length]) }
      end

      def max_length
        { "maxSequenceLength" => read_length(@summary[:max_length]) }
      end

      def mean_sequence_length
        { "meanSequenceLength" => read_length(@summary[:mean_sequence_length]) }
      end

      def median_sequence_length
        { "medianSequenceLength" => read_length(@summary[:median_sequence_length]) }
      end

      def overall_mean_quality_score
        { "overallMeanBaseCallQuality" => phred(@summary[:overall_mean_quality_score]) }
      end

      def overall_median_quality_score
        { "overallMedianBaseCallQuality" => phred(@summary[:overall_median_quality_score]) }
      end

      def overall_n_content
        { "overallNContent" => n_content(@summary[:overall_n_content]) }
      end

      #
      # Generate JSON-LD context object
      #

      # Builds the "@context": the imported prefixes plus one term definition
      # per local class and property. Groups are written in the order classes,
      # object properties, string, integer, float, so a term listed in more
      # than one group ends up with the definition of the last group.
      #
      # @return [Hash] term => IRI or term => {"@id", "@type"}
      def jsonld_context
        context = imported_keywords

        [
          ["@id", sos_class],
          ["@id", sos_object_properties],
          ["#{XSD}string", sos_data_properties_string],
          ["#{XSD}integer", sos_data_properties_integer],
          ["#{XSD}float", sos_data_properties_float],
        ].each do |type, terms|
          terms.each do |term|
            context[term] = { "@id" => SOS_DOMAIN + term, "@type" => type }
          end
        end

        context
      end

      # @return [Hash] prefixes of the external vocabularies used in the output
      def imported_keywords
        {
          "uo" => "http://purl.obolibrary.org/obo/",
          "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        }
      end

      #
      # definition of classes
      #

      # @return [Array<String>] every local class name
      def sos_class
        [
          sos_class_general,
          sos_class_fastqc_modules,
          sos_class_for_values,
        ].flatten
      end

      def sos_class_general
        [
          "SequenceStatisticsReport",
          "SequenceStatisticsMatrix",
          "Row",
          "ExactBaseStatistics",
          "BaseRangeStatistics",
        ]
      end

      def sos_class_fastqc_modules
        [
          "PerBaseSequenceQuality",
          "PerTileSequenceQuality",
          "PerSequnceQualityScores",
          "PerBaseSequenceContent",
          "PerSequenceGCContent",
          "PerBaseNContent",
          "SequenceLengthDistribution",
          "SequenceDuplicationLevels",
          "OverrepresentedSequences",
          "KmerContent",
        ]
      end

      # "NContent" is listed because it is used as the "@type" of the nCount
      # and overallNContent nodes; without a definition here the type has no
      # IRI in the context.
      def sos_class_for_values
        [
          "PhredQualityScore",
          "NucleotideBaseContent",
          "SequenceReadContent",
          "SequenceReadLength",
          "SequenceDuplicationLevel",
          "NContent",
        ]
      end

      #
      # definition of predicates
      #

      # NOTE(review): "basePosition", "kmerSequence" and
      # "observedPerExpectedMaxPosition" are given plain row values rather than
      # nodes, yet are declared here with "@id" coercion -- confirm whether
      # they belong with the string data properties instead.
      def sos_object_properties
        [
          "hasMatrix",
          "totalSequences",
          "filteredSequences",
          "sequenceLength",
          "percentGC",
          "hasRow",
          "basePosition",
          "kmerSequence",
          "meanBaseCallQuality",
          "medianBaseCallQuality",
          "nCount",
          "observedPerExpectedMax",
          "observedPerExpectedMaxPosition",
          "observedPerExpectedOverall",
          "percentAdenine",
          "percentCytosine",
          "percentGuanine",
          "percentThymine",
          "sequenceDuplicationLevel",
          "sequenceReadCount",
          "sequenceReadLength",
          "sequenceReadPercentage",
          "sequenceReadRelativeCount",
          "hasUnit",
          "overallMeanBaseCallQuality",
          "overallMedianBaseCallQuality",
          "overallNContent",
        ]
      end

      def sos_data_properties_string
        [
          "filename",
          "fileType",
          "encoding",
          "possibleSourceOfSequence",
          "overrepresentedSequence",
        ]
      end

      def sos_data_properties_integer
        [
          "rowIndex",
        ]
      end

      def sos_data_properties_float
        [
          "baseCallQuality",
          "baseCallQuality10thPercentile",
          "baseCallQuality90thPercentile",
          "baseCallQualityLowerQuartile",
          "baseCallQualityUpperQuartile",
          "minSequenceLength",
          "maxSequenceLength",
          "meanSequenceLength",
          "medianSequenceLength",
        ]
      end

      private

      # Builds the typed value node shared by every statistic.
      def measurement(type, unit, value)
        { "@type" => type, "hasUnit" => unit, "rdf:value" => value }
      end

      # Value node for a Phred quality score.
      def phred(value)
        measurement("PhredQualityScore", COUNT_UNIT, value)
      end

      # Value node for a number of sequence reads.
      def read_count(value)
        measurement("SequenceReadContent", COUNT_UNIT, value)
      end

      # Value node for a read length.
      def read_length(value)
        measurement("SequenceReadLength", COUNT_UNIT, value)
      end

      # Value node for a nucleotide base percentage.
      def base_percentage(value)
        measurement("NucleotideBaseContent", PERCENTAGE, value)
      end

      # Value node for an N-content percentage.
      def n_content(value)
        measurement("NContent", PERCENTAGE, value)
      end

      # Node for one tabular module.
      def matrix_node(type, rows)
        { "@type" => type, "hasRow" => rows }
      end

      # Row node that is not tied to a base position.
      def plain_row(index, fields)
        { "@type" => "Row", "rowIndex" => index }.merge(fields)
      end

      # Row node for a base position; its class depends on whether the
      # position is a single base or a range.
      def base_row(index, base, fields)
        {
          "@type" => ["Row", base_stat_class(base)],
          "rowIndex" => index,
          "basePosition" => base,
        }.merge(fields)
      end

      # Maps the rows of a matrix together with their zero-based index.
      # A nil matrix (module missing from the summary) yields no rows instead
      # of raising NoMethodError.
      def each_row(matrix, &block)
        (matrix || []).each_with_index.map(&block)
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-fastqc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tazro Inutano Ohta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -44,6 +44,34 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: 0.19.1
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: json-ld
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '1.99'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '1.99'
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: rdf-turtle
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.99'
|
68
|
+
type: :runtime
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.99'
|
47
75
|
- !ruby/object:Gem::Dependency
|
48
76
|
name: bundler
|
49
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -160,16 +188,16 @@ dependencies:
|
|
160
188
|
name: bio-fastqc
|
161
189
|
requirement: !ruby/object:Gem::Requirement
|
162
190
|
requirements:
|
163
|
-
- -
|
191
|
+
- - '='
|
164
192
|
- !ruby/object:Gem::Version
|
165
|
-
version:
|
193
|
+
version: 0.3.0
|
166
194
|
type: :development
|
167
195
|
prerelease: false
|
168
196
|
version_requirements: !ruby/object:Gem::Requirement
|
169
197
|
requirements:
|
170
|
-
- -
|
198
|
+
- - '='
|
171
199
|
- !ruby/object:Gem::Version
|
172
|
-
version:
|
200
|
+
version: 0.3.0
|
173
201
|
description: ruby parser for FastQC, a quality control software for high-throughput
|
174
202
|
sequencing data.
|
175
203
|
email: inutano@gmail.com
|
@@ -194,8 +222,10 @@ files:
|
|
194
222
|
- lib/bio-fastqc.rb
|
195
223
|
- lib/bio/fastqc.rb
|
196
224
|
- lib/bio/fastqc/cli.rb
|
225
|
+
- lib/bio/fastqc/converter.rb
|
197
226
|
- lib/bio/fastqc/data.rb
|
198
227
|
- lib/bio/fastqc/parser.rb
|
228
|
+
- lib/bio/fastqc/semantics.rb
|
199
229
|
- spec/bio-fastqc_spec.rb
|
200
230
|
- spec/example_fastqc.zip
|
201
231
|
- spec/spec_helper.rb
|