bio-fastqc 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -1
- data/VERSION +1 -1
- data/lib/bio-fastqc.rb +2 -0
- data/lib/bio/fastqc/cli.rb +8 -7
- data/lib/bio/fastqc/converter.rb +35 -0
- data/lib/bio/fastqc/parser.rb +3 -2
- data/lib/bio/fastqc/semantics.rb +715 -0
- metadata +36 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e21d6a0e0f6ec91c058b8a574e79e6fa9dad431
|
4
|
+
data.tar.gz: 93ed09413b3fdb49564c43e68a49bed3d1937d88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48097e944bd5c7e76a804f7d0623f869dddb17fceb4da878878489f2ad745dab9b0c925cadccce12bee332ff3b3c8db5dea1ce22085acf1be914bddec2f04998
|
7
|
+
data.tar.gz: fc4ac83bb59b47e5bf554884efbdc7e7f179fece8bd94fa70529fe7858727cdf6826ec87ac6081c68d91c3c82f0c908ab46a8798f9b180d9974839aa02daa77c
|
data/Gemfile
CHANGED
@@ -5,6 +5,8 @@ source "http://rubygems.org"
|
|
5
5
|
|
6
6
|
gem 'rubyzip', '~> 1.1', '>= 1.1.0'
|
7
7
|
gem 'thor', "~> 0.19.1"
|
8
|
+
gem 'json-ld', '~> 1.99'
|
9
|
+
gem 'rdf-turtle', '~> 1.99'
|
8
10
|
|
9
11
|
# Add dependencies to develop your gem here.
|
10
12
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -19,5 +21,5 @@ group :development do
|
|
19
21
|
gem 'simplecov', '~> 0.10'
|
20
22
|
|
21
23
|
gem 'pry', '~> 0.10'
|
22
|
-
gem 'bio-fastqc'
|
24
|
+
gem 'bio-fastqc', '0.3.0'
|
23
25
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.1
|
data/lib/bio-fastqc.rb
CHANGED
data/lib/bio/fastqc/cli.rb
CHANGED
@@ -6,13 +6,14 @@ require 'json'
|
|
6
6
|
module Bio
|
7
7
|
module FastQC
|
8
8
|
class CLI < Thor
|
9
|
-
desc "parse [filename]
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
9
|
+
# `parse` command: reads a FastQC result, summarizes it and prints the
# summary in the requested format.
#
# The help text lists exactly the values the Converter dispatches on
# ("json", "json-ld", "turtle"); the same list is declared as an :enum so
# Thor rejects an unsupported --format up front instead of letting the
# command print an empty line.
desc "parse [--format format] [filename]", "parse fastqc data in fastqc directory or zipfile, output in json, json-ld, or turtle format."
option :format, :default => "json", :enum => %w[json json-ld turtle]
# @param file [String] path handed to Data.read (per the help text: FastQC
#   result data, a FastQC directory, or a zipfile)
def parse(file)
  data = Data.read(file)
  summary = Parser.new(data).summary
  puts Converter.new(summary).convert_to(options[:format])
  # TODO: rescue read/parse failures and report
  #   "Wrong input file type: specify fastqc result data, directory or zipfile"
  # once the exception classes raised by Data.read are pinned down.
end
|
17
18
|
end
|
18
19
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module FastQC
|
5
|
+
# Serializes a FastQC summary into one of the supported text formats.
#
# The summary is stored as given and handed unchanged to JSON.dump (plain
# JSON) or to Semantics (JSON-LD and Turtle).
class Converter
  # Accepted format names mapped to the method that renders them.
  # "rdf-turtle" and "ttl" are aliases of "turtle".
  FORMATS = {
    "json" => :to_json,
    "json-ld" => :to_jsonld,
    "turtle" => :to_turtle,
    "rdf-turtle" => :to_turtle,
    "ttl" => :to_turtle,
  }.freeze

  # @param summary_json [Hash] summary data to serialize
  def initialize(summary_json)
    @summary_json = summary_json
  end

  # Renders the summary in the requested format.
  #
  # @param format [String, Symbol] one of the keys of FORMATS
  # @return [String] the serialized summary
  # @raise [ArgumentError] if the format is not supported (previously this
  #   case returned nil, which callers printed as an empty line)
  def convert_to(format)
    renderer = FORMATS[format.to_s]
    unless renderer
      raise ArgumentError,
            "unknown format #{format.inspect}; expected one of: #{FORMATS.keys.join(', ')}"
    end

    public_send(renderer)
  end

  # @return [String] the summary dumped as plain JSON
  def to_json
    JSON.dump(@summary_json)
  end

  # @return [String] the JSON-LD object built by Semantics, dumped as JSON
  def to_jsonld
    JSON.dump(Semantics.new(@summary_json).json_ld_object)
  end

  # @return [String] the Turtle serialization produced by Semantics#turtle
  def to_turtle
    Semantics.new(@summary_json).turtle
  end
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/bio/fastqc/parser.rb
CHANGED
@@ -220,12 +220,13 @@ module Bio
|
|
220
220
|
percent_gc: self.percent_gc,
|
221
221
|
per_base_sequence_quality: self.per_base_sequence_quality,
|
222
222
|
per_tile_sequence_quality: self.per_tile_sequence_quality,
|
223
|
-
|
223
|
+
per_sequence_quality_scores: self.per_sequence_quality_scores,
|
224
224
|
per_base_sequence_content: self.per_base_sequence_content,
|
225
225
|
per_sequence_gc_content: self.per_sequence_gc_content,
|
226
226
|
per_base_n_content: self.per_base_n_content,
|
227
227
|
sequence_length_distribution: self.sequence_length_distribution,
|
228
|
-
total_duplicate_percentage: self.total_duplicate_percentage,
|
228
|
+
total_duplicate_percentage: self.total_duplicate_percentage,
|
229
|
+
sequence_duplication_levels: self.sequence_duplication_levels,
|
229
230
|
overrepresented_sequences: self.overrepresented_sequences,
|
230
231
|
adapter_content: self.adapter_content,
|
231
232
|
kmer_content: self.kmer_content,
|
@@ -0,0 +1,715 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'json/ld'
|
4
|
+
require 'rdf/turtle'
|
5
|
+
|
6
|
+
module Bio
|
7
|
+
module FastQC
|
8
|
+
# Maps a FastQC summary Hash (symbol keys such as :filename,
# :total_sequences, :kmer_content) onto a JSON-LD object, and from there onto
# Turtle via JSON::LD::API.toRdf and RDF::Graph#dump.
#
# Every measured value is emitted as a node of the form
#   { "@type" => <class>, "hasUnit" => <unit>, "rdf:value" => <value> }
# and every matrix module as { "@type" => <class>, "hasRow" => [rows] }.
class Semantics
  # Namespace prepended to every local ontology term in the @context.
  ONTOLOGY_NAMESPACE = "http://me.com/sos#".freeze
  # Prefix of the report identifier; the file name stem is appended to it.
  REPORT_ID_PREFIX = "http://me.com/data/QNT".freeze
  # XML Schema datatype namespace used for data-property coercion.
  XSD_NAMESPACE = "http://www.w3.org/2001/XMLSchema#".freeze

  # @param summary_json [Hash] summary data, read with symbol keys
  def initialize(summary_json)
    @summary = summary_json
  end

  # @return [String] the JSON-LD object converted to RDF and dumped as Turtle
  def turtle
    graph = RDF::Graph.new << JSON::LD::API.toRdf(json_ld_object)
    graph.dump(:ttl, prefixes: turtle_prefixes)
  end

  # @return [Hash] prefix => namespace pairs passed to the Turtle writer
  def turtle_prefixes
    {
      "uo" => "http://purl.obolibrary.org/obo/",
      "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    }
  end

  # @return [Hash] the complete JSON-LD document: @context, @id, the static
  #   values merged in order, then the matrix modules under "hasMatrix"
  def json_ld_object
    document = static_value_modules.inject(object_core) do |merged, fragment|
      merged.merge(fragment)
    end
    document["hasMatrix"] = matrix_modules
    document
  end

  # @return [String] REPORT_ID_PREFIX followed by the file name up to its
  #   first "."; a missing or empty file name contributes nothing instead of
  #   raising
  def identifier
    REPORT_ID_PREFIX + @summary[:filename].to_s.split(".").first.to_s
  end

  # @return [Hash] the "@context" and "@id" entries of the document
  def object_core
    {
      "@context" => jsonld_context,
      "@id" => identifier,
    }
  end

  # @return [Array<Hash>] single-entry (or empty) hashes merged into the top
  #   level of the document, in output order
  def static_value_modules
    [
      fastqc_version,
      filename,
      file_type,
      encoding,
      total_sequences,
      filtered_sequences,
      sequence_length,
      percent_gc,
      total_duplicate_percentage,
      min_length,
      max_length,
      overall_mean_quality_score,
      overall_median_quality_score,
      overall_n_content,
      mean_sequence_length,
      median_sequence_length,
    ]
  end

  # @return [Array<Hash>] the matrix modules listed under "hasMatrix"
  def matrix_modules
    [
      per_base_sequence_quality,
      per_tile_sequence_quality,
      per_sequence_quality_scores,
      per_base_sequence_content,
      per_sequence_gc_content,
      per_base_n_content,
      sequence_length_distribution,
      sequence_duplication_levels,
      overrepresented_sequences,
      adapter_content,
      kmer_content,
    ]
  end

  # @param base [Object] base position label of a row
  # @return [String] "BaseRangeStatistics" when the label contains "-"
  #   (a range like "50-100"), otherwise "ExactBaseStatistics"
  def base_stat_class(base)
    case base
    when /-/
      "BaseRangeStatistics"
    else
      "ExactBaseStatistics"
    end
  end

  # Not mapped yet; contributes nothing to the document.
  def fastqc_version
    {}
  end

  def filename
    { "filename" => @summary[:filename] }
  end

  def file_type
    { "fileType" => @summary[:file_type] }
  end

  def encoding
    { "encoding" => @summary[:encoding] }
  end

  def total_sequences
    summary_measurement("totalSequences", "SequenceReadContent", "uo:CountUnit", :total_sequences)
  end

  def filtered_sequences
    summary_measurement("filteredSequences", "SequenceReadContent", "uo:CountUnit", :filtered_sequences)
  end

  def sequence_length
    summary_measurement("sequenceLength", "SequenceReadLength", "uo:CountUnit", :sequence_length)
  end

  # NOTE(review): the unit is kept as "uo:CountUnit" as in the original; the
  # per-row "percentGC" uses "uo:Percentage" - confirm which one is intended.
  def percent_gc
    summary_measurement("percentGC", "NucleotideBaseContent", "uo:CountUnit", :percent_gc)
  end

  def per_base_sequence_quality
    matrix_module("PerBaseSequenceQuality",
                  per_base_sequence_quality_rows(@summary[:per_base_sequence_quality]))
  end

  # @param matrix [Array<Array>, nil] rows of
  #   [base, mean, median, lower quartile, upper quartile, 10th pct, 90th pct]
  # @return [Array<Hash>] one JSON-LD row node per input row
  def per_base_sequence_quality_rows(matrix)
    map_rows(matrix) do |row, index|
      base, mean, median, lower_quartile, upper_quartile,
        tenth_percentile, ninetieth_percentile = row

      positional_row(index, base).merge(
        "meanBaseCallQuality" => phred_score(mean),
        "medianBaseCallQuality" => phred_score(median),
        "baseCallQualityLowerQuartile" => phred_score(lower_quartile),
        "baseCallQualityUpperQuartile" => phred_score(upper_quartile),
        "baseCallQuality10thPercentile" => phred_score(tenth_percentile),
        "baseCallQuality90thPercentile" => phred_score(ninetieth_percentile)
      )
    end
  end

  # Not mapped yet; emitted as an empty node.
  def per_tile_sequence_quality
    {}
  end

  # The "PerSequnceQualityScores" spelling is part of the emitted term IRI
  # and is kept as-is so existing consumers keep matching it.
  def per_sequence_quality_scores
    matrix_module("PerSequnceQualityScores",
                  per_sequence_quality_scores_rows(@summary[:per_sequence_quality_scores]))
  end

  # @param matrix [Array<Array>, nil] rows of [quality, count]
  def per_sequence_quality_scores_rows(matrix)
    map_rows(matrix) do |row, index|
      quality, count = row
      plain_row(index).merge(
        "baseCallQuality" => phred_score(quality),
        "sequenceReadCount" => read_count(count)
      )
    end
  end

  def per_base_sequence_content
    matrix_module("PerBaseSequenceContent",
                  per_base_sequence_content_rows(@summary[:per_base_sequence_content]))
  end

  # @param matrix [Array<Array>, nil] rows of
  #   [base, guanine, adenine, thymine, cytosine]
  def per_base_sequence_content_rows(matrix)
    map_rows(matrix) do |row, index|
      base, guanine, adenine, thymine, cytosine = row
      positional_row(index, base).merge(
        "percentGuanine" => measurement("NucleotideBaseContent", "uo:Percentage", guanine),
        "percentAdenine" => measurement("NucleotideBaseContent", "uo:Percentage", adenine),
        "percentThymine" => measurement("NucleotideBaseContent", "uo:Percentage", thymine),
        "percentCytosine" => measurement("NucleotideBaseContent", "uo:Percentage", cytosine)
      )
    end
  end

  def per_sequence_gc_content
    matrix_module("PerSequenceGCContent",
                  per_sequence_gc_content_rows(@summary[:per_sequence_gc_content]))
  end

  # @param matrix [Array<Array>, nil] rows of [gc_content, count]
  def per_sequence_gc_content_rows(matrix)
    map_rows(matrix) do |row, index|
      gc_content, count = row
      plain_row(index).merge(
        # Previously keyed "hasunit" with unit "uo:Percent": the key is not a
        # context term and the unit did not match the other percentages.
        "percentGC" => measurement("NucleotideBaseContent", "uo:Percentage", gc_content),
        "sequenceReadCount" => read_count(count)
      )
    end
  end

  def per_base_n_content
    matrix_module("PerBaseNContent",
                  per_base_n_content_rows(@summary[:per_base_n_content]))
  end

  # @param matrix [Array<Array>, nil] rows of [base, n_count]
  def per_base_n_content_rows(matrix)
    map_rows(matrix) do |row, index|
      base, n_count = row
      positional_row(index, base).merge(
        "nCount" => measurement("NContent", "uo:Percentage", n_count)
      )
    end
  end

  def sequence_length_distribution
    matrix_module("SequenceLengthDistribution",
                  sequence_length_distribution_rows(@summary[:sequence_length_distribution]))
  end

  # @param matrix [Array<Array>, nil] rows of [length, count]
  def sequence_length_distribution_rows(matrix)
    map_rows(matrix) do |row, index|
      length, count = row
      plain_row(index).merge(
        "sequenceReadLength" => measurement("SequenceReadLength", "uo:CountUnit", length),
        "sequenceReadCount" => read_count(count)
      )
    end
  end

  # Not mapped yet; contributes nothing to the document.
  def total_duplicate_percentage
    {}
  end

  def sequence_duplication_levels
    matrix_module("SequenceDuplicationLevels",
                  sequence_duplication_levels_rows(@summary[:sequence_duplication_levels]))
  end

  # @param matrix [Array<Array>, nil] rows of [duplication_level, relative_count]
  def sequence_duplication_levels_rows(matrix)
    map_rows(matrix) do |row, index|
      duplication_level, relative_count = row
      plain_row(index).merge(
        "sequenceDuplicationLevel" => measurement("SequenceDuplicationLevel", "uo:CountUnit", duplication_level),
        "sequenceReadRelativeCount" => read_count(relative_count)
      )
    end
  end

  def overrepresented_sequences
    matrix_module("OverrepresentedSequences",
                  overrepresented_sequences_rows(@summary[:overrepresented_sequences]))
  end

  # @param matrix [Array<Array>, nil] rows of
  #   [sequence, count, percentage, possible_source]
  def overrepresented_sequences_rows(matrix)
    map_rows(matrix) do |row, index|
      sequence, count, percentage, possible_source = row
      plain_row(index).merge(
        "overrepresentedSequence" => sequence,
        "sequenceReadCount" => read_count(count),
        "sequenceReadPercentage" => measurement("SequenceReadContent", "uo:Percentage", percentage),
        "possibleSourceOfSequence" => possible_source
      )
    end
  end

  # Not mapped yet; emitted as an empty node.
  def adapter_content
    {}
  end

  def kmer_content
    matrix_module("KmerContent", kmer_content_rows(@summary[:kmer_content]))
  end

  # @param matrix [Array<Array>, nil] rows of
  #   [sequence, count, ratio_overall, ratio_max, ratio_max_position]
  def kmer_content_rows(matrix)
    map_rows(matrix) do |row, index|
      sequence, count, ratio_overall, ratio_max, ratio_max_position = row
      plain_row(index).merge(
        "kmerSequence" => sequence,
        "sequenceReadCount" => read_count(count),
        "observedPerExpectedOverall" => measurement("SequenceReadContent", "uo:Ratio", ratio_overall),
        "observedPerExpectedMax" => measurement("SequenceReadContent", "uo:Ratio", ratio_max),
        "observedPerExpectedMaxPosition" => ratio_max_position
      )
    end
  end

  def min_length
    summary_measurement("minSequenceLength", "SequenceReadLength", "uo:CountUnit", :min_length)
  end

  def max_length
    summary_measurement("maxSequenceLength", "SequenceReadLength", "uo:CountUnit", :max_length)
  end

  def mean_sequence_length
    summary_measurement("meanSequenceLength", "SequenceReadLength", "uo:CountUnit", :mean_sequence_length)
  end

  def median_sequence_length
    summary_measurement("medianSequenceLength", "SequenceReadLength", "uo:CountUnit", :median_sequence_length)
  end

  def overall_mean_quality_score
    summary_measurement("overallMeanBaseCallQuality", "PhredQualityScore", "uo:CountUnit", :overall_mean_quality_score)
  end

  def overall_median_quality_score
    summary_measurement("overallMedianBaseCallQuality", "PhredQualityScore", "uo:CountUnit", :overall_median_quality_score)
  end

  def overall_n_content
    summary_measurement("overallNContent", "NContent", "uo:Percentage", :overall_n_content)
  end

  #
  # Generate JSON-LD context object
  #
  # @return [Hash] the imported prefixes followed by one term definition per
  #   class, object property and typed data property
  def jsonld_context
    context = imported_keywords
    define_terms(context, sos_class, "@id")
    define_terms(context, sos_object_properties, "@id")
    define_terms(context, sos_data_properties_string, XSD_NAMESPACE + "string")
    define_terms(context, sos_data_properties_integer, XSD_NAMESPACE + "integer")
    define_terms(context, sos_data_properties_float, XSD_NAMESPACE + "float")
    context
  end

  # @return [Hash] a fresh prefix map (callers add term definitions to it)
  def imported_keywords
    {
      "uo" => "http://purl.obolibrary.org/obo/",
      "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    }
  end

  #
  # definition of classes
  #

  def sos_class
    [
      sos_class_general,
      sos_class_fastqc_modules,
      sos_class_for_values,
    ].flatten
  end

  def sos_class_general
    [
      "SequenceStatisticsReport",
      "SequenceStatisticsMatrix",
      "Row",
      "ExactBaseStatistics",
      "BaseRangeStatistics",
    ]
  end

  def sos_class_fastqc_modules
    [
      "PerBaseSequenceQuality",
      "PerTileSequenceQuality",
      "PerSequnceQualityScores",
      "PerBaseSequenceContent",
      "PerSequenceGCContent",
      "PerBaseNContent",
      "SequenceLengthDistribution",
      "SequenceDuplicationLevels",
      "OverrepresentedSequences",
      "KmerContent",
    ]
  end

  # "NContent" is used as @type by nCount and overallNContent, so it has to
  # be declared here like the other value classes.
  def sos_class_for_values
    [
      "PhredQualityScore",
      "NucleotideBaseContent",
      "SequenceReadContent",
      "SequenceReadLength",
      "SequenceDuplicationLevel",
      "NContent",
    ]
  end

  #
  # definition of predicates
  #

  # Predicates whose values are nodes or IRIs (coerced with "@type": "@id").
  # Predicates that carry plain literals are declared as string data
  # properties instead, because "@id" coercion would turn their string
  # values into IRIs.
  def sos_object_properties
    [
      "hasMatrix",
      "totalSequences",
      "filteredSequences",
      "sequenceLength",
      "percentGC",
      "hasRow",
      "meanBaseCallQuality",
      "medianBaseCallQuality",
      "nCount",
      "observedPerExpectedMax",
      "observedPerExpectedOverall",
      "percentAdenine",
      "percentCytosine",
      "percentGuanine",
      "percentThymine",
      "sequenceDuplicationLevel",
      "sequenceReadCount",
      "sequenceReadLength",
      "sequenceReadPercentage",
      "sequenceReadRelativeCount",
      "hasUnit",
      "overallMeanBaseCallQuality",
      "overallMedianBaseCallQuality",
      "overallNContent",
    ]
  end

  def sos_data_properties_string
    [
      "filename",
      "fileType",
      "encoding",
      "possibleSourceOfSequence",
      "overrepresentedSequence",
      "basePosition",
      "kmerSequence",
      "observedPerExpectedMaxPosition",
    ]
  end

  def sos_data_properties_integer
    [
      "rowIndex",
    ]
  end

  def sos_data_properties_float
    [
      "baseCallQuality",
      "baseCallQuality10thPercentile",
      "baseCallQuality90thPercentile",
      "baseCallQualityLowerQuartile",
      "baseCallQualityUpperQuartile",
      "minSequenceLength",
      "maxSequenceLength",
      "meanSequenceLength",
      "medianSequenceLength",
    ]
  end

  private

  # Builds the value node shared by every measurement in the document.
  def measurement(type, unit, value)
    {
      "@type" => type,
      "hasUnit" => unit,
      "rdf:value" => value,
    }
  end

  # Wraps the summary value stored under +key+ as { term => measurement }.
  def summary_measurement(term, type, unit, key)
    { term => measurement(type, unit, @summary[key]) }
  end

  # Measurement node for a Phred quality score.
  def phred_score(value)
    measurement("PhredQualityScore", "uo:CountUnit", value)
  end

  # Measurement node for a count of sequence reads.
  def read_count(value)
    measurement("SequenceReadContent", "uo:CountUnit", value)
  end

  # Builds a matrix module node from its class name and row nodes.
  def matrix_module(type, rows)
    {
      "@type" => type,
      "hasRow" => rows,
    }
  end

  # Maps each row of +matrix+ together with its index; a nil matrix (module
  # absent from the summary) yields an empty row list instead of raising.
  def map_rows(matrix)
    Array(matrix).each_with_index.map { |row, index| yield(row, index) }
  end

  # Leading entries of a row that is not tied to a base position.
  def plain_row(index)
    {
      "@type" => "Row",
      "rowIndex" => index,
    }
  end

  # Leading entries of a row that describes a base position or base range.
  def positional_row(index, base)
    {
      "@type" => [
        "Row",
        base_stat_class(base),
      ],
      "rowIndex" => index,
      "basePosition" => base,
    }
  end

  # Adds one term definition per entry of +terms+ to +context+.
  def define_terms(context, terms, type)
    terms.each do |term|
      context[term] = {
        "@id" => ONTOLOGY_NAMESPACE + term,
        "@type" => type,
      }
    end
  end
end
|
714
|
+
end
|
715
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-fastqc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tazro Inutano Ohta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -44,6 +44,34 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: 0.19.1
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: json-ld
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '1.99'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '1.99'
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: rdf-turtle
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.99'
|
68
|
+
type: :runtime
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.99'
|
47
75
|
- !ruby/object:Gem::Dependency
|
48
76
|
name: bundler
|
49
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -160,16 +188,16 @@ dependencies:
|
|
160
188
|
name: bio-fastqc
|
161
189
|
requirement: !ruby/object:Gem::Requirement
|
162
190
|
requirements:
|
163
|
-
- -
|
191
|
+
- - '='
|
164
192
|
- !ruby/object:Gem::Version
|
165
|
-
version:
|
193
|
+
version: 0.3.0
|
166
194
|
type: :development
|
167
195
|
prerelease: false
|
168
196
|
version_requirements: !ruby/object:Gem::Requirement
|
169
197
|
requirements:
|
170
|
-
- -
|
198
|
+
- - '='
|
171
199
|
- !ruby/object:Gem::Version
|
172
|
-
version:
|
200
|
+
version: 0.3.0
|
173
201
|
description: ruby parser for FastQC, a quality control software for high-throughput
|
174
202
|
sequencing data.
|
175
203
|
email: inutano@gmail.com
|
@@ -194,8 +222,10 @@ files:
|
|
194
222
|
- lib/bio-fastqc.rb
|
195
223
|
- lib/bio/fastqc.rb
|
196
224
|
- lib/bio/fastqc/cli.rb
|
225
|
+
- lib/bio/fastqc/converter.rb
|
197
226
|
- lib/bio/fastqc/data.rb
|
198
227
|
- lib/bio/fastqc/parser.rb
|
228
|
+
- lib/bio/fastqc/semantics.rb
|
199
229
|
- spec/bio-fastqc_spec.rb
|
200
230
|
- spec/example_fastqc.zip
|
201
231
|
- spec/spec_helper.rb
|