bio-fastqc 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24e302a0001de21bb4e1ad93ef748b397a5758c9
4
- data.tar.gz: a3fecf79186d42870c47aa5f03595dc62d5c15fc
3
+ metadata.gz: f1a0b30382d2c41b0fc5327cc1e18ce63b7f190e
4
+ data.tar.gz: 479c6f71276f0360f15cc3286bafe8f12b7404d2
5
5
  SHA512:
6
- metadata.gz: 93e3dff6270cd274089ac8cc0598ec66d04e5718ebd26f44ee2cfc01b19b7625f5d1a0bb40b81c18406fa91901f7975c2aab0dbea034782b073eae331cf185ab
7
- data.tar.gz: 2441701ea9d0761bf2f9aac4ae5b9cdb7ab3d3f55270630d4ba9b1158f1dd26107b8812e8c97282541bf0f904df9e8f460047c97bf5c7a61ffe0b80c0ed40381
6
+ metadata.gz: 56c3e7b739a99e6ee39ffc958a8f59f61438ebfd44a7dd3ae9dcb1ea0ea481fb40c927673f6f8ae052e4422cfb4b8e38b6da542ace153b24d7e4f439284f68b3
7
+ data.tar.gz: 41ef0bc02eb028d9f4de4661cd4cc67159159d1e17ff4f5c06d8779ee3b0ddaf1b348f6aea1d06222873443820aba50133cb6732cfe2a8f875e87c6f12f2573f
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.2
1
+ 0.6.0
@@ -3,9 +3,9 @@
3
3
  module Bio
4
4
  module FastQC
5
5
  class Converter
6
- def initialize(summary_json, id: nil)
6
+ def initialize(fastqc_object, id: nil)
7
7
  @id = id
8
- @summary_json = summary_json
8
+ @fastqc_object = fastqc_object
9
9
  end
10
10
 
11
11
  def convert_to(format)
@@ -23,20 +23,20 @@ module Bio
23
23
 
24
24
  def to_json
25
25
  json = if @id
26
- { @id => @summary_json }
26
+ { @id => @fastqc_object }
27
27
  else
28
- @summary_json
28
+ @fastqc_object
29
29
  end
30
30
  JSON.dump(json)
31
31
  end
32
32
 
33
33
  def to_jsonld
34
- json_ld_object = Semantics.new(@summary_json, id: @id).json_ld_object
34
+ json_ld_object = Semantics.new(@fastqc_object, id: @id).json_ld_object
35
35
  JSON.dump(json_ld_object)
36
36
  end
37
37
 
38
38
  def to_turtle
39
- Semantics.new(@summary_json, id: @id).turtle
39
+ Semantics.new(@fastqc_object, id: @id).turtle
40
40
  end
41
41
 
42
42
  def to_ttl
@@ -47,28 +47,28 @@ module Bio
47
47
  identifier = if @id
48
48
  @id
49
49
  else
50
- @summary_json[:filename].split(".").first
50
+ @fastqc_object[:filename].split(".").first
51
51
  end
52
52
 
53
53
  # return one-line tab separated value
54
54
  [
55
55
  identifier,
56
- @summary_json[:fastqc_version],
57
- @summary_json[:filename],
58
- @summary_json[:file_type],
59
- @summary_json[:encoding],
60
- @summary_json[:total_sequences],
61
- @summary_json[:filtered_sequences],
62
- @summary_json[:sequence_length],
63
- @summary_json[:min_length],
64
- @summary_json[:max_length],
65
- @summary_json[:mean_sequence_length],
66
- @summary_json[:median_sequence_length],
67
- @summary_json[:percent_gc],
68
- @summary_json[:total_duplicate_percentage],
69
- @summary_json[:overall_mean_quality_score],
70
- @summary_json[:overall_median_quality_score],
71
- @summary_json[:overall_n_content],
56
+ @fastqc_object[:fastqc_version],
57
+ @fastqc_object[:filename],
58
+ @fastqc_object[:file_type],
59
+ @fastqc_object[:encoding],
60
+ @fastqc_object[:total_sequences],
61
+ @fastqc_object[:filtered_sequences],
62
+ @fastqc_object[:sequence_length],
63
+ @fastqc_object[:min_length],
64
+ @fastqc_object[:max_length],
65
+ @fastqc_object[:mean_sequence_length],
66
+ @fastqc_object[:median_sequence_length],
67
+ @fastqc_object[:percent_gc],
68
+ @fastqc_object[:total_duplicate_percentage],
69
+ @fastqc_object[:overall_mean_quality_score],
70
+ @fastqc_object[:overall_median_quality_score],
71
+ @fastqc_object[:overall_n_content],
72
72
  ].join("\t")
73
73
  end
74
74
  end
data/lib/bio/fastqc/io.rb CHANGED
@@ -5,8 +5,8 @@ require 'rdf/turtle'
5
5
  module Bio
6
6
  module FastQC
7
7
  class IO
8
- def initialize(summary_json, id: nil)
9
- @summary_json = summary_json
8
+ def initialize(fastqc_object, id: nil)
9
+ @fastqc_object = fastqc_object
10
10
  @id = id
11
11
  end
12
12
 
@@ -24,17 +24,17 @@ module Bio
24
24
  end
25
25
 
26
26
  def write_json(output_file)
27
- json = Converter.new(@summary_json, id: @id).to_json
27
+ json = Converter.new(@fastqc_object, id: @id).to_json
28
28
  open(output_file, 'w'){|file| file.puts(json) }
29
29
  end
30
30
 
31
31
  def write_jsonld(output_file)
32
- jsonld = Converter.new(@summary_json, id: @id).to_jsonld
32
+ jsonld = Converter.new(@fastqc_object, id: @id).to_jsonld
33
33
  open(output_file, 'w'){|file| file.puts(jsonld) }
34
34
  end
35
35
 
36
36
  def write_ttl(output_file)
37
- semantics = Semantics.new(@summary_json, id: @id)
37
+ semantics = Semantics.new(@fastqc_object, id: @id)
38
38
  graph = semantics.turtle_graph
39
39
  prefixes = semantics.turtle_prefixes
40
40
  RDF::Turtle::Writer.open(output_file, prefixes: prefixes) do |writer|
@@ -43,7 +43,7 @@ module Bio
43
43
  end
44
44
 
45
45
  def write_tsv(output_file)
46
- tsv = Converter.new(@summary_json, id: @id).to_tsv
46
+ tsv = Converter.new(@fastqc_object, id: @id).to_tsv
47
47
  open(output_file, 'w'){|file| file.puts(tsv) }
48
48
  end
49
49
  end
@@ -5,246 +5,230 @@ module Bio
5
5
  class Parser
6
6
  def initialize(fastqc_data_txt)
7
7
  @data = fastqc_data_txt
8
- @object = parse(@data)
9
- @base = self.basic_statistics
8
+ @module_results = parse_modules
9
+ @basic_statistics = basic_statistics
10
10
  end
11
11
 
12
- def parse(data)
13
- modules = data.split(">>END_MODULE\n")
14
- modules.map do |node|
15
- lines = node.split("\n")
16
- rm_header = lines.map do |line|
17
- if line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
18
- line.split("\t")
19
- end
20
- end
21
- rm_header.compact
12
+ def parse_modules
13
+ @data.split(">>END_MODULE\n").map do |mod|
14
+ mod.split("\n").map{|line| line.split("\t") }
22
15
  end
23
16
  end
24
17
 
25
- def fastqc_version
26
- @data.split("\n").first.split("\t").last
27
- end
18
+ #
19
+ # Basic Statistics module
20
+ #
28
21
 
29
22
  def basic_statistics
30
- Hash[*@object.select{|a| a.first.first == ">>Basic Statistics" }.flatten]
23
+ Hash[*@module_results[0].flatten]
31
24
  end
32
25
 
33
- def filename
34
- @base["Filename"]
26
+ def fastqc_version # software version of FastQC
27
+ @basic_statistics["##FastQC"]
35
28
  end
36
29
 
37
- def file_type
38
- @base["File type"]
30
+ def filename # input filename for FastQC program
31
+ @basic_statistics["Filename"]
39
32
  end
40
33
 
41
- def encoding
42
- @base["Encoding"]
34
+ def file_type # input file type
35
+ @basic_statistics["File type"]
43
36
  end
44
37
 
45
- def total_sequences
46
- @base["Total Sequences"].to_i
38
+ def encoding # quality encoding method for input file type
39
+ @basic_statistics["Encoding"]
47
40
  end
48
41
 
49
- def filtered_sequences
50
- @base["Filtered Sequences"].to_i
42
+ def total_sequences # total number of sequence reads
43
+ @basic_statistics["Total Sequences"].to_i
51
44
  end
52
45
 
53
- def sequence_length
54
- @base["Sequence length"]
46
+ def sequences_flagged_as_poor_quality # number of sequence reads flagged as poor quality
47
+ @basic_statistics["Sequences flagged as poor quality"].to_i
55
48
  end
56
49
 
57
- def min_length
58
- l = @base["Sequence length"]
59
- if l =~ /\d-\d/
60
- l.sub(/-\d+$/,"").to_i
61
- else
62
- l.to_i
63
- end
50
+ def filtered_sequences # number of sequence reads filtered out
51
+ @basic_statistics["Filtered Sequences"].to_i
64
52
  end
65
53
 
66
- def max_length
67
- l = @base["Sequence length"]
68
- if l =~ /\d-\d/
69
- l.sub(/^\d+-/,"").to_i
70
- else
71
- l.to_i
72
- end
54
+ def sequence_length # store as string: can be range
55
+ @basic_statistics["Sequence length"]
73
56
  end
74
57
 
75
- def percent_gc
76
- @base["%GC"].to_i
58
+ def percent_gc # overall percentage of GC content
59
+ @basic_statistics["%GC"].to_f
77
60
  end
78
61
 
79
- def per_base_sequence_quality
80
- node = @object.select{|a| a.first.first == ">>Per base sequence quality" }.first
81
- node.select{|n| n.first != ">>Per base sequence quality" } if node
82
- end
62
+ #
63
+ # Other modules
64
+ #
83
65
 
84
- ## Custom module: overall mean base call quality indicator
85
- def overall_mean_quality_score
86
- per_base = self.per_base_sequence_quality
87
- if per_base
88
- v = per_base.map{|c| (10**(c[1].to_f/-10)).to_f }
89
- -10 * Math.log10(v.reduce(:+) / v.size)
90
- end
66
+ def get_module_matrix(module_name, num_of_header_rows)
67
+ mod = @module_results.select{|m| m[0][0] == ">>#{module_name}" }[0]
68
+ mod.drop(num_of_header_rows) if mod
91
69
  end
92
70
 
93
- ## Custom module: overall median base call quality indicator
94
- def overall_median_quality_score
95
- per_base = self.per_base_sequence_quality
96
- if per_base
97
- v = per_base.map{|c| (10**(c[2].to_f/-10)).to_f }
98
- -10 * Math.log10(v.reduce(:+) / v.size)
99
- end
71
+ def per_base_sequence_quality
72
+ get_module_matrix("Per base sequence quality", 1)
100
73
  end
101
74
 
102
75
  def per_tile_sequence_quality
103
- node = @object.select{|a| a.first.first == ">>Per tile sequence quality" }.first
104
- node.select{|n| n.first != ">>Per tile sequence quality" } if node
76
+ get_module_matrix("Per tile sequence quality", 1)
105
77
  end
106
78
 
107
79
  def per_sequence_quality_scores
108
- node = @object.select{|a| a.first.first == ">>Per sequence quality scores" }.first
109
- node.select{|n| n.first != ">>Per sequence quality scores" } if node
80
+ get_module_matrix("Per sequence quality scores", 1)
110
81
  end
111
82
 
112
83
  def per_base_sequence_content
113
- node = @object.select{|a| a.first.first == ">>Per base sequence content" }.first
114
- node.select{|n| n.first != ">>Per base sequence content" } if node
84
+ get_module_matrix("Per base sequence content", 1)
115
85
  end
116
86
 
117
87
  def per_sequence_gc_content
118
- node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
119
- node.select{|n| n.first != ">>Per sequence GC content" } if node
88
+ get_module_matrix("Per sequence GC content", 1)
120
89
  end
121
90
 
122
- def per_sequence_gc_content
123
- node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
124
- node.select{|n| n.first != ">>Per sequence GC content" } if node
91
+ def per_base_n_content
92
+ get_module_matrix("Per base N content", 1)
125
93
  end
126
94
 
127
- def per_base_n_content
128
- node = @object.select{|a| a.first.first == ">>Per base N content" }.first
129
- node.select{|n| n.first != ">>Per base N content" } if node
95
+ def sequence_length_distribution
96
+ get_module_matrix("Sequence Length Distribution", 1)
130
97
  end
131
98
 
132
- ## Custom module: overall N content
133
- def overall_n_content
134
- per_base = self.per_base_n_content
135
- if per_base
136
- v = per_base.map{|c| c[1].to_f }
137
- v.reduce(:+) / v.size
138
- end
99
+ def total_duplicate_percentage
100
+ get_module_matrix("Sequence Duplication Levels", 1)[0][1].to_f
139
101
  end
140
102
 
141
- def sequence_length_distribution
142
- node = @object.select{|a| a.first.first == ">>Sequence Length Distribution" }.first
143
- node.select{|n| n.first != ">>Sequence Length Distribution" } if node
103
+ def sequence_duplication_levels
104
+ get_module_matrix("Sequence Duplication Levels", 2)
144
105
  end
145
106
 
146
- ## Custom module: mean sequence length calculated from distribution
147
- def mean_sequence_length
148
- distribution = self.sequence_length_distribution
149
- if distribution
150
- sum = distribution.map do |length_count|
151
- length = length_count[0]
152
- count = length_count[1].to_f
153
- if length =~ /\d-\d/
154
- f = length.sub(/-\d+$/,"").to_i
155
- b = length.sub(/^\d+-/,"").to_i
156
- mean = (f + b) / 2
157
- mean * count
158
- else
159
- length.to_i * count
160
- end
161
- end
162
- sum.reduce(:+) / self.total_sequences
107
+ def overrepresented_sequences
108
+ get_module_matrix("Overrepresented sequences", 1)
109
+ end
110
+
111
+ def adapter_content
112
+ get_module_matrix("Adapter Content", 1)
113
+ end
114
+
115
+ def kmer_content
116
+ get_module_matrix("Kmer Content", 1)
117
+ end
118
+
119
+ #
120
+ # Custom modules
121
+ #
122
+
123
+ def min_length
124
+ sequence_length.sub(/-\d+$/,"").to_i
125
+ end
126
+
127
+ def max_length
128
+ sequence_length.sub(/^\d+-/,"").to_i
129
+ end
130
+
131
+ def per_base_quality_column(mean_or_median)
132
+ case mean_or_median
133
+ when :mean
134
+ 1
135
+ when :median
136
+ 2
163
137
  end
164
138
  end
165
139
 
166
- ## Custom module: median sequence length calculated from distribution
167
- def median_sequence_length
168
- distribution = self.sequence_length_distribution
169
- if distribution
170
- array = distribution.map do |length_count|
171
- length = length_count[0]
172
- count = length_count[1].to_i
173
- if length =~ /\d-\d/
174
- f = length.sub(/-\d+$/,"").to_i
175
- b = length.sub(/^\d+-/,"").to_i
176
- mean = (f + b) / 2
177
- [mean.to_f] * count
178
- else
179
- [length.to_f] * count
180
- end
181
- end
182
- sorted = array.flatten.sort
183
- quot = sorted.size / 2
184
- if !sorted.size.even?
185
- sorted[quot]
186
- else
187
- f = sorted[quot]
188
- b = sorted[quot - 1]
189
- (f + b) / 2
190
- end
140
+ def overall_quality_score(mean_or_median)
141
+ per_base = per_base_sequence_quality.drop(1) # drop header
142
+ column = per_base_quality_column(mean_or_median)
143
+ v = per_base.map do |row|
144
+ (10**(row[column].to_f / -10)).to_f
191
145
  end
146
+ -10 * Math.log10(v.reduce(:+) / v.size)
192
147
  end
193
148
 
194
- def sequence_duplication_levels
195
- node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
196
- node.select{|n| n.first != ">>Sequence Duplication Levels" && n.first != "\#Total Duplicate Percentage" } if node
149
+ def overall_mean_quality_score
150
+ overall_quality_score(:mean)
197
151
  end
198
152
 
199
- def total_duplicate_percentage
200
- node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
201
- node.select{|n| n.first == "\#Total Duplicate Percentage" }.flatten[1].to_f if node
153
+ def overall_median_quality_score
154
+ overall_quality_score(:median)
202
155
  end
203
156
 
204
- def overrepresented_sequences
205
- node = @object.select{|a| a.first.first == ">>Overrepresented sequences" }.first
206
- node.select{|n| n.first != ">>Overrepresented sequences" } if node
157
+ def overall_n_content
158
+ per_base = per_base_n_content
159
+ v = per_base.map{|c| c[1].to_f }
160
+ v.reduce(:+) / v.size
207
161
  end
208
162
 
209
- def adapter_content
210
- node = @object.select{|a| a.first.first == ">>Adapter Content" }.first
211
- node.select{|n| n.first != ">>Adapter Content" } if node
163
+ def mean_sequence_length
164
+ dist = sequence_length_distribution.drop(1) # drop column header
165
+ if dist.size == 1
166
+ dist[0][0].to_f
167
+ else
168
+ sum = dist.map do |length_count|
169
+ l = length_count[0]
170
+ c = length_count[1].to_f
171
+ ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2) * c
172
+ end
173
+ sum.reduce(:+) / dist.map{|l_c| l_c[1].to_f }.reduce(:+)
174
+ end
212
175
  end
213
176
 
214
- def kmer_content
215
- node = @object.select{|a| a.first.first == ">>Kmer Content" }.first
216
- node.select{|n| n.first != ">>Kmer Content" } if node
177
+ def median_sequence_length
178
+ dist = sequence_length_distribution.drop(1) # drop column header
179
+ if dist.size == 1
180
+ dist[0][0].to_f
181
+ else
182
+ k = dist.map{|l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
183
+ median = 0
184
+ dist.each do |l_c|
185
+ c = l_c[1].to_f # count of reads in this length range
186
+ if k > c
187
+ k -= c
188
+ else
189
+ l = l_c[0]
190
+ median = ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2)
191
+ break
192
+ end
193
+ end
194
+ median
195
+ end
217
196
  end
218
197
 
219
198
  def summary
199
+ parse
200
+ end
201
+
202
+ def parse
220
203
  {
221
- fastqc_version: self.fastqc_version,
222
- filename: self.filename,
223
- file_type: self.file_type,
224
- encoding: self.encoding,
225
- total_sequences: self.total_sequences,
226
- filtered_sequences: self.filtered_sequences,
227
- sequence_length: self.sequence_length,
228
- percent_gc: self.percent_gc,
229
- per_base_sequence_quality: self.per_base_sequence_quality,
230
- per_tile_sequence_quality: self.per_tile_sequence_quality,
231
- per_sequence_quality_scores: self.per_sequence_quality_scores,
232
- per_base_sequence_content: self.per_base_sequence_content,
233
- per_sequence_gc_content: self.per_sequence_gc_content,
234
- per_base_n_content: self.per_base_n_content,
235
- sequence_length_distribution: self.sequence_length_distribution,
236
- total_duplicate_percentage: self.total_duplicate_percentage,
237
- sequence_duplication_levels: self.sequence_duplication_levels,
238
- overrepresented_sequences: self.overrepresented_sequences,
239
- adapter_content: self.adapter_content,
240
- kmer_content: self.kmer_content,
241
- min_length: self.min_length,
242
- max_length: self.max_length,
243
- overall_mean_quality_score: self.overall_mean_quality_score,
244
- overall_median_quality_score: self.overall_median_quality_score,
245
- overall_n_content: self.overall_n_content,
246
- mean_sequence_length: self.mean_sequence_length,
247
- median_sequence_length: self.median_sequence_length,
204
+ fastqc_version: fastqc_version,
205
+ filename: filename,
206
+ file_type: file_type,
207
+ encoding: encoding,
208
+ total_sequences: total_sequences,
209
+ sequences_flagged_as_poor_quality: sequences_flagged_as_poor_quality,
210
+ filtered_sequences: filtered_sequences,
211
+ sequence_length: sequence_length,
212
+ percent_gc: percent_gc,
213
+ per_base_sequence_quality: per_base_sequence_quality,
214
+ per_tile_sequence_quality: per_tile_sequence_quality,
215
+ per_sequence_quality_scores: per_sequence_quality_scores,
216
+ per_base_sequence_content: per_base_sequence_content,
217
+ per_sequence_gc_content: per_sequence_gc_content,
218
+ per_base_n_content: per_base_n_content,
219
+ sequence_length_distribution: sequence_length_distribution,
220
+ total_duplicate_percentage: total_duplicate_percentage,
221
+ sequence_duplication_levels: sequence_duplication_levels,
222
+ overrepresented_sequences: overrepresented_sequences,
223
+ adapter_content: adapter_content,
224
+ kmer_content: kmer_content,
225
+ min_length: min_length,
226
+ max_length: max_length,
227
+ overall_mean_quality_score: overall_mean_quality_score,
228
+ overall_median_quality_score: overall_median_quality_score,
229
+ overall_n_content: overall_n_content,
230
+ mean_sequence_length: mean_sequence_length,
231
+ median_sequence_length: median_sequence_length,
248
232
  }
249
233
  end
250
234
  end
@@ -6,9 +6,13 @@ require 'rdf/turtle'
6
6
  module Bio
7
7
  module FastQC
8
8
  class Semantics
9
- def initialize(summary_json, id: nil)
9
+ def initialize(fastqc_object, id: nil)
10
10
  @id = id
11
- @summary = summary_json
11
+ @fastqc_object = fastqc_object
12
+ end
13
+
14
+ def rdf_version
15
+ "0.1.0"
12
16
  end
13
17
 
14
18
  def turtle
@@ -23,6 +27,9 @@ module Bio
23
27
  {
24
28
  "uo" => "http://purl.obolibrary.org/obo/",
25
29
  "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
30
+ "dcterms" => "http://purl.org/dc/terms/",
31
+ "pav" => "http://purl.org/pav/",
32
+ "foaf" => "http://xmlns.com/foaf/0.1/",
26
33
  }
27
34
  end
28
35
 
@@ -32,18 +39,30 @@ module Bio
32
39
  object
33
40
  end
34
41
 
35
- def identifier
36
- if @id
37
- @id
38
- else
39
- "http://me.com/data/QNT" + @summary[:filename].split(".").first
40
- end
42
+ def uri_base
43
+ "http://purl.jp/bio/01/quanto"
44
+ end
45
+
46
+ def identifier_literal
47
+ @id ? @id : "QNT" + @fastqc_object[:filename].split(".")[0]
48
+ end
49
+
50
+ def identifier_uri
51
+ uri_base + "/resource/" + identifier_literal
41
52
  end
42
53
 
43
54
  def object_core
44
55
  {
45
56
  "@context" => jsonld_context,
46
- "@id" => identifier,
57
+ "@id" => identifier_uri,
58
+ "@type" => "SequenceStatisticsReport",
59
+ "dcterms:identifier" => identifier_literal,
60
+ "dcterms:contributor" => ["Tazro Ohta", "Shuichi Kawashima"],
61
+ "dcterms:created" => Time.now.strftime("%Y-%m-%d"),
62
+ "dcterms:license" => "http://creativecommons.org/licenses/by-sa/2.1/jp/deed.en",
63
+ "dcterms:publisher" => "http://dbcls.rois.ac.jp/",
64
+ "pav:version" => rdf_version,
65
+ "foaf:page" => "http://quanto.dbcls.jp",
47
66
  }
48
67
  end
49
68
 
@@ -94,24 +113,26 @@ module Bio
94
113
  end
95
114
 
96
115
  def fastqc_version
97
- {}
116
+ {
117
+ "fastqcVersion" => @fastqc_object[:fastqc_version],
118
+ }
98
119
  end
99
120
 
100
121
  def filename
101
122
  {
102
- "filename" => @summary[:filename],
123
+ "filename" => @fastqc_object[:filename],
103
124
  }
104
125
  end
105
126
 
106
127
  def file_type
107
128
  {
108
- "fileType" => @summary[:file_type],
129
+ "fileType" => @fastqc_object[:file_type],
109
130
  }
110
131
  end
111
132
 
112
133
  def encoding
113
134
  {
114
- "encoding" => @summary[:encoding],
135
+ "encoding" => @fastqc_object[:encoding],
115
136
  }
116
137
  end
117
138
 
@@ -120,7 +141,7 @@ module Bio
120
141
  "totalSequences" => {
121
142
  "@type" => "SequenceReadContent",
122
143
  "hasUnit" => "uo:CountUnit",
123
- "rdf:value" => @summary[:total_sequences],
144
+ "rdf:value" => @fastqc_object[:total_sequences],
124
145
  }
125
146
  }
126
147
  end
@@ -130,7 +151,7 @@ module Bio
130
151
  "filteredSequences" => {
131
152
  "@type" => "SequenceReadContent",
132
153
  "hasUnit" => "uo:CountUnit",
133
- "rdf:value" => @summary[:filtered_sequences],
154
+ "rdf:value" => @fastqc_object[:filtered_sequences],
134
155
  }
135
156
  }
136
157
  end
@@ -140,7 +161,7 @@ module Bio
140
161
  "sequenceLength" => {
141
162
  "@type" => "SequenceReadLength",
142
163
  "hasUnit" => "uo:CountUnit",
143
- "rdf:value" => @summary[:sequence_length],
164
+ "rdf:value" => @fastqc_object[:sequence_length],
144
165
  }
145
166
  }
146
167
  end
@@ -150,7 +171,7 @@ module Bio
150
171
  "percentGC" => {
151
172
  "@type" => "NucleotideBaseContent",
152
173
  "hasUnit" => "uo:CountUnit",
153
- "rdf:value" => @summary[:percent_gc],
174
+ "rdf:value" => @fastqc_object[:percent_gc],
154
175
  }
155
176
  }
156
177
  end
@@ -158,7 +179,7 @@ module Bio
158
179
  def per_base_sequence_quality
159
180
  {
160
181
  "@type" => "PerBaseSequenceQuality",
161
- "hasRow" => per_base_sequence_quality_rows(@summary[:per_base_sequence_quality]),
182
+ "hasRow" => per_base_sequence_quality_rows(@fastqc_object[:per_base_sequence_quality]),
162
183
  }
163
184
  end
164
185
 
@@ -220,7 +241,7 @@ module Bio
220
241
  def per_sequence_quality_scores
221
242
  {
222
243
  "@type" => "PerSequnceQualityScores",
223
- "hasRow" => per_sequence_quality_scores_rows(@summary[:per_sequence_quality_scores]),
244
+ "hasRow" => per_sequence_quality_scores_rows(@fastqc_object[:per_sequence_quality_scores]),
224
245
  }
225
246
  end
226
247
 
@@ -248,7 +269,7 @@ module Bio
248
269
  def per_base_sequence_content
249
270
  {
250
271
  "@type" => "PerBaseSequenceContent",
251
- "hasRow" => per_base_sequence_content_rows(@summary[:per_base_sequence_content]),
272
+ "hasRow" => per_base_sequence_content_rows(@fastqc_object[:per_base_sequence_content]),
252
273
  }
253
274
  end
254
275
 
@@ -293,7 +314,7 @@ module Bio
293
314
  def per_sequence_gc_content
294
315
  {
295
316
  "@type" => "PerSequenceGCContent",
296
- "hasRow" => per_sequence_gc_content_rows(@summary[:per_sequence_gc_content]),
317
+ "hasRow" => per_sequence_gc_content_rows(@fastqc_object[:per_sequence_gc_content]),
297
318
  }
298
319
  end
299
320
 
@@ -321,7 +342,7 @@ module Bio
321
342
  def per_base_n_content
322
343
  {
323
344
  "@type" => "PerBaseNContent",
324
- "hasRow" => per_base_n_content_rows(@summary[:per_base_n_content]),
345
+ "hasRow" => per_base_n_content_rows(@fastqc_object[:per_base_n_content]),
325
346
  }
326
347
  end
327
348
 
@@ -348,7 +369,7 @@ module Bio
348
369
  def sequence_length_distribution
349
370
  {
350
371
  "@type" => "SequenceLengthDistribution",
351
- "hasRow" => sequence_length_distribution_rows(@summary[:sequence_length_distribution]),
372
+ "hasRow" => sequence_length_distribution_rows(@fastqc_object[:sequence_length_distribution]),
352
373
  }
353
374
  end
354
375
 
@@ -381,7 +402,7 @@ module Bio
381
402
  def sequence_duplication_levels
382
403
  {
383
404
  "@type" => "SequenceDuplicationLevels",
384
- "hasRow" => sequence_duplication_levels_rows(@summary[:sequence_duplication_levels]),
405
+ "hasRow" => sequence_duplication_levels_rows(@fastqc_object[:sequence_duplication_levels]),
385
406
  }
386
407
  end
387
408
 
@@ -410,7 +431,7 @@ module Bio
410
431
  def overrepresented_sequences
411
432
  {
412
433
  "@type" => "OverrepresentedSequences",
413
- "hasRow" => overrepresented_sequences_rows(@summary[:overrepresented_sequences]),
434
+ "hasRow" => overrepresented_sequences_rows(@fastqc_object[:overrepresented_sequences]),
414
435
  }
415
436
  end
416
437
 
@@ -446,7 +467,7 @@ module Bio
446
467
  def kmer_content
447
468
  {
448
469
  "@type" => "KmerContent",
449
- "hasRow" => kmer_content_rows(@summary[:kmer_content]),
470
+ "hasRow" => kmer_content_rows(@fastqc_object[:kmer_content]),
450
471
  }
451
472
  end
452
473
 
@@ -486,7 +507,7 @@ module Bio
486
507
  "minSequenceLength" => {
487
508
  "@type" => "SequenceReadLength",
488
509
  "hasUnit" => "uo:CountUnit",
489
- "rdf:value" => @summary[:min_length],
510
+ "rdf:value" => @fastqc_object[:min_length],
490
511
  }
491
512
  }
492
513
  end
@@ -496,7 +517,7 @@ module Bio
496
517
  "maxSequenceLength" => {
497
518
  "@type" => "SequenceReadLength",
498
519
  "hasUnit" => "uo:CountUnit",
499
- "rdf:value" => @summary[:max_length],
520
+ "rdf:value" => @fastqc_object[:max_length],
500
521
  }
501
522
  }
502
523
  end
@@ -506,7 +527,7 @@ module Bio
506
527
  "meanSequenceLength" => {
507
528
  "@type" => "SequenceReadLength",
508
529
  "hasUnit" => "uo:CountUnit",
509
- "rdf:value" => @summary[:mean_sequence_length],
530
+ "rdf:value" => @fastqc_object[:mean_sequence_length],
510
531
  }
511
532
  }
512
533
  end
@@ -516,7 +537,7 @@ module Bio
516
537
  "medianSequenceLength" => {
517
538
  "@type" => "SequenceReadLength",
518
539
  "hasUnit" => "uo:CountUnit",
519
- "rdf:value" => @summary[:median_sequence_length],
540
+ "rdf:value" => @fastqc_object[:median_sequence_length],
520
541
  }
521
542
  }
522
543
  end
@@ -526,7 +547,7 @@ module Bio
526
547
  "overallMeanBaseCallQuality" => {
527
548
  "@type" => "PhredQualityScore",
528
549
  "hasUnit" => "uo:CountUnit",
529
- "rdf:value" => @summary[:overall_mean_quality_score],
550
+ "rdf:value" => @fastqc_object[:overall_mean_quality_score],
530
551
  }
531
552
  }
532
553
  end
@@ -536,7 +557,7 @@ module Bio
536
557
  "overallMedianBaseCallQuality" => {
537
558
  "@type" => "PhredQualityScore",
538
559
  "hasUnit" => "uo:CountUnit",
539
- "rdf:value" => @summary[:overall_median_quality_score],
560
+ "rdf:value" => @fastqc_object[:overall_median_quality_score],
540
561
  }
541
562
  }
542
563
  end
@@ -546,7 +567,7 @@ module Bio
546
567
  "overallNContent" => {
547
568
  "@type" => "NContent",
548
569
  "hasUnit" => "uo:Percentage",
549
- "rdf:value" => @summary[:overall_n_content],
570
+ "rdf:value" => @fastqc_object[:overall_n_content],
550
571
  }
551
572
  }
552
573
  end
@@ -557,10 +578,10 @@ module Bio
557
578
 
558
579
  def jsonld_context
559
580
  # definition of imported terms in @context
560
- object = imported_keywords
581
+ object = turtle_prefixes
561
582
 
562
583
  # definition of local ontology terms
563
- domain = "http://me.com/sos#"
584
+ domain = uri_base + "/ontology/sos#"
564
585
 
565
586
  # definition of class in @context
566
587
  sos_class.each do |term|
@@ -597,13 +618,6 @@ module Bio
597
618
  object
598
619
  end
599
620
 
600
- def imported_keywords
601
- {
602
- "uo" => "http://purl.obolibrary.org/obo/",
603
- "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
604
- }
605
- end
606
-
607
621
  #
608
622
  # definition of classes
609
623
  #
@@ -690,6 +704,7 @@ module Bio
690
704
 
691
705
  def sos_data_properties_string
692
706
  [
707
+ "fastqcVersion",
693
708
  "filename",
694
709
  "fileType",
695
710
  "encoding",
@@ -14,6 +14,7 @@ describe Bio::FastQC do
14
14
  describe '#read' do
15
15
  it 'returns parsed data from zipfile' do
16
16
  expect(@data).not_to be_empty
17
+ expect(@data).not_to be_nil
17
18
  end
18
19
  end
19
20
  end
@@ -25,55 +26,112 @@ describe Bio::FastQC do
25
26
  end
26
27
 
27
28
  describe '#fastqc_version' do
28
- it 'returns fastqc version as String and not empty' do
29
+ it 'returns fastqc version as String' do
29
30
  expect(@parser.fastqc_version).to be_instance_of(String)
31
+ end
32
+
33
+ it 'does not return empty string' do
30
34
  expect(@parser.fastqc_version).not_to be_empty
31
35
  end
36
+
37
+ it 'does not return nil' do
38
+ expect(@parser.fastqc_version).not_to be_nil
39
+ end
32
40
  end
33
41
 
34
42
  describe '#filename' do
35
- it 'returns filename as String and not empty' do
43
+ it 'returns filename as String' do
36
44
  expect(@parser.filename).to be_instance_of(String)
45
+ end
46
+
47
+ it 'does not return empty string' do
37
48
  expect(@parser.filename).not_to be_empty
38
49
  end
50
+
51
+ it 'does not return nil' do
52
+ expect(@parser.filename).not_to be_nil
53
+ end
39
54
  end
40
55
 
41
56
  describe '#file_type' do
42
- it 'returns file type as String and not empty' do
57
+ it 'returns file type as String' do
43
58
  expect(@parser.file_type).to be_instance_of(String)
59
+ end
60
+
61
+ it 'does not return empty string' do
44
62
  expect(@parser.file_type).not_to be_empty
45
63
  end
64
+
65
+ it 'does not return nil' do
66
+ expect(@parser.file_type).not_to be_nil
67
+ end
46
68
  end
47
69
 
48
70
  describe '#encoding' do
49
- it 'returns encoding type as String and not empty' do
71
+ it 'returns encoding type as String' do
50
72
  expect(@parser.encoding).to be_instance_of(String)
73
+ end
74
+
75
+ it 'does not return empty string' do
51
76
  expect(@parser.encoding).not_to be_empty
52
77
  end
78
+
79
+ it 'does not return nil' do
80
+ expect(@parser.encoding).not_to be_nil
81
+ end
53
82
  end
54
83
 
55
84
  describe '#total_sequences' do
56
85
  it 'returns total number of sequences as Fixnum' do
57
86
  expect(@parser.total_sequences).to be_instance_of(Fixnum)
58
87
  end
88
+
89
+ it 'returns integer larger than zero' do
90
+ expect(@parser.total_sequences).to be > 0
91
+ end
92
+
93
+ it 'does not return nil' do
94
+ expect(@parser.total_sequences).not_to be_nil
95
+ end
59
96
  end
60
97
 
61
98
  describe '#filtered_sequences' do
62
- it 'returns number of filtered sequence as Fixnum and not empty' do
63
- expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
99
+ it 'returns number of filtered sequence as Fixnum, can be nil' do
100
+ if @parser.filtered_sequences
101
+ expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
102
+ end
103
+ end
104
+ end
105
+
106
+ describe '#sequences_flagged_as_poor_quality' do
107
+ it 'returns number of sequences flagged as poor quality as Fixnum, can be nil' do
108
+ if @parser.sequences_flagged_as_poor_quality
109
+ expect(@parser.sequences_flagged_as_poor_quality).to be_instance_of(Fixnum)
110
+ end
64
111
  end
65
112
  end
66
113
 
67
114
  describe '#sequence_length' do
68
- it 'returns length of sequence as String and not empty' do
115
+ it 'returns length of sequence as String' do
69
116
  expect(@parser.sequence_length).to be_instance_of(String)
117
+ end
118
+
119
+ it 'does not return empty string' do
70
120
  expect(@parser.sequence_length).not_to be_empty
71
121
  end
122
+
123
+ it 'does not return nil' do
124
+ expect(@parser.sequence_length).not_to be_nil
125
+ end
72
126
  end
73
127
 
74
128
  describe '#percent_gc' do
75
- it 'returns percentage of GC content as Fixnum and not empty' do
76
- expect(@parser.percent_gc).to be_instance_of(Fixnum)
129
+ it 'returns percentage of GC content as Float' do
130
+ expect(@parser.percent_gc).to be_instance_of(Float)
131
+ end
132
+
133
+ it 'does not return nil' do
134
+ expect(@parser.percent_gc).not_to be_nil
77
135
  end
78
136
  end
79
137
 
@@ -190,6 +248,10 @@ describe Bio::FastQC do
190
248
  it 'returns duplicate percentage as Float and not empty' do
191
249
  expect(@parser.total_duplicate_percentage).to be_instance_of(Float)
192
250
  end
251
+
252
+ it 'does not returns nil' do
253
+ expect(@parser.total_duplicate_percentage).not_to be_nil
254
+ end
193
255
  end
194
256
 
195
257
  describe '#sequence_duplication_levels' do
@@ -256,42 +318,88 @@ describe Bio::FastQC do
256
318
  it 'returns minimum read length as Fixnum and not empty' do
257
319
  expect(@parser.min_length).to be_instance_of(Fixnum)
258
320
  end
321
+
322
+ it 'returns integer larger than zero' do
323
+ expect(@parser.min_length).to be > 0
324
+ end
325
+
326
+ it 'does not return nil' do
327
+ expect(@parser.min_length).not_to be_nil
328
+ end
259
329
  end
260
330
 
261
331
  describe '#max_length' do
262
332
  it 'returns maximum read length as Fixnum and not empty' do
263
333
  expect(@parser.max_length).to be_instance_of(Fixnum)
264
334
  end
335
+
336
+ it 'returns integer larger than zero' do
337
+ expect(@parser.max_length).to be > 0
338
+ end
339
+
340
+ it 'does not return nil' do
341
+ expect(@parser.max_length).not_to be_nil
342
+ end
265
343
  end
266
344
 
267
345
  describe '#overall_mean_quality_score' do
268
346
  it 'returns overall mean quality score as Float and not empty' do
269
347
  expect(@parser.overall_mean_quality_score).to be_instance_of(Float)
270
348
  end
349
+
350
+ it 'does not return nil' do
351
+ expect(@parser.overall_mean_quality_score).not_to be_nil
352
+ end
271
353
  end
272
354
 
273
355
  describe '#overall_median_quality_score' do
274
356
  it 'returns overall median quality score as Float and not empty' do
275
357
  expect(@parser.overall_median_quality_score).to be_instance_of(Float)
276
358
  end
359
+
360
+ it 'does not return nil' do
361
+ expect(@parser.overall_median_quality_score).not_to be_nil
362
+ end
277
363
  end
278
364
 
279
365
  describe '#overall_n_content' do
280
366
  it 'returns overall N content as Float and not empty' do
281
367
  expect(@parser.overall_n_content).to be_instance_of(Float)
282
368
  end
369
+
370
+ it 'does not return nil' do
371
+ expect(@parser.overall_n_content).not_to be_nil
372
+ end
283
373
  end
284
374
 
285
375
  describe '#mean_sequence_length' do
286
376
  it 'returns mean sequence length from read length distribution as Float and not empty' do
287
377
  expect(@parser.mean_sequence_length).to be_instance_of(Float)
288
378
  end
379
+
380
+ it 'does not return nil' do
381
+ expect(@parser.mean_sequence_length).not_to be_nil
382
+ end
289
383
  end
290
384
 
291
385
  describe '#median_sequence_length' do
292
386
  it 'returns median sequence length from read length distribution as Float and not empty' do
293
387
  expect(@parser.median_sequence_length).to be_instance_of(Float)
294
388
  end
389
+
390
+ it 'does not return nil' do
391
+ expect(@parser.median_sequence_length).not_to be_nil
392
+ end
393
+ end
394
+
395
+ describe '#parse' do
396
+ it 'does not return nil' do
397
+ expect(@parser.parse).not_to be_nil
398
+ end
399
+
400
+ it 'returns hash' do
401
+ expect(@parser.parse).to be_instance_of(Hash)
402
+ end
295
403
  end
296
404
  end
297
405
  end
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-fastqc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tazro Inutano Ohta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -229,6 +229,7 @@ files:
229
229
  - lib/bio/fastqc/semantics.rb
230
230
  - spec/bio-fastqc_spec.rb
231
231
  - spec/example_fastqc.zip
232
+ - spec/example_fastqc_454.zip
232
233
  - spec/spec_helper.rb
233
234
  homepage: http://github.com/inutano/bioruby-fastqc
234
235
  licenses: