bio-fastqc 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24e302a0001de21bb4e1ad93ef748b397a5758c9
4
- data.tar.gz: a3fecf79186d42870c47aa5f03595dc62d5c15fc
3
+ metadata.gz: f1a0b30382d2c41b0fc5327cc1e18ce63b7f190e
4
+ data.tar.gz: 479c6f71276f0360f15cc3286bafe8f12b7404d2
5
5
  SHA512:
6
- metadata.gz: 93e3dff6270cd274089ac8cc0598ec66d04e5718ebd26f44ee2cfc01b19b7625f5d1a0bb40b81c18406fa91901f7975c2aab0dbea034782b073eae331cf185ab
7
- data.tar.gz: 2441701ea9d0761bf2f9aac4ae5b9cdb7ab3d3f55270630d4ba9b1158f1dd26107b8812e8c97282541bf0f904df9e8f460047c97bf5c7a61ffe0b80c0ed40381
6
+ metadata.gz: 56c3e7b739a99e6ee39ffc958a8f59f61438ebfd44a7dd3ae9dcb1ea0ea481fb40c927673f6f8ae052e4422cfb4b8e38b6da542ace153b24d7e4f439284f68b3
7
+ data.tar.gz: 41ef0bc02eb028d9f4de4661cd4cc67159159d1e17ff4f5c06d8779ee3b0ddaf1b348f6aea1d06222873443820aba50133cb6732cfe2a8f875e87c6f12f2573f
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.2
1
+ 0.6.0
@@ -3,9 +3,9 @@
3
3
  module Bio
4
4
  module FastQC
5
5
  class Converter
6
- def initialize(summary_json, id: nil)
6
+ def initialize(fastqc_object, id: nil)
7
7
  @id = id
8
- @summary_json = summary_json
8
+ @fastqc_object = fastqc_object
9
9
  end
10
10
 
11
11
  def convert_to(format)
@@ -23,20 +23,20 @@ module Bio
23
23
 
24
24
  def to_json
25
25
  json = if @id
26
- { @id => @summary_json }
26
+ { @id => @fastqc_object }
27
27
  else
28
- @summary_json
28
+ @fastqc_object
29
29
  end
30
30
  JSON.dump(json)
31
31
  end
32
32
 
33
33
  def to_jsonld
34
- json_ld_object = Semantics.new(@summary_json, id: @id).json_ld_object
34
+ json_ld_object = Semantics.new(@fastqc_object, id: @id).json_ld_object
35
35
  JSON.dump(json_ld_object)
36
36
  end
37
37
 
38
38
  def to_turtle
39
- Semantics.new(@summary_json, id: @id).turtle
39
+ Semantics.new(@fastqc_object, id: @id).turtle
40
40
  end
41
41
 
42
42
  def to_ttl
@@ -47,28 +47,28 @@ module Bio
47
47
  identifier = if @id
48
48
  @id
49
49
  else
50
- @summary_json[:filename].split(".").first
50
+ @fastqc_object[:filename].split(".").first
51
51
  end
52
52
 
53
53
  # return one-line tab separated value
54
54
  [
55
55
  identifier,
56
- @summary_json[:fastqc_version],
57
- @summary_json[:filename],
58
- @summary_json[:file_type],
59
- @summary_json[:encoding],
60
- @summary_json[:total_sequences],
61
- @summary_json[:filtered_sequences],
62
- @summary_json[:sequence_length],
63
- @summary_json[:min_length],
64
- @summary_json[:max_length],
65
- @summary_json[:mean_sequence_length],
66
- @summary_json[:median_sequence_length],
67
- @summary_json[:percent_gc],
68
- @summary_json[:total_duplicate_percentage],
69
- @summary_json[:overall_mean_quality_score],
70
- @summary_json[:overall_median_quality_score],
71
- @summary_json[:overall_n_content],
56
+ @fastqc_object[:fastqc_version],
57
+ @fastqc_object[:filename],
58
+ @fastqc_object[:file_type],
59
+ @fastqc_object[:encoding],
60
+ @fastqc_object[:total_sequences],
61
+ @fastqc_object[:filtered_sequences],
62
+ @fastqc_object[:sequence_length],
63
+ @fastqc_object[:min_length],
64
+ @fastqc_object[:max_length],
65
+ @fastqc_object[:mean_sequence_length],
66
+ @fastqc_object[:median_sequence_length],
67
+ @fastqc_object[:percent_gc],
68
+ @fastqc_object[:total_duplicate_percentage],
69
+ @fastqc_object[:overall_mean_quality_score],
70
+ @fastqc_object[:overall_median_quality_score],
71
+ @fastqc_object[:overall_n_content],
72
72
  ].join("\t")
73
73
  end
74
74
  end
data/lib/bio/fastqc/io.rb CHANGED
@@ -5,8 +5,8 @@ require 'rdf/turtle'
5
5
  module Bio
6
6
  module FastQC
7
7
  class IO
8
- def initialize(summary_json, id: nil)
9
- @summary_json = summary_json
8
+ def initialize(fastqc_object, id: nil)
9
+ @fastqc_object = fastqc_object
10
10
  @id = id
11
11
  end
12
12
 
@@ -24,17 +24,17 @@ module Bio
24
24
  end
25
25
 
26
26
  def write_json(output_file)
27
- json = Converter.new(@summary_json, id: @id).to_json
27
+ json = Converter.new(@fastqc_object, id: @id).to_json
28
28
  open(output_file, 'w'){|file| file.puts(json) }
29
29
  end
30
30
 
31
31
  def write_jsonld(output_file)
32
- jsonld = Converter.new(@summary_json, id: @id).to_jsonld
32
+ jsonld = Converter.new(@fastqc_object, id: @id).to_jsonld
33
33
  open(output_file, 'w'){|file| file.puts(jsonld) }
34
34
  end
35
35
 
36
36
  def write_ttl(output_file)
37
- semantics = Semantics.new(@summary_json, id: @id)
37
+ semantics = Semantics.new(@fastqc_object, id: @id)
38
38
  graph = semantics.turtle_graph
39
39
  prefixes = semantics.turtle_prefixes
40
40
  RDF::Turtle::Writer.open(output_file, prefixes: prefixes) do |writer|
@@ -43,7 +43,7 @@ module Bio
43
43
  end
44
44
 
45
45
  def write_tsv(output_file)
46
- tsv = Converter.new(@summary_json, id: @id).to_tsv
46
+ tsv = Converter.new(@fastqc_object, id: @id).to_tsv
47
47
  open(output_file, 'w'){|file| file.puts(tsv) }
48
48
  end
49
49
  end
@@ -5,246 +5,230 @@ module Bio
5
5
  class Parser
6
6
  def initialize(fastqc_data_txt)
7
7
  @data = fastqc_data_txt
8
- @object = parse(@data)
9
- @base = self.basic_statistics
8
+ @module_results = parse_modules
9
+ @basic_statistics = basic_statistics
10
10
  end
11
11
 
12
- def parse(data)
13
- modules = data.split(">>END_MODULE\n")
14
- modules.map do |node|
15
- lines = node.split("\n")
16
- rm_header = lines.map do |line|
17
- if line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
18
- line.split("\t")
19
- end
20
- end
21
- rm_header.compact
12
+ def parse_modules
13
+ @data.split(">>END_MODULE\n").map do |mod|
14
+ mod.split("\n").map{|line| line.split("\t") }
22
15
  end
23
16
  end
24
17
 
25
- def fastqc_version
26
- @data.split("\n").first.split("\t").last
27
- end
18
+ #
19
+ # Basic Statistics module
20
+ #
28
21
 
29
22
  def basic_statistics
30
- Hash[*@object.select{|a| a.first.first == ">>Basic Statistics" }.flatten]
23
+ Hash[*@module_results[0].flatten]
31
24
  end
32
25
 
33
- def filename
34
- @base["Filename"]
26
+ def fastqc_version # software version of FastQC
27
+ @basic_statistics["##FastQC"]
35
28
  end
36
29
 
37
- def file_type
38
- @base["File type"]
30
+ def filename # input filename for FastQC program
31
+ @basic_statistics["Filename"]
39
32
  end
40
33
 
41
- def encoding
42
- @base["Encoding"]
34
+ def file_type # input file type
35
+ @basic_statistics["File type"]
43
36
  end
44
37
 
45
- def total_sequences
46
- @base["Total Sequences"].to_i
38
+ def encoding # quality encoding method for input file type
39
+ @basic_statistics["Encoding"]
47
40
  end
48
41
 
49
- def filtered_sequences
50
- @base["Filtered Sequences"].to_i
42
+ def total_sequences # total number of sequence reads
43
+ @basic_statistics["Total Sequences"].to_i
51
44
  end
52
45
 
53
- def sequence_length
54
- @base["Sequence length"]
46
+ def sequences_flagged_as_poor_quality # number of sequence reads flagged as poor quality
47
+ @basic_statistics["Sequences flagged as poor quality"].to_i
55
48
  end
56
49
 
57
- def min_length
58
- l = @base["Sequence length"]
59
- if l =~ /\d-\d/
60
- l.sub(/-\d+$/,"").to_i
61
- else
62
- l.to_i
63
- end
50
+ def filtered_sequences # number of sequence reads filtered out
51
+ @basic_statistics["Filtered Sequences"].to_i
64
52
  end
65
53
 
66
- def max_length
67
- l = @base["Sequence length"]
68
- if l =~ /\d-\d/
69
- l.sub(/^\d+-/,"").to_i
70
- else
71
- l.to_i
72
- end
54
+ def sequence_length # store as string: can be range
55
+ @basic_statistics["Sequence length"]
73
56
  end
74
57
 
75
- def percent_gc
76
- @base["%GC"].to_i
58
+ def percent_gc # overall percentage of GC content
59
+ @basic_statistics["%GC"].to_f
77
60
  end
78
61
 
79
- def per_base_sequence_quality
80
- node = @object.select{|a| a.first.first == ">>Per base sequence quality" }.first
81
- node.select{|n| n.first != ">>Per base sequence quality" } if node
82
- end
62
+ #
63
+ # Other modules
64
+ #
83
65
 
84
- ## Custom module: overall mean base call quality indicator
85
- def overall_mean_quality_score
86
- per_base = self.per_base_sequence_quality
87
- if per_base
88
- v = per_base.map{|c| (10**(c[1].to_f/-10)).to_f }
89
- -10 * Math.log10(v.reduce(:+) / v.size)
90
- end
66
+ def get_module_matrix(module_name, num_of_header_rows)
67
+ mod = @module_results.select{|m| m[0][0] == ">>#{module_name}" }[0]
68
+ mod.drop(num_of_header_rows) if mod
91
69
  end
92
70
 
93
- ## Custom module: overall median base call quality indicator
94
- def overall_median_quality_score
95
- per_base = self.per_base_sequence_quality
96
- if per_base
97
- v = per_base.map{|c| (10**(c[2].to_f/-10)).to_f }
98
- -10 * Math.log10(v.reduce(:+) / v.size)
99
- end
71
+ def per_base_sequence_quality
72
+ get_module_matrix("Per base sequence quality", 1)
100
73
  end
101
74
 
102
75
  def per_tile_sequence_quality
103
- node = @object.select{|a| a.first.first == ">>Per tile sequence quality" }.first
104
- node.select{|n| n.first != ">>Per tile sequence quality" } if node
76
+ get_module_matrix("Per tile sequence quality", 1)
105
77
  end
106
78
 
107
79
  def per_sequence_quality_scores
108
- node = @object.select{|a| a.first.first == ">>Per sequence quality scores" }.first
109
- node.select{|n| n.first != ">>Per sequence quality scores" } if node
80
+ get_module_matrix("Per sequence quality scores", 1)
110
81
  end
111
82
 
112
83
  def per_base_sequence_content
113
- node = @object.select{|a| a.first.first == ">>Per base sequence content" }.first
114
- node.select{|n| n.first != ">>Per base sequence content" } if node
84
+ get_module_matrix("Per base sequence content", 1)
115
85
  end
116
86
 
117
87
  def per_sequence_gc_content
118
- node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
119
- node.select{|n| n.first != ">>Per sequence GC content" } if node
88
+ get_module_matrix("Per sequence GC content", 1)
120
89
  end
121
90
 
122
- def per_sequence_gc_content
123
- node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
124
- node.select{|n| n.first != ">>Per sequence GC content" } if node
91
+ def per_base_n_content
92
+ get_module_matrix("Per base N content", 1)
125
93
  end
126
94
 
127
- def per_base_n_content
128
- node = @object.select{|a| a.first.first == ">>Per base N content" }.first
129
- node.select{|n| n.first != ">>Per base N content" } if node
95
+ def sequence_length_distribution
96
+ get_module_matrix("Sequence Length Distribution", 1)
130
97
  end
131
98
 
132
- ## Custom module: overall N content
133
- def overall_n_content
134
- per_base = self.per_base_n_content
135
- if per_base
136
- v = per_base.map{|c| c[1].to_f }
137
- v.reduce(:+) / v.size
138
- end
99
+ def total_duplicate_percentage
100
+ get_module_matrix("Sequence Duplication Levels", 1)[0][1].to_f
139
101
  end
140
102
 
141
- def sequence_length_distribution
142
- node = @object.select{|a| a.first.first == ">>Sequence Length Distribution" }.first
143
- node.select{|n| n.first != ">>Sequence Length Distribution" } if node
103
+ def sequence_duplication_levels
104
+ get_module_matrix("Sequence Duplication Levels", 2)
144
105
  end
145
106
 
146
- ## Custom module: mean sequence length calculated from distribution
147
- def mean_sequence_length
148
- distribution = self.sequence_length_distribution
149
- if distribution
150
- sum = distribution.map do |length_count|
151
- length = length_count[0]
152
- count = length_count[1].to_f
153
- if length =~ /\d-\d/
154
- f = length.sub(/-\d+$/,"").to_i
155
- b = length.sub(/^\d+-/,"").to_i
156
- mean = (f + b) / 2
157
- mean * count
158
- else
159
- length.to_i * count
160
- end
161
- end
162
- sum.reduce(:+) / self.total_sequences
107
+ def overrepresented_sequences
108
+ get_module_matrix("Overrepresented sequences", 1)
109
+ end
110
+
111
+ def adapter_content
112
+ get_module_matrix("Adapter Content", 1)
113
+ end
114
+
115
+ def kmer_content
116
+ get_module_matrix("Kmer Content", 1)
117
+ end
118
+
119
+ #
120
+ # Custom modules
121
+ #
122
+
123
+ def min_length
124
+ sequence_length.sub(/-\d+$/,"").to_i
125
+ end
126
+
127
+ def max_length
128
+ sequence_length.sub(/^\d+-/,"").to_i
129
+ end
130
+
131
+ def per_base_quality_column(mean_or_median)
132
+ case mean_or_median
133
+ when :mean
134
+ 1
135
+ when :median
136
+ 2
163
137
  end
164
138
  end
165
139
 
166
- ## Custom module: median sequence length calculated from distribution
167
- def median_sequence_length
168
- distribution = self.sequence_length_distribution
169
- if distribution
170
- array = distribution.map do |length_count|
171
- length = length_count[0]
172
- count = length_count[1].to_i
173
- if length =~ /\d-\d/
174
- f = length.sub(/-\d+$/,"").to_i
175
- b = length.sub(/^\d+-/,"").to_i
176
- mean = (f + b) / 2
177
- [mean.to_f] * count
178
- else
179
- [length.to_f] * count
180
- end
181
- end
182
- sorted = array.flatten.sort
183
- quot = sorted.size / 2
184
- if !sorted.size.even?
185
- sorted[quot]
186
- else
187
- f = sorted[quot]
188
- b = sorted[quot - 1]
189
- (f + b) / 2
190
- end
140
+ def overall_quality_score(mean_or_median)
141
+ per_base = per_base_sequence_quality.drop(1) # drop header
142
+ column = per_base_quality_column(mean_or_median)
143
+ v = per_base.map do |row|
144
+ (10**(row[column].to_f / -10)).to_f
191
145
  end
146
+ -10 * Math.log10(v.reduce(:+) / v.size)
192
147
  end
193
148
 
194
- def sequence_duplication_levels
195
- node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
196
- node.select{|n| n.first != ">>Sequence Duplication Levels" && n.first != "\#Total Duplicate Percentage" } if node
149
+ def overall_mean_quality_score
150
+ overall_quality_score(:mean)
197
151
  end
198
152
 
199
- def total_duplicate_percentage
200
- node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
201
- node.select{|n| n.first == "\#Total Duplicate Percentage" }.flatten[1].to_f if node
153
+ def overall_median_quality_score
154
+ overall_quality_score(:median)
202
155
  end
203
156
 
204
- def overrepresented_sequences
205
- node = @object.select{|a| a.first.first == ">>Overrepresented sequences" }.first
206
- node.select{|n| n.first != ">>Overrepresented sequences" } if node
157
+ def overall_n_content
158
+ per_base = per_base_n_content
159
+ v = per_base.map{|c| c[1].to_f }
160
+ v.reduce(:+) / v.size
207
161
  end
208
162
 
209
- def adapter_content
210
- node = @object.select{|a| a.first.first == ">>Adapter Content" }.first
211
- node.select{|n| n.first != ">>Adapter Content" } if node
163
+ def mean_sequence_length
164
+ dist = sequence_length_distribution.drop(1) # drop column header
165
+ if dist.size == 1
166
+ dist[0][0].to_f
167
+ else
168
+ sum = dist.map do |length_count|
169
+ l = length_count[0]
170
+ c = length_count[1].to_f
171
+ ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2) * c
172
+ end
173
+ sum.reduce(:+) / dist.map{|l_c| l_c[1].to_f }.reduce(:+)
174
+ end
212
175
  end
213
176
 
214
- def kmer_content
215
- node = @object.select{|a| a.first.first == ">>Kmer Content" }.first
216
- node.select{|n| n.first != ">>Kmer Content" } if node
177
+ def median_sequence_length
178
+ dist = sequence_length_distribution.drop(1) # drop column header
179
+ if dist.size == 1
180
+ dist[0][0].to_f
181
+ else
182
+ k = dist.map{|l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
183
+ median = 0
184
+ dist.each do |l_c|
185
+ c = l_c[1].to_f # count of reads in this length range
186
+ if k > c
187
+ k -= c
188
+ else
189
+ l = l_c[0]
190
+ median = ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2)
191
+ break
192
+ end
193
+ end
194
+ median
195
+ end
217
196
  end
218
197
 
219
198
  def summary
199
+ parse
200
+ end
201
+
202
+ def parse
220
203
  {
221
- fastqc_version: self.fastqc_version,
222
- filename: self.filename,
223
- file_type: self.file_type,
224
- encoding: self.encoding,
225
- total_sequences: self.total_sequences,
226
- filtered_sequences: self.filtered_sequences,
227
- sequence_length: self.sequence_length,
228
- percent_gc: self.percent_gc,
229
- per_base_sequence_quality: self.per_base_sequence_quality,
230
- per_tile_sequence_quality: self.per_tile_sequence_quality,
231
- per_sequence_quality_scores: self.per_sequence_quality_scores,
232
- per_base_sequence_content: self.per_base_sequence_content,
233
- per_sequence_gc_content: self.per_sequence_gc_content,
234
- per_base_n_content: self.per_base_n_content,
235
- sequence_length_distribution: self.sequence_length_distribution,
236
- total_duplicate_percentage: self.total_duplicate_percentage,
237
- sequence_duplication_levels: self.sequence_duplication_levels,
238
- overrepresented_sequences: self.overrepresented_sequences,
239
- adapter_content: self.adapter_content,
240
- kmer_content: self.kmer_content,
241
- min_length: self.min_length,
242
- max_length: self.max_length,
243
- overall_mean_quality_score: self.overall_mean_quality_score,
244
- overall_median_quality_score: self.overall_median_quality_score,
245
- overall_n_content: self.overall_n_content,
246
- mean_sequence_length: self.mean_sequence_length,
247
- median_sequence_length: self.median_sequence_length,
204
+ fastqc_version: fastqc_version,
205
+ filename: filename,
206
+ file_type: file_type,
207
+ encoding: encoding,
208
+ total_sequences: total_sequences,
209
+ sequences_flagged_as_poor_quality: sequences_flagged_as_poor_quality,
210
+ filtered_sequences: filtered_sequences,
211
+ sequence_length: sequence_length,
212
+ percent_gc: percent_gc,
213
+ per_base_sequence_quality: per_base_sequence_quality,
214
+ per_tile_sequence_quality: per_tile_sequence_quality,
215
+ per_sequence_quality_scores: per_sequence_quality_scores,
216
+ per_base_sequence_content: per_base_sequence_content,
217
+ per_sequence_gc_content: per_sequence_gc_content,
218
+ per_base_n_content: per_base_n_content,
219
+ sequence_length_distribution: sequence_length_distribution,
220
+ total_duplicate_percentage: total_duplicate_percentage,
221
+ sequence_duplication_levels: sequence_duplication_levels,
222
+ overrepresented_sequences: overrepresented_sequences,
223
+ adapter_content: adapter_content,
224
+ kmer_content: kmer_content,
225
+ min_length: min_length,
226
+ max_length: max_length,
227
+ overall_mean_quality_score: overall_mean_quality_score,
228
+ overall_median_quality_score: overall_median_quality_score,
229
+ overall_n_content: overall_n_content,
230
+ mean_sequence_length: mean_sequence_length,
231
+ median_sequence_length: median_sequence_length,
248
232
  }
249
233
  end
250
234
  end
@@ -6,9 +6,13 @@ require 'rdf/turtle'
6
6
  module Bio
7
7
  module FastQC
8
8
  class Semantics
9
- def initialize(summary_json, id: nil)
9
+ def initialize(fastqc_object, id: nil)
10
10
  @id = id
11
- @summary = summary_json
11
+ @fastqc_object = fastqc_object
12
+ end
13
+
14
+ def rdf_version
15
+ "0.1.0"
12
16
  end
13
17
 
14
18
  def turtle
@@ -23,6 +27,9 @@ module Bio
23
27
  {
24
28
  "uo" => "http://purl.obolibrary.org/obo/",
25
29
  "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
30
+ "dcterms" => "http://purl.org/dc/terms/",
31
+ "pav" => "http://purl.org/pav/",
32
+ "foaf" => "http://xmlns.com/foaf/0.1/",
26
33
  }
27
34
  end
28
35
 
@@ -32,18 +39,30 @@ module Bio
32
39
  object
33
40
  end
34
41
 
35
- def identifier
36
- if @id
37
- @id
38
- else
39
- "http://me.com/data/QNT" + @summary[:filename].split(".").first
40
- end
42
+ def uri_base
43
+ "http://purl.jp/bio/01/quanto"
44
+ end
45
+
46
+ def identifier_literal
47
+ @id ? @id : "QNT" + @fastqc_object[:filename].split(".")[0]
48
+ end
49
+
50
+ def identifier_uri
51
+ uri_base + "/resource/" + identifier_literal
41
52
  end
42
53
 
43
54
  def object_core
44
55
  {
45
56
  "@context" => jsonld_context,
46
- "@id" => identifier,
57
+ "@id" => identifier_uri,
58
+ "@type" => "SequenceStatisticsReport",
59
+ "dcterms:identifier" => identifier_literal,
60
+ "dcterms:contributor" => ["Tazro Ohta", "Shuichi Kawashima"],
61
+ "dcterms:created" => Time.now.strftime("%Y-%m-%d"),
62
+ "dcterms:license" => "http://creativecommons.org/licenses/by-sa/2.1/jp/deed.en",
63
+ "dcterms:publisher" => "http://dbcls.rois.ac.jp/",
64
+ "pav:version" => rdf_version,
65
+ "foaf:page" => "http://quanto.dbcls.jp",
47
66
  }
48
67
  end
49
68
 
@@ -94,24 +113,26 @@ module Bio
94
113
  end
95
114
 
96
115
  def fastqc_version
97
- {}
116
+ {
117
+ "fastqcVersion" => @fastqc_object[:fastqc_version],
118
+ }
98
119
  end
99
120
 
100
121
  def filename
101
122
  {
102
- "filename" => @summary[:filename],
123
+ "filename" => @fastqc_object[:filename],
103
124
  }
104
125
  end
105
126
 
106
127
  def file_type
107
128
  {
108
- "fileType" => @summary[:file_type],
129
+ "fileType" => @fastqc_object[:file_type],
109
130
  }
110
131
  end
111
132
 
112
133
  def encoding
113
134
  {
114
- "encoding" => @summary[:encoding],
135
+ "encoding" => @fastqc_object[:encoding],
115
136
  }
116
137
  end
117
138
 
@@ -120,7 +141,7 @@ module Bio
120
141
  "totalSequences" => {
121
142
  "@type" => "SequenceReadContent",
122
143
  "hasUnit" => "uo:CountUnit",
123
- "rdf:value" => @summary[:total_sequences],
144
+ "rdf:value" => @fastqc_object[:total_sequences],
124
145
  }
125
146
  }
126
147
  end
@@ -130,7 +151,7 @@ module Bio
130
151
  "filteredSequences" => {
131
152
  "@type" => "SequenceReadContent",
132
153
  "hasUnit" => "uo:CountUnit",
133
- "rdf:value" => @summary[:filtered_sequences],
154
+ "rdf:value" => @fastqc_object[:filtered_sequences],
134
155
  }
135
156
  }
136
157
  end
@@ -140,7 +161,7 @@ module Bio
140
161
  "sequenceLength" => {
141
162
  "@type" => "SequenceReadLength",
142
163
  "hasUnit" => "uo:CountUnit",
143
- "rdf:value" => @summary[:sequence_length],
164
+ "rdf:value" => @fastqc_object[:sequence_length],
144
165
  }
145
166
  }
146
167
  end
@@ -150,7 +171,7 @@ module Bio
150
171
  "percentGC" => {
151
172
  "@type" => "NucleotideBaseContent",
152
173
  "hasUnit" => "uo:CountUnit",
153
- "rdf:value" => @summary[:percent_gc],
174
+ "rdf:value" => @fastqc_object[:percent_gc],
154
175
  }
155
176
  }
156
177
  end
@@ -158,7 +179,7 @@ module Bio
158
179
  def per_base_sequence_quality
159
180
  {
160
181
  "@type" => "PerBaseSequenceQuality",
161
- "hasRow" => per_base_sequence_quality_rows(@summary[:per_base_sequence_quality]),
182
+ "hasRow" => per_base_sequence_quality_rows(@fastqc_object[:per_base_sequence_quality]),
162
183
  }
163
184
  end
164
185
 
@@ -220,7 +241,7 @@ module Bio
220
241
  def per_sequence_quality_scores
221
242
  {
222
243
  "@type" => "PerSequnceQualityScores",
223
- "hasRow" => per_sequence_quality_scores_rows(@summary[:per_sequence_quality_scores]),
244
+ "hasRow" => per_sequence_quality_scores_rows(@fastqc_object[:per_sequence_quality_scores]),
224
245
  }
225
246
  end
226
247
 
@@ -248,7 +269,7 @@ module Bio
248
269
  def per_base_sequence_content
249
270
  {
250
271
  "@type" => "PerBaseSequenceContent",
251
- "hasRow" => per_base_sequence_content_rows(@summary[:per_base_sequence_content]),
272
+ "hasRow" => per_base_sequence_content_rows(@fastqc_object[:per_base_sequence_content]),
252
273
  }
253
274
  end
254
275
 
@@ -293,7 +314,7 @@ module Bio
293
314
  def per_sequence_gc_content
294
315
  {
295
316
  "@type" => "PerSequenceGCContent",
296
- "hasRow" => per_sequence_gc_content_rows(@summary[:per_sequence_gc_content]),
317
+ "hasRow" => per_sequence_gc_content_rows(@fastqc_object[:per_sequence_gc_content]),
297
318
  }
298
319
  end
299
320
 
@@ -321,7 +342,7 @@ module Bio
321
342
  def per_base_n_content
322
343
  {
323
344
  "@type" => "PerBaseNContent",
324
- "hasRow" => per_base_n_content_rows(@summary[:per_base_n_content]),
345
+ "hasRow" => per_base_n_content_rows(@fastqc_object[:per_base_n_content]),
325
346
  }
326
347
  end
327
348
 
@@ -348,7 +369,7 @@ module Bio
348
369
  def sequence_length_distribution
349
370
  {
350
371
  "@type" => "SequenceLengthDistribution",
351
- "hasRow" => sequence_length_distribution_rows(@summary[:sequence_length_distribution]),
372
+ "hasRow" => sequence_length_distribution_rows(@fastqc_object[:sequence_length_distribution]),
352
373
  }
353
374
  end
354
375
 
@@ -381,7 +402,7 @@ module Bio
381
402
  def sequence_duplication_levels
382
403
  {
383
404
  "@type" => "SequenceDuplicationLevels",
384
- "hasRow" => sequence_duplication_levels_rows(@summary[:sequence_duplication_levels]),
405
+ "hasRow" => sequence_duplication_levels_rows(@fastqc_object[:sequence_duplication_levels]),
385
406
  }
386
407
  end
387
408
 
@@ -410,7 +431,7 @@ module Bio
410
431
  def overrepresented_sequences
411
432
  {
412
433
  "@type" => "OverrepresentedSequences",
413
- "hasRow" => overrepresented_sequences_rows(@summary[:overrepresented_sequences]),
434
+ "hasRow" => overrepresented_sequences_rows(@fastqc_object[:overrepresented_sequences]),
414
435
  }
415
436
  end
416
437
 
@@ -446,7 +467,7 @@ module Bio
446
467
  def kmer_content
447
468
  {
448
469
  "@type" => "KmerContent",
449
- "hasRow" => kmer_content_rows(@summary[:kmer_content]),
470
+ "hasRow" => kmer_content_rows(@fastqc_object[:kmer_content]),
450
471
  }
451
472
  end
452
473
 
@@ -486,7 +507,7 @@ module Bio
486
507
  "minSequenceLength" => {
487
508
  "@type" => "SequenceReadLength",
488
509
  "hasUnit" => "uo:CountUnit",
489
- "rdf:value" => @summary[:min_length],
510
+ "rdf:value" => @fastqc_object[:min_length],
490
511
  }
491
512
  }
492
513
  end
@@ -496,7 +517,7 @@ module Bio
496
517
  "maxSequenceLength" => {
497
518
  "@type" => "SequenceReadLength",
498
519
  "hasUnit" => "uo:CountUnit",
499
- "rdf:value" => @summary[:max_length],
520
+ "rdf:value" => @fastqc_object[:max_length],
500
521
  }
501
522
  }
502
523
  end
@@ -506,7 +527,7 @@ module Bio
506
527
  "meanSequenceLength" => {
507
528
  "@type" => "SequenceReadLength",
508
529
  "hasUnit" => "uo:CountUnit",
509
- "rdf:value" => @summary[:mean_sequence_length],
530
+ "rdf:value" => @fastqc_object[:mean_sequence_length],
510
531
  }
511
532
  }
512
533
  end
@@ -516,7 +537,7 @@ module Bio
516
537
  "medianSequenceLength" => {
517
538
  "@type" => "SequenceReadLength",
518
539
  "hasUnit" => "uo:CountUnit",
519
- "rdf:value" => @summary[:median_sequence_length],
540
+ "rdf:value" => @fastqc_object[:median_sequence_length],
520
541
  }
521
542
  }
522
543
  end
@@ -526,7 +547,7 @@ module Bio
526
547
  "overallMeanBaseCallQuality" => {
527
548
  "@type" => "PhredQualityScore",
528
549
  "hasUnit" => "uo:CountUnit",
529
- "rdf:value" => @summary[:overall_mean_quality_score],
550
+ "rdf:value" => @fastqc_object[:overall_mean_quality_score],
530
551
  }
531
552
  }
532
553
  end
@@ -536,7 +557,7 @@ module Bio
536
557
  "overallMedianBaseCallQuality" => {
537
558
  "@type" => "PhredQualityScore",
538
559
  "hasUnit" => "uo:CountUnit",
539
- "rdf:value" => @summary[:overall_median_quality_score],
560
+ "rdf:value" => @fastqc_object[:overall_median_quality_score],
540
561
  }
541
562
  }
542
563
  end
@@ -546,7 +567,7 @@ module Bio
546
567
  "overallNContent" => {
547
568
  "@type" => "NContent",
548
569
  "hasUnit" => "uo:Percentage",
549
- "rdf:value" => @summary[:overall_n_content],
570
+ "rdf:value" => @fastqc_object[:overall_n_content],
550
571
  }
551
572
  }
552
573
  end
@@ -557,10 +578,10 @@ module Bio
557
578
 
558
579
  def jsonld_context
559
580
  # definition of imported terms in @context
560
- object = imported_keywords
581
+ object = turtle_prefixes
561
582
 
562
583
  # definition of local ontology terms
563
- domain = "http://me.com/sos#"
584
+ domain = uri_base + "/ontology/sos#"
564
585
 
565
586
  # definition of class in @context
566
587
  sos_class.each do |term|
@@ -597,13 +618,6 @@ module Bio
597
618
  object
598
619
  end
599
620
 
600
- def imported_keywords
601
- {
602
- "uo" => "http://purl.obolibrary.org/obo/",
603
- "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
604
- }
605
- end
606
-
607
621
  #
608
622
  # definition of classes
609
623
  #
@@ -690,6 +704,7 @@ module Bio
690
704
 
691
705
  def sos_data_properties_string
692
706
  [
707
+ "fastqcVersion",
693
708
  "filename",
694
709
  "fileType",
695
710
  "encoding",
@@ -14,6 +14,7 @@ describe Bio::FastQC do
14
14
  describe '#read' do
15
15
  it 'returns parsed data from zipfile' do
16
16
  expect(@data).not_to be_empty
17
+ expect(@data).not_to be_nil
17
18
  end
18
19
  end
19
20
  end
@@ -25,55 +26,112 @@ describe Bio::FastQC do
25
26
  end
26
27
 
27
28
  describe '#fastqc_version' do
28
- it 'returns fastqc version as String and not empty' do
29
+ it 'returns fastqc version as String' do
29
30
  expect(@parser.fastqc_version).to be_instance_of(String)
31
+ end
32
+
33
+ it 'does not return empty string' do
30
34
  expect(@parser.fastqc_version).not_to be_empty
31
35
  end
36
+
37
+ it 'does not return nil' do
38
+ expect(@parser.fastqc_version).not_to be_nil
39
+ end
32
40
  end
33
41
 
34
42
  describe '#filename' do
35
- it 'returns filename as String and not empty' do
43
+ it 'returns filename as String' do
36
44
  expect(@parser.filename).to be_instance_of(String)
45
+ end
46
+
47
+ it 'does not return empty string' do
37
48
  expect(@parser.filename).not_to be_empty
38
49
  end
50
+
51
+ it 'does not return nil' do
52
+ expect(@parser.filename).not_to be_nil
53
+ end
39
54
  end
40
55
 
41
56
  describe '#file_type' do
42
- it 'returns file type as String and not empty' do
57
+ it 'returns file type as String' do
43
58
  expect(@parser.file_type).to be_instance_of(String)
59
+ end
60
+
61
+ it 'does not return empty string' do
44
62
  expect(@parser.file_type).not_to be_empty
45
63
  end
64
+
65
+ it 'does not return nil' do
66
+ expect(@parser.file_type).not_to be_nil
67
+ end
46
68
  end
47
69
 
48
70
  describe '#encoding' do
49
- it 'returns encoding type as String and not empty' do
71
+ it 'returns encoding type as String' do
50
72
  expect(@parser.encoding).to be_instance_of(String)
73
+ end
74
+
75
+ it 'does not return empty string' do
51
76
  expect(@parser.encoding).not_to be_empty
52
77
  end
78
+
79
+ it 'does not return nil' do
80
+ expect(@parser.encoding).not_to be_nil
81
+ end
53
82
  end
54
83
 
55
84
  describe '#total_sequences' do
56
85
  it 'returns total number of sequences as Fixnum' do
57
86
  expect(@parser.total_sequences).to be_instance_of(Fixnum)
58
87
  end
88
+
89
+ it 'returns integer larger than zero' do
90
+ expect(@parser.total_sequences).to be > 0
91
+ end
92
+
93
+ it 'does not return nil' do
94
+ expect(@parser.total_sequences).not_to be_nil
95
+ end
59
96
  end
60
97
 
61
98
  describe '#filtered_sequences' do
62
- it 'returns number of filtered sequence as Fixnum and not empty' do
63
- expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
99
+ it 'returns number of filtered sequence as Fixnum, can be nil' do
100
+ if @parser.filtered_sequences
101
+ expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
102
+ end
103
+ end
104
+ end
105
+
106
+ describe '#sequences_flagged_as_poor_quality' do
107
+ it 'returns number of sequences flagged as poor quality as Fixnum, can be nil' do
108
+ if @parser.sequences_flagged_as_poor_quality
109
+ expect(@parser.sequences_flagged_as_poor_quality).to be_instance_of(Fixnum)
110
+ end
64
111
  end
65
112
  end
66
113
 
67
114
  describe '#sequence_length' do
68
- it 'returns length of sequence as String and not empty' do
115
+ it 'returns length of sequence as String' do
69
116
  expect(@parser.sequence_length).to be_instance_of(String)
117
+ end
118
+
119
+ it 'does not return empty string' do
70
120
  expect(@parser.sequence_length).not_to be_empty
71
121
  end
122
+
123
+ it 'does not return nil' do
124
+ expect(@parser.sequence_length).not_to be_nil
125
+ end
72
126
  end
73
127
 
74
128
  describe '#percent_gc' do
75
- it 'returns percentage of GC content as Fixnum and not empty' do
76
- expect(@parser.percent_gc).to be_instance_of(Fixnum)
129
+ it 'returns percentage of GC content as Float' do
130
+ expect(@parser.percent_gc).to be_instance_of(Float)
131
+ end
132
+
133
+ it 'does not return nil' do
134
+ expect(@parser.percent_gc).not_to be_nil
77
135
  end
78
136
  end
79
137
 
@@ -190,6 +248,10 @@ describe Bio::FastQC do
190
248
  it 'returns duplicate percentage as Float and not empty' do
191
249
  expect(@parser.total_duplicate_percentage).to be_instance_of(Float)
192
250
  end
251
+
252
+ it 'does not return nil' do
253
+ expect(@parser.total_duplicate_percentage).not_to be_nil
254
+ end
193
255
  end
194
256
 
195
257
  describe '#sequence_duplication_levels' do
@@ -256,42 +318,88 @@ describe Bio::FastQC do
256
318
  it 'returns minimum read length as Fixnum and not empty' do
257
319
  expect(@parser.min_length).to be_instance_of(Fixnum)
258
320
  end
321
+
322
+ it 'returns integer larger than zero' do
323
+ expect(@parser.min_length).to be > 0
324
+ end
325
+
326
+ it 'does not return nil' do
327
+ expect(@parser.min_length).not_to be_nil
328
+ end
259
329
  end
260
330
 
261
331
  describe '#max_length' do
262
332
  it 'returns maximum read length as Fixnum and not empty' do
263
333
  expect(@parser.max_length).to be_instance_of(Fixnum)
264
334
  end
335
+
336
+ it 'returns integer larger than zero' do
337
+ expect(@parser.max_length).to be > 0
338
+ end
339
+
340
+ it 'does not return nil' do
341
+ expect(@parser.max_length).not_to be_nil
342
+ end
265
343
  end
266
344
 
267
345
  describe '#overall_mean_quality_score' do
268
346
  it 'returns overall mean quality score as Float and not empty' do
269
347
  expect(@parser.overall_mean_quality_score).to be_instance_of(Float)
270
348
  end
349
+
350
+ it 'does not return nil' do
351
+ expect(@parser.overall_mean_quality_score).not_to be_nil
352
+ end
271
353
  end
272
354
 
273
355
  describe '#overall_median_quality_score' do
274
356
  it 'returns overall median quality score as Float and not empty' do
275
357
  expect(@parser.overall_median_quality_score).to be_instance_of(Float)
276
358
  end
359
+
360
+ it 'does not return nil' do
361
+ expect(@parser.overall_median_quality_score).not_to be_nil
362
+ end
277
363
  end
278
364
 
279
365
  describe '#overall_n_content' do
280
366
  it 'returns overall N content as Float and not empty' do
281
367
  expect(@parser.overall_n_content).to be_instance_of(Float)
282
368
  end
369
+
370
+ it 'does not return nil' do
371
+ expect(@parser.overall_n_content).not_to be_nil
372
+ end
283
373
  end
284
374
 
285
375
  describe '#mean_sequence_length' do
286
376
  it 'returns mean sequence length from read length distribution as Float and not empty' do
287
377
  expect(@parser.mean_sequence_length).to be_instance_of(Float)
288
378
  end
379
+
380
+ it 'does not return nil' do
381
+ expect(@parser.mean_sequence_length).not_to be_nil
382
+ end
289
383
  end
290
384
 
291
385
  describe '#median_sequence_length' do
292
386
  it 'returns median sequence length from read length distribution as Float and not empty' do
293
387
  expect(@parser.median_sequence_length).to be_instance_of(Float)
294
388
  end
389
+
390
+ it 'does not return nil' do
391
+ expect(@parser.median_sequence_length).not_to be_nil
392
+ end
393
+ end
394
+
395
+ describe '#parse' do
396
+ it 'does not return nil' do
397
+ expect(@parser.parse).not_to be_nil
398
+ end
399
+
400
+ it 'returns hash' do
401
+ expect(@parser.parse).to be_instance_of(Hash)
402
+ end
295
403
  end
296
404
  end
297
405
  end
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-fastqc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tazro Inutano Ohta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -229,6 +229,7 @@ files:
229
229
  - lib/bio/fastqc/semantics.rb
230
230
  - spec/bio-fastqc_spec.rb
231
231
  - spec/example_fastqc.zip
232
+ - spec/example_fastqc_454.zip
232
233
  - spec/spec_helper.rb
233
234
  homepage: http://github.com/inutano/bioruby-fastqc
234
235
  licenses: