RubyGems - bio-fastqc - Versions diffs - 0.5.2 → 0.6.0 - Mend

bio-fastqc 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/VERSION +1 -1
data/lib/bio/fastqc/converter.rb +23 -23
data/lib/bio/fastqc/io.rb +6 -6
data/lib/bio/fastqc/parser.rb +154 -170
data/lib/bio/fastqc/semantics.rb +57 -42
data/spec/bio-fastqc_spec.rb +117 -9
data/spec/example_fastqc_454.zip +0 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 24e302a0001de21bb4e1ad93ef748b397a5758c9
-  data.tar.gz: a3fecf79186d42870c47aa5f03595dc62d5c15fc
+  metadata.gz: f1a0b30382d2c41b0fc5327cc1e18ce63b7f190e
+  data.tar.gz: 479c6f71276f0360f15cc3286bafe8f12b7404d2
 SHA512:
-  metadata.gz: 93e3dff6270cd274089ac8cc0598ec66d04e5718ebd26f44ee2cfc01b19b7625f5d1a0bb40b81c18406fa91901f7975c2aab0dbea034782b073eae331cf185ab
-  data.tar.gz: 2441701ea9d0761bf2f9aac4ae5b9cdb7ab3d3f55270630d4ba9b1158f1dd26107b8812e8c97282541bf0f904df9e8f460047c97bf5c7a61ffe0b80c0ed40381
+  metadata.gz: 56c3e7b739a99e6ee39ffc958a8f59f61438ebfd44a7dd3ae9dcb1ea0ea481fb40c927673f6f8ae052e4422cfb4b8e38b6da542ace153b24d7e4f439284f68b3
+  data.tar.gz: 41ef0bc02eb028d9f4de4661cd4cc67159159d1e17ff4f5c06d8779ee3b0ddaf1b348f6aea1d06222873443820aba50133cb6732cfe2a8f875e87c6f12f2573f

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.5.2
+0.6.0

data/lib/bio/fastqc/converter.rb CHANGED Viewed

@@ -3,9 +3,9 @@
 module Bio
   module FastQC
     class Converter
-      def initialize(summary_json, id: nil)
+      def initialize(fastqc_object, id: nil)
         @id = id
-        @summary_json = summary_json
+        @fastqc_object = fastqc_object
       end
       def convert_to(format)
@@ -23,20 +23,20 @@ module Bio
       def to_json
         json = if @id
-                 { @id => @summary_json }
+                 { @id => @fastqc_object }
                else
-                 @summary_json
+                 @fastqc_object
                end
         JSON.dump(json)
       end
       def to_jsonld
-        json_ld_object = Semantics.new(@summary_json, id: @id).json_ld_object
+        json_ld_object = Semantics.new(@fastqc_object, id: @id).json_ld_object
         JSON.dump(json_ld_object)
       end
       def to_turtle
-        Semantics.new(@summary_json, id: @id).turtle
+        Semantics.new(@fastqc_object, id: @id).turtle
       end
       def to_ttl
@@ -47,28 +47,28 @@ module Bio
         identifier = if @id
                        @id
                      else
-                       @summary_json[:filename].split(".").first
+                       @fastqc_object[:filename].split(".").first
                      end
         # return one-line tab separated value
         [
           identifier,
-          @summary_json[:fastqc_version],
-          @summary_json[:filename],
-          @summary_json[:file_type],
-          @summary_json[:encoding],
-          @summary_json[:total_sequences],
-          @summary_json[:filtered_sequences],
-          @summary_json[:sequence_length],
-          @summary_json[:min_length],
-          @summary_json[:max_length],
-          @summary_json[:mean_sequence_length],
-          @summary_json[:median_sequence_length],
-          @summary_json[:percent_gc],
-          @summary_json[:total_duplicate_percentage],
-          @summary_json[:overall_mean_quality_score],
-          @summary_json[:overall_median_quality_score],
-          @summary_json[:overall_n_content],
+          @fastqc_object[:fastqc_version],
+          @fastqc_object[:filename],
+          @fastqc_object[:file_type],
+          @fastqc_object[:encoding],
+          @fastqc_object[:total_sequences],
+          @fastqc_object[:filtered_sequences],
+          @fastqc_object[:sequence_length],
+          @fastqc_object[:min_length],
+          @fastqc_object[:max_length],
+          @fastqc_object[:mean_sequence_length],
+          @fastqc_object[:median_sequence_length],
+          @fastqc_object[:percent_gc],
+          @fastqc_object[:total_duplicate_percentage],
+          @fastqc_object[:overall_mean_quality_score],
+          @fastqc_object[:overall_median_quality_score],
+          @fastqc_object[:overall_n_content],
         ].join("\t")
       end
     end

data/lib/bio/fastqc/io.rb CHANGED Viewed

@@ -5,8 +5,8 @@ require 'rdf/turtle'
 module Bio
   module FastQC
     class IO
-      def initialize(summary_json, id: nil)
-        @summary_json = summary_json
+      def initialize(fastqc_object, id: nil)
+        @fastqc_object = fastqc_object
         @id = id
       end
@@ -24,17 +24,17 @@ module Bio
       end
       def write_json(output_file)
-        json = Converter.new(@summary_json, id: @id).to_json
+        json = Converter.new(@fastqc_object, id: @id).to_json
         open(output_file, 'w'){|file| file.puts(json) }
       end
       def write_jsonld(output_file)
-        jsonld = Converter.new(@summary_json, id: @id).to_jsonld
+        jsonld = Converter.new(@fastqc_object, id: @id).to_jsonld
         open(output_file, 'w'){|file| file.puts(jsonld) }
       end
       def write_ttl(output_file)
-        semantics = Semantics.new(@summary_json, id: @id)
+        semantics = Semantics.new(@fastqc_object, id: @id)
         graph = semantics.turtle_graph
         prefixes = semantics.turtle_prefixes
         RDF::Turtle::Writer.open(output_file, prefixes: prefixes) do |writer|
@@ -43,7 +43,7 @@ module Bio
       end
       def write_tsv(output_file)
-        tsv = Converter.new(@summary_json, id: @id).to_tsv
+        tsv = Converter.new(@fastqc_object, id: @id).to_tsv
         open(output_file, 'w'){|file| file.puts(tsv) }
       end
     end

data/lib/bio/fastqc/parser.rb CHANGED Viewed

@@ -5,246 +5,230 @@ module Bio
     class Parser
       def initialize(fastqc_data_txt)
         @data = fastqc_data_txt
-        @object = parse(@data)
-        @base = self.basic_statistics
+        @module_results = parse_modules
+        @basic_statistics = basic_statistics
       end
-      def parse(data)
-        modules = data.split(">>END_MODULE\n")
-        modules.map do |node|
-          lines = node.split("\n")
-          rm_header = lines.map do |line|
-            if line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
-              line.split("\t")
-            end
-          end
-          rm_header.compact
+      def parse_modules
+        @data.split(">>END_MODULE\n").map do |mod|
+          mod.split("\n").map{|line| line.split("\t") }
         end
       end
-      def fastqc_version
-        @data.split("\n").first.split("\t").last
-      end
+      #
+      # Basic Statistics module
+      #
       def basic_statistics
-        Hash[*@object.select{|a| a.first.first == ">>Basic Statistics" }.flatten]
+        Hash[*@module_results[0].flatten]
       end
-      def filename
-        @base["Filename"]
+      def fastqc_version # software version of FastQC
+        @basic_statistics["##FastQC"]
       end
-      def file_type
-        @base["File type"]
+      def filename # input filename for FastQC program
+        @basic_statistics["Filename"]
       end
-      def encoding
-        @base["Encoding"]
+      def file_type # input file type
+        @basic_statistics["File type"]
       end
-      def total_sequences
-        @base["Total Sequences"].to_i
+      def encoding # quality encoding method for input file type
+        @basic_statistics["Encoding"]
       end
-      def filtered_sequences
-        @base["Filtered Sequences"].to_i
+      def total_sequences # total number of sequence reads
+        @basic_statistics["Total Sequences"].to_i
       end
-      def sequence_length
-        @base["Sequence length"]
+      def sequences_flagged_as_poor_quality # number of sequence reads flagged as poor quality
+        @basic_statistics["Sequences flagged as poor quality"].to_i
       end
-      def min_length
-        l = @base["Sequence length"]
-        if l =~ /\d-\d/
-          l.sub(/-\d+$/,"").to_i
-        else
-          l.to_i
-        end
+      def filtered_sequences # number of sequence reads filtered out
+        @basic_statistics["Filtered Sequences"].to_i
       end
-      def max_length
-        l = @base["Sequence length"]
-        if l =~ /\d-\d/
-          l.sub(/^\d+-/,"").to_i
-        else
-          l.to_i
-        end
+      def sequence_length # store as string: can be range
+        @basic_statistics["Sequence length"]
       end
-      def percent_gc
-        @base["%GC"].to_i
+      def percent_gc # overall percentage of GC content
+        @basic_statistics["%GC"].to_f
       end
-      def per_base_sequence_quality
-        node = @object.select{|a| a.first.first == ">>Per base sequence quality" }.first
-        node.select{|n| n.first != ">>Per base sequence quality" } if node
-      end
+      #
+      # Other modules
+      #
-      ## Custom module: overall mean base call quality indicator
-      def overall_mean_quality_score
-        per_base = self.per_base_sequence_quality
-        if per_base
-          v = per_base.map{|c| (10**(c[1].to_f/-10)).to_f }
-          -10 * Math.log10(v.reduce(:+) / v.size)
-        end
+      def get_module_matrix(module_name, num_of_header_rows)
+        mod = @module_results.select{|m| m[0][0] == ">>#{module_name}" }[0]
+        mod.drop(num_of_header_rows) if mod
       end
-      ## Custom module: overall median base call quality indicator
-      def overall_median_quality_score
-        per_base = self.per_base_sequence_quality
-        if per_base
-          v = per_base.map{|c| (10**(c[2].to_f/-10)).to_f }
-          -10 * Math.log10(v.reduce(:+) / v.size)
-        end
+      def per_base_sequence_quality
+        get_module_matrix("Per base sequence quality", 1)
       end
       def per_tile_sequence_quality
-        node = @object.select{|a| a.first.first == ">>Per tile sequence quality" }.first
-        node.select{|n| n.first != ">>Per tile sequence quality" } if node
+        get_module_matrix("Per tile sequence quality", 1)
       end
       def per_sequence_quality_scores
-        node = @object.select{|a| a.first.first == ">>Per sequence quality scores" }.first
-        node.select{|n| n.first != ">>Per sequence quality scores" } if node
+        get_module_matrix("Per sequence quality scores", 1)
       end
       def per_base_sequence_content
-        node = @object.select{|a| a.first.first == ">>Per base sequence content" }.first
-        node.select{|n| n.first != ">>Per base sequence content" } if node
+        get_module_matrix("Per base sequence content", 1)
       end
       def per_sequence_gc_content
-        node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
-        node.select{|n| n.first != ">>Per sequence GC content" } if node
+        get_module_matrix("Per sequence GC content", 1)
       end
-      def per_sequence_gc_content
-        node = @object.select{|a| a.first.first == ">>Per sequence GC content" }.first
-        node.select{|n| n.first != ">>Per sequence GC content" } if node
+      def per_base_n_content
+        get_module_matrix("Per base N content", 1)
       end
-      def per_base_n_content
-        node = @object.select{|a| a.first.first == ">>Per base N content" }.first
-        node.select{|n| n.first != ">>Per base N content" } if node
+      def sequence_length_distribution
+        get_module_matrix("Sequence Length Distribution", 1)
       end
-      ## Custom module: overall N content
-      def overall_n_content
-        per_base = self.per_base_n_content
-        if per_base
-          v = per_base.map{|c| c[1].to_f }
-          v.reduce(:+) / v.size
-        end
+      def total_duplicate_percentage
+        get_module_matrix("Sequence Duplication Levels", 1)[0][1].to_f
       end
-      def sequence_length_distribution
-        node = @object.select{|a| a.first.first == ">>Sequence Length Distribution" }.first
-        node.select{|n| n.first != ">>Sequence Length Distribution" } if node
+      def sequence_duplication_levels
+        get_module_matrix("Sequence Duplication Levels", 2)
       end
-      ## Custom module: mean sequence length calculated from distribution
-      def mean_sequence_length
-        distribution = self.sequence_length_distribution
-        if distribution
-          sum = distribution.map do |length_count|
-            length = length_count[0]
-            count = length_count[1].to_f
-            if length =~ /\d-\d/
-              f = length.sub(/-\d+$/,"").to_i
-              b = length.sub(/^\d+-/,"").to_i
-              mean = (f + b) / 2
-              mean * count
-            else
-              length.to_i * count
-            end
-          end
-          sum.reduce(:+) / self.total_sequences
+      def overrepresented_sequences
+        get_module_matrix("Overrepresented sequences", 1)
+      end
+      def adapter_content
+        get_module_matrix("Adapter Content", 1)
+      end
+      def kmer_content
+        get_module_matrix("Kmer Content", 1)
+      end
+      #
+      # Custom modules
+      #
+      def min_length
+        sequence_length.sub(/-\d+$/,"").to_i
+      end
+      def max_length
+        sequence_length.sub(/^\d+-/,"").to_i
+      end
+      def per_base_quality_column(mean_or_median)
+        case mean_or_median
+        when :mean
+          1
+        when :median
+          2
         end
       end
-      ## Custom module: median sequence length calculated from distribution
-      def median_sequence_length
-        distribution = self.sequence_length_distribution
-        if distribution
-          array = distribution.map do |length_count|
-            length = length_count[0]
-            count = length_count[1].to_i
-            if length =~ /\d-\d/
-              f = length.sub(/-\d+$/,"").to_i
-              b = length.sub(/^\d+-/,"").to_i
-              mean = (f + b) / 2
-              [mean.to_f] * count
-            else
-              [length.to_f] * count
-            end
-          end
-          sorted = array.flatten.sort
-          quot = sorted.size / 2
-          if !sorted.size.even?
-            sorted[quot]
-          else
-            f = sorted[quot]
-            b = sorted[quot - 1]
-            (f + b) / 2
-          end
+      def overall_quality_score(mean_or_median)
+        per_base = per_base_sequence_quality.drop(1) # drop header
+        column = per_base_quality_column(mean_or_median)
+        v = per_base.map do |row|
+          (10**(row[column].to_f / -10)).to_f
         end
+        -10 * Math.log10(v.reduce(:+) / v.size)
       end
-      def sequence_duplication_levels
-        node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
-        node.select{|n| n.first != ">>Sequence Duplication Levels" && n.first != "\#Total Duplicate Percentage" } if node
+      def overall_mean_quality_score
+        overall_quality_score(:mean)
       end
-      def total_duplicate_percentage
-        node = @object.select{|a| a.first.first == ">>Sequence Duplication Levels" }.first
-        node.select{|n| n.first == "\#Total Duplicate Percentage" }.flatten[1].to_f if node
+      def overall_median_quality_score
+        overall_quality_score(:median)
       end
-      def overrepresented_sequences
-        node = @object.select{|a| a.first.first == ">>Overrepresented sequences" }.first
-        node.select{|n| n.first != ">>Overrepresented sequences" } if node
+      def overall_n_content
+        per_base = per_base_n_content
+        v = per_base.map{|c| c[1].to_f }
+        v.reduce(:+) / v.size
       end
-      def adapter_content
-        node = @object.select{|a| a.first.first == ">>Adapter Content" }.first
-        node.select{|n| n.first != ">>Adapter Content" } if node
+      def mean_sequence_length
+        dist = sequence_length_distribution.drop(1) # drop column header
+        if dist.size == 1
+          dist[0][0].to_f
+        else
+          sum = dist.map do |length_count|
+            l = length_count[0]
+            c  = length_count[1].to_f
+            ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2) * c
+          end
+          sum.reduce(:+) / dist.map{|l_c| l_c[1].to_f }.reduce(:+)
+        end
       end
-      def kmer_content
-        node = @object.select{|a| a.first.first == ">>Kmer Content" }.first
-        node.select{|n| n.first != ">>Kmer Content" } if node
+      def median_sequence_length
+        dist = sequence_length_distribution.drop(1) # drop column header
+        if dist.size == 1
+          dist[0][0].to_f
+        else
+          k = dist.map{|l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
+          median = 0
+          dist.each do |l_c|
+            c = l_c[1].to_f # count of reads in this length range
+            if k > c
+              k -= c
+            else
+              l = l_c[0]
+              median = ((l.sub(/-\d+$/,"").to_f + l.sub(/^\d+-/,"").to_f) / 2)
+              break
+            end
+          end
+          median
+        end
       end
       def summary
+        parse
+      end
+      def parse
         {
-          fastqc_version: self.fastqc_version,
-          filename: self.filename,
-          file_type: self.file_type,
-          encoding: self.encoding,
-          total_sequences: self.total_sequences,
-          filtered_sequences: self.filtered_sequences,
-          sequence_length: self.sequence_length,
-          percent_gc: self.percent_gc,
-          per_base_sequence_quality: self.per_base_sequence_quality,
-          per_tile_sequence_quality: self.per_tile_sequence_quality,
-          per_sequence_quality_scores: self.per_sequence_quality_scores,
-          per_base_sequence_content: self.per_base_sequence_content,
-          per_sequence_gc_content: self.per_sequence_gc_content,
-          per_base_n_content: self.per_base_n_content,
-          sequence_length_distribution: self.sequence_length_distribution,
-          total_duplicate_percentage: self.total_duplicate_percentage,
-          sequence_duplication_levels: self.sequence_duplication_levels,
-          overrepresented_sequences: self.overrepresented_sequences,
-          adapter_content: self.adapter_content,
-          kmer_content: self.kmer_content,
-          min_length: self.min_length,
-          max_length: self.max_length,
-          overall_mean_quality_score: self.overall_mean_quality_score,
-          overall_median_quality_score: self.overall_median_quality_score,
-          overall_n_content: self.overall_n_content,
-          mean_sequence_length: self.mean_sequence_length,
-          median_sequence_length: self.median_sequence_length,
+          fastqc_version: fastqc_version,
+          filename: filename,
+          file_type: file_type,
+          encoding: encoding,
+          total_sequences: total_sequences,
+          sequences_flagged_as_poor_quality: sequences_flagged_as_poor_quality,
+          filtered_sequences: filtered_sequences,
+          sequence_length: sequence_length,
+          percent_gc: percent_gc,
+          per_base_sequence_quality: per_base_sequence_quality,
+          per_tile_sequence_quality: per_tile_sequence_quality,
+          per_sequence_quality_scores: per_sequence_quality_scores,
+          per_base_sequence_content: per_base_sequence_content,
+          per_sequence_gc_content: per_sequence_gc_content,
+          per_base_n_content: per_base_n_content,
+          sequence_length_distribution: sequence_length_distribution,
+          total_duplicate_percentage: total_duplicate_percentage,
+          sequence_duplication_levels: sequence_duplication_levels,
+          overrepresented_sequences: overrepresented_sequences,
+          adapter_content: adapter_content,
+          kmer_content: kmer_content,
+          min_length: min_length,
+          max_length: max_length,
+          overall_mean_quality_score: overall_mean_quality_score,
+          overall_median_quality_score: overall_median_quality_score,
+          overall_n_content: overall_n_content,
+          mean_sequence_length: mean_sequence_length,
+          median_sequence_length: median_sequence_length,
         }
       end
     end

data/lib/bio/fastqc/semantics.rb CHANGED Viewed

@@ -6,9 +6,13 @@ require 'rdf/turtle'
 module Bio
   module FastQC
     class Semantics
-      def initialize(summary_json, id: nil)
+      def initialize(fastqc_object, id: nil)
         @id = id
-        @summary = summary_json
+        @fastqc_object = fastqc_object
+      end
+      def rdf_version
+        "0.1.0"
       end
       def turtle
@@ -23,6 +27,9 @@ module Bio
         {
           "uo" => "http://purl.obolibrary.org/obo/",
           "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+          "dcterms" => "http://purl.org/dc/terms/",
+          "pav" => "http://purl.org/pav/",
+          "foaf" => "http://xmlns.com/foaf/0.1/",
         }
       end
@@ -32,18 +39,30 @@ module Bio
         object
       end
-      def identifier
-        if @id
-          @id
-        else
-          "http://me.com/data/QNT" + @summary[:filename].split(".").first
-        end
+      def uri_base
+        "http://purl.jp/bio/01/quanto"
+      end
+      def identifier_literal
+        @id ? @id : "QNT" + @fastqc_object[:filename].split(".")[0]
+      end
+      def identifier_uri
+        uri_base + "/resource/" + identifier_literal
       end
       def object_core
         {
           "@context" => jsonld_context,
-          "@id" => identifier,
+          "@id" => identifier_uri,
+          "@type" => "SequenceStatisticsReport",
+          "dcterms:identifier" => identifier_literal,
+          "dcterms:contributor" => ["Tazro Ohta", "Shuichi Kawashima"],
+          "dcterms:created" => Time.now.strftime("%Y-%m-%d"),
+          "dcterms:license" => "http://creativecommons.org/licenses/by-sa/2.1/jp/deed.en",
+          "dcterms:publisher" => "http://dbcls.rois.ac.jp/",
+          "pav:version" => rdf_version,
+          "foaf:page" => "http://quanto.dbcls.jp",
         }
       end
@@ -94,24 +113,26 @@ module Bio
       end
       def fastqc_version
-        {}
+        {
+          "fastqcVersion" => @fastqc_object[:fastqc_version],
+        }
       end
       def filename
         {
-          "filename" => @summary[:filename],
+          "filename" => @fastqc_object[:filename],
         }
       end
       def file_type
         {
-          "fileType" => @summary[:file_type],
+          "fileType" => @fastqc_object[:file_type],
         }
       end
       def encoding
         {
-          "encoding" => @summary[:encoding],
+          "encoding" => @fastqc_object[:encoding],
         }
       end
@@ -120,7 +141,7 @@ module Bio
           "totalSequences" => {
             "@type" => "SequenceReadContent",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:total_sequences],
+            "rdf:value" => @fastqc_object[:total_sequences],
           }
         }
       end
@@ -130,7 +151,7 @@ module Bio
           "filteredSequences" => {
             "@type" => "SequenceReadContent",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:filtered_sequences],
+            "rdf:value" => @fastqc_object[:filtered_sequences],
           }
         }
       end
@@ -140,7 +161,7 @@ module Bio
           "sequenceLength" => {
             "@type" => "SequenceReadLength",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:sequence_length],
+            "rdf:value" => @fastqc_object[:sequence_length],
           }
         }
       end
@@ -150,7 +171,7 @@ module Bio
           "percentGC" => {
             "@type" => "NucleotideBaseContent",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:percent_gc],
+            "rdf:value" => @fastqc_object[:percent_gc],
           }
         }
       end
@@ -158,7 +179,7 @@ module Bio
       def per_base_sequence_quality
         {
           "@type" => "PerBaseSequenceQuality",
-          "hasRow" => per_base_sequence_quality_rows(@summary[:per_base_sequence_quality]),
+          "hasRow" => per_base_sequence_quality_rows(@fastqc_object[:per_base_sequence_quality]),
         }
       end
@@ -220,7 +241,7 @@ module Bio
       def per_sequence_quality_scores
         {
           "@type" => "PerSequnceQualityScores",
-          "hasRow" => per_sequence_quality_scores_rows(@summary[:per_sequence_quality_scores]),
+          "hasRow" => per_sequence_quality_scores_rows(@fastqc_object[:per_sequence_quality_scores]),
         }
       end
@@ -248,7 +269,7 @@ module Bio
       def per_base_sequence_content
         {
           "@type" => "PerBaseSequenceContent",
-          "hasRow" => per_base_sequence_content_rows(@summary[:per_base_sequence_content]),
+          "hasRow" => per_base_sequence_content_rows(@fastqc_object[:per_base_sequence_content]),
         }
       end
@@ -293,7 +314,7 @@ module Bio
       def per_sequence_gc_content
         {
           "@type" => "PerSequenceGCContent",
-          "hasRow" => per_sequence_gc_content_rows(@summary[:per_sequence_gc_content]),
+          "hasRow" => per_sequence_gc_content_rows(@fastqc_object[:per_sequence_gc_content]),
         }
       end
@@ -321,7 +342,7 @@ module Bio
       def per_base_n_content
         {
           "@type" => "PerBaseNContent",
-          "hasRow" => per_base_n_content_rows(@summary[:per_base_n_content]),
+          "hasRow" => per_base_n_content_rows(@fastqc_object[:per_base_n_content]),
         }
       end
@@ -348,7 +369,7 @@ module Bio
       def sequence_length_distribution
         {
           "@type" => "SequenceLengthDistribution",
-          "hasRow" => sequence_length_distribution_rows(@summary[:sequence_length_distribution]),
+          "hasRow" => sequence_length_distribution_rows(@fastqc_object[:sequence_length_distribution]),
         }
       end
@@ -381,7 +402,7 @@ module Bio
       def sequence_duplication_levels
         {
           "@type" => "SequenceDuplicationLevels",
-          "hasRow" => sequence_duplication_levels_rows(@summary[:sequence_duplication_levels]),
+          "hasRow" => sequence_duplication_levels_rows(@fastqc_object[:sequence_duplication_levels]),
         }
       end
@@ -410,7 +431,7 @@ module Bio
       def overrepresented_sequences
         {
           "@type" => "OverrepresentedSequences",
-          "hasRow" => overrepresented_sequences_rows(@summary[:overrepresented_sequences]),
+          "hasRow" => overrepresented_sequences_rows(@fastqc_object[:overrepresented_sequences]),
         }
       end
@@ -446,7 +467,7 @@ module Bio
       def kmer_content
         {
           "@type" => "KmerContent",
-          "hasRow" => kmer_content_rows(@summary[:kmer_content]),
+          "hasRow" => kmer_content_rows(@fastqc_object[:kmer_content]),
         }
       end
@@ -486,7 +507,7 @@ module Bio
           "minSequenceLength" => {
             "@type" => "SequenceReadLength",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:min_length],
+            "rdf:value" => @fastqc_object[:min_length],
           }
         }
       end
@@ -496,7 +517,7 @@ module Bio
           "maxSequenceLength" => {
             "@type" => "SequenceReadLength",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:max_length],
+            "rdf:value" => @fastqc_object[:max_length],
           }
         }
       end
@@ -506,7 +527,7 @@ module Bio
           "meanSequenceLength" => {
             "@type" => "SequenceReadLength",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:mean_sequence_length],
+            "rdf:value" => @fastqc_object[:mean_sequence_length],
           }
         }
       end
@@ -516,7 +537,7 @@ module Bio
             "medianSequenceLength" => {
             "@type" => "SequenceReadLength",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:median_sequence_length],
+            "rdf:value" => @fastqc_object[:median_sequence_length],
           }
         }
       end
@@ -526,7 +547,7 @@ module Bio
           "overallMeanBaseCallQuality" => {
             "@type" => "PhredQualityScore",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:overall_mean_quality_score],
+            "rdf:value" => @fastqc_object[:overall_mean_quality_score],
           }
         }
       end
@@ -536,7 +557,7 @@ module Bio
           "overallMedianBaseCallQuality" => {
             "@type" => "PhredQualityScore",
             "hasUnit" => "uo:CountUnit",
-            "rdf:value" => @summary[:overall_median_quality_score],
+            "rdf:value" => @fastqc_object[:overall_median_quality_score],
           }
         }
       end
@@ -546,7 +567,7 @@ module Bio
           "overallNContent" => {
             "@type" => "NContent",
             "hasUnit" => "uo:Percentage",
-            "rdf:value" => @summary[:overall_n_content],
+            "rdf:value" => @fastqc_object[:overall_n_content],
           }
         }
       end
@@ -557,10 +578,10 @@ module Bio
       def jsonld_context
         # definition of imported terms in @context
-        object = imported_keywords
+        object = turtle_prefixes
         # definition of local ontology terms
-        domain = "http://me.com/sos#"
+        domain = uri_base + "/ontology/sos#"
         # definition of class in @context
         sos_class.each do |term|
@@ -597,13 +618,6 @@ module Bio
         object
       end
-      def imported_keywords
-        {
-          "uo" => "http://purl.obolibrary.org/obo/",
-          "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
-        }
-      end
       #
       # definition of classes
       #
@@ -690,6 +704,7 @@ module Bio
       def sos_data_properties_string
         [
+          "fastqcVersion",
           "filename",
           "fileType",
           "encoding",

data/spec/bio-fastqc_spec.rb CHANGED Viewed

@@ -14,6 +14,7 @@ describe Bio::FastQC do
       describe '#read' do
         it 'returns parsed data from zipfile' do
           expect(@data).not_to be_empty
+          expect(@data).not_to be_nil
         end
       end
     end
@@ -25,55 +26,112 @@ describe Bio::FastQC do
       end
       describe '#fastqc_version' do
-        it 'returns fastqc version as String and not empty' do
+        it 'returns fastqc version as String' do
           expect(@parser.fastqc_version).to be_instance_of(String)
+        end
+        it 'does not return empty string' do
           expect(@parser.fastqc_version).not_to be_empty
         end
+        it 'does not return nil' do
+          expect(@parser.fastqc_version).not_to be_nil
+        end
       end
       describe '#filename' do
-        it 'returns filename as String and not empty' do
+        it 'returns filename as String' do
           expect(@parser.filename).to be_instance_of(String)
+        end
+        it 'does not return empty string' do
           expect(@parser.filename).not_to be_empty
         end
+        it 'does not return nil' do
+          expect(@parser.filename).not_to be_nil
+        end
       end
       describe '#file_type' do
-        it 'returns file type as String and not empty' do
+        it 'returns file type as String' do
           expect(@parser.file_type).to be_instance_of(String)
+        end
+        it 'does not return empty string' do
           expect(@parser.file_type).not_to be_empty
         end
+        it 'does not return nil' do
+          expect(@parser.file_type).not_to be_nil
+        end
       end
       describe '#encoding' do
-        it 'returns encoding type as String and not empty' do
+        it 'returns encoding type as String' do
           expect(@parser.encoding).to be_instance_of(String)
+        end
+        it 'does not return empty string' do
           expect(@parser.encoding).not_to be_empty
         end
+        it 'does not return nil' do
+          expect(@parser.encoding).not_to be_nil
+        end
       end
       describe '#total_sequences' do
         it 'returns total number of sequences as Fixnum' do
           expect(@parser.total_sequences).to be_instance_of(Fixnum)
         end
+        it 'returns integer larger than zero' do
+          expect(@parser.total_sequences).to be > 0
+        end
+        it 'does not return nil' do
+          expect(@parser.total_sequences).not_to be_nil
+        end
       end
       describe '#filtered_sequences' do
-        it 'returns number of filtered sequence as Fixnum and not empty' do
-          expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
+        it 'returns number of filtered sequence as Fixnum, can be nil' do
+          if @parser.filtered_sequences
+            expect(@parser.filtered_sequences).to be_instance_of(Fixnum)
+          end
+        end
+      end
+      describe '#sequences_flagged_as_poor_quality' do
+        it 'returns number of sequences flagged as poor quality as Fixnum, can be nil' do
+          if @parser.sequences_flagged_as_poor_quality
+            expect(@parser.sequences_flagged_as_poor_quality).to be_instance_of(Fixnum)
+          end
         end
       end
       describe '#sequence_length' do
-        it 'returns length of sequence as String and not empty' do
+        it 'returns length of sequence as String' do
           expect(@parser.sequence_length).to be_instance_of(String)
+        end
+        it 'does not return empty string' do
           expect(@parser.sequence_length).not_to be_empty
         end
+        it 'does not return nil' do
+          expect(@parser.sequence_length).not_to be_nil
+        end
       end
       describe '#percent_gc' do
-        it 'returns percentage of GC content as Fixnum and not empty' do
-          expect(@parser.percent_gc).to be_instance_of(Fixnum)
+        it 'returns percentage of GC content as Float' do
+          expect(@parser.percent_gc).to be_instance_of(Float)
+        end
+        it 'does not return nil' do
+          expect(@parser.percent_gc).not_to be_nil
         end
       end
@@ -190,6 +248,10 @@ describe Bio::FastQC do
         it 'returns duplicate percentage as Float and not empty' do
           expect(@parser.total_duplicate_percentage).to be_instance_of(Float)
         end
+        it 'does not returns nil' do
+          expect(@parser.total_duplicate_percentage).not_to be_nil
+        end
       end
       describe '#sequence_duplication_levels' do
@@ -256,42 +318,88 @@ describe Bio::FastQC do
         it 'returns minimum read length as Fixnum and not empty' do
           expect(@parser.min_length).to be_instance_of(Fixnum)
         end
+        it 'returns integer larger than zero' do
+          expect(@parser.min_length).to be > 0
+        end
+        it 'does not return nil' do
+          expect(@parser.min_length).not_to be_nil
+        end
       end
       describe '#max_length' do
         it 'returns maximum read length as Fixnum and not empty' do
           expect(@parser.max_length).to be_instance_of(Fixnum)
         end
+        it 'returns integer larger than zero' do
+          expect(@parser.max_length).to be > 0
+        end
+        it 'does not return nil' do
+          expect(@parser.max_length).not_to be_nil
+        end
       end
       describe '#overall_mean_quality_score' do
         it 'returns overall mean quality score as Float and not empty' do
           expect(@parser.overall_mean_quality_score).to be_instance_of(Float)
         end
+        it 'does not return nil' do
+          expect(@parser.overall_mean_quality_score).not_to be_nil
+        end
       end
       describe '#overall_median_quality_score' do
         it 'returns overall median quality score as Float and not empty' do
           expect(@parser.overall_median_quality_score).to be_instance_of(Float)
         end
+        it 'does not return nil' do
+          expect(@parser.overall_median_quality_score).not_to be_nil
+        end
       end
       describe '#overall_n_content' do
         it 'returns overall N content as Float and not empty' do
           expect(@parser.overall_n_content).to be_instance_of(Float)
         end
+        it 'does not return nil' do
+          expect(@parser.overall_n_content).not_to be_nil
+        end
       end
       describe '#mean_sequence_length' do
         it 'returns mean sequence length from read length distribution as Float and not empty' do
           expect(@parser.mean_sequence_length).to be_instance_of(Float)
         end
+        it 'does not return nil' do
+          expect(@parser.mean_sequence_length).not_to be_nil
+        end
       end
       describe '#median_sequence_length' do
         it 'returns median sequence length from read length distribution as Float and not empty' do
           expect(@parser.median_sequence_length).to be_instance_of(Float)
         end
+        it 'does not return nil' do
+          expect(@parser.median_sequence_length).not_to be_nil
+        end
+      end
+      describe '#parse' do
+        it 'does not return nil' do
+          expect(@parser.parse).not_to be_nil
+        end
+        it 'returns hash' do
+          expect(@parser.parse).to be_instance_of(Hash)
+        end
       end
     end
   end

data/spec/example_fastqc_454.zip ADDED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bio-fastqc
 version: !ruby/object:Gem::Version
-  version: 0.5.2
+  version: 0.6.0
 platform: ruby
 authors:
 - Tazro Inutano Ohta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-22 00:00:00.000000000 Z
+date: 2016-04-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rubyzip
@@ -229,6 +229,7 @@ files:
 - lib/bio/fastqc/semantics.rb
 - spec/bio-fastqc_spec.rb
 - spec/example_fastqc.zip
+- spec/example_fastqc_454.zip
 - spec/spec_helper.rb
 homepage: http://github.com/inutano/bioruby-fastqc
 licenses: