RubyGems - dbd - Versions diffs - 0.0.8 → 0.0.9 - Mend

dbd 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.travis.yml +3 -1
data/HISTORY.txt +7 -0
data/README.md +20 -1
data/bin/test_1.rb +1 -1
data/bin/test_3.rb +11 -0
data/bin/test_4.rb +32 -0
data/bin/test_5.rb +40 -0
data/bin/test_6.rb +21 -0
data/data/.gitkeep +0 -0
data/lib/dbd/fact.rb +1 -1
data/lib/dbd/graph.rb +26 -7
data/lib/dbd/helpers/uuid.rb +2 -2
data/lib/dbd/version.rb +1 -1
data/spec/factories/provenance_fact.rb +1 -0
data/spec/lib/dbd/graph/to_csv_spec.rb +31 -0
data/spec/lib/dbd/helpers/uuid/uuid_spec.rb +4 -0
metadata +11 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8ceec5332dfdb3174ee303dcd09cf401186990c8
-  data.tar.gz: a7e61a7b4acc1dcb9a966369ba1c4d8a853b5f9f
+  metadata.gz: 0b21e77e8316f18a011e2356b79a28a35e5dde7d
+  data.tar.gz: d8a0adeebbc7311a512ef08bfbc63902e16f2159
 SHA512:
-  metadata.gz: f1da941433fd3f5dc0f077992bec69d6dc2a3dc761d52549934dd707e713d6317495ae54659e764c21bab079789ed3e74e260ff639bf237715f14222bc267e9b
-  data.tar.gz: cfdd88e0c3ed9e012f354008ab32685c052fffef7897492d78663d2d243c76abd9ae0a3f3a0c2b8b4f5c54ab8371553a9f129436a36a25ea57e46b86d555d8ef
+  metadata.gz: c8e9c42c082ad4bdc8c6af283c9a73fad3c35a75e5ebace47aefbbe127cb0ee001ef066d17ef6e9b253dcd2a6311ccbb520810bdb061ab233c77950be7a99542
+  data.tar.gz: a63bb4462ebdc8fa4a6c2e0f3a1513146f7aac1e85a9d748c3126f5352464a613ee9bf956e27e0fad06edc4816e7226e78a37a1c5218196e6ea0f4ccf13db5e5

data/.gitignore CHANGED

@@ -15,3 +15,4 @@ spec/reports
 test/tmp
 test/version_tmp
 tmp
+/data/foo.csv

data/.travis.yml CHANGED

@@ -4,6 +4,8 @@ script: "bundle exec rspec spec"
 rvm:
   - 2.0.0
   - 1.9.3
-  - 1.9.2
   - jruby-19mode
   - jruby-head
+branches:
+  only:
+    - master

data/HISTORY.txt CHANGED

@@ -43,3 +43,10 @@
 * bin/test_1.rb was used for first successful writing of 10M facts using
   ruby-1.9.3-p429 (will report issues on ruby-2.0.0-p195 and jruby-1.7.4)
+0.0.9 (30 June 2013)
+=====
+* new function graph#to_CSV_file
+* bin/test_5.rb was used to write 10M facts using ruby-2.0.0, 1.9.3 and jruby-1.7.4
+* jruby is 3 times faster, but has 10% more memory consumption

data/README.md CHANGED

@@ -44,7 +44,7 @@ Open Source [MIT]
 ## Installation
-    $ gem install dbd      # Ruby 1.9.2, 1.9.3, 2.0.x, jruby (see .travis.yml)
+    $ gem install dbd      # Ruby 1.9.3, 2.0.x, jruby (see .travis.yml)
 ## Examples
@@ -118,6 +118,25 @@ puts imported_graph.map(&:short)
 # 5eb1ea27 : 3767c493 : todo:story               : A long period of peace,_ that is a "bliss".
 ```
+## Performance tests on 10 M facts
+In version 0.0.9 a number of test programs were added (e.g. ../bin/test_5.rb)
+that were used to populate in memory and write to disk a data set with 10 M facts.
+This function was tested on ruby-2.0.0, ruby-1.9.3 and jruby-1.7.4. The facts
+had an approximate size of 250 Bytes each (80 Bytes object).
+The time needed and memory size (RSS) for populating the in-memory dataset was:
+10 M facts (of 250 Bytes; 2.5 GB net data):
+| ruby	     | time        | memory (RSS) |
+|------------|-------------| ------------:|
+| ruby-1.9.3 | 863 seconds |       8.1 GB |
+| ruby-2.0.0 | 862 seconds |       9.0 GB |
+|jruby-1.7.4 | 345 seconds |      10.8 GB |
 [RDF]:              http://www.w3.org/RDF/
 [Rationale]:        http://github.com/petervandenabeele/dbd/blob/master/docs/rationale.md
 [MIT]:              https://github.com/petervandenabeele/dbd/blob/master/LICENSE.txt

data/bin/test_1.rb CHANGED

@@ -12,7 +12,7 @@ unless count > 0
 end
 filename = ARGV[1]
-unless filename.size > 0
+unless filename
   puts "Give a 'filename' as second argument."
   exit(1)
 end

data/bin/test_3.rb ADDED

@@ -0,0 +1,11 @@
+# encoding=us-ascii
+# this is a test program for an exception that is thrown in JRuby
+# see http://markmail.org/message/e2ote7rkwht2quel?q=list:org.codehaus.jruby.user
+#row = "A" * 300 # does NOT fail with this value of `row`
+row = "A" * 301
+count = 5_000_000
+csv_string = row * count
+encoded_string = csv_string.encode("utf-8")

data/bin/test_4.rb ADDED

@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+# This is a test program for an issue with CSV.generate
+# in ruby-2.0.0 and ruby-head, see http://bugs.ruby-lang.org/issues/8585
+count = ARGV[0].to_i
+unless count > 0
+  puts "Give a 'count' as first argument."
+  exit(1)
+end
+require 'csv'
+row_data = [
+  "59ffbb3b-1e48-4c1f-81d8-d93afc84c966",
+  "2013-06-28 19:14:55.975000806 UTC",
+  "a11f290e-c441-41bc-8b8c-4e6c27b1b6fc",
+  "c73e6241-d46f-4952-8377-c11372346d15",
+  "test",
+  "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"]
+puts "starting CSV.open"
+start_time = Time.now
+csv_string = CSV.generate(force_quotes: true) do |csv|
+  count.times do
+    csv << row_data
+  end
+end
+puts "CSV.open took #{Time.now - start_time} seconds"

data/bin/test_5.rb ADDED

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+# This implementation now streams to disk.
+FACTS_PER_RESOURCE = 1000
+count = ARGV[0].to_i
+unless count > 0
+  puts "Give a 'count' as first argument."
+  exit(1)
+end
+filename = ARGV[1]
+unless filename
+  puts "Give a 'filename' as second argument."
+  exit(1)
+end
+require 'dbd'
+start = Time.now
+graph = Dbd::Graph.new
+(0...count).each do |i|
+  provenance_resource = Dbd::ProvenanceResource.new
+  provenance_resource << Dbd::ProvenanceFact.new(predicate: "prov:test" , object: "A" * 10)
+  resource = Dbd::Resource.new(provenance_subject: provenance_resource.subject)
+  (0...FACTS_PER_RESOURCE).each do |j|
+    resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 80} #{i * FACTS_PER_RESOURCE + j}")
+  end
+  graph << provenance_resource << resource
+  puts ("added resource #{i} to the graph")
+end
+puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
+graph.to_CSV_file(filename)

data/bin/test_6.rb ADDED

@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+# This implementation streams from disk
+filename = ARGV[0]
+unless filename
+  puts "Give a 'filename' as argument."
+  exit(1)
+end
+require 'dbd'
+start = Time.now
+graph = File.open(filename) do |f|
+  Dbd::Graph.from_CSV(f)
+end
+puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
+puts "graph.size is #{graph.size}"

data/data/.gitkeep ADDED

File without changes

data/lib/dbd/fact.rb CHANGED

@@ -156,7 +156,7 @@ module Dbd
     # Constructs a Fact or ProvenanceFact from a string values array
     # (e.g. pulled from a CSV row).
     #
-    # @param [Array] values Required : the array with values, organized as in attributes
+    # @param [Array] string_values Required : the array with values, organized as in attributes
     # @return [Fact, ProvenanceFact] the constructed fact
     def self.from_string_values(string_values)
       string_hash = hash_from_values(string_values)

data/lib/dbd/graph.rb CHANGED

@@ -32,17 +32,25 @@ module Dbd
     #
     # @return [String] comma separated string with double quoted cells
     def to_CSV
-      CSV.generate(force_quotes: true) do |csv|
-        @internal_collection.each do |fact|
-          csv << fact.values
-        end
-      end.encode("utf-8")
+      CSV.generate(csv_defaults) do |csv|
+        push_facts(csv)
+      end
+    end
+    ##
+    # Export the graph to a CSV file
+    #
+    # @param [String] :filename the filename to stream the CSV to
+    def to_CSV_file(filename)
+      CSV.open(filename, 'w', csv_defaults) do |csv|
+        push_facts(csv)
+      end
     end
     ##
-    # Import a graph from a CSV string.
+    # Import a graph from a CSV IO stream
     #
-    # @param [String] csv a string that contains the CSV serialization
+    # @param [IO Stream] csv an IO Stream that contains the CSV serialization
     # @return [Graph] the imported graph
     def self.from_CSV(csv)
       new.tap do |graph|
@@ -63,5 +71,16 @@ module Dbd
       fact.time_stamp = TimeStamp.new(larger_than: newest_time_stamp) unless fact.time_stamp
     end
+    def csv_defaults
+      {force_quotes: true,
+       encoding: 'utf-8'}
+    end
+    def push_facts(target)
+      @internal_collection.each do |fact|
+        target << fact.values
+      end
+    end
   end
 end

data/lib/dbd/helpers/uuid.rb CHANGED

@@ -18,14 +18,14 @@ module Dbd
       # Store a SecureRandom.uuid.
       # @return [void]
       def initialize
-        @uuid = SecureRandom.uuid
+        @uuid = SecureRandom.uuid.encode('utf-8')
       end
       ##
       # The to_s of the uuid.
       # @return [String]
       def to_s
-        @uuid.to_s
+        @uuid
       end
     end

data/lib/dbd/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Dbd
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end

data/spec/factories/provenance_fact.rb CHANGED

@@ -1,3 +1,4 @@
+# encoding=utf-8
 module Factories
   module ProvenanceFact

data/spec/lib/dbd/graph/to_csv_spec.rb CHANGED

@@ -1,3 +1,4 @@
+# encoding=utf-8
 require 'spec_helper'
 module Dbd
@@ -10,6 +11,7 @@ module Dbd
     let(:provenance_facts) { Factories::Fact::Collection.provenance_facts(new_subject) }
     let(:provenance_fact_1) { provenance_facts.first }
     let(:fact_2_3) { Factories::Fact::Collection.fact_2_3(provenance_fact_1.subject) }
+    let(:fact_special_characters) { Factories::Fact::fact_with_special_chars(provenance_fact_1.subject, new_subject) }
     let(:subject_regexp) { Fact::Subject.regexp }
     let(:id_regexp) { Fact::ID.regexp }
@@ -157,5 +159,34 @@ module Dbd
         subject.to_CSV.lines.count.should == 6
       end
     end
+    describe "#to_CSV_file" do
+      before do
+        provenance_facts.each do |provenance_fact|
+          subject << provenance_fact
+        end
+        fact_2_3.each do |fact|
+          subject << fact
+        end
+        subject << fact_special_characters
+      end
+      it "has eight lines" do
+        filename = 'data/foo.csv'
+        subject.to_CSV_file(filename)
+        File.open(filename) do |f|
+          f.readlines.count.should == 8
+        end
+      end
+      it "reads back UTF-8 characters correctly" do
+        filename = 'data/foo.csv'
+        subject.to_CSV_file(filename)
+        File.open(filename) do |f|
+          f.readlines.detect{|l| l.match(%r{really with a comma, a double quote "" and a non-ASCII char éà Über.})}.should_not be_nil
+        end
+      end
+    end
   end
 end

data/spec/lib/dbd/helpers/uuid/uuid_spec.rb CHANGED

@@ -10,6 +10,10 @@ module Dbd
       it ".new creates a new random UUID" do
         described_class.new.to_s.should match(UUID.regexp)
       end
+      it ".new creates a new random UUID with UTF-8 encoding" do
+        described_class.new.to_s.encoding.should == Encoding::UTF_8
+      end
     end
   end
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dbd
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Peter Vandenabeele
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-23 00:00:00.000000000 Z
+date: 2013-06-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -141,6 +141,10 @@ email:
 - peter@vandenabeele.com
 executables:
 - test_1.rb
+- test_3.rb
+- test_4.rb
+- test_5.rb
+- test_6.rb
 extensions: []
 extra_rdoc_files: []
 files:
@@ -155,6 +159,11 @@ files:
 - README.md
 - Rakefile
 - bin/test_1.rb
+- bin/test_3.rb
+- bin/test_4.rb
+- bin/test_5.rb
+- bin/test_6.rb
+- data/.gitkeep
 - dbd.gemspec
 - docs/rationale.md
 - docs/stories/001_create_a_fact.txt