RubyGems - dbd - Versions diffs - 0.0.12 → 0.0.13 - Mend

dbd 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/HISTORY.txt +9 -0
data/bin/test_5.rb +27 -1
data/bin/test_6.rb +2 -0
data/docs/stories/014_escape_newlines_in_CSV.txt +19 -0
data/docs/stories/015_change_order_of_fields.txt +10 -0
data/lib/dbd/fact.rb +11 -2
data/lib/dbd/fact/factory.rb +15 -2
data/lib/dbd/graph.rb +10 -1
data/lib/dbd/version.rb +1 -1
data/spec/lib/dbd/fact/factory/factory_spec.rb +10 -5
data/spec/lib/dbd/fact/methods_spec.rb +4 -0
data/spec/lib/dbd/graph/from_csv_spec.rb +1 -1
data/spec/lib/dbd/graph/to_csv_spec.rb +6 -6
data/spec/lib/dbd/performance_spec.rb +1 -1
data/spec/spec_helper.rb +3 -0
data/spec/test_factories/fact.rb +3 -3
metadata +4 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 49a3db0e5342a8c507f107cad3afd4ae3e21964c
-  data.tar.gz: 0118bb6bddcdc103577080f5deddc42ff3d2d5d8
+  metadata.gz: b783665f6d90bb2b8e08d8f30b16b5ad93839442
+  data.tar.gz: 74a917c72399200d7a4c1ac257d2a3490cd7c38d
 SHA512:
-  metadata.gz: 04d8a0525f3acf92a9203d63a846c829922f4363a65f7b1cfe900794f2a2ba0725de11a8a6cf330c3b6308a30e031c37d37052180c61adf99c8ab74d7b0a92f2
-  data.tar.gz: 03087f0151bbbe1d1b8c7f7e6a9dd090122136d597453459a8ee96e082f741c9bf4d86970255bc40f93bdd317ab9a1a113d2b2f940b9358ebec5bff11bffae9b
+  metadata.gz: f55e7abd315c57ebd50b4bdba532ef36fdd2b352c29e0d7ae37e1aba68adbff061ca895b491807ec2a8926bf7eb9a6c30761a5ae2270997d66f344c87163ecae
+  data.tar.gz: 7229d01d566e2455b76e64b6bc07fbb908212bebf95d9f1ca18ebe94da76a802aceab78c53646ada5004c35937d5580c8312862e1d46af464f0a6e2555100c87

data/.gitignore CHANGED

@@ -16,3 +16,4 @@ test/tmp
 test/version_tmp
 tmp
 /data/foo.csv
+/.idea

data/HISTORY.txt CHANGED

@@ -76,3 +76,12 @@
   * ProvenanceResource => Context
 * because the context of a fact is much more than the provenance
   (also visibility, encryption, license, ...)
+0.0.13 (7 Aug 2013)
+======
+* text inside quotes in the CSV format now has newlines and backslashes escaped
+  * newline => '\n'
+  * backslash => '\\\\' (double backslash)
+  => now 1 fact is 1 "line" in the CSV file
+     (allowing file operations that are line oriented)

data/bin/test_5.rb CHANGED

@@ -1,6 +1,32 @@
 #!/usr/bin/env ruby
 # This implementation now streams to disk.
+#
+# Some performance (ruby 2.0 on MacBook Pro)
+# /Users/peter_v/dbd/bin $ time ./test_5.rb 100 t_5_100
+# added resource 0 to the graph
+# ...
+# added resource 99 to the graph
+# Graph is ready (took 4.285428s), now starting the write to disk
+#
+# real  0m8.515s
+# user  0m8.331s
+# sys 0m0.181s
+# ...
+# /Users/peter_v/dbd/bin $ time ./test_6.rb t_5_100
+# Graph is ready (took 14.455278s).
+# graph.size is 100100
+#
+# real  0m14.922s
+# user  0m14.728s
+# sys 0m0.189s
+#
+# From version 0.0.13 with newline escaping, the times went up:
+# writing (test_5)
+# real  0m11.656s
+#
+# reading back (test_6)
+# real  0m18.442s
 FACTS_PER_RESOURCE = 1000
@@ -28,7 +54,7 @@ graph = Dbd::Graph.new
   resource = Dbd::Resource.new(context_subject: context.subject)
   (0...FACTS_PER_RESOURCE).each do |j|
-    resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 80} #{i * FACTS_PER_RESOURCE + j}")
+    resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 75} #{i * FACTS_PER_RESOURCE + j} \n CD")
   end
   graph << context << resource

data/bin/test_6.rb CHANGED

@@ -1,6 +1,8 @@
 #!/usr/bin/env ruby
 # This implementation streams from disk
+#
+# See test_5.rb for usage and basic performance test
 filename = ARGV[0]
 unless filename

data/docs/stories/014_escape_newlines_in_CSV.txt ADDED

@@ -0,0 +1,19 @@
+014_escape_newlines_in_csv
+As a user of the system
+I can manipulate CSV files that are written by Dbd with basic
+  UNIX command line tools (e.g. split by dates, merge, ...)
+And the facts in that format are aligned by a newline
+So, the newlines in the String object need to be escaped
+* minimal profiling of write + read speed
+* escape on writing, suggest:
+  * "\n" => "\\n" (a backslash and the letter n)
+  "C:\nuby" => "C:\\nuby"
+  * do we need to escape a "real" backslash (or backslash + n) now?
+* escape on reading, suggest:
+  * "\\n" => "\n" (a newline)
+  * what about a backslash + n?
+* validate that write + read performance is not degraded too much

data/docs/stories/015_change_order_of_fields.txt ADDED

@@ -0,0 +1,10 @@
+015_change_order_of_fields
+As a user of the system
+When playing with the files
+I want to sort on time_stamp if needed
+* have time_stamp first
+* then the ID
+* then the context_subject
+* then subject, predicate, object

data/lib/dbd/fact.rb CHANGED

@@ -140,9 +140,12 @@ module Dbd
     ##
     # @return [Array] The 6 values of a Fact converted to a string.
-    # This is similar to the 6 entries in the to_CSV mapping
+    # The individual strings are escaped:
+    # * newlines are escaped to '\n'
+    # This is used for the 6 entries in the to_CSV mapping.
+    #
     def string_values
-      values.map(&:to_s)
+      values.map{ |value| escaped_string(value.to_s) }
     end
     ##
@@ -241,5 +244,11 @@ module Dbd
       end
     end
+    def escaped_string(string)
+      string.
+        gsub(%r{\\}, "\\\\\\\\"). # single \ => double \\
+        gsub(%r{\n}, '\n') # newline => \n
+    end
   end
 end

data/lib/dbd/fact/factory.rb CHANGED

@@ -48,10 +48,23 @@ module Dbd
       private
+        def unescaped_string_values(string_values)
+          string_values.map{ |string_value| unescaped_string(string_value) }
+        end
+        def unescaped_string(string)
+          r = %r{(\\\\|\\n)}
+          repl = {
+            "\\\\" => "\\",  # double backslash => single backslash
+            "\\n" => "\n"}   # backslash n => newline
+          string.gsub(r, repl)
+        end
         def string_hash_from_values(string_values)
-          attributes_strings_array = [top_class.attributes, string_values].transpose
+          unescaped_values = unescaped_string_values(string_values)
+          attributes_strings_array = [top_class.attributes, unescaped_values].transpose
           # Remove empty values (e.g. the context_subject for a ContextFact).
-          attributes_strings_array.delete_if{|a,v| v.nil? || v == ''}
+          attributes_strings_array.delete_if{ |a, v| v == '' }
           Hash[attributes_strings_array]
         end

data/lib/dbd/graph.rb CHANGED

@@ -30,6 +30,9 @@ module Dbd
     ##
     # Export the graph to a CSV string
     #
+    # Newlines in the fields are escaped to "backslash n".
+    # Backslashes in the fields are escaped to "double backslash".
+    #
     # @return [String] comma separated string with double quoted cells
     def to_CSV
       CSV.generate(csv_defaults) do |csv|
@@ -40,6 +43,9 @@ module Dbd
     ##
     # Export the graph to a CSV file
     #
+    # Newlines in the fields are escaped to "backslash n".
+    # Backslashes in the fields are escaped to "double backslash".
+    #
     # @param [String] filename the filename to stream the CSV to
     def to_CSV_file(filename)
       CSV.open(filename, 'w', csv_defaults) do |csv|
@@ -50,6 +56,9 @@ module Dbd
     ##
     # Import a graph from a CSV IO stream
     #
+    # Tokens "backslash n" in the CSV fields will be unescaped to newlines.
+    # Tokens "double backslash" in the CSV fields will be unescaped to single backslash.
+    #
     # @param [IO Stream] csv an IO Stream that contains the CSV serialization
     # @return [Graph] the imported graph
     def from_CSV(csv)
@@ -76,7 +85,7 @@ module Dbd
     def push_facts(target)
       @internal_collection.each do |fact|
-        target << fact.values
+        target << fact.string_values
       end
     end

data/lib/dbd/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Dbd
-  VERSION = "0.0.12"
+  VERSION = "0.0.13"
 end

data/spec/lib/dbd/fact/factory/factory_spec.rb CHANGED

@@ -39,6 +39,16 @@ module Dbd
           fact.string_values.should == string_values
         end
+        it 'converts a \n (backslash n, no newline) to newline' do
+          fact = described_class.from_string_values(string_values)
+          fact.object.should match(/\n/) # a newline
+        end
+        it 'converts a \\\\ (double backslash) into a single backslash' do
+          fact = described_class.from_string_values(string_values)
+          fact.object.should match(%r{[^\\]\\n}) # a backslash + newline
+        end
         it 'calls validate_string_hash if options[:validate]' do
           described_class.should_receive(:validate_string_hash)
           described_class.from_string_values(string_values, validate: true)
@@ -61,11 +71,6 @@ module Dbd
             with_validation(string_values)
           end
-          it 'for a nil context_subject (for context_facts)' do
-            string_values[2] = nil
-            with_validation(string_values)
-          end
           it 'for an empty context_subject (for context_facts)' do
             string_values[2] = ''
             with_validation(string_values)

data/spec/lib/dbd/fact/methods_spec.rb CHANGED

@@ -137,6 +137,10 @@ module Dbd
       it 'the second element (time_stamp) is a String' do
         full_fact.string_values[1].should be_a(String)
       end
+      it 'escapes a newline into \n and \n into \\\\n (two backslashes and a n)' do
+        full_fact.string_values[5].should == "Gandhi\\nKing\\\\n" # backslash newline
+      end
     end
     describe 'context_fact?' do

data/spec/lib/dbd/graph/from_csv_spec.rb CHANGED

@@ -84,7 +84,7 @@ module Dbd
         resource << special_fact
         graph = described_class.new << resource
         csv = graph.to_CSV
-        csv.should match(%r{A long story with a newline\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
+        csv.should match(%r{A long story with a newline\\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
         graph_from_CSV = described_class.new.from_CSV(csv)
         graph_from_CSV.first.should be_equivalent(graph.first)
       end

data/spec/lib/dbd/graph/to_csv_spec.rb CHANGED

@@ -38,7 +38,7 @@ module Dbd
       describe 'with a single context_fact collection' do
         it 'has three logical lines (but one with embedded newline)' do
-          subject.to_CSV.lines.count.should == 4
+          subject.to_CSV.lines.count.should == 3
         end
         it 'ends with a newline' do
@@ -79,7 +79,7 @@ module Dbd
       describe 'handles comma, double quote and newline correctly' do
         it 'has original_source with special characters and double quote escaped' do
-          subject.to_CSV.should match(/"this has a comma , a newline \n and a double quote """/)
+          subject.to_CSV.should match(/"this has a comma , a newline \\n and a double quote """/)
         end
       end
     end
@@ -156,8 +156,8 @@ module Dbd
          end
       end
-      it 'has six lines' do
-        subject.to_CSV.lines.count.should == 6
+      it 'has 5 lines' do
+        subject.to_CSV.lines.count.should == 5
       end
     end
@@ -173,11 +173,11 @@ module Dbd
         subject << fact_special_characters
       end
-      it 'has eight lines' do
+      it 'has six lines' do
         filename = 'data/foo.csv'
         subject.to_CSV_file(filename)
         File.open(filename) do |f|
-          f.readlines.count.should == 8
+          f.readlines.count.should == 6
         end
       end

data/spec/lib/dbd/performance_spec.rb CHANGED

@@ -33,7 +33,7 @@ module Dbd
         duration = Time.now - start
         puts "\nDuration for inserting #{NUMBER_OF_FACTS} facts in the in-memory graph was #{duration*1000_000/NUMBER_OF_FACTS} us PER FACT"
         graph.size.should == 2 * NUMBER_OF_FACTS + 1
-        duration.should < 0.000_15 * NUMBER_OF_FACTS
+        duration.should < 0.000_20 * NUMBER_OF_FACTS
         # typ. 37 us on Mac Ruby 2.0.0 (on 2013-05-15 over 15K iterations)
         # typ. 45 us on Mac Ruby 2.0.0 (on 2013-06-05 over 10K iterations)
         # typ. 47 us on Mac Ruby 2.0.0 (on 2013-06-21 over 10K iterations)

data/spec/spec_helper.rb CHANGED

@@ -9,6 +9,9 @@ RSpec.configure do |config|
   config.order = 'random'
+  config.filter_run :focus => true
+  config.run_all_when_everything_filtered = true
   # exclude neo4j tests for now (not working on Travis)
   config.filter_run_excluding :neo4j => true
   config.filter_run_excluding :neo4j_performance => true

data/spec/test_factories/fact.rb CHANGED

@@ -15,7 +15,7 @@ module TestFactories
        '40fab407-9b04-4a51-9a52-d978abfcbb1f',
        '2e9fbc87-2e94-47e9-a8fd-121cc4bc3e8f',
        'http://example.org/test/name',
-       'Gandhi']
+       "Gandhi\\nKing\\\\n"]
     end
     def self.fact_1(context_subject = nil)
@@ -29,7 +29,7 @@ module TestFactories
       factory_for.new(
         id: forced_id,
         predicate: 'http://example.org/test/name',
-        object: 'Gandhi')
+        object: "Gandhi\nKing\\n") # newline and \n
     end
     def self.fact_with_special_chars(context_subject = nil, subject = nil)
@@ -76,7 +76,7 @@ module TestFactories
         context_subject: context_subject,
         subject: subject,
         predicate: 'http://example.org/test/name',
-        object: 'European Union')
+        object: "\\n\n\\n\n\\\n\\\\\n\\\\\\\nEuropean\nUnion\\n")
     end
     def self.fact_with_newline(context_subject = nil, subject = nil)

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dbd
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 platform: ruby
 authors:
 - Peter Vandenabeele
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-10 00:00:00.000000000 Z
+date: 2013-08-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -179,6 +179,8 @@ files:
 - docs/stories/011_store_resource_in_graph.txt
 - docs/stories/012_provenance_fact_properties_from_provenance_ontology.txt
 - docs/stories/013_read_graph_from_CSV.txt
+- docs/stories/014_escape_newlines_in_CSV.txt
+- docs/stories/015_change_order_of_fields.txt
 - docs/test.rb
 - lib/dbd.rb
 - lib/dbd/context.rb