dbd 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49a3db0e5342a8c507f107cad3afd4ae3e21964c
4
- data.tar.gz: 0118bb6bddcdc103577080f5deddc42ff3d2d5d8
3
+ metadata.gz: b783665f6d90bb2b8e08d8f30b16b5ad93839442
4
+ data.tar.gz: 74a917c72399200d7a4c1ac257d2a3490cd7c38d
5
5
  SHA512:
6
- metadata.gz: 04d8a0525f3acf92a9203d63a846c829922f4363a65f7b1cfe900794f2a2ba0725de11a8a6cf330c3b6308a30e031c37d37052180c61adf99c8ab74d7b0a92f2
7
- data.tar.gz: 03087f0151bbbe1d1b8c7f7e6a9dd090122136d597453459a8ee96e082f741c9bf4d86970255bc40f93bdd317ab9a1a113d2b2f940b9358ebec5bff11bffae9b
6
+ metadata.gz: f55e7abd315c57ebd50b4bdba532ef36fdd2b352c29e0d7ae37e1aba68adbff061ca895b491807ec2a8926bf7eb9a6c30761a5ae2270997d66f344c87163ecae
7
+ data.tar.gz: 7229d01d566e2455b76e64b6bc07fbb908212bebf95d9f1ca18ebe94da76a802aceab78c53646ada5004c35937d5580c8312862e1d46af464f0a6e2555100c87
data/.gitignore CHANGED
@@ -16,3 +16,4 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  /data/foo.csv
19
+ /.idea
@@ -76,3 +76,12 @@
76
76
  * ProvenanceResource => Context
77
77
  * because the context of a fact is much more than the provenance
78
78
  (also visibility, encryption, license, ...)
79
+
80
+ 0.0.13 (7 Aug 2013)
81
+ ======
82
+
83
+ * text inside quotes in the CSV format now has newlines and backslashes escaped
84
+ * newline => '\n'
85
+ * backslash => '\\' (double backslash)
86
+ => now 1 fact is 1 "line" in the CSV file
87
+ (allowing file operations that are line oriented)
@@ -1,6 +1,32 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  # This implementation now streams to disk.
4
+ #
5
+ # Some performance (ruby 2.0 on MacBook Pro)
6
+ # /Users/peter_v/dbd/bin $ time ./test_5.rb 100 t_5_100
7
+ # added resource 0 to the graph
8
+ # ...
9
+ # added resource 99 to the graph
10
+ # Graph is ready (took 4.285428s), now starting the write to disk
11
+ #
12
+ # real 0m8.515s
13
+ # user 0m8.331s
14
+ # sys 0m0.181s
15
+ # ...
16
+ # /Users/peter_v/dbd/bin $ time ./test_6.rb t_5_100
17
+ # Graph is ready (took 14.455278s).
18
+ # graph.size is 100100
19
+ #
20
+ # real 0m14.922s
21
+ # user 0m14.728s
22
+ # sys 0m0.189s
23
+ #
24
+ # From version 0.0.13 with newline escaping, the times went up:
25
+ # writing (test_5)
26
+ # real 0m11.656s
27
+ #
28
+ # reading back (test_6)
29
+ # real 0m18.442s
4
30
 
5
31
  FACTS_PER_RESOURCE = 1000
6
32
 
@@ -28,7 +54,7 @@ graph = Dbd::Graph.new
28
54
 
29
55
  resource = Dbd::Resource.new(context_subject: context.subject)
30
56
  (0...FACTS_PER_RESOURCE).each do |j|
31
- resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 80} #{i * FACTS_PER_RESOURCE + j}")
57
+ resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 75} #{i * FACTS_PER_RESOURCE + j} \n CD")
32
58
  end
33
59
 
34
60
  graph << context << resource
@@ -1,6 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  # This implementation streams from disk
4
+ #
5
+ # See test_5.rb for usage and basic performance test
4
6
 
5
7
  filename = ARGV[0]
6
8
  unless filename
@@ -0,0 +1,19 @@
1
+ 014_escape_newlines_in_csv
2
+
3
+ As a user of the system
4
+ I can manipulate CSV files that are written by Dbd with basic
5
+ UNIX command line tools (e.g. split by dates, merge, ...)
6
+ And the facts in that format are aligned by a newline
7
+ So, the newlines in the String object need to be escaped
8
+
9
+ * minimal profiling of write + read speed
10
+ * escape on writing, suggest:
11
+ * "\n" => "\\n" (a back slash and a letter n)
12
+ "C:\nuby" => "C:\\nuby"
13
+ * do we need to escape a "real" backslash (or backslash + n) now ??
14
+
15
+ * escape on reading, suggest:
16
+ * "\\n" => "\n" (a newline)
17
+ * what about a backslash + n ?
18
+
19
+ * validate that write + read performance is not degraded too much
@@ -0,0 +1,10 @@
1
+ 015_change_order_of_fields
2
+
3
+ As a user of the system
4
+ When playing with the files
5
+ I want to sort on time_stamp if needed
6
+
7
+ * have time_stamp first
8
+ * then the ID
9
+ * then the context_subject
10
+ * then subject, predicate, object
@@ -140,9 +140,12 @@ module Dbd
140
140
 
141
141
  ##
142
142
  # @return [Array] The 6 values of a Fact converted to a string.
143
- # This is similar to the 6 entries in the to_CSV mapping
143
+ # The individual strings are escaped:
144
+ # * newlines are escaped to '\n', backslashes to '\\' (double backslash)
145
+ # This is used for the 6 entries in the to_CSV mapping.
146
+ #
144
147
  def string_values
145
- values.map(&:to_s)
148
+ values.map{ |value| escaped_string(value.to_s) }
146
149
  end
147
150
 
148
151
  ##
@@ -241,5 +244,11 @@ module Dbd
241
244
  end
242
245
  end
243
246
 
247
+ def escaped_string(string)
248
+ string.
249
+ gsub(%r{\\}, "\\\\\\\\"). # single \ => double \\
250
+ gsub(%r{\n}, '\n') # newline => \n
251
+ end
252
+
244
253
  end
245
254
  end
@@ -48,10 +48,23 @@ module Dbd
48
48
 
49
49
  private
50
50
 
51
+ def unescaped_string_values(string_values)
52
+ string_values.map{ |string_value| unescaped_string(string_value) }
53
+ end
54
+
55
+ def unescaped_string(string)
56
+ r = %r{(\\\\|\\n)}
57
+ repl = {
58
+ "\\\\" => "\\", # double backslash => single backslash
59
+ "\\n" => "\n"} # backslash n => newline
60
+ string.gsub(r, repl)
61
+ end
62
+
51
63
  def string_hash_from_values(string_values)
52
- attributes_strings_array = [top_class.attributes, string_values].transpose
64
+ unescaped_values = unescaped_string_values(string_values)
65
+ attributes_strings_array = [top_class.attributes, unescaped_values].transpose
53
66
  # Remove empty values (e.g. the context_subject for a ContextFact).
54
- attributes_strings_array.delete_if{|a,v| v.nil? || v == ''}
67
+ attributes_strings_array.delete_if{ |a, v| v == '' }
55
68
  Hash[attributes_strings_array]
56
69
  end
57
70
 
@@ -30,6 +30,9 @@ module Dbd
30
30
  ##
31
31
  # Export the graph to a CSV string
32
32
  #
33
+ # Newlines in the fields are escaped to "backslash n".
34
+ # Backslashes in the field are escape to "double backslash".
35
+ #
33
36
  # @return [String] comma separated string with double quoted cells
34
37
  def to_CSV
35
38
  CSV.generate(csv_defaults) do |csv|
@@ -40,6 +43,9 @@ module Dbd
40
43
  ##
41
44
  # Export the graph to a CSV file
42
45
  #
46
+ # Newlines in the fields are escaped to "backslash n".
47
+ # Backslashes in the fields are escaped to "double backslash".
48
+ #
43
49
  # @param [String] filename the filename to stream the CSV to
44
50
  def to_CSV_file(filename)
45
51
  CSV.open(filename, 'w', csv_defaults) do |csv|
@@ -50,6 +56,9 @@ module Dbd
50
56
  ##
51
57
  # Import a graph from a CSV IO stream
52
58
  #
59
+ # Tokens "backslash n" in the CSV fields will be unescaped to newlines.
60
+ # Tokens "double backslash" in the CSV fields will be unescaped to a single backslash.
61
+ #
53
62
  # @param [IO Stream] csv an IO Stream that contains the CSV serialization
54
63
  # @return [Graph] the imported graph
55
64
  def from_CSV(csv)
@@ -76,7 +85,7 @@ module Dbd
76
85
 
77
86
  def push_facts(target)
78
87
  @internal_collection.each do |fact|
79
- target << fact.values
88
+ target << fact.string_values
80
89
  end
81
90
  end
82
91
 
@@ -1,3 +1,3 @@
1
1
  module Dbd
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
@@ -39,6 +39,16 @@ module Dbd
39
39
  fact.string_values.should == string_values
40
40
  end
41
41
 
42
+ it 'converts a \n (backslash n, no newline) to newline' do
43
+ fact = described_class.from_string_values(string_values)
44
+ fact.object.should match(/\n/) # a newline
45
+ end
46
+
47
+ it 'converts a \\\\ (double backslash) into a single backslash' do
48
+ fact = described_class.from_string_values(string_values)
49
+ fact.object.should match(%r{[^\\]\\n}) # a single backslash followed by the letter n (not a newline)
50
+ end
51
+
42
52
  it 'calls validate_string_hash if options[:validate]' do
43
53
  described_class.should_receive(:validate_string_hash)
44
54
  described_class.from_string_values(string_values, validate: true)
@@ -61,11 +71,6 @@ module Dbd
61
71
  with_validation(string_values)
62
72
  end
63
73
 
64
- it 'for a nil context_subject (for context_facts)' do
65
- string_values[2] = nil
66
- with_validation(string_values)
67
- end
68
-
69
74
  it 'for an empty context_subject (for context_facts)' do
70
75
  string_values[2] = ''
71
76
  with_validation(string_values)
@@ -137,6 +137,10 @@ module Dbd
137
137
  it 'the second element (time_stamp) is a String' do
138
138
  full_fact.string_values[1].should be_a(String)
139
139
  end
140
+
141
+ it 'escapes a newline into \n and \n into \\\\n (two backslashes and a n)' do
142
+ full_fact.string_values[5].should == "Gandhi\\nKing\\\\n" # backslash newline
143
+ end
140
144
  end
141
145
 
142
146
  describe 'context_fact?' do
@@ -84,7 +84,7 @@ module Dbd
84
84
  resource << special_fact
85
85
  graph = described_class.new << resource
86
86
  csv = graph.to_CSV
87
- csv.should match(%r{A long story with a newline\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
87
+ csv.should match(%r{A long story with a newline\\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
88
88
  graph_from_CSV = described_class.new.from_CSV(csv)
89
89
  graph_from_CSV.first.should be_equivalent(graph.first)
90
90
  end
@@ -38,7 +38,7 @@ module Dbd
38
38
 
39
39
  describe 'with a single context_fact collection' do
40
40
  it 'has three logical lines (but one with embedded newline)' do
41
- subject.to_CSV.lines.count.should == 4
41
+ subject.to_CSV.lines.count.should == 3
42
42
  end
43
43
 
44
44
  it 'ends with a newline' do
@@ -79,7 +79,7 @@ module Dbd
79
79
 
80
80
  describe 'handles comma, double quote and newline correctly' do
81
81
  it 'has original_source with special characters and double quote escaped' do
82
- subject.to_CSV.should match(/"this has a comma , a newline \n and a double quote """/)
82
+ subject.to_CSV.should match(/"this has a comma , a newline \\n and a double quote """/)
83
83
  end
84
84
  end
85
85
  end
@@ -156,8 +156,8 @@ module Dbd
156
156
  end
157
157
  end
158
158
 
159
- it 'has six lines' do
160
- subject.to_CSV.lines.count.should == 6
159
+ it 'has 5 lines' do
160
+ subject.to_CSV.lines.count.should == 5
161
161
  end
162
162
  end
163
163
 
@@ -173,11 +173,11 @@ module Dbd
173
173
  subject << fact_special_characters
174
174
  end
175
175
 
176
- it 'has eight lines' do
176
+ it 'has six lines' do
177
177
  filename = 'data/foo.csv'
178
178
  subject.to_CSV_file(filename)
179
179
  File.open(filename) do |f|
180
- f.readlines.count.should == 8
180
+ f.readlines.count.should == 6
181
181
  end
182
182
  end
183
183
 
@@ -33,7 +33,7 @@ module Dbd
33
33
  duration = Time.now - start
34
34
  puts "\nDuration for inserting #{NUMBER_OF_FACTS} facts in the in-memory graph was #{duration*1000_000/NUMBER_OF_FACTS} us PER FACT"
35
35
  graph.size.should == 2 * NUMBER_OF_FACTS + 1
36
- duration.should < 0.000_15 * NUMBER_OF_FACTS
36
+ duration.should < 0.000_20 * NUMBER_OF_FACTS
37
37
  # typ. 37 us on Mac Ruby 2.0.0 (on 2013-05-15 over 15K iterations)
38
38
  # typ. 45 us on Mac Ruby 2.0.0 (on 2013-06-05 over 10K iterations)
39
39
  # typ. 47 us on Mac Ruby 2.0.0 (on 2013-06-21 over 10K iterations)
@@ -9,6 +9,9 @@ RSpec.configure do |config|
9
9
 
10
10
  config.order = 'random'
11
11
 
12
+ config.filter_run :focus => true
13
+ config.run_all_when_everything_filtered = true
14
+
12
15
  # exclude neo4j tests for now (not working on Travis)
13
16
  config.filter_run_excluding :neo4j => true
14
17
  config.filter_run_excluding :neo4j_performance => true
@@ -15,7 +15,7 @@ module TestFactories
15
15
  '40fab407-9b04-4a51-9a52-d978abfcbb1f',
16
16
  '2e9fbc87-2e94-47e9-a8fd-121cc4bc3e8f',
17
17
  'http://example.org/test/name',
18
- 'Gandhi']
18
+ "Gandhi\\nKing\\\\n"]
19
19
  end
20
20
 
21
21
  def self.fact_1(context_subject = nil)
@@ -29,7 +29,7 @@ module TestFactories
29
29
  factory_for.new(
30
30
  id: forced_id,
31
31
  predicate: 'http://example.org/test/name',
32
- object: 'Gandhi')
32
+ object: "Gandhi\nKing\\n") # newline and \n
33
33
  end
34
34
 
35
35
  def self.fact_with_special_chars(context_subject = nil, subject = nil)
@@ -76,7 +76,7 @@ module TestFactories
76
76
  context_subject: context_subject,
77
77
  subject: subject,
78
78
  predicate: 'http://example.org/test/name',
79
- object: 'European Union')
79
+ object: "\\n\n\\n\n\\\n\\\\\n\\\\\\\nEuropean\nUnion\\n")
80
80
  end
81
81
 
82
82
  def self.fact_with_newline(context_subject = nil, subject = nil)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dbd
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Vandenabeele
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-10 00:00:00.000000000 Z
11
+ date: 2013-08-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -179,6 +179,8 @@ files:
179
179
  - docs/stories/011_store_resource_in_graph.txt
180
180
  - docs/stories/012_provenance_fact_properties_from_provenance_ontology.txt
181
181
  - docs/stories/013_read_graph_from_CSV.txt
182
+ - docs/stories/014_escape_newlines_in_CSV.txt
183
+ - docs/stories/015_change_order_of_fields.txt
182
184
  - docs/test.rb
183
185
  - lib/dbd.rb
184
186
  - lib/dbd/context.rb