dbd 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/HISTORY.txt +9 -0
- data/bin/test_5.rb +27 -1
- data/bin/test_6.rb +2 -0
- data/docs/stories/014_escape_newlines_in_CSV.txt +19 -0
- data/docs/stories/015_change_order_of_fields.txt +10 -0
- data/lib/dbd/fact.rb +11 -2
- data/lib/dbd/fact/factory.rb +15 -2
- data/lib/dbd/graph.rb +10 -1
- data/lib/dbd/version.rb +1 -1
- data/spec/lib/dbd/fact/factory/factory_spec.rb +10 -5
- data/spec/lib/dbd/fact/methods_spec.rb +4 -0
- data/spec/lib/dbd/graph/from_csv_spec.rb +1 -1
- data/spec/lib/dbd/graph/to_csv_spec.rb +6 -6
- data/spec/lib/dbd/performance_spec.rb +1 -1
- data/spec/spec_helper.rb +3 -0
- data/spec/test_factories/fact.rb +3 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b783665f6d90bb2b8e08d8f30b16b5ad93839442
|
4
|
+
data.tar.gz: 74a917c72399200d7a4c1ac257d2a3490cd7c38d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f55e7abd315c57ebd50b4bdba532ef36fdd2b352c29e0d7ae37e1aba68adbff061ca895b491807ec2a8926bf7eb9a6c30761a5ae2270997d66f344c87163ecae
|
7
|
+
data.tar.gz: 7229d01d566e2455b76e64b6bc07fbb908212bebf95d9f1ca18ebe94da76a802aceab78c53646ada5004c35937d5580c8312862e1d46af464f0a6e2555100c87
|
data/.gitignore
CHANGED
data/HISTORY.txt
CHANGED
@@ -76,3 +76,12 @@
|
|
76
76
|
* ProvenanceResource => Context
|
77
77
|
* because the context of a fact is much more than the provenance
|
78
78
|
(also visibility, encryption, license, ...)
|
79
|
+
|
80
|
+
0.0.13 (7 Aug 2013)
|
81
|
+
======
|
82
|
+
|
83
|
+
* text inside quotes in the CSV format now has newlines and backslashes escaped
|
84
|
+
* newline => '\n'
|
85
|
+
* backslash => '\\\\' (double backslash)
|
86
|
+
=> now 1 fact is 1 "line" in the CSV file
|
87
|
+
(allowing file operations that are line oriented)
|
data/bin/test_5.rb
CHANGED
@@ -1,6 +1,32 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
# This implementation now streams to disk.
|
4
|
+
#
|
5
|
+
# Some performance (ruby 2.0 on MacBook Pro)
|
6
|
+
# /Users/peter_v/dbd/bin $ time ./test_5.rb 100 t_5_100
|
7
|
+
# added resource 0 to the graph
|
8
|
+
# ...
|
9
|
+
# added resource 99 to the graph
|
10
|
+
# Graph is ready (took 4.285428s), now starting the write to disk
|
11
|
+
#
|
12
|
+
# real 0m8.515s
|
13
|
+
# user 0m8.331s
|
14
|
+
# sys 0m0.181s
|
15
|
+
# ...
|
16
|
+
# /Users/peter_v/dbd/bin $ time ./test_6.rb t_5_100
|
17
|
+
# Graph is ready (took 14.455278s).
|
18
|
+
# graph.size is 100100
|
19
|
+
#
|
20
|
+
# real 0m14.922s
|
21
|
+
# user 0m14.728s
|
22
|
+
# sys 0m0.189s
|
23
|
+
#
|
24
|
+
# From version 0.0.13 with newline escaping, the times went up:
|
25
|
+
# writing (test_5)
|
26
|
+
# real 0m11.656s
|
27
|
+
#
|
28
|
+
# reading back (test_6)
|
29
|
+
# real 0m18.442s
|
4
30
|
|
5
31
|
FACTS_PER_RESOURCE = 1000
|
6
32
|
|
@@ -28,7 +54,7 @@ graph = Dbd::Graph.new
|
|
28
54
|
|
29
55
|
resource = Dbd::Resource.new(context_subject: context.subject)
|
30
56
|
(0...FACTS_PER_RESOURCE).each do |j|
|
31
|
-
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' *
|
57
|
+
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 75} #{i * FACTS_PER_RESOURCE + j} \n CD")
|
32
58
|
end
|
33
59
|
|
34
60
|
graph << context << resource
|
data/bin/test_6.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
014_escape_newlines_in_csv
|
2
|
+
|
3
|
+
As a user of the system
|
4
|
+
I can manipulate CSV files that are written by Dbd with basic
|
5
|
+
UNIX command line tools (e.g. split by dates, merge, ...)
|
6
|
+
And the facts in that format are aligned by a newline
|
7
|
+
So, the newlines in the String object need to be escaped
|
8
|
+
|
9
|
+
* minimal profiling of write + read speed
|
10
|
+
* escape on writing, suggest:
|
11
|
+
* "\n" => "\\n" (a backslash and a letter n)
|
12
|
+
"C:\nuby" => "C:\\nuby"
|
13
|
+
* do we need to escape a "real" slash (or slash + n) now ??
|
14
|
+
|
15
|
+
* escape on reading, suggest:
|
16
|
+
* "\\n" => "\n" (a newline)
|
17
|
+
* what with a slash + n ?
|
18
|
+
|
19
|
+
* validate that write + read performance is not degraded too much
|
data/lib/dbd/fact.rb
CHANGED
@@ -140,9 +140,12 @@ module Dbd
|
|
140
140
|
|
141
141
|
##
|
142
142
|
# @return [Array] The 6 values of a Fact converted to a string.
|
143
|
-
#
|
143
|
+
# The individual strings are escaped:
|
144
|
+
# * newlines are escaped to '\n'
|
145
|
+
# This is used for the 6 entries in the to_CSV mapping.
|
146
|
+
#
|
144
147
|
def string_values
|
145
|
-
values.map(
|
148
|
+
values.map{ |value| escaped_string(value.to_s) }
|
146
149
|
end
|
147
150
|
|
148
151
|
##
|
@@ -241,5 +244,11 @@ module Dbd
|
|
241
244
|
end
|
242
245
|
end
|
243
246
|
|
247
|
+
def escaped_string(string)
|
248
|
+
string.
|
249
|
+
gsub(%r{\\}, "\\\\\\\\"). # single \ => double \\
|
250
|
+
gsub(%r{\n}, '\n') # newline => \n
|
251
|
+
end
|
252
|
+
|
244
253
|
end
|
245
254
|
end
|
data/lib/dbd/fact/factory.rb
CHANGED
@@ -48,10 +48,23 @@ module Dbd
|
|
48
48
|
|
49
49
|
private
|
50
50
|
|
51
|
+
def unescaped_string_values(string_values)
|
52
|
+
string_values.map{ |string_value| unescaped_string(string_value) }
|
53
|
+
end
|
54
|
+
|
55
|
+
def unescaped_string(string)
|
56
|
+
r = %r{(\\\\|\\n)}
|
57
|
+
repl = {
|
58
|
+
"\\\\" => "\\", # double backslash => single backslash
|
59
|
+
"\\n" => "\n"} # backslash n => newline
|
60
|
+
string.gsub(r, repl)
|
61
|
+
end
|
62
|
+
|
51
63
|
def string_hash_from_values(string_values)
|
52
|
-
|
64
|
+
unescaped_values = unescaped_string_values(string_values)
|
65
|
+
attributes_strings_array = [top_class.attributes, unescaped_values].transpose
|
53
66
|
# Remove empty values (e.g. the context_subject for a ContextFact).
|
54
|
-
attributes_strings_array.delete_if{|a,v| v
|
67
|
+
attributes_strings_array.delete_if{ |a, v| v == '' }
|
55
68
|
Hash[attributes_strings_array]
|
56
69
|
end
|
57
70
|
|
data/lib/dbd/graph.rb
CHANGED
@@ -30,6 +30,9 @@ module Dbd
|
|
30
30
|
##
|
31
31
|
# Export the graph to a CSV string
|
32
32
|
#
|
33
|
+
# Newlines in the fields are escaped to "backslash n".
|
34
|
+
# Backslashes in the field are escaped to "double backslash".
|
35
|
+
#
|
33
36
|
# @return [String] comma separated string with double quoted cells
|
34
37
|
def to_CSV
|
35
38
|
CSV.generate(csv_defaults) do |csv|
|
@@ -40,6 +43,9 @@ module Dbd
|
|
40
43
|
##
|
41
44
|
# Export the graph to a CSV file
|
42
45
|
#
|
46
|
+
# Newlines in the fields are escaped to "backslash n".
|
47
|
+
# Backslashes in the field are escaped to "double backslash".
|
48
|
+
#
|
43
49
|
# @param [String] filename the filename to stream the CSV to
|
44
50
|
def to_CSV_file(filename)
|
45
51
|
CSV.open(filename, 'w', csv_defaults) do |csv|
|
@@ -50,6 +56,9 @@ module Dbd
|
|
50
56
|
##
|
51
57
|
# Import a graph from a CSV IO stream
|
52
58
|
#
|
59
|
+
# Tokens "backslash n" in the CSV fields will be unescaped to newlines.
|
60
|
+
# Tokens "double backslash" in the CSV fields will be unescaped to single backslash
|
61
|
+
#
|
53
62
|
# @param [IO Stream] csv an IO Stream that contains the CSV serialization
|
54
63
|
# @return [Graph] the imported graph
|
55
64
|
def from_CSV(csv)
|
@@ -76,7 +85,7 @@ module Dbd
|
|
76
85
|
|
77
86
|
def push_facts(target)
|
78
87
|
@internal_collection.each do |fact|
|
79
|
-
target << fact.
|
88
|
+
target << fact.string_values
|
80
89
|
end
|
81
90
|
end
|
82
91
|
|
data/lib/dbd/version.rb
CHANGED
@@ -39,6 +39,16 @@ module Dbd
|
|
39
39
|
fact.string_values.should == string_values
|
40
40
|
end
|
41
41
|
|
42
|
+
it 'converts a \n (backslash n, no newline) to newline' do
|
43
|
+
fact = described_class.from_string_values(string_values)
|
44
|
+
fact.object.should match(/\n/) # a newline
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'converts a \\\\ (double backslash) into a single backslash' do
|
48
|
+
fact = described_class.from_string_values(string_values)
|
49
|
+
fact.object.should match(%r{[^\\]\\n}) # a backslash + newline
|
50
|
+
end
|
51
|
+
|
42
52
|
it 'calls validate_string_hash if options[:validate]' do
|
43
53
|
described_class.should_receive(:validate_string_hash)
|
44
54
|
described_class.from_string_values(string_values, validate: true)
|
@@ -61,11 +71,6 @@ module Dbd
|
|
61
71
|
with_validation(string_values)
|
62
72
|
end
|
63
73
|
|
64
|
-
it 'for a nil context_subject (for context_facts)' do
|
65
|
-
string_values[2] = nil
|
66
|
-
with_validation(string_values)
|
67
|
-
end
|
68
|
-
|
69
74
|
it 'for an empty context_subject (for context_facts)' do
|
70
75
|
string_values[2] = ''
|
71
76
|
with_validation(string_values)
|
@@ -137,6 +137,10 @@ module Dbd
|
|
137
137
|
it 'the second element (time_stamp) is a String' do
|
138
138
|
full_fact.string_values[1].should be_a(String)
|
139
139
|
end
|
140
|
+
|
141
|
+
it 'escapes a newline into \n and \n into \\\\n (two backslashes and a n)' do
|
142
|
+
full_fact.string_values[5].should == "Gandhi\\nKing\\\\n" # backslash newline
|
143
|
+
end
|
140
144
|
end
|
141
145
|
|
142
146
|
describe 'context_fact?' do
|
@@ -84,7 +84,7 @@ module Dbd
|
|
84
84
|
resource << special_fact
|
85
85
|
graph = described_class.new << resource
|
86
86
|
csv = graph.to_CSV
|
87
|
-
csv.should match(%r{A long story with a newline
|
87
|
+
csv.should match(%r{A long story with a newline\\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
|
88
88
|
graph_from_CSV = described_class.new.from_CSV(csv)
|
89
89
|
graph_from_CSV.first.should be_equivalent(graph.first)
|
90
90
|
end
|
@@ -38,7 +38,7 @@ module Dbd
|
|
38
38
|
|
39
39
|
describe 'with a single context_fact collection' do
|
40
40
|
it 'has three logical lines (but one with embedded newline)' do
|
41
|
-
subject.to_CSV.lines.count.should ==
|
41
|
+
subject.to_CSV.lines.count.should == 3
|
42
42
|
end
|
43
43
|
|
44
44
|
it 'ends with a newline' do
|
@@ -79,7 +79,7 @@ module Dbd
|
|
79
79
|
|
80
80
|
describe 'handles comma, double quote and newline correctly' do
|
81
81
|
it 'has original_source with special characters and double quote escaped' do
|
82
|
-
subject.to_CSV.should match(/"this has a comma , a newline
|
82
|
+
subject.to_CSV.should match(/"this has a comma , a newline \\n and a double quote """/)
|
83
83
|
end
|
84
84
|
end
|
85
85
|
end
|
@@ -156,8 +156,8 @@ module Dbd
|
|
156
156
|
end
|
157
157
|
end
|
158
158
|
|
159
|
-
it 'has
|
160
|
-
subject.to_CSV.lines.count.should ==
|
159
|
+
it 'has 5 lines' do
|
160
|
+
subject.to_CSV.lines.count.should == 5
|
161
161
|
end
|
162
162
|
end
|
163
163
|
|
@@ -173,11 +173,11 @@ module Dbd
|
|
173
173
|
subject << fact_special_characters
|
174
174
|
end
|
175
175
|
|
176
|
-
it 'has
|
176
|
+
it 'has six lines' do
|
177
177
|
filename = 'data/foo.csv'
|
178
178
|
subject.to_CSV_file(filename)
|
179
179
|
File.open(filename) do |f|
|
180
|
-
f.readlines.count.should ==
|
180
|
+
f.readlines.count.should == 6
|
181
181
|
end
|
182
182
|
end
|
183
183
|
|
@@ -33,7 +33,7 @@ module Dbd
|
|
33
33
|
duration = Time.now - start
|
34
34
|
puts "\nDuration for inserting #{NUMBER_OF_FACTS} facts in the in-memory graph was #{duration*1000_000/NUMBER_OF_FACTS} us PER FACT"
|
35
35
|
graph.size.should == 2 * NUMBER_OF_FACTS + 1
|
36
|
-
duration.should < 0.
|
36
|
+
duration.should < 0.000_20 * NUMBER_OF_FACTS
|
37
37
|
# typ. 37 us on Mac Ruby 2.0.0 (on 2013-05-15 over 15K iterations)
|
38
38
|
# typ. 45 us on Mac Ruby 2.0.0 (on 2013-06-05 over 10K iterations)
|
39
39
|
# typ. 47 us on Mac Ruby 2.0.0 (on 2013-06-21 over 10K iterations)
|
data/spec/spec_helper.rb
CHANGED
@@ -9,6 +9,9 @@ RSpec.configure do |config|
|
|
9
9
|
|
10
10
|
config.order = 'random'
|
11
11
|
|
12
|
+
config.filter_run :focus => true
|
13
|
+
config.run_all_when_everything_filtered = true
|
14
|
+
|
12
15
|
# exclude neo4j tests for now (not working on Travis)
|
13
16
|
config.filter_run_excluding :neo4j => true
|
14
17
|
config.filter_run_excluding :neo4j_performance => true
|
data/spec/test_factories/fact.rb
CHANGED
@@ -15,7 +15,7 @@ module TestFactories
|
|
15
15
|
'40fab407-9b04-4a51-9a52-d978abfcbb1f',
|
16
16
|
'2e9fbc87-2e94-47e9-a8fd-121cc4bc3e8f',
|
17
17
|
'http://example.org/test/name',
|
18
|
-
|
18
|
+
"Gandhi\\nKing\\\\n"]
|
19
19
|
end
|
20
20
|
|
21
21
|
def self.fact_1(context_subject = nil)
|
@@ -29,7 +29,7 @@ module TestFactories
|
|
29
29
|
factory_for.new(
|
30
30
|
id: forced_id,
|
31
31
|
predicate: 'http://example.org/test/name',
|
32
|
-
object:
|
32
|
+
object: "Gandhi\nKing\\n") # newline and \n
|
33
33
|
end
|
34
34
|
|
35
35
|
def self.fact_with_special_chars(context_subject = nil, subject = nil)
|
@@ -76,7 +76,7 @@ module TestFactories
|
|
76
76
|
context_subject: context_subject,
|
77
77
|
subject: subject,
|
78
78
|
predicate: 'http://example.org/test/name',
|
79
|
-
object:
|
79
|
+
object: "\\n\n\\n\n\\\n\\\\\n\\\\\\\nEuropean\nUnion\\n")
|
80
80
|
end
|
81
81
|
|
82
82
|
def self.fact_with_newline(context_subject = nil, subject = nil)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dbd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Vandenabeele
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07
|
11
|
+
date: 2013-08-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -179,6 +179,8 @@ files:
|
|
179
179
|
- docs/stories/011_store_resource_in_graph.txt
|
180
180
|
- docs/stories/012_provenance_fact_properties_from_provenance_ontology.txt
|
181
181
|
- docs/stories/013_read_graph_from_CSV.txt
|
182
|
+
- docs/stories/014_escape_newlines_in_CSV.txt
|
183
|
+
- docs/stories/015_change_order_of_fields.txt
|
182
184
|
- docs/test.rb
|
183
185
|
- lib/dbd.rb
|
184
186
|
- lib/dbd/context.rb
|