dbd 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/HISTORY.txt +9 -0
- data/bin/test_5.rb +27 -1
- data/bin/test_6.rb +2 -0
- data/docs/stories/014_escape_newlines_in_CSV.txt +19 -0
- data/docs/stories/015_change_order_of_fields.txt +10 -0
- data/lib/dbd/fact.rb +11 -2
- data/lib/dbd/fact/factory.rb +15 -2
- data/lib/dbd/graph.rb +10 -1
- data/lib/dbd/version.rb +1 -1
- data/spec/lib/dbd/fact/factory/factory_spec.rb +10 -5
- data/spec/lib/dbd/fact/methods_spec.rb +4 -0
- data/spec/lib/dbd/graph/from_csv_spec.rb +1 -1
- data/spec/lib/dbd/graph/to_csv_spec.rb +6 -6
- data/spec/lib/dbd/performance_spec.rb +1 -1
- data/spec/spec_helper.rb +3 -0
- data/spec/test_factories/fact.rb +3 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b783665f6d90bb2b8e08d8f30b16b5ad93839442
|
4
|
+
data.tar.gz: 74a917c72399200d7a4c1ac257d2a3490cd7c38d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f55e7abd315c57ebd50b4bdba532ef36fdd2b352c29e0d7ae37e1aba68adbff061ca895b491807ec2a8926bf7eb9a6c30761a5ae2270997d66f344c87163ecae
|
7
|
+
data.tar.gz: 7229d01d566e2455b76e64b6bc07fbb908212bebf95d9f1ca18ebe94da76a802aceab78c53646ada5004c35937d5580c8312862e1d46af464f0a6e2555100c87
|
data/.gitignore
CHANGED
data/HISTORY.txt
CHANGED
@@ -76,3 +76,12 @@
|
|
76
76
|
* ProvenanceResource => Context
|
77
77
|
* because the context of a fact is much more than the provenance
|
78
78
|
(also visibility, encryption, license, ...)
|
79
|
+
|
80
|
+
0.0.13 (7 Aug 2013)
|
81
|
+
======
|
82
|
+
|
83
|
+
* text inside quotes in the CSV format now has newlines and backslashes escaped
|
84
|
+
* newline => '\n'
|
85
|
+
* backslash => '\\\\' (double backslash)
|
86
|
+
=> now 1 fact is 1 "line" in the CSV file
|
87
|
+
(allowing file operations that are line oriented)
|
data/bin/test_5.rb
CHANGED
@@ -1,6 +1,32 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
# This implementation now streams to disk.
|
4
|
+
#
|
5
|
+
# Some performance (ruby 2.0 on MacBook Pro)
|
6
|
+
# /Users/peter_v/dbd/bin $ time ./test_5.rb 100 t_5_100
|
7
|
+
# added resource 0 to the graph
|
8
|
+
# ...
|
9
|
+
# added resource 99 to the graph
|
10
|
+
# Graph is ready (took 4.285428s), now starting the write to disk
|
11
|
+
#
|
12
|
+
# real 0m8.515s
|
13
|
+
# user 0m8.331s
|
14
|
+
# sys 0m0.181s
|
15
|
+
# ...
|
16
|
+
# /Users/peter_v/dbd/bin $ time ./test_6.rb t_5_100
|
17
|
+
# Graph is ready (took 14.455278s).
|
18
|
+
# graph.size is 100100
|
19
|
+
#
|
20
|
+
# real 0m14.922s
|
21
|
+
# user 0m14.728s
|
22
|
+
# sys 0m0.189s
|
23
|
+
#
|
24
|
+
# From version 0.0.13 with newline escaping, the times went up:
|
25
|
+
# writing (test_5)
|
26
|
+
# real 0m11.656s
|
27
|
+
#
|
28
|
+
# reading back (test_6)
|
29
|
+
# real 0m18.442s
|
4
30
|
|
5
31
|
FACTS_PER_RESOURCE = 1000
|
6
32
|
|
@@ -28,7 +54,7 @@ graph = Dbd::Graph.new
|
|
28
54
|
|
29
55
|
resource = Dbd::Resource.new(context_subject: context.subject)
|
30
56
|
(0...FACTS_PER_RESOURCE).each do |j|
|
31
|
-
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' *
|
57
|
+
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 75} #{i * FACTS_PER_RESOURCE + j} \n CD")
|
32
58
|
end
|
33
59
|
|
34
60
|
graph << context << resource
|
data/bin/test_6.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
014_escape_newlines_in_csv
|
2
|
+
|
3
|
+
As a user of the system
|
4
|
+
I can manipulate CSV files that are written by Dbd with basic
|
5
|
+
UNIX command line tools (e.g. split by dates, merge, ...)
|
6
|
+
And the facts in that format are aligned by a newline
|
7
|
+
So, the newlines in the String object need to be escaped
|
8
|
+
|
9
|
+
* minimal profiling of write + read speed
|
10
|
+
* escape on writing, suggest:
|
11
|
+
* "\n" => "\\n" (a backslash and a letter n)
|
12
|
+
"C:\nuby" => "C:\\nuby"
|
13
|
+
* do we need to escape a "real" slash (or slash + n) now ??
|
14
|
+
|
15
|
+
* escape on reading, suggest:
|
16
|
+
* "\\n" => "\n" (a newline)
|
17
|
+
* what with a slash + n ?
|
18
|
+
|
19
|
+
* validate that write + read performance is not degraded too much
|
data/lib/dbd/fact.rb
CHANGED
@@ -140,9 +140,12 @@ module Dbd
|
|
140
140
|
|
141
141
|
##
|
142
142
|
# @return [Array] The 6 values of a Fact converted to a string.
|
143
|
-
#
|
143
|
+
# The individual strings are escaped:
|
144
|
+
# * newlines are escaped to '\n' (and backslashes are doubled)
|
145
|
+
# This is used for the 6 entries in the to_CSV mapping.
|
146
|
+
#
|
144
147
|
def string_values
|
145
|
-
values.map(
|
148
|
+
values.map{ |value| escaped_string(value.to_s) }
|
146
149
|
end
|
147
150
|
|
148
151
|
##
|
@@ -241,5 +244,11 @@ module Dbd
|
|
241
244
|
end
|
242
245
|
end
|
243
246
|
|
247
|
+
def escaped_string(string)
|
248
|
+
string.
|
249
|
+
gsub(%r{\\}, "\\\\\\\\"). # single \ => double \\
|
250
|
+
gsub(%r{\n}, '\n') # newline => \n
|
251
|
+
end
|
252
|
+
|
244
253
|
end
|
245
254
|
end
|
data/lib/dbd/fact/factory.rb
CHANGED
@@ -48,10 +48,23 @@ module Dbd
|
|
48
48
|
|
49
49
|
private
|
50
50
|
|
51
|
+
def unescaped_string_values(string_values)
|
52
|
+
string_values.map{ |string_value| unescaped_string(string_value) }
|
53
|
+
end
|
54
|
+
|
55
|
+
def unescaped_string(string)
|
56
|
+
r = %r{(\\\\|\\n)}
|
57
|
+
repl = {
|
58
|
+
"\\\\" => "\\", # double backslash => single backslash
|
59
|
+
"\\n" => "\n"} # backslash n => newline
|
60
|
+
string.gsub(r, repl)
|
61
|
+
end
|
62
|
+
|
51
63
|
def string_hash_from_values(string_values)
|
52
|
-
|
64
|
+
unescaped_values = unescaped_string_values(string_values)
|
65
|
+
attributes_strings_array = [top_class.attributes, unescaped_values].transpose
|
53
66
|
# Remove empty values (e.g. the context_subject for a ContextFact).
|
54
|
-
attributes_strings_array.delete_if{|a,v| v
|
67
|
+
attributes_strings_array.delete_if{ |a, v| v == '' }
|
55
68
|
Hash[attributes_strings_array]
|
56
69
|
end
|
57
70
|
|
data/lib/dbd/graph.rb
CHANGED
@@ -30,6 +30,9 @@ module Dbd
|
|
30
30
|
##
|
31
31
|
# Export the graph to a CSV string
|
32
32
|
#
|
33
|
+
# Newlines in the fields are escaped to "backslash n".
|
34
|
+
# Backslashes in the field are escaped to "double backslash".
|
35
|
+
#
|
33
36
|
# @return [String] comma separated string with double quoted cells
|
34
37
|
def to_CSV
|
35
38
|
CSV.generate(csv_defaults) do |csv|
|
@@ -40,6 +43,9 @@ module Dbd
|
|
40
43
|
##
|
41
44
|
# Export the graph to a CSV file
|
42
45
|
#
|
46
|
+
# Newlines in the fields are escaped to "backslash n".
|
47
|
+
# Backslashes in the field are escaped to "double backslash".
|
48
|
+
#
|
43
49
|
# @param [String] filename the filename to stream the CSV to
|
44
50
|
def to_CSV_file(filename)
|
45
51
|
CSV.open(filename, 'w', csv_defaults) do |csv|
|
@@ -50,6 +56,9 @@ module Dbd
|
|
50
56
|
##
|
51
57
|
# Import a graph from a CSV IO stream
|
52
58
|
#
|
59
|
+
# Tokens "backslash n" in the CSV fields will be unescaped to newlines.
|
60
|
+
# Tokens "double backslash" in the CSV fields will be unescaped to a single backslash.
|
61
|
+
#
|
53
62
|
# @param [IO Stream] csv an IO Stream that contains the CSV serialization
|
54
63
|
# @return [Graph] the imported graph
|
55
64
|
def from_CSV(csv)
|
@@ -76,7 +85,7 @@ module Dbd
|
|
76
85
|
|
77
86
|
def push_facts(target)
|
78
87
|
@internal_collection.each do |fact|
|
79
|
-
target << fact.
|
88
|
+
target << fact.string_values
|
80
89
|
end
|
81
90
|
end
|
82
91
|
|
data/lib/dbd/version.rb
CHANGED
@@ -39,6 +39,16 @@ module Dbd
|
|
39
39
|
fact.string_values.should == string_values
|
40
40
|
end
|
41
41
|
|
42
|
+
it 'converts a \n (backslash n, no newline) to newline' do
|
43
|
+
fact = described_class.from_string_values(string_values)
|
44
|
+
fact.object.should match(/\n/) # a newline
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'converts a \\\\ (double backslash) into a single backslash' do
|
48
|
+
fact = described_class.from_string_values(string_values)
|
49
|
+
fact.object.should match(%r{[^\\]\\n}) # a backslash + newline
|
50
|
+
end
|
51
|
+
|
42
52
|
it 'calls validate_string_hash if options[:validate]' do
|
43
53
|
described_class.should_receive(:validate_string_hash)
|
44
54
|
described_class.from_string_values(string_values, validate: true)
|
@@ -61,11 +71,6 @@ module Dbd
|
|
61
71
|
with_validation(string_values)
|
62
72
|
end
|
63
73
|
|
64
|
-
it 'for a nil context_subject (for context_facts)' do
|
65
|
-
string_values[2] = nil
|
66
|
-
with_validation(string_values)
|
67
|
-
end
|
68
|
-
|
69
74
|
it 'for an empty context_subject (for context_facts)' do
|
70
75
|
string_values[2] = ''
|
71
76
|
with_validation(string_values)
|
@@ -137,6 +137,10 @@ module Dbd
|
|
137
137
|
it 'the second element (time_stamp) is a String' do
|
138
138
|
full_fact.string_values[1].should be_a(String)
|
139
139
|
end
|
140
|
+
|
141
|
+
it 'escapes a newline into \n and \n into \\\\n (two backslashes and a n)' do
|
142
|
+
full_fact.string_values[5].should == "Gandhi\\nKing\\\\n" # backslash newline
|
143
|
+
end
|
140
144
|
end
|
141
145
|
|
142
146
|
describe 'context_fact?' do
|
@@ -84,7 +84,7 @@ module Dbd
|
|
84
84
|
resource << special_fact
|
85
85
|
graph = described_class.new << resource
|
86
86
|
csv = graph.to_CSV
|
87
|
-
csv.should match(%r{A long story with a newline
|
87
|
+
csv.should match(%r{A long story with a newline\\nreally with a comma, a double quote "" and a non-ASCII char éà Über.})
|
88
88
|
graph_from_CSV = described_class.new.from_CSV(csv)
|
89
89
|
graph_from_CSV.first.should be_equivalent(graph.first)
|
90
90
|
end
|
@@ -38,7 +38,7 @@ module Dbd
|
|
38
38
|
|
39
39
|
describe 'with a single context_fact collection' do
|
40
40
|
it 'has three logical lines (but one with embedded newline)' do
|
41
|
-
subject.to_CSV.lines.count.should ==
|
41
|
+
subject.to_CSV.lines.count.should == 3
|
42
42
|
end
|
43
43
|
|
44
44
|
it 'ends with a newline' do
|
@@ -79,7 +79,7 @@ module Dbd
|
|
79
79
|
|
80
80
|
describe 'handles comma, double quote and newline correctly' do
|
81
81
|
it 'has original_source with special characters and double quote escaped' do
|
82
|
-
subject.to_CSV.should match(/"this has a comma , a newline
|
82
|
+
subject.to_CSV.should match(/"this has a comma , a newline \\n and a double quote """/)
|
83
83
|
end
|
84
84
|
end
|
85
85
|
end
|
@@ -156,8 +156,8 @@ module Dbd
|
|
156
156
|
end
|
157
157
|
end
|
158
158
|
|
159
|
-
it 'has
|
160
|
-
subject.to_CSV.lines.count.should ==
|
159
|
+
it 'has 5 lines' do
|
160
|
+
subject.to_CSV.lines.count.should == 5
|
161
161
|
end
|
162
162
|
end
|
163
163
|
|
@@ -173,11 +173,11 @@ module Dbd
|
|
173
173
|
subject << fact_special_characters
|
174
174
|
end
|
175
175
|
|
176
|
-
it 'has
|
176
|
+
it 'has six lines' do
|
177
177
|
filename = 'data/foo.csv'
|
178
178
|
subject.to_CSV_file(filename)
|
179
179
|
File.open(filename) do |f|
|
180
|
-
f.readlines.count.should ==
|
180
|
+
f.readlines.count.should == 6
|
181
181
|
end
|
182
182
|
end
|
183
183
|
|
@@ -33,7 +33,7 @@ module Dbd
|
|
33
33
|
duration = Time.now - start
|
34
34
|
puts "\nDuration for inserting #{NUMBER_OF_FACTS} facts in the in-memory graph was #{duration*1000_000/NUMBER_OF_FACTS} us PER FACT"
|
35
35
|
graph.size.should == 2 * NUMBER_OF_FACTS + 1
|
36
|
-
duration.should < 0.
|
36
|
+
duration.should < 0.000_20 * NUMBER_OF_FACTS
|
37
37
|
# typ. 37 us on Mac Ruby 2.0.0 (on 2013-05-15 over 15K iterations)
|
38
38
|
# typ. 45 us on Mac Ruby 2.0.0 (on 2013-06-05 over 10K iterations)
|
39
39
|
# typ. 47 us on Mac Ruby 2.0.0 (on 2013-06-21 over 10K iterations)
|
data/spec/spec_helper.rb
CHANGED
@@ -9,6 +9,9 @@ RSpec.configure do |config|
|
|
9
9
|
|
10
10
|
config.order = 'random'
|
11
11
|
|
12
|
+
config.filter_run :focus => true
|
13
|
+
config.run_all_when_everything_filtered = true
|
14
|
+
|
12
15
|
# exclude neo4j tests for now (not working on Travis)
|
13
16
|
config.filter_run_excluding :neo4j => true
|
14
17
|
config.filter_run_excluding :neo4j_performance => true
|
data/spec/test_factories/fact.rb
CHANGED
@@ -15,7 +15,7 @@ module TestFactories
|
|
15
15
|
'40fab407-9b04-4a51-9a52-d978abfcbb1f',
|
16
16
|
'2e9fbc87-2e94-47e9-a8fd-121cc4bc3e8f',
|
17
17
|
'http://example.org/test/name',
|
18
|
-
|
18
|
+
"Gandhi\\nKing\\\\n"]
|
19
19
|
end
|
20
20
|
|
21
21
|
def self.fact_1(context_subject = nil)
|
@@ -29,7 +29,7 @@ module TestFactories
|
|
29
29
|
factory_for.new(
|
30
30
|
id: forced_id,
|
31
31
|
predicate: 'http://example.org/test/name',
|
32
|
-
object:
|
32
|
+
object: "Gandhi\nKing\\n") # newline and \n
|
33
33
|
end
|
34
34
|
|
35
35
|
def self.fact_with_special_chars(context_subject = nil, subject = nil)
|
@@ -76,7 +76,7 @@ module TestFactories
|
|
76
76
|
context_subject: context_subject,
|
77
77
|
subject: subject,
|
78
78
|
predicate: 'http://example.org/test/name',
|
79
|
-
object:
|
79
|
+
object: "\\n\n\\n\n\\\n\\\\\n\\\\\\\nEuropean\nUnion\\n")
|
80
80
|
end
|
81
81
|
|
82
82
|
def self.fact_with_newline(context_subject = nil, subject = nil)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dbd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Vandenabeele
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07
|
11
|
+
date: 2013-08-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -179,6 +179,8 @@ files:
|
|
179
179
|
- docs/stories/011_store_resource_in_graph.txt
|
180
180
|
- docs/stories/012_provenance_fact_properties_from_provenance_ontology.txt
|
181
181
|
- docs/stories/013_read_graph_from_CSV.txt
|
182
|
+
- docs/stories/014_escape_newlines_in_CSV.txt
|
183
|
+
- docs/stories/015_change_order_of_fields.txt
|
182
184
|
- docs/test.rb
|
183
185
|
- lib/dbd.rb
|
184
186
|
- lib/dbd/context.rb
|