dbd 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +3 -1
- data/HISTORY.txt +7 -0
- data/README.md +20 -1
- data/bin/test_1.rb +1 -1
- data/bin/test_3.rb +11 -0
- data/bin/test_4.rb +32 -0
- data/bin/test_5.rb +40 -0
- data/bin/test_6.rb +21 -0
- data/data/.gitkeep +0 -0
- data/lib/dbd/fact.rb +1 -1
- data/lib/dbd/graph.rb +26 -7
- data/lib/dbd/helpers/uuid.rb +2 -2
- data/lib/dbd/version.rb +1 -1
- data/spec/factories/provenance_fact.rb +1 -0
- data/spec/lib/dbd/graph/to_csv_spec.rb +31 -0
- data/spec/lib/dbd/helpers/uuid/uuid_spec.rb +4 -0
- metadata +11 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b21e77e8316f18a011e2356b79a28a35e5dde7d
|
4
|
+
data.tar.gz: d8a0adeebbc7311a512ef08bfbc63902e16f2159
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c8e9c42c082ad4bdc8c6af283c9a73fad3c35a75e5ebace47aefbbe127cb0ee001ef066d17ef6e9b253dcd2a6311ccbb520810bdb061ab233c77950be7a99542
|
7
|
+
data.tar.gz: a63bb4462ebdc8fa4a6c2e0f3a1513146f7aac1e85a9d748c3126f5352464a613ee9bf956e27e0fad06edc4816e7226e78a37a1c5218196e6ea0f4ccf13db5e5
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/HISTORY.txt
CHANGED
@@ -43,3 +43,10 @@
|
|
43
43
|
|
44
44
|
* bin/test_1.rb was used for first successful writing of 10M facts using
|
45
45
|
ruby-1.9.3-p429 (will report issues on ruby-2.0.0-p195 and jruby-1.7.4)
|
46
|
+
|
47
|
+
0.0.9 (30 June 2013)
|
48
|
+
=====
|
49
|
+
|
50
|
+
* new function graph#to_CSV_file
|
51
|
+
* bin/test_5.rb was used to write 10M facts using ruby-2.0.0, 1.9.3 and jruby-1.7.4
|
52
|
+
* jruby is 3 times faster, but has 10% more memory consumption
|
data/README.md
CHANGED
@@ -44,7 +44,7 @@ Open Source [MIT]
|
|
44
44
|
|
45
45
|
## Installation
|
46
46
|
|
47
|
-
$ gem install dbd # Ruby 1.9.
|
47
|
+
$ gem install dbd # Ruby 1.9.3, 2.0.x, jruby (see .travis.yml)
|
48
48
|
|
49
49
|
## Examples
|
50
50
|
|
@@ -118,6 +118,25 @@ puts imported_graph.map(&:short)
|
|
118
118
|
# 5eb1ea27 : 3767c493 : todo:story : A long period of peace,_ that is a "bliss".
|
119
119
|
```
|
120
120
|
|
121
|
+
## Performance tests on 10 M facts
|
122
|
+
|
123
|
+
In version 0.0.9 a number of test programs were added (e.g. ../bin/test_5.rb)
|
124
|
+
that were used to populate in memory and write to disk a data set with 10 M facts.
|
125
|
+
|
126
|
+
This function was tested on ruby-2.0.0, ruby-1.9.3 and jruby-1.7.4. The facts
|
127
|
+
had an approximate size of 250 Bytes each (80 Bytes object).
|
128
|
+
|
129
|
+
The time needed and memory size (RSS) for populating the in-memory dataset was:
|
130
|
+
|
131
|
+
10 M facts (of 250 Bytes; 2.5 GB net data):
|
132
|
+
|
133
|
+
| ruby | time | memory (RSS) |
|
134
|
+
|------------|-------------| ------------:|
|
135
|
+
| ruby-1.9.3 | 863 seconds | 8.1 GB |
|
136
|
+
| ruby-2.0.0 | 862 seconds | 9.0 GB |
|
137
|
+
|jruby-1.7.4 | 345 seconds | 10.8 GB |
|
138
|
+
|
139
|
+
|
121
140
|
[RDF]: http://www.w3.org/RDF/
|
122
141
|
[Rationale]: http://github.com/petervandenabeele/dbd/blob/master/docs/rationale.md
|
123
142
|
[MIT]: https://github.com/petervandenabeele/dbd/blob/master/LICENSE.txt
|
data/bin/test_1.rb
CHANGED
data/bin/test_3.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# encoding=us-ascii
|
2
|
+
|
3
|
+
# this is a test program for an exception that is thrown in JRuby
|
4
|
+
# see http://markmail.org/message/e2ote7rkwht2quel?q=list:org.codehaus.jruby.user
|
5
|
+
|
6
|
+
#row = "A" * 300 # does NOT fail with this value of `row`
|
7
|
+
row = "A" * 301
|
8
|
+
count = 5_000_000
|
9
|
+
|
10
|
+
csv_string = row * count
|
11
|
+
encoded_string = csv_string.encode("utf-8")
|
data/bin/test_4.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This is a test program for an issue with CSV.generate
|
4
|
+
# in ruby-2.0.0 and ruby-head, see http://bugs.ruby-lang.org/issues/8585
|
5
|
+
|
6
|
+
count = ARGV[0].to_i
|
7
|
+
unless count > 0
|
8
|
+
puts "Give a 'count' as first argument."
|
9
|
+
exit(1)
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'csv'
|
13
|
+
|
14
|
+
row_data = [
|
15
|
+
"59ffbb3b-1e48-4c1f-81d8-d93afc84c966",
|
16
|
+
"2013-06-28 19:14:55.975000806 UTC",
|
17
|
+
"a11f290e-c441-41bc-8b8c-4e6c27b1b6fc",
|
18
|
+
"c73e6241-d46f-4952-8377-c11372346d15",
|
19
|
+
"test",
|
20
|
+
"BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"]
|
21
|
+
|
22
|
+
puts "starting CSV.open"
|
23
|
+
|
24
|
+
start_time = Time.now
|
25
|
+
|
26
|
+
csv_string = CSV.generate(force_quotes: true) do |csv|
|
27
|
+
count.times do
|
28
|
+
csv << row_data
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
puts "CSV.open took #{Time.now - start_time} seconds"
|
data/bin/test_5.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This implementation now streams to disk.
|
4
|
+
|
5
|
+
FACTS_PER_RESOURCE = 1000
|
6
|
+
|
7
|
+
count = ARGV[0].to_i
|
8
|
+
unless count > 0
|
9
|
+
puts "Give a 'count' as first argument."
|
10
|
+
exit(1)
|
11
|
+
end
|
12
|
+
|
13
|
+
filename = ARGV[1]
|
14
|
+
unless filename
|
15
|
+
puts "Give a 'filename' as second argument."
|
16
|
+
exit(1)
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'dbd'
|
20
|
+
|
21
|
+
start = Time.now
|
22
|
+
|
23
|
+
graph = Dbd::Graph.new
|
24
|
+
|
25
|
+
(0...count).each do |i|
|
26
|
+
provenance_resource = Dbd::ProvenanceResource.new
|
27
|
+
provenance_resource << Dbd::ProvenanceFact.new(predicate: "prov:test" , object: "A" * 10)
|
28
|
+
|
29
|
+
resource = Dbd::Resource.new(provenance_subject: provenance_resource.subject)
|
30
|
+
(0...FACTS_PER_RESOURCE).each do |j|
|
31
|
+
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 80} #{i * FACTS_PER_RESOURCE + j}")
|
32
|
+
end
|
33
|
+
|
34
|
+
graph << provenance_resource << resource
|
35
|
+
puts ("added resource #{i} to the graph")
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
|
39
|
+
|
40
|
+
graph.to_CSV_file(filename)
|
data/bin/test_6.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This implementation streams from disk
|
4
|
+
|
5
|
+
filename = ARGV[0]
|
6
|
+
unless filename
|
7
|
+
puts "Give a 'filename' as argument."
|
8
|
+
exit(1)
|
9
|
+
end
|
10
|
+
|
11
|
+
require 'dbd'
|
12
|
+
|
13
|
+
start = Time.now
|
14
|
+
|
15
|
+
graph = File.open(filename) do |f|
|
16
|
+
Dbd::Graph.from_CSV(f)
|
17
|
+
end
|
18
|
+
|
19
|
+
puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
|
20
|
+
|
21
|
+
puts "graph.size is #{graph.size}"
|
data/data/.gitkeep
ADDED
File without changes
|
data/lib/dbd/fact.rb
CHANGED
@@ -156,7 +156,7 @@ module Dbd
|
|
156
156
|
# Constructs a Fact or ProvenanceFact from a string values array
|
157
157
|
# (e.g. pulled from a CSV row).
|
158
158
|
#
|
159
|
-
# @param [Array]
|
159
|
+
# @param [Array] string_values Required : the array with values, organized as in attributes
|
160
160
|
# @return [Fact, ProvenanceFact] the constructed fact
|
161
161
|
def self.from_string_values(string_values)
|
162
162
|
string_hash = hash_from_values(string_values)
|
data/lib/dbd/graph.rb
CHANGED
@@ -32,17 +32,25 @@ module Dbd
|
|
32
32
|
#
|
33
33
|
# @return [String] comma separated string with double quoted cells
|
34
34
|
def to_CSV
|
35
|
-
CSV.generate(
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
CSV.generate(csv_defaults) do |csv|
|
36
|
+
push_facts(csv)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
##
|
41
|
+
# Export the graph to a CSV file
|
42
|
+
#
|
43
|
+
# @param [String] filename the filename to stream the CSV to
|
44
|
+
def to_CSV_file(filename)
|
45
|
+
CSV.open(filename, 'w', csv_defaults) do |csv|
|
46
|
+
push_facts(csv)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
|
42
50
|
##
|
43
|
-
# Import a graph from a CSV
|
51
|
+
# Import a graph from a CSV IO stream
|
44
52
|
#
|
45
|
-
# @param [
|
53
|
+
# @param [IO Stream] csv an IO Stream that contains the CSV serialization
|
46
54
|
# @return [Graph] the imported graph
|
47
55
|
def self.from_CSV(csv)
|
48
56
|
new.tap do |graph|
|
@@ -63,5 +71,16 @@ module Dbd
|
|
63
71
|
fact.time_stamp = TimeStamp.new(larger_than: newest_time_stamp) unless fact.time_stamp
|
64
72
|
end
|
65
73
|
|
74
|
+
def csv_defaults
|
75
|
+
{force_quotes: true,
|
76
|
+
encoding: 'utf-8'}
|
77
|
+
end
|
78
|
+
|
79
|
+
def push_facts(target)
|
80
|
+
@internal_collection.each do |fact|
|
81
|
+
target << fact.values
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
66
85
|
end
|
67
86
|
end
|
data/lib/dbd/helpers/uuid.rb
CHANGED
@@ -18,14 +18,14 @@ module Dbd
|
|
18
18
|
# Store a SecureRandom.uuid.
|
19
19
|
# @return [void]
|
20
20
|
def initialize
|
21
|
-
@uuid = SecureRandom.uuid
|
21
|
+
@uuid = SecureRandom.uuid.encode('utf-8')
|
22
22
|
end
|
23
23
|
|
24
24
|
##
|
25
25
|
# The to_s of the uuid.
|
26
26
|
# @return [String]
|
27
27
|
def to_s
|
28
|
-
@uuid
|
28
|
+
@uuid
|
29
29
|
end
|
30
30
|
|
31
31
|
end
|
data/lib/dbd/version.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding=utf-8
|
1
2
|
require 'spec_helper'
|
2
3
|
|
3
4
|
module Dbd
|
@@ -10,6 +11,7 @@ module Dbd
|
|
10
11
|
let(:provenance_facts) { Factories::Fact::Collection.provenance_facts(new_subject) }
|
11
12
|
let(:provenance_fact_1) { provenance_facts.first }
|
12
13
|
let(:fact_2_3) { Factories::Fact::Collection.fact_2_3(provenance_fact_1.subject) }
|
14
|
+
let(:fact_special_characters) { Factories::Fact::fact_with_special_chars(provenance_fact_1.subject, new_subject) }
|
13
15
|
|
14
16
|
let(:subject_regexp) { Fact::Subject.regexp }
|
15
17
|
let(:id_regexp) { Fact::ID.regexp }
|
@@ -157,5 +159,34 @@ module Dbd
|
|
157
159
|
subject.to_CSV.lines.count.should == 6
|
158
160
|
end
|
159
161
|
end
|
162
|
+
|
163
|
+
describe "#to_CSV_file" do
|
164
|
+
|
165
|
+
before do
|
166
|
+
provenance_facts.each do |provenance_fact|
|
167
|
+
subject << provenance_fact
|
168
|
+
end
|
169
|
+
fact_2_3.each do |fact|
|
170
|
+
subject << fact
|
171
|
+
end
|
172
|
+
subject << fact_special_characters
|
173
|
+
end
|
174
|
+
|
175
|
+
it "has eight lines" do
|
176
|
+
filename = 'data/foo.csv'
|
177
|
+
subject.to_CSV_file(filename)
|
178
|
+
File.open(filename) do |f|
|
179
|
+
f.readlines.count.should == 8
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
it "reads back UTF-8 characters correctly" do
|
184
|
+
filename = 'data/foo.csv'
|
185
|
+
subject.to_CSV_file(filename)
|
186
|
+
File.open(filename) do |f|
|
187
|
+
f.readlines.detect{|l| l.match(%r{really with a comma, a double quote "" and a non-ASCII char éà Über.})}.should_not be_nil
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
160
191
|
end
|
161
192
|
end
|
@@ -10,6 +10,10 @@ module Dbd
|
|
10
10
|
it ".new creates a new random UUID" do
|
11
11
|
described_class.new.to_s.should match(UUID.regexp)
|
12
12
|
end
|
13
|
+
|
14
|
+
it ".new creates a new random UUID with UTF-8 encoding" do
|
15
|
+
described_class.new.to_s.encoding.should == Encoding::UTF_8
|
16
|
+
end
|
13
17
|
end
|
14
18
|
end
|
15
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dbd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Vandenabeele
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -141,6 +141,10 @@ email:
|
|
141
141
|
- peter@vandenabeele.com
|
142
142
|
executables:
|
143
143
|
- test_1.rb
|
144
|
+
- test_3.rb
|
145
|
+
- test_4.rb
|
146
|
+
- test_5.rb
|
147
|
+
- test_6.rb
|
144
148
|
extensions: []
|
145
149
|
extra_rdoc_files: []
|
146
150
|
files:
|
@@ -155,6 +159,11 @@ files:
|
|
155
159
|
- README.md
|
156
160
|
- Rakefile
|
157
161
|
- bin/test_1.rb
|
162
|
+
- bin/test_3.rb
|
163
|
+
- bin/test_4.rb
|
164
|
+
- bin/test_5.rb
|
165
|
+
- bin/test_6.rb
|
166
|
+
- data/.gitkeep
|
158
167
|
- dbd.gemspec
|
159
168
|
- docs/rationale.md
|
160
169
|
- docs/stories/001_create_a_fact.txt
|