dbd 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +3 -1
- data/HISTORY.txt +7 -0
- data/README.md +20 -1
- data/bin/test_1.rb +1 -1
- data/bin/test_3.rb +11 -0
- data/bin/test_4.rb +32 -0
- data/bin/test_5.rb +40 -0
- data/bin/test_6.rb +21 -0
- data/data/.gitkeep +0 -0
- data/lib/dbd/fact.rb +1 -1
- data/lib/dbd/graph.rb +26 -7
- data/lib/dbd/helpers/uuid.rb +2 -2
- data/lib/dbd/version.rb +1 -1
- data/spec/factories/provenance_fact.rb +1 -0
- data/spec/lib/dbd/graph/to_csv_spec.rb +31 -0
- data/spec/lib/dbd/helpers/uuid/uuid_spec.rb +4 -0
- metadata +11 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b21e77e8316f18a011e2356b79a28a35e5dde7d
|
4
|
+
data.tar.gz: d8a0adeebbc7311a512ef08bfbc63902e16f2159
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c8e9c42c082ad4bdc8c6af283c9a73fad3c35a75e5ebace47aefbbe127cb0ee001ef066d17ef6e9b253dcd2a6311ccbb520810bdb061ab233c77950be7a99542
|
7
|
+
data.tar.gz: a63bb4462ebdc8fa4a6c2e0f3a1513146f7aac1e85a9d748c3126f5352464a613ee9bf956e27e0fad06edc4816e7226e78a37a1c5218196e6ea0f4ccf13db5e5
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/HISTORY.txt
CHANGED
@@ -43,3 +43,10 @@
|
|
43
43
|
|
44
44
|
* bin/test_1.rb was used for first successful writing of 10M facts using
|
45
45
|
ruby-1.9.3-p429 (will report issues on ruby-2.0.0-p195 and jruby-1.7.4)
|
46
|
+
|
47
|
+
0.0.9 (30 June 2013)
|
48
|
+
=====
|
49
|
+
|
50
|
+
* new function graph#to_CSV_file
|
51
|
+
* bin/test_5.rb was used to write 10M facts using ruby-2.0.0, 1.9.3 and jruby-1.7.4
|
52
|
+
* jruby is 3 times faster, but has 10% more memory consumption
|
data/README.md
CHANGED
@@ -44,7 +44,7 @@ Open Source [MIT]
|
|
44
44
|
|
45
45
|
## Installation
|
46
46
|
|
47
|
-
$ gem install dbd # Ruby 1.9.
|
47
|
+
$ gem install dbd # Ruby 1.9.3, 2.0.x, jruby (see .travis.yml)
|
48
48
|
|
49
49
|
## Examples
|
50
50
|
|
@@ -118,6 +118,25 @@ puts imported_graph.map(&:short)
|
|
118
118
|
# 5eb1ea27 : 3767c493 : todo:story : A long period of peace,_ that is a "bliss".
|
119
119
|
```
|
120
120
|
|
121
|
+
## Performance tests on 10 M facts
|
122
|
+
|
123
|
+
In version 0.0.9 a number of test programs were added (e.g. ../bin/test_5.rb)
|
124
|
+
that were used to populate in memory and write to disk a data set with 10 M facts.
|
125
|
+
|
126
|
+
This function was tested on ruby-2.0.0, ruby-1.9.3 and jruby-1.7.4. The facts
|
127
|
+
had an approximate size of 250 Bytes each (80 Bytes object).
|
128
|
+
|
129
|
+
The time needed and memory size (RSS) for populating the in-memory dataset was:
|
130
|
+
|
131
|
+
10 M facts (of 250 Bytes; 2.5 GB net data):
|
132
|
+
|
133
|
+
| ruby | time | memory (RSS) |
|
134
|
+
|------------|-------------| ------------:|
|
135
|
+
| ruby-1.9.3 | 863 seconds | 8.1 GB |
|
136
|
+
| ruby-2.0.0 | 862 seconds | 9.0 GB |
|
137
|
+
|jruby-1.7.4 | 345 seconds | 10.8 GB |
|
138
|
+
|
139
|
+
|
121
140
|
[RDF]: http://www.w3.org/RDF/
|
122
141
|
[Rationale]: http://github.com/petervandenabeele/dbd/blob/master/docs/rationale.md
|
123
142
|
[MIT]: https://github.com/petervandenabeele/dbd/blob/master/LICENSE.txt
|
data/bin/test_1.rb
CHANGED
data/bin/test_3.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# encoding=us-ascii
|
2
|
+
|
3
|
+
# this is a test program for an exception that is thrown in JRuby
|
4
|
+
# see http://markmail.org/message/e2ote7rkwht2quel?q=list:org.codehaus.jruby.user
|
5
|
+
|
6
|
+
#row = "A" * 300 # does NOT fail with this value of `row`
|
7
|
+
row = "A" * 301
|
8
|
+
count = 5_000_000
|
9
|
+
|
10
|
+
csv_string = row * count
|
11
|
+
encoded_string = csv_string.encode("utf-8")
|
data/bin/test_4.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This is a test program for an issue with CSV.generate
|
4
|
+
# in ruby-2.0.0 and ruby-head, see http://bugs.ruby-lang.org/issues/8585
|
5
|
+
|
6
|
+
count = ARGV[0].to_i
|
7
|
+
unless count > 0
|
8
|
+
puts "Give a 'count' as first argument."
|
9
|
+
exit(1)
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'csv'
|
13
|
+
|
14
|
+
row_data = [
|
15
|
+
"59ffbb3b-1e48-4c1f-81d8-d93afc84c966",
|
16
|
+
"2013-06-28 19:14:55.975000806 UTC",
|
17
|
+
"a11f290e-c441-41bc-8b8c-4e6c27b1b6fc",
|
18
|
+
"c73e6241-d46f-4952-8377-c11372346d15",
|
19
|
+
"test",
|
20
|
+
"BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"]
|
21
|
+
|
22
|
+
puts "starting CSV.open"
|
23
|
+
|
24
|
+
start_time = Time.now
|
25
|
+
|
26
|
+
csv_string = CSV.generate(force_quotes: true) do |csv|
|
27
|
+
count.times do
|
28
|
+
csv << row_data
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
puts "CSV.open took #{Time.now - start_time} seconds"
|
data/bin/test_5.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This implementation now streams to disk.
|
4
|
+
|
5
|
+
FACTS_PER_RESOURCE = 1000
|
6
|
+
|
7
|
+
count = ARGV[0].to_i
|
8
|
+
unless count > 0
|
9
|
+
puts "Give a 'count' as first argument."
|
10
|
+
exit(1)
|
11
|
+
end
|
12
|
+
|
13
|
+
filename = ARGV[1]
|
14
|
+
unless filename
|
15
|
+
puts "Give a 'filename' as second argument."
|
16
|
+
exit(1)
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'dbd'
|
20
|
+
|
21
|
+
start = Time.now
|
22
|
+
|
23
|
+
graph = Dbd::Graph.new
|
24
|
+
|
25
|
+
(0...count).each do |i|
|
26
|
+
provenance_resource = Dbd::ProvenanceResource.new
|
27
|
+
provenance_resource << Dbd::ProvenanceFact.new(predicate: "prov:test" , object: "A" * 10)
|
28
|
+
|
29
|
+
resource = Dbd::Resource.new(provenance_subject: provenance_resource.subject)
|
30
|
+
(0...FACTS_PER_RESOURCE).each do |j|
|
31
|
+
resource << Dbd::Fact.new(predicate: "test", object: "#{'B' * 80} #{i * FACTS_PER_RESOURCE + j}")
|
32
|
+
end
|
33
|
+
|
34
|
+
graph << provenance_resource << resource
|
35
|
+
puts ("added resource #{i} to the graph")
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
|
39
|
+
|
40
|
+
graph.to_CSV_file(filename)
|
data/bin/test_6.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This implementation streams from disk
|
4
|
+
|
5
|
+
filename = ARGV[0]
|
6
|
+
unless filename
|
7
|
+
puts "Give a 'filename' as argument."
|
8
|
+
exit(1)
|
9
|
+
end
|
10
|
+
|
11
|
+
require 'dbd'
|
12
|
+
|
13
|
+
start = Time.now
|
14
|
+
|
15
|
+
graph = File.open(filename) do |f|
|
16
|
+
Dbd::Graph.from_CSV(f)
|
17
|
+
end
|
18
|
+
|
19
|
+
puts "Graph is ready (took #{Time.now - start}s), now starting the write to disk"
|
20
|
+
|
21
|
+
puts "graph.size is #{graph.size}"
|
data/data/.gitkeep
ADDED
File without changes
|
data/lib/dbd/fact.rb
CHANGED
@@ -156,7 +156,7 @@ module Dbd
|
|
156
156
|
# Constructs a Fact or ProvenanceFact from a string values array
|
157
157
|
# (e.g. pulled from a CSV row).
|
158
158
|
#
|
159
|
-
# @param [Array]
|
159
|
+
# @param [Array] string_values Required : the array with values, organized as in attributes
|
160
160
|
# @return [Fact, ProvenanceFact] the constructed fact
|
161
161
|
def self.from_string_values(string_values)
|
162
162
|
string_hash = hash_from_values(string_values)
|
data/lib/dbd/graph.rb
CHANGED
@@ -32,17 +32,25 @@ module Dbd
|
|
32
32
|
#
|
33
33
|
# @return [String] comma separated string with double quoted cells
|
34
34
|
def to_CSV
|
35
|
-
CSV.generate(
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
CSV.generate(csv_defaults) do |csv|
|
36
|
+
push_facts(csv)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
##
|
41
|
+
# Export the graph to a CSV file
|
42
|
+
#
|
43
|
+
# @param [String] filename the filename to stream the CSV to
|
44
|
+
def to_CSV_file(filename)
|
45
|
+
CSV.open(filename, 'w', csv_defaults) do |csv|
|
46
|
+
push_facts(csv)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
|
42
50
|
##
|
43
|
-
# Import a graph from a CSV
|
51
|
+
# Import a graph from a CSV IO stream
|
44
52
|
#
|
45
|
-
# @param [
|
53
|
+
# @param [IO] csv an IO stream that contains the CSV serialization
|
46
54
|
# @return [Graph] the imported graph
|
47
55
|
def self.from_CSV(csv)
|
48
56
|
new.tap do |graph|
|
@@ -63,5 +71,16 @@ module Dbd
|
|
63
71
|
fact.time_stamp = TimeStamp.new(larger_than: newest_time_stamp) unless fact.time_stamp
|
64
72
|
end
|
65
73
|
|
74
|
+
def csv_defaults
|
75
|
+
{force_quotes: true,
|
76
|
+
encoding: 'utf-8'}
|
77
|
+
end
|
78
|
+
|
79
|
+
def push_facts(target)
|
80
|
+
@internal_collection.each do |fact|
|
81
|
+
target << fact.values
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
66
85
|
end
|
67
86
|
end
|
data/lib/dbd/helpers/uuid.rb
CHANGED
@@ -18,14 +18,14 @@ module Dbd
|
|
18
18
|
# Store a SecureRandom.uuid.
|
19
19
|
# @return [void]
|
20
20
|
def initialize
|
21
|
-
@uuid = SecureRandom.uuid
|
21
|
+
@uuid = SecureRandom.uuid.encode('utf-8')
|
22
22
|
end
|
23
23
|
|
24
24
|
##
|
25
25
|
# The to_s of the uuid.
|
26
26
|
# @return [String]
|
27
27
|
def to_s
|
28
|
-
@uuid
|
28
|
+
@uuid
|
29
29
|
end
|
30
30
|
|
31
31
|
end
|
data/lib/dbd/version.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding=utf-8
|
1
2
|
require 'spec_helper'
|
2
3
|
|
3
4
|
module Dbd
|
@@ -10,6 +11,7 @@ module Dbd
|
|
10
11
|
let(:provenance_facts) { Factories::Fact::Collection.provenance_facts(new_subject) }
|
11
12
|
let(:provenance_fact_1) { provenance_facts.first }
|
12
13
|
let(:fact_2_3) { Factories::Fact::Collection.fact_2_3(provenance_fact_1.subject) }
|
14
|
+
let(:fact_special_characters) { Factories::Fact::fact_with_special_chars(provenance_fact_1.subject, new_subject) }
|
13
15
|
|
14
16
|
let(:subject_regexp) { Fact::Subject.regexp }
|
15
17
|
let(:id_regexp) { Fact::ID.regexp }
|
@@ -157,5 +159,34 @@ module Dbd
|
|
157
159
|
subject.to_CSV.lines.count.should == 6
|
158
160
|
end
|
159
161
|
end
|
162
|
+
|
163
|
+
describe "#to_CSV_file" do
|
164
|
+
|
165
|
+
before do
|
166
|
+
provenance_facts.each do |provenance_fact|
|
167
|
+
subject << provenance_fact
|
168
|
+
end
|
169
|
+
fact_2_3.each do |fact|
|
170
|
+
subject << fact
|
171
|
+
end
|
172
|
+
subject << fact_special_characters
|
173
|
+
end
|
174
|
+
|
175
|
+
it "has eight lines" do
|
176
|
+
filename = 'data/foo.csv'
|
177
|
+
subject.to_CSV_file(filename)
|
178
|
+
File.open(filename) do |f|
|
179
|
+
f.readlines.count.should == 8
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
it "reads back UTF-8 characters correctly" do
|
184
|
+
filename = 'data/foo.csv'
|
185
|
+
subject.to_CSV_file(filename)
|
186
|
+
File.open(filename) do |f|
|
187
|
+
f.readlines.detect{|l| l.match(%r{really with a comma, a double quote "" and a non-ASCII char éà Über.})}.should_not be_nil
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
160
191
|
end
|
161
192
|
end
|
@@ -10,6 +10,10 @@ module Dbd
|
|
10
10
|
it ".new creates a new random UUID" do
|
11
11
|
described_class.new.to_s.should match(UUID.regexp)
|
12
12
|
end
|
13
|
+
|
14
|
+
it ".new creates a new random UUID with UTF-8 encoding" do
|
15
|
+
described_class.new.to_s.encoding.should == Encoding::UTF_8
|
16
|
+
end
|
13
17
|
end
|
14
18
|
end
|
15
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dbd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Vandenabeele
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -141,6 +141,10 @@ email:
|
|
141
141
|
- peter@vandenabeele.com
|
142
142
|
executables:
|
143
143
|
- test_1.rb
|
144
|
+
- test_3.rb
|
145
|
+
- test_4.rb
|
146
|
+
- test_5.rb
|
147
|
+
- test_6.rb
|
144
148
|
extensions: []
|
145
149
|
extra_rdoc_files: []
|
146
150
|
files:
|
@@ -155,6 +159,11 @@ files:
|
|
155
159
|
- README.md
|
156
160
|
- Rakefile
|
157
161
|
- bin/test_1.rb
|
162
|
+
- bin/test_3.rb
|
163
|
+
- bin/test_4.rb
|
164
|
+
- bin/test_5.rb
|
165
|
+
- bin/test_6.rb
|
166
|
+
- data/.gitkeep
|
158
167
|
- dbd.gemspec
|
159
168
|
- docs/rationale.md
|
160
169
|
- docs/stories/001_create_a_fact.txt
|