dbd 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/.travis.yml +10 -0
- data/Gemfile +8 -0
- data/Guardfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +1 -0
- data/dbd.gemspec +30 -0
- data/docs/rationale.md +17 -0
- data/docs/stories/001_create_a_fact.txt +15 -0
- data/docs/stories/002_create_a_facts_collection.txt +14 -0
- data/docs/stories/003_create_a_fact_origin.txt +15 -0
- data/docs/stories/004_create_fact_origins_collection.txt +8 -0
- data/docs/stories/005_CSV_export_the_graph.txt +18 -0
- data/docs/stories/006_refactor_fact_origin_to_provenance_fact.txt +20 -0
- data/docs/stories/007_rename_property_to_predicate.txt +6 -0
- data/docs/stories/008_testing_different_ruby_versions.txt +7 -0
- data/docs/stories/009_build_and_store_resources_with_provenance.txt +38 -0
- data/docs/stories/010_provenance_fact_properties_from_provenance_ontology.txt +10 -0
- data/docs/test.rb +32 -0
- data/lib/dbd.rb +13 -0
- data/lib/dbd/errors.rb +11 -0
- data/lib/dbd/fact.rb +182 -0
- data/lib/dbd/fact/collection.rb +60 -0
- data/lib/dbd/fact/id.rb +19 -0
- data/lib/dbd/fact/subject.rb +21 -0
- data/lib/dbd/graph.rb +47 -0
- data/lib/dbd/helpers/ordered_set_collection.rb +86 -0
- data/lib/dbd/helpers/uuid.rb +33 -0
- data/lib/dbd/provenance_fact.rb +76 -0
- data/lib/dbd/provenance_resource.rb +54 -0
- data/lib/dbd/rdf.rb +9 -0
- data/lib/dbd/repo.rb +8 -0
- data/lib/dbd/repo/neo4j_repo.rb +4 -0
- data/lib/dbd/repo/neo4j_repo/base.rb +55 -0
- data/lib/dbd/resource.rb +117 -0
- data/lib/dbd/version.rb +3 -0
- data/spec/factories/fact.rb +76 -0
- data/spec/factories/provenance_fact.rb +34 -0
- data/spec/factories/provenance_resource.rb +16 -0
- data/spec/factories/resource.rb +17 -0
- data/spec/lib/dbd/fact/collection_spec.rb +236 -0
- data/spec/lib/dbd/fact/id_spec.rb +19 -0
- data/spec/lib/dbd/fact/subject_spec.rb +19 -0
- data/spec/lib/dbd/fact_spec.rb +217 -0
- data/spec/lib/dbd/graph_spec.rb +214 -0
- data/spec/lib/dbd/helpers/ordered_set_collection_spec.rb +88 -0
- data/spec/lib/dbd/helpers/uuid_spec.rb +15 -0
- data/spec/lib/dbd/provenance_fact_spec.rb +108 -0
- data/spec/lib/dbd/provenance_resource_spec.rb +77 -0
- data/spec/lib/dbd/rdf_base_spec.rb +39 -0
- data/spec/lib/dbd/repo/neo4j_repo/base_spec.rb +85 -0
- data/spec/lib/dbd/repo/neo4j_repo/performance_spec.rb +40 -0
- data/spec/lib/dbd/resource_spec.rb +166 -0
- data/spec/spec_helper.rb +19 -0
- metadata +272 -0
data/lib/dbd/errors.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Dbd

  # Raised by Fact::Collection#<< when a fact does not fit the required
  # order of the fact stream.
  class OutOfOrderError < StandardError
  end

  # Raised by Fact::Collection#<< when the added fact is not valid.
  class FactError < StandardError
  end

  # Error class for problems related to provenance.
  class ProvenanceError < StandardError
  end

  # Error class for problems related to the subject of a fact.
  class SubjectError < StandardError
  end

  # Raised by Fact#initialize when the predicate is nil.
  class PredicateError < StandardError
  end

  # Raised by Fact#initialize when the object is nil.
  class ObjectError < StandardError
  end

end
|
data/lib/dbd/fact.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'dbd/fact/collection'
|
2
|
+
require 'dbd/fact/subject'
|
3
|
+
require 'dbd/fact/id'
|
4
|
+
|
5
|
+
module Dbd

  ##
  # Basic Fact of knowledge.
  #
  # The database is built as an ordered sequence of facts, the "fact stream".
  #
  # This is somewhat similar to a "triple" in the RDF (Resource Description
  # Framework) concept, but with different and extended functionality.
  #
  # Each basic fact has:
  # * a unique and invariant *id* (a uuid)
  #
  #   To allow referencing back to it (e.g. to invalidate it later in the fact stream).
  #
  # * a *time_stamp* (time with nanosecond granularity)
  #
  #   To allow verifying that the order in the fact stream is correct.
  #
  #   A time_stamp does not need to represent the exact time of the
  #   creation of the fact, but it has to increase in strictly monotonic
  #   order in the fact stream.
  #
  # * a *provenance_subject* (a uuid)
  #
  #   The subject of the ProvenanceResource (a set of ProvenanceFacts with
  #   the same subject) about this fact. Each Fact points *back* to a
  #   ProvenanceResource (the ProvenanceResource must have been fully
  #   defined, earlier in the fact stream).
  #
  # * a *subject* (a uuid)
  #
  #   "About which Resource is this fact?".
  #
  #   Similar to the subject of an RDF triple, except that this subject is not
  #   a URI, but an abstract uuid (that is world-wide unique and invariant).
  #
  #   Links to "real-world" URI's and URL's can be added later as separate facts
  #   (this also allows linking multiple "real-world" URI's to a single Resource).
  #
  # * a *predicate* (a string)
  #
  #   "Which property of the resource are we describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to predicate
  #   in RDF. Probably more detailed modeling using RDF predicate will follow.
  #
  # * an *object* (a string)
  #
  #   "What is the value of the property of the resource we are describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to object
  #   in RDF. Probably more detailed modeling using RDF object will follow.
  class Fact

    ##
    # @return [Array] The 6 attributes of a Fact.
    def self.attributes
      [:id,
       :time_stamp,
       :provenance_subject,
       :subject,
       :predicate,
       :object]
    end

    # One reader per attribute; there are no generated writers.
    attributes.each do |attribute|
      attr_reader attribute
    end

    ##
    # Sets the time_stamp. It can only be set once: a second assignment
    # raises, so a stored time_stamp is never silently replaced.
    #
    # @param [Object] time_stamp the time stamp to store
    # @raise [RuntimeError] if a time_stamp was already set
    def time_stamp=(time_stamp)
      raise RuntimeError, "time_stamp is already set and cannot be changed" if @time_stamp
      @time_stamp = time_stamp
    end

    ##
    # @return [Subject] A new random subject.
    def self.new_subject
      Subject.new
    end

    ##
    # @return [ID] A new random id.
    def self.new_id
      ID.new
    end

    ##
    # Builds a new Fact.
    #
    # The id is generated here; the time_stamp is left unset.
    #
    # @param [Hash{Symbol => Object}] options
    # @option options [#to_s] :predicate Required: the predicate for this Fact
    # @option options [#to_s] :object Required: the object for this Fact
    # @option options [Subject] :provenance_subject (nil) Optional: the subject of the provenance(resource|fact)
    # @option options [Subject] :subject (nil) Optional: the subject for this Fact
    # @raise [PredicateError] if the predicate is nil
    # @raise [ObjectError] if the object is nil
    def initialize(options)
      @id = self.class.new_id
      @provenance_subject = options[:provenance_subject]
      @subject = options[:subject]
      @predicate = options[:predicate]
      @object = options[:object]
      raise PredicateError, "predicate cannot be nil" if predicate.nil?
      raise ObjectError, "object cannot be nil" if object.nil?
    end

    ##
    # @return [Array] The 6 values of a Fact, in the order of Fact.attributes.
    def values
      self.class.attributes.map{ |attribute| self.send(attribute) }
    end

    ##
    # Executes the required update in used_provenance_subjects.
    #
    # For a Fact, pointing to a ProvenanceResource in its provenance_subject,
    # marks this provenance_subject in the "used_provenance_subjects" hash that
    # is passed in as an argument (DCI). This will avoid further changes to the
    # ProvenanceResource with this provenance_subject.
    #
    # This is overridden in the ProvenanceFact, since only relevant for a Fact.
    #
    # @param [Hash] h the hash of used provenance_subjects (modified in place)
    def update_used_provenance_subjects(h)
      # using a provenance_subject sets the key
      h[provenance_subject] = true
    end

    ##
    # Checks if a fact is valid for storing in the graph.
    #
    # The result is truthy or falsy, not a strict Boolean: when valid,
    # the subject itself is returned.
    #
    # @return [#true?] not nil if valid
    def valid?
      # id not validated, is set automatically
      # predicate not validated, is validated in initialize
      # object not validated, is validated in initialize
      provenance_subject_valid?(provenance_subject) &&
        subject
    end

    ##
    # Validates the presence or absence of provenance_subject.
    #
    # Here, in (base) Fact, provenance_subject must be present.
    # In the derived ProvenanceFact it must not be present.
    # This is how the difference is encoded between Fact and
    # ProvenanceFact in the fact stream.
    #
    # @param [#nil?] provenance_subject
    # @return [Object] the provenance_subject itself (truthy when present)
    def provenance_subject_valid?(provenance_subject)
      provenance_subject
    end

    ##
    # Builds duplicate with the subject set.
    #
    # The duplicate is built with self.class.new, so it gets a new id
    # and no time_stamp.
    #
    # @param [Subject] subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_subject(subject_arg)
      self.class.new(
        provenance_subject: provenance_subject,
        subject: subject_arg, # from arg
        predicate: predicate,
        object: object)
    end

    ##
    # Builds duplicate with the provenance_subject set.
    #
    # The duplicate is built with self.class.new, so it gets a new id
    # and no time_stamp.
    #
    # @param [Subject] provenance_subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_provenance_subject(provenance_subject_arg)
      self.class.new(
        provenance_subject: provenance_subject_arg, # from arg
        subject: subject,
        predicate: predicate,
        object: object)
    end

  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'dbd/helpers/ordered_set_collection'
|
2
|
+
|
3
|
+
module Dbd
  class Fact

    ##
    # Mixin that turns the including class into an ordered collection of
    # facts, with a lookup by subject and a record of the provenance
    # subjects that are already in use.
    module Collection

      include Helpers::OrderedSetCollection

      ##
      # Sets up the index by subject and the hash of used
      # provenance_subjects, after the underlying collection was created.
      def initialize
        super
        # subject => array of indexes into @internal_collection
        @hash_by_subject = Hash.new { |h, k| h[k] = [] }
        @used_provenance_subjects = {}
      end

      ##
      # @return [Object, nil] the time_stamp of the last fact in the
      #   collection, nil when the collection is empty
      def newest_time_stamp
        newest_entry = @internal_collection.last
        newest_entry && newest_entry.time_stamp
      end

      ##
      # @return [Object, nil] the time_stamp of the first fact in the
      #   collection, nil when the collection is empty
      def oldest_time_stamp
        oldest_entry = @internal_collection.first
        oldest_entry && oldest_entry.time_stamp
      end

      ##
      # This is the central method of Fact::Collection module
      #
      # @param [Fact] fact the fact that is added to the collection
      #
      # @return [self] for chaining
      #
      # @raise [FactError] if the fact is not valid
      # @raise [OutOfOrderError] if the time_stamp of the fact is not newer
      #   than the newest time_stamp in the collection, or if the subject
      #   of the fact was already used as a provenance_subject
      #
      # Validates that added fact is valid.
      #
      # Validates that added fact is newer.
      #
      # Validates that subject was never used as provenance_subject [A].
      #
      # Adds the fact and return the index in the collection.
      #
      # Store this index in the hash_by_subject.
      #
      # Mark the fact in the list of used provenance_subjects (for [A]).
      def <<(fact)
        raise FactError, "the fact is not valid for storing in the collection" unless fact.valid?
        newest = newest_time_stamp
        if newest && fact.time_stamp <= newest
          raise OutOfOrderError,
                "time_stamp #{fact.time_stamp.inspect} is not newer than " \
                "the newest time_stamp #{newest.inspect}"
        end
        if @used_provenance_subjects[fact.subject]
          raise OutOfOrderError,
                "subject #{fact.subject} was already used as a provenance_subject"
        end
        index = Helpers::OrderedSetCollection.add_and_return_index(fact, @internal_collection)
        @hash_by_subject[fact.subject] << index
        fact.update_used_provenance_subjects(@used_provenance_subjects)
        self
      end

      ##
      # Finds the facts that were added with the given subject.
      #
      # @param [Object] fact_subject the subject to look up
      # @return [Array] the facts with that subject, in insertion order;
      #   empty when the subject is unknown
      def by_subject(fact_subject)
        # fetch bypasses the default block of the hash, so looking up an
        # unknown subject does not insert an empty entry into the index
        @hash_by_subject.fetch(fact_subject, []).map{ |index| @internal_collection[index] }
      end

    end
  end
end
|
data/lib/dbd/fact/id.rb
ADDED
data/lib/dbd/graph.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Dbd

  ##
  # In-memory store for Facts and ProvenanceFacts, built on top of
  # Fact::Collection.
  class Graph

    include Fact::Collection

    ##
    # Assigns a time_stamp to the fact and then adds it through
    # Fact::Collection#<<.
    #
    # @param [Fact] fact the fact to add
    def <<(fact)
      enforce_strictly_monotonic_time(fact)
      super
    end

    ##
    # Export the graph to a CSV string
    #
    # Each fact becomes one row, built from its values.
    #
    # @return [String] comma separated string with double quoted cells
    def to_CSV
      csv_string = CSV.generate(force_quotes: true) do |csv|
        @internal_collection.each { |fact| csv << fact.values }
      end
      csv_string.encode("utf-8")
    end

    private

    ##
    # The system must enforce that the time_stamps are strictly monotonic.
    #
    # This has been detected because on Java (JRuby) the wall time has
    # a resolution of only 1 ms, so sometimes the exact same value for
    # Time.now was reported.
    #
    # When the current time is not later than the newest time_stamp in
    # the graph, the newest time_stamp plus a small increment is used.
    def enforce_strictly_monotonic_time(fact)
      now = Time.now.utc
      latest = newest_time_stamp
      fact.time_stamp =
        if latest && now <= latest
          latest + 0.000_000_002 # Add approx. 2 nanoseconds
        else
          now
        end
    end

  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module Dbd
  module Helpers

    ##
    # Transforms the mixing class into an OrderedSet.
    #
    # On the mixing class, enumerable functions are possible,
    # looping over the set in O(n), but it is not intended
    # that the mixing class allows arbitrary access into
    # the collection.
    #
    # The *add_and_return_index* module method allows to get
    # an index to an added element, so indexes can be
    # built to access elements in O(1). The mixing class
    # should not expose this index to the added element in
    # its public API. The goal is to allow other
    # implementations (e.g. with Hadoop, Neo4j, ...) with
    # the same API.
    module OrderedSetCollection

      include Enumerable

      ##
      # Creates @internal_collection in the mixing class.
      def initialize
        @internal_collection = []
        super
      end

      ##
      # Inserts an element at the end of the collection.
      # Returns self to allow chaining.
      # @param [Object] element
      # @return [Object] self
      def <<(element)
        @internal_collection << element
        self
      end

      ##
      # For the Enumerable functionality.
      #
      # Yields each element in insertion order. Without a block an
      # Enumerator is returned (the usual Enumerable convention)
      # instead of failing on the first yield.
      # @return [Enumerator] when no block is given
      def each
        return enum_for(:each) unless block_given?
        @internal_collection.each do |element|
          yield element
        end
      end

      ##
      # This is required as an efficient way to find the last
      # element without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Object] the last element
      def last
        @internal_collection.last
      end

      ##
      # This is required as an efficient way to find the size
      # without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Integer] the number of elements
      def size
        @internal_collection.size
      end

      ##
      # Adds an element at the end of the collection and
      # returns the array index of that element.
      #
      # This is not an instance method to avoid it ending
      # up in the public API of classes that mixin this module.
      #
      # The implementation to find the index of the inserted
      # element with `rindex` is primitive, but I did not see
      # a better way in Ruby to do this (using `size` would
      # certainly be not thread safe, maybe the current
      # approach is thread safe, but that is not tested).
      # @param [Object] element the element to add
      # @param [Array] collection the collection to add the element to
      # @return [Integer] index
      def self.add_and_return_index(element, collection)
        collection << element
        collection.rindex(element)
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'securerandom'
|
2
|
+
|
3
|
+
module Dbd
  module Helpers

    ##
    # Small UUID holder; the value is taken from SecureRandom.uuid.
    class UUID

      class << self

        ##
        # Pattern for the textual form of a uuid: 5 groups of lowercase
        # hexadecimal digits (8-4-4-4-12) joined by dashes. The pattern
        # is not anchored. Can be used in tests.
        # @return [Regexp]
        def regexp
          /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/
        end

      end

      ##
      # Generates a uuid with SecureRandom.uuid and keeps it.
      # @return [void]
      def initialize
        @uuid = SecureRandom.uuid
      end

      ##
      # The stored uuid as a string.
      # @return [String]
      def to_s
        @uuid.to_s
      end

    end
  end
end
|