dbd 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/.travis.yml +10 -0
- data/Gemfile +8 -0
- data/Guardfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +1 -0
- data/dbd.gemspec +30 -0
- data/docs/rationale.md +17 -0
- data/docs/stories/001_create_a_fact.txt +15 -0
- data/docs/stories/002_create_a_facts_collection.txt +14 -0
- data/docs/stories/003_create_a_fact_origin.txt +15 -0
- data/docs/stories/004_create_fact_origins_collection.txt +8 -0
- data/docs/stories/005_CSV_export_the_graph.txt +18 -0
- data/docs/stories/006_refactor_fact_origin_to_provenance_fact.txt +20 -0
- data/docs/stories/007_rename_property_to_predicate.txt +6 -0
- data/docs/stories/008_testing_different_ruby_versions.txt +7 -0
- data/docs/stories/009_build_and_store_resources_with_provenance.txt +38 -0
- data/docs/stories/010_provenance_fact_properties_from_provenance_ontology.txt +10 -0
- data/docs/test.rb +32 -0
- data/lib/dbd.rb +13 -0
- data/lib/dbd/errors.rb +11 -0
- data/lib/dbd/fact.rb +182 -0
- data/lib/dbd/fact/collection.rb +60 -0
- data/lib/dbd/fact/id.rb +19 -0
- data/lib/dbd/fact/subject.rb +21 -0
- data/lib/dbd/graph.rb +47 -0
- data/lib/dbd/helpers/ordered_set_collection.rb +86 -0
- data/lib/dbd/helpers/uuid.rb +33 -0
- data/lib/dbd/provenance_fact.rb +76 -0
- data/lib/dbd/provenance_resource.rb +54 -0
- data/lib/dbd/rdf.rb +9 -0
- data/lib/dbd/repo.rb +8 -0
- data/lib/dbd/repo/neo4j_repo.rb +4 -0
- data/lib/dbd/repo/neo4j_repo/base.rb +55 -0
- data/lib/dbd/resource.rb +117 -0
- data/lib/dbd/version.rb +3 -0
- data/spec/factories/fact.rb +76 -0
- data/spec/factories/provenance_fact.rb +34 -0
- data/spec/factories/provenance_resource.rb +16 -0
- data/spec/factories/resource.rb +17 -0
- data/spec/lib/dbd/fact/collection_spec.rb +236 -0
- data/spec/lib/dbd/fact/id_spec.rb +19 -0
- data/spec/lib/dbd/fact/subject_spec.rb +19 -0
- data/spec/lib/dbd/fact_spec.rb +217 -0
- data/spec/lib/dbd/graph_spec.rb +214 -0
- data/spec/lib/dbd/helpers/ordered_set_collection_spec.rb +88 -0
- data/spec/lib/dbd/helpers/uuid_spec.rb +15 -0
- data/spec/lib/dbd/provenance_fact_spec.rb +108 -0
- data/spec/lib/dbd/provenance_resource_spec.rb +77 -0
- data/spec/lib/dbd/rdf_base_spec.rb +39 -0
- data/spec/lib/dbd/repo/neo4j_repo/base_spec.rb +85 -0
- data/spec/lib/dbd/repo/neo4j_repo/performance_spec.rb +40 -0
- data/spec/lib/dbd/resource_spec.rb +166 -0
- data/spec/spec_helper.rb +19 -0
- metadata +272 -0
data/lib/dbd/errors.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Dbd

  ##
  # Common ancestor of all Dbd specific errors.
  #
  # It derives from StandardError, so existing +rescue StandardError+
  # (or bare +rescue+) clauses keep working, while callers can now also
  # rescue every Dbd failure with a single +rescue Dbd::Error+.
  class Error < StandardError ; end

  # Raised by Fact::Collection#<< when a fact is not strictly newer than
  # the newest fact, or when its subject was already used as a
  # provenance_subject.
  class OutOfOrderError < Error ; end

  # Raised by Fact::Collection#<< when the added fact is not valid.
  class FactError < Error ; end

  class ProvenanceError < Error ; end
  class SubjectError < Error ; end

  # Raised by Fact#initialize when the predicate is nil.
  class PredicateError < Error ; end

  # Raised by Fact#initialize when the object is nil.
  class ObjectError < Error ; end

end
|
data/lib/dbd/fact.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'dbd/fact/collection'
|
2
|
+
require 'dbd/fact/subject'
|
3
|
+
require 'dbd/fact/id'
|
4
|
+
|
5
|
+
module Dbd

  ##
  # Basic Fact of knowledge.
  #
  # The database is built as an ordered sequence of facts, the "fact stream".
  #
  # This is somewhat similar to a "triple" in the RDF (Resource Description
  # Framework) concept, but with different and extended functionality.
  #
  # Each basic fact has:
  # * a unique and invariant *id* (a uuid)
  #
  #   To allow referencing back to it (e.g. to invalidate it later in the fact stream).
  #
  # * a *time_stamp* (time with nanosecond granularity)
  #
  #   To allow verifying that the order in the fact stream is correct.
  #
  #   A time_stamp does not need to represent the exact time of the
  #   creation of the fact, but it has to increase in strictly monotonic
  #   order in the fact stream.
  #
  # * a *provenance_subject* (a uuid)
  #
  #   The subject of the ProvenanceResource (a set of ProvenanceFacts with
  #   the same subject) about this fact. Each Fact points *back* to a
  #   ProvenanceResource (the ProvenanceResource must have been fully
  #   defined, earlier in the fact stream).
  #
  # * a *subject* (a uuid)
  #
  #   "About which Resource is this fact?".
  #
  #   Similar to the subject of an RDF triple, except that this subject is not
  #   a URI, but an abstract uuid (that is world-wide unique and invariant).
  #
  #   Links to "real-world" URI's and URL's can be added later as separate facts
  #   (this also allows linking multiple "real-world" URI's to a single Resource).
  #
  # * a *predicate* (a string)
  #
  #   "Which property of the resource are we describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to predicate
  #   in RDF. Probably more detailed modeling using RDF predicate will follow.
  #
  # * an *object* (a string)
  #
  #   "What is the value of the property of the resource we are describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to object
  #   in RDF. Probably more detailed modeling using RDF object will follow.
  class Fact

    ##
    # @return [Array<Symbol>] The 6 attributes of a Fact.
    def self.attributes
      [:id,
       :time_stamp,
       :provenance_subject,
       :subject,
       :predicate,
       :object]
    end

    attributes.each do |attribute|
      attr_reader attribute
    end

    ##
    # Sets the time_stamp of this Fact.
    #
    # The time_stamp can be assigned only once: as soon as a truthy
    # time_stamp is present, any further assignment is refused.
    #
    # @param [Object] time_stamp the time_stamp to store
    # @raise [RuntimeError] if a time_stamp was already set
    def time_stamp=(time_stamp)
      raise RuntimeError, "time_stamp is already set and cannot be changed" if @time_stamp
      @time_stamp = time_stamp
    end

    ##
    # @return [Subject] A new random subject.
    def self.new_subject
      Subject.new
    end

    ##
    # @return [ID] A new random id.
    def self.new_id
      ID.new
    end

    ##
    # Builds a new Fact.
    #
    # The id is generated here; the time_stamp is left unset.
    #
    # @param [Hash{Symbol => Object}] options
    # @option options [#to_s] :predicate Required : the predicate for this Fact
    # @option options [#to_s] :object Required : the object for this Fact
    # @option options [Subject] :provenance_subject (nil) Optional: the subject of the provenance(resource|fact)
    # @option options [Subject] :subject (nil) Optional: the subject for this Fact
    # @raise [PredicateError] if the predicate is nil
    # @raise [ObjectError] if the object is nil
    def initialize(options)
      @id = self.class.new_id
      @provenance_subject = options[:provenance_subject]
      @subject = options[:subject]
      @predicate = options[:predicate]
      @object = options[:object]
      raise PredicateError, "predicate cannot be nil" if predicate.nil?
      raise ObjectError, "object cannot be nil" if object.nil?
    end

    ##
    # @return [Array] The 6 values of a Fact, in the order of Fact.attributes.
    def values
      # the attributes are public readers, so public_send is sufficient
      self.class.attributes.map { |attribute| public_send(attribute) }
    end

    ##
    # Executes the required update in used_provenance_subjects.
    #
    # For a Fact, pointing to a ProvenanceResource in its provenance_subject,
    # marks this provenance_subject in the "used_provenance_subjects" hash that
    # is passed in as an argument (DCI). This will avoid further changes to the
    # ProvenanceResource with this provenance_subject.
    #
    # This is overridden in the ProvenanceFact, since only relevant for a Fact.
    #
    # @param [Hash] h the hash of used provenance_subjects (is modified)
    def update_used_provenance_subjects(h)
      # using a provenance_subject sets the key
      h[provenance_subject] = true
    end

    ##
    # Checks if a fact is valid for storing in the graph.
    #
    # @return [Object, nil] truthy (the subject) if valid, nil or false otherwise
    def valid?
      # id not validated, is set automatically
      # predicate not validated, is validated in initialize
      # object not validated, is validated in initialize
      provenance_subject_valid?(provenance_subject) &&
      subject
    end

    ##
    # Validates the presence or absence of provenance_subject.
    #
    # Here, in (base) Fact, provenance_subject must be present
    # In the derived ProvenanceFact it must not be present.
    # This is how the difference is encoded between Fact and
    # ProvenanceFact in the fact stream.
    #
    # @param [#nil?] provenance_subject
    # @return [Object, nil] the provenance_subject itself (truthy when present)
    def provenance_subject_valid?(provenance_subject)
      provenance_subject
    end

    ##
    # Builds duplicate with the subject set.
    #
    # The duplicate is built through new, so it gets a new id and has
    # no time_stamp.
    #
    # @param [Subject] subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_subject(subject_arg)
      self.class.new(
        provenance_subject: provenance_subject,
        subject: subject_arg, # from arg
        predicate: predicate,
        object: object)
    end

    ##
    # Builds duplicate with the provenance_subject set.
    #
    # The duplicate is built through new, so it gets a new id and has
    # no time_stamp.
    #
    # @param [Subject] provenance_subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_provenance_subject(provenance_subject_arg)
      self.class.new(
        provenance_subject: provenance_subject_arg, # from arg
        subject: subject,
        predicate: predicate,
        object: object)
    end

  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'dbd/helpers/ordered_set_collection'
|
2
|
+
|
3
|
+
module Dbd
  class Fact

    ##
    # Mixin that stores facts in insertion order and keeps an index
    # of the stored facts per subject.
    module Collection

      include Helpers::OrderedSetCollection

      ##
      # Sets up the per-subject index and the registry of
      # provenance_subjects that are already referred to by a fact.
      def initialize
        super
        # subject => array of indexes into @internal_collection
        @hash_by_subject = Hash.new { |h, k| h[k] = [] }
        @used_provenance_subjects = {}
      end

      ##
      # @return [Object, nil] time_stamp of the last fact, nil when empty
      def newest_time_stamp
        newest_entry = @internal_collection.last
        newest_entry && newest_entry.time_stamp
      end

      ##
      # @return [Object, nil] time_stamp of the first fact, nil when empty
      def oldest_time_stamp
        oldest_entry = @internal_collection.first
        oldest_entry && oldest_entry.time_stamp
      end

      ##
      # This is the central method of Fact::Collection module
      #
      # @param [Fact] fact the fact that is added to the collection
      #
      # @return [self] for chaining
      #
      # @raise [FactError] if the fact is not valid
      # @raise [OutOfOrderError] if the fact is not newer than the newest
      #   fact, or if its subject was already used as a provenance_subject
      #
      # Validates that added fact is valid.
      #
      # Validates that added fact is newer.
      #
      # Validates that subject was never used as provenance_subject [A].
      #
      # Adds the fact and return the index in the collection.
      #
      # Store this index in the hash_by_subject.
      #
      # Mark the fact in the list of used provenance_subjects (for [A]).
      def <<(fact)
        unless fact.valid?
          raise FactError, "fact is not valid and cannot be added to the collection"
        end
        newest = newest_time_stamp
        if newest && fact.time_stamp <= newest
          raise OutOfOrderError,
            "time_stamp #{fact.time_stamp.inspect} is not newer than " \
            "the newest time_stamp #{newest.inspect}"
        end
        if @used_provenance_subjects[fact.subject]
          raise OutOfOrderError,
            "subject #{fact.subject} was already used as a provenance_subject"
        end
        index = Helpers::OrderedSetCollection.add_and_return_index(fact, @internal_collection)
        @hash_by_subject[fact.subject] << index
        fact.update_used_provenance_subjects(@used_provenance_subjects)
        self
      end

      ##
      # Finds all facts that were stored for a subject.
      #
      # @param [Subject] fact_subject
      # @return [Array] the facts with that subject, empty if none
      def by_subject(fact_subject)
        # fetch (not []) so that looking up an unknown subject does not
        # trigger the default block and add an empty entry to the index
        @hash_by_subject.fetch(fact_subject, []).map { |index| @internal_collection[index] }
      end

    end
  end
end
|
data/lib/dbd/fact/id.rb
ADDED
data/lib/dbd/graph.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Dbd

  ##
  # The Graph stores the Facts and ProvenanceFacts in an in-memory
  # collection structure.
  class Graph

    include Fact::Collection

    ##
    # Gives the fact a strictly increasing time_stamp and adds it
    # to the collection.
    #
    # @param [Fact] fact the fact to add
    # @return [self] for chaining (as returned by Fact::Collection#<<)
    def <<(fact)
      enforce_strictly_monotonic_time(fact)
      super(fact)
    end

    ##
    # Export the graph to a CSV string
    #
    # Each row holds the values of one fact, in collection order.
    #
    # @return [String] comma separated string with double quoted cells
    def to_CSV
      CSV.generate(force_quotes: true) do |csv|
        @internal_collection.each do |fact|
          csv << fact.values
        end
      end.encode("utf-8")
    end

  private

    ##
    # The system must enforce that the time_stamps are strictly monotonic.
    #
    # This has been detected because on Java (JRuby) the Wall time has
    # a resolution of only 1 ms so sometimes, the exact same value for
    # Time.now was reported.
    #
    # When the current time is not newer than the newest time_stamp in
    # the collection, the newest time_stamp plus 2 nanoseconds is used.
    #
    # @param [Fact] fact the fact that receives the time_stamp
    def enforce_strictly_monotonic_time(fact)
      new_time = Time.now.utc
      newest = newest_time_stamp
      if newest && new_time <= newest
        # a Rational keeps the increment at exactly 2 nanoseconds,
        # a Float literal can only approximate it
        new_time = newest + Rational(2, 1_000_000_000)
      end
      fact.time_stamp = new_time
    end

  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module Dbd
  module Helpers

    ##
    # Transforms the mixing class into an OrderedSet.
    #
    # On the mixing class, enumerable functions are possible,
    # looping over the set in O(n), but it is not intended
    # that the mixing class allows arbitrary access into
    # the collection.
    #
    # The *add_and_return_index* module method allows to get
    # an index to an added element, so indexes can be
    # built to access elements in O(1). The mixing class
    # should not expose this index to the added element in
    # its public API. The goal is to allow other
    # implementations (e.g. with Hadoop, Neo4j, ...) with
    # the same API.
    module OrderedSetCollection

      include Enumerable

      ##
      # Creates @internal_collection in the mixing class.
      def initialize
        @internal_collection = []
        super
      end

      ##
      # Inserts an element at the end of the collection.
      # Returns self to allow chaining.
      # @param [Object] element
      # @return [Object] self
      def <<(element)
        @internal_collection << element
        self
      end

      ##
      # For the Enumerable functionality.
      #
      # Yields the elements in insertion order. Without a block,
      # an Enumerator over the elements is returned (instead of
      # failing with a LocalJumpError on the first yield).
      # @return [Enumerator] when no block is given
      def each(&block)
        return to_enum(:each) unless block
        @internal_collection.each(&block)
      end

      ##
      # This is required as an efficient way to find the last
      # element without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Object] the last element
      def last
        @internal_collection.last
      end

      ##
      # This is required as an efficient way to find the size
      # without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Integer] the number of elements
      def size
        @internal_collection.size
      end

      ##
      # Adds an element at the end of the collection and
      # returns the array index of that element.
      #
      # This is not an instance method to avoid it ending
      # up in the public API of classes that mixin this module.
      #
      # The implementation to find the index of the inserted
      # element with `rindex` is primitive, but I did not see
      # a better way in Ruby to do this (using `size` would
      # certainly be not thread safe, maybe the current
      # approach is thread safe, but that is not tested).
      # @param [Object] element the element to add
      # @param [Array] collection the collection that receives the element
      # @return [Integer] index
      def self.add_and_return_index(element, collection)
        collection << element
        collection.rindex(element)
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'securerandom'
|
2
|
+
|
3
|
+
module Dbd
  module Helpers

    ##
    # A simple UUID implementation based on SecureRandom.
    class UUID

      ##
      # A regexp that can be used in tests.
      #
      # The regexp is not anchored, so it also matches a uuid that
      # is embedded in a longer string.
      # @return [Regexp]
      def self.regexp
        /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/
      end

      ##
      # Store a SecureRandom.uuid.
      # @return [void]
      def initialize
        @uuid = SecureRandom.uuid
      end

      ##
      # The to_s of the uuid.
      #
      # A copy is handed out, so that in-place changes to the
      # returned string cannot alter the stored uuid.
      # @return [String]
      def to_s
        @uuid.dup
      end

    end
  end
end
|