dbd 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +8 -0
  6. data/Guardfile +7 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +97 -0
  9. data/Rakefile +1 -0
  10. data/dbd.gemspec +30 -0
  11. data/docs/rationale.md +17 -0
  12. data/docs/stories/001_create_a_fact.txt +15 -0
  13. data/docs/stories/002_create_a_facts_collection.txt +14 -0
  14. data/docs/stories/003_create_a_fact_origin.txt +15 -0
  15. data/docs/stories/004_create_fact_origins_collection.txt +8 -0
  16. data/docs/stories/005_CSV_export_the_graph.txt +18 -0
  17. data/docs/stories/006_refactor_fact_origin_to_provenance_fact.txt +20 -0
  18. data/docs/stories/007_rename_property_to_predicate.txt +6 -0
  19. data/docs/stories/008_testing_different_ruby_versions.txt +7 -0
  20. data/docs/stories/009_build_and_store_resources_with_provenance.txt +38 -0
  21. data/docs/stories/010_provenance_fact_properties_from_provenance_ontology.txt +10 -0
  22. data/docs/test.rb +32 -0
  23. data/lib/dbd.rb +13 -0
  24. data/lib/dbd/errors.rb +11 -0
  25. data/lib/dbd/fact.rb +182 -0
  26. data/lib/dbd/fact/collection.rb +60 -0
  27. data/lib/dbd/fact/id.rb +19 -0
  28. data/lib/dbd/fact/subject.rb +21 -0
  29. data/lib/dbd/graph.rb +47 -0
  30. data/lib/dbd/helpers/ordered_set_collection.rb +86 -0
  31. data/lib/dbd/helpers/uuid.rb +33 -0
  32. data/lib/dbd/provenance_fact.rb +76 -0
  33. data/lib/dbd/provenance_resource.rb +54 -0
  34. data/lib/dbd/rdf.rb +9 -0
  35. data/lib/dbd/repo.rb +8 -0
  36. data/lib/dbd/repo/neo4j_repo.rb +4 -0
  37. data/lib/dbd/repo/neo4j_repo/base.rb +55 -0
  38. data/lib/dbd/resource.rb +117 -0
  39. data/lib/dbd/version.rb +3 -0
  40. data/spec/factories/fact.rb +76 -0
  41. data/spec/factories/provenance_fact.rb +34 -0
  42. data/spec/factories/provenance_resource.rb +16 -0
  43. data/spec/factories/resource.rb +17 -0
  44. data/spec/lib/dbd/fact/collection_spec.rb +236 -0
  45. data/spec/lib/dbd/fact/id_spec.rb +19 -0
  46. data/spec/lib/dbd/fact/subject_spec.rb +19 -0
  47. data/spec/lib/dbd/fact_spec.rb +217 -0
  48. data/spec/lib/dbd/graph_spec.rb +214 -0
  49. data/spec/lib/dbd/helpers/ordered_set_collection_spec.rb +88 -0
  50. data/spec/lib/dbd/helpers/uuid_spec.rb +15 -0
  51. data/spec/lib/dbd/provenance_fact_spec.rb +108 -0
  52. data/spec/lib/dbd/provenance_resource_spec.rb +77 -0
  53. data/spec/lib/dbd/rdf_base_spec.rb +39 -0
  54. data/spec/lib/dbd/repo/neo4j_repo/base_spec.rb +85 -0
  55. data/spec/lib/dbd/repo/neo4j_repo/performance_spec.rb +40 -0
  56. data/spec/lib/dbd/resource_spec.rb +166 -0
  57. data/spec/spec_helper.rb +19 -0
  58. metadata +272 -0
@@ -0,0 +1,11 @@
1
+ module Dbd
2
+
3
+ class OutOfOrderError < StandardError ; end
4
+ class FactError < StandardError ; end
5
+
6
+ class ProvenanceError < StandardError ; end
7
+ class SubjectError < StandardError ; end
8
+ class PredicateError < StandardError ; end
9
+ class ObjectError < StandardError ; end
10
+
11
+ end
@@ -0,0 +1,182 @@
1
+ require 'dbd/fact/collection'
2
+ require 'dbd/fact/subject'
3
+ require 'dbd/fact/id'
4
+
5
+ module Dbd
6
+
7
+ ##
8
+ # Basic Fact of knowledge.
9
+ #
10
+ # The database is built as an ordered sequence of facts, the "fact stream".
11
+ #
12
+ # This is somewhat similar to a "triple" in the RDF (Resource Description
13
+ # Framework) concept, but with different and extended functionality.
14
+ #
15
+ # Each basic fact has:
16
+ # * a unique and invariant *id* (a uuid)
17
+ #
18
+ # To allow referencing back to it (e.g. to invalidate it later in the fact stream).
19
+ #
20
+ # * a *time_stamp* (time with nanosecond granularity)
21
+ #
22
+ # To allow verifying that the order in the fact stream is correct.
23
+ #
24
+ # A time_stamp does not need to represent the exact time of the
25
+ # creation of the fact, but it has to increase in strictly monotonic
26
+ # order in the fact stream.
27
+ #
28
+ # * a *provenance_subject* (a uuid)
29
+ #
30
+ # The subject of the ProvenanceResource (a set of ProvenanceFacts with
31
+ # the same subject) about this fact. Each Fact points *back* to a
32
+ # ProvenanceResource (the ProvenanceResource must have been fully
33
+ # defined, earlier in the fact stream).
34
+ #
35
+ # * a *subject* (a uuid)
36
+ #
37
+ # "About which Resource is this fact?".
38
+ #
39
+ # Similar to the subject of an RDF triple, except that this subject is not
40
+ # a URI, but an abstract uuid (that is world-wide unique and invariant).
41
+ #
42
+ # Links to "real-world" URIs and URLs can be added later as separate facts
43
+ # (this also allows linking multiple "real-world" URIs to a single Resource).
44
+ #
45
+ # * a *predicate* (a string)
46
+ #
47
+ # "Which property of the resource are we describing?".
48
+ #
49
+ # Currently this is a string, but I suggest modeling this similar to predicate
50
+ # in RDF. Probably more detailed modeling using RDF predicate will follow.
51
+ #
52
+ # * an *object* (a string)
53
+ #
54
+ # "What is the value of the property of the resource we are describing?".
55
+ #
56
+ # Currently this is a string, but I suggest modeling this similar to object
57
+ # in RDF. Probably more detailed modeling using RDF object will follow.
58
+ class Fact
59
+
60
+ ##
61
+ # @return [Array] The 6 attributes of a Fact.
62
+ def self.attributes
63
+ [:id,
64
+ :time_stamp,
65
+ :provenance_subject,
66
+ :subject,
67
+ :predicate,
68
+ :object]
69
+ end
70
+
71
+ attributes.each do |attribute|
72
+ attr_reader attribute
73
+ end
74
+
75
+ def time_stamp=(time_stamp)
76
+ raise RuntimeError if @time_stamp
77
+ @time_stamp = time_stamp
78
+ end
79
+
80
+ ##
81
+ # @return [Subject] A new random subject.
82
+ def self.new_subject
83
+ Subject.new
84
+ end
85
+
86
+ ##
87
+ # @return [ID] A new random id.
88
+ def self.new_id
89
+ ID.new
90
+ end
91
+
92
+ ##
93
+ # Builds a new Fact.
94
+ #
95
+ # @param [Hash{Symbol => Object}] options
96
+ # @option options [#to_s] :predicate Required : the predicate for this Fact
97
+ # @option options [#to_s] :object Required : the object for this Fact
98
+ # @option options [Subject] :provenance_subject (nil) Optional: the subject of the provenance(resource|fact)
99
+ # @option options [Subject] :subject (nil) Optional: the subject for this Fact
100
+ def initialize(options)
101
+ @id = self.class.new_id
102
+ @provenance_subject = options[:provenance_subject]
103
+ @subject = options[:subject]
104
+ @predicate = options[:predicate]
105
+ @object = options[:object]
106
+ raise PredicateError, "predicate cannot be nil" if predicate.nil?
107
+ raise ObjectError, "object cannot be nil" if object.nil?
108
+ end
109
+
110
+ ##
111
+ # @return [Array] The 6 values of a Fact.
112
+ def values
113
+ self.class.attributes.map{ |attribute| self.send(attribute) }
114
+ end
115
+
116
+ ##
117
+ # Executes the required update in used_provenance_subjects.
118
+ #
119
+ # For a Fact, pointing to a ProvenanceResource in its provenance_subject,
120
+ # marks this provenance_subject in the "used_provenance_subjects" hash that
121
+ # is passed in as an argument (DCI). This will avoid further changes to the
122
+ # ProvenanceResource with this provenance_subject.
123
+ #
124
+ # This is overridden in the ProvenanceFact, since it is only relevant for a Fact.
125
+ def update_used_provenance_subjects(h)
126
+ # using a provenance_subject sets the key
127
+ h[provenance_subject] = true
128
+ end
129
+
130
+ ##
131
+ # Checks if a fact is valid for storing in the graph.
132
+ #
133
+ # @return [#true?] not nil if valid
134
+ def valid?
135
+ # id not validated, is set automatically
136
+ # predicate not validated, is validated in initialize
137
+ # object not validated, is validated in initialize
138
+ provenance_subject_valid?(provenance_subject) &&
139
+ subject
140
+ end
141
+
142
+ ##
143
+ # Validates the presence or absence of provenance_subject.
144
+ #
145
+ # Here, in (base) Fact, provenance_subject must be present
146
+ # In the derived ProvenanceFact it must not be present.
147
+ # This is how the difference is encoded between Fact and
148
+ # ProvenanceFact in the fact stream.
149
+ # @param [#nil?] provenance_subject
150
+ # @return [Object, nil] the provenance_subject itself (truthy when present)
151
+ def provenance_subject_valid?(provenance_subject)
152
+ provenance_subject
153
+ end
154
+
155
+ ##
156
+ # Builds duplicate with the subject set.
157
+ #
158
+ # @param [Subject] subject_arg
159
+ # @return [Fact] the duplicate fact
160
+ def dup_with_subject(subject_arg)
161
+ self.class.new(
162
+ provenance_subject: provenance_subject,
163
+ subject: subject_arg, # from arg
164
+ predicate: predicate,
165
+ object: object)
166
+ end
167
+
168
+ ##
169
+ # Builds duplicate with the provenance_subject set.
170
+ #
171
+ # @param [Subject] provenance_subject_arg
172
+ # @return [Fact] the duplicate fact
173
+ def dup_with_provenance_subject(provenance_subject_arg)
174
+ self.class.new(
175
+ provenance_subject: provenance_subject_arg, # from arg
176
+ subject: subject,
177
+ predicate: predicate,
178
+ object: object)
179
+ end
180
+
181
+ end
182
+ end
@@ -0,0 +1,60 @@
1
+ require 'dbd/helpers/ordered_set_collection'
2
+
3
+ module Dbd
4
+ class Fact
5
+ module Collection
6
+
7
+ include Helpers::OrderedSetCollection
8
+
9
+ def initialize
10
+ super
11
+ @hash_by_subject = Hash.new { |h, k| h[k] = [] }
12
+ @used_provenance_subjects = {}
13
+ end
14
+
15
+ def newest_time_stamp
16
+ newest_entry = @internal_collection.last
17
+ newest_entry && newest_entry.time_stamp
18
+ end
19
+
20
+ def oldest_time_stamp
21
+ oldest_entry = @internal_collection.first
22
+ oldest_entry && oldest_entry.time_stamp
23
+ end
24
+
25
+ ##
26
+ # This is the central method of Fact::Collection module
27
+ #
28
+ # @param [Fact] fact the fact that is added to the collection
29
+ #
30
+ # @return [self] for chaining
31
+ #
32
+ # Validates that added fact is valid.
33
+ #
34
+ # Validates that added fact is newer.
35
+ #
36
+ # Validates that subject was never used as provenance_subject [A].
37
+ #
38
+ # Adds the fact and returns the index in the collection.
39
+ #
40
+ # Store this index in the hash_by_subject.
41
+ #
42
+ # Mark the fact in the list of used provenance_subjects (for [A]).
43
+ def <<(fact)
44
+ # TODO Add a more descriptive Exception message
45
+ raise FactError unless fact.valid?
46
+ raise OutOfOrderError if (self.newest_time_stamp && fact.time_stamp <= self.newest_time_stamp)
47
+ raise OutOfOrderError if (@used_provenance_subjects[fact.subject])
48
+ index = Helpers::OrderedSetCollection.add_and_return_index(fact, @internal_collection)
49
+ @hash_by_subject[fact.subject] << index
50
+ fact.update_used_provenance_subjects(@used_provenance_subjects)
51
+ self
52
+ end
53
+
54
+ def by_subject(fact_subject)
55
+ @hash_by_subject[fact_subject].map{ |index| @internal_collection[index]}
56
+ end
57
+
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,19 @@
1
+ module Dbd
2
+ class Fact
3
+ class ID
4
+
5
+ def initialize
6
+ @uuid = Helpers::UUID.new
7
+ end
8
+
9
+ def to_s
10
+ @uuid.to_s
11
+ end
12
+
13
+ def self.regexp
14
+ Helpers::UUID.regexp
15
+ end
16
+
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ require 'dbd/helpers/uuid'
2
+
3
+ module Dbd
4
+ class Fact
5
+ class Subject
6
+
7
+ def initialize
8
+ @uuid = Helpers::UUID.new
9
+ end
10
+
11
+ def to_s
12
+ @uuid.to_s
13
+ end
14
+
15
+ def self.regexp
16
+ Helpers::UUID.regexp
17
+ end
18
+
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,47 @@
1
+ require 'csv'
2
+
3
+ module Dbd
4
+
5
+ ##
6
+ # The Graph stores the Facts and ProvenanceFacts in an in-memory
7
+ # collection structure.
8
+ class Graph
9
+
10
+ include Fact::Collection
11
+
12
+ def <<(fact)
13
+ enforce_strictly_monotonic_time(fact)
14
+ super(fact)
15
+ end
16
+
17
+ ##
18
+ # Export the graph to a CSV string
19
+ #
20
+ # @return [String] comma separated string with double quoted cells
21
+ def to_CSV
22
+ CSV.generate(force_quotes: true) do |csv|
23
+ @internal_collection.each do |fact|
24
+ csv << fact.values
25
+ end
26
+ end.encode("utf-8")
27
+ end
28
+
29
+ private
30
+
31
+ ##
32
+ # The system must enforce that the time_stamps are strictly monotonic.
33
+ #
34
+ # This has been detected because on Java (JRuby) the wall time has
35
+ # a resolution of only 1 ms so sometimes, the exact same value for
36
+ # Time.now was reported.
37
+ def enforce_strictly_monotonic_time(fact)
38
+ new_time = Time.now.utc
39
+ newest_time_stamp = newest_time_stamp()
40
+ if newest_time_stamp && new_time <= newest_time_stamp
41
+ new_time = newest_time_stamp + 0.000_000_002 # Add approx. 2 nanoseconds
42
+ end
43
+ fact.time_stamp = new_time
44
+ end
45
+
46
+ end
47
+ end
@@ -0,0 +1,86 @@
1
+ module Dbd
2
+ module Helpers
3
+
4
+ ##
5
+ # Transforms the mixing class into an OrderedSet.
6
+ #
7
+ # On the mixing class, enumerable functions are possible,
8
+ # looping over the set in O(n), but it is not intended
9
+ # that the mixing class allows arbitrary access into
10
+ # the collection.
11
+ #
12
+ # The *add_and_return_index* module method allows getting
13
+ # an index to an added element, so indexes can be
14
+ # built to access elements in O(1). The mixing class
15
+ # should not expose this index to the added element in
16
+ # its public API. The goal is to allow other
17
+ # implementations (e.g. with Hadoop, Neo4j, ...) with
18
+ # the same API.
19
+ module OrderedSetCollection
20
+
21
+ include Enumerable
22
+
23
+ ##
24
+ # Creates @internal_collection in the mixing class.
25
+ def initialize
26
+ @internal_collection = []
27
+ super
28
+ end
29
+
30
+ ##
31
+ # Inserts an element at the end of the collection.
32
+ # Returns self to allow chaining.
33
+ # @param [Object] element
34
+ # @return [Object] self
35
+ def <<(element)
36
+ @internal_collection << element
37
+ self
38
+ end
39
+
40
+ ##
41
+ # For the Enumerable functionality.
42
+ def each
43
+ @internal_collection.each do |e|
44
+ yield e
45
+ end
46
+ end
47
+
48
+ ##
49
+ # This is required as an efficient way to find the last
50
+ # element without stepping through the entire collection.
51
+ # This implementation is probably not thread safe.
52
+ # @return [Object] the last element
53
+ def last
54
+ @internal_collection.last
55
+ end
56
+
57
+ ##
58
+ # This is required as an efficient way to find the size
59
+ # without stepping through the entire collection.
60
+ # This implementation is probably not thread safe.
61
+ # @return [Integer] the number of elements in the collection
62
+ def size
63
+ @internal_collection.size
64
+ end
65
+
66
+ ##
67
+ # Adds an element at the end of the collection and
68
+ # returns the array index of that element.
69
+ #
70
+ # This is not an instance method to avoid it ending
71
+ # up in the public API of classes that mix in this module.
72
+ #
73
+ # The implementation to find the index of the inserted
74
+ # element with `rindex` is primitive, but I did not see
75
+ # a better way in Ruby to do this (using `size` would
76
+ # certainly be not thread safe, maybe the current
77
+ # approach is thread safe, but that is not tested).
78
+ # @return [Integer] index
79
+ def self.add_and_return_index(element, collection)
80
+ collection << element
81
+ collection.rindex(element)
82
+ end
83
+
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,33 @@
1
+ require 'securerandom'
2
+
3
+ module Dbd
4
+ module Helpers
5
+
6
+ ##
7
+ # A simple UUID implementation based on SecureRandom.
8
+ class UUID
9
+
10
+ ##
11
+ # A regexp that can be used in tests.
12
+ # @return [Regexp]
13
+ def self.regexp
14
+ /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/
15
+ end
16
+
17
+ ##
18
+ # Store a SecureRandom.uuid.
19
+ # @return [void]
20
+ def initialize
21
+ @uuid = SecureRandom.uuid
22
+ end
23
+
24
+ ##
25
+ # The to_s of the uuid.
26
+ # @return [String]
27
+ def to_s
28
+ @uuid.to_s
29
+ end
30
+
31
+ end
32
+ end
33
+ end