dbd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +8 -0
  6. data/Guardfile +7 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +97 -0
  9. data/Rakefile +1 -0
  10. data/dbd.gemspec +30 -0
  11. data/docs/rationale.md +17 -0
  12. data/docs/stories/001_create_a_fact.txt +15 -0
  13. data/docs/stories/002_create_a_facts_collection.txt +14 -0
  14. data/docs/stories/003_create_a_fact_origin.txt +15 -0
  15. data/docs/stories/004_create_fact_origins_collection.txt +8 -0
  16. data/docs/stories/005_CSV_export_the_graph.txt +18 -0
  17. data/docs/stories/006_refactor_fact_origin_to_provenance_fact.txt +20 -0
  18. data/docs/stories/007_rename_property_to_predicate.txt +6 -0
  19. data/docs/stories/008_testing_different_ruby_versions.txt +7 -0
  20. data/docs/stories/009_build_and_store_resources_with_provenance.txt +38 -0
  21. data/docs/stories/010_provenance_fact_properties_from_provenance_ontology.txt +10 -0
  22. data/docs/test.rb +32 -0
  23. data/lib/dbd.rb +13 -0
  24. data/lib/dbd/errors.rb +11 -0
  25. data/lib/dbd/fact.rb +182 -0
  26. data/lib/dbd/fact/collection.rb +60 -0
  27. data/lib/dbd/fact/id.rb +19 -0
  28. data/lib/dbd/fact/subject.rb +21 -0
  29. data/lib/dbd/graph.rb +47 -0
  30. data/lib/dbd/helpers/ordered_set_collection.rb +86 -0
  31. data/lib/dbd/helpers/uuid.rb +33 -0
  32. data/lib/dbd/provenance_fact.rb +76 -0
  33. data/lib/dbd/provenance_resource.rb +54 -0
  34. data/lib/dbd/rdf.rb +9 -0
  35. data/lib/dbd/repo.rb +8 -0
  36. data/lib/dbd/repo/neo4j_repo.rb +4 -0
  37. data/lib/dbd/repo/neo4j_repo/base.rb +55 -0
  38. data/lib/dbd/resource.rb +117 -0
  39. data/lib/dbd/version.rb +3 -0
  40. data/spec/factories/fact.rb +76 -0
  41. data/spec/factories/provenance_fact.rb +34 -0
  42. data/spec/factories/provenance_resource.rb +16 -0
  43. data/spec/factories/resource.rb +17 -0
  44. data/spec/lib/dbd/fact/collection_spec.rb +236 -0
  45. data/spec/lib/dbd/fact/id_spec.rb +19 -0
  46. data/spec/lib/dbd/fact/subject_spec.rb +19 -0
  47. data/spec/lib/dbd/fact_spec.rb +217 -0
  48. data/spec/lib/dbd/graph_spec.rb +214 -0
  49. data/spec/lib/dbd/helpers/ordered_set_collection_spec.rb +88 -0
  50. data/spec/lib/dbd/helpers/uuid_spec.rb +15 -0
  51. data/spec/lib/dbd/provenance_fact_spec.rb +108 -0
  52. data/spec/lib/dbd/provenance_resource_spec.rb +77 -0
  53. data/spec/lib/dbd/rdf_base_spec.rb +39 -0
  54. data/spec/lib/dbd/repo/neo4j_repo/base_spec.rb +85 -0
  55. data/spec/lib/dbd/repo/neo4j_repo/performance_spec.rb +40 -0
  56. data/spec/lib/dbd/resource_spec.rb +166 -0
  57. data/spec/spec_helper.rb +19 -0
  58. metadata +272 -0
@@ -0,0 +1,11 @@
1
module Dbd

  ##
  # Common ancestor of every error class defined by Dbd.
  #
  # It is itself a StandardError, so every existing
  # `rescue StandardError` (or bare `rescue`) keeps working, while
  # callers can now also catch all library errors at once with
  # `rescue Dbd::Error`.
  class Error < StandardError ; end

  # Raised when something is added out of order to the fact stream.
  class OutOfOrderError < Error ; end
  # Raised when a fact is not valid.
  class FactError < Error ; end

  # One error class per part of a fact that can be rejected.
  class ProvenanceError < Error ; end
  class SubjectError < Error ; end
  class PredicateError < Error ; end
  class ObjectError < Error ; end

end
@@ -0,0 +1,182 @@
1
+ require 'dbd/fact/collection'
2
+ require 'dbd/fact/subject'
3
+ require 'dbd/fact/id'
4
+
5
module Dbd

  ##
  # Basic Fact of knowledge.
  #
  # The database is built as an ordered sequence of facts, the "fact stream".
  #
  # This is somewhat similar to a "triple" in the RDF (Resource Description
  # Framework) concept, but with different and extended functionality.
  #
  # Each basic fact has:
  # * a unique and invariant *id* (a uuid)
  #
  #   To allow referencing back to it (e.g. to invalidate it later in the fact stream).
  #
  # * a *time_stamp* (time with nanosecond granularity)
  #
  #   To allow verifying that the order in the fact stream is correct.
  #
  #   A time_stamp does not need to represent the exact time of the
  #   creation of the fact, but it has to increase in strictly monotonic
  #   order in the fact stream.
  #
  # * a *provenance_subject* (a uuid)
  #
  #   The subject of the ProvenanceResource (a set of ProvenanceFacts with
  #   the same subject) about this fact. Each Fact points *back* to a
  #   ProvenanceResource (the ProvenanceResource must have been fully
  #   defined, earlier in the fact stream).
  #
  # * a *subject* (a uuid)
  #
  #   "About which Resource is this fact?".
  #
  #   Similar to the subject of an RDF triple, except that this subject is not
  #   a URI, but an abstract uuid (that is world-wide unique and invariant).
  #
  #   Links to "real-world" URIs and URLs can be added later as separate facts
  #   (this also allows linking multiple "real-world" URIs to a single Resource).
  #
  # * a *predicate* (a string)
  #
  #   "Which property of the resource are we describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to predicate
  #   in RDF. Probably more detailed modeling using RDF predicate will follow.
  #
  # * an *object* (a string)
  #
  #   "What is the value of the property of the resource we are describing?".
  #
  #   Currently this is a string, but I suggest modeling this similar to object
  #   in RDF. Probably more detailed modeling using RDF object will follow.
  class Fact

    ##
    # @return [Array<Symbol>] The 6 attributes of a Fact.
    def self.attributes
      [:id,
       :time_stamp,
       :provenance_subject,
       :subject,
       :predicate,
       :object]
    end

    # One reader per attribute; the only writer is time_stamp= below.
    attr_reader(*attributes)

    ##
    # Sets the time_stamp. This is possible only once: as soon as a
    # (truthy) time_stamp is present, it can no longer be changed.
    #
    # @param [Object] time_stamp the time stamp to store
    # @raise [RuntimeError] if a time_stamp was already set
    def time_stamp=(time_stamp)
      raise RuntimeError, "time_stamp is already set and cannot be changed" if @time_stamp
      @time_stamp = time_stamp
    end

    ##
    # @return [Subject] A new random subject.
    def self.new_subject
      Subject.new
    end

    ##
    # @return [ID] A new random id.
    def self.new_id
      ID.new
    end

    ##
    # Builds a new Fact.
    #
    # The id is generated here; the time_stamp is left unset.
    #
    # @param [Hash{Symbol => Object}] options
    # @option options [#to_s] :predicate Required : the predicate for this Fact
    # @option options [#to_s] :object Required : the object for this Fact
    # @option options [Subject] :provenance_subject (nil) Optional: the subject of the provenance(resource|fact)
    # @option options [Subject] :subject (nil) Optional: the subject for this Fact
    # @raise [PredicateError] if the predicate is nil
    # @raise [ObjectError] if the object is nil
    def initialize(options)
      @id = self.class.new_id
      @provenance_subject = options[:provenance_subject]
      @subject = options[:subject]
      @predicate = options[:predicate]
      @object = options[:object]
      raise PredicateError, "predicate cannot be nil" if predicate.nil?
      raise ObjectError, "object cannot be nil" if object.nil?
    end

    ##
    # @return [Array] The 6 values of a Fact, in the order of Fact.attributes.
    def values
      self.class.attributes.map { |attribute| public_send(attribute) }
    end

    ##
    # Executes the required update in used_provenance_subjects.
    #
    # For a Fact, pointing to a ProvenanceResource in its provenance_subject,
    # marks this provenance_subject in the "used_provenance_subjects" hash that
    # is passed in as an argument (DCI). This will avoid further changes to the
    # ProvenanceResource with this provenance_subject.
    #
    # This is overridden in the ProvenanceFact, since only relevant for a Fact.
    #
    # @param [Hash] h the hash in which the provenance_subject is marked
    def update_used_provenance_subjects(h)
      # using a provenance_subject sets the key
      h[provenance_subject] = true
    end

    ##
    # Checks if a fact is valid for storing in the graph.
    #
    # The result is meant to be used for its truthiness only: here it is
    # the subject when both provenance_subject and subject are present.
    #
    # @return [#true?] not nil if valid
    def valid?
      # id not validated, is set automatically
      # predicate not validated, is validated in initialize
      # object not validated, is validated in initialize
      provenance_subject_valid?(provenance_subject) &&
        subject
    end

    ##
    # Validates the presence or absence of provenance_subject.
    #
    # Here, in (base) Fact, provenance_subject must be present.
    # In the derived ProvenanceFact it must not be present.
    # This is how the difference is encoded between Fact and
    # ProvenanceFact in the fact stream.
    #
    # @param [#nil?] provenance_subject
    # @return [#true?] the provenance_subject itself, so truthy when present
    def provenance_subject_valid?(provenance_subject)
      provenance_subject
    end

    ##
    # Builds duplicate with the subject set.
    #
    # The duplicate is built with new, so it gets a new id and no time_stamp.
    #
    # @param [Subject] subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_subject(subject_arg)
      self.class.new(
        provenance_subject: provenance_subject,
        subject: subject_arg, # from arg
        predicate: predicate,
        object: object)
    end

    ##
    # Builds duplicate with the provenance_subject set.
    #
    # The duplicate is built with new, so it gets a new id and no time_stamp.
    #
    # @param [Subject] provenance_subject_arg
    # @return [Fact] the duplicate fact
    def dup_with_provenance_subject(provenance_subject_arg)
      self.class.new(
        provenance_subject: provenance_subject_arg, # from arg
        subject: subject,
        predicate: predicate,
        object: object)
    end

  end
end
@@ -0,0 +1,60 @@
1
+ require 'dbd/helpers/ordered_set_collection'
2
+
3
module Dbd
  class Fact
    ##
    # Mixin that turns the including class into an ordered collection
    # of facts, with an index to look up the facts by subject.
    module Collection

      include Helpers::OrderedSetCollection

      ##
      # Sets up the internal collection (through super) and the two
      # lookup hashes.
      def initialize
        super
        # subject => array of indexes into @internal_collection
        @hash_by_subject = Hash.new { |h, k| h[k] = [] }
        # provenance_subject => true, for each provenance_subject that
        # was marked by an added fact
        @used_provenance_subjects = {}
      end

      ##
      # @return [Object, nil] the time_stamp of the last entry,
      #   nil when the collection is empty
      def newest_time_stamp
        newest_entry = @internal_collection.last
        newest_entry && newest_entry.time_stamp
      end

      ##
      # @return [Object, nil] the time_stamp of the first entry,
      #   nil when the collection is empty
      def oldest_time_stamp
        oldest_entry = @internal_collection.first
        oldest_entry && oldest_entry.time_stamp
      end

      ##
      # This is the central method of Fact::Collection module
      #
      # @param [Fact] fact the fact that is added to the collection
      #
      # @return [self] for chaining
      #
      # @raise [FactError] if the fact is not valid
      # @raise [OutOfOrderError] if the fact is not newer than the newest
      #   entry, or if its subject was already used as a provenance_subject
      #
      # Validates that added fact is valid.
      #
      # Validates that added fact is newer.
      #
      # Validates that subject was never used as provenance_subject [A].
      #
      # Adds the fact and gets its index in the collection.
      #
      # Stores this index in the hash_by_subject.
      #
      # Marks the fact in the list of used provenance_subjects (for [A]).
      def <<(fact)
        raise FactError, "the fact is not valid" unless fact.valid?
        newest = newest_time_stamp
        if newest && fact.time_stamp <= newest
          raise OutOfOrderError,
                "the time_stamp of the fact is not newer than the newest time_stamp in the collection"
        end
        if @used_provenance_subjects[fact.subject]
          raise OutOfOrderError,
                "the subject of the fact was already used as a provenance_subject"
        end
        index = Helpers::OrderedSetCollection.add_and_return_index(fact, @internal_collection)
        @hash_by_subject[fact.subject] << index
        fact.update_used_provenance_subjects(@used_provenance_subjects)
        self
      end

      ##
      # Finds the facts that were added with the given subject.
      #
      # @param [Object] fact_subject the subject to look up
      # @return [Array] the facts with that subject, in the order in
      #   which they were added (empty if the subject is unknown)
      def by_subject(fact_subject)
        # fetch bypasses the default block of the hash, so that merely
        # looking up an unknown subject does not store a new key.
        @hash_by_subject.fetch(fact_subject, []).map { |index| @internal_collection[index] }
      end

    end
  end
end
@@ -0,0 +1,19 @@
1
module Dbd
  class Fact
    ##
    # The id of a Fact: a thin wrapper around a Helpers::UUID that is
    # generated when the ID is built.
    class ID

      class << self
        ##
        # @return [Object] whatever Helpers::UUID.regexp returns
        def regexp
          Helpers::UUID.regexp
        end
      end

      ##
      # Generates and stores a new Helpers::UUID.
      def initialize
        @uuid = Helpers::UUID.new
      end

      ##
      # @return [Object] the to_s of the wrapped uuid
      def to_s
        @uuid.to_s
      end

    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'dbd/helpers/uuid'
2
+
3
module Dbd
  class Fact
    ##
    # The subject of a Fact: a thin wrapper around a Helpers::UUID that
    # is generated when the Subject is built.
    class Subject

      class << self
        ##
        # @return [Object] whatever Helpers::UUID.regexp returns
        def regexp
          Helpers::UUID.regexp
        end
      end

      ##
      # Generates and stores a new Helpers::UUID.
      def initialize
        @uuid = Helpers::UUID.new
      end

      ##
      # @return [Object] the to_s of the wrapped uuid
      def to_s
        @uuid.to_s
      end

    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'csv'
2
+
3
module Dbd

  ##
  # The Graph stores the Facts and ProvenanceFacts in an in-memory
  # collection structure.
  class Graph

    include Fact::Collection

    ##
    # Sets the time_stamp of the fact and adds it to the graph.
    #
    # The time_stamp is assigned before the checks of Fact::Collection#<<
    # are executed.
    #
    # @param [Fact] fact the fact to add
    # @return [self] for chaining (the result of Fact::Collection#<<)
    def <<(fact)
      enforce_strictly_monotonic_time(fact)
      super(fact)
    end

    ##
    # Export the graph to a CSV string
    #
    # Each fact becomes one row, built from fact.values.
    #
    # @return [String] comma separated string with double quoted cells
    def to_CSV
      CSV.generate(force_quotes: true) do |csv|
        @internal_collection.each do |fact|
          csv << fact.values
        end
      end.encode("utf-8")
    end

    private

    ##
    # The system must enforce that the time_stamps are strictly monotonic.
    #
    # This has been detected because on Java (JRuby) the Wall time has
    # a resolution of only 1 ms so sometimes, the exact same value for
    # Time.now was reported.
    #
    # @param [Fact] fact the fact that receives the time_stamp
    def enforce_strictly_monotonic_time(fact)
      new_time = Time.now.utc
      newest = newest_time_stamp
      if newest && new_time <= newest
        # A Rational adds exactly 2 nanoseconds; a Float literal such as
        # 0.000_000_002 has no exact binary representation.
        new_time = newest + Rational(2, 1_000_000_000)
      end
      fact.time_stamp = new_time
    end

  end
end
@@ -0,0 +1,86 @@
1
module Dbd
  module Helpers

    ##
    # Transforms the mixing class into an OrderedSet.
    #
    # On the mixing class, enumerable functions are possible,
    # looping over the set in O(n), but it is not intended
    # that the mixing class allows arbitrary access into
    # the collection.
    #
    # The *add_and_return_index* module method allows to get
    # an index to an added element, so indexes can be
    # built to access elements in O(1). The mixing class
    # should not expose this index to the added element in
    # its public API. The goal is to allow other
    # implementations (e.g. with Hadoop, Neo4j, ...) with
    # the same API.
    module OrderedSetCollection

      include Enumerable

      ##
      # Creates @internal_collection in the mixing class.
      def initialize
        @internal_collection = []
        super
      end

      ##
      # Inserts an element at the end of the collection.
      # Returns self to allow chaining.
      # @param [Object] element
      # @return [Object] self
      def <<(element)
        @internal_collection << element
        self
      end

      ##
      # For the Enumerable functionality.
      #
      # Without a block an Enumerator is returned (instead of failing
      # with a LocalJumpError on the first element). With a block,
      # self is returned, so that the internal collection is not
      # handed out to the caller.
      #
      # @yield [Object] each element, in insertion order
      # @return [Enumerator, Object] an Enumerator without a block,
      #   self with a block
      def each
        return enum_for(:each) unless block_given?
        @internal_collection.each do |e|
          yield e
        end
        self
      end

      ##
      # This is required as an efficient way to find the last
      # element without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Object] the last element (nil when empty)
      def last
        @internal_collection.last
      end

      ##
      # This is required as an efficient way to find the size
      # without stepping through the entire collection.
      # This implementation is probably not thread safe.
      # @return [Integer] the number of elements
      def size
        @internal_collection.size
      end

      ##
      # Adds an element at the end of the collection and
      # returns the array index of that element.
      #
      # This is not an instance method to avoid it ending
      # up in the public API of classes that mixin this module.
      #
      # The implementation to find the index of the inserted
      # element with `rindex` is primitive, but I did not see
      # a better way in Ruby to do this (using `size` would
      # certainly be not thread safe, maybe the current
      # approach is thread safe, but that is not tested).
      # @param [Object] element the element to add
      # @param [Array] collection the collection to add it to
      # @return [Integer] index
      def self.add_and_return_index(element, collection)
        collection << element
        collection.rindex(element)
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'securerandom'
2
+
3
module Dbd
  module Helpers

    ##
    # A simple UUID implementation based on SecureRandom.
    class UUID

      class << self
        ##
        # A regexp that can be used in tests.
        #
        # It is not anchored, so it also matches a uuid that is
        # embedded in a longer string.
        # @return [Regexp]
        def regexp
          /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/
        end
      end

      ##
      # Store a SecureRandom.uuid.
      # @return [void]
      def initialize
        @uuid = SecureRandom.uuid
      end

      ##
      # The to_s of the uuid.
      # @return [String]
      def to_s
        @uuid.to_s
      end

    end
  end
end
+ end