spinoza 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +66 -3
- data/lib/spinoza/calvin/executor.rb +107 -0
- data/lib/spinoza/calvin/node.rb +44 -0
- data/lib/spinoza/calvin/readcaster.rb +50 -0
- data/lib/spinoza/calvin/scheduler.rb +134 -0
- data/lib/spinoza/calvin/sequencer.rb +74 -0
- data/lib/spinoza/common.rb +3 -0
- data/lib/spinoza/system/link.rb +33 -0
- data/lib/spinoza/system/lock-manager.rb +22 -8
- data/lib/spinoza/system/log.rb +95 -0
- data/lib/spinoza/system/meta-log.rb +103 -0
- data/lib/spinoza/system/model.rb +14 -0
- data/lib/spinoza/system/node.rb +56 -7
- data/lib/spinoza/system/operation.rb +22 -6
- data/lib/spinoza/system/store.rb +15 -14
- data/lib/spinoza/system/{table-spec.rb → table.rb} +10 -6
- data/lib/spinoza/system/timeline.rb +81 -0
- data/lib/spinoza/transaction.rb +170 -39
- data/lib/spinoza/version.rb +1 -1
- data/test/test-executor.rb +110 -0
- data/test/test-link.rb +43 -0
- data/test/test-log.rb +47 -0
- data/test/test-meta-log.rb +63 -0
- data/test/test-node.rb +35 -14
- data/test/test-readcaster.rb +87 -0
- data/test/test-scheduler.rb +163 -0
- data/test/test-sequencer.rb +78 -0
- data/test/test-timeline.rb +58 -0
- data/test/test-transaction.rb +75 -18
- metadata +42 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8864d3908b3001de934f150becf1876972c78809
+  data.tar.gz: 7da59f8200f6b5d501842d950b98d9be440eb4ce
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f33925d0842233788b242cdde6d0d3c1311dff1d7d2e9349907516fd3ec95ba40b0961786e6f4e00accd3c8a01efe3e9732563758b3c640b34f6a4bb1b0745f7
+  data.tar.gz: 57e8c0075dc16a91180a89955f70c8548c310f10e4d0db444fa39a365c0be659174cb9ed7896bad64065263b7d31471a738d5eb7eb5f4dbad4e6f7364f4f7fd8
data/README.md
CHANGED
@@ -1,14 +1,77 @@
 spinoza
 =======
 
-A model of the Calvin distributed database.
+A model of the Calvin distributed database. The main purpose of this model is expository, rather than analysis for correctness or performance.
 
 Spinoza, like Calvin, was a philosopher who dealt in determinism.
 
-The model of the underlying computer and network system is in lib/spinoza/system.
-
 Calvin is developed by the Yale Databases group; the open-source releases are at https://github.com/yaledb.
 
+
+Structure
+=========
+
+The model of the underlying computer and network system is in [lib/spinoza/system](lib/spinoza/system).
+
+The Calvin model, implemented on the system models, is in [lib/spinoza/calvin](lib/spinoza/calvin). Other distributed transaction models could also be implemented on this layer.
+
+The transaction class, in [lib/spinoza/transaction.rb](lib/spinoza/transaction.rb), is mostly abstracted from these layers. It is very simplistic, intended to illustrate Calvin's replication and consistency characteristics.
+
+
+Running
+=======
+
+You will need ruby 2.0 or later, from http://ruby-lang.org, and the gems listed in the gemspec:
+
+    sequel
+    sqlite3
+    rbtree
+
+You can also `gem install spinoza`, but it may not be up to date.
+
+To run the unit tests:
+
+    rake test
+
+Examples TBD.
+
+
+References
+==========
+
+* The Calvin papers:
+
+  * [The Case for Determinism in Database Systems](http://cs-www.cs.yale.edu/homes/dna/papers/determinism-vldb10.pdf)
+
+  * [Consistency Tradeoffs in Modern Distributed Database System Design](http://cs-www.cs.yale.edu/homes/dna/papers/abadi-pacelc.pdf)
+
+  * [Modularity and Scalability in Calvin](http://sites.computer.org/debull/A13june/calvin1.pdf)
+
+  * [Calvin: Fast Distributed Transactions for Partitioned Database Systems](http://www.cs.yale.edu/homes/dna/papers/calvin-sigmod12.pdf)
+
+  * [Lightweight Locking for Main Memory Database Systems](http://cs-www.cs.yale.edu/homes/dna/papers/vll-vldb13.pdf)
+
+
+To do
+=====
+
+* The performance and error modeling should optionally be statistical, with variation using some distribution.
+
+* Model IO latency and compute time, in addition to currently modeled network latency.
+
+* `Log#time_replicated` should be a function of the reading node and depend on the link characteristics between that node and the writing node.
+
+* Transactions, to be more realistic, should have dataflow dependencies among operations. (But only for non-key values, because Calvin splits dependent transactions.)
+
+* Transactions also need conditionals, or, at least, conditional abort, which is needed to support the splitting mentioned above.
+
+* For comparison, implement a 2-phase commit transaction processor on top of the Spinoza::System classes.
+
+* Output spacetime diagrams using graphviz.
+
+* See also 'TODO' in code.
+
+
 Contact
 =======
 
data/lib/spinoza/calvin/executor.rb
ADDED
@@ -0,0 +1,107 @@
+require 'spinoza/common'
+
+# Represents the work performed in one thread. The scheduler assigns a sequence
+# of transactions to each of several executors. The executor handles the
+# transactions one at a time, in a series of substeps as data is received from
+# peers. Within an executor, the sequence of transactions and substeps is
+# totally ordered wrt the global timeline, but the sequences of two Executors
+# may interleave, which is how Calvin achieves some write concurrency.
+#
+# Does not have access to any subsystems except the node's Store and
+# communication with peer executors via the readcasters.
+#
+class Calvin::Executor
+  attr_reader :store
+  attr_reader :readcaster
+  attr_reader :task
+
+  class StateError < StandardError; end
+
+  # Represents the state of executing one transaction in this Executor, in
+  # the case where that execution involves waiting for data from peers.
+  class Task
+    attr_reader :txn
+
+    # Accumulates results as they arrive locally and from peers.
+    attr_accessor :read_results
+
+    # Set of tables the task is waiting for.
+    attr_accessor :remote_read_tables
+
+    def initialize txn, read_results: [], remote_read_tables: Set[]
+      @txn = txn
+      @read_results = read_results
+      @remote_read_tables = remote_read_tables
+    end
+  end
+
+  def initialize store: nil, readcaster: nil
+    @store = store
+    @readcaster = readcaster
+    ready!
+  end
+
+  def ready!
+    @task = nil
+  end
+
+  def ready?
+    @task.nil?
+  end
+
+  def assert_ready?
+    unless ready?
+      raise StateError, "cannot start new task -- already executing #{task}"
+    end
+  end
+
+  # Assumes all locks are held around this call.
+  def execute_transaction txn
+    assert_ready?
+
+    local_read_results = @readcaster.execute_local_reads txn
+    @readcaster.serve_reads txn, local_read_results
+
+    if passive? txn
+      result = local_read_results
+      ready!
+
+    elsif all_reads_are_local? txn
+      result = local_read_results
+      store.execute *txn.all_write_ops
+      ready!
+
+    else
+      @task = Task.new txn,
+        read_results: local_read_results,
+        remote_read_tables: txn.remote_read_tables(store)
+      result = false
+    end
+
+    return result
+  end
+
+  def passive? txn
+    not txn.active? store
+  end
+
+  def all_reads_are_local? txn
+    txn.all_reads_are_local? store
+  end
+
+  # Assumes all locks are held around this call.
+  def recv_remote_reads table, read_results
+    if task.remote_read_tables.include? table
+      task.remote_read_tables.delete table
+      task.read_results.concat read_results
+    # else this is a redundant message for this table, so ignore it
+    end
+
+    return false unless task.remote_read_tables.empty?
+
+    store.execute *task.txn.all_write_ops
+    result = task.read_results
+    ready!
+    result
+  end
+end
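
To make the executor's three outcomes concrete (passive, all reads local, waiting on peers), here is a small sketch, not part of the gem, that drives `Calvin::Executor` directly with stub collaborators. The stub interfaces are inferred from the calls the executor makes above; every name other than `Calvin::Executor` and its methods is hypothetical, and the gem must be installed for the require to succeed.

    require 'set'
    require 'spinoza/calvin/executor'

    class StubStore
      attr_reader :executed
      def initialize; @executed = [] end
      def execute(*ops) @executed.concat ops end
    end

    # Txn whose reads are all local: execute_transaction finishes at once.
    class LocalTxn
      def active?(store) true end                  # this node has writes to do
      def all_reads_are_local?(store) true end
      def all_write_ops; [:w1] end
      def remote_read_tables(store) Set[] end
    end

    # Txn that must wait for one remote table before applying its writes.
    class WaitingTxn < LocalTxn
      def all_reads_are_local?(store) false end
      def remote_read_tables(store) Set[:zs] end
    end

    class StubReadcaster
      def execute_local_reads(txn) [:local_result] end
      def serve_reads(txn, results) end            # would broadcast to peers
    end

    ex = Calvin::Executor.new(store: StubStore.new,
                              readcaster: StubReadcaster.new)

    p ex.execute_transaction(LocalTxn.new)    # => [:local_result]; writes applied
    p ex.execute_transaction(WaitingTxn.new)  # => false; a Task is now pending
    p ex.ready?                               # => false
    p ex.recv_remote_reads(:zs, [:remote_result])
                                              # => [:local_result, :remote_result]
    p ex.ready?                               # => true again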
data/lib/spinoza/calvin/node.rb
ADDED
@@ -0,0 +1,44 @@
+require 'spinoza/system/node'
+require 'spinoza/calvin/sequencer'
+require 'spinoza/calvin/scheduler'
+
+class Calvin::Node < Spinoza::Node
+  attr_reader :sequencer, :scheduler
+  attr_reader :log, :meta_log
+
+  def initialize *tables, log: nil, meta_log: nil,
+      sequencer: nil, scheduler: nil, **rest
+    super *tables, **rest
+
+    @log = log
+    @meta_log = meta_log
+    @sequencer = sequencer || Calvin::Sequencer.new(node: self)
+    @scheduler = scheduler || Calvin::Scheduler.new(node: self)
+
+    on_transaction_finish &method(:default_output)
+  end
+
+  def default_output transaction, result
+    r = result.map {|rr| [rr.op.table, rr.val].join(":")}.join(", ")
+    puts "%07.6f [RESULT] #{transaction} => #{r}" % timeline.now
+  end
+
+  def recv msg: nil
+    scheduler.recv_peer_results **msg
+  end
+
+  def read_batch batch_id
+    log.read batch_id, node: self
+  end
+
+  def on_transaction_finish &b
+    @finished_transaction_handler = b
+  end
+
+  # Override this to put the result somewhere.
+  def finished_transaction transaction, result
+    if @finished_transaction_handler
+      @finished_transaction_handler[transaction, result]
+    end
+  end
+end
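
The `on_transaction_finish` hook above stores a bare block and later invokes it with `@finished_transaction_handler[transaction, result]`, using Ruby's `[]` alias for `Proc#call`. A standalone toy illustrating that idiom (a stand-in, not the gem's class):

    class ToyNode
      def on_transaction_finish(&b) @handler = b end

      def finished_transaction(txn, result)
        @handler[txn, result] if @handler   # proc[...] is an alias for proc.call(...)
      end
    end

    n = ToyNode.new
    n.on_transaction_finish {|txn, result| puts "#{txn} => #{result.inspect}"}
    n.finished_transaction "T1", [:read_a]  # prints: T1 => [:read_a]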
data/lib/spinoza/calvin/readcaster.rb
ADDED
@@ -0,0 +1,50 @@
+require 'spinoza/common'
+require 'set'
+
+class Calvin::Readcaster
+  attr_reader :node
+
+  def initialize node: nil
+    @node = node
+    @links = nil
+    @tables = node.tables
+  end
+
+  def inspect
+    "<#{self.class} on #{node.inspect}>"
+  end
+
+  # Pre-computed map, by table, of which nodes might need data from this node:
+  # {table => Set[link, ...]} In other words, excludes `table,link` pairs for
+  # which link.dst already has `table`.
+  def links
+    unless @links
+      @links = Hash.new {|h,k| h[k] = Set[]}
+      node.links.each do |there, link|
+        (@tables - there.tables).each do |table|
+          @links[table] << link
+        end
+      end
+    end
+    @links
+  end
+
+  def execute_local_reads txn
+    local_reads = txn.all_read_ops.select {|r| @tables.include? r.table}
+    node.store.execute *local_reads
+  end
+
+  def serve_reads txn, local_read_results
+    local_read_results.group_by {|r| r.op.table}.each do |table, results|
+      links[table].each do |link|
+        if txn.active?(link.dst)
+          send_read link, transaction: txn, table: table, read_results: results
+        end
+      end
+    end
+  end
+
+  def send_read link, **opts
+    link.send_message **opts
+  end
+end
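
The `links` map above inverts node-to-table placement: a read result for a table only needs to be broadcast to peers that do not hold that table locally. A self-contained sketch of the same computation with placeholder values (plain symbols, not the gem's Node/Link classes):

    require 'set'

    my_tables   = Set[:as, :bs]
    peer_tables = {link_to_n2: Set[:as], link_to_n3: Set[:bs]}

    # For each table this node holds, collect links to peers lacking it.
    links = Hash.new {|h, k| h[k] = Set[]}
    peer_tables.each do |link, tables|
      (my_tables - tables).each {|table| links[table] << link}
    end

    p links  # => {:bs=>#<Set: {:link_to_n2}>, :as=>#<Set: {:link_to_n3}>}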
data/lib/spinoza/calvin/scheduler.rb
ADDED
@@ -0,0 +1,134 @@
+require 'spinoza/common'
+require 'spinoza/calvin/executor'
+require 'spinoza/calvin/readcaster'
+
+class Calvin::Scheduler
+  attr_reader :node
+
+  attr_reader :executors
+  attr_reader :idle_executors
+
+  # Maps { locally executing transaction => Executor }
+  attr_reader :ex_for_txn
+
+  # Transactions to be executed, in order.
+  attr_reader :work_queue
+
+  def initialize node: raise, n_threads: 4
+    @node = node
+
+    @executors = n_threads.times.map {
+      Calvin::Executor.new(
+        store: node.store,
+        readcaster: Calvin::Readcaster.new(node: node))}
+
+    @idle_executors = @executors.dup
+    @ex_for_txn = {}
+    @work_queue = []
+
+    node.meta_log.on_entry_available self, :handle_meta_log_entry
+  end
+
+  def inspect
+    "<#{self.class} on #{node.inspect}>"
+  end
+
+  def handle_meta_log_entry id: raise, node: raise, value: raise
+    batch_id = value
+    batch = node.read_batch(batch_id)
+    if batch
+      work_queue.concat batch
+      handle_next_transactions
+    else
+      # Log entry did not yet propagate to this node, even though MetaLog entry
+      # did propagate. Won't happen with default latency settings.
+      raise "TODO" ##
+    end
+  end
+
+  # Handle messages from peers. The only messages are the unidirectional
+  # broadcasts of read results.
+  def recv_peer_results transaction: raise, table: raise, read_results: raise
+    ex = ex_for_txn[transaction]
+    if ex
+      result = ex.recv_remote_reads table, read_results
+      if result
+        finish_transaction transaction, result
+        handle_next_transactions
+      end
+    else
+      ## TODO what if transaction hasn't started yet? Buffer? This won't
+      ## happen with our simplistic latency assumptions.
+      # The transaction has already finished locally, but another
+      # node is still sending out read results.
+    end
+  end
+
+  def handle_next_transactions
+    until work_queue.empty? or idle_executors.empty?
+      success = handle_next_transaction
+      break unless success
+    end
+  end
+
+  def handle_next_transaction
+    ex = idle_executors.last
+    txn = work_queue.first
+    raise if ex_for_txn[txn]
+
+    lock_succeeded = try_lock(txn)
+
+    if lock_succeeded
+      txn = work_queue.shift
+      result = ex.execute_transaction(txn)
+      if result
+        finish_transaction txn, result
+      else
+        idle_executors.pop
+        ex_for_txn[txn] = ex
+      end
+
+    else
+      node.lock_manager.unlock_all txn
+      # nothing to do until some executor finishes its current transaction
+      ## TODO optimization: attempt to reorder another txn to the head
+      ## of the work_queue where lock sets are disjoint.
+    end
+
+    lock_succeeded
+  end
+
+  def try_lock txn
+    lm = node.lock_manager
+    rset = txn.read_set
+    wset = txn.write_set
+
+    # get write locks first, so r/w on same key doesn't fail
+    wset.each do |table, keys|
+      keys.each do |key|
+        next if key == Spinoza::Transaction::INSERT_KEY
+        lm.lock_write [table, key], txn
+      end
+    end
+
+    rset.each do |table, keys|
+      keys.each do |key|
+        lm.lock_read [table, key], txn
+      end
+    end
+
+    true
+
+  rescue Spinoza::LockManager::ConcurrencyError
+    false
+  end
+
+  def finish_transaction transaction, result
+    ex = ex_for_txn.delete(transaction)
+    if ex
+      idle_executors.push ex
+    end
+    node.lock_manager.unlock_all transaction
+    node.finished_transaction transaction, result
+  end
+end
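
`try_lock` above is all-or-nothing: write locks are acquired before read locks (so a transaction that reads and writes the same key does not conflict with itself), and any `ConcurrencyError` reports failure so the transaction stays at the head of the queue after the scheduler releases whatever was acquired. A toy sketch of that protocol; `ToyLockManager` is hypothetical and, unlike `Spinoza::LockManager`, implements no shared read locks:

    class ToyLockManager
      ConcurrencyError = Class.new(StandardError)

      def initialize; @owner = {} end

      def lock_write key, txn
        raise ConcurrencyError if @owner.key?(key) && @owner[key] != txn
        @owner[key] = txn
      end
      alias lock_read lock_write   # toy simplification: all locks exclusive

      def unlock_all txn
        @owner.delete_if {|k, t| t == txn}
      end
    end

    def try_lock lm, txn, wset, rset
      wset.each {|key| lm.lock_write key, txn}   # write locks first
      rset.each {|key| lm.lock_read key, txn}
      true
    rescue ToyLockManager::ConcurrencyError
      lm.unlock_all txn   # release partial locks; the real scheduler does
      false               # this in handle_next_transaction
    end

    lm = ToyLockManager.new
    p try_lock(lm, :t1, [[:as, 1]], [[:bs, 2]])  # => true
    p try_lock(lm, :t2, [[:as, 1]], [])          # => false; :t1 holds the lock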
data/lib/spinoza/calvin/sequencer.rb
ADDED
@@ -0,0 +1,74 @@
+require 'spinoza/system/model'
+
+# Accepts transaction requests from clients. The requests accepted in an epoch
+# are grouped as a batch, given a sequential id, and replicated to the
+# transaction schedulers on each node.
+class Calvin::Sequencer < Spinoza::Model
+  attr_reader :node
+
+  # ID used to construct UUID for batch.
+  attr_reader :id
+
+  # Length of epoch in seconds.
+  attr_reader :dt_epoch
+
+  @seq_id = 0
+  class << self
+    def next_id
+      @seq_id += 1
+    end
+  end
+
+  def initialize node: raise, dt_epoch: 0.010
+    super timeline: node.timeline
+
+    @node = node
+    @dt_epoch = dt_epoch
+    @batch = []
+    @epoch = 0
+    @id = self.class.next_id
+
+    step_epoch
+  end
+
+  def inspect
+    "<#{self.class} on #{node.inspect}>"
+  end
+
+  def step_epoch
+    unless @batch.empty?
+      batch_id = [@id, @epoch] # globally unique, but not ordered
+      log.write batch_id, @batch, node: node
+
+      log.when_durable batch_id,
+        actor: self,
+        action: :append_batch_to_meta_log,
+        batch_id: batch_id
+
+      @batch = []
+    end
+    @epoch += 1
+
+    timeline.schedule Spinoza::Event[
+      time: time_now + dt_epoch,
+      actor: self,
+      action: :step_epoch
+    ]
+  end
+
+  def append_batch_to_meta_log batch_id: batch_id
+    meta_log.append batch_id, node: node
+  end
+
+  def log
+    node.log
+  end
+
+  def meta_log
+    node.meta_log
+  end
+
+  def accept_transaction txn
+    @batch << txn
+  end
+end
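
A toy sketch of the epoch batching above, without the gem's Timeline or Log: transactions accepted during an epoch are flushed as one batch tagged `[sequencer_id, epoch]`, which is globally unique but not globally ordered across sequencers (the MetaLog provides the global order). `ToySequencer` is a stand-in, not the gem's class.

    class ToySequencer
      attr_reader :batches

      def initialize id
        @id, @epoch, @batch, @batches = id, 0, [], []
      end

      def accept_transaction(txn) @batch << txn end

      # Called at each epoch boundary; flushes the batch, if any.
      def step_epoch
        unless @batch.empty?
          @batches << [[@id, @epoch], @batch]   # [batch_id, transactions]
          @batch = []
        end
        @epoch += 1
      end
    end

    seq = ToySequencer.new(1)
    seq.accept_transaction :txn_a
    seq.accept_transaction :txn_b
    seq.step_epoch
    seq.accept_transaction :txn_c
    seq.step_epoch
    p seq.batches  # => [[[1, 0], [:txn_a, :txn_b]], [[1, 1], [:txn_c]]]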