pregel 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS: a plain-HTTP source can be tampered with in transit.
source "https://rubygems.org"

# Dependencies are declared once, in the gemspec.
gemspec
@@ -0,0 +1,26 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pregel (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.2)
10
+ rspec (2.0.1)
11
+ rspec-core (~> 2.0.1)
12
+ rspec-expectations (~> 2.0.1)
13
+ rspec-mocks (~> 2.0.1)
14
+ rspec-core (2.0.1)
15
+ rspec-expectations (2.0.1)
16
+ diff-lcs (>= 1.1.2)
17
+ rspec-mocks (2.0.1)
18
+ rspec-core (~> 2.0.1)
19
+ rspec-expectations (~> 2.0.1)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ pregel!
26
+ rspec (~> 2.0.0)
@@ -0,0 +1,57 @@
1
+ # Pregel
2
+
3
+ Single-node implementation of Google's Pregel framework for large-scale graph processing. It does not provide any of the distributed components, but implements the core functional pieces within a single Ruby VM such that you can develop and run iterative graph algorithms as if you had the full power of Pregel at your disposal.
4
+
5
+ To learn more about Pregel, see the following resources:
6
+
7
+ * [Pregel, a system for large-scale graph processing](http://portal.acm.org/citation.cfm?id=1582716.1582723)
8
+ * [Large-scale graph computing at Google](http://googleresearch.blogspot.com/2009/06/large-scale-graph-computing-at-google.html)
9
+ * [Phoebus](http://github.com/xslogic/phoebus) is a distributed Erlang implementation of Pregel
10
+
11
+ # PageRank example with Pregel
12
+ To run a PageRank computation on an arbitrary graph, you simply specify the vertices & edges, and then define a compute function for each vertex. The coordinator then partitions the work between a specified number of workers (Ruby threads, in our case), and iteratively executes "supersteps" until we converge on a result. At each superstep, the vertex can read and process incoming messages and then send messages to other vertices. Hence, the full PageRank implementation is:
13
+
14
+ class PageRankVertex < Vertex
15
+ def compute
16
+ if superstep >= 1
17
+ sum = messages.inject(0) {|total,msg| total += msg; total }
18
+ @value = (0.15 / num_nodes) + 0.85 * sum
19
+ end
20
+
21
+ if superstep < 30
22
+ deliver_to_all_neighbors(@value / neighbors.size)
23
+ else
24
+ halt
25
+ end
26
+ end
27
+ end
28
+
29
+ The above algorithm will run for 30 iterations, at which point all vertices will mark themselves as inactive and the coordinator will terminate our computation.
30
+
31
+ * [Computing PageRank for a simple circular graph](https://github.com/igrigorik/pregel/blob/master/spec/coordinator_spec.rb#L52)
32
+ * [Computing PageRank for a more complex graph](https://github.com/igrigorik/pregel/blob/master/spec/coordinator_spec.rb#L70)
33
+
34
+ # License
35
+
36
+ (The MIT License)
37
+
38
+ Copyright (c) 2010 Ilya Grigorik
39
+
40
+ Permission is hereby granted, free of charge, to any person obtaining
41
+ a copy of this software and associated documentation files (the
42
+ 'Software'), to deal in the Software without restriction, including
43
+ without limitation the rights to use, copy, modify, merge, publish,
44
+ distribute, sublicense, and/or sell copies of the Software, and to
45
+ permit persons to whom the Software is furnished to do so, subject to
46
+ the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be
49
+ included in all copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
52
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
53
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
54
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
55
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
56
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
57
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,3 @@
1
+ require 'bundler'
2
+
3
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1 @@
1
+ Autotest.add_discovery { 'rspec2' }
@@ -0,0 +1,30 @@
1
+ require 'pregel/vertex'
2
+ require 'pregel/worker'
3
+ require 'pregel/coordinator'
4
+
5
+ require 'singleton'
6
+
7
# Process-wide message exchange (a Singleton).
#
# Messages are queued per mailbox key by #deliver and handed over -- and
# removed -- in one go by #read. Every access to the mailbox table goes
# through a single Mutex, so both methods can be called from several
# threads at once.
class PostOffice
  include Singleton

  def initialize
    @mailboxes = {}
    @mutex = Mutex.new
  end

  # Appends +msg+ to the mailbox +to+, creating the mailbox on first use.
  #
  # @param to  [Object] mailbox key
  # @param msg [Object] arbitrary payload
  # @return [Array] the messages currently queued for +to+, including +msg+
  def deliver(to, msg)
    @mutex.synchronize { (@mailboxes[to] ||= []) << msg }
  end

  # Removes and returns everything queued for +box+.
  #
  # @param box [Object] mailbox key
  # @return [Array] queued messages in delivery order; [] when there are none
  def read(box)
    @mutex.synchronize { @mailboxes.delete(box) || [] }
  end
end
@@ -0,0 +1,33 @@
1
module Pregel
  # Splits a graph into partitions, hands each partition to a Worker and
  # drives the workers through supersteps until none of them reports any
  # active vertex.
  class Coordinator
    attr_reader :workers

    # @param graph   [#empty?, #to_a] the vertices to process
    # @param options [Hash] :partitions => number of workers to create
    #   (default 1; capped at the number of vertices)
    # @raise [RuntimeError]  when +graph+ is empty
    # @raise [ArgumentError] when :partitions is not a positive Integer
    def initialize(graph, options = {})
      raise "empty graph" if graph.empty?

      @workers = []
      @options = { :partitions => 1 }.merge(options)

      partition(graph) do |subgraph|
        @workers << Worker.new(subgraph)
      end
    end

    # Yields the graph as contiguous slices, one per partition.
    #
    # Exactly min(:partitions, graph size) slices are produced and their
    # sizes differ by at most one. (Slicing by a ceil'd slice size can
    # produce fewer slices than requested: 4 vertices in 3 partitions
    # would give only 2.)
    #
    # @yieldparam slice [Array] the vertices of one partition, never empty
    # @raise [ArgumentError] when :partitions is not a positive Integer
    def partition(graph)
      count = @options[:partitions]
      unless count.is_a?(Integer) && count >= 1
        raise ArgumentError, "partitions must be a positive Integer, got #{count.inspect}"
      end

      vertices = graph.to_a
      return if vertices.empty?

      # Never create more partitions than vertices: Worker rejects empty graphs.
      count = [count, vertices.size].min
      base, extra = vertices.size.divmod(count)

      offset = 0
      count.times do |index|
        # The first +extra+ partitions absorb the remainder, one vertex each.
        length = index < extra ? base + 1 : base
        yield vertices[offset, length]
        offset += length
      end
    end

    # Runs supersteps until no worker has an active vertex left.
    #
    # Every worker takes part in every superstep, including workers that
    # currently report zero active vertices: Worker#superstep is what reads
    # the mailboxes and re-activates vertices that received messages, so
    # skipping an idle worker would strand messages addressed to it.
    #
    # NOTE(review): messages sent during the final superstep to vertices
    # that are already halted stay undelivered; confirm whether pending
    # mail should keep the computation alive.
    def run
      loop do
        # Start one superstep on every worker, then wait for all of them.
        @workers.map { |worker| worker.superstep }.each { |thread| thread.join }

        break if @workers.none? { |worker| worker.active > 0 }
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Pregel
  # Gem version string; frozen so the shared constant cannot be mutated.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,41 @@
1
module Pregel
  # Base class for a graph node.
  #
  # A vertex carries an id, a mutable value, its out-edges and the messages
  # handed to it for the current superstep. Subclasses implement #compute,
  # which #step invokes once per superstep.
  class Vertex
    attr_reader :id, :superstep
    attr_accessor :value, :messages

    # @param id       [Object] identifier of this vertex
    # @param value    [Object] initial value
    # @param outedges [Array]  zero or more out-edges (any type)
    def initialize(id, value, *outedges)
      @id = id
      @value = value
      @outedges = outedges
      @messages = []
      @active = true
      @superstep = 0
    end

    # With a block, yields each out-edge; in both cases returns the
    # out-edge array.
    def edges(&block)
      block ? @outedges.each(&block) : @outedges
    end

    # Sends +msg+ to every out-edge of this vertex.
    def deliver_to_all_neighbors(msg)
      edges.each { |edge| deliver(edge, msg) }
    end

    # Queues +msg+ for mailbox +to+ via the shared PostOffice.
    def deliver(to, msg)
      PostOffice.instance.deliver(to, msg)
    end

    # Advances the superstep counter, then runs #compute.
    #
    # @return [Object] whatever #compute returns
    def step
      @superstep += 1
      compute
    end

    # Marks this vertex inactive.
    def halt
      @active = false
    end

    # Marks this vertex active again.
    def active!
      @active = true
    end

    # @return [Boolean] whether the vertex is currently active
    def active?
      @active
    end

    # @return [Array] the out-edges (same array as #edges without a block)
    def neighbors
      @outedges
    end

    # Hook for subclasses; the base implementation does nothing.
    def compute; end
  end
end
@@ -0,0 +1,25 @@
1
module Pregel
  # Owns one partition of the graph and executes supersteps over it.
  class Worker
    # vertices: the partition handed to this worker.
    # active:   how many vertices were still active when the last superstep
    #           finished (initially the partition size).
    attr_reader :vertices, :active

    # @param graph [Array] vertices of this partition
    # @raise [RuntimeError] when +graph+ is empty
    def initialize(graph = [])
      raise 'empty worker graph' if graph.empty?
      @vertices = graph
      @active = graph.size
    end

    # Runs one superstep on a new thread:
    #   1. hands each vertex the messages queued under its id and
    #      re-activates any vertex that received at least one,
    #   2. steps every active vertex,
    #   3. records how many of the stepped vertices are still active.
    #
    # @return [Thread] the thread executing the superstep; join it to wait
    def superstep
      Thread.new do
        @vertices.each do |vertex|
          vertex.messages = PostOffice.instance.read(vertex.id)
          vertex.active! unless vertex.messages.empty?
        end

        # Named +runnable+ so it does not shadow the #active reader.
        runnable = @vertices.select { |vertex| vertex.active? }
        runnable.each { |vertex| vertex.step }

        @active = runnable.count { |vertex| vertex.active? }
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "pregel/version"
4
+
5
# Gem packaging metadata for pregel.
Gem::Specification.new do |s|
  s.name        = "pregel"
  s.version     = Pregel::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Ilya Grigorik"]
  s.email       = ["ilya@igvita.com"]
  s.homepage    = "https://github.com/igrigorik/pregel"
  s.summary     = "Single-node implementation of Google's Pregel framework for large-scale graph processing."
  s.description = s.summary
  # The README ships the MIT license text; declare it so that the license
  # shows up in the packaged metadata.
  s.licenses    = ["MIT"]

  # No rubyforge_project: RubyForge has been shut down and RubyGems
  # deprecates the attribute.

  s.add_development_dependency "rspec", '~> 2.0.0'

  # The file lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,95 @@
1
+ require 'helper'
2
+
3
# Specs for Pregel::Coordinator: partitioning, the run loop and two
# end-to-end PageRank computations.
describe Coordinator do
  it 'should not allow empty graphs' do
    lambda { Coordinator.new([]) }.should raise_error
  end

  # Three AddVertex nodes; AddVertex is defined in the spec helper.
  let(:graph) do
    [
      AddVertex.new(:igvita,    1, :wikipedia),
      AddVertex.new(:wikipedia, 2, :google),
      AddVertex.new(:google,    1, :wikipedia)
    ]
  end

  it 'should partition graphs with variable worker sizes' do
    c = Coordinator.new(graph)
    c.workers.size.should == 1

    c = Coordinator.new(graph, partitions: 2)
    c.workers.size.should == 2
  end

  it 'should schedule workers to run until there are no active vertices' do
    c = Coordinator.new(graph)
    c.run

    c.workers.each do |w|
      w.vertices.each do |v|
        v.value.should == 5
      end
    end
  end

  context 'PageRank' do
    class PageRankVertex < Vertex
      def compute
        if superstep >= 1
          sum = messages.inject(0) { |total, msg| total + msg }
          # 3 is the number of vertices in each graph used below.
          @value = (0.15 / 3) + 0.85 * sum
        end

        if superstep < 30
          deliver_to_all_neighbors(@value / neighbors.size)
        else
          halt
        end
      end
    end

    it 'should calculate PageRank of a circular graph' do
      graph = [
        #                   name     value  out-edges
        PageRankVertex.new(:igvita,     1,  :wikipedia),
        PageRankVertex.new(:wikipedia,  1,  :google),
        PageRankVertex.new(:google,     1,  :igvita)
      ]

      c = Coordinator.new(graph)
      c.run

      c.workers.each do |w|
        w.vertices.each do |v|
          (v.value * 100).to_i.should == 33
        end
      end
    end

    it 'should calculate PageRank of arbitrary graph' do
      graph = [
        # page 1 -> page 1, page 2  (0.18)
        # page 2 -> page 1, page 3  (0.13)
        # page 3 -> page 3          (0.69)

        #                   name     value  out-edges
        PageRankVertex.new(:igvita,     1,  :igvita, :wikipedia),
        PageRankVertex.new(:wikipedia,  1,  :igvita, :google),
        PageRankVertex.new(:google,     1,  :google)
      ]

      c = Coordinator.new(graph)
      c.run

      c.workers.each do |w|
        (w.vertices.find { |v| v.id == :igvita }.value * 100).ceil.to_i.should == 19
        (w.vertices.find { |v| v.id == :wikipedia }.value * 100).ceil.to_i.should == 13
        (w.vertices.find { |v| v.id == :google }.value * 100).to_i.should == 68
      end
    end
  end

  # Pending examples (no body yet).
  it 'should partition nodes by hashing the node id'
  it 'should allow scheduling multiple partitions to a single worker'
end
@@ -0,0 +1,10 @@
1
# Shared spec setup: loads the library and defines a small vertex class.

# Put the project's lib/ directory on the load path relative to this file.
# A bare `require 'lib/pregel'` only resolves when "." is on $LOAD_PATH,
# which Ruby 1.9.2+ no longer provides.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'pregel'

include Pregel

# Vertex that adds 1 to its value on every step and halts once it reaches 5.
class AddVertex < Vertex
  def compute
    @value += 1
    halt if @value >= 5
  end
end
@@ -0,0 +1,56 @@
1
+ require 'helper'
2
+
3
# Specs for the Pregel::Vertex base class.
describe Vertex do
  it 'should create a vertex with id, value, and edges' do
    -> { Vertex.new(:a, 10, :b) }.should_not raise_error
    -> { Vertex.new(:a, 10, :b, :c) }.should_not raise_error
  end

  # Vertex :a with value 10 and out-edges :b and :c.
  let(:v) do
    Vertex.new(:a, 10, :b, :c)
  end

  it 'should report new vertex as active' do
    v.active?.should be_true

    v.halt
    v.active?.should be_false
  end

  it 'should contain a modifiable value' do
    v.value = 20
    v.value.should == 20
  end

  it 'should keep track of the current superstep' do
    v.superstep.should == 0
    v.step
    v.superstep.should == 1
  end

  it 'should allow iterating over out-edges' do
    v.edges.size.should == 2
    v.edges.each { |edge| [:b, :c].should include(edge) }
  end

  it 'should allow arbitrary type for edge' do
    build = lambda do
      Vertex.new(:a, 10, { id: 1, weight: 10 }, { id: 2, weight: 20 })
    end
    build.should_not raise_error
  end

  it 'should allow a user defined compute' do
    # Subclass whose compute overwrites the value.
    class V < Vertex
      def compute
        @value = 20
      end
    end

    v = V.new(:a, 10)
    v.compute
    v.value.should == 20
  end

  it 'should have an inbox for incoming messages' do
    v.messages.size.should == 0
    v.messages = [:a, :b]
    v.messages.size.should == 2
  end
end
@@ -0,0 +1,61 @@
1
+ require 'helper'
2
+
3
# Specs for Pregel::Worker: construction, superstep execution and message
# hand-off from the PostOffice.
describe Worker do
  # Three AddVertex nodes; AddVertex is defined in the spec helper.
  let(:graph) do
    [
      AddVertex.new(:igvita,    1, :wikipedia),
      AddVertex.new(:wikipedia, 2, :google),
      AddVertex.new(:google,    1, :wikipedia)
    ]
  end

  let(:worker) { Worker.new(graph) }

  it 'should accept non-zero number of nodes' do
    -> { Worker.new([]) }.should raise_error
  end

  it 'should partition graphs with variable worker sizes' do
    graph = [
      Vertex.new(:igvita,    1, :wikipedia),
      Vertex.new(:wikipedia, 2, :google),
      Vertex.new(:google,    1, :wikipedia)
    ]

    single = Coordinator.new(graph)
    single.workers.size.should == 1

    split = Coordinator.new(graph, partitions: 2)
    split.workers.size.should == 2
  end

  it 'should execute an async superstep' do
    # TODO: simulate async message delivery to worker by returning
    # a thread per message
    worker.superstep.should be_an_instance_of(Thread)
  end

  it 'should perform single iteration on the graph' do
    # TODO: reuse a threadpool within worker for each partition
    worker.superstep.join
    worker.vertices.first.value.should == 2
  end

  it 'should return the number of active vertices for next superstep' do
    worker.superstep.join
    worker.active.should > 0
  end

  it 'should deliver messages to vertices at beginning of each superstep' do
    PostOffice.instance.deliver(:igvita, 'hello')
    worker.superstep.join

    recipient = worker.vertices.find { |vertex| vertex.id == :igvita }
    recipient.messages.size.should == 1
    recipient.messages.first.should == 'hello'
  end
end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pregel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Ilya Grigorik
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-30 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 2
30
+ - 0
31
+ - 0
32
+ version: 2.0.0
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: Single-node implementation of Google's Pregel framework for large-scale graph processing.
36
+ email:
37
+ - ilya@igvita.com
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - Gemfile
47
+ - Gemfile.lock
48
+ - README.md
49
+ - Rakefile
50
+ - autotest/discover.rb
51
+ - lib/pregel.rb
52
+ - lib/pregel/coordinator.rb
53
+ - lib/pregel/version.rb
54
+ - lib/pregel/vertex.rb
55
+ - lib/pregel/worker.rb
56
+ - pregel.gemspec
57
+ - spec/coordinator_spec.rb
58
+ - spec/helper.rb
59
+ - spec/vertex_spec.rb
60
+ - spec/worker_spec.rb
61
+ has_rdoc: true
62
+ homepage: http://github.com/igrigorik/pregel
63
+ licenses: []
64
+
65
+ post_install_message:
66
+ rdoc_options: []
67
+
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project: pregel
89
+ rubygems_version: 1.3.7
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Single-node implementation of Google's Pregel framework for large-scale graph processing.
93
+ test_files:
94
+ - spec/coordinator_spec.rb
95
+ - spec/helper.rb
96
+ - spec/vertex_spec.rb
97
+ - spec/worker_spec.rb