pregel 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "http://rubygems.org"
2
+
3
+ gemspec
@@ -0,0 +1,26 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pregel (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.2)
10
+ rspec (2.0.1)
11
+ rspec-core (~> 2.0.1)
12
+ rspec-expectations (~> 2.0.1)
13
+ rspec-mocks (~> 2.0.1)
14
+ rspec-core (2.0.1)
15
+ rspec-expectations (2.0.1)
16
+ diff-lcs (>= 1.1.2)
17
+ rspec-mocks (2.0.1)
18
+ rspec-core (~> 2.0.1)
19
+ rspec-expectations (~> 2.0.1)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ pregel!
26
+ rspec (~> 2.0.0)
@@ -0,0 +1,57 @@
1
+ # Pregel
2
+
3
+ Single-node implementation of Google's Pregel framework for large-scale graph processing. It does not provide any of the distributed components, but implements the core functional pieces within a single Ruby VM such that you can develop and run iterative graph algorithms as if you had the full power of Pregel at your disposal.
4
+
5
+ To learn more about Pregel, see the following resources:
6
+
7
+ * [Pregel, a system for large-scale graph processing](http://portal.acm.org/citation.cfm?id=1582716.1582723)
8
+ * [Large-scale graph computing at Google](http://googleresearch.blogspot.com/2009/06/large-scale-graph-computing-at-google.html)
9
+ * [Phoebus](http://github.com/xslogic/phoebus) is a distributed Erlang implementation of Pregel
10
+
11
+ # PageRank example with Pregel
12
+ To run a PageRank computation on an arbitrary graph, you simply specify the vertices & edges, and then define a compute function for each vertex. The coordinator then partitions the work between a specified number of workers (Ruby threads, in our case), and iteratively executes "supersteps" until we converge on a result. At each superstep, the vertex can read and process incoming messages and then send messages to other vertices. Hence, the full PageRank implementation is:
13
+
14
+ class PageRankVertex < Vertex
15
+ def compute
16
+ if superstep >= 1
17
+ sum = messages.inject(0) {|total,msg| total += msg; total }
18
+ @value = (0.15 / num_nodes) + 0.85 * sum
19
+ end
20
+
21
+ if superstep < 30
22
+ deliver_to_all_neighbors(@value / neighbors.size)
23
+ else
24
+ halt
25
+ end
26
+ end
27
+ end
28
+
29
+ The above algorithm will run for 30 iterations, at which point all vertices will mark themselves as inactive and the coordinator will terminate our computation.
30
+
31
+ * [Computing PageRank for a simple circular graph](https://github.com/igrigorik/pregel/blob/master/spec/coordinator_spec.rb#L52)
32
+ * [Computing PageRank for a more complex graph](https://github.com/igrigorik/pregel/blob/master/spec/coordinator_spec.rb#L70)
33
+
34
+ # License
35
+
36
+ (The MIT License)
37
+
38
+ Copyright (c) 2010 Ilya Grigorik
39
+
40
+ Permission is hereby granted, free of charge, to any person obtaining
41
+ a copy of this software and associated documentation files (the
42
+ 'Software'), to deal in the Software without restriction, including
43
+ without limitation the rights to use, copy, modify, merge, publish,
44
+ distribute, sublicense, and/or sell copies of the Software, and to
45
+ permit persons to whom the Software is furnished to do so, subject to
46
+ the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be
49
+ included in all copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
52
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
53
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
54
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
55
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
56
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
57
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,3 @@
1
+ require 'bundler'
2
+
3
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1 @@
1
+ Autotest.add_discovery { 'rspec2' }
@@ -0,0 +1,30 @@
1
+ require 'pregel/vertex'
2
+ require 'pregel/worker'
3
+ require 'pregel/coordinator'
4
+
5
+ require 'singleton'
6
+
7
# Singleton message exchange: holds one mailbox (an Array of messages) per
# recipient key. Every access to the mailbox table goes through a Mutex so
# that concurrent callers do not interleave updates.
class PostOffice
  include Singleton

  def initialize
    @mailboxes = {}
    @mutex = Mutex.new
  end

  # Appends +msg+ to the mailbox stored under +to+, creating the mailbox on
  # first delivery.
  #
  # @param to  [Object] mailbox key
  # @param msg [Object] message to enqueue
  # @return [Array] the mailbox contents after the append
  def deliver(to, msg)
    @mutex.synchronize { (@mailboxes[to] ||= []) << msg }
  end

  # Removes the mailbox stored under +box+ and hands its messages to the
  # caller; a second read without new deliveries yields an empty Array.
  #
  # @param box [Object] mailbox key
  # @return [Array] the pending messages, or [] when there are none
  def read(box)
    @mutex.synchronize { @mailboxes.delete(box) || [] }
  end
end
@@ -0,0 +1,33 @@
1
module Pregel
  # Splits a graph into slices, creates one Worker per slice and drives all
  # workers through supersteps until none of them reports an active vertex.
  class Coordinator
    # @return [Array<Worker>] one worker per graph slice
    attr_reader :workers

    # @param graph   [Array] vertices to process (must not be empty)
    # @param options [Hash] :partitions => requested number of slices (default 1)
    # @raise [RuntimeError] "empty graph" when +graph+ is empty
    def initialize(graph, options = {})
      raise "empty graph" if graph.empty?

      @workers = []
      @options = { :partitions => 1 }.merge(options)

      partition(graph) { |subgraph| @workers << Worker.new(subgraph) }
    end

    # Yields consecutive slices of +graph+. The slice length is
    # ceil(graph.size / partitions), so the number of slices can be smaller
    # than the requested partition count (e.g. 6 vertices over 4 partitions
    # gives 3 slices of 2).
    #
    # @param graph [Array] vertices to split
    # @yieldparam slice [Array] one slice of the graph
    def partition(graph)
      size = (graph.size.to_f / @options[:partitions]).ceil
      graph.each_slice(size) { |slice| yield slice }
    end

    # Runs supersteps until every worker reports zero active vertices.
    #
    # Every worker is dispatched on every superstep, including workers whose
    # vertices have all halted: Worker#superstep is where mailboxes are read
    # and halted vertices are re-activated, so skipping an idle worker would
    # leave messages sent to its vertices undelivered forever.
    #
    # NOTE(review): messages sent during the final superstep to already
    # halted vertices are still not observed here, because the loop exits on
    # the workers' active counts alone.
    #
    # @return [nil]
    def run
      loop do
        # each superstep call hands back the object we wait on before the
        # next round starts
        threads = @workers.map { |w| w.superstep }
        threads.each { |t| t.join }

        break if @workers.none? { |w| w.active > 0 }
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Pregel
  # Gem version string; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,41 @@
1
module Pregel
  # A graph node with an id, a mutable value, a list of out-edges, an inbox
  # of messages, an active flag and a superstep counter. Subclasses override
  # #compute to implement the per-superstep logic.
  class Vertex
    # id: identifier given at construction.
    # superstep: number of times #step has been called (starts at 0).
    attr_reader :id, :superstep
    # value: current vertex value; messages: inbox, initially [].
    attr_accessor :value, :messages

    # @param id       [Object] vertex identifier
    # @param value    [Object] initial value
    # @param outedges [Array]  remaining arguments, kept as the out-edge list
    def initialize(id, value, *outedges)
      @id = id
      @value = value
      @outedges = outedges
      @messages = []
      @active = true
      @superstep = 0
    end

    # Without a block returns the out-edge list; with a block yields each
    # out-edge in order (and still returns the list).
    #
    # @return [Array] the out-edges
    def edges
      return @outedges unless block_given?

      @outedges.each { |e| yield e }
    end

    # @return [Array] the out-edge list (same object #edges returns)
    def neighbors
      @outedges
    end

    # Sends +msg+ once per out-edge, using the edge itself as the recipient.
    def deliver_to_all_neighbors(msg)
      edges.each { |e| deliver(e, msg) }
    end

    # Hands +msg+ to the PostOffice singleton for recipient +to+.
    def deliver(to, msg)
      PostOffice.instance.deliver(to, msg)
    end

    # Advances the superstep counter, then runs #compute.
    #
    # @return whatever #compute returns
    def step
      @superstep += 1
      compute
    end

    # Marks the vertex inactive.
    def halt
      @active = false
    end

    # Marks the vertex active again.
    def active!
      @active = true
    end

    # @return [Boolean] current active flag
    def active?
      @active
    end

    # Hook for subclasses; the base implementation does nothing.
    def compute; end
  end
end
@@ -0,0 +1,25 @@
1
module Pregel
  # Owns a slice of the graph and executes one superstep over it at a time,
  # each superstep running in its own Thread.
  class Worker
    # vertices: the slice given at construction.
    # active: how many vertices were still active after the last superstep
    #         (initially the slice size).
    attr_reader :vertices, :active

    # @param graph [Array] vertices handled by this worker
    # @raise [RuntimeError] 'empty worker graph' when +graph+ is empty
    def initialize(graph = [])
      raise 'empty worker graph' if graph.empty?

      @vertices = graph
      @active = graph.size
    end

    # Starts one superstep in a new Thread and returns that Thread; callers
    # join it to wait for completion.
    #
    # Inside the thread:
    # 1. each vertex receives the messages stored under its id in the
    #    PostOffice, and is re-activated if it got any;
    # 2. every active vertex is stepped;
    # 3. +active+ is set to the number of stepped vertices still active.
    #
    # @return [Thread]
    def superstep
      Thread.new do
        @vertices.each do |v|
          v.messages = PostOffice.instance.read(v.id)
          v.active! unless v.messages.empty?
        end

        runnable = @vertices.select(&:active?)
        runnable.each(&:step)

        @active = runnable.count(&:active?)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "pregel/version"
4
+
5
# Gem packaging metadata for pregel. File lists are taken from the git index,
# so this spec has to be evaluated inside the project's git checkout.
Gem::Specification.new do |s|
  s.name        = "pregel"
  s.version     = Pregel::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Ilya Grigorik"]
  s.email       = ["ilya@igvita.com"]
  s.homepage    = "http://github.com/igrigorik/pregel"
  s.summary     = "Single-node implementation of Google's Pregel framework for large-scale graph processing."
  s.description = s.summary
  # The project README declares the MIT License; record it in the gem metadata.
  s.licenses    = ["MIT"]
  s.rubyforge_project = "pregel"

  s.add_development_dependency "rspec", '~> 2.0.0'

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # executables are listed by basename only
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,95 @@
1
+ require 'helper'
2
+
3
# Specs for Coordinator: rejecting empty graphs, slicing a graph into
# workers, running until all vertices halt, and two PageRank scenarios.
describe Coordinator do
  it 'should not allow empty graphs' do
    lambda { Coordinator.new([]) }.should raise_error
  end

  let(:graph) do
    [
      AddVertex.new(:igvita,    1, :wikipedia),
      AddVertex.new(:wikipedia, 2, :google),
      AddVertex.new(:google,    1, :wikipedia)
    ]
  end

  it 'should partition graphs with variable worker sizes' do
    c = Coordinator.new(graph)
    c.workers.size.should == 1

    c = Coordinator.new(graph, partitions: 2)
    c.workers.size.should == 2
  end

  it 'should schedule workers to run until there are no active vertices' do
    c = Coordinator.new(graph)
    c.run

    # AddVertex increments its value each step and halts at 5
    c.workers.each do |w|
      w.vertices.each do |v|
        v.value.should == 5
      end
    end
  end

  context 'PageRank' do
    # PageRank vertex used by the examples below. The divisor 3 is the node
    # count of both example graphs.
    class PageRankVertex < Vertex
      def compute
        if superstep >= 1
          sum = messages.inject(0) { |total, msg| total + msg }
          @value = (0.15 / 3) + 0.85 * sum
        end

        if superstep < 30
          deliver_to_all_neighbors(@value / neighbors.size)
        else
          halt
        end
      end
    end

    it 'should calculate PageRank of a circular graph' do
      graph = [
        #                  name        value  out-edges
        PageRankVertex.new(:igvita,    1,     :wikipedia),
        PageRankVertex.new(:wikipedia, 1,     :google),
        PageRankVertex.new(:google,    1,     :igvita)
      ]

      c = Coordinator.new(graph)
      c.run

      c.workers.each do |w|
        w.vertices.each do |v|
          (v.value * 100).to_i.should == 33
        end
      end
    end

    it 'should calculate PageRank of arbitrary graph' do
      graph = [
        # page 1 -> page 1, page 2  (0.18)
        # page 2 -> page 1, page 3  (0.13)
        # page 3 -> page 3          (0.69)

        #                  name        value  out-edges
        PageRankVertex.new(:igvita,    1,     :igvita, :wikipedia),
        PageRankVertex.new(:wikipedia, 1,     :igvita, :google),
        PageRankVertex.new(:google,    1,     :google)
      ]

      c = Coordinator.new(graph)
      c.run

      c.workers.each do |w|
        (w.vertices.find { |v| v.id == :igvita }.value * 100).ceil.to_i.should == 19
        (w.vertices.find { |v| v.id == :wikipedia }.value * 100).ceil.to_i.should == 13
        (w.vertices.find { |v| v.id == :google }.value * 100).to_i.should == 68
      end
    end
  end

  # pending examples (no body yet)
  it 'should partition nodes by hashing the node id'
  it 'should allow scheduling multiple partitions to a single worker'
end
@@ -0,0 +1,10 @@
1
+ require 'lib/pregel'
2
+
3
+ include Pregel
4
+
5
# Spec helper vertex: every compute bumps the value by one, and the vertex
# halts once the value has reached 5.
class AddVertex < Vertex
  def compute
    @value += 1

    if @value >= 5
      halt
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'helper'
2
+
3
# Specs for Vertex: construction, active flag, value accessor, superstep
# counter, out-edges, user-defined compute and the message inbox.
describe Vertex do
  it 'should create a vertex with id, value, and edges' do
    lambda do
      Vertex.new(:a, 10, :b)
    end.should_not raise_error

    lambda do
      Vertex.new(:a, 10, :b, :c)
    end.should_not raise_error
  end

  let(:v) do
    Vertex.new(:a, 10, :b, :c)
  end

  it 'should report new vertex as active' do
    v.active?.should be_true

    v.halt
    v.active?.should be_false
  end

  it 'should contain a modifiable value' do
    v.value = 20
    v.value.should == 20
  end

  it 'should keep track of the current superstep' do
    v.superstep.should == 0
    v.step
    v.superstep.should == 1
  end

  it 'should allow iterating over out-edges' do
    v.edges.size.should == 2
    v.edges.each do |edge|
      [:b, :c].should include(edge)
    end
  end

  it 'should allow arbitrary type for edge' do
    lambda {
      Vertex.new(:a, 10, { :id => 1, :weight => 10 }, { :id => 2, :weight => 20 })
    }.should_not raise_error
  end

  it 'should allow a user defined compute' do
    # subclass whose compute overwrites the value
    class V < Vertex
      def compute
        @value = 20
      end
    end

    custom = V.new(:a, 10)
    custom.compute
    custom.value.should == 20
  end

  it 'should have an inbox for incoming messages' do
    v.messages.size.should == 0
    v.messages = [:a, :b]
    v.messages.size.should == 2
  end
end
@@ -0,0 +1,61 @@
1
+ require 'helper'
2
+
3
# Specs for Worker: rejecting empty slices, running a superstep in a thread,
# reporting the active count and handing PostOffice messages to vertices.
describe Worker do
  let(:graph) do
    [
      AddVertex.new(:igvita,    1, :wikipedia),
      AddVertex.new(:wikipedia, 2, :google),
      AddVertex.new(:google,    1, :wikipedia)
    ]
  end

  let(:worker) { Worker.new(graph) }

  it 'should accept non-zero number of nodes' do
    lambda do
      Worker.new([])
    end.should raise_error
  end

  it 'should partition graphs with variable worker sizes' do
    plain_graph = [
      Vertex.new(:igvita,    1, :wikipedia),
      Vertex.new(:wikipedia, 2, :google),
      Vertex.new(:google,    1, :wikipedia)
    ]

    single = Coordinator.new(plain_graph)
    single.workers.size.should == 1

    pair = Coordinator.new(plain_graph, :partitions => 2)
    pair.workers.size.should == 2
  end

  it 'should execute an async superstep' do
    # TODO: simulate async message delivery to worker by returning
    # a thread per message
    worker.superstep.should be_an_instance_of(Thread)
  end

  it 'should perform single iteration on the graph' do
    # TODO: reuse a threadpool within worker for each partition
    worker.superstep.join
    worker.vertices.first.value.should == 2
  end

  it 'should return the number of active vertices for next superstep' do
    worker.superstep.join
    worker.active.should > 0
  end

  it 'should deliver messages to vertices at beginning of each superstep' do
    PostOffice.instance.deliver(:igvita, 'hello')
    worker.superstep.join

    recipient = worker.vertices.find { |vertex| vertex.id == :igvita }
    recipient.messages.size.should == 1
    recipient.messages.first.should == 'hello'
  end
end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pregel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Ilya Grigorik
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-30 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 2
30
+ - 0
31
+ - 0
32
+ version: 2.0.0
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: Single-node implementation of Google's Pregel framework for large-scale graph processing.
36
+ email:
37
+ - ilya@igvita.com
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - Gemfile
47
+ - Gemfile.lock
48
+ - README.md
49
+ - Rakefile
50
+ - autotest/discover.rb
51
+ - lib/pregel.rb
52
+ - lib/pregel/coordinator.rb
53
+ - lib/pregel/version.rb
54
+ - lib/pregel/vertex.rb
55
+ - lib/pregel/worker.rb
56
+ - pregel.gemspec
57
+ - spec/coordinator_spec.rb
58
+ - spec/helper.rb
59
+ - spec/vertex_spec.rb
60
+ - spec/worker_spec.rb
61
+ has_rdoc: true
62
+ homepage: http://github.com/igrigorik/pregel
63
+ licenses: []
64
+
65
+ post_install_message:
66
+ rdoc_options: []
67
+
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project: pregel
89
+ rubygems_version: 1.3.7
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Single-node implementation of Google's Pregel framework for large-scale graph processing.
93
+ test_files:
94
+ - spec/coordinator_spec.rb
95
+ - spec/helper.rb
96
+ - spec/vertex_spec.rb
97
+ - spec/worker_spec.rb