simple_map_reduce 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +141 -0
- data/.rspec +2 -0
- data/.rubocop.yml +69 -0
- data/.ruby-version +1 -0
- data/.travis.yml +8 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Dockerfile +9 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +107 -0
- data/Rakefile +8 -0
- data/bin/console +15 -0
- data/bin/job_tracker +18 -0
- data/bin/job_worker1 +22 -0
- data/bin/job_worker2 +22 -0
- data/bin/job_worker3 +22 -0
- data/bin/register_word_count_job +80 -0
- data/bin/setup +8 -0
- data/docker-compose.yml +98 -0
- data/exe/simple_map_reduce +183 -0
- data/lib/simple_map_reduce/driver/config.rb +8 -0
- data/lib/simple_map_reduce/driver/job.rb +8 -0
- data/lib/simple_map_reduce/s3_client.rb +15 -0
- data/lib/simple_map_reduce/server/confg.rb +42 -0
- data/lib/simple_map_reduce/server/job.rb +129 -0
- data/lib/simple_map_reduce/server/job_tracker.rb +304 -0
- data/lib/simple_map_reduce/server/job_worker.rb +96 -0
- data/lib/simple_map_reduce/server/task.rb +92 -0
- data/lib/simple_map_reduce/server/worker.rb +76 -0
- data/lib/simple_map_reduce/version.rb +5 -0
- data/lib/simple_map_reduce/worker/register_map_task_worker.rb +51 -0
- data/lib/simple_map_reduce/worker/run_map_task_worker.rb +161 -0
- data/lib/simple_map_reduce/worker/run_reduce_task_worker.rb +97 -0
- data/lib/simple_map_reduce.rb +32 -0
- data/simple_map_reduce.gemspec +41 -0
- metadata +290 -0
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true

require 'json'
require 'sinatra'
require 'sinatra/json'
require 'sinatra/reloader' if development?

module SimpleMapReduce
  module Server
    # HTTP front door of a single job-worker process. The job tracker (and
    # other workers) POST serialized map/reduce tasks here; each request is
    # enqueued onto this process's background job manager and executed
    # asynchronously.
    class JobWorker < Sinatra::Base
      configure do
        # Serialize request handling: handlers touch the shared class-level
        # job_manager / worker_id state.
        use Rack::Lock
      end
      configure :development do
        register Sinatra::Reloader
      end

      # Accepts a msgpack-serialized Job and schedules its map task.
      post '/map_tasks' do
        raw_body = request.body.read
        job = SimpleMapReduce::Server::Job.deserialize(raw_body)
        self.class.job_manager.enqueue_job!(SimpleMapReduce::Worker::RunMapTaskWorker, args: [job, self.class.worker_id])

        json(succeeded: true, job_id: job.id)
      end

      # Accepts a msgpack-serialized Task and schedules its reduce task.
      post '/reduce_tasks' do
        raw_body = request.body.read
        task = SimpleMapReduce::Server::Task.deserialize(raw_body)

        self.class.job_manager.enqueue_job!(SimpleMapReduce::Worker::RunReduceTaskWorker, args: [task, self.class.worker_id])

        json(succeeded: true, job_id: task.job_id, task_id: task.id)
      end

      class << self
        # Id assigned to this worker by the job tracker at registration time.
        attr_accessor :worker_id

        # Boot-time initialization: verifies S3 connectivity, registers this
        # worker with the job tracker, and starts the background job manager.
        # Raises (aborting startup) if any step fails.
        def setup_worker
          check_s3_access
          register_myself_to_job_tracker
          job_manager
          logger.info('All setup process is done successfully. This worker is operation ready.')
          logger.info("This job worker url: #{SimpleMapReduce.job_worker_url}, id: #{worker_id}")
          logger.info("The job tracker url: #{SimpleMapReduce.job_tracker_url}")
        end

        # Cheap connectivity probe; raises when the configured S3
        # endpoint/credentials are unusable.
        def check_s3_access
          s3_client.list_buckets
          logger.info('[OK] s3 connection test')
        end

        # POSTs this worker's url to the tracker's /workers endpoint and
        # stores the id the tracker assigned.
        def register_myself_to_job_tracker
          response = http_client.post do |request|
            request.url('/workers')
            request.body = { url: SimpleMapReduce.job_worker_url }.to_json
          end

          body = JSON.parse(response.body, symbolize_names: true)
          self.worker_id = body[:id]
          # NOTE(fix): previously interpolated job_worker_url here, which made
          # the message read as though the *tracker* lived at the worker's URL.
          logger.info("[OK] registering this worker to the job_tracker #{SimpleMapReduce.job_tracker_url}")
        end

        # Lazily started embedded background worker pool (memoized).
        def job_manager
          @job_manager ||= ::Rasteira::EmbedWorker::Manager.run
        end

        # Memoized JSON Faraday client bound to the job tracker; the
        # :raise_error middleware turns 4xx/5xx responses into exceptions.
        def http_client
          @http_client ||= ::Faraday.new(
            url: SimpleMapReduce.job_tracker_url,
            headers: {
              'Accept' => 'application/json',
              'Content-Type' => 'application/json'
            }
          ) do |faraday|
            faraday.response :raise_error
            faraday.adapter Faraday.default_adapter
          end
        end

        def s3_client
          SimpleMapReduce::S3Client.instance.client
        end

        def logger
          SimpleMapReduce.logger
        end

        # @override Sinatra::Base.quit! — drain the background workers before
        # stopping the HTTP server.
        def quit!
          job_manager.shutdown_workers!
          super
        end
      end
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true

require 'msgpack'
require 'securerandom'
require 'forwardable'

module SimpleMapReduce
  module Server
    # One unit of work: which S3 object to read, where to write output, and
    # the user-supplied script/class that processes it.
    # Lifecycle: ready -> in_process -> succeeded | failed.
    class Task
      extend Forwardable
      include AASM

      attr_reader :job_id,
                  :task_class_name, :task_script,
                  :task_input_bucket_name, :task_input_file_path,
                  :task_output_bucket_name, :task_output_directory_path,
                  :worker

      # Expose aasm's current_state as #current_state, aliased to #state.
      delegate current_state: :aasm
      alias state current_state

      aasm do
        state :ready, initial: true
        state :in_process
        state :succeeded
        state :failed

        event :start do
          # NOTE(fix): was `transitions ready: :in_process`. AASM's
          # `transitions` expects from:/to: options, so the old form defined
          # no valid source/target for start!.
          transitions from: :ready, to: :in_process
        end

        event :succeed do
          transitions from: :in_process, to: :succeeded
        end

        event :fail do
          transitions from: :in_process, to: :failed
        end
      end

      # @param id [String, nil] task id; lazily generated when nil
      # @param job_id [String] owning job's id
      # @param task_class_name [String] class name defined by task_script
      # @param task_script [String] user-supplied Ruby source for the task
      # @param task_input_bucket_name [String] S3 bucket holding the input
      # @param task_input_file_path [String] S3 key of the input object
      # @param task_output_bucket_name [String] S3 bucket for the output
      # @param task_output_directory_path [String] S3 key prefix for output
      # @param worker [Object, nil] worker assigned to run this task
      def initialize(id: nil,
                     job_id:,
                     task_class_name:,
                     task_script:,
                     task_input_bucket_name:,
                     task_input_file_path:,
                     task_output_bucket_name:,
                     task_output_directory_path:,
                     worker: nil)
        @id = id
        @job_id = job_id
        @task_class_name = task_class_name
        @task_script = task_script
        @task_input_bucket_name = task_input_bucket_name
        @task_input_file_path = task_input_file_path
        @task_output_bucket_name = task_output_bucket_name
        @task_output_directory_path = task_output_directory_path
        @worker = worker
      end

      # Lazily generated unique identifier.
      def id
        @id ||= SecureRandom.uuid
      end

      # Attributes used for (de)serialization. Deliberately excludes state
      # and worker: a deserialized task starts over as :ready.
      def to_h
        {
          id: id,
          job_id: @job_id,
          task_class_name: @task_class_name,
          task_script: @task_script,
          task_input_bucket_name: @task_input_bucket_name,
          task_input_file_path: @task_input_file_path,
          task_output_bucket_name: @task_output_bucket_name,
          task_output_directory_path: @task_output_directory_path
        }
      end

      # Msgpack byte string consumed by .deserialize.
      def serialize
        to_h.to_msgpack
      end

      # Like #to_h but with the current AASM state, for inspection/reporting.
      def dump
        to_h.merge(state: state)
      end

      class << self
        # Rebuilds a Task from msgpack bytes produced by #serialize.
        def deserialize(data)
          new(Hash[MessagePack.unpack(data).map { |k, v| [k.to_sym, v] }])
        end
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true

require 'securerandom'
require 'forwardable'
require 'uri'
require 'aasm'

module SimpleMapReduce
  module Server
    # A registered job-worker process, identified by a lazily generated UUID
    # and addressed by its HTTP URL. The AASM state machine tracks
    # availability: ready <-> reserved -> working.
    class Worker
      extend Forwardable
      include AASM

      attr_accessor :url

      # Expose aasm's current_state as #current_state, aliased to #state.
      delegate current_state: :aasm
      alias state current_state

      aasm do
        state :ready, initial: true
        state :reserved
        state :working

        # No `from:` restriction — ready may be fired from any state.
        event :ready do
          transitions to: :ready
        end

        event :reserve do
          transitions from: %i(ready working), to: :reserved
        end

        event :work do
          transitions from: :reserved, to: :working
        end
      end

      # @param url [String] the worker's HTTP endpoint
      # @raise [ArgumentError] when the url is blank or not a parseable URI
      def initialize(url:)
        @url = url
        unless valid?
          # NOTE(fix): URI::DEFAULT_PARSER is used below but this file never
          # required 'uri'; the require is now added at the top.
          raise ArgumentError, 'invalid url'
        end
      end

      # Lazily generated unique identifier.
      def id
        @id ||= SecureRandom.uuid
      end

      # Serializable snapshot of this worker.
      def dump
        {
          id: id,
          url: @url,
          state: state
        }
      end

      # update Job
      # @params [Hash] attributes
      # @options attributes [String] url
      # @options attributes [String] event
      def update!(url: nil, event: nil)
        if url
          self.url = url
        end

        if event
          # Fires the named AASM event (e.g. 'reserve', 'work', 'ready').
          public_send(event.to_sym)
        end
      end

      private

      # True when the url is non-empty and matches the generic URI pattern.
      def valid?
        !@url.to_s.empty? && @url =~ URI::DEFAULT_PARSER.make_regexp
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true

module SimpleMapReduce
  module Worker
    # Background job that hands a job's map task to its assigned map worker
    # over HTTP, then advances the worker/job state machines. On any error
    # the map worker is returned to the tracker's pool and the job is marked
    # failed.
    class RegisterMapTaskWorker
      # Content negotiation headers for worker-to-worker msgpack requests.
      # (Constants are unaffected by `private`, so this may live up here.)
      HTTP_MSGPACK_HEADER = {
        'Accept' => 'application/x-msgpack',
        'Content-Type' => 'application/x-msgpack'
      }.freeze

      # @param job [SimpleMapReduce::Server::Job] job whose map task to register
      def perform(job)
        logger.info('register map task worker start!')
        response = http_client(job.map_worker.url).post do |request|
          request.url('/map_tasks')
          request.body = job.serialize
        end
        logger.debug(response.body)

        # The remote worker accepted the task: mark it busy, mark job started.
        job.map_worker.work!
        job.start!
      rescue => e
        logger.error(e.inspect)
        logger.error(e.backtrace.take(50))
        # Return the worker to the pool so another job can use it.
        SimpleMapReduce::Server::JobTracker.store_worker(job.map_worker)
        job.failed!
      end

      private

      # Fresh Faraday client for the given worker url with conservative
      # timeouts; :raise_error converts 4xx/5xx into exceptions.
      def http_client(url)
        ::Faraday.new(
          url: url,
          headers: HTTP_MSGPACK_HEADER,
          request: { open_timeout: 10, timeout: 15 }
        ) do |faraday|
          faraday.response :logger
          faraday.response :raise_error
          faraday.adapter Faraday.default_adapter
        end
      end

      def logger
        SimpleMapReduce.logger
      end
    end
  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
# frozen_string_literal: true

module SimpleMapReduce
  module Worker
    # Background job that executes a job's map task on this worker:
    # 1. eval the user's map script inside a throwaway wrapper class,
    # 2. stream the S3 input through `map`, collecting output locally,
    # 3. reserve reduce workers from the tracker,
    # 4. shuffle (partition) the map output and POST one reduce task per
    #    reserved worker,
    # 5. release this worker back to :ready if it is not also reducing.
    class RunMapTaskWorker
      # @param job [SimpleMapReduce::Server::Job] job to run the map task for
      # @param map_worker_id [String] tracker-assigned id of this worker
      def perform(job, map_worker_id)
        # Namespace the user script under a job-unique wrapper class so
        # repeated/concurrent jobs cannot clobber each other's constants.
        task_wrapper_class_name = "TaskWrapper#{job.id.delete('-')}"
        self.class.class_eval("class #{task_wrapper_class_name}; end", 'Task Wrapper Class')
        task_wrapper_class = self.class.const_get(task_wrapper_class_name)
        task_wrapper_class.class_eval(job.map_script, 'Map task script')
        map_task = task_wrapper_class.const_get(job.map_class_name, false).new
        unless map_task.respond_to?(:map)
          # TODO: notify job_tracker
          logger.error('no map method')
          return
        end
        logger.info('map task start')

        # Pull the job input from S3 into a tempfile.
        local_input_cache = Tempfile.new
        s3_client.get_object(
          response_target: local_input_cache.path,
          bucket: job.job_input_bucket_name,
          key: job.job_input_directory_path
        )
        local_input_cache.rewind

        # Feed the input line by line to the user's map; it writes records
        # to local_output_cache itself.
        local_output_cache = Tempfile.new
        local_input_cache.each_line(chomp: true, rs: "\n") do |line|
          map_task.map(line, local_output_cache)
        end

        local_output_cache.rewind
        logger.debug("output data size: #{local_output_cache.size}")
        logger.debug('---map output digest---')
        local_output_cache.take(5).each do |line|
          logger.debug(line)
        end
        logger.debug('---map output digest---')

        response = http_client(SimpleMapReduce.job_tracker_url).post do |request|
          request.url('/workers/reserve')
          # TODO: providing a way to specify worker_size
          request.body = { worker_size: 2 }.to_json
        end
        logger.debug(response.body)

        # {"succeeded":true,"workers":[{"id":70157882164440,"url":"http://localhost:4569","state":'reserved'}]}
        reserved_workers = JSON.parse(response.body, symbolize_names: true)[:reserved_workers]
        if reserved_workers.empty?
          # keep working with same worker
          # NOTE(review): assumes Job exposes #map_worker_url — confirm
          # against the Job model.
          reserved_workers << { id: map_worker_id, url: job.map_worker_url, state: 'working' }
        end

        shuffle(job, reserved_workers, local_output_cache)

        # If this worker was not reused as a reduce worker, tell the tracker
        # it is available again.
        unless reserved_workers.map { |w| w[:id] }.include?(map_worker_id)
          response = http_client(SimpleMapReduce.job_tracker_url).put do |request|
            request.url("/workers/#{map_worker_id}")
            request.body = { event: 'ready' }.to_json
          end
          logger.debug(response.body)
        end
      rescue => e
        logger.error(e.inspect)
        logger.error(e.backtrace.take(50))
        # TODO: notifying to job_tracker that this task have failed
      ensure
        # Tempfile cleanup plus removal of the throwaway wrapper class.
        local_input_cache&.delete
        local_output_cache&.delete
        reserved_workers&.each do |worker|
          worker[:shuffled_local_output]&.delete
        end
        if self.class.const_defined?(task_wrapper_class_name.to_sym)
          self.class.send(:remove_const, task_wrapper_class_name.to_sym)
        end
        logger.info('map task end')
      end

      private

      def s3_client
        SimpleMapReduce::S3Client.instance.client
      end

      def logger
        SimpleMapReduce.logger
      end

      # NOTE(fix): renamed from HTTP_JSON_HEADER — the values are msgpack
      # content types (and this matches RegisterMapTaskWorker's naming).
      HTTP_MSGPACK_HEADER = {
        'Accept' => 'application/x-msgpack',
        'Content-Type' => 'application/x-msgpack'
      }.freeze

      # Fresh Faraday client for the given url; :raise_error converts
      # 4xx/5xx responses into exceptions.
      def http_client(url)
        ::Faraday.new(
          url: url,
          headers: HTTP_MSGPACK_HEADER
        ) do |faraday|
          faraday.response :logger
          faraday.response :raise_error
          faraday.adapter Faraday.default_adapter
        end
      end

      # Partitions the map output across `workers` (by key hash, stable only
      # within this process), uploads each partition to the intermediate S3
      # bucket, and POSTs a reduce task to each worker.
      def shuffle(job, workers, local_output_cache)
        workers_count = workers.count
        raise 'No workers' unless workers_count > 0

        workers.each do |worker|
          worker[:shuffled_local_output] = Tempfile.new
        end

        # Each map-output line is a JSON record with a :key used to pick the
        # destination partition.
        local_output_cache.each_line(rs: "\n") do |raw_line|
          output = JSON.parse(raw_line, symbolize_names: true)
          partition_id = output[:key].hash % workers_count
          workers[partition_id][:shuffled_local_output].puts(output.to_json)
        end

        task_script = job.reduce_script
        task_class_name = job.reduce_class_name
        task_input_bucket_name = SimpleMapReduce.s3_intermediate_bucket_name
        task_output_bucket_name = job.job_output_bucket_name
        task_output_directory_path = job.job_output_directory_path
        task_input_file_path_prefix = "#{job.id}/map_output_#{Time.now.to_i}/"

        workers.each_with_index do |worker, partition_id|
          reduce_task = ::SimpleMapReduce::Server::Task.new(
            job_id: job.id,
            task_class_name: task_class_name,
            task_script: task_script,
            task_input_bucket_name: task_input_bucket_name,
            task_input_file_path: "#{task_input_file_path_prefix}#{partition_id}_map_output.txt",
            task_output_bucket_name: task_output_bucket_name,
            task_output_directory_path: task_output_directory_path
          )

          # Upload this partition to the intermediate bucket…
          local_output_cache = worker[:shuffled_local_output]
          local_output_cache.rewind
          s3_client.put_object(
            body: local_output_cache.read,
            bucket: reduce_task.task_input_bucket_name,
            key: reduce_task.task_input_file_path
          )

          # …and hand the reduce task to the worker.
          response = http_client(worker[:url]).post do |request|
            request.url('/reduce_tasks')
            request.body = reduce_task.serialize
          end
          logger.debug(response.body)

          # Workers already marked 'working' (this worker, reused) need no
          # state transition at the tracker.
          next if worker[:state] == 'working'
          response = http_client(SimpleMapReduce.job_tracker_url).put do |request|
            request.url("/workers/#{worker[:id]}")
            request.body = { event: 'work' }.to_json
          end
          logger.debug(response.body)
        end
      end
    end
  end
end
|