simple_map_reduce 0.1.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/.gitignore +141 -0
- data/.rspec +2 -0
- data/.rubocop.yml +69 -0
- data/.ruby-version +1 -0
- data/.travis.yml +8 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Dockerfile +9 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +107 -0
- data/Rakefile +8 -0
- data/bin/console +15 -0
- data/bin/job_tracker +18 -0
- data/bin/job_worker1 +22 -0
- data/bin/job_worker2 +22 -0
- data/bin/job_worker3 +22 -0
- data/bin/register_word_count_job +80 -0
- data/bin/setup +8 -0
- data/docker-compose.yml +98 -0
- data/exe/simple_map_reduce +183 -0
- data/lib/simple_map_reduce/driver/config.rb +8 -0
- data/lib/simple_map_reduce/driver/job.rb +8 -0
- data/lib/simple_map_reduce/s3_client.rb +15 -0
- data/lib/simple_map_reduce/server/confg.rb +42 -0
- data/lib/simple_map_reduce/server/job.rb +129 -0
- data/lib/simple_map_reduce/server/job_tracker.rb +304 -0
- data/lib/simple_map_reduce/server/job_worker.rb +96 -0
- data/lib/simple_map_reduce/server/task.rb +92 -0
- data/lib/simple_map_reduce/server/worker.rb +76 -0
- data/lib/simple_map_reduce/version.rb +5 -0
- data/lib/simple_map_reduce/worker/register_map_task_worker.rb +51 -0
- data/lib/simple_map_reduce/worker/run_map_task_worker.rb +161 -0
- data/lib/simple_map_reduce/worker/run_reduce_task_worker.rb +97 -0
- data/lib/simple_map_reduce.rb +32 -0
- data/simple_map_reduce.gemspec +41 -0
- metadata +290 -0

data/lib/simple_map_reduce/s3_client.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+require 'aws-sdk'
+require 'singleton'
+
+module SimpleMapReduce
+  class S3Client
+    include ::Singleton
+    attr_reader :client
+
+    def initialize
+      @client = ::Aws::S3::Client.new(SimpleMapReduce.s3_config)
+    end
+  end
+end
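
S3Client is a process-wide singleton around a single Aws::S3::Client built from SimpleMapReduce.s3_config. A minimal usage sketch (not part of the gem), assuming the module-level s3_config has already been populated elsewhere (that wiring lives in data/lib/simple_map_reduce.rb, outside this excerpt):

# Sketch: reusing the shared client to check connectivity.
require 'simple_map_reduce'

s3 = SimpleMapReduce::S3Client.instance.client # the underlying Aws::S3::Client
s3.list_buckets.buckets.map(&:name)            # e.g. ["input", "intermediate", "output"]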

data/lib/simple_map_reduce/server/confg.rb
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+module SimpleMapReduce
+  module Server
+    class Config
+      attr_reader :s3_config, :s3_intermediate_bucket_name, :s3_input_bucket_name, :s3_output_bucket_name,
+                  :server_port, :logger, :job_tracker_url, :job_worker_url
+
+      DEFAULT_S3_CONFIG = {
+        access_key_id: 'MINIO_ACCESS_KEY',
+        secret_access_key: 'MINIO_SECRET_KEY',
+        endpoint: 'http://127.0.0.1:9000',
+        region: 'us-east-1',
+        force_path_style: true
+      }.freeze
+      DEFAULT_S3_INPUT_BUCKET_NAME = 'input'
+      DEFAULT_S3_INTERMEDIATE_BUCKET_NAME = 'intermediate'
+      DEFAULT_S3_OUTPUT_BUCKET_NAME = 'output'
+      DEFAULT_SERVER_PORT = 4567
+
+      def initialize(options)
+        setup_s3_config(options)
+
+        @s3_input_bucket_name = options[:s3_input_bucket_name] || DEFAULT_S3_INPUT_BUCKET_NAME
+        @s3_intermediate_bucket_name = options[:s3_intermediate_bucket_name] || DEFAULT_S3_INTERMEDIATE_BUCKET_NAME
+        @s3_output_bucket_name = options[:s3_output_bucket_name] || DEFAULT_S3_OUTPUT_BUCKET_NAME
+
+        @server_port = options[:server_port] || 4567
+        @job_tracker_url = options[:job_tracker_url]
+        @job_worker_url = options[:job_worker_url]
+        @logger = options[:logger] || Logger.new(STDOUT)
+      end
+
+      private
+
+      def setup_s3_config(options)
+        s3_config = Hash[options[:s3_config].to_a.map { |v| [v[0].to_sym, v[1]] }] # support ruby <= 2.4
+        @s3_config = s3_config.empty? ? DEFAULT_S3_CONFIG : s3_config
+      end
+    end
+  end
+end
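
Config does nothing more than map an options hash onto readers, falling back to the MinIO-flavoured defaults above when a key is absent. A hand-constructed sketch (in the gem these options presumably come from the exe/simple_map_reduce CLI rather than being written out like this):

# Sketch: every key mirrors what Config#initialize reads; omitted keys fall
# back to the DEFAULT_* constants (e.g. the bucket names below).
config = SimpleMapReduce::Server::Config.new(
  s3_config: {
    access_key_id: 'MINIO_ACCESS_KEY',
    secret_access_key: 'MINIO_SECRET_KEY',
    endpoint: 'http://127.0.0.1:9000',
    region: 'us-east-1',
    force_path_style: true
  },
  server_port: 4567,
  job_tracker_url: 'http://localhost:4567'
)
config.s3_input_bucket_name # => "input" (DEFAULT_S3_INPUT_BUCKET_NAME)
config.logger               # => Logger writing to STDOUT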

data/lib/simple_map_reduce/server/job.rb
@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+
+require 'msgpack'
+require 'securerandom'
+require 'forwardable'
+require 'aasm'
+
+module SimpleMapReduce
+  module Server
+    class Job
+      extend Forwardable
+      include AASM
+      attr_reader :map_script, :map_class_name, :reduce_script, :reduce_class_name,
+                  :job_input_bucket_name, :job_input_directory_path,
+                  :job_output_bucket_name, :job_output_directory_path,
+                  :map_worker
+
+      delegate current_state: :aasm
+      alias state current_state
+
+      aasm do
+        state :ready, initial: true
+        state :in_process
+        state :succeeded
+        state :failed
+
+        event :start do
+          transitions from: :ready, to: :in_process
+        end
+
+        event :succeeded do
+          transitions from: :in_process, to: :succeeded
+        end
+
+        event :failed do
+          transitions from: %i(in_process ready), to: :failed
+        end
+      end
+
+      def initialize(id: nil,
+                     map_script:,
+                     map_class_name:,
+                     reduce_script:,
+                     reduce_class_name:,
+                     job_input_bucket_name:,
+                     job_input_directory_path:,
+                     job_output_bucket_name:,
+                     job_output_directory_path:,
+                     map_worker_url: nil,
+                     map_worker: nil)
+
+        @id = id
+        @map_script = map_script&.strip
+        @map_class_name = map_class_name&.strip
+        @reduce_script = reduce_script&.strip
+        @reduce_class_name = reduce_class_name&.strip
+        @job_input_bucket_name = job_input_bucket_name&.strip
+        @job_input_directory_path = job_input_directory_path&.strip
+        @job_output_bucket_name = job_output_bucket_name&.strip
+        @job_output_directory_path = job_output_directory_path&.strip
+        @map_worker = map_worker
+        if @map_worker.nil? && map_worker_url
+          @map_worker = SimpleMapReduce::Server::Worker.new(url: map_worker_url)
+        end
+
+        unless valid?
+          raise ArgumentError, 'invalid Job parameters are detected'
+        end
+      end
+
+      def id
+        @id ||= SecureRandom.uuid
+      end
+
+      def to_h
+        {
+          id: id,
+          map_script: @map_script,
+          map_class_name: @map_class_name,
+          reduce_script: @reduce_script,
+          reduce_class_name: @reduce_class_name,
+          job_input_bucket_name: @job_input_bucket_name,
+          job_input_directory_path: @job_input_directory_path,
+          job_output_bucket_name: @job_output_bucket_name,
+          job_output_directory_path: @job_output_directory_path,
+          map_worker_url: @map_worker&.url
+        }
+      end
+
+      def serialize
+        to_h.to_msgpack
+      end
+
+      def dump
+        to_h.merge(state: state)
+      end
+
+      def map_worker_url
+        @map_worker&.url
+      end
+
+      def valid?
+        !@map_script.to_s.empty? &&
+          !@map_class_name.to_s.empty? &&
+          !@reduce_script.to_s.empty? &&
+          !@reduce_class_name.to_s.empty? &&
+          !@job_input_bucket_name.to_s.empty? &&
+          !@job_input_directory_path.to_s.empty? &&
+          !@job_output_bucket_name.to_s.empty? &&
+          !@job_output_directory_path.to_s.empty?
+      end
+
+      # update Job
+      # @params [Hash] attributes
+      # @options attributes [String] event
+      def update!(event: nil)
+        if event
+          public_send(event.to_sym)
+        end
+      end
+
+      class << self
+        def deserialize(data)
+          new(Hash[MessagePack.unpack(data).map { |k, v| [k.to_sym, v] }])
+        end
+      end
+    end
+  end
+end
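
Job is a small AASM state machine (ready -> in_process -> succeeded/failed) whose attributes round-trip through MessagePack via serialize/deserialize; note that to_h, and therefore serialize, omits the state, while dump includes it. A lifecycle sketch with placeholder scripts, class names and bucket names:

# Sketch only: the scripts and names below are placeholders, not a working job.
job = SimpleMapReduce::Server::Job.new(
  map_script: 'class MyMapTask; end',
  map_class_name: 'MyMapTask',
  reduce_script: 'class MyReduceTask; end',
  reduce_class_name: 'MyReduceTask',
  job_input_bucket_name: 'input',
  job_input_directory_path: 'my_job',
  job_output_bucket_name: 'output',
  job_output_directory_path: 'my_job'
)

job.state                    # => :ready
job.update!(event: 'start')  # public_send(:start) fires the AASM event: ready -> in_process
job.state                    # => :in_process

payload = job.serialize                                  # MessagePack bytes
copy = SimpleMapReduce::Server::Job.deserialize(payload)
copy.id == job.id            # => true; state is not serialized, so copy starts as :ready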

data/lib/simple_map_reduce/server/job_tracker.rb
@@ -0,0 +1,304 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'sinatra'
+require 'sinatra/json'
+require 'sinatra/reloader' if development?
+
+module SimpleMapReduce
+  module Server
+    class JobTracker < Sinatra::Base
+      configure do
+        use Rack::Lock
+        # TODO: be configurable
+        MAX_WORKER_RESERVABLE_SIZE = 5
+      end
+      configure :development do
+        register Sinatra::Reloader
+      end
+
+      post '/jobs' do
+        params = JSON.parse(request.body.read, symbolize_names: true)
+        available_workers = self.class.fetch_available_workers
+        if available_workers.empty?
+          status 409
+          json(succeeded: false, error_message: 'No worker is available now. Try it again.')
+          return
+        end
+
+        registered_job = nil
+        selected_map_worker = available_workers.last
+        begin
+          registered_job = self.class.register_job(
+            map_script: params[:map_script],
+            map_class_name: params[:map_class_name],
+            reduce_script: params[:reduce_script],
+            reduce_class_name: params[:reduce_class_name],
+            job_input_bucket_name: params[:job_input_bucket_name],
+            job_input_directory_path: params[:job_input_directory_path],
+            job_output_bucket_name: params[:job_output_bucket_name],
+            job_output_directory_path: params[:job_output_directory_path],
+            map_worker: selected_map_worker
+          )
+        rescue ArgumentError => e
+          status 400
+          json(succeeded: false, error_message: e.message)
+          begin
+            self.class.store_worker(selected_map_worker)
+          rescue => store_worker_error
+            logger.error("failed to store_worker: `#{store_worker_error.inspect}`")
+          end
+
+          return
+        rescue => e
+          status 500
+          json(succeeded: false, error_message: e.message)
+          begin
+            self.class.store_worker(selected_map_worker)
+          rescue => store_worker_error
+            logger.error("failed to store_worker: `#{store_worker_error.inspect}`")
+          end
+
+          return
+        end
+
+        json(succeeded: true, id: registered_job.id)
+      end
+
+      put '/jobs/:id' do
+        job = self.class.jobs&.[](params[:id])
+        if job.nil?
+          status 404
+          json(succeeded: false, error_message: 'job not found')
+          return
+        end
+
+        begin
+          attrs = JSON.parse(request.body.read, symbolize_names: true)
+          job.update!(attrs)
+          json(succeeded: true, job: job.dump)
+        rescue => e
+          status 400
+          json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
+        end
+      end
+
+      get '/jobs/:id' do
+        job = self.class.jobs&.[](params[:id])
+        if job.nil?
+          status 404
+          json(succeeded: false, error_message: 'job not found')
+        else
+          json(job: job.dump)
+        end
+      end
+
+      get '/jobs' do
+        json(self.class.jobs&.values&.map(&:dump) || [])
+      end
+
+      get '/workers/:id' do
+        worker = self.class.workers[params[:id]]
+        if worker.nil?
+          status 404
+          json(succeeded: false, worker: nil)
+        else
+          json(succeeded: true, worker: worker.dump)
+        end
+      end
+
+      post '/workers' do
+        params = JSON.parse(request.body.read, symbolize_names: true)
+
+        worker = nil
+        begin
+          worker = self.class.register_worker(url: params[:url])
+        rescue => e
+          status 400
+          json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
+          return
+        end
+
+        json(succeeded: true, id: worker.id)
+      end
+
+      put '/workers/:id' do
+        worker = self.class.workers[params[:id]]
+        if worker.nil?
+          status 404
+          json(succeeded: false, job: nil)
+          return
+        end
+
+        begin
+          attrs = JSON.parse(request.body.read, symbolize_names: true)
+          worker.update!(attrs)
+          json(succeeded: true, worker: worker.dump)
+        rescue => e
+          status 400
+          json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
+        end
+      end
+
+      get '/workers' do
+        json(self.class.workers&.values&.map(&:dump) || [])
+      end
+
+      post '/workers/reserve' do
+        params = begin
+                   JSON.parse(request.body.read, symbolize_names: true)
+                 rescue
+                   {}
+                 end
+        worker_size = [
+          (params[:worker_size].to_i.zero? ? 1 : params[:worker_size].to_i.abs),
+          MAX_WORKER_RESERVABLE_SIZE
+        ].min
+        begin
+          reserved_workers = self.class.fetch_available_workers(worker_size)
+          json(succeeded: true, reserved_workers: reserved_workers.map(&:dump))
+        rescue => e
+          reserved_workers.each { |reserved_worker| self.class.store_worker(reserved_worker) }
+          status 500
+          json(succeeded: false, error_message: e.message)
+        end
+      end
+
+      class << self
+        attr_accessor :config
+        attr_reader :jobs
+        attr_reader :workers
+
+        def setup_job_tracker
+          check_s3_access
+          create_s3_buckets_if_not_existing
+          job_manager
+          logger.info('All setup process is done successfully. The job tracker is operation ready.')
+          logger.info("This job tracker url: #{SimpleMapReduce.job_tracker_url}")
+        end
+
+        def check_s3_access
+          s3_client.list_buckets
+          logger.info('[OK] s3 connection test')
+        end
+
+        def create_s3_buckets_if_not_existing
+          current_bucket_names = s3_client.list_buckets.buckets.map(&:name)
+          unless current_bucket_names.include?(SimpleMapReduce.s3_input_bucket_name)
+            s3_client.create_bucket(bucket: SimpleMapReduce.s3_input_bucket_name)
+            logger.info("create bucket #{SimpleMapReduce.s3_input_bucket_name}")
+          end
+
+          unless current_bucket_names.include?(SimpleMapReduce.s3_intermediate_bucket_name)
+            s3_client.create_bucket(bucket: SimpleMapReduce.s3_intermediate_bucket_name)
+            logger.info("create bucket #{SimpleMapReduce.s3_intermediate_bucket_name}")
+          end
+
+          unless current_bucket_names.include?(SimpleMapReduce.s3_output_bucket_name)
+            s3_client.create_bucket(bucket: SimpleMapReduce.s3_output_bucket_name)
+            logger.info("create bucket #{SimpleMapReduce.s3_output_bucket_name}")
+          end
+          logger.info('[OK] confirmed that all necessary s3 buckets exist')
+        end
+
+        def register_job(map_script:,
+                         map_class_name:,
+                         reduce_script:,
+                         reduce_class_name:,
+                         job_input_bucket_name:,
+                         job_input_directory_path:,
+                         job_output_bucket_name:,
+                         job_output_directory_path:,
+                         map_worker:)
+
+          job = ::SimpleMapReduce::Server::Job.new(
+            map_script: map_script,
+            map_class_name: map_class_name,
+            reduce_script: reduce_script,
+            reduce_class_name: reduce_class_name,
+            job_input_directory_path: job_input_directory_path,
+            job_input_bucket_name: job_input_bucket_name,
+            job_output_bucket_name: job_output_bucket_name,
+            job_output_directory_path: job_output_directory_path,
+            map_worker: map_worker
+          )
+          if @jobs.nil?
+            @jobs = {}
+          end
+
+          # enqueue job
+          job_manager.enqueue_job!(SimpleMapReduce::Worker::RegisterMapTaskWorker, args: job)
+
+          @jobs[job.id] = job
+          job
+        end
+
+        def register_worker(url:)
+          worker = ::SimpleMapReduce::Server::Worker.new(url: url)
+          if @workers.nil?
+            @workers = {}
+          end
+
+          @workers[worker.id] = worker
+          worker
+        end
+
+        def fetch_available_workers(worker_size = 1)
+          mutex.lock
+
+          if @workers.nil? || @workers.empty?
+            return []
+          end
+
+          ready_workers = @workers.select { |_id, worker| worker.ready? }
+          if ready_workers.count > 0
+            ready_workers = ready_workers.keys.take(worker_size)
+
+            ready_workers.map do |retry_worker_id|
+              @workers[retry_worker_id].reserve
+              @workers[retry_worker_id]
+            end
+          else
+            return []
+          end
+        ensure
+          mutex.unlock
+        end
+
+        def store_worker(worker)
+          mutex.lock
+
+          if @workers.nil?
+            @workers = {}
+          end
+
+          @workers[worker.id].ready!
+        ensure
+          mutex.unlock
+        end
+
+        def job_manager
+          @job_manager ||= ::Rasteira::EmbedWorker::Manager.run
+        end
+
+        def mutex
+          @mutex ||= Mutex.new
+        end
+
+        def s3_client
+          SimpleMapReduce::S3Client.instance.client
+        end
+
+        def logger
+          SimpleMapReduce.logger
+        end
+
+        # @override
+        def quit!
+          job_manager.shutdown_workers!
+          super
+        end
+      end
+    end
+  end
+end
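
JobTracker is the Sinatra front end: POST /workers registers a worker, POST /jobs validates the job, reserves a ready worker and enqueues a RegisterMapTaskWorker, and the GET endpoints expose the dumped job and worker state; worker bookkeeping is guarded by a class-level Mutex plus Rack::Lock. A rough client-side sketch against a tracker assumed to be listening on http://localhost:4567 (the worker URL, scripts and class names below are placeholders):

# Sketch: exercising the tracker's JSON endpoints with plain Net::HTTP.
require 'net/http'
require 'json'

tracker = URI('http://localhost:4567')

# 1. Register a job worker so the tracker has something to reserve.
Net::HTTP.post(URI.join(tracker, '/workers'),
               { url: 'http://localhost:4568' }.to_json,
               'Content-Type' => 'application/json')

# 2. Submit a job; the fields mirror what the POST /jobs handler reads above.
res = Net::HTTP.post(URI.join(tracker, '/jobs'), {
  map_script: 'class MyMapTask; end',
  map_class_name: 'MyMapTask',
  reduce_script: 'class MyReduceTask; end',
  reduce_class_name: 'MyReduceTask',
  job_input_bucket_name: 'input',
  job_input_directory_path: 'my_job',
  job_output_bucket_name: 'output',
  job_output_directory_path: 'my_job'
}.to_json, 'Content-Type' => 'application/json')

JSON.parse(res.body) # => {"succeeded"=>true, "id"=>"<job uuid>"} on success,
                     #    or a 409 body when no worker is in the ready state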