simple_map_reduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
module SimpleMapReduce
  module Driver
    # Placeholder for driver-side configuration.
    # It currently declares no attributes and no behavior.
    class Config
    end
  end
end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
module SimpleMapReduce
  module Driver
    # Placeholder for the driver-side job representation.
    # It currently declares no attributes and no behavior.
    class Job
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aws-sdk'
4
+ require 'singleton'
5
+
6
module SimpleMapReduce
  # Singleton holder for one Aws::S3::Client instance.
  #
  # The client is built on first access to `S3Client.instance`, using the
  # options returned by `SimpleMapReduce.s3_config` at that moment.
  class S3Client
    include ::Singleton

    # Builds the wrapped S3 client from the module-level S3 settings.
    def initialize
      @client = ::Aws::S3::Client.new(SimpleMapReduce.s3_config)
    end

    # @return [Aws::S3::Client] the client created in #initialize
    attr_reader :client
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
require 'logger' # Logger is the fallback for the :logger option below

module SimpleMapReduce
  module Server
    # Settings object for the server processes.
    #
    # Every value comes from the options hash given to #initialize; missing
    # values fall back to the DEFAULT_* constants where one exists.
    class Config
      DEFAULT_S3_CONFIG = {
        access_key_id: 'MINIO_ACCESS_KEY',
        secret_access_key: 'MINIO_SECRET_KEY',
        endpoint: 'http://127.0.0.1:9000',
        region: 'us-east-1',
        force_path_style: true
      }.freeze
      DEFAULT_S3_INPUT_BUCKET_NAME = 'input'
      DEFAULT_S3_INTERMEDIATE_BUCKET_NAME = 'intermediate'
      DEFAULT_S3_OUTPUT_BUCKET_NAME = 'output'
      DEFAULT_SERVER_PORT = 4567

      attr_reader :s3_config, :s3_intermediate_bucket_name, :s3_input_bucket_name, :s3_output_bucket_name,
                  :server_port, :logger, :job_tracker_url, :job_worker_url

      # @param options [Hash, nil] symbol-keyed settings; nil is treated as empty
      # @option options [Hash] :s3_config S3 client options (string or symbol keys)
      # @option options [String] :s3_input_bucket_name
      # @option options [String] :s3_intermediate_bucket_name
      # @option options [String] :s3_output_bucket_name
      # @option options [Integer] :server_port
      # @option options [String] :job_tracker_url no default; stays nil when absent
      # @option options [String] :job_worker_url no default; stays nil when absent
      # @option options [Logger] :logger defaults to a Logger writing to STDOUT
      def initialize(options = {})
        options ||= {}
        setup_s3_config(options)

        @s3_input_bucket_name = options[:s3_input_bucket_name] || DEFAULT_S3_INPUT_BUCKET_NAME
        @s3_intermediate_bucket_name = options[:s3_intermediate_bucket_name] || DEFAULT_S3_INTERMEDIATE_BUCKET_NAME
        @s3_output_bucket_name = options[:s3_output_bucket_name] || DEFAULT_S3_OUTPUT_BUCKET_NAME

        @server_port = options[:server_port] || DEFAULT_SERVER_PORT
        @job_tracker_url = options[:job_tracker_url]
        @job_worker_url = options[:job_worker_url]
        @logger = options[:logger] || Logger.new(STDOUT)
      end

      private

      # Stores options[:s3_config] with its keys converted to symbols, or
      # DEFAULT_S3_CONFIG when the option is missing or empty.
      def setup_s3_config(options)
        # each_with_object instead of Hash#transform_keys to keep supporting ruby <= 2.4
        s3_config = options[:s3_config].to_a.each_with_object({}) do |(key, value), symbolized|
          symbolized[key.to_sym] = value
        end
        @s3_config = s3_config.empty? ? DEFAULT_S3_CONFIG : s3_config
      end
    end
  end
end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'msgpack'
4
+ require 'securerandom'
5
+ require 'forwardable'
6
+ require 'aasm'
7
+
8
module SimpleMapReduce
  module Server
    # A map/reduce job as tracked by the server.
    #
    # It holds the map and reduce scripts with their class names, the input
    # and output S3 locations, and optionally the worker assigned to the map
    # phase. The lifecycle is an AASM state machine:
    # ready -> in_process -> succeeded, with `failed` reachable from
    # ready or in_process.
    class Job
      extend Forwardable
      include AASM
      attr_reader :map_script, :map_class_name, :reduce_script, :reduce_class_name,
                  :job_input_bucket_name, :job_input_directory_path,
                  :job_output_bucket_name, :job_output_directory_path,
                  :map_worker

      delegate current_state: :aasm
      alias state current_state

      aasm do
        state :ready, initial: true
        state :in_process
        state :succeeded
        state :failed

        event :start do
          transitions from: :ready, to: :in_process
        end

        event :succeeded do
          transitions from: :in_process, to: :succeeded
        end

        event :failed do
          transitions from: %i[in_process ready], to: :failed
        end
      end

      # String arguments are stripped of surrounding whitespace before being
      # stored.
      #
      # @param id [String, nil] job id; generated lazily by #id when nil
      # @param map_script [String]
      # @param map_class_name [String]
      # @param reduce_script [String]
      # @param reduce_class_name [String]
      # @param job_input_bucket_name [String]
      # @param job_input_directory_path [String]
      # @param job_output_bucket_name [String]
      # @param job_output_directory_path [String]
      # @param map_worker_url [String, nil] used to build a Worker only when
      #   map_worker is nil
      # @param map_worker [SimpleMapReduce::Server::Worker, nil]
      # @raise [ArgumentError] when any required attribute is nil or empty
      def initialize(id: nil,
                     map_script:,
                     map_class_name:,
                     reduce_script:,
                     reduce_class_name:,
                     job_input_bucket_name:,
                     job_input_directory_path:,
                     job_output_bucket_name:,
                     job_output_directory_path:,
                     map_worker_url: nil,
                     map_worker: nil)

        @id = id
        @map_script = map_script&.strip
        @map_class_name = map_class_name&.strip
        @reduce_script = reduce_script&.strip
        @reduce_class_name = reduce_class_name&.strip
        @job_input_bucket_name = job_input_bucket_name&.strip
        @job_input_directory_path = job_input_directory_path&.strip
        @job_output_bucket_name = job_output_bucket_name&.strip
        @job_output_directory_path = job_output_directory_path&.strip
        @map_worker = map_worker
        if @map_worker.nil? && map_worker_url
          @map_worker = SimpleMapReduce::Server::Worker.new(url: map_worker_url)
        end

        raise ArgumentError, 'invalid Job parameters are detected' unless valid?
      end

      # @return [String] the given id, or a UUID generated on first access
      def id
        @id ||= SecureRandom.uuid
      end

      # @return [Hash] the job attributes; its keys match the keywords of
      #   #initialize, and the current state is not included
      def to_h
        {
          id: id,
          map_script: @map_script,
          map_class_name: @map_class_name,
          reduce_script: @reduce_script,
          reduce_class_name: @reduce_class_name,
          job_input_bucket_name: @job_input_bucket_name,
          job_input_directory_path: @job_input_directory_path,
          job_output_bucket_name: @job_output_bucket_name,
          job_output_directory_path: @job_output_directory_path,
          map_worker_url: @map_worker&.url
        }
      end

      # @return [String] #to_h encoded as MessagePack
      def serialize
        to_h.to_msgpack
      end

      # @return [Hash] #to_h plus the current state under :state
      def dump
        to_h.merge(state: state)
      end

      # @return [String, nil] url of the assigned map worker, if any
      def map_worker_url
        @map_worker&.url
      end

      # @return [Boolean] true when every required attribute is non-empty
      def valid?
        [
          @map_script, @map_class_name,
          @reduce_script, @reduce_class_name,
          @job_input_bucket_name, @job_input_directory_path,
          @job_output_bucket_name, @job_output_directory_path
        ].none? { |value| value.to_s.empty? }
      end

      # Fires a state-machine event on this job.
      #
      # The event name can originate from an HTTP request body, so it is
      # checked against the events declared in the `aasm` block before being
      # dispatched; arbitrary public methods cannot be invoked through it.
      #
      # @param event [String, Symbol, nil] event name; nil is a no-op
      # @return [Object, nil] the result of firing the event, or nil when no
      #   event was given
      # @raise [ArgumentError] when the event is not declared on this class
      def update!(event: nil)
        return unless event

        event_name = event.to_s.to_sym
        unless self.class.aasm.events.map(&:name).include?(event_name)
          raise ArgumentError, "unknown event: `#{event}`"
        end

        public_send(event_name)
      end

      class << self
        # Rebuilds a job from the output of #serialize.
        #
        # @param data [String] MessagePack payload
        # @return [SimpleMapReduce::Server::Job]
        # @raise [ArgumentError] when the payload lacks required attributes
        def deserialize(data)
          attributes = MessagePack.unpack(data).each_with_object({}) do |(key, value), symbolized|
            symbolized[key.to_sym] = value
          end
          # Splat explicitly: Ruby 3 no longer turns a positional Hash into keywords.
          new(**attributes)
        end
      end
    end
  end
end
@@ -0,0 +1,304 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'sinatra'
5
+ require 'sinatra/json'
6
+ require 'sinatra/reloader' if development?
7
+
8
module SimpleMapReduce
  module Server
    # Sinatra application exposing the job tracker HTTP API.
    #
    # Jobs and workers are kept in class-level hashes keyed by id
    # (`JobTracker.jobs` / `JobTracker.workers`). Both readers return nil
    # until the first job or worker is registered.
    class JobTracker < Sinatra::Base
      # TODO: be configurable
      MAX_WORKER_RESERVABLE_SIZE = 5

      configure do
        use Rack::Lock
      end
      configure :development do
        register Sinatra::Reloader
      end

      # Registers a job and assigns it one reserved worker.
      # Responds 409 when no worker is ready, 400 on invalid job parameters
      # and 500 on any other failure.
      post '/jobs' do
        payload = JSON.parse(request.body.read, symbolize_names: true)
        available_workers = self.class.fetch_available_workers
        if available_workers.empty?
          status 409
          # The route's return value is the response body, so return the JSON.
          return json(succeeded: false, error_message: 'No worker is available now. Try it again.')
        end

        registered_job = nil
        selected_map_worker = available_workers.last
        begin
          registered_job = self.class.register_job(
            map_script: payload[:map_script],
            map_class_name: payload[:map_class_name],
            reduce_script: payload[:reduce_script],
            reduce_class_name: payload[:reduce_class_name],
            job_input_bucket_name: payload[:job_input_bucket_name],
            job_input_directory_path: payload[:job_input_directory_path],
            job_output_bucket_name: payload[:job_output_bucket_name],
            job_output_directory_path: payload[:job_output_directory_path],
            map_worker: selected_map_worker
          )
        rescue => e
          # Give the reserved worker back; a failure here is only logged so
          # the original error is still reported to the client.
          begin
            self.class.store_worker(selected_map_worker)
          rescue => store_worker_error
            logger.error("failed to store_worker: `#{store_worker_error.inspect}`")
          end

          status(e.is_a?(ArgumentError) ? 400 : 500)
          return json(succeeded: false, error_message: e.message)
        end

        json(succeeded: true, id: registered_job.id)
      end

      # Applies the attributes in the JSON body (e.g. `event`) to a job.
      put '/jobs/:id' do
        job = self.class.jobs&.[](params[:id])
        if job.nil?
          status 404
          return json(succeeded: false, error_message: 'job not found')
        end

        begin
          attrs = JSON.parse(request.body.read, symbolize_names: true)
          # Job#update! takes keywords; splat so this also works on Ruby 3.
          job.update!(**attrs)
          json(succeeded: true, job: job.dump)
        rescue => e
          status 400
          json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
        end
      end

      get '/jobs/:id' do
        job = self.class.jobs&.[](params[:id])
        if job.nil?
          status 404
          json(succeeded: false, error_message: 'job not found')
        else
          json(job: job.dump)
        end
      end

      get '/jobs' do
        json(self.class.jobs&.values&.map(&:dump) || [])
      end

      get '/workers/:id' do
        # `workers` is nil until the first registration, hence the safe navigation.
        worker = self.class.workers&.[](params[:id])
        if worker.nil?
          status 404
          json(succeeded: false, worker: nil)
        else
          json(succeeded: true, worker: worker.dump)
        end
      end

      # Registers a worker by url. Responds 400 when registration raises.
      post '/workers' do
        payload = JSON.parse(request.body.read, symbolize_names: true)

        worker = nil
        begin
          worker = self.class.register_worker(url: payload[:url])
        rescue => e
          status 400
          return json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
        end

        json(succeeded: true, id: worker.id)
      end

      # Applies the attributes in the JSON body to a worker.
      put '/workers/:id' do
        worker = self.class.workers&.[](params[:id])
        if worker.nil?
          status 404
          return json(succeeded: false, worker: nil)
        end

        begin
          attrs = JSON.parse(request.body.read, symbolize_names: true)
          worker.update!(attrs)
          json(succeeded: true, worker: worker.dump)
        rescue => e
          status 400
          json(succeeded: false, error_class: e.class.to_s, error_message: e.message)
        end
      end

      get '/workers' do
        json(self.class.workers&.values&.map(&:dump) || [])
      end

      # Reserves up to `worker_size` ready workers. The size defaults to 1 and
      # is capped at MAX_WORKER_RESERVABLE_SIZE.
      post '/workers/reserve' do
        payload = begin
          JSON.parse(request.body.read, symbolize_names: true)
        rescue
          {}
        end
        # A valid JSON body may still be a non-object (null, array, number).
        payload = {} unless payload.is_a?(Hash)

        requested_size = payload[:worker_size].to_i.abs
        worker_size = [
          (requested_size.zero? ? 1 : requested_size),
          MAX_WORKER_RESERVABLE_SIZE
        ].min

        # Start from an empty list so the rescue branch can iterate even when
        # the reservation itself raised.
        reserved_workers = []
        begin
          reserved_workers = self.class.fetch_available_workers(worker_size)
          json(succeeded: true, reserved_workers: reserved_workers.map(&:dump))
        rescue => e
          reserved_workers.each { |reserved_worker| self.class.store_worker(reserved_worker) }
          status 500
          json(succeeded: false, error_message: e.message)
        end
      end

      class << self
        attr_accessor :config
        attr_reader :jobs
        attr_reader :workers

        # Verifies S3 access, creates missing buckets and starts the job manager.
        def setup_job_tracker
          check_s3_access
          create_s3_buckets_if_not_existing
          job_manager
          logger.info('All setup process is done successfully. The job tracker is operation ready.')
          logger.info("This job tracker url: #{SimpleMapReduce.job_tracker_url}")
        end

        # Issues a list_buckets call; any error it raises propagates to the caller.
        def check_s3_access
          s3_client.list_buckets
          logger.info('[OK] s3 connection test')
        end

        # Creates the input, intermediate and output buckets that do not exist yet.
        def create_s3_buckets_if_not_existing
          current_bucket_names = s3_client.list_buckets.buckets.map(&:name)
          required_bucket_names = [
            SimpleMapReduce.s3_input_bucket_name,
            SimpleMapReduce.s3_intermediate_bucket_name,
            SimpleMapReduce.s3_output_bucket_name
          ].uniq # the same bucket may be configured for several roles

          required_bucket_names.each do |bucket_name|
            next if current_bucket_names.include?(bucket_name)

            s3_client.create_bucket(bucket: bucket_name)
            logger.info("create bucket #{bucket_name}")
          end
          logger.info('[OK] confirmed that all necessary s3 buckets exist')
        end

        # Builds a Job, enqueues it on the job manager and records it in `jobs`.
        #
        # @return [SimpleMapReduce::Server::Job] the registered job
        # @raise [ArgumentError] when the job parameters are invalid
        def register_job(map_script:,
                         map_class_name:,
                         reduce_script:,
                         reduce_class_name:,
                         job_input_bucket_name:,
                         job_input_directory_path:,
                         job_output_bucket_name:,
                         job_output_directory_path:,
                         map_worker:)

          job = ::SimpleMapReduce::Server::Job.new(
            map_script: map_script,
            map_class_name: map_class_name,
            reduce_script: reduce_script,
            reduce_class_name: reduce_class_name,
            job_input_directory_path: job_input_directory_path,
            job_input_bucket_name: job_input_bucket_name,
            job_output_bucket_name: job_output_bucket_name,
            job_output_directory_path: job_output_directory_path,
            map_worker: map_worker
          )
          @jobs ||= {}

          # enqueue job
          job_manager.enqueue_job!(SimpleMapReduce::Worker::RegisterMapTaskWorker, args: job)

          @jobs[job.id] = job
          job
        end

        # Builds a Worker for the url and records it in `workers`.
        #
        # @return [SimpleMapReduce::Server::Worker] the registered worker
        def register_worker(url:)
          worker = ::SimpleMapReduce::Server::Worker.new(url: url)
          @workers ||= {}

          @workers[worker.id] = worker
          worker
        end

        # Reserves up to worker_size workers that report `ready?`.
        #
        # @param worker_size [Integer] maximum number of workers to reserve
        # @return [Array<SimpleMapReduce::Server::Worker>] the reserved
        #   workers; empty when none is registered or ready
        def fetch_available_workers(worker_size = 1)
          mutex.synchronize do
            next [] if @workers.nil? || @workers.empty?

            @workers.values.select(&:ready?).take(worker_size).each(&:reserve)
          end
        end

        # Puts a previously reserved worker back into the ready state.
        #
        # @raise [KeyError] when the worker is not registered
        def store_worker(worker)
          mutex.synchronize do
            @workers ||= {}
            @workers.fetch(worker.id).ready!
          end
        end

        def job_manager
          @job_manager ||= ::Rasteira::EmbedWorker::Manager.run
        end

        def mutex
          @mutex ||= Mutex.new
        end

        def s3_client
          SimpleMapReduce::S3Client.instance.client
        end

        def logger
          SimpleMapReduce.logger
        end

        # @override
        # Shuts the job manager's workers down before stopping the server.
        def quit!
          job_manager.shutdown_workers!
          super
        end
      end
    end
  end
end