simple_map_reduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'sinatra'
5
+ require 'sinatra/json'
6
+ require 'sinatra/reloader' if development?
7
+
8
module SimpleMapReduce
  module Server
    # HTTP endpoint of a worker process. The job tracker POSTs serialized
    # map/reduce tasks here; each handler deserializes the payload and
    # enqueues an asynchronous worker job on the in-process job manager.
    class JobWorker < Sinatra::Base
      configure do
        # Serialize request handling: handlers touch shared class-level
        # state (worker_id, job manager queue).
        use Rack::Lock
      end
      configure :development do
        register Sinatra::Reloader
      end

      # Accepts a serialized Job and schedules its map phase.
      # Responds with the job id so the tracker can correlate.
      post '/map_tasks' do
        raw_body = request.body.read
        job = SimpleMapReduce::Server::Job.deserialize(raw_body)
        self.class.job_manager.enqueue_job!(SimpleMapReduce::Worker::RunMapTaskWorker,
                                            args: [job, self.class.worker_id])

        json(succeeded: true, job_id: job.id)
      end

      # Accepts a serialized Task and schedules its reduce phase.
      post '/reduce_tasks' do
        raw_body = request.body.read
        task = SimpleMapReduce::Server::Task.deserialize(raw_body)

        self.class.job_manager.enqueue_job!(SimpleMapReduce::Worker::RunReduceTaskWorker,
                                            args: [task, self.class.worker_id])

        json(succeeded: true, job_id: task.job_id, task_id: task.id)
      end

      class << self
        # Id assigned by the job tracker when this worker registers.
        attr_accessor :worker_id

        # Boot sequence: verify S3 connectivity, register with the job
        # tracker, then start the background job manager.
        # @raise propagates any S3/HTTP error so a misconfigured worker
        #   fails fast instead of serving requests it cannot fulfil.
        def setup_worker
          check_s3_access
          register_myself_to_job_tracker
          job_manager
          logger.info('All setup process is done successfully. This worker is operation ready.')
          logger.info("This job worker url: #{SimpleMapReduce.job_worker_url}, id: #{worker_id}")
          logger.info("The job tracker url: #{SimpleMapReduce.job_tracker_url}")
        end

        # Cheap connectivity probe; raises if credentials/endpoint are wrong.
        def check_s3_access
          s3_client.list_buckets
          logger.info('[OK] s3 connection test')
        end

        # POSTs this worker's URL to the tracker and stores the id the
        # tracker assigns in response.
        def register_myself_to_job_tracker
          response = http_client.post do |request|
            request.url('/workers')
            request.body = { url: SimpleMapReduce.job_worker_url }.to_json
          end

          body = JSON.parse(response.body, symbolize_names: true)
          self.worker_id = body[:id]
          # BUGFIX: the message describes the job tracker we registered with,
          # but previously interpolated the worker's own URL.
          logger.info("[OK] registering this worker to the job_tracker #{SimpleMapReduce.job_tracker_url}")
        end

        # Memoized embedded background-job manager.
        def job_manager
          @job_manager ||= ::Rasteira::EmbedWorker::Manager.run
        end

        # Memoized JSON Faraday client pointed at the job tracker.
        def http_client
          @http_client ||= ::Faraday.new(
            url: SimpleMapReduce.job_tracker_url,
            headers: {
              'Accept' => 'application/json',
              'Content-Type' => 'application/json'
            }
          ) do |faraday|
            faraday.response :raise_error
            faraday.adapter Faraday.default_adapter
          end
        end

        def s3_client
          SimpleMapReduce::S3Client.instance.client
        end

        def logger
          SimpleMapReduce.logger
        end

        # @override Sinatra::Base.quit! — drain background workers before
        # the web server shuts down.
        def quit!
          job_manager.shutdown_workers!
          super
        end
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'msgpack'
4
+ require 'securerandom'
5
+
6
module SimpleMapReduce
  module Server
    # Descriptor for one (reduce) task: which script/class to run, where
    # its input lives on S3 and where output goes, with an AASM-managed
    # lifecycle: ready -> in_process -> succeeded | failed.
    class Task
      extend Forwardable
      include AASM

      attr_reader :job_id,
                  :task_class_name, :task_script,
                  :task_input_bucket_name, :task_input_file_path,
                  :task_output_bucket_name, :task_output_directory_path,
                  :worker

      delegate current_state: :aasm
      alias state current_state

      aasm do
        state :ready, initial: true
        state :in_process
        state :succeeded
        state :failed

        event :start do
          # BUGFIX: was `transitions ready: :in_process` — AASM expects
          # `from:`/`to:` options, so the event had no valid transition.
          transitions from: :ready, to: :in_process
        end

        event :succeed do
          transitions from: :in_process, to: :succeeded
        end

        event :fail do
          transitions from: :in_process, to: :failed
        end
      end

      # @param id [String, nil] unique id; generated lazily when nil
      # @param job_id [String] id of the owning job
      # @param task_class_name [String] class to instantiate from task_script
      # @param task_script [String] Ruby source evaluated by the worker
      # @param task_input_bucket_name [String] S3 bucket holding the input
      # @param task_input_file_path [String] S3 key of the input object
      # @param task_output_bucket_name [String] S3 bucket for the output
      # @param task_output_directory_path [String] S3 prefix for the output
      # @param worker [Object, nil] worker currently assigned to this task
      def initialize(id: nil,
                     job_id:,
                     task_class_name:,
                     task_script:,
                     task_input_bucket_name:,
                     task_input_file_path:,
                     task_output_bucket_name:,
                     task_output_directory_path:,
                     worker: nil)
        @id = id
        @job_id = job_id
        @task_class_name = task_class_name
        @task_script = task_script
        @task_input_bucket_name = task_input_bucket_name
        @task_input_file_path = task_input_file_path
        @task_output_bucket_name = task_output_bucket_name
        @task_output_directory_path = task_output_directory_path
        @worker = worker
      end

      # Lazily generated unique identifier.
      def id
        @id ||= SecureRandom.uuid
      end

      # Serializable attributes (state and worker are intentionally
      # excluded; see #dump and #serialize).
      def to_h
        {
          id: id,
          job_id: @job_id,
          task_class_name: @task_class_name,
          task_script: @task_script,
          task_input_bucket_name: @task_input_bucket_name,
          task_input_file_path: @task_input_file_path,
          task_output_bucket_name: @task_output_bucket_name,
          task_output_directory_path: @task_output_directory_path
        }
      end

      # MessagePack wire format consumed by .deserialize.
      def serialize
        to_h.to_msgpack
      end

      # Attributes plus current state, for inspection/API responses.
      def dump
        to_h.merge(state: state)
      end

      class << self
        # Rebuilds a Task from a msgpack payload produced by #serialize.
        def deserialize(data)
          # BUGFIX: symbolize keys and splat explicitly — passing a bare
          # Hash to keyword-only #new breaks under Ruby 3's strict
          # keyword-argument separation.
          attributes = MessagePack.unpack(data).map { |k, v| [k.to_sym, v] }.to_h
          new(**attributes)
        end
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'securerandom'
4
+ require 'forwardable'
5
+ require 'aasm'
6
+
7
module SimpleMapReduce
  module Server
    # A registered worker node, identified by its URL, whose availability
    # is tracked by an AASM state machine: ready -> reserved -> working
    # (and back to ready via the `ready` event from any state).
    class Worker
      extend Forwardable
      include AASM

      attr_accessor :url

      delegate current_state: :aasm
      alias state current_state

      aasm do
        state :ready, initial: true
        state :reserved
        state :working

        # Return the worker to the available pool (allowed from any state).
        event :ready do
          transitions to: :ready
        end

        # Claim a worker for an upcoming task.
        event :reserve do
          transitions from: %i(ready working), to: :reserved
        end

        # Mark a reserved worker as actively processing.
        event :work do
          transitions from: :reserved, to: :working
        end
      end

      # @param url [String] the worker's HTTP endpoint
      # @raise [ArgumentError] when the URL is blank or unparseable
      def initialize(url:)
        @url = url
        raise ArgumentError, 'invalid url' unless valid?
      end

      # Lazily generated unique identifier.
      def id
        @id ||= SecureRandom.uuid
      end

      # Attribute snapshot for API responses / inspection.
      def dump
        {
          id: id,
          url: @url,
          state: state
        }
      end

      # update Job
      # @params [Hash] attributes
      # @options attributes [String] url
      # @options attributes [String] event
      def update!(url: nil, event: nil)
        self.url = url if url
        public_send(event.to_sym) if event
      end

      private

      # True when the URL is present and matches the generic URI grammar.
      def valid?
        !@url.to_s.empty? && @url =~ URI::DEFAULT_PARSER.make_regexp
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module SimpleMapReduce
  # Gem version (semantic versioning).
  VERSION = '0.1.0'
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module SimpleMapReduce
  module Worker
    # Background job that hands a Job's map task to its assigned map
    # worker over HTTP (msgpack body). On success the worker and job
    # state machines advance; on any error the worker is returned to the
    # tracker's pool and the job is marked failed.
    class RegisterMapTaskWorker
      def perform(job)
        logger.info('register map task worker start!')
        response = http_client(job.map_worker.url).post do |req|
          req.url('/map_tasks')
          req.body = job.serialize
        end
        logger.debug(response.body)

        job.map_worker.work!
        job.start!
      rescue => e
        logger.error(e.inspect)
        logger.error(e.backtrace.take(50))
        # Give the worker back so other jobs can use it, then fail the job.
        SimpleMapReduce::Server::JobTracker.store_worker(job.map_worker)
        job.failed!
      end

      private

      HTTP_MSGPACK_HEADER = {
        'Accept' => 'application/x-msgpack',
        'Content-Type' => 'application/x-msgpack'
      }.freeze

      # Fresh Faraday client for the given worker URL with bounded
      # timeouts; raises on non-2xx responses.
      def http_client(url)
        ::Faraday.new(
          url: url,
          headers: HTTP_MSGPACK_HEADER,
          request: { open_timeout: 10, timeout: 15 }
        ) do |faraday|
          faraday.response :logger
          faraday.response :raise_error
          faraday.adapter Faraday.default_adapter
        end
      end

      def logger
        SimpleMapReduce.logger
      end
    end
  end
end
@@ -0,0 +1,161 @@
1
+ # frozen_string_literal: true
2
+
3
module SimpleMapReduce
  module Worker
    # Background job that executes a Job's map phase on this worker:
    # evaluates the user-supplied map script inside a throwaway wrapper
    # class, streams the S3 input through it, partitions ("shuffles") the
    # output across reserved workers, and dispatches reduce tasks to them.
    class RunMapTaskWorker
      # @param job the Job whose map script/input to run
      # @param map_worker_id [String] id of the worker executing this task
      def perform(job, map_worker_id)
        # Evaluate the user script inside a per-job wrapper class so its
        # constants don't collide across jobs; cleaned up in `ensure`.
        task_wrapper_class_name = "TaskWrapper#{job.id.delete('-')}"
        self.class.class_eval("class #{task_wrapper_class_name}; end", 'Task Wrapper Class')
        task_wrapper_class = self.class.const_get(task_wrapper_class_name)
        task_wrapper_class.class_eval(job.map_script, 'Map task script')
        map_task = task_wrapper_class.const_get(job.map_class_name, false).new
        unless map_task.respond_to?(:map)
          # TODO: notify job_tracker
          logger.error('no map method')
          return
        end
        logger.info('map task start')

        # Download the job input into a tempfile (deleted in `ensure`).
        local_input_cache = Tempfile.new
        s3_client.get_object(
          response_target: local_input_cache.path,
          bucket: job.job_input_bucket_name,
          key: job.job_input_directory_path
        )
        local_input_cache.rewind

        # Feed the input line by line to the user's map method, which
        # writes its output into local_output_cache.
        local_output_cache = Tempfile.new
        local_input_cache.each_line(chomp: true, rs: "\n") do |line|
          map_task.map(line, local_output_cache)
        end

        local_output_cache.rewind
        logger.debug("output data size: #{local_output_cache.size}")
        logger.debug('---map output digest---')
        local_output_cache.take(5).each do |line|
          logger.debug(line)
        end
        logger.debug('---map output digest---')

        # Ask the tracker to reserve workers for the reduce phase.
        response = http_client(SimpleMapReduce.job_tracker_url).post do |request|
          request.url('/workers/reserve')
          # TODO: providing a way to specify worker_size
          request.body = { worker_size: 2 }.to_json
        end
        logger.debug(response.body)

        # {"succeeded":true,"workers":[{"id":70157882164440,"url":"http://localhost:4569","state":'reserved'}]}
        reserved_workers = JSON.parse(response.body, symbolize_names: true)[:reserved_workers]
        if reserved_workers.count == 0
          # keep working with same worker
          reserved_workers << { id: map_worker_id, url: job.map_worker_url, state: 'working' }
        end

        shuffle(job, reserved_workers, local_output_cache)

        # If this map worker is not one of the reduce workers, tell the
        # tracker it is free again.
        unless reserved_workers.map { |w| w[:id] }.include?(map_worker_id)
          response = http_client(SimpleMapReduce.job_tracker_url).put do |request|
            request.url("/workers/#{map_worker_id}")
            request.body = { event: 'ready' }.to_json
          end
          logger.debug(response.body)
        end
      rescue => e
        logger.error(e.inspect)
        logger.error(e.backtrace.take(50))
        # TODO: notifying to job_tracker that this task have failed
      ensure
        # Tempfiles and the per-job wrapper class are always cleaned up,
        # including on the early `return` and on error.
        local_input_cache&.delete
        local_output_cache&.delete
        reserved_workers&.each do |worker|
          worker[:shuffled_local_output]&.delete
        end
        if self.class.const_defined?(task_wrapper_class_name.to_sym)
          self.class.send(:remove_const, task_wrapper_class_name.to_sym)
        end
        logger.info('map task end')
      end

      private

      def s3_client
        SimpleMapReduce::S3Client.instance.client
      end

      def logger
        SimpleMapReduce.logger
      end

      # NOTE(review): despite the name, these are msgpack headers (they
      # match RegisterMapTaskWorker::HTTP_MSGPACK_HEADER) — consider
      # renaming; JSON bodies in perform are sent with this header too.
      HTTP_JSON_HEADER = {
        'Accept' => 'application/x-msgpack',
        'Content-Type' => 'application/x-msgpack'
      }.freeze

      # Fresh Faraday client for the given URL; raises on non-2xx.
      def http_client(url)
        ::Faraday.new(
          url: url,
          headers: HTTP_JSON_HEADER
        ) do |faraday|
          faraday.response :logger
          faraday.response :raise_error
          faraday.adapter Faraday.default_adapter
        end
      end

      # Partitions the map output across `workers` by key hash, uploads
      # each partition to the intermediate S3 bucket, and POSTs a reduce
      # task to each worker.
      # @param job the owning Job
      # @param workers [Array<Hash>] reserved worker descriptors (:id, :url, :state)
      # @param local_output_cache [Tempfile] map output, one JSON object per line
      def shuffle(job, workers, local_output_cache)
        workers_count = workers.count
        raise 'No workers' unless workers_count > 0

        workers.each do |worker|
          worker[:shuffled_local_output] = Tempfile.new
        end

        # Route each record to a partition by key hash.
        # NOTE(review): String#hash is seeded per process, so partition
        # assignment is only stable within this single run — confirm that
        # nothing downstream relies on cross-run stability.
        local_output_cache.each_line(rs: "\n") do |raw_line|
          output = JSON.parse(raw_line, symbolize_names: true)
          partition_id = output[:key].hash % workers_count
          workers[partition_id][:shuffled_local_output].puts(output.to_json)
        end

        task_script = job.reduce_script
        task_class_name = job.reduce_class_name
        task_input_bucket_name = SimpleMapReduce.s3_intermediate_bucket_name
        task_output_bucket_name = job.job_output_bucket_name
        task_output_directory_path = job.job_output_directory_path
        task_input_file_path_prefix = "#{job.id}/map_output_#{Time.now.to_i}/"

        workers.each_with_index do |worker, partition_id|
          reduce_task = ::SimpleMapReduce::Server::Task.new(
            job_id: job.id,
            task_class_name: task_class_name,
            task_script: task_script,
            task_input_bucket_name: task_input_bucket_name,
            task_input_file_path: "#{task_input_file_path_prefix}#{partition_id}_map_output.txt",
            task_output_bucket_name: task_output_bucket_name,
            task_output_directory_path: task_output_directory_path
          )

          # Upload this worker's partition as the reduce task's input.
          local_output_cache = worker[:shuffled_local_output]
          local_output_cache.rewind
          s3_client.put_object(
            body: local_output_cache.read,
            bucket: reduce_task.task_input_bucket_name,
            key: reduce_task.task_input_file_path
          )

          response = http_client(worker[:url]).post do |request|
            request.url('/reduce_tasks')
            request.body = reduce_task.serialize
          end
          logger.debug(response.body)

          # Workers already in 'working' state need no tracker update.
          next if worker[:state] == 'working'
          response = http_client(SimpleMapReduce.job_tracker_url).put do |request|
            request.url("/workers/#{worker[:id]}")
            request.body = { event: 'work' }.to_json
          end
          logger.debug(response.body)
        end
      end
    end
  end
end