reduce_map 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/grpc/server_pb.rb +16 -0
- data/lib/grpc/server_services_pb.rb +21 -0
- data/lib/grpc/worker_pb.rb +17 -0
- data/lib/grpc/worker_services_pb.rb +21 -0
- data/lib/map_reduce.rb +139 -0
- data/lib/worker.rb +118 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7315d208b60e9ee8eebaa9919ff385f4446144d7b6215d0d5fa164be30be8105
|
4
|
+
data.tar.gz: 845b4cc210894649511a36bbdf30ab4d8da9ae49c0c482d24e9b1240ea5d0f64
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6e083efed9b116274fd1323dbad7997fa2c52d422e59a58646b56bc5f1e9298ca18c18b78a1429a6aeee75a809bb4ff58bf4fc8e854cd341ec3dc4d725752d7a
|
7
|
+
data.tar.gz: 1762e721e164282bd37602b1db69fb4abcdfa3236630960cd5bd36e2f7bda35bc3818ee17f2a532038f6c2ecff172842e8689160f924ba57349ab67861318196
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: server.proto

require 'google/protobuf'

# Serialized FileDescriptorProto for server.proto: messages Empty, WorkerInfo,
# RegisterWorkerResult, RegisterWorkerRequest and the MapReduceMaster service.
descriptor_data = "\n\x0cserver.proto\"\x07\n\x05\x45mpty\"=\n\nWorkerInfo\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\t\x12\x10\n\x08\x66ilename\x18\x03 \x01(\t\"&\n\x14RegisterWorkerResult\x12\x0e\n\x06result\x18\x01 \x01(\x08\"?\n\x15RegisterWorkerRequest\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\n\n\x02ip\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t2q\n\x0fMapReduceMaster\x12\x41\n\x0eRegisterWorker\x12\x16.RegisterWorkerRequest\x1a\x15.RegisterWorkerResult\"\x00\x12\x1b\n\x04Ping\x12\x0b.WorkerInfo\x1a\x06.Emptyb\x06proto3"

# Register the descriptor with the process-wide generated pool so the message
# classes below can be resolved by name.
pool = Google::Protobuf::DescriptorPool.generated_pool
pool.add_serialized_file(descriptor_data)

# Top-level Ruby message classes generated from server.proto.
Empty = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("Empty").msgclass
WorkerInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("WorkerInfo").msgclass
RegisterWorkerResult = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("RegisterWorkerResult").msgclass
RegisterWorkerRequest = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("RegisterWorkerRequest").msgclass
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
# Source: server.proto for package ''

require 'grpc'
require_relative 'server_pb'

# gRPC service definition for the master node. Workers use the generated
# Stub to call these RPCs; the master subclasses Service and implements
# snake_case handlers (register_worker, ping).
module MapReduceMaster
  class Service

    include ::GRPC::GenericService

    self.marshal_class_method = :encode
    self.unmarshal_class_method = :decode
    self.service_name = 'MapReduceMaster'

    # Adds a worker to the master's registry.
    rpc :RegisterWorker, ::RegisterWorkerRequest, ::RegisterWorkerResult
    # Reports the outcome of a map operation back to the master.
    rpc :Ping, ::WorkerInfo, ::Empty
  end

  # Client stub class for talking to a MapReduceMaster server.
  Stub = Service.rpc_stub_class
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: worker.proto

require 'google/protobuf'

# worker.proto imports server.proto (for Empty), so its descriptor must be
# registered first.
require_relative 'server_pb'

# Serialized FileDescriptorProto for worker.proto: messages MapInfoResult,
# MapInfo, ReduceInfo and the Worker service.
descriptor_data = "\n\x0cworker.proto\x1a\x0cserver.proto\"-\n\rMapInfoResult\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x08\"7\n\x07MapInfo\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\r\n\x05\x62lock\x18\x02 \x01(\t\x12\x0b\n\x03key\x18\x03 \x01(\t\":\n\nReduceInfo\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\r\n\x05\x62lock\x18\x02 \x01(\t\x12\x0b\n\x03key\x18\x03 \x01(\t2R\n\x06Worker\x12 \n\x0cMapOperation\x12\x08.MapInfo\x1a\x06.Empty\x12&\n\x0fReduceOperation\x12\x0b.ReduceInfo\x1a\x06.Emptyb\x06proto3"

# Register the descriptor with the process-wide generated pool.
pool = Google::Protobuf::DescriptorPool.generated_pool
pool.add_serialized_file(descriptor_data)

# Top-level Ruby message classes generated from worker.proto.
MapInfoResult = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("MapInfoResult").msgclass
MapInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("MapInfo").msgclass
ReduceInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("ReduceInfo").msgclass
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
# Source: worker.proto for package ''

require 'grpc'
require_relative 'worker_pb'

# gRPC service definition for a worker node. The master uses the generated
# Stub to dispatch work; the worker subclasses Service and implements
# snake_case handlers (map_operation, reduce_operation).
module WorkerServer
  class Service

    include ::GRPC::GenericService

    self.marshal_class_method = :encode
    self.unmarshal_class_method = :decode
    self.service_name = 'Worker'

    # Runs the user's map block over one input chunk.
    rpc :MapOperation, ::MapInfo, ::Empty
    # Runs the user's reduce block over the collected map output.
    rpc :ReduceOperation, ::ReduceInfo, ::Empty
  end

  # Client stub class for talking to a Worker server.
  Stub = Service.rpc_stub_class
end
|
data/lib/map_reduce.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'base64'
require 'digest'
require 'fileutils'
require 'pathname'

require 'async'
require 'async/semaphore'
require 'grpc'
require 'method_source'

require_relative './grpc/server_services_pb'
require_relative './grpc/worker_services_pb'
require_relative 'worker'
|
10
|
+
|
11
|
+
class MapReduce < MapReduceMaster::Service
  # Master node of the map/reduce framework. Splits the input file into
  # chunks, ships chunks plus the user's map block to registered workers over
  # gRPC, and coordinates the transition from the map phase to the reduce phase.
  attr_accessor :logger, :worker_count, :map_count, :data, :files, :map_finished

  # logger:    destination for progress messages.
  # map_count: number of map workers / file chunks (default 5).
  # file:      input source; must respond to #to_path and #readlines
  #            (e.g. a File or Pathname) — TODO confirm against callers.
  def initialize(logger:, map_count: 5, file:)
    @file = file
    @map_count = map_count
    @worker_count = 0
    @logger = logger
    @data = []
    @files = nil
    @map_finished = false
    # Single shared lock guarding @data/@worker_count. The previous code
    # created a fresh Mutex per register_worker call, which provided no
    # mutual exclusion at all (and `ensure mutex.unlock` could run on nil).
    @mutex = Mutex.new
  end

  # gRPC handler: a worker reports the outcome of a map operation.
  # On failure the chunk is re-queued so another worker can retry it.
  def ping(worker_req, _)
    uuid = worker_req.uuid
    worker = data.find { |w| w[:uuid] == uuid }
    worker[:status] = 0 if worker # back to idle; ignore pings from unknown uuids

    if worker_req.success == 'true'
      logger.info("[Master] Worker #{uuid} completed the map operation successfully")
    else
      logger.info("[Master] Worker #{uuid} failed to complete the map operation successfully")
      @files << worker_req.filename # re-queue the failed chunk
    end

    Empty.new
  end

  # gRPC handler: add a worker to the registry. Synchronized because several
  # workers can register concurrently from different handler threads.
  def register_worker(worker_req, _)
    @mutex.synchronize do
      data << { uuid: worker_req.uuid, ip: worker_req.ip, status: 0 }
      @worker_count += 1
    end
    @logger.info('[Master] Worker register success')
    RegisterWorkerResult.new(result: true)
  end

  # Boot the local worker processes and wait for them to start.
  def wait_for_enough_workers
    logger.info('[Master] Wait for the creation of workers')
    Worker.start_worker(logger, map_count)
    logger.info('[Master] Finished!')
  end

  # Schedule the reduce phase. The block's source text is extracted via
  # method_source, stripped of its `master.reduce do ... end` frame,
  # Base64-encoded and shipped to an idle worker once the map phase is done.
  def reduce(&block)
    Thread.new do
      loop do
        unless @map_finished
          sleep 0.1 # avoid busy-spinning while maps are still running
          next
        end

        source = block.source.sub(/^\s*master\.reduce do\s*\n/, '').sub(/^\s*end\s*\n/, '')
        message = Base64.encode64(source)
        worker = data.find { |w| w[:status] == 0 }
        if worker.nil?
          sleep 0.1 # every worker is busy; wait for one to go idle
          next
        end

        stub = WorkerServer::Stub.new(worker[:ip], :this_channel_is_insecure)
        request = ReduceInfo.new(filename: "files/#{@encrypt_key}/map.txt", block: message, key: @encrypt_key)
        worker[:status] = 'processing'
        stub.reduce_operation(request)

        break
      end
    end
  end

  # Schedule the map phase: ship the user's block plus one file chunk to each
  # idle worker until every chunk has been processed, then flag completion so
  # the reduce thread can start.
  def map(&block)
    source = block.source.sub(/^\s*master\.map do\s*\n/, '').sub(/^\s*end\s*\n/, '')
    message = Base64.encode64(source)
    Thread.new do
      loop do
        if files.empty?
          @map_finished = true
          break
        end

        Async do
          workers = data.select { |w| w[:status] == 0 }.first(files.count)
          # No idle workers yet (Async::Semaphore needs a positive limit);
          # retry on the next loop pass.
          next if workers.empty?

          semaphore = Async::Semaphore.new(workers.count)
          tasks = workers.map do |worker|
            semaphore.async do
              stub = WorkerServer::Stub.new(worker[:ip], :this_channel_is_insecure)
              request = MapInfo.new(filename: files.pop, block: message, key: @encrypt_key)
              worker[:status] = 'processing'
              stub.map_operation(request)
            end
          end
          tasks.each(&:wait)
        end.wait
      end
    end
  end

  # Split the input file into chunks on disk and remember their paths.
  def distribute_input
    path_name = @file
    key = path_name.to_path
    logger.info('[Master] Start to distribute input')
    @files = split_files(key, path_name)
  end

  private

  # Write the input out as roughly equal chunks under ./files/<digest>/ and
  # return the chunk paths. Unlike the previous version this does not leak an
  # open File handle, cannot divide by zero on tiny inputs, drops no trailing
  # lines, and carries no debug `p` output.
  def split_files(key, file)
    @encrypt_key = generate_digest_key(key)
    FileUtils.mkdir_p("./files/#{@encrypt_key}")
    file_data = file.readlines.map(&:chomp)
    # At least one line per chunk, even when map_count exceeds the line count.
    lines_per_chunk = [file_data.length / @map_count, 1].max
    file_data.each_slice(lines_per_chunk).with_index.map do |chunk, index|
      path = "./files/#{@encrypt_key}/file_#{index}"
      # Array#to_s (inspect) form, matching what the workers' blocks consume.
      File.write(path, chunk)
      path
    end
  end

  # SHA-256 hex digest of the input path; used as a per-run directory name.
  def generate_digest_key(key)
    Digest::SHA256.hexdigest(key)
  end
end
|
data/lib/worker.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'base64'
require 'logger'
require 'securerandom'

require 'async'
require 'google/protobuf'

require_relative './grpc/server_pb'
require_relative './grpc/server_services_pb'
require_relative './grpc/worker_services_pb'
|
8
|
+
|
9
|
+
class Worker < WorkerServer::Service
  # A map/reduce worker. Runs its own gRPC server, registers itself with the
  # master, and executes map/reduce blocks that the master ships over the wire.
  attr_accessor :worker_number, :master_ip, :port, :logger, :uuid, :result

  def initialize(worker_number:, master_ip:, port:, logger:)
    @worker_number = worker_number
    @uuid = generate_uuid
    @master_ip = master_ip
    @port = port
    @logger = logger
    @result = [] # [key, count] pairs accumulated via #emit_intermediate
  end

  # gRPC handler: run the user's map block over one input chunk, append the
  # emitted pairs to the shared map file, and report the outcome to the master.
  # SECURITY NOTE: the block is eval'd code received over the network — this
  # design is only acceptable on a fully trusted network.
  def map_operation(worker_req, _)
    block = eval(Base64.decode64(worker_req.block))
    block.call(File.read(worker_req.filename))
    File.open("files/#{worker_req.key}/map.txt", 'a') do |file|
      result.each { |pair| file.puts pair.inspect }
    end
    logger.info("[Worker] Worker #{uuid} gRPC finished the map operation")
    report_to_master('true', worker_req.filename)
    Empty.new
  rescue StandardError => e
    logger.error("[Worker] #{uuid} with error #{e}")
    # 'false' instead of nil: assigning nil to a proto3 string field raises,
    # which would have crashed this rescue path before the master was told.
    report_to_master('false', worker_req.filename)
    Empty.new
  end

  # gRPC handler: group the sorted map output by key and run the user's reduce
  # block once per key, appending each result to reduce.txt.
  def reduce_operation(worker_req, _)
    data = sort_map_file(worker_req.filename)
    unique_keys = data.map { |item| item[0] }.uniq
    file_path = "files/#{worker_req.key}/reduce.txt"
    logger.info('[Worker] Starting Reduce Operation')
    # Decode once, outside the loop — the block is identical for every key.
    block = eval(Base64.decode64(worker_req.block))
    Async do
      # Iterate the keys themselves. The previous 1.upto(count) indexing
      # skipped the first key and read one element past the end of the array.
      unique_keys.each do |key|
        Async do
          results = data.select { |item| item[0] == key }
          response = block.call(results)
          File.open(file_path, 'a') { |file| file.puts response }
        end
      end
    end
    logger.info('[Worker] Finished Reduce Operation')
    logger.info("[Worker] File stored at #{file_path}")
    Empty.new
  end

  # Start this worker's gRPC server in a background thread, then register
  # with the master so it can receive work.
  def start
    grpc_server = GRPC::RpcServer.new
    grpc_server.add_http2_port("0.0.0.0:#{port}", :this_port_is_insecure)
    grpc_server.handle(self)
    Thread.new do
      grpc_server.run_till_terminated
    ensure
      # Runs on any exit of the server loop, not only on failure.
      logger.info('[Worker] Worker gRPC thread failed')
    end
    logger.info('[Worker] Worker gRPC thread start')

    logger.info('[Worker] load functions finish')
    register_worker
  end

  # Spawn +worker_number+ workers, one per port ("30001", "30002", ...).
  # NOTE(review): the port is built by string concatenation, so 10 workers
  # would yield "300010" — fine for small counts, verify before scaling up.
  def self.start_worker(logger, worker_number)
    master_ip = '0.0.0.0:50051'
    Async do
      1.upto(worker_number) do |i|
        Async do
          worker = new(worker_number:, master_ip:, port: "3000#{i}", logger:)
          worker.start
        end
      end
    end
  end

  private

  # Ping the master with the outcome of a map operation
  # (success is the string 'true' or 'false').
  def report_to_master(success, filename)
    stub = MapReduceMaster::Stub.new(@master_ip, :this_channel_is_insecure)
    request = WorkerInfo.new(uuid: @uuid, success:, filename:)
    stub.ping(request)
  end

  # Parse the map output file back into [key, count] pairs sorted by key.
  # NOTE(review): lines are eval'd because they were written with #inspect;
  # safe only while the map file is produced by trusted workers.
  def sort_map_file(file_path)
    file_data = []
    File.open(file_path, 'r') do |file|
      file.each_line do |line|
        file_data << eval(line.strip)
      end
    end
    file_data.sort_by { |item| item[0] }
  end

  # Record one [key, count] pair produced by a user map block (the blocks
  # call this as `emit` / `emit_intermediate`).
  def emit_intermediate(k, count:)
    result << [k, count]
  end

  alias emit emit_intermediate

  def generate_uuid
    SecureRandom.uuid
  end

  # Announce this worker to the master's registry.
  def register_worker
    stub = MapReduceMaster::Stub.new(@master_ip, :this_channel_is_insecure)
    request = RegisterWorkerRequest.new(uuid: @uuid, ip: "localhost:#{@port}", type: 'map')
    stub.register_worker(request)
    @logger.info('[Worker] Worker register itself finish')
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reduce_map
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Francisco Paradela
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-07-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: async
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.12.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.12.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: grpc
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.62.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.62.0
|
41
|
+
description: Ruby map/reduce framework
|
42
|
+
email: franciscoleite.dev@protonmail.com
|
43
|
+
executables: []
|
44
|
+
extensions: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
files:
|
47
|
+
- lib/grpc/server_pb.rb
|
48
|
+
- lib/grpc/server_services_pb.rb
|
49
|
+
- lib/grpc/worker_pb.rb
|
50
|
+
- lib/grpc/worker_services_pb.rb
|
51
|
+
- lib/map_reduce.rb
|
52
|
+
- lib/worker.rb
|
53
|
+
homepage: https://rubygems.org/gems/map_reduce
|
54
|
+
licenses:
|
55
|
+
- MIT
|
56
|
+
metadata: {}
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubygems_version: 3.5.9
|
73
|
+
signing_key:
|
74
|
+
specification_version: 4
|
75
|
+
summary: This framework is designed to provide a fully multi-threaded, distributed
|
76
|
+
and asynchronous approach to MapReduce processing.
|
77
|
+
test_files: []
|