reduce_map 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+ metadata.gz: 7315d208b60e9ee8eebaa9919ff385f4446144d7b6215d0d5fa164be30be8105
+ data.tar.gz: 845b4cc210894649511a36bbdf30ab4d8da9ae49c0c482d24e9b1240ea5d0f64
+ SHA512:
+ metadata.gz: 6e083efed9b116274fd1323dbad7997fa2c52d422e59a58646b56bc5f1e9298ca18c18b78a1429a6aeee75a809bb4ff58bf4fc8e854cd341ec3dc4d725752d7a
+ data.tar.gz: 1762e721e164282bd37602b1db69fb4abcdfa3236630960cd5bd36e2f7bda35bc3818ee17f2a532038f6c2ecff172842e8689160f924ba57349ab67861318196
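These digests can be checked against a locally fetched copy of the gem. Below is a minimal verification sketch using Ruby's standard Digest library; the archive paths are illustrative and assume metadata.gz and data.tar.gz have already been extracted from reduce_map-0.0.1.gem:

```ruby
require 'digest'

# Paths are illustrative: metadata.gz and data.tar.gz as extracted from
# reduce_map-0.0.1.gem. Compare the printed digests with checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |archive|
  puts "#{archive} SHA256: #{Digest::SHA256.file(archive).hexdigest}"
  puts "#{archive} SHA512: #{Digest::SHA512.file(archive).hexdigest}"
end
```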
data/lib/grpc/server_pb.rb ADDED
@@ -0,0 +1,16 @@
+ # frozen_string_literal: true
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # source: server.proto
+
+ require 'google/protobuf'
+
+
+ descriptor_data = "\n\x0cserver.proto\"\x07\n\x05\x45mpty\"=\n\nWorkerInfo\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\t\x12\x10\n\x08\x66ilename\x18\x03 \x01(\t\"&\n\x14RegisterWorkerResult\x12\x0e\n\x06result\x18\x01 \x01(\x08\"?\n\x15RegisterWorkerRequest\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\n\n\x02ip\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t2q\n\x0fMapReduceMaster\x12\x41\n\x0eRegisterWorker\x12\x16.RegisterWorkerRequest\x1a\x15.RegisterWorkerResult\"\x00\x12\x1b\n\x04Ping\x12\x0b.WorkerInfo\x1a\x06.Emptyb\x06proto3"
+
+ pool = Google::Protobuf::DescriptorPool.generated_pool
+ pool.add_serialized_file(descriptor_data)
+
+ Empty = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("Empty").msgclass
+ WorkerInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("WorkerInfo").msgclass
+ RegisterWorkerResult = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("RegisterWorkerResult").msgclass
+ RegisterWorkerRequest = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("RegisterWorkerRequest").msgclass
data/lib/grpc/server_services_pb.rb ADDED
@@ -0,0 +1,21 @@
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # Source: server.proto for package ''
+
+ require 'grpc'
+ require_relative 'server_pb'
+
+ module MapReduceMaster
+ class Service
+
+ include ::GRPC::GenericService
+
+ self.marshal_class_method = :encode
+ self.unmarshal_class_method = :decode
+ self.service_name = 'MapReduceMaster'
+
+ rpc :RegisterWorker, ::RegisterWorkerRequest, ::RegisterWorkerResult
+ rpc :Ping, ::WorkerInfo, ::Empty
+ end
+
+ Stub = Service.rpc_stub_class
+ end
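For orientation, the generated MapReduceMaster::Stub is the client handle a worker uses to reach the master. A minimal sketch, assuming the master listens on localhost:50051 (the address hard-coded in lib/worker.rb) and using an insecure channel as the gem itself does:

```ruby
require 'securerandom'
require_relative 'server_services_pb' # path relative to lib/grpc/

# Open an insecure channel to the master and register a hypothetical worker.
stub = MapReduceMaster::Stub.new('localhost:50051', :this_channel_is_insecure)
result = stub.register_worker(
  RegisterWorkerRequest.new(uuid: SecureRandom.uuid, ip: 'localhost:30001', type: 'map')
)
puts result.result # => true once the master has recorded the worker
```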
data/lib/grpc/worker_pb.rb ADDED
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # source: worker.proto
+
+ require 'google/protobuf'
+
+ require_relative 'server_pb'
+
+
+ descriptor_data = "\n\x0cworker.proto\x1a\x0cserver.proto\"-\n\rMapInfoResult\x12\x0c\n\x04uuid\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x08\"7\n\x07MapInfo\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\r\n\x05\x62lock\x18\x02 \x01(\t\x12\x0b\n\x03key\x18\x03 \x01(\t\":\n\nReduceInfo\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\r\n\x05\x62lock\x18\x02 \x01(\t\x12\x0b\n\x03key\x18\x03 \x01(\t2R\n\x06Worker\x12 \n\x0cMapOperation\x12\x08.MapInfo\x1a\x06.Empty\x12&\n\x0fReduceOperation\x12\x0b.ReduceInfo\x1a\x06.Emptyb\x06proto3"
+
+ pool = Google::Protobuf::DescriptorPool.generated_pool
+ pool.add_serialized_file(descriptor_data)
+
+ MapInfoResult = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("MapInfoResult").msgclass
+ MapInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("MapInfo").msgclass
+ ReduceInfo = ::Google::Protobuf::DescriptorPool.generated_pool.lookup("ReduceInfo").msgclass
data/lib/grpc/worker_services_pb.rb ADDED
@@ -0,0 +1,21 @@
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # Source: worker.proto for package ''
+
+ require 'grpc'
+ require_relative 'worker_pb'
+
+ module WorkerServer
+ class Service
+
+ include ::GRPC::GenericService
+
+ self.marshal_class_method = :encode
+ self.unmarshal_class_method = :decode
+ self.service_name = 'Worker'
+
+ rpc :MapOperation, ::MapInfo, ::Empty
+ rpc :ReduceOperation, ::ReduceInfo, ::Empty
+ end
+
+ Stub = Service.rpc_stub_class
+ end
data/lib/map_reduce.rb ADDED
@@ -0,0 +1,139 @@
+ require_relative './grpc/server_services_pb'
+ require_relative './grpc/worker_services_pb'
+ require 'grpc'
+ require 'async'
+ require 'async/semaphore'
+ require 'pathname'
+ require_relative 'worker'
+ require 'digest'
+ require 'method_source'
+
+ class MapReduce < MapReduceMaster::Service
+ attr_accessor :logger, :worker_count, :map_count, :data, :files, :map_finished
+
+ def initialize(logger:, map_count: 5, file:)
+ @file = file
+ @map_count = map_count
+ @worker_count = 0
+ @logger = logger
+ @data = []
+ @files = nil
+ @map_finished = false
+ end
+
+ def ping(worker_req, _)
+ uuid = worker_req.uuid
+ success = worker_req.success
+ worker = data.find { |w| w[:uuid] == uuid }
+ worker[:status] = 0
+
+ if success == 'true'
+ logger.info("[Master] Worker #{uuid} completed the map operation successfully")
+ else
+ logger.info("[Master] Worker #{uuid} failed to complete the map operation")
+ @files << worker_req.filename
+ end
+
+ Empty.new
+ end
+
+ def register_worker(worker_req, _)
+ uuid = worker_req.uuid
+ type = worker_req.type
+ ip = worker_req.ip
+ mutex = Mutex.new
+ mutex.lock
+ data << ({ uuid:, ip:, status: 0 })
+ # This counter is also protected by the Ruby GIL
+ @worker_count += 1
+ @logger.info('[Master] Worker registered successfully')
+ RegisterWorkerResult.new(result: true)
+ ensure
+ mutex.unlock
+ end
+
+ def wait_for_enough_workers
+ logger.info('[Master] Waiting for the creation of workers')
+ Worker.start_worker(logger, map_count)
+ logger.info('[Master] Finished!')
+ end
+
+ def reduce(&block)
+ Thread.new do
+ loop do
+ next unless @map_finished == true
+
+ block = block.source.sub(/^\s*master\.reduce do\s*\n/, '').sub(/^\s*end\s*\n/, '')
+ message = Base64.encode64(block)
+ worker = data.select { |w| w[:status] == 0 }.first
+
+ stub = WorkerServer::Stub.new(worker[:ip], :this_channel_is_insecure)
+ request = ReduceInfo.new(filename: "files/#{@encrypt_key}/map.txt", block: message, key: @encrypt_key)
+ worker[:status] = 'processing'
+ stub.reduce_operation(request)
+
+ break
+ end
+ end
+ end
+
+ def map(&block)
+ block = block.source.sub(/^\s*master\.map do\s*\n/, '').sub(/^\s*end\s*\n/, '')
+ message = Base64.encode64(block)
+ Thread.new do
+ loop do
+ @map_finished = true if files.empty?
+ break if files.empty?
+
+ Async do
+ workers = data.select { |w| w[:status] == 0 }.first(files.count)
+
+ semaphore = Async::Semaphore.new(workers.count)
+ tasks = []
+
+ workers.each do |worker|
+ tasks << semaphore.async do
+ stub = WorkerServer::Stub.new(worker[:ip], :this_channel_is_insecure)
+ request = MapInfo.new(filename: files.pop, block: message, key: @encrypt_key)
+ worker[:status] = 'processing'
+ stub.map_operation(request)
+ end
+ end
+ tasks.each(&:wait)
+ end.wait
+ end
+ end
+ end
+
+ def distribute_input
+ path_name = @file
+ key = path_name.to_path
+ logger.info('[Master] Starting to distribute input')
+ @files = split_files(key, path_name)
+ end
+
+ private
+
+ def split_files(key, file)
+ @encrypt_key = generate_digest_key(key)
+ p "Going to print encrypt_key = #{@encrypt_key}"
+ FileUtils.mkdir_p("./files/#{@encrypt_key}")
+ p ""
+ line_maximum = (File.open(file).count / @map_count).to_i
+ file_data = file.readlines.map(&:chomp)
+ file_number = file_data.length / line_maximum
+ files = []
+ file_number.times do |index|
+ path = "./files/#{@encrypt_key}/file_#{index}"
+ File.write(path, file_data.slice!(0..line_maximum))
+ files << path
+ end
+ files
+ end
+
+ def generate_digest_key(key)
+ digest = Digest::SHA256.new
+ digest.update(key)
+ digest.hexdigest
+ end
+ end
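The gem ships no README, so the following driver sketch is inferred from the code above: MapReduce#map and #reduce strip the `master.map do` / `master.reduce do` wrappers from the block source (via method_source) and ship the remaining body to a worker, where it is eval'd and then called, so the body is written to evaluate to a callable. The variable name `master`, the input path, and the lambda-returning convention are assumptions; the gRPC server that would expose the master's own ping/register_worker endpoints is not shown in the gem and is omitted here.

```ruby
require 'logger'
require_relative 'lib/map_reduce' # load path is illustrative

logger = Logger.new($stdout)
master = MapReduce.new(logger: logger, map_count: 5, file: File.open('input.txt'))

master.distribute_input        # split input.txt into files/<digest>/file_N chunks
master.wait_for_enough_workers # spawn the local gRPC workers

# Word count: the block body is eval'd on the worker and called with the
# contents of one chunk; emit/emit_intermediate collects [key, count] pairs.
master.map do
  lambda do |content|
    content.split.each { |word| emit(word, count: 1) }
  end
end

# The reduce body is called once per unique key with that key's [key, count]
# pairs; its return value is appended to files/<digest>/reduce.txt.
master.reduce do
  lambda do |pairs|
    "#{pairs.first&.first} #{pairs.sum { |(_word, count)| count }}"
  end
end
```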
data/lib/worker.rb ADDED
@@ -0,0 +1,118 @@
+ require_relative './grpc/worker_services_pb'
+ require_relative './grpc/server_services_pb'
+ require_relative './grpc/server_pb'
+ require 'base64'
+ require 'google/protobuf'
+ require 'async'
+ require 'logger'
+
+ class Worker < WorkerServer::Service
+ attr_accessor :worker_number, :master_ip, :port, :logger, :uuid, :result
+
+ def initialize(worker_number:, master_ip:, port:, logger:)
+ @worker_number = worker_number
+ @uuid = generate_uuid
+ @master_ip = master_ip
+ @port = port
+ @logger = logger
+ @result = []
+ end
+
+ def map_operation(worker_req, _)
+ block = eval(Base64.decode64(worker_req.block))
+ block.call(File.read(worker_req.filename))
+ File.open("files/#{worker_req.key}/map.txt", 'a') do |file|
+ result.each do |array|
+ file.puts array.inspect
+ end
+ end
+ logger.info("[Worker] Worker #{uuid} gRPC finished the map operation")
+ stub = MapReduceMaster::Stub.new(@master_ip, :this_channel_is_insecure)
+ request = WorkerInfo.new(uuid: @uuid, success: 'true', filename: worker_req.filename)
+ stub.ping(request)
+ Empty.new
+ rescue StandardError => e
+ logger.error("[Worker] Worker #{uuid} failed with error #{e}")
+ stub = MapReduceMaster::Stub.new(@master_ip, :this_channel_is_insecure)
+ request = WorkerInfo.new(uuid: @uuid, success: nil, filename: worker_req.filename)
+ stub.ping(request)
+ end
+
+ def reduce_operation(worker_req, _)
+ data = sort_map_file(worker_req.filename)
+ unique_keys = data.map { |item| item[0] }.uniq
+ file_path = "files/#{worker_req.key}/reduce.txt"
+ logger.info('[Worker] Starting Reduce Operation')
+ Async do
+ 1.upto(unique_keys.count) do |i|
+ Async do
+ results = data.select { |item| item[0] == unique_keys[i] }
+ block = eval(Base64.decode64(worker_req.block))
+ response = block.call(results)
+ File.open(file_path, 'a') do |file|
+ file.puts response
+ end
+ end
+ end
+ end
+ logger.info('[Worker] Finished Reduce Operation')
+ logger.info("[Worker] File stored at #{file_path}")
+ Empty.new
+ end
+
+ def start
+ grpc_server = GRPC::RpcServer.new
+ grpc_server.add_http2_port("0.0.0.0:#{port}", :this_port_is_insecure)
+ grpc_server.handle(self)
+ Thread.new do
+ grpc_server.run_till_terminated
+ ensure
+ logger.info('[Worker] Worker gRPC thread stopped')
+ end
+ logger.info('[Worker] Worker gRPC thread started')
+
+ logger.info('[Worker] Load functions finished')
+ register_worker
+ end
+
+ def self.start_worker(logger, worker_number)
+ master_ip = '0.0.0.0:50051'
+ Async do
+ 1.upto(worker_number) do |i|
+ Async do
+ worker = new(worker_number:, master_ip:, port: "3000#{i}", logger:)
+ worker.start
+ end
+ end
+ end
+ end
+
+ private
+
+ def sort_map_file(file_path)
+ file_data = []
+ File.open(file_path, 'r') do |file|
+ file.each_line do |line|
+ file_data << eval(line.strip)
+ end
+ end
+ file_data.sort_by { |item| item[0] }
+ end
+
+ def emit_intermediate(k, count:)
+ result << [k, count]
+ end
+
+ alias emit emit_intermediate
+
+ def generate_uuid
+ SecureRandom.uuid
+ end
+
+ def register_worker
+ stub = MapReduceMaster::Stub.new(@master_ip, :this_channel_is_insecure)
+ request = RegisterWorkerRequest.new(uuid: @uuid, ip: "localhost:#{@port}", type: 'map')
+ stub.register_worker(request)
+ @logger.info('[Worker] Worker registered itself with the master')
+ end
+ end
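Workers are normally spawned by MapReduce#wait_for_enough_workers, but Worker.start_worker can also be called directly. A short sketch; the logger and worker count are arbitrary:

```ruby
require 'logger'
require_relative 'lib/worker' # load path is illustrative

# Start three in-process workers; each binds an insecure gRPC server on
# port 3000<i> and registers itself with the master at 0.0.0.0:50051
# (the address hard-coded in Worker.start_worker).
Worker.start_worker(Logger.new($stdout), 3)
```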
metadata ADDED
@@ -0,0 +1,77 @@
+ --- !ruby/object:Gem::Specification
+ name: reduce_map
+ version: !ruby/object:Gem::Version
+ version: 0.0.1
+ platform: ruby
+ authors:
+ - Francisco Paradela
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-07-07 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: async
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 2.12.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 2.12.0
+ - !ruby/object:Gem::Dependency
+ name: grpc
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 1.62.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 1.62.0
+ description: Ruby map/reduce framework
+ email: franciscoleite.dev@protonmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/grpc/server_pb.rb
+ - lib/grpc/server_services_pb.rb
+ - lib/grpc/worker_pb.rb
+ - lib/grpc/worker_services_pb.rb
+ - lib/map_reduce.rb
+ - lib/worker.rb
+ homepage: https://rubygems.org/gems/map_reduce
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirements: []
+ rubygems_version: 3.5.9
+ signing_key:
+ specification_version: 4
+ summary: This framework is designed to provide a fully multi-threaded, distributed
+ and asynchronous approach to MapReduce processing.
+ test_files: []