simple_map_reduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
# Register a `spec` Rake task using RSpec's bundled task class.
RSpec::Core::RakeTask.new(:spec)

# Make `spec` the task that runs when `rake` is invoked without arguments.
task(default: :spec)
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'simple_map_reduce'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require 'irb'
15
+ IRB.start(__FILE__)
data/bin/job_tracker ADDED
@@ -0,0 +1,18 @@
1
#!/bin/bash
#
# Start the job tracker through the gem's CLI (../exe/simple_map_reduce).
# With USE_DOCKER set to a non-empty value the docker-compose service
# hostnames (job_tracker, minio) are used; otherwise everything is on localhost.

# Work relative to this script's directory. Quoting keeps paths containing
# spaces intact, and we stop rather than run from the wrong directory.
cd "$(dirname "$0")" || exit 1

if [ -z "$USE_DOCKER" ]; then
  job_tracker_url="http://localhost:4567"
  s3_endpoint="http://localhost:9000"
else
  job_tracker_url="http://job_tracker:4567"
  s3_endpoint="http://minio:9000"
fi

# --s3_config is a key:value list; each following word is one more pair.
../exe/simple_map_reduce run_job_tracker! \
  --job-tracker-url="$job_tracker_url" \
  --s3_config=access_key_id:'MINIO_ACCESS_KEY' \
              secret_access_key:'MINIO_SECRET_KEY' \
              endpoint:"$s3_endpoint" \
              region:'us-east-1' \
              force_path_style:true
data/bin/job_worker1 ADDED
@@ -0,0 +1,22 @@
1
#!/bin/bash
#
# Start job worker #1 (port 4568) through the gem's CLI (../exe/simple_map_reduce).
# With USE_DOCKER set to a non-empty value the docker-compose service
# hostnames (job_tracker, job_worker1, minio) are used; otherwise localhost.

# Work relative to this script's directory. Quoting keeps paths containing
# spaces intact, and we stop rather than run from the wrong directory.
cd "$(dirname "$0")" || exit 1

if [ -z "$USE_DOCKER" ]; then
  job_tracker_url="http://localhost:4567"
  job_worker_url="http://localhost:4568"
  s3_endpoint="http://localhost:9000"
else
  job_tracker_url="http://job_tracker:4567"
  job_worker_url="http://job_worker1:4568"
  s3_endpoint="http://minio:9000"
fi

# --s3_config is a key:value list; each following word is one more pair.
../exe/simple_map_reduce run_job_worker! \
  --job-tracker-url="$job_tracker_url" \
  --job-worker-url="$job_worker_url" \
  --server-port=4568 \
  --s3_config=access_key_id:'MINIO_ACCESS_KEY' \
              secret_access_key:'MINIO_SECRET_KEY' \
              endpoint:"$s3_endpoint" \
              region:'us-east-1' \
              force_path_style:true
data/bin/job_worker2 ADDED
@@ -0,0 +1,22 @@
1
#!/bin/bash
#
# Start job worker #2 (port 4569) through the gem's CLI (../exe/simple_map_reduce).
# With USE_DOCKER set to a non-empty value the docker-compose service
# hostnames (job_tracker, job_worker2, minio) are used; otherwise localhost.

# Work relative to this script's directory. Quoting keeps paths containing
# spaces intact, and we stop rather than run from the wrong directory.
cd "$(dirname "$0")" || exit 1

if [ -z "$USE_DOCKER" ]; then
  job_tracker_url="http://localhost:4567"
  job_worker_url="http://localhost:4569"
  s3_endpoint="http://localhost:9000"
else
  job_tracker_url="http://job_tracker:4567"
  job_worker_url="http://job_worker2:4569"
  s3_endpoint="http://minio:9000"
fi

# --s3_config is a key:value list; each following word is one more pair.
../exe/simple_map_reduce run_job_worker! \
  --job-tracker-url="$job_tracker_url" \
  --job-worker-url="$job_worker_url" \
  --server-port=4569 \
  --s3_config=access_key_id:'MINIO_ACCESS_KEY' \
              secret_access_key:'MINIO_SECRET_KEY' \
              endpoint:"$s3_endpoint" \
              region:'us-east-1' \
              force_path_style:true
data/bin/job_worker3 ADDED
@@ -0,0 +1,22 @@
1
#!/bin/bash
#
# Start job worker #3 (port 4570) through the gem's CLI (../exe/simple_map_reduce).
# With USE_DOCKER set to a non-empty value the docker-compose service
# hostnames (job_tracker, job_worker3, minio) are used; otherwise localhost.

# Work relative to this script's directory. Quoting keeps paths containing
# spaces intact, and we stop rather than run from the wrong directory.
cd "$(dirname "$0")" || exit 1

if [ -z "$USE_DOCKER" ]; then
  job_tracker_url="http://localhost:4567"
  job_worker_url="http://localhost:4570"
  s3_endpoint="http://localhost:9000"
else
  job_tracker_url="http://job_tracker:4567"
  job_worker_url="http://job_worker3:4570"
  s3_endpoint="http://minio:9000"
fi

# --s3_config is a key:value list; each following word is one more pair.
../exe/simple_map_reduce run_job_worker! \
  --job-tracker-url="$job_tracker_url" \
  --job-worker-url="$job_worker_url" \
  --server-port=4570 \
  --s3_config=access_key_id:'MINIO_ACCESS_KEY' \
              secret_access_key:'MINIO_SECRET_KEY' \
              endpoint:"$s3_endpoint" \
              region:'us-east-1' \
              force_path_style:true
@@ -0,0 +1,80 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Submits a sample word-count job to the job tracker at http://localhost:4567.
#
# The map and reduce steps are shipped as Ruby source strings together with
# the name of the class each string defines. The job reads 'input.txt' from
# the 'input' bucket and writes under 'word_count' in the 'output' bucket.

require 'bundler/setup'
require 'simple_map_reduce'

# Map step: split the input on spaces, strip punctuation, downcase, and emit
# one {key: word, value: 1} JSON line per word.
# NOTE(review): in the delete! selector, `"-=` is read by String#delete as a
# character range ('"' through '='), which also removes digits and ':;<' etc.
# Confirm whether that is intended. A word consisting only of deleted
# characters is still emitted with an empty key.
# NOTE(review): the map script calls #to_json without requiring 'json' itself;
# presumably the process that evaluates it has JSON loaded — verify.
map_class_name = 'WordCount'
map_script = <<-'EOS'
class WordCount
  def map(input_data, output_io)
    input_data.split(' ').each do |raw_word|
      word = raw_word.strip
      next if word.empty?
      word.delete!('_=,.[]()#\'"-=~|&%')
      word.downcase!

      output_io.puts({ key: word, value: 1 }.to_json)
    end
  end
end
EOS

# Reduce step: sum the values per key (printing progress every 100 input
# lines), then emit one {word => total} JSON line per distinct key.
reduce_class_name = 'WordCount'
reduce_script = <<-'EOS'
require 'json'
class WordCount
  def reduce(input_io, output_io)
    output = Hash.new(0)
    count = 0
    input_io.each_line(chomp: true, rs: "\n") do |line|
      input = JSON.parse(line, symbolize_names: true)
      output[input[:key]] += input[:value]
      count += 1
      if count % 100 == 0
        puts "current count: #{count}"
      end
    end

    output.each do |key, value|
      output_io.puts(JSON.generate(Hash[key, value]))
    end
  end
end
EOS

job_input_directory_path = 'input.txt'
job_input_bucket_name = 'input'

job_output_directory_path = 'word_count'
job_output_bucket_name = 'output'

job = ::SimpleMapReduce::Server::Job.new(
  map_script: map_script,
  map_class_name: map_class_name,
  reduce_script: reduce_script,
  reduce_class_name: reduce_class_name,
  job_input_directory_path: job_input_directory_path,
  job_input_bucket_name: job_input_bucket_name,
  job_output_directory_path: job_output_directory_path,
  job_output_bucket_name: job_output_bucket_name
)

url = 'http://localhost:4567'
http_client = ::Faraday.new(
  url: url,
  headers: {
    # Media type without the stray trailing space the header used to carry.
    'Accept' => 'application/json',
    'Content-Type' => 'application/json'
  }
) do |faraday|
  faraday.response :logger
  faraday.adapter ::Faraday.default_adapter
end

# POST the serialized job to the tracker's /jobs endpoint.
response = http_client.post do |request|
  request.url('/jobs')
  request.body = job.to_h.to_json
end

puts response.inspect
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Install the gem's dependencies with Bundler.

# Strict mode: exit on errors, on unset variables, and on failures inside pipelines.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only.
IFS=$'\n\t'
# Echo script lines as they are read and commands as they are executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,98 @@
1
+ version: '2'
2
+ services:
3
+ job_tracker:
4
+ build: .
5
+ volumes:
6
+ - ./bin:/app/bin
7
+ - ./examples:/app/examples
8
+ - ./exe:/app/exe
9
+ - ./lib:/app/lib
10
+ - ./spec:/app/spec
11
+ - ./Gemfile:/app/Gemfile
12
+ - ./Rakefile:/app/Rakefile
13
+ - ./simple_map_reduce.gemspec:/app/simple_map_reduce.gemspec
14
+ - ./.gitignore:/app/.gitignore
15
+ - ./.rspec:/app/.rspec
16
+ - ./.rubocop.yml:/app/.rubocop.yml
17
+ ports:
18
+ - '4567:4567'
19
+ links:
20
+ - minio
21
+ environment:
22
+ USE_DOCKER: 'true'
23
+ command: bash -c "bin/job_tracker"
24
+ job_worker1:
25
+ build: .
26
+ volumes:
27
+ - ./bin:/app/bin
28
+ - ./examples:/app/examples
29
+ - ./exe:/app/exe
30
+ - ./lib:/app/lib
31
+ - ./spec:/app/spec
32
+ - ./Gemfile:/app/Gemfile
33
+ - ./Rakefile:/app/Rakefile
34
+ - ./simple_map_reduce.gemspec:/app/simple_map_reduce.gemspec
35
+ - ./.gitignore:/app/.gitignore
36
+ - ./.rspec:/app/.rspec
37
+ - ./.rubocop.yml:/app/.rubocop.yml
38
+ ports:
39
+ - '4568:4568'
40
+ links:
41
+ - job_tracker
42
+ - minio
43
+ environment:
44
+ USE_DOCKER: 'true'
45
+ command: bash -c "bin/job_worker1"
46
+ job_worker2:
47
+ build: .
48
+ volumes:
49
+ - ./bin:/app/bin
50
+ - ./examples:/app/examples
51
+ - ./exe:/app/exe
52
+ - ./lib:/app/lib
53
+ - ./spec:/app/spec
54
+ - ./Gemfile:/app/Gemfile
55
+ - ./Rakefile:/app/Rakefile
56
+ - ./simple_map_reduce.gemspec:/app/simple_map_reduce.gemspec
57
+ - ./.gitignore:/app/.gitignore
58
+ - ./.rspec:/app/.rspec
59
+ - ./.rubocop.yml:/app/.rubocop.yml
60
+ ports:
61
+ - '4569:4569'
62
+ environment:
63
+ USE_DOCKER: 'true'
64
+ links:
65
+ - job_tracker
66
+ - minio
67
+ command: bash -c "bin/job_worker2"
68
+ job_worker3:
69
+ build: .
70
+ volumes:
71
+ - ./bin:/app/bin
72
+ - ./examples:/app/examples
73
+ - ./exe:/app/exe
74
+ - ./lib:/app/lib
75
+ - ./spec:/app/spec
76
+ - ./Gemfile:/app/Gemfile
77
+ - ./Rakefile:/app/Rakefile
78
+ - ./simple_map_reduce.gemspec:/app/simple_map_reduce.gemspec
79
+ - ./.gitignore:/app/.gitignore
80
+ - ./.rspec:/app/.rspec
81
+ - ./.rubocop.yml:/app/.rubocop.yml
82
+ ports:
83
+ - '4570:4570'
84
+ environment:
85
+ USE_DOCKER: 'true'
86
+ links:
87
+ - job_tracker
88
+ - minio
89
+ command: bash -c "bin/job_worker3"
90
+ minio:
91
+ image: minio/minio
92
+ ports:
93
+ - '9000:9000'
94
+ environment:
95
+ - 'MINIO_ACCESS_KEY=MINIO_ACCESS_KEY'
96
+ - 'MINIO_SECRET_KEY=MINIO_SECRET_KEY'
97
+ - 'MINIO_REGION=us-east-1'
98
+ command: server /export
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require_relative '../lib/simple_map_reduce'
6
+ require 'thor'
7
+ require 'faker'
8
+ require 'aws-sdk'
9
+
10
+ module SimpleMapReduce
11
+ class Cli < ::Thor
12
# Thor command: start the job tracker server.
#
# Builds a SimpleMapReduce::Server::Config from the command-line options,
# copies its values onto the SimpleMapReduce module via
# assign_config_parameters, then runs SimpleMapReduce::Server::JobTracker
# bound to 0.0.0.0 on the configured port, calling setup_job_tracker from the
# block handed to run!.
desc 'run_job_tracker!', 'start job_tracker server'
method_option :job_tracker_url, type: :string, required: true
method_option :server_port, type: :numeric
method_option :s3_config, type: :hash
method_option :s3_input_bucket_name, type: :string
method_option :s3_intermediate_bucket_name, type: :string
method_option :s3_output_bucket_name, type: :string
def run_job_tracker!
  # Re-key the parsed options with symbols so they can be read uniformly below.
  cli_options = options.to_h.map { |name, value| [name.to_sym, value] }.to_h

  config = SimpleMapReduce::Server::Config.new(
    job_tracker_url: cli_options[:job_tracker_url],
    server_port: cli_options[:server_port],
    s3_config: cli_options[:s3_config],
    s3_input_bucket_name: cli_options[:s3_input_bucket_name],
    s3_intermediate_bucket_name: cli_options[:s3_intermediate_bucket_name],
    s3_output_bucket_name: cli_options[:s3_output_bucket_name]
  )
  assign_config_parameters(config)

  tracker = SimpleMapReduce::Server::JobTracker
  tracker.run!(port: config.server_port, bind: '0.0.0.0') { tracker.setup_job_tracker }
end
34
+
35
# Thor command: start a job worker server.
#
# Builds a SimpleMapReduce::Server::Config from the command-line options
# (including this worker's own URL), copies its values onto the
# SimpleMapReduce module via assign_config_parameters, then runs
# SimpleMapReduce::Server::JobWorker bound to 0.0.0.0 on the configured port,
# calling setup_worker from the block handed to run!.
#
# The usage string passed to `desc` now matches the command name; it
# previously read 'run_job_worker!!'.
desc 'run_job_worker!', 'start job_worker server'
method_option :job_tracker_url, type: :string, required: true
method_option :job_worker_url, type: :string, required: true
method_option :server_port, type: :numeric
method_option :s3_config, type: :hash
method_option :s3_input_bucket_name, type: :string
method_option :s3_intermediate_bucket_name, type: :string
method_option :s3_output_bucket_name, type: :string
def run_job_worker!
  # Re-key the parsed options with symbols so they can be read uniformly below.
  opts = options.to_h.map { |name, value| [name.to_sym, value] }.to_h
  config = SimpleMapReduce::Server::Config.new(
    job_tracker_url: opts[:job_tracker_url],
    job_worker_url: opts[:job_worker_url],
    server_port: opts[:server_port],
    s3_config: opts[:s3_config],
    s3_input_bucket_name: opts[:s3_input_bucket_name],
    s3_intermediate_bucket_name: opts[:s3_intermediate_bucket_name],
    s3_output_bucket_name: opts[:s3_output_bucket_name]
  )
  assign_config_parameters(config)
  SimpleMapReduce::Server::JobWorker.run!(port: config.server_port, bind: '0.0.0.0') do
    SimpleMapReduce::Server::JobWorker.setup_worker
  end
end
59
+
60
+ desc 'generate_lorem_text_data', 'generate test data'
61
+ method_option :count, type: :numeric, default: 1000, desc: 'The number of line count you want'
62
+ method_option :upload, type: :boolean, default: false, desc: 'The flag to upload to local minio bucket'
63
# Generate lorem-ipsum test data in ./input.txt and optionally upload it.
#
# Writes options[:count] Faker paragraphs, joined by newlines, to 'input.txt'
# in the current working directory. When options[:upload] is truthy the file
# content is also put to bucket 'input' under key 'input.txt' through an
# Aws::S3::Client using the fixed MinIO credentials below; the endpoint is
# http://minio:9000 when ENV['USE_DOCKER'] is set, else http://127.0.0.1:9000.
def generate_lorem_text_data
  # Block form guarantees the handle is closed even if generation or the
  # upload raises.
  File.open(File.join(Dir.pwd, 'input.txt'), 'w+') do |file|
    file.write(Faker::Lorem.paragraphs(options[:count]).join("\n"))
    puts 'input.txt generated'

    if options[:upload]
      config = {
        access_key_id: 'MINIO_ACCESS_KEY',
        secret_access_key: 'MINIO_SECRET_KEY',
        endpoint: ENV['USE_DOCKER'] ? 'http://minio:9000' : 'http://127.0.0.1:9000',
        region: 'us-east-1',
        force_path_style: true
      }
      s3 = ::Aws::S3::Client.new(config)
      # The write left the position at EOF; rewind so read returns everything.
      file.rewind
      s3.put_object(
        body: file.read,
        bucket: 'input',
        key: 'input.txt'
      )
      puts 'uploaded'
    end
  end

  puts 'done'
end
90
+
91
# Thor command: submit the sample word-count job to the job tracker.
#
# Builds a SimpleMapReduce::Server::Job whose map and reduce steps are the
# WordCount scripts below (shipped as Ruby source strings), reading
# 'input.txt' from the 'input' bucket and writing under 'word_count' in the
# 'output' bucket. The job is POSTed as JSON to /jobs on
# http://localhost:4567 and the inspected response is printed.
desc 'execute_word_count', 'execute word count sample job'
def execute_word_count
  # Map step: split on spaces, strip punctuation, downcase, and emit one
  # {key: word, value: 1} JSON line per word.
  # NOTE(review): in the delete! selector, `"-=` is read by String#delete as
  # a character range ('"' through '='), which also removes digits and ':;<'
  # etc. Confirm whether that is intended.
  map_class_name = 'WordCount'
  map_script = <<-'EOS'
    class WordCount
      def map(input_data, output_io)
        input_data.split(' ').each do |raw_word|
          word = raw_word.strip
          next if word.empty?
          word.delete!('_=,.[]()#\'"-=~|&%')
          word.downcase!

          output_io.puts({ key: word, value: 1 }.to_json)
        end
      end
    end
  EOS

  # Reduce step: sum the values per key (printing progress every 100 input
  # lines), then emit one {word => total} JSON line per distinct key.
  reduce_class_name = 'WordCount'
  reduce_script = <<-'EOS'
    require 'json'
    class WordCount
      def reduce(input_io, output_io)
        output = Hash.new(0)
        count = 0
        input_io.each_line(chomp: true, rs: "\n") do |line|
          input = JSON.parse(line, symbolize_names: true)
          output[input[:key]] += input[:value]
          count += 1
          if count % 100 == 0
            puts "current count: #{count}"
          end
        end

        output.each do |key, value|
          output_io.puts(JSON.generate(Hash[key, value]))
        end
      end
    end
  EOS

  job_input_directory_path = 'input.txt'
  job_input_bucket_name = 'input'

  job_output_directory_path = 'word_count'
  job_output_bucket_name = 'output'

  job = ::SimpleMapReduce::Server::Job.new(
    map_script: map_script,
    map_class_name: map_class_name,
    reduce_script: reduce_script,
    reduce_class_name: reduce_class_name,
    job_input_directory_path: job_input_directory_path,
    job_input_bucket_name: job_input_bucket_name,
    job_output_directory_path: job_output_directory_path,
    job_output_bucket_name: job_output_bucket_name
  )

  url = 'http://localhost:4567'
  http_client = ::Faraday.new(
    url: url,
    headers: {
      # Media type without the stray trailing space the header used to carry.
      'Accept' => 'application/json',
      'Content-Type' => 'application/json'
    }
  ) do |faraday|
    faraday.response :logger
    faraday.adapter ::Faraday.default_adapter
  end

  # POST the serialized job to the tracker's /jobs endpoint.
  response = http_client.post do |request|
    request.url('/jobs')
    request.body = job.to_h.to_json
  end

  puts response.inspect
end
168
+
169
+ private
170
+
171
# Copy each setting from the given config object onto the SimpleMapReduce
# module through the module's writer of the same name.
#
# config must respond to every reader listed below; the settings are assigned
# in the listed order.
def assign_config_parameters(config)
  setting_names = %i[
    s3_config
    s3_input_bucket_name
    s3_intermediate_bucket_name
    s3_output_bucket_name
    logger
    job_tracker_url
    job_worker_url
  ]

  setting_names.each do |setting|
    SimpleMapReduce.public_send("#{setting}=", config.public_send(setting))
  end
end
180
+ end
181
+ end
182
+
183
+ SimpleMapReduce::Cli.start