map-reduce-ruby 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 19ac70d25d6622d30398997cae0a4d6730ebc13f8bbc31df896c5acf480923c4
4
+ data.tar.gz: ede2a8f736053a268175a7948324a15954475526d0589bf47416347b57a50740
5
+ SHA512:
6
+ metadata.gz: 650125350b5f166ccc0fb1346c77dfa9f17471ac9ae9290b4cbc4731a5baec872a810551e0b15af064b37f73c7e103ab6281d48cd839f26d4aab6780d5cd66d1
7
+ data.tar.gz: 70d9f8668143cb507537e369ad0ea79d9bb9f2c33469dffccd5952116f09758a7a880831d81aa4e3b67bc548b4b36aafe321a8d553946d8734fb78c11cd1686e
@@ -0,0 +1,23 @@
1
+ name: test
2
+ on: [push, pull_request]
3
+ jobs:
4
+ build:
5
+ runs-on: ubuntu-latest
6
+ strategy:
7
+ matrix:
8
+ ruby: ['2.6', '2.7', '3.0']
9
+ steps:
10
+ - uses: actions/checkout@v1
11
+ - uses: actions/setup-ruby@v1
12
+ with:
13
+ ruby-version: ${{ matrix.ruby }}
14
+ - uses: actions/cache@v1
15
+ id: cache
16
+ with:
17
+ path: vendor/bundler
18
+ key: ${{ hashFiles('Gemfile.lock') }}-${{ matrix.ruby }}
19
+ - run: |
20
+ gem install bundler
21
+ bundle install --path=vendor/bundler
22
+ bundle exec rspec
23
+ bundle exec rubocop
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,51 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Naming/FileName:
5
+ Exclude:
6
+ - lib/map-reduce-ruby.rb
7
+
8
+ Style/StringConcatenation:
9
+ Exclude:
10
+ - spec/**/*.rb
11
+
12
+ Metrics/BlockLength:
13
+ Enabled: false
14
+
15
+ Gemspec/RequiredRubyVersion:
16
+ Enabled: false
17
+
18
+ Style/MutableConstant:
19
+ Enabled: false
20
+
21
+ Metrics/MethodLength:
22
+ Enabled: false
23
+
24
+ Style/Documentation:
25
+ Enabled: false
26
+
27
+ Style/NumericPredicate:
28
+ Enabled: false
29
+
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+
33
+ Metrics/CyclomaticComplexity:
34
+ Enabled: false
35
+
36
+ Metrics/PerceivedComplexity:
37
+ Enabled: false
38
+
39
+ Style/StringLiterals:
40
+ Enabled: true
41
+ EnforcedStyle: double_quotes
42
+
43
+ Style/StringLiteralsInInterpolation:
44
+ Enabled: true
45
+ EnforcedStyle: double_quotes
46
+
47
+ Layout/LineLength:
48
+ Max: 120
49
+
50
+ Style/FrozenStringLiteralComment:
51
+ EnforcedStyle: never
data/CHANGELOG.md ADDED
@@ -0,0 +1 @@
1
+ # CHANGELOG
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in map_reduce.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,58 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ map-reduce-ruby (1.0.0)
5
+ json
6
+ lazy_priority_queue
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.2)
12
+ diff-lcs (1.4.4)
13
+ json (2.5.1)
14
+ lazy_priority_queue (0.1.1)
15
+ parallel (1.20.1)
16
+ parser (3.0.0.0)
17
+ ast (~> 2.4.1)
18
+ rainbow (3.0.0)
19
+ regexp_parser (2.0.3)
20
+ rexml (3.2.4)
21
+ rspec (3.10.0)
22
+ rspec-core (~> 3.10.0)
23
+ rspec-expectations (~> 3.10.0)
24
+ rspec-mocks (~> 3.10.0)
25
+ rspec-core (3.10.1)
26
+ rspec-support (~> 3.10.0)
27
+ rspec-expectations (3.10.1)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.10.0)
30
+ rspec-mocks (3.10.1)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.10.0)
33
+ rspec-support (3.10.1)
34
+ rubocop (0.93.1)
35
+ parallel (~> 1.10)
36
+ parser (>= 2.7.1.5)
37
+ rainbow (>= 2.2.2, < 4.0)
38
+ regexp_parser (>= 1.8)
39
+ rexml
40
+ rubocop-ast (>= 0.6.0)
41
+ ruby-progressbar (~> 1.7)
42
+ unicode-display_width (>= 1.4.0, < 2.0)
43
+ rubocop-ast (1.4.1)
44
+ parser (>= 2.7.1.5)
45
+ ruby-progressbar (1.11.0)
46
+ unicode-display_width (1.7.0)
47
+
48
+ PLATFORMS
49
+ ruby
50
+ x86_64-linux
51
+
52
+ DEPENDENCIES
53
+ map-reduce-ruby!
54
+ rspec
55
+ rubocop
56
+
57
+ BUNDLED WITH
58
+ 2.2.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Benjamin Vetter
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,211 @@
1
+ # MapReduce
2
+
3
+ **The easiest way to write distributed, larger than memory map-reduce jobs**
4
+
5
+ The MapReduce gem provides the easiest way to write custom, distributed, larger
6
+ than memory map-reduce jobs by using your local disk and some arbitrary storage
7
+ layer like s3. You can specify how much memory you are willing to offer and
8
+ MapReduce will use its buffers accordingly. Finally, you can use your already
9
+ existing background job system like `sidekiq` or one of its various
10
+ alternatives. Additionally, your keys and values can be everything that can be
11
+ serialized as json.
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'map-reduce-ruby'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ $ bundle install
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install map-reduce-ruby
28
+
29
+ ## Usage
30
+
31
+ Any map-reduce job consists of an implementation of your `map` function, your
32
+ `reduce` function and worker code. So let's start with an implementation for a
33
+ word count map-reduce task which fetches txt documents from the web. Please
34
+ note that your keys and values can be everything that can be serialized as
35
+ json, but nothing else.
36
+
37
+ ```ruby
38
+ class WordCounter
39
+ def map(url)
40
+ HTTP.get(url).to_s.split.each do |word|
41
+ yield(word, 1)
42
+ end
43
+ end
44
+
45
+ def reduce(word, count1, count2)
46
+ count1 + count2
47
+ end
48
+ end
49
+ ```
50
+
51
+ The `#map` method takes some key, e.g. a url, and yields an arbitrary amount of
52
+ key-value pairs. The `#reduce` method takes the key as well as two values and
53
+ should return a single reduced value.
54
+
55
+ Next, we need some worker code to run the mapping part:
56
+
57
+ ```ruby
58
+ class WordCountMapper
59
+ def perform(job_id, mapper_id, url)
60
+ mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
61
+ mapper.map(url)
62
+
63
+ mapper.shuffle do |partition, tempfile|
64
+ # store content of tempfile e.g. on s3:
65
+ bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
66
+ end
67
+ end
68
+ end
69
+ ```
70
+
71
+ Please note that `MapReduce::HashPartitioner.new(16)` states that we want to split
72
+ the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
73
+ worker code to run the reduce part:
74
+
75
+ ```ruby
76
+ class WordCountReducer
77
+ def perform(job_id, partition)
78
+ reducer = MapReduce::Reducer.new(WordCounter.new)
79
+
80
+ # fetch all chunks of the partitions e.g. from s3:
81
+ bucket.list(prefix: "map_reduce/jobs/#{job_id}/partitions/#{partition}/").each do |object|
82
+ chunk_path = reducer.add_chunk # returns a path to a tempfile
83
+
84
+ object.download_file(chunk_path)
85
+ end
86
+
87
+ reducer.reduce(chunk_limit: 32) do |word, count|
88
+ # each word with its final count
89
+ end
90
+ end
91
+ end
92
+ ```
93
+
94
+ Please note that `MapReduce::Reducer#add_chunk` returns a path to a tempfile,
95
+ not a `Tempfile` object. This allows limiting the number of open file
96
+ descriptors.
97
+
98
+ To run your mappers, you can do:
99
+
100
+ ```ruby
101
+ job_id = SecureRandom.hex
102
+
103
+ list_of_urls.each_with_index do |url, index|
104
+ WordCountMapper.perform_async(job_id, index, url)
105
+ end
106
+ ```
107
+
108
+ And to run your reducers:
109
+
110
+ ```ruby
111
+ (0..15).each do |partition|
112
+ WordCountReducer.perform_async(job_id, partition)
113
+ end
114
+ ```
115
+
116
+ How to automate running the mappers and reducers in sequence, depends on your
117
+ background job system. The most simple approach is e.g. to track your mapper
118
+ state in redis and have a job to start your reducers which waits up until your
119
+ mappers are finished.
120
+
121
+ That's it.
122
+
123
+ ## Internals
124
+
125
+ To fully understand the performance details, the following outlines the inner
126
+ workings of MapReduce. Of course, feel free to check the code as well.
127
+
128
+ `MapReduce::Mapper#map` calls your `map` implementation and adds each yielded
129
+ key-value pair to an internal buffer up until the memory limit is reached.
130
+ When the memory limit is reached, the buffer is sorted by key and fed through
131
+ your `reduce` implementation already, as this can greatly reduce the amount of
132
+ data already. The result is written to a tempfile. This proceeds up until all
133
+ key-value pairs are yielded. `MapReduce::Mapper#shuffle` then reads the first
134
+ key-value pair of all already sorted chunk tempfiles and adds them to a
135
+ priority queue using a binomial heap, such that with every `pop` operation on
136
+ that heap, we get items sorted by key. When the item returned by `pop` e.g.
137
+ belongs to the second chunk, then the next key-value pair of the second chunk
138
+ is subsequently read and added to the priority queue, up until no more pairs
139
+ are available. This guarantees that we sort all chunks without fully loading
140
+ them into memory and is called `k-way-merge`. With every `pop` operation, your
141
+ `reduce` implementation is continuously called up until the key changes between
142
+ two calls to `pop`. When the key changes, the key is known to be fully reduced,
143
+ such that the key is hashed modulo the number of partitions and gets written to
144
+ the correct partition tempfile (when `MapReduce::HashPartitioner` is used).
145
+
146
+ The resulting partition tempfiles need to be stored in some global storage
147
+ system like s3, such that your mapper workers can upload them and the reducer
148
+ workers can download them.
149
+
150
+ `MapReduce::Reducer#add_chunk` adds and registers a new tempfile path such that
151
+ your reducer can download a mapper file for the particular partition and write
152
+ its contents to that tempfile path. `MapReduce::Reducer#reduce` finally again
153
+ builds up a priority queue and performs `k-way-merge`, feeds the key-value
154
+ pairs into your reduce implementation up until a key change between `pop`
155
+ operations occurs and yields the fully reduced key-value pair. At the end
156
+ `#reduce` removes all the tempfiles. You can pass a `chunk_limit` to
157
+ `MapReduce::Reducer#reduce`, which is most useful when you run on a system with
158
+ a limited number of open file descriptors allowed. The `chunk_limit` ensures
159
+ that only the specified amount of chunks are processed in a single run. A run
160
+ basically means: it takes up to `chunk_limit` chunks, reduces them and pushes
161
+ the result as a new chunk to the list of chunks to process. Thus, if your
162
+ number of file descriptors is unlimited, you want to set it to a higher number
163
+ to avoid the overhead of multiple runs.
164
+
165
+ ## Partitioners
166
+
167
+ Partitioners are used to split the dataset into a specified amount of
168
+ partitions, which allows to parallelize the work to be done by reducers.
169
+ MapReduce comes with a `HashPartitioner`, which takes the number of partitions
170
+ as an argument and derives the partition number from the key as follows:
171
+
172
+ ```ruby
173
+ class HashPartitioner
174
+ def initialize(num_partitions)
175
+ @num_partitions = num_partitions
176
+ end
177
+
178
+ def call(key)
179
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
180
+ end
181
+ end
182
+ ```
183
+
184
+ Thus, writing your own custom partitioner is really easy and, as it follows the
185
+ interface of callables, could even be expressed as a simple one-liner:
186
+
187
+ ```ruby
188
+ MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
189
+ ```
190
+
191
+ ## Development
192
+
193
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
194
+ `rake spec` to run the tests. You can also run `bin/console` for an interactive
195
+ prompt that will allow you to experiment.
196
+
197
+ To install this gem onto your local machine, run `bundle exec rake install`. To
198
+ release a new version, update the version number in `version.rb`, and then run
199
+ `bundle exec rake release`, which will create a git tag for the version, push
200
+ git commits and the created tag, and push the `.gem` file to
201
+ [rubygems.org](https://rubygems.org).
202
+
203
+ ## Contributing
204
+
205
+ Bug reports and pull requests are welcome on GitHub at
206
+ https://github.com/mrkamel/map-reduce-ruby
207
+
208
+ ## License
209
+
210
+ The gem is available as open source under the terms of the [MIT
211
+ License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ require "rubocop/rake_task"
7
+
8
+ RuboCop::RakeTask.new
9
+
10
+ task default: %i[spec rubocop]
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "map_reduce"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1 @@
1
+ require "map_reduce"
data/lib/map_reduce.rb ADDED
@@ -0,0 +1,16 @@
1
+ require "tempfile"
2
+ require "json"
3
+ require "digest"
4
+ require "fileutils"
5
+ require "tmpdir"
6
+ require "lazy_priority_queue"
7
+ require "map_reduce/version"
8
+ require "map_reduce/priority_queue"
9
+ require "map_reduce/temp_path"
10
+ require "map_reduce/mergeable"
11
+ require "map_reduce/reduceable"
12
+ require "map_reduce/hash_partitioner"
13
+ require "map_reduce/mapper"
14
+ require "map_reduce/reducer"
15
+
16
+ module MapReduce; end
@@ -0,0 +1,32 @@
1
+ module MapReduce
2
+ # The MapReduce::HashPartitioner calculates a partition for the passed keys
3
+ # using SHA1 modulo the desired number of partitions.
4
+
5
+ class HashPartitioner
6
+ # Initializes a HashPartitioner.
7
+ #
8
+ # @param num_partitions [Fixnum] The desired number of partitions.
9
+ # Typically 8, 16, 32, 64, etc. but can be everything according to your
10
+ # needs.
11
+ #
12
+ # @example
13
+ # MapReduce::HashPartitioner.new(16)
14
+
15
+ def initialize(num_partitions)
16
+ @num_partitions = num_partitions
17
+ end
18
+
19
+ # Calculates the partition for the specified key.
20
+ #
21
+ # @param key The key to calculate the partition for. Can be everything
22
+ # that can be serialized as json.
23
+ # @returns [Integer] The partition number.
24
+ #
25
+ # @example
26
+ # partitioner.call("some key")
27
+
28
+ def call(key)
29
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,113 @@
1
+ module MapReduce
2
+ # The MapReduce::Mapper class runs the mapping part of your map-reduce job.
3
+
4
+ class Mapper
5
+ include Mergeable
6
+ include Reduceable
7
+ include MonitorMixin
8
+
9
+ attr_reader :partitions
10
+
11
+ # Initializes a new mapper.
12
+ #
13
+ # @param implementation Your map-reduce implementation, i.e. an object
14
+ # which responds to #map and #reduce.
15
+ # @param partitioner [#call] A partitioner, i.e. an object which responds
16
+ # to #call and calculates a partition for the passed key.
17
+ # @param memory_limit [#to_i] The memory limit, i.e. the buffer size in
18
+ # bytes.
19
+ #
20
+ # @example
21
+ # MapReduce::Mapper.new(MyImplementation.new, partitioner: HashPartitioner.new(16), memory_limit: 100.megabytes)
22
+
23
+ def initialize(implementation, partitioner: HashPartitioner.new(32), memory_limit: 100 * 1024 * 1024)
24
+ super()
25
+
26
+ @implementation = implementation
27
+ @partitioner = partitioner
28
+ @memory_limit = memory_limit.to_i
29
+
30
+ @buffer_size = 0
31
+ @buffer = []
32
+ @chunks = []
33
+ end
34
+
35
+ # Passes the received key to your map-reduce implementation and adds
36
+ # yielded key-value pair to a buffer. When the memory limit is reached, the
37
+ # chunk is sorted and written to a tempfile.
38
+ #
39
+ # @param key The key to pass to the map-reduce implementation.
40
+ #
41
+ # @example
42
+ # mapper.map("some_key")
43
+ # mapper.map("other_key")
44
+
45
+ def map(*args, **kwargs)
46
+ @implementation.map(*args, **kwargs) do |new_key, new_value|
47
+ synchronize do
48
+ @buffer.push([new_key, new_value])
49
+
50
+ @buffer_size += JSON.generate([new_key, new_value]).bytesize
51
+
52
+ write_chunk if @buffer_size >= @memory_limit
53
+ end
54
+ end
55
+ end
56
+
57
+ # Performs a k-way-merge of the sorted chunks written to tempfiles while
58
+ # already reducing the result using your map-reduce implementation and
59
+ # splitting the dataset into partitions. Finally yields each partition with
60
+ # the tempfile containing the data of the partition.
61
+ #
62
+ # @example
63
+ # mapper.shuffle do |partition, tempfile|
64
+ # # store data e.g. on s3
65
+ # end
66
+
67
+ def shuffle(&block)
68
+ return enum_for(:shuffle) unless block_given?
69
+
70
+ write_chunk if @buffer_size > 0
71
+
72
+ partitions = {}
73
+
74
+ reduce_chunk(k_way_merge(@chunks), @implementation).each do |pair|
75
+ partition = @partitioner.call(pair[0])
76
+
77
+ (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
78
+ end
79
+
80
+ @chunks.each { |tempfile| tempfile.close(true) }
81
+ @chunks = []
82
+
83
+ partitions.each_value(&:rewind)
84
+
85
+ partitions.each do |partition, tempfile|
86
+ block.call(partition, tempfile)
87
+ end
88
+
89
+ partitions.each_value { |tempfile| tempfile.close(true) }
90
+
91
+ nil
92
+ end
93
+
94
+ private
95
+
96
+ def write_chunk
97
+ tempfile = Tempfile.new
98
+
99
+ @buffer.sort_by! { |item| JSON.generate(item.first) }
100
+
101
+ reduce_chunk(@buffer, @implementation).each do |pair|
102
+ tempfile.puts JSON.generate(pair)
103
+ end
104
+
105
+ tempfile.rewind
106
+
107
+ @chunks.push(tempfile)
108
+
109
+ @buffer_size = 0
110
+ @buffer = []
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,56 @@
1
+ module MapReduce
2
+ # The MapReduce::Mergeable mixin provides the k-way-merge operation used by
3
+ # mappers as well as reducers.
4
+
5
+ module Mergeable
6
+ private
7
+
8
+ # Performs the k-way-merge of the passed files using a priority queue using
9
+ # a binomial heap. The content of the passed files needs to be sorted. It
10
+ # starts by reading one item of each file and adding it to the priority
11
+ queue. Afterwards, it continuously pops an item from the queue, yields it
12
+ # and reads a new item from the file the popped item belongs to, adding the
13
+ # read item to the queue. This continues up until all items from the files
14
+ # have been read. This guarantees that the yielded key-value pairs are
15
+ # sorted without having all items in-memory.
16
+ #
17
+ # @param files [IO, Tempfile] The files to run the k-way-merge for. The
18
+ # content of the files must be sorted.
19
+
20
+ def k_way_merge(files)
21
+ return enum_for(:k_way_merge, files) unless block_given?
22
+
23
+ queue = PriorityQueue.new
24
+
25
+ files.each_with_index do |file, index|
26
+ line = file.eof? ? nil : file.readline
27
+
28
+ next unless line
29
+
30
+ key, value = JSON.parse(line)
31
+
32
+ queue.push([key, value, index], JSON.generate(key))
33
+ end
34
+
35
+ loop do
36
+ key, value, index = queue.pop
37
+
38
+ return unless index
39
+
40
+ yield([key, value])
41
+
42
+ line = files[index].yield_self { |file| file.eof? ? nil : file.readline }
43
+
44
+ next unless line
45
+
46
+ key, value = JSON.parse(line)
47
+
48
+ queue.push([key, value, index], JSON.generate(key))
49
+ end
50
+
51
+ files.each(&:rewind)
52
+
53
+ nil
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,49 @@
1
+ module MapReduce
2
+ # The MapReduce::PriorityQueue implements a min priority queue using a
3
+ # binomial heap.
4
+
5
+ class PriorityQueue
6
+ # Initializes the priority queue.
7
+ #
8
+ # @example
9
+ # MapReduce::PriorityQueue.new
10
+
11
+ def initialize
12
+ @queue = MinPriorityQueue.new
13
+ @sequence_number = 0
14
+ end
15
+
16
+ # Adds a new item to the priority queue while the key is used for sorting.
17
+ # The object and key can basically be everything, but the key must be some
18
+ # comparable object.
19
+ #
20
+ # @param object The object to add to the priority queue.
21
+ # @param key The key to use for sorting.
22
+ #
23
+ # @example
24
+ # priority_queue = MapReduce::PriorityQueue.new
25
+ # priority_queue.push("some object", "some key")
26
+
27
+ def push(object, key)
28
+ @queue.push([@sequence_number, object], key)
29
+
30
+ @sequence_number += 1
31
+ end
32
+
33
+ # Pops the min item from the queue.
34
+ #
35
+ # @returns The popped object.
36
+ #
37
+ # @example
38
+ # priority_queue = MapReduce::PriorityQueue.new
39
+ # priority_queue.push("object1", "key1")
40
+ # priority_queue.push("object2", "key2")
41
+ # priority_queue.pop
42
+
43
+ def pop
44
+ _, object = @queue.pop
45
+
46
+ object
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,38 @@
1
+ module MapReduce
2
+ # The MapReduce::Reduceable mixin allows to reduce an arbitrary chunk using
3
+ # the specified map-reduce implementation.
4
+
5
+ module Reduceable
6
+ private
7
+
8
+ # Reduces the specified chunk, e.g. some enumerable, using the specified
9
+ # map-reduce implementation using a lookahead of one to detect key changes.
10
+ # The reduce implementation is called up until a key change is detected,
11
+ # because the key change signals that the reduce operation is finished for
12
+ # the particular key, such that it will then be yielded.
13
+ #
14
+ # @param chunk The chunk to be reduced. Can e.g. be some enumerable.
15
+ # @param implementation The map-reduce implementation.
16
+
17
+ def reduce_chunk(chunk, implementation)
18
+ return enum_for(:reduce_chunk, chunk, implementation) unless block_given?
19
+
20
+ last_item = chunk.inject do |prev_item, cur_item|
21
+ prev_key = prev_item[0]
22
+
23
+ # Here we can compare without serializing the keys to json first,
24
+ # because we reduce a chunk which includes a deserialization step.
25
+
26
+ if prev_key == cur_item[0]
27
+ [prev_key, implementation.reduce(prev_key, prev_item[1], cur_item[1])]
28
+ else
29
+ yield(prev_item)
30
+
31
+ cur_item
32
+ end
33
+ end
34
+
35
+ yield(last_item) if last_item
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,107 @@
1
+ module MapReduce
2
+ # The MapReduce::Reducer class runs the reducer part of your map-reduce job.
3
+
4
+ class Reducer
5
+ include Mergeable
6
+ include Reduceable
7
+ include MonitorMixin
8
+
9
+ class InvalidChunkLimit < StandardError; end
10
+
11
+ # Initializes a new reducer.
12
+ #
13
+ # @param implementation Your map-reduce implementation, i.e. an object
14
+ # which responds to #map and #reduce.
15
+ #
16
+ # @example
17
+ # MapReduce::Reducer.new(MyImplementation.new)
18
+
19
+ def initialize(implementation)
20
+ super()
21
+
22
+ @implementation = implementation
23
+
24
+ @temp_paths ||= []
25
+ end
26
+
27
+ # Adds a chunk from the mapper-phase to the reducer by registering a
28
+ # tempfile and returning the path to that tempfile, such that you can
29
+ # download a chunk e.g. from s3 and write the content to this tempfile.
30
+ #
31
+ # @returns [String] The path to a tempfile.
32
+ #
33
+ # @example
34
+ # chunk_path = reducer.add_chunk
35
+ # File.write(chunk_path, "downloaded blob")
36
+
37
+ def add_chunk
38
+ temp_path = TempPath.new
39
+
40
+ synchronize do
41
+ @temp_paths.push(temp_path)
42
+ end
43
+
44
+ temp_path.path
45
+ end
46
+
47
+ # Performs a k-way-merge of the added chunks and yields the reduced
48
+ # key-value pairs. It performs multiple runs when more than `chunk_limit`
49
+ # chunks exist. A run means: it takes up to `chunk_limit` chunks,
50
+ # reduces them and pushes the result as a new chunk. At the end it
51
+ # removes all tempfiles, even if errors occur.
52
+ #
53
+ # @param chunk_limit [Integer] The maximum number of files to process
54
+ # during a single run. Most useful when you run on a system where the
55
+ # number of open file descriptors is limited. If your number of file
56
+ # descriptors is unlimited, you want to set it to a higher number to
57
+ # avoid the overhead of multiple runs.
58
+ #
59
+ # @example
60
+ # reducer = MapReduce::Reducer.new(MyImplementation.new)
61
+ #
62
+ # chunk1_path = reducer.add_chunk
63
+ # # write data to the file
64
+ #
65
+ # chunk2_path = reducer.add_chunk
66
+ # # write data to the file
67
+ #
68
+ # reducer.reduce(chunk_limit: 32) do |key, value|
69
+ # # ...
70
+ # end
71
+
72
+ def reduce(chunk_limit:, &block)
73
+ return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
74
+
75
+ raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
76
+
77
+ begin
78
+ loop do
79
+ slice = @temp_paths.shift(chunk_limit)
80
+ files = slice.select { |temp_path| File.exist?(temp_path.path) }
81
+ .map { |temp_path| File.open(temp_path.path, "r") }
82
+
83
+ begin
84
+ if @temp_paths.empty?
85
+ reduce_chunk(k_way_merge(files), @implementation).each do |pair|
86
+ block.call(pair)
87
+ end
88
+
89
+ return
90
+ end
91
+
92
+ File.open(add_chunk, "w") do |file|
93
+ reduce_chunk(k_way_merge(files), @implementation).each do |pair|
94
+ file.puts JSON.generate(pair)
95
+ end
96
+ end
97
+ ensure
98
+ files.each(&:close)
99
+ slice.each(&:delete)
100
+ end
101
+ end
102
+ ensure
103
+ @temp_paths.each(&:delete)
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,42 @@
1
+ module MapReduce
2
+ # The MapReduce::TempPath generates a tempfile path and automatically deletes
3
+ # the file when the object is garbage collected or manually deleted. Using
4
+ # this class instead of Tempfile allows to have less open file descriptors.
5
+
6
+ class TempPath
7
+ attr_reader :path
8
+
9
+ # Initializes a new tempfile path.
10
+ #
11
+ # @example
12
+ # temp_path = MapReduce::TempPath.new
13
+ # File.write(temp_path.path, "blob")
14
+
15
+ def initialize
16
+ @path = Dir::Tmpname.create("") do
17
+ # nothing
18
+ end
19
+
20
+ FileUtils.touch(@path)
21
+
22
+ ObjectSpace.define_finalizer(self, self.class.finalize(@path))
23
+ end
24
+
25
+ # @api private
26
+
27
+ def self.finalize(path)
28
+ proc { FileUtils.rm_f(path) }
29
+ end
30
+
31
+ # Allows to manually delete the tempfile.
32
+ #
33
+ # @example
34
+ # temp_path = MapReduce::TempPath.new
35
+ # File.write(temp_path.path, "blob")
36
+ # temp_path.delete
37
+
38
+ def delete
39
+ FileUtils.rm_f(path)
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,3 @@
1
+ module MapReduce
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,38 @@
1
+ require_relative "lib/map_reduce/version"
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "map-reduce-ruby"
5
+ spec.version = MapReduce::VERSION
6
+ spec.authors = ["Benjamin Vetter"]
7
+ spec.email = ["vetter@flakks.com"]
8
+
9
+ spec.summary = "The easiest way to write distributed, larger than memory map-reduce jobs"
10
+ spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
11
+ "than memory map-reduce jobs"
12
+ spec.homepage = "https://github.com/mrkamel/map-reduce-ruby"
13
+ spec.license = "MIT"
14
+
15
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = "https://github.com/mrkamel/map-reduce-ruby"
19
+ spec.metadata["changelog_uri"] = "https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md"
20
+
21
+ # Specify which files should be added to the gem when it is released.
22
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
24
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
25
+ end
26
+ spec.bindir = "exe"
27
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ["lib"]
29
+
30
+ spec.add_development_dependency "rspec"
31
+ spec.add_development_dependency "rubocop"
32
+
33
+ spec.add_dependency "json"
34
+ spec.add_dependency "lazy_priority_queue"
35
+
36
+ # For more information and examples about making a new gem, checkout our
37
+ # guide at: https://bundler.io/guides/creating_gem.html
38
+ end
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: map-reduce-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Benjamin Vetter
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-07-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: lazy_priority_queue
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: The MapReduce gem is the easiest way to write custom, distributed, larger
70
+ than memory map-reduce jobs
71
+ email:
72
+ - vetter@flakks.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".github/workflows/test.yml"
78
+ - ".gitignore"
79
+ - ".rspec"
80
+ - ".rubocop.yml"
81
+ - CHANGELOG.md
82
+ - Gemfile
83
+ - Gemfile.lock
84
+ - LICENSE.txt
85
+ - README.md
86
+ - Rakefile
87
+ - bin/console
88
+ - bin/setup
89
+ - lib/map-reduce-ruby.rb
90
+ - lib/map_reduce.rb
91
+ - lib/map_reduce/hash_partitioner.rb
92
+ - lib/map_reduce/mapper.rb
93
+ - lib/map_reduce/mergeable.rb
94
+ - lib/map_reduce/priority_queue.rb
95
+ - lib/map_reduce/reduceable.rb
96
+ - lib/map_reduce/reducer.rb
97
+ - lib/map_reduce/temp_path.rb
98
+ - lib/map_reduce/version.rb
99
+ - map-reduce-ruby.gemspec
100
+ homepage: https://github.com/mrkamel/map-reduce-ruby
101
+ licenses:
102
+ - MIT
103
+ metadata:
104
+ homepage_uri: https://github.com/mrkamel/map-reduce-ruby
105
+ source_code_uri: https://github.com/mrkamel/map-reduce-ruby
106
+ changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: 2.5.0
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubygems_version: 3.0.3
123
+ signing_key:
124
+ specification_version: 4
125
+ summary: The easiest way to write distributed, larger than memory map-reduce jobs
126
+ test_files: []