map-reduce-ruby 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 19ac70d25d6622d30398997cae0a4d6730ebc13f8bbc31df896c5acf480923c4
4
+ data.tar.gz: ede2a8f736053a268175a7948324a15954475526d0589bf47416347b57a50740
5
+ SHA512:
6
+ metadata.gz: 650125350b5f166ccc0fb1346c77dfa9f17471ac9ae9290b4cbc4731a5baec872a810551e0b15af064b37f73c7e103ab6281d48cd839f26d4aab6780d5cd66d1
7
+ data.tar.gz: 70d9f8668143cb507537e369ad0ea79d9bb9f2c33469dffccd5952116f09758a7a880831d81aa4e3b67bc548b4b36aafe321a8d553946d8734fb78c11cd1686e
@@ -0,0 +1,23 @@
1
+ name: test
2
+ on: [push, pull_request]
3
+ jobs:
4
+ build:
5
+ runs-on: ubuntu-latest
6
+ strategy:
7
+ matrix:
8
+ ruby: ['2.6', '2.7', '3.0']
9
+ steps:
10
+ - uses: actions/checkout@v1
11
+ - uses: actions/setup-ruby@v1
12
+ with:
13
+ ruby-version: ${{ matrix.ruby }}
14
+ - uses: actions/cache@v1
15
+ id: cache
16
+ with:
17
+ path: vendor/bundler
18
+ key: ${{ hashFiles('Gemfile.lock') }}-${{ matrix.ruby }}
19
+ - run: |
20
+ gem install bundler
21
+ bundle install --path=vendor/bundler
22
+ bundle exec rspec
23
+ bundle exec rubocop
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,51 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Naming/FileName:
5
+ Exclude:
6
+ - lib/map-reduce-ruby.rb
7
+
8
+ Style/StringConcatenation:
9
+ Exclude:
10
+ - spec/**/*.rb
11
+
12
+ Metrics/BlockLength:
13
+ Enabled: false
14
+
15
+ Gemspec/RequiredRubyVersion:
16
+ Enabled: false
17
+
18
+ Style/MutableConstant:
19
+ Enabled: false
20
+
21
+ Metrics/MethodLength:
22
+ Enabled: false
23
+
24
+ Style/Documentation:
25
+ Enabled: false
26
+
27
+ Style/NumericPredicate:
28
+ Enabled: false
29
+
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+
33
+ Metrics/CyclomaticComplexity:
34
+ Enabled: false
35
+
36
+ Metrics/PerceivedComplexity:
37
+ Enabled: false
38
+
39
+ Style/StringLiterals:
40
+ Enabled: true
41
+ EnforcedStyle: double_quotes
42
+
43
+ Style/StringLiteralsInInterpolation:
44
+ Enabled: true
45
+ EnforcedStyle: double_quotes
46
+
47
+ Layout/LineLength:
48
+ Max: 120
49
+
50
+ Style/FrozenStringLiteralComment:
51
+ EnforcedStyle: never
data/CHANGELOG.md ADDED
@@ -0,0 +1 @@
1
+ # CHANGELOG
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in map-reduce-ruby.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,58 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ map-reduce-ruby (1.0.0)
5
+ json
6
+ lazy_priority_queue
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.2)
12
+ diff-lcs (1.4.4)
13
+ json (2.5.1)
14
+ lazy_priority_queue (0.1.1)
15
+ parallel (1.20.1)
16
+ parser (3.0.0.0)
17
+ ast (~> 2.4.1)
18
+ rainbow (3.0.0)
19
+ regexp_parser (2.0.3)
20
+ rexml (3.2.4)
21
+ rspec (3.10.0)
22
+ rspec-core (~> 3.10.0)
23
+ rspec-expectations (~> 3.10.0)
24
+ rspec-mocks (~> 3.10.0)
25
+ rspec-core (3.10.1)
26
+ rspec-support (~> 3.10.0)
27
+ rspec-expectations (3.10.1)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.10.0)
30
+ rspec-mocks (3.10.1)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.10.0)
33
+ rspec-support (3.10.1)
34
+ rubocop (0.93.1)
35
+ parallel (~> 1.10)
36
+ parser (>= 2.7.1.5)
37
+ rainbow (>= 2.2.2, < 4.0)
38
+ regexp_parser (>= 1.8)
39
+ rexml
40
+ rubocop-ast (>= 0.6.0)
41
+ ruby-progressbar (~> 1.7)
42
+ unicode-display_width (>= 1.4.0, < 2.0)
43
+ rubocop-ast (1.4.1)
44
+ parser (>= 2.7.1.5)
45
+ ruby-progressbar (1.11.0)
46
+ unicode-display_width (1.7.0)
47
+
48
+ PLATFORMS
49
+ ruby
50
+ x86_64-linux
51
+
52
+ DEPENDENCIES
53
+ map-reduce-ruby!
54
+ rspec
55
+ rubocop
56
+
57
+ BUNDLED WITH
58
+ 2.2.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Benjamin Vetter
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,211 @@
1
+ # MapReduce
2
+
3
+ **The easiest way to write distributed, larger than memory map-reduce jobs**
4
+
5
+ The MapReduce gem provides the easiest way to write custom, distributed, larger
6
+ than memory map-reduce jobs by using your local disk and some arbitrary storage
7
+ layer like s3. You can specify how much memory you are willing to offer and
8
+ MapReduce will use its buffers accordingly. Moreover, you can use your already
9
+ existing background job system like `sidekiq` or one of its various
10
+ alternatives. Finally, your keys and values can be everything that can be
11
+ serialized as json.
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'map-reduce-ruby'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ $ bundle install
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install map-reduce-ruby
28
+
29
+ ## Usage
30
+
31
+ Any map-reduce job consists of an implementation of your `map` function, your
32
+ `reduce` function and worker code. So let's start with an implementation for a
33
+ word count map-reduce task which fetches txt documents from the web. Please
34
+ note that your keys and values can be everything that can be serialized as
35
+ json, but nothing else.
36
+
37
+ ```ruby
38
+ class WordCounter
39
+ def map(url)
40
+ HTTP.get(url).to_s.split.each do |word|
41
+ yield(word, 1)
42
+ end
43
+ end
44
+
45
+ def reduce(word, count1, count2)
46
+ count1 + count2
47
+ end
48
+ end
49
+ ```
50
+
51
+ The `#map` method takes some key, e.g. a url, and yields an arbitrary amount of
52
+ key-value pairs. The `#reduce` method takes the key as well as two values and
53
+ should return a single reduced value.
54
+
55
+ Next, we need some worker code to run the mapping part:
56
+
57
+ ```ruby
58
+ class WordCountMapper
59
+ def perform(job_id, mapper_id, url)
60
+ mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
61
+ mapper.map(url)
62
+
63
+ mapper.shuffle do |partition, tempfile|
64
+ # store content of tempfile e.g. on s3:
65
+ bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
66
+ end
67
+ end
68
+ end
69
+ ```
70
+
71
+ Please note that `MapReduce::HashPartitioner.new(16)` states that we want to split
72
+ the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
73
+ worker code to run the reduce part:
74
+
75
+ ```ruby
76
+ class WordCountReducer
77
+ def perform(job_id, partition)
78
+ reducer = MapReduce::Reducer.new(WordCounter.new)
79
+
80
+ # fetch all chunks of the partitions e.g. from s3:
81
+ bucket.list(prefix: "map_reduce/jobs/#{job_id}/partitions/#{partition}/").each do |object|
82
+ chunk_path = reducer.add_chunk # returns a path to a tempfile
83
+
84
+ object.download_file(chunk_path)
85
+ end
86
+
87
+ reducer.reduce(chunk_limit: 32) do |word, count|
88
+ # each word with its final count
89
+ end
90
+ end
91
+ end
92
+ ```
93
+
94
+ Please note that `MapReduce::Reducer#add_chunk` returns a path to a tempfile,
95
+ not a `Tempfile` object. This allows limiting the number of open file
96
+ descriptors.
97
+
98
+ To run your mappers, you can do:
99
+
100
+ ```ruby
101
+ job_id = SecureRandom.hex
102
+
103
+ list_of_urls.each_with_index do |url, index|
104
+ WordCountMapper.perform_async(job_id, index, url)
105
+ end
106
+ ```
107
+
108
+ And to run your reducers:
109
+
110
+ ```ruby
111
+ (0..15).each do |partition|
112
+ WordCountReducer.perform_async(job_id, partition)
113
+ end
114
+ ```
115
+
116
+ How to automate running the mappers and reducers in sequence depends on your
117
+ background job system. The simplest approach is, e.g., to track your mapper
118
+ state in redis and have a job to start your reducers which waits up until your
119
+ mappers are finished.
120
+
121
+ That's it.
122
+
123
+ ## Internals
124
+
125
+ To fully understand the performance details, the following outlines the inner
126
+ workings of MapReduce. Of course, feel free to check the code as well.
127
+
128
+ `MapReduce::Mapper#map` calls your `map` implementation and adds each yielded
129
+ key-value pair to an internal buffer up until the memory limit is reached.
130
+ When the memory limit is reached, the buffer is sorted by key and fed through
131
+ your `reduce` implementation already, as this can greatly reduce the amount of
132
+ data already. The result is written to a tempfile. This proceeds up until all
133
+ key-value pairs are yielded. `MapReduce::Mapper#shuffle` then reads the first
134
+ key-value pair of all already sorted chunk tempfiles and adds them to a
135
+ priority queue using a binomial heap, such that with every `pop` operation on
136
+ that heap, we get items sorted by key. When the item returned by `pop` e.g.
137
+ belongs to the second chunk, then the next key-value pair of the second chunk
138
+ is subsequently read and added to the priority queue, up until no more pairs
139
+ are available. This guarantees that we sort all chunks without fully loading
140
+ them into memory and is called `k-way-merge`. With every `pop` operation, your
141
+ `reduce` implementation is continuously called up until the key changes between
142
+ two calls to `pop`. When the key changes, the key is known to be fully reduced,
143
+ such that the key is hashed modulo the number of partitions and gets written to
144
+ the correct partition tempfile (when `MapReduce::HashPartitioner` is used).
145
+
146
+ The resulting partition tempfiles need to be stored in some global storage
147
+ system like s3, such that your mapper workers can upload them and the reducer
148
+ workers can download them.
149
+
150
+ `MapReduce::Reducer#add_chunk` adds and registers a new tempfile path such that
151
+ your reducer can download a mapper file for the particular partition and write
152
+ its contents to that tempfile path. `MapReduce::Reducer#reduce` finally again
153
+ builds up a priority queue and performs `k-way-merge`, feeds the key-value
154
+ pairs into your reduce implementation up until a key change between `pop`
155
+ operations occurs and yields the fully reduced key-value pair. At the end
156
+ `#reduce` removes all the tempfiles. You can pass a `chunk_limit` to
157
+ `MapReduce::Reducer#reduce`, which is most useful when you run on a system with
158
+ a limited number of open file descriptors allowed. The `chunk_limit` ensures
159
+ that only the specified amount of chunks are processed in a single run. A run
160
+ basically means: it takes up to `chunk_limit` chunks, reduces them and pushes
161
+ the result as a new chunk to the list of chunks to process. Thus, if your
162
+ number of file descriptors is unlimited, you want to set it to a higher number
163
+ to avoid the overhead of multiple runs.
164
+
165
+ ## Partitioners
166
+
167
+ Partitioners are used to split the dataset into a specified amount of
168
+ partitions, which allows the work done by reducers to be parallelized.
169
+ MapReduce comes with a `HashPartitioner`, which takes the number of partitions
170
+ as an argument and derives the partition number from the key as follows:
171
+
172
+ ```ruby
173
+ class HashPartitioner
174
+ def initialize(num_partitions)
175
+ @num_partitions = num_partitions
176
+ end
177
+
178
+ def call(key)
179
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
180
+ end
181
+ end
182
+ ```
183
+
184
+ Thus, writing your own custom partitioner is really easy and, as it follows the
185
+ interface of callables, could even be expressed as a simple one-liner:
186
+
187
+ ```ruby
188
+ MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
189
+ ```
190
+
191
+ ## Development
192
+
193
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
194
+ `rake spec` to run the tests. You can also run `bin/console` for an interactive
195
+ prompt that will allow you to experiment.
196
+
197
+ To install this gem onto your local machine, run `bundle exec rake install`. To
198
+ release a new version, update the version number in `version.rb`, and then run
199
+ `bundle exec rake release`, which will create a git tag for the version, push
200
+ git commits and the created tag, and push the `.gem` file to
201
+ [rubygems.org](https://rubygems.org).
202
+
203
+ ## Contributing
204
+
205
+ Bug reports and pull requests are welcome on GitHub at
206
+ https://github.com/mrkamel/map-reduce-ruby
207
+
208
+ ## License
209
+
210
+ The gem is available as open source under the terms of the [MIT
211
+ License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ require "rubocop/rake_task"
7
+
8
+ RuboCop::RakeTask.new
9
+
10
+ task default: %i[spec rubocop]
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "map_reduce"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1 @@
1
+ require "map_reduce"
data/lib/map_reduce.rb ADDED
@@ -0,0 +1,16 @@
1
+ require "tempfile"
2
+ require "json"
3
+ require "digest"
4
+ require "fileutils"
5
+ require "tmpdir"
6
+ require "lazy_priority_queue"
7
+ require "map_reduce/version"
8
+ require "map_reduce/priority_queue"
9
+ require "map_reduce/temp_path"
10
+ require "map_reduce/mergeable"
11
+ require "map_reduce/reduceable"
12
+ require "map_reduce/hash_partitioner"
13
+ require "map_reduce/mapper"
14
+ require "map_reduce/reducer"
15
+
16
+ module MapReduce; end
@@ -0,0 +1,32 @@
1
+ module MapReduce
2
+ # The MapReduce::HashPartitioner calculates a partition for the passed key
3
+ # using a prefix of its SHA1 hex digest modulo the number of partitions.
4
+
5
+ class HashPartitioner
6
+ # Initializes a HashPartitioner.
7
+ #
8
+ # @param num_partitions [Integer] The desired number of partitions.
9
+ # Typically 8, 16, 32, 64, etc. but can be anything according to your
10
+ # needs.
11
+ #
12
+ # @example
13
+ # MapReduce::HashPartitioner.new(16)
14
+
15
+ def initialize(num_partitions)
16
+ @num_partitions = num_partitions
17
+ end
18
+
19
+ # Calculates the partition for the specified key.
20
+ #
21
+ # @param key The key to calculate the partition for. Can be anything
22
+ # that can be serialized as json.
23
+ # @return [Integer] The partition number.
24
+ #
25
+ # @example
26
+ # partitioner.call("some key")
27
+
28
+ def call(key)
29
+ Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,113 @@
1
+ module MapReduce
2
+ # The MapReduce::Mapper class runs the mapping part of your map-reduce job.
3
+
4
+ class Mapper
5
+ include Mergeable
6
+ include Reduceable
7
+ include MonitorMixin
8
+
9
+ attr_reader :partitions
10
+
11
+ # Initializes a new mapper.
12
+ #
13
+ # @param implementation Your map-reduce implementation, i.e. an object
14
+ # which responds to #map and #reduce.
15
+ # @param partitioner [#call] A partitioner, i.e. an object which responds
16
+ # to #call and calculates a partition for the passed key.
17
+ # @param memory_limit [#to_i] The memory limit, i.e. the buffer size in
18
+ # bytes, measured as the size of the json serialized key-value pairs.
19
+ #
20
+ # @example
21
+ # MapReduce::Mapper.new(MyImplementation.new, partitioner: HashPartitioner.new(16), memory_limit: 100.megabytes)
22
+
23
+ def initialize(implementation, partitioner: HashPartitioner.new(32), memory_limit: 100 * 1024 * 1024)
24
+ super()
25
+
26
+ @implementation = implementation
27
+ @partitioner = partitioner
28
+ @memory_limit = memory_limit.to_i
29
+
30
+ @buffer_size = 0
31
+ @buffer = []
32
+ @chunks = []
33
+ end
34
+
35
+ # Passes the received arguments to your map-reduce implementation and adds
36
+ # each yielded key-value pair to a buffer. When the memory limit is reached,
37
+ # the buffer is sorted, reduced and written to a tempfile.
38
+ #
39
+ # @param args, kwargs The arguments to pass to your #map implementation.
40
+ #
41
+ # @example
42
+ # mapper.map("some_key")
43
+ # mapper.map("other_key")
44
+
45
+ def map(*args, **kwargs)
46
+ @implementation.map(*args, **kwargs) do |new_key, new_value|
47
+ synchronize do
48
+ @buffer.push([new_key, new_value])
49
+
50
+ @buffer_size += JSON.generate([new_key, new_value]).bytesize
51
+
52
+ write_chunk if @buffer_size >= @memory_limit
53
+ end
54
+ end
55
+ end
56
+
57
+ # Performs a k-way-merge of the sorted chunks written to tempfiles while
58
+ # already reducing the result using your map-reduce implementation and
59
+ # splitting the dataset into partitions. Finally yields each partition with
60
+ # its tempfile, which is closed and deleted after all partitions were yielded.
61
+ #
62
+ # @example
63
+ # mapper.shuffle do |partition, tempfile|
64
+ # # store data e.g. on s3
65
+ # end
66
+
67
+ def shuffle(&block)
68
+ return enum_for(:shuffle) unless block_given?
69
+
70
+ write_chunk if @buffer_size > 0
71
+
72
+ partitions = {}
73
+
74
+ reduce_chunk(k_way_merge(@chunks), @implementation).each do |pair|
75
+ partition = @partitioner.call(pair[0])
76
+
77
+ (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
78
+ end
79
+
80
+ @chunks.each { |tempfile| tempfile.close(true) }
81
+ @chunks = []
82
+
83
+ partitions.each_value(&:rewind)
84
+
85
+ partitions.each do |partition, tempfile|
86
+ block.call(partition, tempfile)
87
+ end
88
+
89
+ partitions.each_value { |tempfile| tempfile.close(true) }
90
+
91
+ nil
92
+ end
93
+
94
+ private
95
+
96
+ def write_chunk
97
+ tempfile = Tempfile.new
98
+
99
+ @buffer.sort_by! { |item| JSON.generate(item.first) }
100
+
101
+ reduce_chunk(@buffer, @implementation).each do |pair|
102
+ tempfile.puts JSON.generate(pair)
103
+ end
104
+
105
+ tempfile.rewind
106
+
107
+ @chunks.push(tempfile)
108
+
109
+ @buffer_size = 0
110
+ @buffer = []
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,56 @@
1
+ module MapReduce
2
+ # The MapReduce::Mergeable mixin provides the k-way-merge operation used by
3
+ # mappers as well as reducers.
4
+
5
+ module Mergeable
6
+ private
7
+
8
+ # Performs the k-way-merge of the passed files using a priority queue using
9
+ # a binomial heap. The content of the passed files needs to be sorted. It
10
+ # starts by reading one item of each file and adding it to the priority
11
+ # queue. Afterwards, it continously pops an item from the queue, yields it
12
+ # and reads a new item from the file the popped item belongs to, adding the
13
+ # read item to the queue. This continues up until all items from the files
14
+ # have been read. This guarantees that the yielded key-value pairs are
15
+ # sorted without having all items in-memory.
16
+ #
17
+ # @param files [IO, Tempfile] The files to run the k-way-merge for. The
18
+ # content of the files must be sorted.
19
+
20
+ def k_way_merge(files)
21
+ return enum_for(:k_way_merge, files) unless block_given?
22
+
23
+ queue = PriorityQueue.new
24
+
25
+ files.each_with_index do |file, index|
26
+ line = file.eof? ? nil : file.readline
27
+
28
+ next unless line
29
+
30
+ key, value = JSON.parse(line)
31
+
32
+ queue.push([key, value, index], JSON.generate(key))
33
+ end
34
+
35
+ loop do
36
+ key, value, index = queue.pop
37
+
38
+ return unless index
39
+
40
+ yield([key, value])
41
+
42
+ line = files[index].yield_self { |file| file.eof? ? nil : file.readline }
43
+
44
+ next unless line
45
+
46
+ key, value = JSON.parse(line)
47
+
48
+ queue.push([key, value, index], JSON.generate(key))
49
+ end
50
+
51
+ files.each(&:rewind)
52
+
53
+ nil
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,49 @@
1
+ module MapReduce
2
+ # The MapReduce::PriorityQueue implements a min priority queue using a
3
+ # binomial heap.
4
+
5
+ class PriorityQueue
6
+ # Initializes the priority queue.
7
+ #
8
+ # @example
9
+ # MapReduce::PriorityQueue.new
10
+
11
+ def initialize
12
+ @queue = MinPriorityQueue.new
13
+ @sequence_number = 0
14
+ end
15
+
16
+ # Adds a new item to the priority queue, using the key for sorting. The
17
+ # object can be anything, but the key must be comparable. The object is
18
+ # wrapped with a sequence number (presumably to keep entries distinct).
19
+ #
20
+ # @param object The object to add to the priority queue.
21
+ # @param key The key to use for sorting.
22
+ #
23
+ # @example
24
+ # priority_queue = MapReduce::PriorityQueue.new
25
+ # priority_queue.push("some object", "some key")
26
+
27
+ def push(object, key)
28
+ @queue.push([@sequence_number, object], key)
29
+
30
+ @sequence_number += 1
31
+ end
32
+
33
+ # Pops the min item from the queue.
34
+ #
35
+ # @return The popped object.
36
+ #
37
+ # @example
38
+ # priority_queue = MapReduce::PriorityQueue.new
39
+ # priority_queue.push("object1", "key1")
40
+ # priority_queue.push("object2", "key2")
41
+ # priority_queue.pop
42
+
43
+ def pop
44
+ _, object = @queue.pop
45
+
46
+ object
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,38 @@
1
+ module MapReduce
2
+ # The MapReduce::Reduceable mixin allows to reduce an arbitrary chunk using
3
+ # the specified map-reduce implementation.
4
+
5
+ module Reduceable
6
+ private
7
+
8
+ # Reduces the specified chunk, e.g. some enumerable, using the specified
9
+ # map-reduce implementation using a lookahead of one to detect key changes.
10
+ # The reduce implementation is called up until a key change is detected,
11
+ # because the key change signals that the reduce operation is finished for
12
+ # the particular key, such that it will then be yielded.
13
+ #
14
+ # @param chunk The chunk to be reduced. Can e.g. be some enumerable.
15
+ # @param implementation The map-reduce implementation.
16
+
17
+ def reduce_chunk(chunk, implementation)
18
+ return enum_for(:reduce_chunk, chunk, implementation) unless block_given?
19
+
20
+ last_item = chunk.inject do |prev_item, cur_item|
21
+ prev_key = prev_item[0]
22
+
23
+ # Keys are compared directly, i.e. without serializing them to json first.
24
+ # NOTE(review): Mapper#write_chunk passes keys which were not deserialized.
25
+
26
+ if prev_key == cur_item[0]
27
+ [prev_key, implementation.reduce(prev_key, prev_item[1], cur_item[1])]
28
+ else
29
+ yield(prev_item)
30
+
31
+ cur_item
32
+ end
33
+ end
34
+
35
+ yield(last_item) if last_item
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,107 @@
1
+ module MapReduce
2
+ # The MapReduce::Reducer class runs the reducer part of your map-reduce job.
3
+
4
+ class Reducer
5
+ include Mergeable
6
+ include Reduceable
7
+ include MonitorMixin
8
+
9
+ class InvalidChunkLimit < StandardError; end
10
+
11
+ # Initializes a new reducer.
12
+ #
13
+ # @param implementation Your map-reduce implementation, i.e. an object
14
+ # which responds to #map and #reduce.
15
+ #
16
+ # @example
17
+ # MapReduce::Reducer.new(MyImplementation.new)
18
+
19
+ def initialize(implementation)
20
+ super()
21
+
22
+ @implementation = implementation
23
+
24
+ @temp_paths ||= []
25
+ end
26
+
27
+ # Adds a chunk from the mapper-phase to the reducer by registering a
28
+ # tempfile and returning the path to that tempfile, such that you can
29
+ # download a chunk e.g. from s3 and write the content to this tempfile.
30
+ #
31
+ # @return [String] The path to a tempfile.
32
+ #
33
+ # @example
34
+ # chunk_path = reducer.add_chunk
35
+ # File.write(chunk_path, "downloaded blob")
36
+
37
+ def add_chunk
38
+ temp_path = TempPath.new
39
+
40
+ synchronize do
41
+ @temp_paths.push(temp_path)
42
+ end
43
+
44
+ temp_path.path
45
+ end
46
+
47
+ # Performs a k-way-merge of the added chunks and yields the reduced
48
+ # key-value pairs. It performs multiple runs when more than `chunk_limit`
49
+ # chunks exist. A run means: it takes up to `chunk_limit` chunks,
50
+ # reduces them and pushes the result as a new chunk. At the end it
51
+ # removes all tempfiles, even if errors occur.
52
+ #
53
+ # @param chunk_limit [Integer] The maximum number of files to process
54
+ # during a single run. Most useful when the number of open file
55
+ # descriptors is limited. If it is unlimited, you want to set it to a
56
+ # higher number to avoid the overhead of multiple runs.
57
+ # @raise [InvalidChunkLimit] When chunk_limit is lower than 2.
58
+ #
59
+ # @example
60
+ # reducer = MapReduce::Reducer.new(MyImplementation.new)
61
+ #
62
+ # chunk1_path = reducer.add_chunk
63
+ # # write data to the file
64
+ #
65
+ # chunk2_path = reducer.add_chunk
66
+ # # write data to the file
67
+ #
68
+ # reducer.reduce(chunk_limit: 32) do |key, value|
69
+ # # ...
70
+ # end
71
+
72
+ def reduce(chunk_limit:, &block)
73
+ return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
74
+
75
+ raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
76
+
77
+ begin
78
+ loop do
79
+ slice = @temp_paths.shift(chunk_limit)
80
+ files = slice.select { |temp_path| File.exist?(temp_path.path) }
81
+ .map { |temp_path| File.open(temp_path.path, "r") }
82
+
83
+ begin
84
+ if @temp_paths.empty?
85
+ reduce_chunk(k_way_merge(files), @implementation).each do |pair|
86
+ block.call(pair)
87
+ end
88
+
89
+ return
90
+ end
91
+
92
+ File.open(add_chunk, "w") do |file|
93
+ reduce_chunk(k_way_merge(files), @implementation).each do |pair|
94
+ file.puts JSON.generate(pair)
95
+ end
96
+ end
97
+ ensure
98
+ files.each(&:close)
99
+ slice.each(&:delete)
100
+ end
101
+ end
102
+ ensure
103
+ @temp_paths.each(&:delete)
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,42 @@
1
+ module MapReduce
2
+ # The MapReduce::TempPath generates a tempfile path and automatically deletes
3
+ # the file when the object is garbage collected or manually deleted. Using
4
+ # this class instead of Tempfile allows having fewer open file descriptors.
5
+
6
+ class TempPath
7
+ attr_reader :path
8
+
9
+ # Initializes a new tempfile path and creates the (empty) file.
10
+ #
11
+ # @example
12
+ # temp_path = MapReduce::TempPath.new
13
+ # File.write(temp_path.path, "blob")
14
+
15
+ def initialize
16
+ @path = Dir::Tmpname.create("") do
17
+ # nothing
18
+ end
19
+
20
+ FileUtils.touch(@path)
21
+
22
+ ObjectSpace.define_finalizer(self, self.class.finalize(@path))
23
+ end
24
+
25
+ # @api private
26
+
27
+ def self.finalize(path)
28
+ proc { FileUtils.rm_f(path) }
29
+ end
30
+
31
+ # Manually deletes the tempfile. Does not fail if the file is already gone.
32
+ #
33
+ # @example
34
+ # temp_path = MapReduce::TempPath.new
35
+ # File.write(temp_path.path, "blob")
36
+ # temp_path.delete
37
+
38
+ def delete
39
+ FileUtils.rm_f(path)
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,3 @@
1
+ module MapReduce
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,38 @@
1
+ require_relative "lib/map_reduce/version"
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "map-reduce-ruby"
5
+ spec.version = MapReduce::VERSION
6
+ spec.authors = ["Benjamin Vetter"]
7
+ spec.email = ["vetter@flakks.com"]
8
+
9
+ spec.summary = "The easiest way to write distributed, larger than memory map-reduce jobs"
10
+ spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
11
+ "than memory map-reduce jobs"
12
+ spec.homepage = "https://github.com/mrkamel/map-reduce-ruby"
13
+ spec.license = "MIT"
14
+
15
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = "https://github.com/mrkamel/map-reduce-ruby"
19
+ spec.metadata["changelog_uri"] = "https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md"
20
+
21
+ # Specify which files should be added to the gem when it is released.
22
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
24
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
25
+ end
26
+ spec.bindir = "exe"
27
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ["lib"]
29
+
30
+ spec.add_development_dependency "rspec"
31
+ spec.add_development_dependency "rubocop"
32
+
33
+ spec.add_dependency "json"
34
+ spec.add_dependency "lazy_priority_queue"
35
+
36
+ # For more information and examples about making a new gem, checkout our
37
+ # guide at: https://bundler.io/guides/creating_gem.html
38
+ end
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: map-reduce-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Benjamin Vetter
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-07-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: lazy_priority_queue
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: The MapReduce gem is the easiest way to write custom, distributed, larger
70
+ than memory map-reduce jobs
71
+ email:
72
+ - vetter@flakks.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".github/workflows/test.yml"
78
+ - ".gitignore"
79
+ - ".rspec"
80
+ - ".rubocop.yml"
81
+ - CHANGELOG.md
82
+ - Gemfile
83
+ - Gemfile.lock
84
+ - LICENSE.txt
85
+ - README.md
86
+ - Rakefile
87
+ - bin/console
88
+ - bin/setup
89
+ - lib/map-reduce-ruby.rb
90
+ - lib/map_reduce.rb
91
+ - lib/map_reduce/hash_partitioner.rb
92
+ - lib/map_reduce/mapper.rb
93
+ - lib/map_reduce/mergeable.rb
94
+ - lib/map_reduce/priority_queue.rb
95
+ - lib/map_reduce/reduceable.rb
96
+ - lib/map_reduce/reducer.rb
97
+ - lib/map_reduce/temp_path.rb
98
+ - lib/map_reduce/version.rb
99
+ - map-reduce-ruby.gemspec
100
+ homepage: https://github.com/mrkamel/map-reduce-ruby
101
+ licenses:
102
+ - MIT
103
+ metadata:
104
+ homepage_uri: https://github.com/mrkamel/map-reduce-ruby
105
+ source_code_uri: https://github.com/mrkamel/map-reduce-ruby
106
+ changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: 2.5.0
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubygems_version: 3.0.3
123
+ signing_key:
124
+ specification_version: 4
125
+ summary: The easiest way to write distributed, larger than memory map-reduce jobs
126
+ test_files: []