RubyGems - map-reduce-ruby - Versions diffs - 1.0.0 - Mend

map-reduce-ruby 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +7 -0
data/.github/workflows/test.yml +23 -0
data/.gitignore +11 -0
data/.rspec +3 -0
data/.rubocop.yml +51 -0
data/CHANGELOG.md +1 -0
data/Gemfile +4 -0
data/Gemfile.lock +58 -0
data/LICENSE.txt +21 -0
data/README.md +211 -0
data/Rakefile +10 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/map-reduce-ruby.rb +1 -0
data/lib/map_reduce.rb +16 -0
data/lib/map_reduce/hash_partitioner.rb +32 -0
data/lib/map_reduce/mapper.rb +113 -0
data/lib/map_reduce/mergeable.rb +56 -0
data/lib/map_reduce/priority_queue.rb +49 -0
data/lib/map_reduce/reduceable.rb +38 -0
data/lib/map_reduce/reducer.rb +107 -0
data/lib/map_reduce/temp_path.rb +42 -0
data/lib/map_reduce/version.rb +3 -0
data/map-reduce-ruby.gemspec +38 -0
metadata +126 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 19ac70d25d6622d30398997cae0a4d6730ebc13f8bbc31df896c5acf480923c4
+  data.tar.gz: ede2a8f736053a268175a7948324a15954475526d0589bf47416347b57a50740
+SHA512:
+  metadata.gz: 650125350b5f166ccc0fb1346c77dfa9f17471ac9ae9290b4cbc4731a5baec872a810551e0b15af064b37f73c7e103ab6281d48cd839f26d4aab6780d5cd66d1
+  data.tar.gz: 70d9f8668143cb507537e369ad0ea79d9bb9f2c33469dffccd5952116f09758a7a880831d81aa4e3b67bc548b4b36aafe321a8d553946d8734fb78c11cd1686e

data/.github/workflows/test.yml ADDED Viewed

@@ -0,0 +1,23 @@
+name: test
+# Run the suite for every push and every pull request.
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted, such that e.g. '3.0' stays a string instead of a float.
+        ruby: ['2.6', '2.7', '3.0']
+    steps:
+    - uses: actions/checkout@v4
+    # ruby/setup-ruby replaces the deprecated actions/setup-ruby. With
+    # bundler-cache enabled it runs `bundle install` and caches the installed
+    # gems, which replaces the manual actions/cache and `bundle install --path`
+    # steps.
+    - uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: ${{ matrix.ruby }}
+        bundler-cache: true
+    - run: |
+        bundle exec rspec
+        bundle exec rubocop

data/.gitignore ADDED Viewed

@@ -0,0 +1,11 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,51 @@
+AllCops:
+  NewCops: enable
+Naming/FileName:
+  Exclude:
+    - lib/map-reduce-ruby.rb
+Style/StringConcatenation:
+  Exclude:
+    - spec/**/*.rb
+Metrics/BlockLength:
+  Enabled: false
+Gemspec/RequiredRubyVersion:
+  Enabled: false
+Style/MutableConstant:
+  Enabled: false
+Metrics/MethodLength:
+  Enabled: false
+Style/Documentation:
+  Enabled: false
+Style/NumericPredicate:
+  Enabled: false
+Metrics/AbcSize:
+  Enabled: false
+Metrics/CyclomaticComplexity:
+  Enabled: false
+Metrics/PerceivedComplexity:
+  Enabled: false
+Style/StringLiterals:
+  Enabled: true
+  EnforcedStyle: double_quotes
+Style/StringLiteralsInInterpolation:
+  Enabled: true
+  EnforcedStyle: double_quotes
+Layout/LineLength:
+  Max: 120
+Style/FrozenStringLiteralComment:
+  EnforcedStyle: never

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1 @@
+# CHANGELOG

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+# The gem's dependencies are specified in map-reduce-ruby.gemspec
+gemspec

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,58 @@
+PATH
+  remote: .
+  specs:
+    map-reduce-ruby (1.0.0)
+      json
+      lazy_priority_queue
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ast (2.4.2)
+    diff-lcs (1.4.4)
+    json (2.5.1)
+    lazy_priority_queue (0.1.1)
+    parallel (1.20.1)
+    parser (3.0.0.0)
+      ast (~> 2.4.1)
+    rainbow (3.0.0)
+    regexp_parser (2.0.3)
+    rexml (3.2.4)
+    rspec (3.10.0)
+      rspec-core (~> 3.10.0)
+      rspec-expectations (~> 3.10.0)
+      rspec-mocks (~> 3.10.0)
+    rspec-core (3.10.1)
+      rspec-support (~> 3.10.0)
+    rspec-expectations (3.10.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.10.0)
+    rspec-mocks (3.10.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.10.0)
+    rspec-support (3.10.1)
+    rubocop (0.93.1)
+      parallel (~> 1.10)
+      parser (>= 2.7.1.5)
+      rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 1.8)
+      rexml
+      rubocop-ast (>= 0.6.0)
+      ruby-progressbar (~> 1.7)
+      unicode-display_width (>= 1.4.0, < 2.0)
+    rubocop-ast (1.4.1)
+      parser (>= 2.7.1.5)
+    ruby-progressbar (1.11.0)
+    unicode-display_width (1.7.0)
+PLATFORMS
+  ruby
+  x86_64-linux
+DEPENDENCIES
+  map-reduce-ruby!
+  rspec
+  rubocop
+BUNDLED WITH
+   2.2.2

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2021 Benjamin Vetter
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,211 @@
+# MapReduce
+**The easiest way to write distributed, larger than memory map-reduce jobs**
+The MapReduce gem provides the easiest way to write custom, distributed, larger
+than memory map-reduce jobs by using your local disk and some arbitrary storage
+layer like s3. You can specify how much memory you are willing to offer and
+MapReduce will use its buffers accordingly. Moreover, you can use your already
+existing background job system like `sidekiq` or one of its various
+alternatives. Finally, your keys and values can be anything that can be
+serialized as JSON.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'map-reduce-ruby'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install map-reduce-ruby
+## Usage
+Any map-reduce job consists of an implementation of your `map` function, your
+`reduce` function and worker code. So let's start with an implementation for a
+word count map-reduce task which fetches txt documents from the web. Please
+note that your keys and values can be everything that can be serialized as
+json, but nothing else.
+```ruby
+class WordCounter
+  def map(url)
+    HTTP.get(url).to_s.split.each do |word|
+      yield(word, 1)
+    end
+  end
+  def reduce(word, count1, count2)
+    count1 + count2
+  end
+end
+```
+The `#map` method takes some key, e.g. a url, and yields an arbitrary amount of
+key-value pairs. The `#reduce` method takes the key as well as two values and
+should return a single reduced value.
+Next, we need some worker code to run the mapping part:
+```ruby
+class WordCountMapper
+  def perform(job_id, mapper_id, url)
+    mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
+    mapper.map(url)
+    mapper.shuffle do |partition, tempfile|
+      # store content of tempfile e.g. on s3:
+      bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
+    end
+  end
+end
+```
+Please note that `MapReduce::HashPartitioner.new(16)` states that we want to
+split the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
+worker code to run the reduce part:
+```ruby
+class WordCountReducer
+  def perform(job_id, partition)
+    reducer = MapReduce::Reducer.new(WordCounter.new)
+    # fetch all chunks of the partitions e.g. from s3:
+    bucket.list(prefix: "map_reduce/jobs/#{job_id}/partitions/#{partition}/").each do |object|
+      chunk_path = reducer.add_chunk # returns a path to a tempfile
+      object.download_file(chunk_path)
+    end
+    reducer.reduce(chunk_limit: 32) do |word, count|
+      # each word with its final count
+    end
+  end
+end
+```
+Please note that `MapReduce::Reducer#add_chunk` returns a path to a tempfile,
+not a `Tempfile` object. This makes it possible to limit the number of open
+file descriptors.
+To run your mappers, you can do:
+```ruby
+job_id = SecureRandom.hex
+list_of_urls.each_with_index do |url, index|
+  WordCountMapper.perform_async(job_id, index, url)
+end
+```
+And to run your reducers:
+```ruby
+(0..15).each do |partition|
+  WordCountReducer.perform_async(job_id, partition)
+end
+```
+How to automate running the mappers and reducers in sequence depends on your
+background job system. The simplest approach is e.g. to track your mapper
+state in redis and have a job to start your reducers which waits until your
+mappers are finished.
+That's it.
+## Internals
+To fully understand the performance details, the following outlines the inner
+workings of MapReduce. Of course, feel free to check the code as well.
+`MapReduce::Mapper#map` calls your `map` implementation and adds each yielded
+key-value pair to an internal buffer up until the memory limit is reached.
+When the memory limit is reached, the buffer is sorted by key and fed through
+your `reduce` implementation already, as this can greatly reduce the amount of
+data already. The result is written to a tempfile. This proceeds up until all
+key-value pairs are yielded. `MapReduce::Mapper#shuffle` then reads the first
+key-value pair of all already sorted chunk tempfiles and adds them to a
+priority queue using a binomial heap, such that with every `pop` operation on
+that heap, we get items sorted by key. When the item returned by `pop` e.g.
+belongs to the second chunk, then the next key-value pair of the second chunk
+is subsequently read and added to the priority queue, up until no more pairs
+are available. This guarantees that we sort all chunks without fully loading
+them into memory and is called `k-way-merge`. With every `pop` operation, your
+`reduce` implementation is continuously called up until the key changes between
+two calls to `pop`. When the key changes, the key is known to be fully reduced,
+such that the key is hashed modulo the number of partitions and gets written to
+the correct partition tempfile (when `MapReduce::HashPartitioner` is used).
+The resulting partition tempfiles need to be stored in some global storage
+system like s3, such that your mapper workers can upload them and the reducer
+workers can download them.
+`MapReduce::Reducer#add_chunk` adds and registers a new tempfile path such that
+your reducer can download a mapper file for the particular partition and write
+its contents to that tempfile path. `MapReduce::Reducer#reduce` finally again
+builds up a priority queue and performs `k-way-merge`, feeds the key-value
+pairs into your reduce implementation up until a key change between `pop`
+operations occurs and yields the fully reduced key-value pair. At the end
+`#reduce` removes all the tempfiles. You can pass a `chunk_limit` to
+`MapReduce::Reducer#reduce`, which is most useful when you run on a system with
+a limited number of open file descriptors allowed. The `chunk_limit` ensures
+that only the specified amount of chunks are processed in a single run. A run
+basically means: it takes up to `chunk_limit` chunks, reduces them and pushes
+the result as a new chunk to the list of chunks to process. Thus, if your
+number of file descriptors is unlimited, you want to set it to a higher number
+to avoid the overhead of multiple runs.
+## Partitioners
+Partitioners are used to split the dataset into a specified amount of
+partitions, which allows to parallelize the work to be done by reducers.
+MapReduce comes with a `HashPartitioner`, which takes the number of partitions
+as an argument and derives the partition number from the key as follows:
+```ruby
+class HashPartitioner
+  def initialize(num_partitions)
+    @num_partitions = num_partitions
+  end
+  def call(key)
+    Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
+  end
+end
+```
+Thus, writing your own custom partitioner is really easy and, as it follows the
+interface of callables, could even be expressed as a simple one-liner:
+```ruby
+MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
+```
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run
+`rake spec` to run the tests. You can also run `bin/console` for an interactive
+prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To
+release a new version, update the version number in `version.rb`, and then run
+`bundle exec rake release`, which will create a git tag for the version, push
+git commits and the created tag, and push the `.gem` file to
+[rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at
+https://github.com/mrkamel/map-reduce-ruby
+## License
+The gem is available as open source under the terms of the [MIT
+License](https://opensource.org/licenses/MIT).

data/Rakefile ADDED Viewed

@@ -0,0 +1,10 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+# Registers the rspec rake task under the name `spec`.
+RSpec::Core::RakeTask.new(:spec)
+require "rubocop/rake_task"
+# Registers the rubocop rake task using its default name.
+RuboCop::RakeTask.new
+# A bare `rake` runs the `spec` task and the `rubocop` task.
+task default: %i[spec rubocop]

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Starts an interactive IRB session with bundler set up and the gem loaded.
+require "bundler/setup"
+require "map_reduce"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/lib/map-reduce-ruby.rb ADDED Viewed

@@ -0,0 +1 @@
+require "map_reduce"

data/lib/map_reduce.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require "tempfile"
+require "json"
+require "digest"
+require "fileutils"
+require "tmpdir"
+require "lazy_priority_queue"
+require "map_reduce/version"
+require "map_reduce/priority_queue"
+require "map_reduce/temp_path"
+require "map_reduce/mergeable"
+require "map_reduce/reduceable"
+require "map_reduce/hash_partitioner"
+require "map_reduce/mapper"
+require "map_reduce/reducer"
+module MapReduce; end

data/lib/map_reduce/hash_partitioner.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module MapReduce
+  # The MapReduce::HashPartitioner calculates a partition for the passed keys
+  # using SHA1 modulo the desired number of partitions.
+  class HashPartitioner
+    # Initializes a HashPartitioner.
+    #
+    # @param num_partitions [Fixnum] The desired number of partitions.
+    #   Typically 8, 16, 32, 64, etc. but can be everything according to your
+    #   needs.
+    #
+    # @example
+    #   MapReduce::HashPartitioner.new(16)
+    def initialize(num_partitions)
+      @num_partitions = num_partitions
+    end
+    # Calculates the partition for the specified key.
+    #
+    # @param key The key to calculate the partition for. Can be everything
+    #   that can be serialized as json.
+    # @returns [Integer] The partition number.
+    #
+    # @example
+    #   partitioner.call("some key")
+    def call(key)
+      Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % @num_partitions
+    end
+  end
+end

data/lib/map_reduce/mapper.rb ADDED Viewed

@@ -0,0 +1,113 @@
+module MapReduce
+  # The MapReduce::Mapper class runs the mapping part of your map-reduce job.
+  class Mapper
+    include Mergeable
+    include Reduceable
+    include MonitorMixin
+    attr_reader :partitions
+    # Initializes a new mapper.
+    #
+    # @param implementation Your map-reduce implementation, i.e. an object
+    #   which responds to #map and #reduce.
+    # @param partitioner [#call] A partitioner, i.e. an object which responds
+    #   to #call and calculates a partition for the passed key.
+    # @param memory_limit [#to_i] The memory limit, i.e. the buffer size in
+    #   bytes.
+    #
+    # @example
+    #  MapReduce::Mapper.new(MyImplementation.new, partitioner: HashPartitioner.new(16), memory_limit: 100.megabytes)
+    def initialize(implementation, partitioner: HashPartitioner.new(32), memory_limit: 100 * 1024 * 1024)
+      super()
+      @implementation = implementation
+      @partitioner = partitioner
+      @memory_limit = memory_limit.to_i
+      @buffer_size = 0
+      @buffer = []
+      @chunks = []
+    end
+    # Passes the received key to your map-reduce implementation and adds
+    # yielded key-value pair to a buffer. When the memory limit is reached, the
+    # chunk is sorted and written to a tempfile.
+    #
+    # @param key The key to pass to the map-reduce implementation.
+    #
+    # @example
+    #   mapper.map("some_key")
+    #   mapper.map("other_key")
+    def map(*args, **kwargs)
+      @implementation.map(*args, **kwargs) do |new_key, new_value|
+        synchronize do
+          @buffer.push([new_key, new_value])
+          @buffer_size += JSON.generate([new_key, new_value]).bytesize
+          write_chunk if @buffer_size >= @memory_limit
+        end
+      end
+    end
+    # Performs a k-way-merge of the sorted chunks written to tempfiles while
+    # already reducing the result using your map-reduce implementation and
+    # splitting the dataset into partitions. Finally yields each partition with
+    # the tempfile containing the data of the partition.
+    #
+    # @example
+    #   mapper.shuffle do |partition, tempfile|
+    #     # store data e.g. on s3
+    #   end
+    def shuffle(&block)
+      return enum_for(:shuffle) unless block_given?
+      write_chunk if @buffer_size > 0
+      partitions = {}
+      reduce_chunk(k_way_merge(@chunks), @implementation).each do |pair|
+        partition = @partitioner.call(pair[0])
+        (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
+      end
+      @chunks.each { |tempfile| tempfile.close(true) }
+      @chunks = []
+      partitions.each_value(&:rewind)
+      partitions.each do |partition, tempfile|
+        block.call(partition, tempfile)
+      end
+      partitions.each_value { |tempfile| tempfile.close(true) }
+      nil
+    end
+    private
+    def write_chunk
+      tempfile = Tempfile.new
+      @buffer.sort_by! { |item| JSON.generate(item.first) }
+      reduce_chunk(@buffer, @implementation).each do |pair|
+        tempfile.puts JSON.generate(pair)
+      end
+      tempfile.rewind
+      @chunks.push(tempfile)
+      @buffer_size = 0
+      @buffer = []
+    end
+  end
+end

data/lib/map_reduce/mergeable.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module MapReduce
+  # The MapReduce::Mergeable mixin provides the k-way-merge operation used by
+  # mappers as well as reducers.
+  module Mergeable
+    private
+    # Performs the k-way-merge of the passed files using a priority queue using
+    # a binomial heap. The content of the passed files needs to be sorted. It
+    # starts by reading one item of each file and adding it to the priority
+    # queue. Afterwards, it continously pops an item from the queue, yields it
+    # and reads a new item from the file the popped item belongs to, adding the
+    # read item to the queue. This continues up until all items from the files
+    # have been read. This guarantees that the yielded key-value pairs are
+    # sorted without having all items in-memory.
+    #
+    # @param files [IO, Tempfile] The files to run the k-way-merge for. The
+    #   content of the files must be sorted.
+    def k_way_merge(files)
+      return enum_for(:k_way_merge, files) unless block_given?
+      queue = PriorityQueue.new
+      files.each_with_index do |file, index|
+        line = file.eof? ? nil : file.readline
+        next unless line
+        key, value = JSON.parse(line)
+        queue.push([key, value, index], JSON.generate(key))
+      end
+      loop do
+        key, value, index = queue.pop
+        return unless index
+        yield([key, value])
+        line = files[index].yield_self { |file| file.eof? ? nil : file.readline }
+        next unless line
+        key, value = JSON.parse(line)
+        queue.push([key, value, index], JSON.generate(key))
+      end
+      files.each(&:rewind)
+      nil
+    end
+  end
+end

data/lib/map_reduce/priority_queue.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module MapReduce
+  # The MapReduce::PriorityQueue implements a min priority queue using a
+  # binomial heap.
+  class PriorityQueue
+    # Initializes the priority queue.
+    #
+    # @example
+    #   MapReduce::PriorityQueue.new
+    def initialize
+      @queue = MinPriorityQueue.new
+      @sequence_number = 0
+    end
+    # Adds a new item to the priority queue while the key is used for sorting.
+    # The object and key can basically be everything, but the key must be some
+    # comparable object.
+    #
+    # @param object The object to add to the priority queue.
+    # @param key The key to use for sorting.
+    #
+    # @example
+    #   priority_queue = MapReduce::PriorityQueue.new
+    #   priority_queue.push("some object", "some key")
+    def push(object, key)
+      @queue.push([@sequence_number, object], key)
+      @sequence_number += 1
+    end
+    # Pops the min item from the queue.
+    #
+    # @returns The popped object.
+    #
+    # @example
+    #   priority_queue = MapReduce::PriorityQueue.new
+    #   priority_queue.push("object1", "key1")
+    #   priority_queue.push("object2", "key2")
+    #   priority_queue.pop
+    def pop
+      _, object = @queue.pop
+      object
+    end
+  end
+end

data/lib/map_reduce/reduceable.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module MapReduce
+  # The MapReduce::Reduceable mixin allows to reduce an arbitrary chunk using
+  # the specified map-reduce implementation.
+  module Reduceable
+    private
+    # Reduces the specified chunk, e.g. some enumerable, using the specified
+    # map-reduce implementation using a lookahead of one to detect key changes.
+    # The reduce implementation is called up until a key change is detected,
+    # because the key change signals that the reduce operation is finished for
+    # the particular key, such that it will then be yielded.
+    #
+    # @param chunk The chunk to be reduced. Can e.g. be some enumerable.
+    # @param implementation The map-reduce implementation.
+    def reduce_chunk(chunk, implementation)
+      return enum_for(:reduce_chunk, chunk, implementation) unless block_given?
+      last_item = chunk.inject do |prev_item, cur_item|
+        prev_key = prev_item[0]
+        # Here we can compare without serializing the keys to json first,
+        # because we reduce a chunk which includes a deserialization step.
+        if prev_key == cur_item[0]
+          [prev_key, implementation.reduce(prev_key, prev_item[1], cur_item[1])]
+        else
+          yield(prev_item)
+          cur_item
+        end
+      end
+      yield(last_item) if last_item
+    end
+  end
+end

data/lib/map_reduce/reducer.rb ADDED Viewed

@@ -0,0 +1,107 @@
+module MapReduce
+  # The MapReduce::Reducer class runs the reducer part of your map-reduce job.
+  class Reducer
+    include Mergeable
+    include Reduceable
+    include MonitorMixin
+    class InvalidChunkLimit < StandardError; end
+    # Initializes a new reducer.
+    #
+    # @param implementation Your map-reduce implementation, i.e. an object
+    #   which responds to #map and #reduce.
+    #
+    # @example
+    #   MapReduce::Reducer.new(MyImplementation.new)
+    def initialize(implementation)
+      super()
+      @implementation = implementation
+      @temp_paths ||= []
+    end
+    # Adds a chunk from the mapper-phase to the reducer by registering a
+    # tempfile and returning the path to that tempfile, such that you can
+    # download a chunk e.g. from s3 and write the content to this tempfile.
+    #
+    # @returns [String] The path to a tempfile.
+    #
+    # @example
+    #   chunk_path = reducer.add_chunk
+    #   File.write(chunk_path, "downloaded blob")
+    def add_chunk
+      temp_path = TempPath.new
+      synchronize do
+        @temp_paths.push(temp_path)
+      end
+      temp_path.path
+    end
+    # Performs a k-way-merge of the added chunks and yields the reduced
+    # key-value pairs. It performs multiple runs when more than `chunk_limit`
+    # chunks exist. A run means: it takes up to `chunk_limit` chunks,
+    # reduces them and pushes the result as a new chunk. At the end it
+    # removes all tempfiles, even if errors occur.
+    #
+    # @param chunk_limit [Integer] The maximum number of files to process
+    #   during a single run. Most useful when you run on a system where the
+    #   number of open file descriptors is limited. If your number of file
+    #   descriptors is unlimited, you want to set it to a higher number to
+    #   avoid the overhead of multiple runs.
+    #
+    # @example
+    #   reducer = MapReduce::Reducer.new(MyImplementation.new)
+    #
+    #   chunk1_path = reducer.add_chunk
+    #   # write data to the file
+    #
+    #   chunk2_path = reducer.add_chunk
+    #   # write data to the file
+    #
+    #   reducer.reduce(chunk_limit: 32) do |key, value|
+    #     # ...
+    #   end
+    def reduce(chunk_limit:, &block)
+      return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
+      raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
+      begin
+        loop do
+          slice = @temp_paths.shift(chunk_limit)
+          files = slice.select { |temp_path| File.exist?(temp_path.path) }
+                       .map { |temp_path| File.open(temp_path.path, "r") }
+          begin
+            if @temp_paths.empty?
+              reduce_chunk(k_way_merge(files), @implementation).each do |pair|
+                block.call(pair)
+              end
+              return
+            end
+            File.open(add_chunk, "w") do |file|
+              reduce_chunk(k_way_merge(files), @implementation).each do |pair|
+                file.puts JSON.generate(pair)
+              end
+            end
+          ensure
+            files.each(&:close)
+            slice.each(&:delete)
+          end
+        end
+      ensure
+        @temp_paths.each(&:delete)
+      end
+    end
+  end
+end

data/lib/map_reduce/temp_path.rb ADDED Viewed

@@ -0,0 +1,42 @@
+module MapReduce
+  # The MapReduce::TempPath generates a tempfile path and automatically deletes
+  # the file when the object is garbage collected or manually deleted. Using
+  # this class instead of Tempfile allows to have less open file descriptors.
+  class TempPath
+    attr_reader :path
+    # Initializes a new tempfile path.
+    #
+    # @example
+    #   temp_path = MapReduce::TempPath.new
+    #   File.write(temp_path.path, "blob")
+    def initialize
+      @path = Dir::Tmpname.create("") do
+        # nothing
+      end
+      FileUtils.touch(@path)
+      ObjectSpace.define_finalizer(self, self.class.finalize(@path))
+    end
+    # @api private
+    def self.finalize(path)
+      proc { FileUtils.rm_f(path) }
+    end
+    # Allows to manually delete the tempfile.
+    #
+    # @example
+    #   temp_path = MapReduce::TempPath.new
+    #   File.write(temp_path.path, "blob")
+    #   temp_path.delete
+    def delete
+      FileUtils.rm_f(path)
+    end
+  end
+end

data/lib/map_reduce/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module MapReduce
+  # The version of the gem, which is read by map-reduce-ruby.gemspec.
+  VERSION = "1.0.0"
+end

data/map-reduce-ruby.gemspec ADDED Viewed

@@ -0,0 +1,38 @@
+require_relative "lib/map_reduce/version"
+Gem::Specification.new do |spec|
+  spec.name          = "map-reduce-ruby"
+  spec.version       = MapReduce::VERSION
+  spec.authors       = ["Benjamin Vetter"]
+  spec.email         = ["vetter@flakks.com"]
+  spec.summary       = "The easiest way to write distributed, larger than memory map-reduce jobs"
+  spec.description   = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
+                       "than memory map-reduce jobs"
+  spec.homepage      = "https://github.com/mrkamel/map-reduce-ruby"
+  spec.license       = "MIT"
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = "https://github.com/mrkamel/map-reduce-ruby"
+  spec.metadata["changelog_uri"] = "https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md"
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
+  end
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "rubocop"
+  spec.add_dependency "json"
+  spec.add_dependency "lazy_priority_queue"
+  # For more information and examples about making a new gem, checkout our
+  # guide at: https://bundler.io/guides/creating_gem.html
+end

metadata ADDED Viewed

@@ -0,0 +1,126 @@
+--- !ruby/object:Gem::Specification
+name: map-reduce-ruby
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Benjamin Vetter
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2021-07-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: lazy_priority_queue
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: The MapReduce gem is the easiest way to write custom, distributed, larger
+  than memory map-reduce jobs
+email:
+- vetter@flakks.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".github/workflows/test.yml"
+- ".gitignore"
+- ".rspec"
+- ".rubocop.yml"
+- CHANGELOG.md
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/map-reduce-ruby.rb
+- lib/map_reduce.rb
+- lib/map_reduce/hash_partitioner.rb
+- lib/map_reduce/mapper.rb
+- lib/map_reduce/mergeable.rb
+- lib/map_reduce/priority_queue.rb
+- lib/map_reduce/reduceable.rb
+- lib/map_reduce/reducer.rb
+- lib/map_reduce/temp_path.rb
+- lib/map_reduce/version.rb
+- map-reduce-ruby.gemspec
+homepage: https://github.com/mrkamel/map-reduce-ruby
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/mrkamel/map-reduce-ruby
+  source_code_uri: https://github.com/mrkamel/map-reduce-ruby
+  changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.5.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.0.3
+signing_key:
+specification_version: 4
+summary: The easiest way to write distributed, larger than memory map-reduce jobs
+test_files: []