map-reduce-ruby 2.1.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5545309a188291db41e8f5fc24af45a8d983c5084f2233d735cab309921c928c
4
- data.tar.gz: 779a839704ace3780a304bc7295c8b1f27e834253ee0544e9ff6ae21eda93753
3
+ metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
4
+ data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
5
5
  SHA512:
6
- metadata.gz: eb577347b7e5c09dd34166e814fc0c50180b6f036fad84de3857dc93d28242be81637d5b0c7f19ea7a846c659eca0c12fcbae01cdac37c4f2bf50c6d9f8f27f6
7
- data.tar.gz: 704b5d6a140583099c53902ceaab5af7b45c41c004f159fc215a942a4749063bf0990c2d60cde15af9663da726bab03a1258e085c0e789470150ab96caf895f7
6
+ metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
7
+ data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
data/.rubocop.yml CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
45
45
  EnforcedStyle: double_quotes
46
46
 
47
47
  Layout/LineLength:
48
- Max: 120
48
+ Max: 250
49
49
 
50
50
  Style/FrozenStringLiteralComment:
51
51
  EnforcedStyle: never
@@ -55,3 +55,6 @@ Style/ObjectThen:
55
55
 
56
56
  Gemspec/RequireMFA:
57
57
  Enabled: false
58
+
59
+ Style/HashTransformValues:
60
+ Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,20 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v3.0.0
4
+
5
+ * [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
6
+ pairs, which e.g. allows uploading the files in parallel
7
+ * [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
8
+ allows further limiting the maximum number of open file descriptors
9
+ * [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
10
+ no block is given
11
+ * [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
12
+ `MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error` being the
13
+ base class for all errors
14
+ * `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
15
+ it writes them one after another to further strictly reduce the number of
16
+ open file descriptors.
17
+
3
18
  ## v2.1.1
4
19
 
5
20
  * Fix in `MapReduce::Mapper` when no `reduce` implementation is given
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- map-reduce-ruby (2.1.1)
4
+ map-reduce-ruby (3.0.0)
5
5
  json
6
6
  lazy_priority_queue
7
7
 
data/README.md CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
57
57
  mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
58
58
  mapper.map(url)
59
59
 
60
- mapper.shuffle do |partition, tempfile|
61
- # store content of tempfile e.g. on s3:
62
- bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
60
+ mapper.shuffle(chunk_limit: 64) do |partitions|
61
+ partitions.each do |partition, path|
62
+ # store content of the tempfile located at path e.g. on s3:
63
+ bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
64
+ end
63
65
  end
64
66
  end
65
67
  end
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
205
207
  MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
206
208
  ```
207
209
 
210
+ ## Semantic Versioning
211
+
212
+ MapReduce is using Semantic Versioning: [SemVer](http://semver.org/)
213
+
208
214
  ## Development
209
215
 
210
216
  After checking out the repo, run `bin/setup` to install dependencies. Then, run
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
224
230
 
225
231
  ## License
226
232
 
227
- The gem is available as open source under the terms of the [MIT
228
- License](https://opensource.org/licenses/MIT).
233
+ The gem is available as open source under the terms of the
234
+ [MIT License](https://opensource.org/licenses/MIT).
@@ -6,8 +6,6 @@ module MapReduce
6
6
  include Reduceable
7
7
  include MonitorMixin
8
8
 
9
- attr_reader :partitions
10
-
11
9
  # Initializes a new mapper.
12
10
  #
13
11
  # @param implementation Your map-reduce implementation, i.e. an object
@@ -45,9 +43,11 @@ module MapReduce
45
43
  def map(*args, **kwargs)
46
44
  @implementation.map(*args, **kwargs) do |new_key, new_value|
47
45
  synchronize do
48
- @buffer.push([new_key, new_value])
46
+ partition = @partitioner.call(new_key)
47
+ item = [[partition, new_key], new_value]
49
48
 
50
- @buffer_size += JSON.generate([new_key, new_value]).bytesize
49
+ @buffer.push(item)
50
+ @buffer_size += JSON.generate(item).bytesize
51
51
 
52
52
  write_chunk if @buffer_size >= @memory_limit
53
53
  end
@@ -55,62 +55,86 @@ module MapReduce
55
55
  end
56
56
 
57
57
  # Performs a k-way-merge of the sorted chunks written to tempfiles while
58
- # already reducing the result using your map-reduce implementation and
59
- # splitting the dataset into partitions. Finally yields each partition with
60
- # the tempfile containing the data of the partition.
58
+ # already reducing the result using your map-reduce implementation (if
59
+ # available) and splitting the dataset into partitions. Finally yields a
60
+ # hash of (partition, path) pairs containing the data for the partitions
61
+ # in tempfiles.
62
+ #
63
+ # @param chunk_limit [Integer] The maximum number of files to process
64
+ # at the same time. Most useful when you run on a system where the
65
+ # number of open file descriptors is limited. If your number of file
66
+ # descriptors is unlimited, you want to set it to a higher number to
67
+ # avoid the overhead of multiple runs.
61
68
  #
62
69
  # @example
63
- # mapper.shuffle do |partition, tempfile|
64
- # # store data e.g. on s3
70
+ # mapper.shuffle do |partitions|
71
+ # partitions.each do |partition, path|
72
+ # # store data e.g. on s3
73
+ # end
65
74
  # end
66
75
 
67
- def shuffle(&block)
68
- return enum_for(:shuffle) unless block_given?
76
+ def shuffle(chunk_limit:)
77
+ raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
69
78
 
70
- write_chunk if @buffer_size > 0
79
+ begin
80
+ write_chunk if @buffer_size > 0
71
81
 
72
- partitions = {}
82
+ chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
83
+ chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
73
84
 
74
- chunk = k_way_merge(@chunks)
75
- chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
85
+ partitions = split_chunk(chunk)
76
86
 
77
- chunk.each do |pair|
78
- partition = @partitioner.call(pair[0])
87
+ yield(partitions.transform_values(&:path))
88
+ ensure
89
+ partitions.each_value(&:delete)
79
90
 
80
- (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
91
+ @chunks.each(&:delete)
92
+ @chunks = []
81
93
  end
82
94
 
83
- @chunks.each { |tempfile| tempfile.close(true) }
84
- @chunks = []
95
+ nil
96
+ end
97
+
98
+ private
85
99
 
86
- partitions.each_value(&:rewind)
100
+ def split_chunk(chunk)
101
+ res = {}
102
+ current_partition = nil
103
+ file = nil
87
104
 
88
- partitions.each do |partition, tempfile|
89
- block.call(partition, tempfile)
105
+ chunk.each do |((new_partition, key), value)|
106
+ if new_partition != current_partition
107
+ file&.close
108
+
109
+ current_partition = new_partition
110
+ temp_path = TempPath.new
111
+ res[new_partition] = temp_path
112
+ file = File.open(temp_path.path, "w+")
113
+ end
114
+
115
+ file.puts(JSON.generate([key, value]))
90
116
  end
91
117
 
92
- partitions.each_value { |tempfile| tempfile.close(true) }
118
+ file&.close
93
119
 
94
- nil
120
+ res
95
121
  end
96
122
 
97
- private
98
-
99
123
  def write_chunk
100
- tempfile = Tempfile.new
124
+ temp_path = TempPath.new
101
125
 
102
126
  @buffer.sort_by!(&:first)
103
127
 
104
128
  chunk = @buffer
105
129
  chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
106
130
 
107
- chunk.each do |pair|
108
- tempfile.puts JSON.generate(pair)
131
+ File.open(temp_path.path, "w+") do |file|
132
+ chunk.each do |pair|
133
+ file.puts JSON.generate(pair)
134
+ end
109
135
  end
110
136
 
111
- tempfile.rewind
112
-
113
- @chunks.push(tempfile)
137
+ @chunks.push(temp_path)
114
138
 
115
139
  @buffer_size = 0
116
140
  @buffer = []
@@ -5,20 +5,62 @@ module MapReduce
5
5
  module Mergeable
6
6
  private
7
7
 
8
- # Performs the k-way-merge of the passed files using a priority queue using
9
- # a binomial heap. The content of the passed files needs to be sorted. It
10
- # starts by reading one item of each file and adding it to the priority
11
- # queue. Afterwards, it continously pops an item from the queue, yields it
12
- # and reads a new item from the file the popped item belongs to, adding the
13
- # read item to the queue. This continues up until all items from the files
14
- # have been read. This guarantees that the yielded key-value pairs are
15
- # sorted without having all items in-memory.
8
+ # Performs the k-way-merge of the passed files referenced by the temp paths
9
+ # using a priority queue based on a binomial heap. The content of the passed
10
+ # files needs to be sorted. It starts by reading one item of each file and
11
+ # adding it to the priority queue. Afterwards, it continuously pops an item
12
+ # from the queue, yields it and reads a new item from the file the popped
13
+ # item belongs to, adding the read item to the queue. This continues up
14
+ # until all items from the files have been read. This guarantees that the
15
+ # yielded key-value pairs are sorted without having all items in-memory.
16
16
  #
17
- # @param files [IO, Tempfile] The files to run the k-way-merge for. The
18
- # content of the files must be sorted.
17
+ # @param temp_paths [TempPath] The files referenced by the temp paths to
18
+ # run the k-way-merge for. The content of the files must be sorted.
19
+ # @param chunk_limit [Integer] The maximum number of files to process
20
+ # at the same time. Most useful when you run on a system where the
21
+ # number of open file descriptors is limited. If your number of file
22
+ # descriptors is unlimited, you want to set it to a higher number to
23
+ # avoid the overhead of multiple runs.
19
24
 
20
- def k_way_merge(files)
21
- return enum_for(:k_way_merge, files) unless block_given?
25
+ def k_way_merge(temp_paths, chunk_limit:, &block)
26
+ return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
27
+
28
+ dupped_temp_paths = temp_paths.dup
29
+ additional_temp_paths = []
30
+
31
+ while dupped_temp_paths.size > chunk_limit
32
+ temp_path_out = TempPath.new
33
+
34
+ File.open(temp_path_out.path, "w+") do |file|
35
+ files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
36
+
37
+ k_way_merge!(files) do |pair|
38
+ file.puts(JSON.generate(pair))
39
+ end
40
+
41
+ files.each(&:close)
42
+ end
43
+
44
+ dupped_temp_paths.push(temp_path_out)
45
+ additional_temp_paths.push(temp_path_out)
46
+ end
47
+
48
+ files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
49
+ k_way_merge!(files, &block)
50
+ files.each(&:close)
51
+
52
+ nil
53
+ ensure
54
+ additional_temp_paths&.each(&:delete)
55
+ end
56
+
57
+ # Performs the actual k-way-merge of the specified files.
58
+ #
59
+ # @param files [IO, Tempfile] The files to run the k-way-merge for.
60
+ # The content of the files must be sorted.
61
+
62
+ def k_way_merge!(files)
63
+ return enum_for(__method__, files) unless block_given?
22
64
 
23
65
  if files.size == 1
24
66
  files.first.each_line do |line|
@@ -6,8 +6,6 @@ module MapReduce
6
6
  include Reduceable
7
7
  include MonitorMixin
8
8
 
9
- class InvalidChunkLimit < StandardError; end
10
-
11
9
  # Initializes a new reducer.
12
10
  #
13
11
  # @param implementation Your map-reduce implementation, i.e. an object
@@ -70,38 +68,36 @@ module MapReduce
70
68
  # end
71
69
 
72
70
  def reduce(chunk_limit:, &block)
73
- return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
71
+ return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
74
72
 
75
73
  raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
76
74
 
77
75
  begin
78
76
  loop do
79
77
  slice = @temp_paths.shift(chunk_limit)
80
- files = slice.select { |temp_path| File.exist?(temp_path.path) }
81
- .map { |temp_path| File.open(temp_path.path, "r") }
82
-
83
- begin
84
- if @temp_paths.empty?
85
- reduce_chunk(k_way_merge(files), @implementation).each do |pair|
86
- block.call(pair)
87
- end
88
78
 
89
- return
79
+ if @temp_paths.empty?
80
+ reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
81
+ block.call(pair)
90
82
  end
91
83
 
92
- File.open(add_chunk, "w") do |file|
93
- reduce_chunk(k_way_merge(files), @implementation).each do |pair|
94
- file.puts JSON.generate(pair)
95
- end
84
+ return
85
+ end
86
+
87
+ File.open(add_chunk, "w+") do |file|
88
+ reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
89
+ file.puts JSON.generate(pair)
96
90
  end
97
- ensure
98
- files.each(&:close)
99
- slice.each(&:delete)
100
91
  end
92
+ ensure
93
+ slice&.each(&:delete)
101
94
  end
102
95
  ensure
103
96
  @temp_paths.each(&:delete)
97
+ @temp_paths = []
104
98
  end
99
+
100
+ nil
105
101
  end
106
102
  end
107
103
  end
@@ -1,3 +1,3 @@
1
1
  module MapReduce
2
- VERSION = "2.1.1"
2
+ VERSION = "3.0.0"
3
3
  end
data/lib/map_reduce.rb CHANGED
@@ -13,4 +13,7 @@ require "map_reduce/hash_partitioner"
13
13
  require "map_reduce/mapper"
14
14
  require "map_reduce/reducer"
15
15
 
16
- module MapReduce; end
16
+ module MapReduce
17
+ class Error < StandardError; end
18
+ class InvalidChunkLimit < Error; end
19
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: map-reduce-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-24 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec