map-reduce-ruby 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -1
- data/CHANGELOG.md +19 -0
- data/Gemfile.lock +1 -1
- data/README.md +11 -5
- data/lib/map_reduce/mapper.rb +60 -33
- data/lib/map_reduce/mergeable.rb +54 -12
- data/lib/map_reduce/reducer.rb +15 -19
- data/lib/map_reduce/version.rb +1 -1
- data/lib/map_reduce.rb +4 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
|
4
|
+
data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
|
7
|
+
data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
|
data/.rubocop.yml
CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
|
|
45
45
|
EnforcedStyle: double_quotes
|
46
46
|
|
47
47
|
Layout/LineLength:
|
48
|
-
Max:
|
48
|
+
Max: 250
|
49
49
|
|
50
50
|
Style/FrozenStringLiteralComment:
|
51
51
|
EnforcedStyle: never
|
@@ -55,3 +55,6 @@ Style/ObjectThen:
|
|
55
55
|
|
56
56
|
Gemspec/RequireMFA:
|
57
57
|
Enabled: false
|
58
|
+
|
59
|
+
Style/HashTransformValues:
|
60
|
+
Enabled: false
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,24 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v3.0.0
|
4
|
+
|
5
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
|
6
|
+
pairs, which e.g. allows uploading the files in parallel
|
7
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
|
8
|
+
allows further limiting the maximum number of open file descriptors
|
9
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
|
10
|
+
no block is given
|
11
|
+
* [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
|
12
|
+
`MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error` being the
|
13
|
+
base class for all errors
|
14
|
+
* `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
|
15
|
+
it writes them one after another to further strictly reduce the number of
|
16
|
+
open file descriptors.
|
17
|
+
|
18
|
+
## v2.1.1
|
19
|
+
|
20
|
+
* Fix in `MapReduce::Mapper` when no `reduce` implementation is given
|
21
|
+
|
3
22
|
## v2.1.0
|
4
23
|
|
5
24
|
* Do not reduce in `MapReduce::Mapper` when no `reduce` implementation is given
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
|
|
57
57
|
mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
|
58
58
|
mapper.map(url)
|
59
59
|
|
60
|
-
mapper.shuffle do |
|
61
|
-
|
62
|
-
|
60
|
+
mapper.shuffle(chunk_limit: 64) do |partitions|
|
61
|
+
partitions.each do |partition, path|
|
62
|
+
# store content of the tempfile located at path e.g. on s3:
|
63
|
+
bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
|
64
|
+
end
|
63
65
|
end
|
64
66
|
end
|
65
67
|
end
|
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
|
|
205
207
|
MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
|
206
208
|
```
|
207
209
|
|
210
|
+
## Semantic Versioning
|
211
|
+
|
212
|
+
MapReduce uses Semantic Versioning: [SemVer](http://semver.org/)
|
213
|
+
|
208
214
|
## Development
|
209
215
|
|
210
216
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
|
|
224
230
|
|
225
231
|
## License
|
226
232
|
|
227
|
-
The gem is available as open source under the terms of the
|
228
|
-
License](https://opensource.org/licenses/MIT).
|
233
|
+
The gem is available as open source under the terms of the
|
234
|
+
[MIT License](https://opensource.org/licenses/MIT).
|
data/lib/map_reduce/mapper.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
attr_reader :partitions
|
10
|
-
|
11
9
|
# Initializes a new mapper.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -45,9 +43,11 @@ module MapReduce
|
|
45
43
|
def map(*args, **kwargs)
|
46
44
|
@implementation.map(*args, **kwargs) do |new_key, new_value|
|
47
45
|
synchronize do
|
48
|
-
@
|
46
|
+
partition = @partitioner.call(new_key)
|
47
|
+
item = [[partition, new_key], new_value]
|
49
48
|
|
50
|
-
@
|
49
|
+
@buffer.push(item)
|
50
|
+
@buffer_size += JSON.generate(item).bytesize
|
51
51
|
|
52
52
|
write_chunk if @buffer_size >= @memory_limit
|
53
53
|
end
|
@@ -55,59 +55,86 @@ module MapReduce
|
|
55
55
|
end
|
56
56
|
|
57
57
|
# Performs a k-way-merge of the sorted chunks written to tempfiles while
|
58
|
-
# already reducing the result using your map-reduce implementation
|
59
|
-
# splitting the dataset into partitions. Finally yields
|
60
|
-
#
|
58
|
+
# already reducing the result using your map-reduce implementation (if
|
59
|
+
# available) and splitting the dataset into partitions. Finally yields a
|
60
|
+
# hash of (partition, path) pairs containing the data for the partitions
|
61
|
+
# in tempfiles.
|
62
|
+
#
|
63
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
64
|
+
# at the same time. Most useful when you run on a system where the
|
65
|
+
# number of open file descriptors is limited. If your number of file
|
66
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
67
|
+
# avoid the overhead of multiple runs.
|
61
68
|
#
|
62
69
|
# @example
|
63
|
-
# mapper.shuffle do |
|
64
|
-
#
|
70
|
+
# mapper.shuffle do |partitions|
|
71
|
+
# partitions.each do |partition, path|
|
72
|
+
# # store data e.g. on s3
|
73
|
+
# end
|
65
74
|
# end
|
66
75
|
|
67
|
-
def shuffle(
|
68
|
-
|
76
|
+
def shuffle(chunk_limit:)
|
77
|
+
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
69
78
|
|
70
|
-
|
79
|
+
begin
|
80
|
+
write_chunk if @buffer_size > 0
|
71
81
|
|
72
|
-
|
82
|
+
chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
|
83
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
73
84
|
|
74
|
-
|
75
|
-
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
85
|
+
partitions = split_chunk(chunk)
|
76
86
|
|
77
|
-
|
78
|
-
|
87
|
+
yield(partitions.transform_values(&:path))
|
88
|
+
ensure
|
89
|
+
partitions.each_value(&:delete)
|
79
90
|
|
80
|
-
|
91
|
+
@chunks.each(&:delete)
|
92
|
+
@chunks = []
|
81
93
|
end
|
82
94
|
|
83
|
-
|
84
|
-
|
95
|
+
nil
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
def split_chunk(chunk)
|
101
|
+
res = {}
|
102
|
+
current_partition = nil
|
103
|
+
file = nil
|
85
104
|
|
86
|
-
|
105
|
+
chunk.each do |((new_partition, key), value)|
|
106
|
+
if new_partition != current_partition
|
107
|
+
file&.close
|
87
108
|
|
88
|
-
|
89
|
-
|
109
|
+
current_partition = new_partition
|
110
|
+
temp_path = TempPath.new
|
111
|
+
res[new_partition] = temp_path
|
112
|
+
file = File.open(temp_path.path, "w+")
|
113
|
+
end
|
114
|
+
|
115
|
+
file.puts(JSON.generate([key, value]))
|
90
116
|
end
|
91
117
|
|
92
|
-
|
118
|
+
file&.close
|
93
119
|
|
94
|
-
|
120
|
+
res
|
95
121
|
end
|
96
122
|
|
97
|
-
private
|
98
|
-
|
99
123
|
def write_chunk
|
100
|
-
|
124
|
+
temp_path = TempPath.new
|
101
125
|
|
102
126
|
@buffer.sort_by!(&:first)
|
103
127
|
|
104
|
-
|
105
|
-
|
106
|
-
end
|
128
|
+
chunk = @buffer
|
129
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
107
130
|
|
108
|
-
|
131
|
+
File.open(temp_path.path, "w+") do |file|
|
132
|
+
chunk.each do |pair|
|
133
|
+
file.puts JSON.generate(pair)
|
134
|
+
end
|
135
|
+
end
|
109
136
|
|
110
|
-
@chunks.push(
|
137
|
+
@chunks.push(temp_path)
|
111
138
|
|
112
139
|
@buffer_size = 0
|
113
140
|
@buffer = []
|
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -5,20 +5,62 @@ module MapReduce
|
|
5
5
|
module Mergeable
|
6
6
|
private
|
7
7
|
|
8
|
-
# Performs the k-way-merge of the passed files
|
9
|
-
# a binomial heap. The content of the passed
|
10
|
-
# starts by reading one item of each file and
|
11
|
-
# queue. Afterwards, it continously pops an item
|
12
|
-
# and reads a new item from the file the popped
|
13
|
-
# read item to the queue. This continues up
|
14
|
-
# have been read. This guarantees that the
|
15
|
-
# sorted without having all items in-memory.
|
8
|
+
# Performs the k-way-merge of the passed files referenced by the temp paths
|
9
|
+
# using a priority queue using a binomial heap. The content of the passed
|
10
|
+
# files needs to be sorted. It starts by reading one item of each file and
|
11
|
+
# adding it to the priority queue. Afterwards, it continuously pops an item
|
12
|
+
# from the queue, yields it and reads a new item from the file the popped
|
13
|
+
# item belongs to, adding the read item to the queue. This continues up
|
14
|
+
# until all items from the files have been read. This guarantees that the
|
15
|
+
# yielded key-value pairs are sorted without having all items in-memory.
|
16
16
|
#
|
17
|
-
# @param
|
18
|
-
# content of the files must be sorted.
|
17
|
+
# @param temp_paths [TempPath] The files referenced by the temp paths to
|
18
|
+
# run the k-way-merge for. The content of the files must be sorted.
|
19
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
20
|
+
# at the same time. Most useful when you run on a system where the
|
21
|
+
# number of open file descriptors is limited. If your number of file
|
22
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
23
|
+
# avoid the overhead of multiple runs.
|
19
24
|
|
20
|
-
def k_way_merge(
|
21
|
-
return enum_for(
|
25
|
+
def k_way_merge(temp_paths, chunk_limit:, &block)
|
26
|
+
return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
|
27
|
+
|
28
|
+
dupped_temp_paths = temp_paths.dup
|
29
|
+
additional_temp_paths = []
|
30
|
+
|
31
|
+
while dupped_temp_paths.size > chunk_limit
|
32
|
+
temp_path_out = TempPath.new
|
33
|
+
|
34
|
+
File.open(temp_path_out.path, "w+") do |file|
|
35
|
+
files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
|
36
|
+
|
37
|
+
k_way_merge!(files) do |pair|
|
38
|
+
file.puts(JSON.generate(pair))
|
39
|
+
end
|
40
|
+
|
41
|
+
files.each(&:close)
|
42
|
+
end
|
43
|
+
|
44
|
+
dupped_temp_paths.push(temp_path_out)
|
45
|
+
additional_temp_paths.push(temp_path_out)
|
46
|
+
end
|
47
|
+
|
48
|
+
files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
|
49
|
+
k_way_merge!(files, &block)
|
50
|
+
files.each(&:close)
|
51
|
+
|
52
|
+
nil
|
53
|
+
ensure
|
54
|
+
additional_temp_paths&.each(&:delete)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Performs the actual k-way-merge of the specified files.
|
58
|
+
#
|
59
|
+
# @param files [IO, Tempfile] The files to run the k-way-merge for.
|
60
|
+
# The content of the files must be sorted.
|
61
|
+
|
62
|
+
def k_way_merge!(files)
|
63
|
+
return enum_for(__method__, files) unless block_given?
|
22
64
|
|
23
65
|
if files.size == 1
|
24
66
|
files.first.each_line do |line|
|
data/lib/map_reduce/reducer.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
class InvalidChunkLimit < StandardError; end
|
10
|
-
|
11
9
|
# Initializes a new reducer.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -70,38 +68,36 @@ module MapReduce
|
|
70
68
|
# end
|
71
69
|
|
72
70
|
def reduce(chunk_limit:, &block)
|
73
|
-
return enum_for(
|
71
|
+
return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
|
74
72
|
|
75
73
|
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
76
74
|
|
77
75
|
begin
|
78
76
|
loop do
|
79
77
|
slice = @temp_paths.shift(chunk_limit)
|
80
|
-
files = slice.select { |temp_path| File.exist?(temp_path.path) }
|
81
|
-
.map { |temp_path| File.open(temp_path.path, "r") }
|
82
|
-
|
83
|
-
begin
|
84
|
-
if @temp_paths.empty?
|
85
|
-
reduce_chunk(k_way_merge(files), @implementation).each do |pair|
|
86
|
-
block.call(pair)
|
87
|
-
end
|
88
78
|
|
89
|
-
|
79
|
+
if @temp_paths.empty?
|
80
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
81
|
+
block.call(pair)
|
90
82
|
end
|
91
83
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
84
|
+
return
|
85
|
+
end
|
86
|
+
|
87
|
+
File.open(add_chunk, "w+") do |file|
|
88
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
89
|
+
file.puts JSON.generate(pair)
|
96
90
|
end
|
97
|
-
ensure
|
98
|
-
files.each(&:close)
|
99
|
-
slice.each(&:delete)
|
100
91
|
end
|
92
|
+
ensure
|
93
|
+
slice&.each(&:delete)
|
101
94
|
end
|
102
95
|
ensure
|
103
96
|
@temp_paths.each(&:delete)
|
97
|
+
@temp_paths = []
|
104
98
|
end
|
99
|
+
|
100
|
+
nil
|
105
101
|
end
|
106
102
|
end
|
107
103
|
end
|
data/lib/map_reduce/version.rb
CHANGED
data/lib/map_reduce.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|