RubyGems - map-reduce-ruby - Versions diffs - 1.0.0 → 2.1.0 - Mend

map-reduce-ruby 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/.rubocop.yml +6 -0
data/CHANGELOG.md +31 -0
data/Gemfile.lock +29 -28
data/README.md +24 -7
data/lib/map_reduce/mapper.rb +5 -2
data/lib/map_reduce/mergeable.rb +12 -2
data/lib/map_reduce/priority_queue.rb +23 -1
data/lib/map_reduce/version.rb +1 -1
data/map-reduce-ruby.gemspec +1 -1
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 19ac70d25d6622d30398997cae0a4d6730ebc13f8bbc31df896c5acf480923c4
-  data.tar.gz: ede2a8f736053a268175a7948324a15954475526d0589bf47416347b57a50740
+  metadata.gz: 4f3eeb7739b733f1abdf325ddfcf5a4c42fb257edd837952781246fcda9f48fb
+  data.tar.gz: f40eb08a341fc522c8f7043b74ac1c47fb00af491a107f51a6a76ca67f389629
 SHA512:
-  metadata.gz: 650125350b5f166ccc0fb1346c77dfa9f17471ac9ae9290b4cbc4731a5baec872a810551e0b15af064b37f73c7e103ab6281d48cd839f26d4aab6780d5cd66d1
-  data.tar.gz: 70d9f8668143cb507537e369ad0ea79d9bb9f2c33469dffccd5952116f09758a7a880831d81aa4e3b67bc548b4b36aafe321a8d553946d8734fb78c11cd1686e
+  metadata.gz: 6e91d7a1f55f0b89d333b317ecedc7978ca29a709dfad403906f6f2835b121a0a49afb29d03f55d8d7c8aa5c1b70628400404b7bf4d590046ae2e66ed48d7abc
+  data.tar.gz: cc4524aec895d935b70548163e5818f8725c5a05985e55f9b8d9330c34491968bb3ec54f02466ac0748eeb130f47a40d07baefa2627b04ed10dbac1c9638b1af

data/.rubocop.yml CHANGED Viewed

@@ -49,3 +49,9 @@ Layout/LineLength:
 Style/FrozenStringLiteralComment:
   EnforcedStyle: never
+Style/ObjectThen:
+  Enabled: false
+Gemspec/RequireMFA:
+  Enabled: false

data/CHANGELOG.md CHANGED Viewed

@@ -1 +1,32 @@
 # CHANGELOG
+## v2.1.0
+* Do not reduce in `MapReduce::Mapper` when no `reduce` implementation is given
+## v2.0.0
+* [BREAKING] Keys are no longer automatically converted to json before using
+  them for sorting
+  * This allows for a proper semantic sort order for numeric keys in addition
+    to just the clustering of keys
+  * Examples of valid keys: `"key"`, `["foo", 1.0]`, `["foo", ["bar"]]`
+  * Examples of problematic keys: `nil`, `true`, `["foo", nil]`, `{ "foo" => "bar" }`
+  * For migration purposes it is recommended to convert your keys to and from
+    json manually if you have complex keys using `JSON.generate`/`JSON.parse`:
+```ruby
+class WordCounter
+  def map(url)
+    HTTP.get(url).to_s.split.each do |word|
+      yield(JSON.generate("key" => word), 1) # if you use a hash for the key
+    end
+  end
+  def reduce(json_key, count1, count2)
+    key = JSON.parse(json_key) # if you want to access the original key
+    count1 + count2
+  end
+end
+```

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    map-reduce-ruby (1.0.0)
+    map-reduce-ruby (2.1.0)
       json
       lazy_priority_queue
@@ -9,41 +9,42 @@ GEM
   remote: https://rubygems.org/
   specs:
     ast (2.4.2)
-    diff-lcs (1.4.4)
-    json (2.5.1)
+    diff-lcs (1.5.0)
+    json (2.6.2)
     lazy_priority_queue (0.1.1)
-    parallel (1.20.1)
-    parser (3.0.0.0)
+    parallel (1.22.1)
+    parser (3.1.2.1)
       ast (~> 2.4.1)
-    rainbow (3.0.0)
-    regexp_parser (2.0.3)
-    rexml (3.2.4)
-    rspec (3.10.0)
-      rspec-core (~> 3.10.0)
-      rspec-expectations (~> 3.10.0)
-      rspec-mocks (~> 3.10.0)
-    rspec-core (3.10.1)
-      rspec-support (~> 3.10.0)
-    rspec-expectations (3.10.1)
+    rainbow (3.1.1)
+    regexp_parser (2.5.0)
+    rexml (3.2.5)
+    rspec (3.11.0)
+      rspec-core (~> 3.11.0)
+      rspec-expectations (~> 3.11.0)
+      rspec-mocks (~> 3.11.0)
+    rspec-core (3.11.0)
+      rspec-support (~> 3.11.0)
+    rspec-expectations (3.11.1)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.10.0)
-    rspec-mocks (3.10.1)
+      rspec-support (~> 3.11.0)
+    rspec-mocks (3.11.1)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.10.0)
-    rspec-support (3.10.1)
-    rubocop (0.93.1)
+      rspec-support (~> 3.11.0)
+    rspec-support (3.11.1)
+    rubocop (1.36.0)
+      json (~> 2.3)
       parallel (~> 1.10)
-      parser (>= 2.7.1.5)
+      parser (>= 3.1.2.1)
       rainbow (>= 2.2.2, < 4.0)
-      regexp_parser (>= 1.8)
-      rexml
-      rubocop-ast (>= 0.6.0)
+      regexp_parser (>= 1.8, < 3.0)
+      rexml (>= 3.2.5, < 4.0)
+      rubocop-ast (>= 1.20.1, < 2.0)
       ruby-progressbar (~> 1.7)
-      unicode-display_width (>= 1.4.0, < 2.0)
-    rubocop-ast (1.4.1)
-      parser (>= 2.7.1.5)
+      unicode-display_width (>= 1.4.0, < 3.0)
+    rubocop-ast (1.21.0)
+      parser (>= 3.1.1.0)
     ruby-progressbar (1.11.0)
-    unicode-display_width (1.7.0)
+    unicode-display_width (2.3.0)
 PLATFORMS
   ruby

data/README.md CHANGED Viewed

@@ -7,8 +7,7 @@ than memory map-reduce jobs by using your local disk and some arbitrary storage
 layer like s3. You can specify how much memory you are willing to offer and
 MapReduce will use its buffers accordingly. Finally, you can use your already
 existing background job system like `sidekiq` or one of its various
-alternatives. Finally, your keys and values can be everything that can be
-serialized as json.
+alternatives.
 ## Installation
@@ -30,9 +29,7 @@ Or install it yourself as:
 Any map-reduce job consists of an implementation of your `map` function, your
 `reduce` function and worker code. So let's start with an implementation for a
-word count map-reduce task which fetches txt documents from the web. Please
-note that your keys and values can be everything that can be serialized as
-json, but nothing else.
+word count map-reduce task which fetches txt documents from the web.
 ```ruby
 class WordCounter
@@ -68,8 +65,8 @@ class WordCountMapper
 end
 ```
-Please note that `MapReduce::HashPartitioner.new(16)` states that we want split
-the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
+Please note that `MapReduce::HashPartitioner.new(16)` states that we want to
+split the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
 worker code to run the reduce part:
 ```ruby
@@ -120,6 +117,26 @@ mappers are finished.
 That's it.
+## Limitations for Keys
+You have to make sure that your keys are properly sortable in ruby. Please
+note:
+```ruby
+"key" < nil # comparison of String with nil failed (ArgumentError)
+false < true # undefined method `<' for false:FalseClass (NoMethodError)
+1 > "key" # comparison of Integer with String failed (ArgumentError)
+{ "key" => "value1" } < { "key" => "value2" } #=> false
+{ "key" => "value1" } > { "key" => "value2" } #=> false
+{ "key" => "value1" } <=> { "key" => "value2" } #=> nil
+```
+For those reasons, it is recommended to only use strings, numbers and arrays or
+a combination of those.
 ## Internals
 To fully understand the performance details, the following outlines the inner

data/lib/map_reduce/mapper.rb CHANGED Viewed

@@ -71,7 +71,10 @@ module MapReduce
       partitions = {}
-      reduce_chunk(k_way_merge(@chunks), @implementation).each do |pair|
+      chunk = k_way_merge(@chunks)
+      chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
+      chunk.each do |pair|
         partition = @partitioner.call(pair[0])
         (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
@@ -96,7 +99,7 @@ module MapReduce
     def write_chunk
       tempfile = Tempfile.new
-      @buffer.sort_by! { |item| JSON.generate(item.first) }
+      @buffer.sort_by!(&:first)
       reduce_chunk(@buffer, @implementation).each do |pair|
         tempfile.puts JSON.generate(pair)

data/lib/map_reduce/mergeable.rb CHANGED Viewed

@@ -20,6 +20,16 @@ module MapReduce
     def k_way_merge(files)
       return enum_for(:k_way_merge, files) unless block_given?
+      if files.size == 1
+        files.first.each_line do |line|
+          yield(JSON.parse(line))
+        end
+        files.each(&:rewind)
+        return
+      end
       queue = PriorityQueue.new
       files.each_with_index do |file, index|
@@ -29,7 +39,7 @@ module MapReduce
         key, value = JSON.parse(line)
-        queue.push([key, value, index], JSON.generate(key))
+        queue.push([key, value, index], key)
       end
       loop do
@@ -45,7 +55,7 @@ module MapReduce
         key, value = JSON.parse(line)
-        queue.push([key, value, index], JSON.generate(key))
+        queue.push([key, value, index], key)
       end
       files.each(&:rewind)

data/lib/map_reduce/priority_queue.rb CHANGED Viewed

@@ -1,4 +1,26 @@
 module MapReduce
+  # Since LazyPriorityQueue is using <= and >=, but not <=>, it does not
+  # support sorting array keys. Therefore we wrap the keys in SortKey, which
+  # provides those operators. See https://bugs.ruby-lang.org/issues/5574
+  class SortKey
+    include Comparable
+    attr_reader :object
+    def initialize(object)
+      @object = object
+    end
+    def <=>(other)
+      res = object <=> other.object
+      raise(ArgumentError, "Unable to compare #{@object.inspect} with #{other.object.inspect}") if res.nil?
+      res
+    end
+  end
   # The MapReduce::PriorityQueue implements a min priority queue using a
   # binomial heap.
@@ -25,7 +47,7 @@ module MapReduce
     #   priority_queue.push("some object", "some key")
     def push(object, key)
-      @queue.push([@sequence_number, object], key)
+      @queue.push([@sequence_number, object], SortKey.new(key))
       @sequence_number += 1
     end

data/lib/map_reduce/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module MapReduce
-  VERSION = "1.0.0"
+  VERSION = "2.1.0"
 end

data/map-reduce-ruby.gemspec CHANGED Viewed

@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
   spec.email         = ["vetter@flakks.com"]
   spec.summary       = "The easiest way to write distributed, larger than memory map-reduce jobs"
-  spec.description   = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
+  spec.description   = "The MapReduce gem is the easiest way to write custom, distributed, larger " \
                        "than memory map-reduce jobs"
   spec.homepage      = "https://github.com/mrkamel/map-reduce-ruby"
   spec.license       = "MIT"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: map-reduce-ruby
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 2.1.0
 platform: ruby
 authors:
 - Benjamin Vetter
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-07-05 00:00:00.000000000 Z
+date: 2022-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -104,7 +104,7 @@ metadata:
   homepage_uri: https://github.com/mrkamel/map-reduce-ruby
   source_code_uri: https://github.com/mrkamel/map-reduce-ruby
   changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
-signing_key:
+rubygems_version: 3.3.3
+signing_key:
 specification_version: 4
 summary: The easiest way to write distributed, larger than memory map-reduce jobs
 test_files: []