map-reduce-ruby 2.1.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4f3eeb7739b733f1abdf325ddfcf5a4c42fb257edd837952781246fcda9f48fb
-  data.tar.gz: f40eb08a341fc522c8f7043b74ac1c47fb00af491a107f51a6a76ca67f389629
+  metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
+  data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
 SHA512:
-  metadata.gz: 6e91d7a1f55f0b89d333b317ecedc7978ca29a709dfad403906f6f2835b121a0a49afb29d03f55d8d7c8aa5c1b70628400404b7bf4d590046ae2e66ed48d7abc
-  data.tar.gz: cc4524aec895d935b70548163e5818f8725c5a05985e55f9b8d9330c34491968bb3ec54f02466ac0748eeb130f47a40d07baefa2627b04ed10dbac1c9638b1af
+  metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
+  data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
data/.rubocop.yml CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
 
 Layout/LineLength:
-  Max: 120
+  Max: 250
 
 Style/FrozenStringLiteralComment:
   EnforcedStyle: never
@@ -55,3 +55,6 @@ Style/ObjectThen:
 
 Gemspec/RequireMFA:
   Enabled: false
+
+Style/HashTransformValues:
+  Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
 # CHANGELOG
 
+## v3.0.0
+
+* [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
+  pairs, which e.g. allows to upload the files in parallel
+* [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
+  allows to further limit the maximum number of open file descriptors
+* [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
+  no block is given
+* [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
+  `MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error` being the
+  base class for all errors
+* `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
+  it writes them one after another to further strictly reduce the number of
+  open file descriptors.
+
+## v2.1.1
+
+* Fix in `MapReduce::Mapper` when no `reduce` implementation is given
+
 ## v2.1.0
 
 * Do not reduce in `MapReduce::Mapper` when no `reduce` implementation is given
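
Taken together, the v3.0.0 entries change the shuffle call site as follows; a minimal before/after sketch (the `mapper` variable and the `upload` helper are hypothetical, and `chunk_limit: 64` is an arbitrary choice):

```ruby
# 2.1.0: shuffle yielded each partition together with an open tempfile.
mapper.shuffle do |partition, tempfile|
  upload(partition, tempfile) # hypothetical upload helper
end

# 3.0.0: shuffle requires a chunk_limit and yields a single hash of
# (partition, path) pairs, so the files can be uploaded in parallel.
mapper.shuffle(chunk_limit: 64) do |partitions|
  partitions.map do |partition, path|
    Thread.new { upload(partition, File.open(path)) }
  end.each(&:join) # join inside the block; the tempfiles are deleted afterwards
end
```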
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    map-reduce-ruby (2.1.0)
+    map-reduce-ruby (3.0.0)
       json
       lazy_priority_queue
 
data/README.md CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
     mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
     mapper.map(url)
 
-    mapper.shuffle do |partition, tempfile|
-      # store content of tempfile e.g. on s3:
-      bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
+    mapper.shuffle(chunk_limit: 64) do |partitions|
+      partitions.each do |partition, path|
+        # store content of the tempfile located at path e.g. on s3:
+        bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
+      end
     end
   end
 end
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
 MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
 ```
 
+## Semantic Versioning
+
+MapReduce is using Semantic Versioning: [SemVer](http://semver.org/)
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
 
 ## License
 
-The gem is available as open source under the terms of the [MIT
-License](https://opensource.org/licenses/MIT).
+The gem is available as open source under the terms of the
+[MIT License](https://opensource.org/licenses/MIT).
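
The partitioner one-liner from the README can be exercised on its own; a small sketch (the sample keys are arbitrary):

```ruby
require "digest"
require "json"

# Hashes any JSON-serializable key into one of 8 partitions (0..7).
MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }

MyPartitioner.call("hello")          # => a stable integer in 0..7
MyPartitioner.call(["word", "fox"])  # composite keys work the same way
```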
data/lib/map_reduce/mapper.rb CHANGED
@@ -6,8 +6,6 @@ module MapReduce
     include Reduceable
     include MonitorMixin
 
-    attr_reader :partitions
-
     # Initializes a new mapper.
     #
     # @param implementation Your map-reduce implementation, i.e. an object
@@ -45,9 +43,11 @@ module MapReduce
     def map(*args, **kwargs)
       @implementation.map(*args, **kwargs) do |new_key, new_value|
         synchronize do
-          @buffer.push([new_key, new_value])
+          partition = @partitioner.call(new_key)
+          item = [[partition, new_key], new_value]
 
-          @buffer_size += JSON.generate([new_key, new_value]).bytesize
+          @buffer.push(item)
+          @buffer_size += JSON.generate(item).bytesize
 
           write_chunk if @buffer_size >= @memory_limit
         end
@@ -55,59 +55,86 @@ module MapReduce
     end
 
     # Performs a k-way-merge of the sorted chunks written to tempfiles while
-    # already reducing the result using your map-reduce implementation and
-    # splitting the dataset into partitions. Finally yields each partition with
-    # the tempfile containing the data of the partition.
+    # already reducing the result using your map-reduce implementation (if
+    # available) and splitting the dataset into partitions. Finally yields a
+    # hash of (partition, path) pairs containing the data for the partitions
+    # in tempfiles.
+    #
+    # @param chunk_limit [Integer] The maximum number of files to process
+    #   at the same time. Most useful when you run on a system where the
+    #   number of open file descriptors is limited. If your number of file
+    #   descriptors is unlimited, you want to set it to a higher number to
+    #   avoid the overhead of multiple runs.
     #
     # @example
-    #   mapper.shuffle do |partition, tempfile|
-    #     # store data e.g. on s3
+    #   mapper.shuffle do |partitions|
+    #     partitions.each do |partition, path|
+    #       # store data e.g. on s3
+    #     end
    #   end
 
-    def shuffle(&block)
-      return enum_for(:shuffle) unless block_given?
+    def shuffle(chunk_limit:)
+      raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
 
-      write_chunk if @buffer_size > 0
+      begin
+        write_chunk if @buffer_size > 0
 
-      partitions = {}
+        chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
+        chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
 
-      chunk = k_way_merge(@chunks)
-      chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
+        partitions = split_chunk(chunk)
 
-      chunk.each do |pair|
-        partition = @partitioner.call(pair[0])
+        yield(partitions.transform_values(&:path))
+      ensure
+        partitions.each_value(&:delete)
 
-        (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
+        @chunks.each(&:delete)
+        @chunks = []
       end
 
-      @chunks.each { |tempfile| tempfile.close(true) }
-      @chunks = []
+      nil
+    end
+
+    private
+
+    def split_chunk(chunk)
+      res = {}
+      current_partition = nil
+      file = nil
 
-      partitions.each_value(&:rewind)
+      chunk.each do |((new_partition, key), value)|
+        if new_partition != current_partition
+          file&.close
 
-      partitions.each do |partition, tempfile|
-        block.call(partition, tempfile)
+          current_partition = new_partition
+          temp_path = TempPath.new
+          res[new_partition] = temp_path
+          file = File.open(temp_path.path, "w+")
+        end
+
+        file.puts(JSON.generate([key, value]))
      end
 
-      partitions.each_value { |tempfile| tempfile.close(true) }
+      file&.close
 
-      nil
+      res
     end
 
-    private
-
     def write_chunk
-      tempfile = Tempfile.new
+      temp_path = TempPath.new
 
       @buffer.sort_by!(&:first)
 
-      reduce_chunk(@buffer, @implementation).each do |pair|
-        tempfile.puts JSON.generate(pair)
-      end
+      chunk = @buffer
+      chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
 
-      tempfile.rewind
+      File.open(temp_path.path, "w+") do |file|
+        chunk.each do |pair|
+          file.puts JSON.generate(pair)
+        end
+      end
 
-      @chunks.push(tempfile)
+      @chunks.push(temp_path)
 
       @buffer_size = 0
       @buffer = []
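
The reworked `map` buffers items as `[[partition, key], value]`, so after the k-way-merge the stream is sorted by partition first and `split_chunk` can write partition files strictly one after another. A simplified sketch of that grouping idea (plain arrays instead of `TempPath` files; illustrative only):

```ruby
require "json"

# Already sorted by [partition, key], as the merge guarantees.
chunk = [
  [[0, "apple"], 2],
  [[0, "pear"], 1],
  [[1, "plum"], 3]
]

partition_lines = Hash.new { |hash, partition| hash[partition] = [] }

chunk.each do |(partition, key), value|
  # A partition never reappears once the stream has moved past it, so the
  # real implementation can close the previous file before opening the next.
  partition_lines[partition] << JSON.generate([key, value])
end

partition_lines # => {0 => ['["apple",2]', '["pear",1]'], 1 => ['["plum",3]']}
```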
data/lib/map_reduce/mergeable.rb CHANGED
@@ -5,20 +5,62 @@ module MapReduce
   module Mergeable
     private
 
-    # Performs the k-way-merge of the passed files using a priority queue using
-    # a binomial heap. The content of the passed files needs to be sorted. It
-    # starts by reading one item of each file and adding it to the priority
-    # queue. Afterwards, it continously pops an item from the queue, yields it
-    # and reads a new item from the file the popped item belongs to, adding the
-    # read item to the queue. This continues up until all items from the files
-    # have been read. This guarantees that the yielded key-value pairs are
-    # sorted without having all items in-memory.
+    # Performs the k-way-merge of the passed files referenced by the temp paths
+    # using a priority queue using a binomial heap. The content of the passed
+    # files needs to be sorted. It starts by reading one item of each file and
+    # adding it to the priority queue. Afterwards, it continously pops an item
+    # from the queue, yields it and reads a new item from the file the popped
+    # item belongs to, adding the read item to the queue. This continues up
+    # until all items from the files have been read. This guarantees that the
+    # yielded key-value pairs are sorted without having all items in-memory.
     #
-    # @param files [IO, Tempfile] The files to run the k-way-merge for. The
-    #   content of the files must be sorted.
+    # @param temp_paths [TempPath] The files referenced by the temp paths to
+    #   run the k-way-merge for. The content of the files must be sorted.
+    # @param chunk_limit [Integer] The maximum number of files to process
+    #   at the same time. Most useful when you run on a system where the
+    #   number of open file descriptors is limited. If your number of file
+    #   descriptors is unlimited, you want to set it to a higher number to
+    #   avoid the overhead of multiple runs.
 
-    def k_way_merge(files)
-      return enum_for(:k_way_merge, files) unless block_given?
+    def k_way_merge(temp_paths, chunk_limit:, &block)
+      return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
+
+      dupped_temp_paths = temp_paths.dup
+      additional_temp_paths = []
+
+      while dupped_temp_paths.size > chunk_limit
+        temp_path_out = TempPath.new
+
+        File.open(temp_path_out.path, "w+") do |file|
+          files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
+
+          k_way_merge!(files) do |pair|
+            file.puts(JSON.generate(pair))
+          end
+
+          files.each(&:close)
+        end
+
+        dupped_temp_paths.push(temp_path_out)
+        additional_temp_paths.push(temp_path_out)
+      end
+
+      files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
+      k_way_merge!(files, &block)
+      files.each(&:close)
+
+      nil
+    ensure
+      additional_temp_paths&.each(&:delete)
+    end
+
+    # Performs the actual k-way-merge of the specified files.
+    #
+    # @param files [IO, Tempfile] The files to run the k-way-merge for.
+    #   The content of the files must be sorted.
+
+    def k_way_merge!(files)
+      return enum_for(__method__, files) unless block_given?
 
       if files.size == 1
         files.first.each_line do |line|
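
The `chunk_limit` in `k_way_merge` bounds open file descriptors by pre-merging in rounds: while more than `chunk_limit` inputs remain, the first `chunk_limit` of them are merged into one new sorted file, until a single final merge suffices. A simplified sketch of that round structure over sorted arrays (using `sort` as a stand-in for the gem's priority-queue merge):

```ruby
# Combine any number of sorted lists while touching at most
# chunk_limit of them per round.
def merge_in_rounds(sorted_lists, chunk_limit:)
  lists = sorted_lists.dup

  while lists.size > chunk_limit
    batch = lists.shift(chunk_limit)   # take one round's inputs
    lists.push(batch.flatten(1).sort)  # pre-merge them into one sorted list
  end

  lists.flatten(1).sort                # final merge over <= chunk_limit lists
end

merge_in_rounds([[1, 4], [2, 5], [3, 6]], chunk_limit: 2)
# => [1, 2, 3, 4, 5, 6]
```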
data/lib/map_reduce/reducer.rb CHANGED
@@ -6,8 +6,6 @@ module MapReduce
     include Reduceable
     include MonitorMixin
 
-    class InvalidChunkLimit < StandardError; end
-
     # Initializes a new reducer.
     #
     # @param implementation Your map-reduce implementation, i.e. an object
@@ -70,38 +68,36 @@ module MapReduce
     #   end
 
     def reduce(chunk_limit:, &block)
-      return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
+      return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
 
       raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
 
       begin
         loop do
           slice = @temp_paths.shift(chunk_limit)
-          files = slice.select { |temp_path| File.exist?(temp_path.path) }
-                       .map { |temp_path| File.open(temp_path.path, "r") }
-
-          begin
-            if @temp_paths.empty?
-              reduce_chunk(k_way_merge(files), @implementation).each do |pair|
-                block.call(pair)
-              end
 
-              return
+          if @temp_paths.empty?
+            reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
+              block.call(pair)
             end
 
-            File.open(add_chunk, "w") do |file|
-              reduce_chunk(k_way_merge(files), @implementation).each do |pair|
-                file.puts JSON.generate(pair)
-              end
+            return
+          end
+
+          File.open(add_chunk, "w+") do |file|
+            reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
+              file.puts JSON.generate(pair)
            end
-          ensure
-            files.each(&:close)
-            slice.each(&:delete)
          end
+        ensure
+          slice&.each(&:delete)
        end
      ensure
        @temp_paths.each(&:delete)
+        @temp_paths = []
      end
+
+      nil
    end
  end
end
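
The reducer's public flow is unchanged apart from the relocated error class; a minimal usage sketch along the lines of the gem's README (`WordCounter`, `partition_urls`, and the `download` helper are hypothetical):

```ruby
reducer = MapReduce::Reducer.new(WordCounter.new)

# Download this partition's chunk files into paths handed out by the reducer.
partition_urls.each do |url|
  chunk_path = reducer.add_chunk
  download(url, chunk_path) # hypothetical download helper
end

# K-way-merge and reduce while keeping at most 32 chunk files open.
reducer.reduce(chunk_limit: 32) do |(word, count)|
  puts "#{word}: #{count}"
end
```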
data/lib/map_reduce/version.rb CHANGED
@@ -1,3 +1,3 @@
 module MapReduce
-  VERSION = "2.1.0"
+  VERSION = "3.0.0"
 end
data/lib/map_reduce.rb CHANGED
@@ -13,4 +13,7 @@ require "map_reduce/hash_partitioner"
 require "map_reduce/mapper"
 require "map_reduce/reducer"
 
-module MapReduce; end
+module MapReduce
+  class Error < StandardError; end
+  class InvalidChunkLimit < Error; end
+end
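
Since `InvalidChunkLimit` now inherits from the new `MapReduce::Error` base class, callers can use a single rescue for all library errors; a short sketch (the `mapper` variable is assumed from the README example, and `chunk_limit: 1` is deliberately invalid):

```ruby
begin
  mapper.shuffle(chunk_limit: 1) { |partitions| }
rescue MapReduce::Error => e
  # MapReduce::InvalidChunkLimit < MapReduce::Error, so it is caught here.
  warn e.message # => "Chunk limit must be >= 2"
end
```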
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: map-reduce-ruby
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 3.0.0
 platform: ruby
 authors:
 - Benjamin Vetter
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-10-24 00:00:00.000000000 Z
+date: 2022-11-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec