map-reduce-ruby 2.1.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5545309a188291db41e8f5fc24af45a8d983c5084f2233d735cab309921c928c
4
- data.tar.gz: 779a839704ace3780a304bc7295c8b1f27e834253ee0544e9ff6ae21eda93753
3
+ metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
4
+ data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
5
5
  SHA512:
6
- metadata.gz: eb577347b7e5c09dd34166e814fc0c50180b6f036fad84de3857dc93d28242be81637d5b0c7f19ea7a846c659eca0c12fcbae01cdac37c4f2bf50c6d9f8f27f6
7
- data.tar.gz: 704b5d6a140583099c53902ceaab5af7b45c41c004f159fc215a942a4749063bf0990c2d60cde15af9663da726bab03a1258e085c0e789470150ab96caf895f7
6
+ metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
7
+ data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
data/.rubocop.yml CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
45
45
  EnforcedStyle: double_quotes
46
46
 
47
47
  Layout/LineLength:
48
- Max: 120
48
+ Max: 250
49
49
 
50
50
  Style/FrozenStringLiteralComment:
51
51
  EnforcedStyle: never
@@ -55,3 +55,6 @@ Style/ObjectThen:
55
55
 
56
56
  Gemspec/RequireMFA:
57
57
  Enabled: false
58
+
59
+ Style/HashTransformValues:
60
+ Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,20 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v3.0.0
4
+
5
+ * [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
6
+ pairs, which e.g. allows uploading the files in parallel
7
+ * [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
8
+ allows further limiting the maximum number of open file descriptors
9
+ * [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
10
+ no block is given
11
+ * [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
12
+ `MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error` being the
13
+ base class for all errors
14
+ * `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
15
+ it writes them one after another to further strictly reduce the number of
16
+ open file descriptors.
17
+
3
18
  ## v2.1.1
4
19
 
5
20
  * Fix in `MapReduce::Mapper` when no `reduce` implementation is given
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- map-reduce-ruby (2.1.1)
4
+ map-reduce-ruby (3.0.0)
5
5
  json
6
6
  lazy_priority_queue
7
7
 
data/README.md CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
57
57
  mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
58
58
  mapper.map(url)
59
59
 
60
- mapper.shuffle do |partition, tempfile|
61
- # store content of tempfile e.g. on s3:
62
- bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: tempfile)
60
+ mapper.shuffle(chunk_limit: 64) do |partitions|
61
+ partitions.each do |partition, path|
62
+ # store content of the tempfile located at path e.g. on s3:
63
+ bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
64
+ end
63
65
  end
64
66
  end
65
67
  end
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
205
207
  MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
206
208
  ```
207
209
 
210
+ ## Semantic Versioning
211
+
212
+ MapReduce is using Semantic Versioning: [SemVer](http://semver.org/)
213
+
208
214
  ## Development
209
215
 
210
216
  After checking out the repo, run `bin/setup` to install dependencies. Then, run
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
224
230
 
225
231
  ## License
226
232
 
227
- The gem is available as open source under the terms of the [MIT
228
- License](https://opensource.org/licenses/MIT).
233
+ The gem is available as open source under the terms of the
234
+ [MIT License](https://opensource.org/licenses/MIT).
@@ -6,8 +6,6 @@ module MapReduce
6
6
  include Reduceable
7
7
  include MonitorMixin
8
8
 
9
- attr_reader :partitions
10
-
11
9
  # Initializes a new mapper.
12
10
  #
13
11
  # @param implementation Your map-reduce implementation, i.e. an object
@@ -45,9 +43,11 @@ module MapReduce
45
43
  def map(*args, **kwargs)
46
44
  @implementation.map(*args, **kwargs) do |new_key, new_value|
47
45
  synchronize do
48
- @buffer.push([new_key, new_value])
46
+ partition = @partitioner.call(new_key)
47
+ item = [[partition, new_key], new_value]
49
48
 
50
- @buffer_size += JSON.generate([new_key, new_value]).bytesize
49
+ @buffer.push(item)
50
+ @buffer_size += JSON.generate(item).bytesize
51
51
 
52
52
  write_chunk if @buffer_size >= @memory_limit
53
53
  end
@@ -55,62 +55,86 @@ module MapReduce
55
55
  end
56
56
 
57
57
  # Performs a k-way-merge of the sorted chunks written to tempfiles while
58
- # already reducing the result using your map-reduce implementation and
59
- # splitting the dataset into partitions. Finally yields each partition with
60
- # the tempfile containing the data of the partition.
58
+ # already reducing the result using your map-reduce implementation (if
59
+ # available) and splitting the dataset into partitions. Finally yields a
60
+ # hash of (partition, path) pairs containing the data for the partitions
61
+ # in tempfiles.
62
+ #
63
+ # @param chunk_limit [Integer] The maximum number of files to process
64
+ # at the same time. Most useful when you run on a system where the
65
+ # number of open file descriptors is limited. If your number of file
66
+ # descriptors is unlimited, you want to set it to a higher number to
67
+ # avoid the overhead of multiple runs.
61
68
  #
62
69
  # @example
63
- # mapper.shuffle do |partition, tempfile|
64
- # # store data e.g. on s3
70
+ # mapper.shuffle do |partitions|
71
+ # partitions.each do |partition, path|
72
+ # # store data e.g. on s3
73
+ # end
65
74
  # end
66
75
 
67
- def shuffle(&block)
68
- return enum_for(:shuffle) unless block_given?
76
+ def shuffle(chunk_limit:)
77
+ raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
69
78
 
70
- write_chunk if @buffer_size > 0
79
+ begin
80
+ write_chunk if @buffer_size > 0
71
81
 
72
- partitions = {}
82
+ chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
83
+ chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
73
84
 
74
- chunk = k_way_merge(@chunks)
75
- chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
85
+ partitions = split_chunk(chunk)
76
86
 
77
- chunk.each do |pair|
78
- partition = @partitioner.call(pair[0])
87
+ yield(partitions.transform_values(&:path))
88
+ ensure
89
+ partitions.each_value(&:delete)
79
90
 
80
- (partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
91
+ @chunks.each(&:delete)
92
+ @chunks = []
81
93
  end
82
94
 
83
- @chunks.each { |tempfile| tempfile.close(true) }
84
- @chunks = []
95
+ nil
96
+ end
97
+
98
+ private
85
99
 
86
- partitions.each_value(&:rewind)
100
+ def split_chunk(chunk)
101
+ res = {}
102
+ current_partition = nil
103
+ file = nil
87
104
 
88
- partitions.each do |partition, tempfile|
89
- block.call(partition, tempfile)
105
+ chunk.each do |((new_partition, key), value)|
106
+ if new_partition != current_partition
107
+ file&.close
108
+
109
+ current_partition = new_partition
110
+ temp_path = TempPath.new
111
+ res[new_partition] = temp_path
112
+ file = File.open(temp_path.path, "w+")
113
+ end
114
+
115
+ file.puts(JSON.generate([key, value]))
90
116
  end
91
117
 
92
- partitions.each_value { |tempfile| tempfile.close(true) }
118
+ file&.close
93
119
 
94
- nil
120
+ res
95
121
  end
96
122
 
97
- private
98
-
99
123
  def write_chunk
100
- tempfile = Tempfile.new
124
+ temp_path = TempPath.new
101
125
 
102
126
  @buffer.sort_by!(&:first)
103
127
 
104
128
  chunk = @buffer
105
129
  chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
106
130
 
107
- chunk.each do |pair|
108
- tempfile.puts JSON.generate(pair)
131
+ File.open(temp_path.path, "w+") do |file|
132
+ chunk.each do |pair|
133
+ file.puts JSON.generate(pair)
134
+ end
109
135
  end
110
136
 
111
- tempfile.rewind
112
-
113
- @chunks.push(tempfile)
137
+ @chunks.push(temp_path)
114
138
 
115
139
  @buffer_size = 0
116
140
  @buffer = []
@@ -5,20 +5,62 @@ module MapReduce
5
5
  module Mergeable
6
6
  private
7
7
 
8
- # Performs the k-way-merge of the passed files using a priority queue using
9
- # a binomial heap. The content of the passed files needs to be sorted. It
10
- # starts by reading one item of each file and adding it to the priority
11
- # queue. Afterwards, it continously pops an item from the queue, yields it
12
- # and reads a new item from the file the popped item belongs to, adding the
13
- # read item to the queue. This continues up until all items from the files
14
- # have been read. This guarantees that the yielded key-value pairs are
15
- # sorted without having all items in-memory.
8
+ # Performs the k-way-merge of the passed files referenced by the temp paths
9
+ # using a priority queue based on a binomial heap. The content of the passed
10
+ # files needs to be sorted. It starts by reading one item of each file and
11
+ # adding it to the priority queue. Afterwards, it continuously pops an item
12
+ # from the queue, yields it and reads a new item from the file the popped
13
+ # item belongs to, adding the read item to the queue. This continues up
14
+ # until all items from the files have been read. This guarantees that the
15
+ # yielded key-value pairs are sorted without having all items in-memory.
16
16
  #
17
- # @param files [IO, Tempfile] The files to run the k-way-merge for. The
18
- # content of the files must be sorted.
17
+ # @param temp_paths [TempPath] The files referenced by the temp paths to
18
+ # run the k-way-merge for. The content of the files must be sorted.
19
+ # @param chunk_limit [Integer] The maximum number of files to process
20
+ # at the same time. Most useful when you run on a system where the
21
+ # number of open file descriptors is limited. If your number of file
22
+ # descriptors is unlimited, you want to set it to a higher number to
23
+ # avoid the overhead of multiple runs.
19
24
 
20
- def k_way_merge(files)
21
- return enum_for(:k_way_merge, files) unless block_given?
25
+ def k_way_merge(temp_paths, chunk_limit:, &block)
26
+ return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
27
+
28
+ dupped_temp_paths = temp_paths.dup
29
+ additional_temp_paths = []
30
+
31
+ while dupped_temp_paths.size > chunk_limit
32
+ temp_path_out = TempPath.new
33
+
34
+ File.open(temp_path_out.path, "w+") do |file|
35
+ files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
36
+
37
+ k_way_merge!(files) do |pair|
38
+ file.puts(JSON.generate(pair))
39
+ end
40
+
41
+ files.each(&:close)
42
+ end
43
+
44
+ dupped_temp_paths.push(temp_path_out)
45
+ additional_temp_paths.push(temp_path_out)
46
+ end
47
+
48
+ files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
49
+ k_way_merge!(files, &block)
50
+ files.each(&:close)
51
+
52
+ nil
53
+ ensure
54
+ additional_temp_paths&.each(&:delete)
55
+ end
56
+
57
+ # Performs the actual k-way-merge of the specified files.
58
+ #
59
+ # @param files [IO, Tempfile] The files to run the k-way-merge for.
60
+ # The content of the files must be sorted.
61
+
62
+ def k_way_merge!(files)
63
+ return enum_for(__method__, files) unless block_given?
22
64
 
23
65
  if files.size == 1
24
66
  files.first.each_line do |line|
@@ -6,8 +6,6 @@ module MapReduce
6
6
  include Reduceable
7
7
  include MonitorMixin
8
8
 
9
- class InvalidChunkLimit < StandardError; end
10
-
11
9
  # Initializes a new reducer.
12
10
  #
13
11
  # @param implementation Your map-reduce implementation, i.e. an object
@@ -70,38 +68,36 @@ module MapReduce
70
68
  # end
71
69
 
72
70
  def reduce(chunk_limit:, &block)
73
- return enum_for(:reduce, chunk_limit: chunk_limit) unless block_given?
71
+ return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
74
72
 
75
73
  raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
76
74
 
77
75
  begin
78
76
  loop do
79
77
  slice = @temp_paths.shift(chunk_limit)
80
- files = slice.select { |temp_path| File.exist?(temp_path.path) }
81
- .map { |temp_path| File.open(temp_path.path, "r") }
82
-
83
- begin
84
- if @temp_paths.empty?
85
- reduce_chunk(k_way_merge(files), @implementation).each do |pair|
86
- block.call(pair)
87
- end
88
78
 
89
- return
79
+ if @temp_paths.empty?
80
+ reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
81
+ block.call(pair)
90
82
  end
91
83
 
92
- File.open(add_chunk, "w") do |file|
93
- reduce_chunk(k_way_merge(files), @implementation).each do |pair|
94
- file.puts JSON.generate(pair)
95
- end
84
+ return
85
+ end
86
+
87
+ File.open(add_chunk, "w+") do |file|
88
+ reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
89
+ file.puts JSON.generate(pair)
96
90
  end
97
- ensure
98
- files.each(&:close)
99
- slice.each(&:delete)
100
91
  end
92
+ ensure
93
+ slice&.each(&:delete)
101
94
  end
102
95
  ensure
103
96
  @temp_paths.each(&:delete)
97
+ @temp_paths = []
104
98
  end
99
+
100
+ nil
105
101
  end
106
102
  end
107
103
  end
@@ -1,3 +1,3 @@
1
1
  module MapReduce
2
- VERSION = "2.1.1"
2
+ VERSION = "3.0.0"
3
3
  end
data/lib/map_reduce.rb CHANGED
@@ -13,4 +13,7 @@ require "map_reduce/hash_partitioner"
13
13
  require "map_reduce/mapper"
14
14
  require "map_reduce/reducer"
15
15
 
16
- module MapReduce; end
16
+ module MapReduce
17
+ class Error < StandardError; end
18
+ class InvalidChunkLimit < Error; end
19
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: map-reduce-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Vetter
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-24 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec