jrf 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jrf/cli/runner.rb +82 -15
- data/lib/jrf/cli.rb +1 -1
- data/lib/jrf/reducers.rb +29 -0
- data/lib/jrf/row_context.rb +21 -8
- data/lib/jrf/stage.rb +32 -2
- data/lib/jrf/version.rb +1 -1
- data/test/cli_parallel_test.rb +205 -6
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de85d7a03d58baee4c931d10869a824a1ff5c2eec121cd15e63ec23805203676
|
|
4
|
+
data.tar.gz: ce3c53475e13d41e3a176ef7c9ea840145fbbf826612457386cc0899c28a1af0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ded54cff09febe7fe02c585f30a702cd82cd11aeb563f840b9f182cd8a6e94c090ba5e71fbd6cad7f377816c14de41c12f5fed449b2b8f7c1d682513db2f19ee
|
|
7
|
+
data.tar.gz: bdd4f9ee2ff809cc718b497a39783027f9c7322582dfdaebc8cae8b6bf4cb1d56b9639a356f2239be66b6ee39693c1da994b4fa9b9db5ecfb6c10d66ede021d9
|
data/lib/jrf/cli/runner.rb
CHANGED
|
@@ -21,7 +21,7 @@ module Jrf
|
|
|
21
21
|
chunk = @input.read(length)
|
|
22
22
|
return nil if chunk.nil?
|
|
23
23
|
|
|
24
|
-
chunk
|
|
24
|
+
chunk.tr!(RS_CHAR, "\n")
|
|
25
25
|
if outbuf
|
|
26
26
|
outbuf.replace(chunk)
|
|
27
27
|
else
|
|
@@ -72,7 +72,7 @@ module Jrf
|
|
|
72
72
|
|
|
73
73
|
def compact!
|
|
74
74
|
if @offset > 0
|
|
75
|
-
@buf
|
|
75
|
+
@buf.slice!(0, @offset)
|
|
76
76
|
@offset = 0
|
|
77
77
|
end
|
|
78
78
|
end
|
|
@@ -140,13 +140,30 @@ module Jrf
|
|
|
140
140
|
|
|
141
141
|
def process_values(blocks, parallel:, verbose:, &block)
|
|
142
142
|
if parallel <= 1 || @file_paths.length <= 1
|
|
143
|
+
# Single file or no parallelism requested — serial is the only option.
|
|
144
|
+
# This also covers the case where no file paths were given: no files means no workers to spawn.
|
|
143
145
|
dump_parallel_status("disabled", verbose: verbose)
|
|
144
146
|
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
145
147
|
end
|
|
146
148
|
|
|
147
|
-
|
|
148
|
-
split_index
|
|
149
|
-
|
|
149
|
+
split_index, probe_stage = classify_parallel_stages(blocks)
|
|
150
|
+
if split_index.nil?
|
|
151
|
+
dump_parallel_status("disabled", verbose: verbose)
|
|
152
|
+
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# If the first reducer stage is decomposable, workers run everything up to
|
|
156
|
+
# and including it (map prefix + reducer), emit partial accumulators, and the
|
|
157
|
+
# parent merges. This covers both pure reducers (split_index == 0, e.g. `sum(_)`)
|
|
158
|
+
# and map-then-reduce (split_index > 0, e.g. `select(...) >> sum(...)`).
|
|
159
|
+
if probe_stage&.decomposable?
|
|
160
|
+
worker_blocks = blocks[0..split_index]
|
|
161
|
+
rest_blocks = blocks[(split_index + 1)..]
|
|
162
|
+
return process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage,
|
|
163
|
+
parallel: parallel, verbose: verbose, &block)
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
if split_index == 0
|
|
150
167
|
dump_parallel_status("disabled", verbose: verbose)
|
|
151
168
|
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
152
169
|
end
|
|
@@ -162,6 +179,9 @@ module Jrf
|
|
|
162
179
|
@err.puts "parallel: #{status}" if verbose
|
|
163
180
|
end
|
|
164
181
|
|
|
182
|
+
# Returns [split_index, probe_stage] where split_index is the index of the
|
|
183
|
+
# first reducer stage (or blocks.length if all are passthrough), and probe_stage
|
|
184
|
+
# is the Stage object of that first reducer (nil if all passthrough). Both are nil when there is no input.
|
|
165
185
|
def classify_parallel_stages(blocks)
|
|
166
186
|
# Read the first row from the first file to probe stage modes
|
|
167
187
|
first_value = nil
|
|
@@ -171,24 +191,63 @@ module Jrf
|
|
|
171
191
|
break
|
|
172
192
|
end
|
|
173
193
|
end
|
|
174
|
-
return nil if first_value.nil?
|
|
194
|
+
return [nil, nil] if first_value.nil?
|
|
175
195
|
|
|
176
196
|
# Run the value through each stage independently to classify
|
|
177
197
|
split_index = nil
|
|
198
|
+
probe_stage = nil
|
|
178
199
|
blocks.each_with_index do |block, i|
|
|
179
200
|
probe_pipeline = Pipeline.new(block)
|
|
180
201
|
probe_pipeline.call([first_value]) { |_| }
|
|
181
202
|
stage = probe_pipeline.instance_variable_get(:@stages).first
|
|
182
203
|
if stage.instance_variable_get(:@mode) == :reducer
|
|
183
204
|
split_index = i
|
|
205
|
+
probe_stage = stage
|
|
184
206
|
break
|
|
185
207
|
end
|
|
186
208
|
end
|
|
187
209
|
|
|
188
|
-
split_index || blocks.length
|
|
210
|
+
[split_index || blocks.length, probe_stage]
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
def process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage, parallel:, verbose:, &block)
|
|
214
|
+
dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} decompose=#{worker_blocks.length}/#{worker_blocks.length + rest_blocks.length}", verbose: verbose)
|
|
215
|
+
|
|
216
|
+
# Workers run map prefix + reducer stage per file and emit partial accumulators.
|
|
217
|
+
partials_list = []
|
|
218
|
+
reducer_stage_index = worker_blocks.length - 1
|
|
219
|
+
spawner = ->(path) do
|
|
220
|
+
spawn_worker(worker_blocks, path) do |pipeline, input|
|
|
221
|
+
pipeline.call(input) { |_| }
|
|
222
|
+
# If the file was empty, the stage was never initialized (no reducers),
|
|
223
|
+
# so skip emitting — the parent will simply not receive a partial for this worker.
|
|
224
|
+
stage = pipeline.instance_variable_get(:@stages)[reducer_stage_index]
|
|
225
|
+
partials = stage.partial_accumulators
|
|
226
|
+
emit_parallel_frame(partials) unless partials.empty?
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
children = run_parallel_worker_pool(parallel, spawner) { |v| partials_list << v }
|
|
230
|
+
wait_for_parallel_children(children) if children
|
|
231
|
+
return if partials_list.empty?
|
|
232
|
+
|
|
233
|
+
# Reuse the probe stage (already initialized with reducer structure from classify).
|
|
234
|
+
# Replace its accumulators with the first worker's partials, then merge the rest.
|
|
235
|
+
probe_stage.replace_accumulators!(partials_list.first)
|
|
236
|
+
partials_list.drop(1).each { |partials| probe_stage.merge_partials!(partials) }
|
|
237
|
+
|
|
238
|
+
# Finish the reducer stage and pass results through any remaining stages.
|
|
239
|
+
results = probe_stage.finish
|
|
240
|
+
if rest_blocks.empty?
|
|
241
|
+
results.each(&block)
|
|
242
|
+
else
|
|
243
|
+
apply_pipeline(rest_blocks, results.each).each(&block)
|
|
244
|
+
end
|
|
189
245
|
end
|
|
190
246
|
|
|
191
|
-
|
|
247
|
+
# Forks a worker process that reads `path`, builds a pipeline from `blocks`,
|
|
248
|
+
# and yields [pipeline, input_enum] to the caller's block for custom behavior.
|
|
249
|
+
# Returns [read_io, pid].
|
|
250
|
+
def spawn_worker(blocks, path)
|
|
192
251
|
read_io, write_io = IO.pipe
|
|
193
252
|
pid = fork do
|
|
194
253
|
read_io.close
|
|
@@ -200,7 +259,7 @@ module Jrf
|
|
|
200
259
|
end
|
|
201
260
|
worker_failed = false
|
|
202
261
|
begin
|
|
203
|
-
pipeline
|
|
262
|
+
yield pipeline, input_enum
|
|
204
263
|
rescue => e
|
|
205
264
|
@err.puts "#{path}: #{e.message} (#{e.class})"
|
|
206
265
|
worker_failed = true
|
|
@@ -213,14 +272,17 @@ module Jrf
|
|
|
213
272
|
[read_io, pid]
|
|
214
273
|
end
|
|
215
274
|
|
|
216
|
-
|
|
275
|
+
# Runs a pool of up to `num_workers` concurrent workers across all input files.
|
|
276
|
+
# `spawner` is called with a file path and must return [read_io, pid].
|
|
277
|
+
# Yields each decoded JSON value from worker output frames.
|
|
278
|
+
def run_parallel_worker_pool(num_workers, spawner)
|
|
217
279
|
file_queue = @file_paths.dup
|
|
218
280
|
workers = {} # read_io => [reader, pid]
|
|
219
281
|
children = []
|
|
220
282
|
|
|
221
283
|
# Fill initial pool
|
|
222
284
|
while workers.size < num_workers && !file_queue.empty?
|
|
223
|
-
read_io, pid =
|
|
285
|
+
read_io, pid = spawner.call(file_queue.shift)
|
|
224
286
|
workers[read_io] = [ParallelFrameReader.new, pid]
|
|
225
287
|
children << pid
|
|
226
288
|
end
|
|
@@ -242,7 +304,7 @@ module Jrf
|
|
|
242
304
|
|
|
243
305
|
# Spawn next worker if files remain
|
|
244
306
|
unless file_queue.empty?
|
|
245
|
-
read_io, pid =
|
|
307
|
+
read_io, pid = spawner.call(file_queue.shift)
|
|
246
308
|
workers[read_io] = [ParallelFrameReader.new, pid]
|
|
247
309
|
children << pid
|
|
248
310
|
read_ios << read_io
|
|
@@ -261,8 +323,13 @@ module Jrf
|
|
|
261
323
|
|
|
262
324
|
def parallel_map_enum(map_blocks, num_workers)
|
|
263
325
|
children = nil
|
|
326
|
+
spawner = ->(path) do
|
|
327
|
+
spawn_worker(map_blocks, path) do |pipeline, input|
|
|
328
|
+
pipeline.call(input) { |value| emit_parallel_frame(value) }
|
|
329
|
+
end
|
|
330
|
+
end
|
|
264
331
|
Enumerator.new do |y|
|
|
265
|
-
children = run_parallel_worker_pool(
|
|
332
|
+
children = run_parallel_worker_pool(num_workers, spawner) { |value| y << value }
|
|
266
333
|
ensure
|
|
267
334
|
wait_for_parallel_children(children) if children
|
|
268
335
|
end
|
|
@@ -291,8 +358,8 @@ module Jrf
|
|
|
291
358
|
def each_stream_value(stream)
|
|
292
359
|
return each_stream_value_lax(stream) { |value| yield value } if @lax
|
|
293
360
|
|
|
294
|
-
stream.each_line do |
|
|
295
|
-
line
|
|
361
|
+
stream.each_line do |line|
|
|
362
|
+
line.strip!
|
|
296
363
|
next if line.empty?
|
|
297
364
|
yield JSON.parse(line)
|
|
298
365
|
end
|
data/lib/jrf/cli.rb
CHANGED
|
@@ -18,7 +18,7 @@ module Jrf
|
|
|
18
18
|
--lax allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
|
|
19
19
|
-o, --output FORMAT
|
|
20
20
|
output format: json (default), pretty, tsv
|
|
21
|
-
-P N opportunistically parallelize
|
|
21
|
+
-P N opportunistically parallelize across N workers
|
|
22
22
|
-r, --require LIBRARY
|
|
23
23
|
require LIBRARY before evaluating stages
|
|
24
24
|
--no-jit do not enable YJIT, even when supported by the Ruby runtime
|
data/lib/jrf/reducers.rb
CHANGED
|
@@ -20,8 +20,37 @@ module Jrf
|
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
+
# A reducer whose partial accumulators can be merged across parallel workers.
|
|
24
|
+
#
|
|
25
|
+
# Contract:
|
|
26
|
+
# - `identity` is the neutral element for `merge_fn`: merge(identity, x) == x
|
|
27
|
+
# - `initial` is always set to `identity` (the accumulator starts from the neutral element)
|
|
28
|
+
# - Any bias (e.g. sum's `initial:` keyword) is applied in `finish_fn`, not in the starting accumulator
|
|
29
|
+
class DecomposableReduce < Reduce
|
|
30
|
+
attr_reader :merge_fn
|
|
31
|
+
|
|
32
|
+
def initialize(identity, merge:, finish_fn: nil, &step_fn)
|
|
33
|
+
super(identity, finish_fn: finish_fn, &step_fn)
|
|
34
|
+
@merge_fn = merge
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Returns the raw accumulator without applying finish_fn.
|
|
38
|
+
def partial
|
|
39
|
+
@acc
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Merges another partial accumulator into this one.
|
|
43
|
+
def merge_partial(other_acc)
|
|
44
|
+
@acc = @merge_fn.call(@acc, other_acc)
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
23
48
|
def reduce(initial, finish: nil, &step_fn)
|
|
24
49
|
Reduce.new(initial, finish_fn: finish, &step_fn)
|
|
25
50
|
end
|
|
51
|
+
|
|
52
|
+
def decomposable_reduce(identity, merge:, finish: nil, &step_fn)
|
|
53
|
+
DecomposableReduce.new(identity, merge: merge, finish_fn: finish, &step_fn)
|
|
54
|
+
end
|
|
26
55
|
end
|
|
27
56
|
end
|
data/lib/jrf/row_context.rb
CHANGED
|
@@ -17,6 +17,7 @@ module Jrf
|
|
|
17
17
|
spec.fetch(:value),
|
|
18
18
|
initial: reducer_initial_value(spec.fetch(:initial)),
|
|
19
19
|
finish: spec[:finish],
|
|
20
|
+
merge: spec[:merge],
|
|
20
21
|
&spec.fetch(:step)
|
|
21
22
|
)
|
|
22
23
|
end
|
|
@@ -48,27 +49,38 @@ module Jrf
|
|
|
48
49
|
end
|
|
49
50
|
|
|
50
51
|
define_reducer(:sum) do |_ctx, value, initial: 0, block: nil|
|
|
51
|
-
|
|
52
|
+
step = ->(acc, v) { v.nil? ? acc : (acc + v) }
|
|
53
|
+
if initial.is_a?(Numeric)
|
|
54
|
+
# Numeric — decomposable. Bias applied once in finish.
|
|
55
|
+
finish = initial == 0 ? nil : ->(acc) { [acc + initial] }
|
|
56
|
+
{ value: value, initial: 0, step: step, finish: finish, merge: ->(a, b) { a + b } }
|
|
57
|
+
else
|
|
58
|
+
# Non-numeric (e.g. string concat) — not decomposable.
|
|
59
|
+
{ value: value, initial: initial, step: step }
|
|
60
|
+
end
|
|
52
61
|
end
|
|
53
62
|
|
|
54
63
|
define_reducer(:count) do |_ctx, value = MISSING, block: nil|
|
|
64
|
+
merge = ->(a, b) { a + b }
|
|
55
65
|
if value.equal?(MISSING)
|
|
56
|
-
{ value: nil, initial: 0, step: ->(acc, _v) { acc + 1 } }
|
|
66
|
+
{ value: nil, initial: 0, step: ->(acc, _v) { acc + 1 }, merge: merge }
|
|
57
67
|
else
|
|
58
|
-
{ value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) } }
|
|
68
|
+
{ value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) }, merge: merge }
|
|
59
69
|
end
|
|
60
70
|
end
|
|
61
71
|
|
|
62
72
|
define_reducer(:count_if) do |_ctx, condition, block: nil|
|
|
63
|
-
{ value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc } }
|
|
73
|
+
{ value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc }, merge: ->(a, b) { a + b } }
|
|
64
74
|
end
|
|
65
75
|
|
|
66
76
|
define_reducer(:min) do |_ctx, value, block: nil|
|
|
67
|
-
|
|
77
|
+
min_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a < b ? a : b) }
|
|
78
|
+
{ value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v < acc ? v : acc) }, merge: min_merge }
|
|
68
79
|
end
|
|
69
80
|
|
|
70
81
|
define_reducer(:max) do |_ctx, value, block: nil|
|
|
71
|
-
|
|
82
|
+
max_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a > b ? a : b) }
|
|
83
|
+
{ value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v > acc ? v : acc) }, merge: max_merge }
|
|
72
84
|
end
|
|
73
85
|
|
|
74
86
|
define_reducer(:average) do |_ctx, value, block: nil|
|
|
@@ -82,7 +94,8 @@ module Jrf
|
|
|
82
94
|
acc[0] += v
|
|
83
95
|
acc[1] += 1
|
|
84
96
|
acc
|
|
85
|
-
}
|
|
97
|
+
},
|
|
98
|
+
merge: ->(a, b) { [a[0] + b[0], a[1] + b[1]] }
|
|
86
99
|
}
|
|
87
100
|
end
|
|
88
101
|
|
|
@@ -136,7 +149,7 @@ module Jrf
|
|
|
136
149
|
|
|
137
150
|
define_reducer(:group) do |ctx, value = MISSING, block: nil|
|
|
138
151
|
resolved_value = value.equal?(MISSING) ? ctx.send(:current_input) : value
|
|
139
|
-
{ value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v } }
|
|
152
|
+
{ value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v }, merge: ->(a, b) { a + b } }
|
|
140
153
|
end
|
|
141
154
|
|
|
142
155
|
define_reducer(:percentile) do |ctx, value, percentage, block: nil|
|
data/lib/jrf/stage.rb
CHANGED
|
@@ -51,13 +51,17 @@ module Jrf
|
|
|
51
51
|
(@mode == :reducer) ? Control::DROPPED : result
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
-
def step_reduce(value, initial:, finish: nil, step_fn: nil, &step_block)
|
|
54
|
+
def step_reduce(value, initial:, finish: nil, merge: nil, step_fn: nil, &step_block)
|
|
55
55
|
idx = @cursor
|
|
56
56
|
step_fn ||= step_block
|
|
57
57
|
|
|
58
58
|
if @reducers[idx].nil?
|
|
59
59
|
finish_rows = finish || ->(acc) { [acc] }
|
|
60
|
-
@reducers[idx] =
|
|
60
|
+
@reducers[idx] = if merge
|
|
61
|
+
Reducers.decomposable_reduce(initial, merge: merge, finish: finish_rows, &step_fn)
|
|
62
|
+
else
|
|
63
|
+
Reducers.reduce(initial, finish: finish_rows, &step_fn)
|
|
64
|
+
end
|
|
61
65
|
result = ReducerToken.new(idx)
|
|
62
66
|
else
|
|
63
67
|
result = Control::DROPPED
|
|
@@ -167,6 +171,32 @@ module Jrf
|
|
|
167
171
|
end
|
|
168
172
|
end
|
|
169
173
|
|
|
174
|
+
# Returns true if all reducers in this stage are DecomposableReduce instances,
|
|
175
|
+
# meaning partial accumulators from parallel workers can be merged.
|
|
176
|
+
def decomposable?
|
|
177
|
+
@mode == :reducer && @reducers.any? &&
|
|
178
|
+
@reducers.all? { |r| r.is_a?(Reducers::DecomposableReduce) }
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Returns an array of raw accumulator values, one per reducer.
|
|
182
|
+
def partial_accumulators
|
|
183
|
+
@reducers.map(&:partial)
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
# Replaces all reducer accumulators with the given values.
|
|
187
|
+
def replace_accumulators!(partials)
|
|
188
|
+
@reducers.each_with_index do |reducer, i|
|
|
189
|
+
reducer.instance_variable_set(:@acc, partials[i])
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# Merges an array of partial accumulators (from another worker) into this stage's reducers.
|
|
194
|
+
def merge_partials!(other_partials)
|
|
195
|
+
@reducers.each_with_index do |reducer, i|
|
|
196
|
+
reducer.merge_partial(other_partials[i])
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
170
200
|
private
|
|
171
201
|
|
|
172
202
|
def with_scoped_reducers(reducer_list)
|
data/lib/jrf/version.rb
CHANGED
data/test/cli_parallel_test.rb
CHANGED
|
@@ -53,8 +53,9 @@ class CliParallelTest < JrfTestCase
|
|
|
53
53
|
write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
|
|
54
54
|
write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
|
|
55
55
|
|
|
56
|
-
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
|
|
56
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
|
|
57
57
|
assert_success(status, stderr, "parallel split map+reduce")
|
|
58
|
+
assert_includes(stderr, "decompose=2/2", "select+sum decomposed")
|
|
58
59
|
assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
|
|
59
60
|
end
|
|
60
61
|
end
|
|
@@ -72,15 +73,16 @@ class CliParallelTest < JrfTestCase
|
|
|
72
73
|
end
|
|
73
74
|
end
|
|
74
75
|
|
|
75
|
-
def
|
|
76
|
+
def test_parallel_decomposable_reducer
|
|
76
77
|
Dir.mktmpdir do |dir|
|
|
77
78
|
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
78
79
|
write_ndjson(dir, "b.ndjson", [{"x" => 3}])
|
|
79
80
|
|
|
80
81
|
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
|
|
81
|
-
assert_success(status, stderr, "
|
|
82
|
-
assert_equal(%w[6], lines(stdout), "
|
|
83
|
-
assert_includes(stderr, "parallel:
|
|
82
|
+
assert_success(status, stderr, "parallel decomposable reducer")
|
|
83
|
+
assert_equal(%w[6], lines(stdout), "parallel decomposable reducer output")
|
|
84
|
+
assert_includes(stderr, "parallel: enabled", "parallel enabled for decomposable reducer")
|
|
85
|
+
assert_includes(stderr, "decompose=", "decompose mode indicated")
|
|
84
86
|
end
|
|
85
87
|
end
|
|
86
88
|
|
|
@@ -153,12 +155,209 @@ class CliParallelTest < JrfTestCase
|
|
|
153
155
|
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
|
|
154
156
|
write_ndjson(dir, "b.ndjson", [{"x" => 40}, {"x" => 5}])
|
|
155
157
|
|
|
156
|
-
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
|
|
158
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
|
|
157
159
|
assert_success(status, stderr, "parallel select then sum")
|
|
160
|
+
assert_includes(stderr, "decompose=2/2", "select+sum fully decomposed in workers")
|
|
158
161
|
assert_equal(%w[60], lines(stdout), "parallel select then sum output")
|
|
159
162
|
end
|
|
160
163
|
end
|
|
161
164
|
|
|
165
|
+
def test_parallel_decomposable_multi_reducer
|
|
166
|
+
Dir.mktmpdir do |dir|
|
|
167
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
168
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
|
|
169
|
+
|
|
170
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"]), mx: max(_["x"])}', *ndjson_files(dir))
|
|
171
|
+
assert_success(status, stderr, "parallel multi reducer")
|
|
172
|
+
assert_includes(stderr, "decompose=", "multi reducer decomposed")
|
|
173
|
+
result = JSON.parse(lines(stdout).first)
|
|
174
|
+
assert_equal(10, result["s"], "sum")
|
|
175
|
+
assert_equal(4, result["n"], "count")
|
|
176
|
+
assert_equal(1, result["mn"], "min")
|
|
177
|
+
assert_equal(4, result["mx"], "max")
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
def test_parallel_decomposable_average
|
|
182
|
+
Dir.mktmpdir do |dir|
|
|
183
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
|
|
184
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
|
|
185
|
+
|
|
186
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'average(_["x"])', *ndjson_files(dir))
|
|
187
|
+
assert_success(status, stderr, "parallel average")
|
|
188
|
+
assert_includes(stderr, "decompose=", "average decomposed")
|
|
189
|
+
assert_equal(["25.0"], lines(stdout), "parallel average output")
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def test_parallel_decomposable_group
|
|
194
|
+
Dir.mktmpdir do |dir|
|
|
195
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
196
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}])
|
|
197
|
+
|
|
198
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'group(_["x"])', *ndjson_files(dir))
|
|
199
|
+
assert_success(status, stderr, "parallel group")
|
|
200
|
+
assert_includes(stderr, "decompose=", "group decomposed")
|
|
201
|
+
result = JSON.parse(lines(stdout).first)
|
|
202
|
+
assert_equal([1, 2, 3], result.sort, "parallel group output")
|
|
203
|
+
end
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def test_parallel_decomposable_sum_with_initial
|
|
207
|
+
Dir.mktmpdir do |dir|
|
|
208
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
209
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}])
|
|
210
|
+
|
|
211
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: 100)', *ndjson_files(dir))
|
|
212
|
+
assert_success(status, stderr, "sum with numeric initial")
|
|
213
|
+
assert_includes(stderr, "decompose=", "numeric initial decomposes")
|
|
214
|
+
assert_equal(%w[106], lines(stdout), "sum with initial output")
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def test_parallel_sum_with_non_numeric_initial_falls_back
|
|
219
|
+
Dir.mktmpdir do |dir|
|
|
220
|
+
write_ndjson(dir, "a.ndjson", [{"x" => "a"}, {"x" => "b"}])
|
|
221
|
+
write_ndjson(dir, "b.ndjson", [{"x" => "c"}])
|
|
222
|
+
|
|
223
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: "")', *ndjson_files(dir))
|
|
224
|
+
assert_success(status, stderr, "sum with string initial")
|
|
225
|
+
assert_includes(stderr, "parallel: disabled", "non-numeric initial falls back to serial")
|
|
226
|
+
assert_equal(['"abc"'], lines(stdout), "sum with string initial output")
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
def test_sum_with_string_initial
|
|
231
|
+
Dir.mktmpdir do |dir|
|
|
232
|
+
write_ndjson(dir, "a.ndjson", [{"x" => "hello "}, {"x" => "world"}])
|
|
233
|
+
|
|
234
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", 'sum(_["x"], initial: "")', *ndjson_files(dir))
|
|
235
|
+
assert_success(status, stderr, "sum with string initial")
|
|
236
|
+
assert_equal(['"hello world"'], lines(stdout), "sum with string initial output")
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def test_parallel_decomposable_reducer_then_passthrough
|
|
241
|
+
Dir.mktmpdir do |dir|
|
|
242
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
243
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
|
|
244
|
+
|
|
245
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"]) >> _ * 2', *ndjson_files(dir))
|
|
246
|
+
assert_success(status, stderr, "parallel decomposable then passthrough")
|
|
247
|
+
assert_includes(stderr, "decompose=", "reducer then passthrough decomposed")
|
|
248
|
+
assert_equal(%w[20], lines(stdout), "parallel decomposable then passthrough output")
|
|
249
|
+
end
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
def test_parallel_mixed_decomposable_reducers
|
|
253
|
+
Dir.mktmpdir do |dir|
|
|
254
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
|
|
255
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
|
|
256
|
+
|
|
257
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), average(_["x"]), min(_["x"]), max(_["x"]), count()]', *ndjson_files(dir))
|
|
258
|
+
assert_success(status, stderr, "mixed decomposable")
|
|
259
|
+
assert_includes(stderr, "decompose=", "mixed decomposable used decompose")
|
|
260
|
+
result = JSON.parse(lines(stdout).first)
|
|
261
|
+
assert_equal([100, 25.0, 10, 40, 4], result, "mixed decomposable output")
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
def test_parallel_mixed_decomposable_and_non_decomposable_falls_back
|
|
266
|
+
Dir.mktmpdir do |dir|
|
|
267
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
|
|
268
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
|
|
269
|
+
|
|
270
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), percentile(_["x"], 0.5)]', *ndjson_files(dir))
|
|
271
|
+
assert_success(status, stderr, "mixed with non-decomposable")
|
|
272
|
+
assert_includes(stderr, "parallel: disabled", "mixed with non-decomposable falls back to serial")
|
|
273
|
+
result = JSON.parse(lines(stdout).first)
|
|
274
|
+
assert_equal([100, 20], result, "mixed with non-decomposable output")
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
def test_parallel_select_sum_passthrough_decomposes
|
|
279
|
+
Dir.mktmpdir do |dir|
|
|
280
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}])
|
|
281
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 40}])
|
|
282
|
+
|
|
283
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"]) >> _ * 2', *ndjson_files(dir))
|
|
284
|
+
assert_success(status, stderr, "select+sum+passthrough")
|
|
285
|
+
assert_includes(stderr, "decompose=2/3", "select+sum decomposed, passthrough in parent")
|
|
286
|
+
assert_equal(%w[120], lines(stdout), "select+sum+passthrough output")
|
|
287
|
+
end
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
def test_parallel_select_non_decomposable_uses_split
|
|
291
|
+
Dir.mktmpdir do |dir|
|
|
292
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 3}, {"x" => 1}])
|
|
293
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 2}])
|
|
294
|
+
|
|
295
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 0) >> sort(_["x"]) >> _["x"]', *ndjson_files(dir))
|
|
296
|
+
assert_success(status, stderr, "select+sort uses split")
|
|
297
|
+
assert_includes(stderr, "split=1/3", "non-decomposable sort uses map-prefix split")
|
|
298
|
+
assert_equal([1, 2, 3], lines(stdout).map { |l| JSON.parse(l) }, "select+sort output")
|
|
299
|
+
end
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
def test_parallel_decomposable_with_empty_file
|
|
303
|
+
Dir.mktmpdir do |dir|
|
|
304
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
305
|
+
File.write(File.join(dir, "b.ndjson"), "")
|
|
306
|
+
|
|
307
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"])}', *ndjson_files(dir))
|
|
308
|
+
assert_success(status, stderr, "decomposable with empty file")
|
|
309
|
+
assert_includes(stderr, "decompose=", "decomposable with empty file used decompose")
|
|
310
|
+
result = JSON.parse(lines(stdout).first)
|
|
311
|
+
assert_equal(3, result["s"], "sum ignores empty file")
|
|
312
|
+
assert_equal(2, result["n"], "count ignores empty file")
|
|
313
|
+
assert_equal(1, result["mn"], "min ignores empty file")
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
def test_parallel_decomposable_all_files_empty
|
|
318
|
+
Dir.mktmpdir do |dir|
|
|
319
|
+
File.write(File.join(dir, "a.ndjson"), "")
|
|
320
|
+
File.write(File.join(dir, "b.ndjson"), "")
|
|
321
|
+
|
|
322
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
|
|
323
|
+
assert_success(status, stderr, "all files empty")
|
|
324
|
+
# All files empty means first_value is nil, so classify returns [nil, nil] (split_index is nil) → serial fallback
|
|
325
|
+
assert_includes(stderr, "parallel: disabled", "all files empty falls back to serial")
|
|
326
|
+
assert_equal([], lines(stdout), "no output for empty input")
|
|
327
|
+
end
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
def test_parallel_non_decomposable_falls_back_to_serial
|
|
331
|
+
Dir.mktmpdir do |dir|
|
|
332
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
333
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}])
|
|
334
|
+
|
|
335
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sort(_["x"]) >> _["x"]', *ndjson_files(dir))
|
|
336
|
+
assert_success(status, stderr, "non-decomposable serial fallback")
|
|
337
|
+
assert_equal([1, 2, 3], lines(stdout).map { |l| JSON.parse(l) }, "sort output")
|
|
338
|
+
assert_includes(stderr, "parallel: disabled", "non-decomposable falls back to serial")
|
|
339
|
+
end
|
|
340
|
+
end
|
|
341
|
+
|
|
342
|
+
def test_parallel_decomposable_matches_serial
|
|
343
|
+
Dir.mktmpdir do |dir|
|
|
344
|
+
write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i} })
|
|
345
|
+
write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i} })
|
|
346
|
+
|
|
347
|
+
files = ndjson_files(dir)
|
|
348
|
+
expr = '{s: sum(_["v"]), n: count(), mn: min(_["v"]), mx: max(_["v"]), avg: average(_["v"])}'
|
|
349
|
+
|
|
350
|
+
serial_stdout, serial_stderr, serial_status = Open3.capture3("./exe/jrf", expr, *files)
|
|
351
|
+
assert_success(serial_status, serial_stderr, "serial baseline")
|
|
352
|
+
|
|
353
|
+
parallel_stdout, parallel_stderr, parallel_status = Open3.capture3("./exe/jrf", "-v", "-P", "2", expr, *files)
|
|
354
|
+
assert_success(parallel_status, parallel_stderr, "parallel run")
|
|
355
|
+
assert_includes(parallel_stderr, "decompose=", "decomposable matches serial used decompose")
|
|
356
|
+
|
|
357
|
+
assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel decomposable matches serial")
|
|
358
|
+
end
|
|
359
|
+
end
|
|
360
|
+
|
|
162
361
|
def test_serial_error_includes_filename
|
|
163
362
|
Dir.mktmpdir do |dir|
|
|
164
363
|
good_path = File.join(dir, "a.ndjson")
|