jrf 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jrf/cli/runner.rb +353 -36
- data/lib/jrf/cli.rb +12 -23
- data/lib/jrf/reducers.rb +29 -0
- data/lib/jrf/row_context.rb +21 -8
- data/lib/jrf/stage.rb +32 -2
- data/lib/jrf/version.rb +1 -1
- data/test/cli_parallel_test.rb +394 -0
- data/test/cli_runner_test.rb +10 -10
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de85d7a03d58baee4c931d10869a824a1ff5c2eec121cd15e63ec23805203676
|
|
4
|
+
data.tar.gz: ce3c53475e13d41e3a176ef7c9ea840145fbbf826612457386cc0899c28a1af0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ded54cff09febe7fe02c585f30a702cd82cd11aeb563f840b9f182cd8a6e94c090ba5e71fbd6cad7f377816c14de41c12f5fed449b2b8f7c1d682513db2f19ee
|
|
7
|
+
data.tar.gz: bdd4f9ee2ff809cc718b497a39783027f9c7322582dfdaebc8cae8b6bf4cb1d56b9639a356f2239be66b6ee39693c1da994b4fa9b9db5ecfb6c10d66ede021d9
|
data/lib/jrf/cli/runner.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "json"
|
|
4
|
+
require "zlib"
|
|
4
5
|
require_relative "../pipeline"
|
|
5
6
|
require_relative "../pipeline_parser"
|
|
6
7
|
|
|
@@ -9,6 +10,7 @@ module Jrf
|
|
|
9
10
|
class Runner
|
|
10
11
|
RS_CHAR = "\x1e"
|
|
11
12
|
DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
|
|
13
|
+
PARALLEL_FRAME_HEADER_BYTES = 4
|
|
12
14
|
|
|
13
15
|
class RsNormalizer
|
|
14
16
|
def initialize(input)
|
|
@@ -19,7 +21,7 @@ module Jrf
|
|
|
19
21
|
chunk = @input.read(length)
|
|
20
22
|
return nil if chunk.nil?
|
|
21
23
|
|
|
22
|
-
chunk
|
|
24
|
+
chunk.tr!(RS_CHAR, "\n")
|
|
23
25
|
if outbuf
|
|
24
26
|
outbuf.replace(chunk)
|
|
25
27
|
else
|
|
@@ -28,61 +30,360 @@ module Jrf
|
|
|
28
30
|
end
|
|
29
31
|
end
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
+
class ParallelFrameReader
|
|
34
|
+
def initialize
|
|
35
|
+
@buf = +""
|
|
36
|
+
@offset = 0
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def append(chunk)
|
|
40
|
+
@buf << chunk
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def each_payload
|
|
44
|
+
while (payload = next_payload)
|
|
45
|
+
yield payload
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def has_partial?
|
|
50
|
+
@offset != @buf.bytesize
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
def next_payload
|
|
56
|
+
if @buf.bytesize - @offset < PARALLEL_FRAME_HEADER_BYTES
|
|
57
|
+
compact!
|
|
58
|
+
return nil
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
|
|
62
|
+
frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
|
|
63
|
+
if @buf.bytesize - @offset < frame_len
|
|
64
|
+
compact!
|
|
65
|
+
return nil
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
|
|
69
|
+
@offset += frame_len
|
|
70
|
+
payload
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def compact!
|
|
74
|
+
if @offset > 0
|
|
75
|
+
@buf.slice!(0, @offset)
|
|
76
|
+
@offset = 0
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
|
|
82
|
+
if input.is_a?(Array)
|
|
83
|
+
@file_paths = input
|
|
84
|
+
@stdin = nil
|
|
85
|
+
else
|
|
86
|
+
@file_paths = []
|
|
87
|
+
@stdin = input
|
|
88
|
+
end
|
|
33
89
|
@out = out
|
|
34
90
|
@err = err
|
|
35
91
|
@lax = lax
|
|
36
92
|
@output_format = output_format
|
|
37
93
|
@atomic_write_bytes = atomic_write_bytes
|
|
38
94
|
@output_buffer = +""
|
|
95
|
+
@input_errors = false
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def input_errors?
|
|
99
|
+
@input_errors
|
|
39
100
|
end
|
|
40
101
|
|
|
41
|
-
def run(expression, verbose: false)
|
|
102
|
+
def run(expression, parallel: 1, verbose: false)
|
|
103
|
+
blocks = build_stage_blocks(expression, verbose: verbose)
|
|
104
|
+
if @output_format == :tsv
|
|
105
|
+
values = []
|
|
106
|
+
process_values(blocks, parallel: parallel, verbose: verbose) do |value|
|
|
107
|
+
values << value
|
|
108
|
+
end
|
|
109
|
+
emit_tsv(values)
|
|
110
|
+
else
|
|
111
|
+
process_values(blocks, parallel: parallel, verbose: verbose) do |value|
|
|
112
|
+
emit_output(value)
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
ensure
|
|
116
|
+
write_output(@output_buffer)
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
private
|
|
120
|
+
|
|
121
|
+
def build_stage_blocks(expression, verbose:)
|
|
42
122
|
parsed = PipelineParser.new(expression).parse
|
|
43
123
|
stages = parsed[:stages]
|
|
44
124
|
dump_stages(stages) if verbose
|
|
45
|
-
|
|
46
|
-
blocks = stages.map { |stage|
|
|
125
|
+
stages.map { |stage|
|
|
47
126
|
eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
|
|
48
127
|
}
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def apply_pipeline(blocks, input_enum)
|
|
49
131
|
pipeline = Pipeline.new(*blocks)
|
|
132
|
+
Enumerator.new do |y|
|
|
133
|
+
pipeline.call(input_enum) { |value| y << value }
|
|
134
|
+
end
|
|
135
|
+
end
|
|
50
136
|
|
|
51
|
-
|
|
137
|
+
def each_input_enum
|
|
138
|
+
Enumerator.new { |y| each_input_value { |v| y << v } }
|
|
139
|
+
end
|
|
52
140
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
141
|
+
def process_values(blocks, parallel:, verbose:, &block)
|
|
142
|
+
if parallel <= 1 || @file_paths.length <= 1
|
|
143
|
+
# Single file or no parallelism requested — serial is the only option.
|
|
144
|
+
# This also covers the all-files-empty case: no files means no workers to spawn.
|
|
145
|
+
dump_parallel_status("disabled", verbose: verbose)
|
|
146
|
+
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
split_index, probe_stage = classify_parallel_stages(blocks)
|
|
150
|
+
if split_index.nil?
|
|
151
|
+
dump_parallel_status("disabled", verbose: verbose)
|
|
152
|
+
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# If the first reducer stage is decomposable, workers run everything up to
|
|
156
|
+
# and including it (map prefix + reducer), emit partial accumulators, and the
|
|
157
|
+
# parent merges. This covers both pure reducers (split_index == 0, e.g. `sum(_)`)
|
|
158
|
+
# and map-then-reduce (split_index > 0, e.g. `select(...) >> sum(...)`).
|
|
159
|
+
if probe_stage&.decomposable?
|
|
160
|
+
worker_blocks = blocks[0..split_index]
|
|
161
|
+
rest_blocks = blocks[(split_index + 1)..]
|
|
162
|
+
return process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage,
|
|
163
|
+
parallel: parallel, verbose: verbose, &block)
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
if split_index == 0
|
|
167
|
+
dump_parallel_status("disabled", verbose: verbose)
|
|
168
|
+
return apply_pipeline(blocks, each_input_enum).each(&block)
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
map_blocks = blocks[0...split_index]
|
|
172
|
+
reduce_blocks = blocks[split_index..]
|
|
173
|
+
dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
|
|
174
|
+
input_enum = parallel_map_enum(map_blocks, parallel)
|
|
175
|
+
(reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)).each(&block)
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def dump_parallel_status(status, verbose:)
|
|
179
|
+
@err.puts "parallel: #{status}" if verbose
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Returns [split_index, probe_stage] where split_index is the index of the
|
|
183
|
+
# first reducer stage (or blocks.length if all are passthrough), and probe_stage
|
|
184
|
+
# is the Stage object of that first reducer (nil if all passthrough or no input).
|
|
185
|
+
def classify_parallel_stages(blocks)
|
|
186
|
+
# Read the first row from the first file to probe stage modes
|
|
187
|
+
first_value = nil
|
|
188
|
+
open_file(@file_paths.first) do |stream|
|
|
189
|
+
each_stream_value(stream) do |value|
|
|
190
|
+
first_value = value
|
|
191
|
+
break
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
return [nil, nil] if first_value.nil?
|
|
195
|
+
|
|
196
|
+
# Run the value through each stage independently to classify
|
|
197
|
+
split_index = nil
|
|
198
|
+
probe_stage = nil
|
|
199
|
+
blocks.each_with_index do |block, i|
|
|
200
|
+
probe_pipeline = Pipeline.new(block)
|
|
201
|
+
probe_pipeline.call([first_value]) { |_| }
|
|
202
|
+
stage = probe_pipeline.instance_variable_get(:@stages).first
|
|
203
|
+
if stage.instance_variable_get(:@mode) == :reducer
|
|
204
|
+
split_index = i
|
|
205
|
+
probe_stage = stage
|
|
206
|
+
break
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
[split_index || blocks.length, probe_stage]
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
def process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage, parallel:, verbose:, &block)
|
|
214
|
+
dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} decompose=#{worker_blocks.length}/#{worker_blocks.length + rest_blocks.length}", verbose: verbose)
|
|
215
|
+
|
|
216
|
+
# Workers run map prefix + reducer stage per file and emit partial accumulators.
|
|
217
|
+
partials_list = []
|
|
218
|
+
reducer_stage_index = worker_blocks.length - 1
|
|
219
|
+
spawner = ->(path) do
|
|
220
|
+
spawn_worker(worker_blocks, path) do |pipeline, input|
|
|
221
|
+
pipeline.call(input) { |_| }
|
|
222
|
+
# If the file was empty, the stage was never initialized (no reducers),
|
|
223
|
+
# so skip emitting — the parent will simply not receive a partial for this worker.
|
|
224
|
+
stage = pipeline.instance_variable_get(:@stages)[reducer_stage_index]
|
|
225
|
+
partials = stage.partial_accumulators
|
|
226
|
+
emit_parallel_frame(partials) unless partials.empty?
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
children = run_parallel_worker_pool(parallel, spawner) { |v| partials_list << v }
|
|
230
|
+
wait_for_parallel_children(children) if children
|
|
231
|
+
return if partials_list.empty?
|
|
232
|
+
|
|
233
|
+
# Reuse the probe stage (already initialized with reducer structure from classify).
|
|
234
|
+
# Replace its accumulators with the first worker's partials, then merge the rest.
|
|
235
|
+
probe_stage.replace_accumulators!(partials_list.first)
|
|
236
|
+
partials_list.drop(1).each { |partials| probe_stage.merge_partials!(partials) }
|
|
237
|
+
|
|
238
|
+
# Finish the reducer stage and pass results through any remaining stages.
|
|
239
|
+
results = probe_stage.finish
|
|
240
|
+
if rest_blocks.empty?
|
|
241
|
+
results.each(&block)
|
|
57
242
|
else
|
|
58
|
-
|
|
243
|
+
apply_pipeline(rest_blocks, results.each).each(&block)
|
|
59
244
|
end
|
|
60
|
-
ensure
|
|
61
|
-
write_output(@output_buffer)
|
|
62
245
|
end
|
|
63
246
|
|
|
64
|
-
|
|
247
|
+
# Forks a worker process that reads `path`, builds a pipeline from `blocks`,
|
|
248
|
+
# and yields [pipeline, input_enum] to the caller's block for custom behavior.
|
|
249
|
+
# Returns [read_io, pid].
|
|
250
|
+
def spawn_worker(blocks, path)
|
|
251
|
+
read_io, write_io = IO.pipe
|
|
252
|
+
pid = fork do
|
|
253
|
+
read_io.close
|
|
254
|
+
@out = write_io
|
|
255
|
+
@output_buffer = +""
|
|
256
|
+
pipeline = Pipeline.new(*blocks)
|
|
257
|
+
input_enum = Enumerator.new do |y|
|
|
258
|
+
open_file(path) { |stream| each_stream_value(stream) { |v| y << v } }
|
|
259
|
+
end
|
|
260
|
+
worker_failed = false
|
|
261
|
+
begin
|
|
262
|
+
yield pipeline, input_enum
|
|
263
|
+
rescue => e
|
|
264
|
+
@err.puts "#{path}: #{e.message} (#{e.class})"
|
|
265
|
+
worker_failed = true
|
|
266
|
+
end
|
|
267
|
+
write_output(@output_buffer)
|
|
268
|
+
write_io.close
|
|
269
|
+
exit!(worker_failed ? 1 : 0)
|
|
270
|
+
end
|
|
271
|
+
write_io.close
|
|
272
|
+
[read_io, pid]
|
|
273
|
+
end
|
|
65
274
|
|
|
66
|
-
|
|
67
|
-
|
|
275
|
+
# Runs a pool of up to `num_workers` concurrent workers across all input files.
|
|
276
|
+
# `spawner` is called with a file path and must return [read_io, pid].
|
|
277
|
+
# Yields each decoded JSON value from worker output frames.
|
|
278
|
+
def run_parallel_worker_pool(num_workers, spawner)
|
|
279
|
+
file_queue = @file_paths.dup
|
|
280
|
+
workers = {} # read_io => [reader, pid]
|
|
281
|
+
children = []
|
|
282
|
+
|
|
283
|
+
# Fill initial pool
|
|
284
|
+
while workers.size < num_workers && !file_queue.empty?
|
|
285
|
+
read_io, pid = spawner.call(file_queue.shift)
|
|
286
|
+
workers[read_io] = [ParallelFrameReader.new, pid]
|
|
287
|
+
children << pid
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
read_ios = workers.keys.dup
|
|
291
|
+
|
|
292
|
+
until read_ios.empty?
|
|
293
|
+
ready = IO.select(read_ios)
|
|
294
|
+
ready[0].each do |io|
|
|
295
|
+
reader = workers[io][0]
|
|
296
|
+
chunk = io.read_nonblock(65536, exception: false)
|
|
297
|
+
if chunk == :wait_readable
|
|
298
|
+
next
|
|
299
|
+
elsif chunk.nil?
|
|
300
|
+
raise IOError, "truncated parallel frame from worker" if reader.has_partial?
|
|
301
|
+
read_ios.delete(io)
|
|
302
|
+
io.close
|
|
303
|
+
workers.delete(io)
|
|
304
|
+
|
|
305
|
+
# Spawn next worker if files remain
|
|
306
|
+
unless file_queue.empty?
|
|
307
|
+
read_io, pid = spawner.call(file_queue.shift)
|
|
308
|
+
workers[read_io] = [ParallelFrameReader.new, pid]
|
|
309
|
+
children << pid
|
|
310
|
+
read_ios << read_io
|
|
311
|
+
end
|
|
312
|
+
else
|
|
313
|
+
reader.append(chunk)
|
|
314
|
+
reader.each_payload do |payload|
|
|
315
|
+
yield JSON.parse(payload)
|
|
316
|
+
end
|
|
317
|
+
end
|
|
318
|
+
end
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
children
|
|
322
|
+
end
|
|
323
|
+
|
|
324
|
+
def parallel_map_enum(map_blocks, num_workers)
|
|
325
|
+
children = nil
|
|
326
|
+
spawner = ->(path) do
|
|
327
|
+
spawn_worker(map_blocks, path) do |pipeline, input|
|
|
328
|
+
pipeline.call(input) { |value| emit_parallel_frame(value) }
|
|
329
|
+
end
|
|
330
|
+
end
|
|
331
|
+
Enumerator.new do |y|
|
|
332
|
+
children = run_parallel_worker_pool(num_workers, spawner) { |value| y << value }
|
|
333
|
+
ensure
|
|
334
|
+
wait_for_parallel_children(children) if children
|
|
335
|
+
end
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
def wait_for_parallel_children(children)
|
|
339
|
+
failed = false
|
|
340
|
+
children.each do |pid|
|
|
341
|
+
_, status = Process.waitpid2(pid)
|
|
342
|
+
failed = true unless status.success?
|
|
343
|
+
end
|
|
344
|
+
exit(1) if failed
|
|
345
|
+
end
|
|
68
346
|
|
|
69
|
-
|
|
347
|
+
def emit_parallel_frame(value)
|
|
348
|
+
payload = JSON.generate(value)
|
|
349
|
+
buffer_output([payload.bytesize].pack("N") << payload)
|
|
70
350
|
end
|
|
71
351
|
|
|
72
|
-
def
|
|
352
|
+
def each_input_value
|
|
73
353
|
each_input do |source|
|
|
74
|
-
source
|
|
75
|
-
|
|
76
|
-
|
|
354
|
+
each_stream_value(source) { |value| yield value }
|
|
355
|
+
end
|
|
356
|
+
end
|
|
77
357
|
|
|
78
|
-
|
|
79
|
-
|
|
358
|
+
def each_stream_value(stream)
|
|
359
|
+
return each_stream_value_lax(stream) { |value| yield value } if @lax
|
|
360
|
+
|
|
361
|
+
stream.each_line do |line|
|
|
362
|
+
line.strip!
|
|
363
|
+
next if line.empty?
|
|
364
|
+
yield JSON.parse(line)
|
|
365
|
+
end
|
|
366
|
+
end
|
|
367
|
+
|
|
368
|
+
def open_file(path)
|
|
369
|
+
if path.end_with?(".gz")
|
|
370
|
+
Zlib::GzipReader.open(path) { |source| yield source }
|
|
371
|
+
else
|
|
372
|
+
File.open(path, "rb") { |source| yield source }
|
|
80
373
|
end
|
|
81
374
|
end
|
|
82
375
|
|
|
83
|
-
def
|
|
376
|
+
def each_stream_value_lax(stream)
|
|
84
377
|
require "oj"
|
|
85
|
-
|
|
378
|
+
Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
|
|
379
|
+
rescue LoadError
|
|
380
|
+
raise "oj is required for --lax mode (gem install oj)"
|
|
381
|
+
rescue Oj::ParseError => e
|
|
382
|
+
raise JSON::ParserError, e.message
|
|
383
|
+
end
|
|
384
|
+
|
|
385
|
+
def streaming_json_handler_class
|
|
386
|
+
@streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
|
|
86
387
|
def initialize(&emit)
|
|
87
388
|
@emit = emit
|
|
88
389
|
end
|
|
@@ -94,13 +395,6 @@ module Jrf
|
|
|
94
395
|
def array_append(array, value) = array << value
|
|
95
396
|
def add_value(value) = @emit.call(value)
|
|
96
397
|
end
|
|
97
|
-
each_input do |source|
|
|
98
|
-
Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
|
|
99
|
-
end
|
|
100
|
-
rescue LoadError
|
|
101
|
-
raise "oj is required for --lax mode (gem install oj)"
|
|
102
|
-
rescue Oj::ParseError => e
|
|
103
|
-
raise JSON::ParserError, e.message
|
|
104
398
|
end
|
|
105
399
|
|
|
106
400
|
def dump_stages(stages)
|
|
@@ -109,8 +403,25 @@ module Jrf
|
|
|
109
403
|
end
|
|
110
404
|
end
|
|
111
405
|
|
|
112
|
-
def each_input
|
|
113
|
-
@
|
|
406
|
+
def each_input(&block)
|
|
407
|
+
if @file_paths.empty?
|
|
408
|
+
with_error_handling("<stdin>") { block.call(@stdin) }
|
|
409
|
+
else
|
|
410
|
+
@file_paths.each do |path|
|
|
411
|
+
if path == "-"
|
|
412
|
+
with_error_handling("<stdin>") { block.call(@stdin) }
|
|
413
|
+
else
|
|
414
|
+
with_error_handling(path) { open_file(path, &block) }
|
|
415
|
+
end
|
|
416
|
+
end
|
|
417
|
+
end
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
def with_error_handling(name)
|
|
421
|
+
yield
|
|
422
|
+
rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => e
|
|
423
|
+
@err.puts "#{name}: #{e.message} (#{e.class})"
|
|
424
|
+
@input_errors = true
|
|
114
425
|
end
|
|
115
426
|
|
|
116
427
|
def emit_output(value)
|
|
@@ -171,7 +482,13 @@ module Jrf
|
|
|
171
482
|
end
|
|
172
483
|
|
|
173
484
|
def write_output(str)
|
|
174
|
-
|
|
485
|
+
return if str.empty?
|
|
486
|
+
|
|
487
|
+
total = 0
|
|
488
|
+
while total < str.bytesize
|
|
489
|
+
written = @out.syswrite(str.byteslice(total..))
|
|
490
|
+
total += written
|
|
491
|
+
end
|
|
175
492
|
end
|
|
176
493
|
end
|
|
177
494
|
end
|
data/lib/jrf/cli.rb
CHANGED
|
@@ -18,6 +18,7 @@ module Jrf
|
|
|
18
18
|
--lax allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
|
|
19
19
|
-o, --output FORMAT
|
|
20
20
|
output format: json (default), pretty, tsv
|
|
21
|
+
-P N opportunistically parallelize across N workers
|
|
21
22
|
-r, --require LIBRARY
|
|
22
23
|
require LIBRARY before evaluating stages
|
|
23
24
|
--no-jit do not enable YJIT, even when supported by the Ruby runtime
|
|
@@ -45,6 +46,7 @@ module Jrf
|
|
|
45
46
|
verbose = false
|
|
46
47
|
lax = false
|
|
47
48
|
output_format = :json
|
|
49
|
+
parallel = 1
|
|
48
50
|
jit = true
|
|
49
51
|
required_libraries = []
|
|
50
52
|
atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
|
|
@@ -54,6 +56,7 @@ module Jrf
|
|
|
54
56
|
opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
|
|
55
57
|
opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
|
|
56
58
|
opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
|
|
59
|
+
opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
|
|
57
60
|
opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
|
|
58
61
|
opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
|
|
59
62
|
opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
|
|
@@ -89,34 +92,20 @@ module Jrf
|
|
|
89
92
|
enable_yjit if jit
|
|
90
93
|
required_libraries.each { |library| require library }
|
|
91
94
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
argv.each do |path|
|
|
97
|
-
if path == "-"
|
|
98
|
-
y << input
|
|
99
|
-
elsif path.end_with?(".gz")
|
|
100
|
-
require "zlib"
|
|
101
|
-
Zlib::GzipReader.open(path) do |source|
|
|
102
|
-
y << source
|
|
103
|
-
end
|
|
104
|
-
else
|
|
105
|
-
File.open(path, "rb") do |source|
|
|
106
|
-
y << source
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
end
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
Runner.new(
|
|
113
|
-
inputs: inputs,
|
|
95
|
+
file_paths = argv.dup
|
|
96
|
+
|
|
97
|
+
runner = Runner.new(
|
|
98
|
+
input: file_paths.empty? ? input : file_paths,
|
|
114
99
|
out: out,
|
|
115
100
|
err: err,
|
|
116
101
|
lax: lax,
|
|
117
102
|
output_format: output_format,
|
|
118
103
|
atomic_write_bytes: atomic_write_bytes
|
|
119
|
-
)
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
runner.run(expression, parallel: parallel, verbose: verbose)
|
|
107
|
+
|
|
108
|
+
exit 1 if runner.input_errors?
|
|
120
109
|
end
|
|
121
110
|
|
|
122
111
|
def self.enable_yjit
|
data/lib/jrf/reducers.rb
CHANGED
|
@@ -20,8 +20,37 @@ module Jrf
|
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
+
# A reducer whose partial accumulators can be merged across parallel workers.
|
|
24
|
+
#
|
|
25
|
+
# Contract:
|
|
26
|
+
# - `identity` is the neutral element for `merge_fn`: merge(identity, x) == x
|
|
27
|
+
# - `initial` is always set to `identity` (the accumulator starts from the neutral element)
|
|
28
|
+
# - Any bias (e.g. sum's `initial:` keyword) is applied in `finish_fn`, not in the starting accumulator
|
|
29
|
+
class DecomposableReduce < Reduce
|
|
30
|
+
attr_reader :merge_fn
|
|
31
|
+
|
|
32
|
+
def initialize(identity, merge:, finish_fn: nil, &step_fn)
|
|
33
|
+
super(identity, finish_fn: finish_fn, &step_fn)
|
|
34
|
+
@merge_fn = merge
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Returns the raw accumulator without applying finish_fn.
|
|
38
|
+
def partial
|
|
39
|
+
@acc
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Merges another partial accumulator into this one.
|
|
43
|
+
def merge_partial(other_acc)
|
|
44
|
+
@acc = @merge_fn.call(@acc, other_acc)
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
23
48
|
def reduce(initial, finish: nil, &step_fn)
|
|
24
49
|
Reduce.new(initial, finish_fn: finish, &step_fn)
|
|
25
50
|
end
|
|
51
|
+
|
|
52
|
+
def decomposable_reduce(identity, merge:, finish: nil, &step_fn)
|
|
53
|
+
DecomposableReduce.new(identity, merge: merge, finish_fn: finish, &step_fn)
|
|
54
|
+
end
|
|
26
55
|
end
|
|
27
56
|
end
|
data/lib/jrf/row_context.rb
CHANGED
|
@@ -17,6 +17,7 @@ module Jrf
|
|
|
17
17
|
spec.fetch(:value),
|
|
18
18
|
initial: reducer_initial_value(spec.fetch(:initial)),
|
|
19
19
|
finish: spec[:finish],
|
|
20
|
+
merge: spec[:merge],
|
|
20
21
|
&spec.fetch(:step)
|
|
21
22
|
)
|
|
22
23
|
end
|
|
@@ -48,27 +49,38 @@ module Jrf
|
|
|
48
49
|
end
|
|
49
50
|
|
|
50
51
|
define_reducer(:sum) do |_ctx, value, initial: 0, block: nil|
|
|
51
|
-
|
|
52
|
+
step = ->(acc, v) { v.nil? ? acc : (acc + v) }
|
|
53
|
+
if initial.is_a?(Numeric)
|
|
54
|
+
# Numeric — decomposable. Bias applied once in finish.
|
|
55
|
+
finish = initial == 0 ? nil : ->(acc) { [acc + initial] }
|
|
56
|
+
{ value: value, initial: 0, step: step, finish: finish, merge: ->(a, b) { a + b } }
|
|
57
|
+
else
|
|
58
|
+
# Non-numeric (e.g. string concat) — not decomposable.
|
|
59
|
+
{ value: value, initial: initial, step: step }
|
|
60
|
+
end
|
|
52
61
|
end
|
|
53
62
|
|
|
54
63
|
define_reducer(:count) do |_ctx, value = MISSING, block: nil|
|
|
64
|
+
merge = ->(a, b) { a + b }
|
|
55
65
|
if value.equal?(MISSING)
|
|
56
|
-
{ value: nil, initial: 0, step: ->(acc, _v) { acc + 1 } }
|
|
66
|
+
{ value: nil, initial: 0, step: ->(acc, _v) { acc + 1 }, merge: merge }
|
|
57
67
|
else
|
|
58
|
-
{ value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) } }
|
|
68
|
+
{ value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) }, merge: merge }
|
|
59
69
|
end
|
|
60
70
|
end
|
|
61
71
|
|
|
62
72
|
define_reducer(:count_if) do |_ctx, condition, block: nil|
|
|
63
|
-
{ value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc } }
|
|
73
|
+
{ value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc }, merge: ->(a, b) { a + b } }
|
|
64
74
|
end
|
|
65
75
|
|
|
66
76
|
define_reducer(:min) do |_ctx, value, block: nil|
|
|
67
|
-
|
|
77
|
+
min_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a < b ? a : b) }
|
|
78
|
+
{ value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v < acc ? v : acc) }, merge: min_merge }
|
|
68
79
|
end
|
|
69
80
|
|
|
70
81
|
define_reducer(:max) do |_ctx, value, block: nil|
|
|
71
|
-
|
|
82
|
+
max_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a > b ? a : b) }
|
|
83
|
+
{ value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v > acc ? v : acc) }, merge: max_merge }
|
|
72
84
|
end
|
|
73
85
|
|
|
74
86
|
define_reducer(:average) do |_ctx, value, block: nil|
|
|
@@ -82,7 +94,8 @@ module Jrf
|
|
|
82
94
|
acc[0] += v
|
|
83
95
|
acc[1] += 1
|
|
84
96
|
acc
|
|
85
|
-
}
|
|
97
|
+
},
|
|
98
|
+
merge: ->(a, b) { [a[0] + b[0], a[1] + b[1]] }
|
|
86
99
|
}
|
|
87
100
|
end
|
|
88
101
|
|
|
@@ -136,7 +149,7 @@ module Jrf
|
|
|
136
149
|
|
|
137
150
|
define_reducer(:group) do |ctx, value = MISSING, block: nil|
|
|
138
151
|
resolved_value = value.equal?(MISSING) ? ctx.send(:current_input) : value
|
|
139
|
-
{ value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v } }
|
|
152
|
+
{ value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v }, merge: ->(a, b) { a + b } }
|
|
140
153
|
end
|
|
141
154
|
|
|
142
155
|
define_reducer(:percentile) do |ctx, value, percentage, block: nil|
|
data/lib/jrf/stage.rb
CHANGED
|
@@ -51,13 +51,17 @@ module Jrf
|
|
|
51
51
|
(@mode == :reducer) ? Control::DROPPED : result
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
-
def step_reduce(value, initial:, finish: nil, step_fn: nil, &step_block)
|
|
54
|
+
def step_reduce(value, initial:, finish: nil, merge: nil, step_fn: nil, &step_block)
|
|
55
55
|
idx = @cursor
|
|
56
56
|
step_fn ||= step_block
|
|
57
57
|
|
|
58
58
|
if @reducers[idx].nil?
|
|
59
59
|
finish_rows = finish || ->(acc) { [acc] }
|
|
60
|
-
@reducers[idx] =
|
|
60
|
+
@reducers[idx] = if merge
|
|
61
|
+
Reducers.decomposable_reduce(initial, merge: merge, finish: finish_rows, &step_fn)
|
|
62
|
+
else
|
|
63
|
+
Reducers.reduce(initial, finish: finish_rows, &step_fn)
|
|
64
|
+
end
|
|
61
65
|
result = ReducerToken.new(idx)
|
|
62
66
|
else
|
|
63
67
|
result = Control::DROPPED
|
|
@@ -167,6 +171,32 @@ module Jrf
|
|
|
167
171
|
end
|
|
168
172
|
end
|
|
169
173
|
|
|
174
|
+
# Returns true if all reducers in this stage are DecomposableReduce instances,
|
|
175
|
+
# meaning partial accumulators from parallel workers can be merged.
|
|
176
|
+
def decomposable?
|
|
177
|
+
@mode == :reducer && @reducers.any? &&
|
|
178
|
+
@reducers.all? { |r| r.is_a?(Reducers::DecomposableReduce) }
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Returns an array of raw accumulator values, one per reducer.
|
|
182
|
+
def partial_accumulators
|
|
183
|
+
@reducers.map(&:partial)
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
# Replaces all reducer accumulators with the given values.
|
|
187
|
+
def replace_accumulators!(partials)
|
|
188
|
+
@reducers.each_with_index do |reducer, i|
|
|
189
|
+
reducer.instance_variable_set(:@acc, partials[i])
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# Merges an array of partial accumulators (from another worker) into this stage's reducers.
|
|
194
|
+
def merge_partials!(other_partials)
|
|
195
|
+
@reducers.each_with_index do |reducer, i|
|
|
196
|
+
reducer.merge_partial(other_partials[i])
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
170
200
|
private
|
|
171
201
|
|
|
172
202
|
def with_scoped_reducers(reducer_list)
|
data/lib/jrf/version.rb
CHANGED
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "test_helper"
|
|
4
|
+
|
|
5
|
+
class CliParallelTest < JrfTestCase
|
|
6
|
+
def test_parallel_map_only
|
|
7
|
+
Dir.mktmpdir do |dir|
|
|
8
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
9
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
|
|
10
|
+
|
|
11
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '_["x"]', *ndjson_files(dir))
|
|
12
|
+
assert_success(status, stderr, "parallel map only")
|
|
13
|
+
assert_equal([1, 2, 3, 4], lines(stdout).map(&:to_i).sort, "parallel map only output")
|
|
14
|
+
assert_includes(stderr, "parallel: enabled workers=2 files=2 split=1/1", "parallel verbose summary")
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def test_parallel_map_only_pretty_output
|
|
19
|
+
Dir.mktmpdir do |dir|
|
|
20
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}])
|
|
21
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 2}])
|
|
22
|
+
|
|
23
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "pretty", '_["x"]', *ndjson_files(dir))
|
|
24
|
+
assert_success(status, stderr, "parallel pretty map only")
|
|
25
|
+
assert_equal(["1", "2"], stdout.lines.map(&:strip).reject(&:empty?).sort, "parallel pretty map only output")
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def test_parallel_map_only_tsv_output
|
|
30
|
+
Dir.mktmpdir do |dir|
|
|
31
|
+
write_ndjson(dir, "a.ndjson", [{"a" => 1, "b" => 2}])
|
|
32
|
+
write_ndjson(dir, "b.ndjson", [{"a" => 3, "b" => 4}])
|
|
33
|
+
|
|
34
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "tsv", "_", *ndjson_files(dir))
|
|
35
|
+
assert_success(status, stderr, "parallel tsv map only")
|
|
36
|
+
assert_equal(["a\t1", "a\t3", "b\t2", "b\t4"], lines(stdout).sort, "parallel tsv map only output")
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def test_parallel_map_reduce
|
|
41
|
+
Dir.mktmpdir do |dir|
|
|
42
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
|
|
43
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
|
|
44
|
+
|
|
45
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
|
|
46
|
+
assert_success(status, stderr, "parallel map reduce")
|
|
47
|
+
assert_equal(%w[10], lines(stdout), "parallel sum output")
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def test_parallel_split_map_and_reduce
|
|
52
|
+
Dir.mktmpdir do |dir|
|
|
53
|
+
write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
|
|
54
|
+
write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
|
|
55
|
+
|
|
56
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
|
|
57
|
+
assert_success(status, stderr, "parallel split map+reduce")
|
|
58
|
+
assert_includes(stderr, "decompose=2/2", "select+sum decomposed")
|
|
59
|
+
assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def test_parallel_group_by
|
|
64
|
+
Dir.mktmpdir do |dir|
|
|
65
|
+
write_ndjson(dir, "a.ndjson", [{"k" => "a", "v" => 1}, {"k" => "b", "v" => 2}])
|
|
66
|
+
write_ndjson(dir, "b.ndjson", [{"k" => "a", "v" => 3}, {"k" => "b", "v" => 4}])
|
|
67
|
+
|
|
68
|
+
stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'group_by(_["k"]) { |r| sum(r["v"]) }', *ndjson_files(dir))
|
|
69
|
+
assert_success(status, stderr, "parallel group_by")
|
|
70
|
+
result = JSON.parse(lines(stdout).first)
|
|
71
|
+
assert_equal(4, result["a"], "parallel group_by a")
|
|
72
|
+
assert_equal(6, result["b"], "parallel group_by b")
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Verbose output must confirm that a decomposable reducer enables parallel mode.
def test_parallel_decomposable_reducer
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel decomposable reducer")
    assert_equal(["6"], lines(out), "parallel decomposable reducer output")
    assert_includes(err, "parallel: enabled", "parallel enabled for decomposable reducer")
    assert_includes(err, "decompose=", "decompose mode indicated")
  end
end
|
|
88
|
+
|
|
89
|
+
# Gzipped inputs are decompressed inside the workers before reducing.
def test_parallel_with_gz_files
  Dir.mktmpdir do |tmpdir|
    first_gz = File.join(tmpdir, "a.ndjson.gz")
    Zlib::GzipWriter.open(first_gz) { |io| io.write("{\"x\":10}\n{\"x\":20}\n") }
    second_gz = File.join(tmpdir, "b.ndjson.gz")
    Zlib::GzipWriter.open(second_gz) { |io| io.write("{\"x\":30}\n") }

    out, err, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', first_gz, second_gz)
    assert_success(status, err, "parallel with gz")
    assert_equal(["60"], lines(out), "parallel with gz output")
  end
end
|
|
101
|
+
|
|
102
|
+
# Parallel group_by must produce the same JSON as a serial run over the same data.
def test_parallel_matches_serial_output
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", (1..50).map { |i| {"v" => i, "g" => i % 3} })
    write_ndjson(tmpdir, "b.ndjson", (51..100).map { |i| {"v" => i, "g" => i % 3} })

    files = ndjson_files(tmpdir)
    expr = 'group_by(_["g"]) { |r| sum(r["v"]) }'

    base_out, base_err, base_status = Open3.capture3("./exe/jrf", expr, *files)
    assert_success(base_status, base_err, "serial baseline")

    par_out, par_err, par_status = Open3.capture3("./exe/jrf", "-P", "2", expr, *files)
    assert_success(par_status, par_err, "parallel run")

    assert_equal(JSON.parse(base_out), JSON.parse(par_out), "parallel matches serial")
  end
end
|
|
119
|
+
|
|
120
|
+
# A worker hitting a corrupt input must fail the run, name the bad file,
# and still let the parent flush rows already produced from healthy files.
def test_parallel_worker_error_handling
  Dir.mktmpdir do |tmpdir|
    intact_path = File.join(tmpdir, "a.ndjson")
    File.write(intact_path, "{\"x\":1}\n{\"x\":2}\n")

    # Create a truncated gz file (valid header, truncated body)
    truncated_gz = File.join(tmpdir, "b.ndjson.gz")
    buffer = StringIO.new
    Zlib::GzipWriter.wrap(buffer) { |io| io.write("{\"x\":10}\n" * 100) }
    # Write only the first half to simulate truncation
    half = buffer.string.bytesize / 2
    File.binwrite(truncated_gz, buffer.string[0, half])

    out, err, status = Open3.capture3("./exe/jrf", "-P", "2", '_["x"]', intact_path, truncated_gz)
    assert_failure(status, "worker error causes non-zero exit")
    assert_includes(err, truncated_gz, "error message includes filename")
    # Good file data should still be present
    values = lines(out).map(&:to_i)
    assert_includes(values, 1, "good file data preserved")
    assert_includes(values, 2, "good file data preserved")
  end
end
|
|
141
|
+
|
|
142
|
+
def test_parallel_requires_multiple_files
  # With single file and -P, should still work (falls back to serial)
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])

    out, err, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "single file with -P")
    assert_equal(["3"], lines(out), "single file with -P output")
  end
end
|
|
152
|
+
|
|
153
|
+
# Filtering before a decomposable sum keeps both stages in the workers.
def test_parallel_select_then_sum
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 40}, {"x" => 5}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel select then sum")
    assert_includes(err, "decompose=2/2", "select+sum fully decomposed in workers")
    assert_equal(["60"], lines(out), "parallel select then sum output")
  end
end
|
|
164
|
+
|
|
165
|
+
# A hash combining sum/count/min/max reducers decomposes as a single unit.
def test_parallel_decomposable_multi_reducer
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"]), mx: max(_["x"])}', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel multi reducer")
    assert_includes(err, "decompose=", "multi reducer decomposed")
    stats = JSON.parse(lines(out).first)
    assert_equal(10, stats["s"], "sum")
    assert_equal(4, stats["n"], "count")
    assert_equal(1, stats["mn"], "min")
    assert_equal(4, stats["mx"], "max")
  end
end
|
|
180
|
+
|
|
181
|
+
# average must decompose into per-worker partials merged by the parent.
def test_parallel_decomposable_average
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 30}, {"x" => 40}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'average(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel average")
    assert_includes(err, "decompose=", "average decomposed")
    assert_equal(["25.0"], lines(out), "parallel average output")
  end
end
|
|
192
|
+
|
|
193
|
+
# group collects values across workers; ordering is unspecified, so sort before comparing.
def test_parallel_decomposable_group
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'group(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel group")
    assert_includes(err, "decompose=", "group decomposed")
    collected = JSON.parse(lines(out).first)
    assert_equal([1, 2, 3], collected.sort, "parallel group output")
  end
end
|
|
205
|
+
|
|
206
|
+
# A numeric initial value must be applied exactly once, not once per worker.
def test_parallel_decomposable_sum_with_initial
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: 100)', *ndjson_files(tmpdir))
    assert_success(status, err, "sum with numeric initial")
    assert_includes(err, "decompose=", "numeric initial decomposes")
    assert_equal(["106"], lines(out), "sum with initial output")
  end
end
|
|
217
|
+
|
|
218
|
+
# String concatenation is order-sensitive, so a non-numeric initial
# value disables parallelism and keeps the run serial.
def test_parallel_sum_with_non_numeric_initial_falls_back
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => "a"}, {"x" => "b"}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => "c"}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: "")', *ndjson_files(tmpdir))
    assert_success(status, err, "sum with string initial")
    assert_includes(err, "parallel: disabled", "non-numeric initial falls back to serial")
    assert_equal(['"abc"'], lines(out), "sum with string initial output")
  end
end
|
|
229
|
+
|
|
230
|
+
# Serial sum with a string initial behaves as plain concatenation.
def test_sum_with_string_initial
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => "hello "}, {"x" => "world"}])

    out, err, status = Open3.capture3("./exe/jrf", 'sum(_["x"], initial: "")', *ndjson_files(tmpdir))
    assert_success(status, err, "sum with string initial")
    assert_equal(['"hello world"'], lines(out), "sum with string initial output")
  end
end
|
|
239
|
+
|
|
240
|
+
# A passthrough stage after the reducer runs in the parent on the merged value.
def test_parallel_decomposable_reducer_then_passthrough
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"]) >> _ * 2', *ndjson_files(tmpdir))
    assert_success(status, err, "parallel decomposable then passthrough")
    assert_includes(err, "decompose=", "reducer then passthrough decomposed")
    assert_equal(["20"], lines(out), "parallel decomposable then passthrough output")
  end
end
|
|
251
|
+
|
|
252
|
+
# An array mixing several decomposable reducers still decomposes as a whole.
def test_parallel_mixed_decomposable_reducers
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 30}, {"x" => 40}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), average(_["x"]), min(_["x"]), max(_["x"]), count()]', *ndjson_files(tmpdir))
    assert_success(status, err, "mixed decomposable")
    assert_includes(err, "decompose=", "mixed decomposable used decompose")
    row = JSON.parse(lines(out).first)
    assert_equal([100, 25.0, 10, 40, 4], row, "mixed decomposable output")
  end
end
|
|
264
|
+
|
|
265
|
+
# percentile cannot be merged from partial results, so the whole
# expression falls back to a serial run.
def test_parallel_mixed_decomposable_and_non_decomposable_falls_back
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 30}, {"x" => 40}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), percentile(_["x"], 0.5)]', *ndjson_files(tmpdir))
    assert_success(status, err, "mixed with non-decomposable")
    assert_includes(err, "parallel: disabled", "mixed with non-decomposable falls back to serial")
    row = JSON.parse(lines(out).first)
    assert_equal([100, 20], row, "mixed with non-decomposable output")
  end
end
|
|
277
|
+
|
|
278
|
+
# select+sum decompose into workers (2/3); the trailing passthrough stays in the parent.
def test_parallel_select_sum_passthrough_decomposes
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 20}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 40}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"]) >> _ * 2', *ndjson_files(tmpdir))
    assert_success(status, err, "select+sum+passthrough")
    assert_includes(err, "decompose=2/3", "select+sum decomposed, passthrough in parent")
    assert_equal(["120"], lines(out), "select+sum+passthrough output")
  end
end
|
|
289
|
+
|
|
290
|
+
# A non-decomposable sort forces split mode: only the map prefix (1/3)
# runs in the workers, the remaining stages in the parent.
def test_parallel_select_non_decomposable_uses_split
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 3}, {"x" => 1}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 2}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 0) >> sort(_["x"]) >> _["x"]', *ndjson_files(tmpdir))
    assert_success(status, err, "select+sort uses split")
    assert_includes(err, "split=1/3", "non-decomposable sort uses map-prefix split")
    assert_equal([1, 2, 3], lines(out).map { |line| JSON.parse(line) }, "select+sort output")
  end
end
|
|
301
|
+
|
|
302
|
+
# An empty file contributes nothing to the merged reducer state.
def test_parallel_decomposable_with_empty_file
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    File.write(File.join(tmpdir, "b.ndjson"), "")

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"])}', *ndjson_files(tmpdir))
    assert_success(status, err, "decomposable with empty file")
    assert_includes(err, "decompose=", "decomposable with empty file used decompose")
    stats = JSON.parse(lines(out).first)
    assert_equal(3, stats["s"], "sum ignores empty file")
    assert_equal(2, stats["n"], "count ignores empty file")
    assert_equal(1, stats["mn"], "min ignores empty file")
  end
end
|
|
316
|
+
|
|
317
|
+
def test_parallel_decomposable_all_files_empty
  Dir.mktmpdir do |tmpdir|
    File.write(File.join(tmpdir, "a.ndjson"), "")
    File.write(File.join(tmpdir, "b.ndjson"), "")

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(tmpdir))
    assert_success(status, err, "all files empty")
    # All files empty means first_value is nil, so classify returns nil → serial fallback
    assert_includes(err, "parallel: disabled", "all files empty falls back to serial")
    assert_equal([], lines(out), "no output for empty input")
  end
end
|
|
329
|
+
|
|
330
|
+
# A pipeline headed by a non-decomposable stage runs serially even with -P.
def test_parallel_non_decomposable_falls_back_to_serial
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
    write_ndjson(tmpdir, "b.ndjson", [{"x" => 3}])

    out, err, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sort(_["x"]) >> _["x"]', *ndjson_files(tmpdir))
    assert_success(status, err, "non-decomposable serial fallback")
    assert_equal([1, 2, 3], lines(out).map { |line| JSON.parse(line) }, "sort output")
    assert_includes(err, "parallel: disabled", "non-decomposable falls back to serial")
  end
end
|
|
341
|
+
|
|
342
|
+
# End-to-end check: a multi-reducer hash produces identical JSON in
# decomposed parallel mode and in a plain serial run.
def test_parallel_decomposable_matches_serial
  Dir.mktmpdir do |tmpdir|
    write_ndjson(tmpdir, "a.ndjson", (1..50).map { |i| {"v" => i} })
    write_ndjson(tmpdir, "b.ndjson", (51..100).map { |i| {"v" => i} })

    files = ndjson_files(tmpdir)
    expr = '{s: sum(_["v"]), n: count(), mn: min(_["v"]), mx: max(_["v"]), avg: average(_["v"])}'

    base_out, base_err, base_status = Open3.capture3("./exe/jrf", expr, *files)
    assert_success(base_status, base_err, "serial baseline")

    par_out, par_err, par_status = Open3.capture3("./exe/jrf", "-v", "-P", "2", expr, *files)
    assert_success(par_status, par_err, "parallel run")
    assert_includes(par_err, "decompose=", "decomposable matches serial used decompose")

    assert_equal(JSON.parse(base_out), JSON.parse(par_out), "parallel decomposable matches serial")
  end
end
|
|
360
|
+
|
|
361
|
+
# Serial runs must also name the offending file on read errors, keep the
# message free of a Ruby backtrace, and not drop rows from healthy files.
def test_serial_error_includes_filename
  Dir.mktmpdir do |tmpdir|
    first_good = File.join(tmpdir, "a.ndjson")
    File.write(first_good, "{\"x\":1}\n{\"x\":2}\n")

    truncated_gz = File.join(tmpdir, "b.ndjson.gz")
    buffer = StringIO.new
    Zlib::GzipWriter.wrap(buffer) { |io| io.write("{\"x\":10}\n" * 100) }
    File.binwrite(truncated_gz, buffer.string[0, buffer.string.bytesize / 2])

    second_good = File.join(tmpdir, "c.ndjson")
    File.write(second_good, "{\"x\":3}\n")

    out, err, status = Open3.capture3("./exe/jrf", '_["x"]', first_good, truncated_gz, second_good)
    assert_failure(status, "serial error causes non-zero exit")
    assert_includes(err, truncated_gz, "serial error message includes filename")
    refute_includes(err, "from ", "serial error does not include stacktrace")
    # Data from good files should still be present
    values = lines(out).map(&:to_i)
    assert_includes(values, 1, "data before bad file preserved")
    assert_includes(values, 3, "data after bad file preserved")
  end
end
|
|
384
|
+
|
|
385
|
+
private
|
|
386
|
+
|
|
387
|
+
# Serialize each row hash as one NDJSON line and write the result to dir/name.
def write_ndjson(dir, name, rows)
  body = rows.map { |row| "#{JSON.generate(row)}\n" }.join
  File.write(File.join(dir, name), body)
end
|
|
390
|
+
|
|
391
|
+
# Return the *.ndjson paths under dir in deterministic (sorted) order.
def ndjson_files(dir)
  Dir[File.join(dir, "*.ndjson")].sort
end
|
|
394
|
+
end
|
data/test/cli_runner_test.rb
CHANGED
|
@@ -106,24 +106,23 @@ class CliRunnerTest < JrfTestCase
|
|
|
106
106
|
|
|
107
107
|
def test_runner_buffering_and_require_option
|
|
108
108
|
threshold_input = StringIO.new((1..4).map { |i| "{\"foo\":\"#{'x' * 1020}\",\"i\":#{i}}\n" }.join)
|
|
109
|
-
buffered_runner = RecordingRunner.new(
|
|
109
|
+
buffered_runner = RecordingRunner.new(input: threshold_input, out: StringIO.new, err: StringIO.new)
|
|
110
110
|
buffered_runner.run('_')
|
|
111
111
|
expected_line = JSON.generate({"foo" => "x" * 1020, "i" => 1}) + "\n"
|
|
112
112
|
assert_equal(2, buffered_runner.writes.length, "default atomic write limit buffers records until the configured threshold")
|
|
113
113
|
assert_equal(expected_line.bytesize * 3, buffered_runner.writes.first.bytesize, "default atomic write limit flushes before the next record would exceed the threshold")
|
|
114
114
|
assert_equal(expected_line.bytesize, buffered_runner.writes.last.bytesize, "final buffer flush emits the remaining record")
|
|
115
115
|
|
|
116
|
-
small_limit_runner = RecordingRunner.new(
|
|
116
|
+
small_limit_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":2}\n"), out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
|
|
117
117
|
small_limit_runner.run('_["foo"]')
|
|
118
118
|
assert_equal(["1\n", "2\n"], small_limit_runner.writes, "small atomic write limit emits oversized records directly")
|
|
119
119
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
end
|
|
120
|
+
err_io = StringIO.new
|
|
121
|
+
error_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":"), out: StringIO.new, err: err_io)
|
|
122
|
+
error_runner.run('_["foo"]')
|
|
123
|
+
assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors")
|
|
124
|
+
assert_includes(err_io.string, "JSON::ParserError", "parse error reported to stderr")
|
|
125
|
+
assert(error_runner.input_errors?, "input_errors? is true after parse error")
|
|
127
126
|
|
|
128
127
|
input_hello = <<~NDJSON
|
|
129
128
|
{"hello":123}
|
|
@@ -648,7 +647,7 @@ class CliRunnerTest < JrfTestCase
|
|
|
648
647
|
assert_equal(%w[9], lines(stdout), "lax trailing separator output")
|
|
649
648
|
|
|
650
649
|
chunked_lax_out = RecordingRunner.new(
|
|
651
|
-
|
|
650
|
+
input: ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n"),
|
|
652
651
|
out: StringIO.new,
|
|
653
652
|
err: StringIO.new,
|
|
654
653
|
lax: true
|
|
@@ -691,6 +690,7 @@ class CliRunnerTest < JrfTestCase
|
|
|
691
690
|
assert_failure(status, "broken input should fail")
|
|
692
691
|
assert_equal(%w[3], lines(stdout), "reducers flush before parse error")
|
|
693
692
|
assert_includes(stderr, "JSON::ParserError")
|
|
693
|
+
refute_includes(stderr, "from ", "no stacktrace for parse errors")
|
|
694
694
|
end
|
|
695
695
|
|
|
696
696
|
def test_map
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: jrf
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.15
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- kazuho
|
|
@@ -75,6 +75,7 @@ files:
|
|
|
75
75
|
- lib/jrf/row_context.rb
|
|
76
76
|
- lib/jrf/stage.rb
|
|
77
77
|
- lib/jrf/version.rb
|
|
78
|
+
- test/cli_parallel_test.rb
|
|
78
79
|
- test/cli_runner_test.rb
|
|
79
80
|
- test/library_api_test.rb
|
|
80
81
|
- test/readme_examples_test.rb
|