jrf 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2862eaf6bd5f2486ea2c6aebf5caa4fbc2de56f419625bf8bb462392a3ea5dd9
4
- data.tar.gz: 3f29e7024f4e33606d78ad01ce4c45f37c9cd652ba94ac490866cd877368037a
3
+ metadata.gz: de85d7a03d58baee4c931d10869a824a1ff5c2eec121cd15e63ec23805203676
4
+ data.tar.gz: ce3c53475e13d41e3a176ef7c9ea840145fbbf826612457386cc0899c28a1af0
5
5
  SHA512:
6
- metadata.gz: 04f55e0ea8c24f70126964beffbe80bee1800e1e210da2f96186bb8ebdf5542e5dfbab9c06b48624da4ec35912d02456561ec6c0d2c66c094de001ecf7f4096f
7
- data.tar.gz: '093821f35539be4561867b711664a31d3441052fe53e1c0f73489cd8b11fdf845bfb5573375f880ce07cd73ffc2f1d0514b55b760227c01ab515afd39d8ac08a'
6
+ metadata.gz: ded54cff09febe7fe02c585f30a702cd82cd11aeb563f840b9f182cd8a6e94c090ba5e71fbd6cad7f377816c14de41c12f5fed449b2b8f7c1d682513db2f19ee
7
+ data.tar.gz: bdd4f9ee2ff809cc718b497a39783027f9c7322582dfdaebc8cae8b6bf4cb1d56b9639a356f2239be66b6ee39693c1da994b4fa9b9db5ecfb6c10d66ede021d9
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "json"
4
+ require "zlib"
4
5
  require_relative "../pipeline"
5
6
  require_relative "../pipeline_parser"
6
7
 
@@ -9,6 +10,7 @@ module Jrf
9
10
  class Runner
10
11
  RS_CHAR = "\x1e"
11
12
  DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
13
+ PARALLEL_FRAME_HEADER_BYTES = 4
12
14
 
13
15
  class RsNormalizer
14
16
  def initialize(input)
@@ -19,7 +21,7 @@ module Jrf
19
21
  chunk = @input.read(length)
20
22
  return nil if chunk.nil?
21
23
 
22
- chunk = chunk.tr(RS_CHAR, "\n")
24
+ chunk.tr!(RS_CHAR, "\n")
23
25
  if outbuf
24
26
  outbuf.replace(chunk)
25
27
  else
@@ -28,61 +30,360 @@ module Jrf
28
30
  end
29
31
  end
30
32
 
31
- def initialize(inputs:, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
32
- @inputs = inputs
33
+ class ParallelFrameReader
34
+ def initialize
35
+ @buf = +""
36
+ @offset = 0
37
+ end
38
+
39
+ def append(chunk)
40
+ @buf << chunk
41
+ end
42
+
43
+ def each_payload
44
+ while (payload = next_payload)
45
+ yield payload
46
+ end
47
+ end
48
+
49
+ def has_partial?
50
+ @offset != @buf.bytesize
51
+ end
52
+
53
+ private
54
+
55
+ def next_payload
56
+ if @buf.bytesize - @offset < PARALLEL_FRAME_HEADER_BYTES
57
+ compact!
58
+ return nil
59
+ end
60
+
61
+ payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
62
+ frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
63
+ if @buf.bytesize - @offset < frame_len
64
+ compact!
65
+ return nil
66
+ end
67
+
68
+ payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
69
+ @offset += frame_len
70
+ payload
71
+ end
72
+
73
+ def compact!
74
+ if @offset > 0
75
+ @buf.slice!(0, @offset)
76
+ @offset = 0
77
+ end
78
+ end
79
+ end
80
+
81
+ def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
82
+ if input.is_a?(Array)
83
+ @file_paths = input
84
+ @stdin = nil
85
+ else
86
+ @file_paths = []
87
+ @stdin = input
88
+ end
33
89
  @out = out
34
90
  @err = err
35
91
  @lax = lax
36
92
  @output_format = output_format
37
93
  @atomic_write_bytes = atomic_write_bytes
38
94
  @output_buffer = +""
95
+ @input_errors = false
96
+ end
97
+
98
+ def input_errors?
99
+ @input_errors
39
100
  end
40
101
 
41
- def run(expression, verbose: false)
102
+ def run(expression, parallel: 1, verbose: false)
103
+ blocks = build_stage_blocks(expression, verbose: verbose)
104
+ if @output_format == :tsv
105
+ values = []
106
+ process_values(blocks, parallel: parallel, verbose: verbose) do |value|
107
+ values << value
108
+ end
109
+ emit_tsv(values)
110
+ else
111
+ process_values(blocks, parallel: parallel, verbose: verbose) do |value|
112
+ emit_output(value)
113
+ end
114
+ end
115
+ ensure
116
+ write_output(@output_buffer)
117
+ end
118
+
119
+ private
120
+
121
+ def build_stage_blocks(expression, verbose:)
42
122
  parsed = PipelineParser.new(expression).parse
43
123
  stages = parsed[:stages]
44
124
  dump_stages(stages) if verbose
45
-
46
- blocks = stages.map { |stage|
125
+ stages.map { |stage|
47
126
  eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
48
127
  }
128
+ end
129
+
130
+ def apply_pipeline(blocks, input_enum)
49
131
  pipeline = Pipeline.new(*blocks)
132
+ Enumerator.new do |y|
133
+ pipeline.call(input_enum) { |value| y << value }
134
+ end
135
+ end
50
136
 
51
- input_enum = Enumerator.new { |y| each_input_value { |v| y << v } }
137
+ def each_input_enum
138
+ Enumerator.new { |y| each_input_value { |v| y << v } }
139
+ end
52
140
 
53
- if @output_format == :tsv
54
- values = []
55
- pipeline.call(input_enum) { |value| values << value }
56
- emit_tsv(values)
141
+ def process_values(blocks, parallel:, verbose:, &block)
142
+ if parallel <= 1 || @file_paths.length <= 1
143
+ # Single file or no parallelism requested — serial is the only option.
144
+ # This also covers the case where no file paths were given (input is a single stream): no files means no workers to spawn.
145
+ dump_parallel_status("disabled", verbose: verbose)
146
+ return apply_pipeline(blocks, each_input_enum).each(&block)
147
+ end
148
+
149
+ split_index, probe_stage = classify_parallel_stages(blocks)
150
+ if split_index.nil?
151
+ dump_parallel_status("disabled", verbose: verbose)
152
+ return apply_pipeline(blocks, each_input_enum).each(&block)
153
+ end
154
+
155
+ # If the first reducer stage is decomposable, workers run everything up to
156
+ # and including it (map prefix + reducer), emit partial accumulators, and the
157
+ # parent merges. This covers both pure reducers (split_index == 0, e.g. `sum(_)`)
158
+ # and map-then-reduce (split_index > 0, e.g. `select(...) >> sum(...)`).
159
+ if probe_stage&.decomposable?
160
+ worker_blocks = blocks[0..split_index]
161
+ rest_blocks = blocks[(split_index + 1)..]
162
+ return process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage,
163
+ parallel: parallel, verbose: verbose, &block)
164
+ end
165
+
166
+ if split_index == 0
167
+ dump_parallel_status("disabled", verbose: verbose)
168
+ return apply_pipeline(blocks, each_input_enum).each(&block)
169
+ end
170
+
171
+ map_blocks = blocks[0...split_index]
172
+ reduce_blocks = blocks[split_index..]
173
+ dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
174
+ input_enum = parallel_map_enum(map_blocks, parallel)
175
+ (reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)).each(&block)
176
+ end
177
+
178
+ def dump_parallel_status(status, verbose:)
179
+ @err.puts "parallel: #{status}" if verbose
180
+ end
181
+
182
+ # Returns [split_index, probe_stage] where split_index is the index of the
183
+ # first reducer stage (or blocks.length if all are passthrough), and probe_stage
184
+ # is the Stage object of that first reducer (nil if all passthrough). Both elements are nil when no first value could be read from the first file (or that value is JSON null).
185
+ def classify_parallel_stages(blocks)
186
+ # Read the first row from the first file to probe stage modes
187
+ first_value = nil
188
+ open_file(@file_paths.first) do |stream|
189
+ each_stream_value(stream) do |value|
190
+ first_value = value
191
+ break
192
+ end
193
+ end
194
+ return [nil, nil] if first_value.nil?
195
+
196
+ # Run the value through each stage independently to classify
197
+ split_index = nil
198
+ probe_stage = nil
199
+ blocks.each_with_index do |block, i|
200
+ probe_pipeline = Pipeline.new(block)
201
+ probe_pipeline.call([first_value]) { |_| }
202
+ stage = probe_pipeline.instance_variable_get(:@stages).first
203
+ if stage.instance_variable_get(:@mode) == :reducer
204
+ split_index = i
205
+ probe_stage = stage
206
+ break
207
+ end
208
+ end
209
+
210
+ [split_index || blocks.length, probe_stage]
211
+ end
212
+
213
+ def process_decomposable_parallel(worker_blocks, rest_blocks, probe_stage, parallel:, verbose:, &block)
214
+ dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} decompose=#{worker_blocks.length}/#{worker_blocks.length + rest_blocks.length}", verbose: verbose)
215
+
216
+ # Workers run map prefix + reducer stage per file and emit partial accumulators.
217
+ partials_list = []
218
+ reducer_stage_index = worker_blocks.length - 1
219
+ spawner = ->(path) do
220
+ spawn_worker(worker_blocks, path) do |pipeline, input|
221
+ pipeline.call(input) { |_| }
222
+ # If the file was empty, the stage was never initialized (no reducers),
223
+ # so skip emitting — the parent will simply not receive a partial for this worker.
224
+ stage = pipeline.instance_variable_get(:@stages)[reducer_stage_index]
225
+ partials = stage.partial_accumulators
226
+ emit_parallel_frame(partials) unless partials.empty?
227
+ end
228
+ end
229
+ children = run_parallel_worker_pool(parallel, spawner) { |v| partials_list << v }
230
+ wait_for_parallel_children(children) if children
231
+ return if partials_list.empty?
232
+
233
+ # Reuse the probe stage (already initialized with reducer structure from classify).
234
+ # Replace its accumulators with the first worker's partials, then merge the rest.
235
+ probe_stage.replace_accumulators!(partials_list.first)
236
+ partials_list.drop(1).each { |partials| probe_stage.merge_partials!(partials) }
237
+
238
+ # Finish the reducer stage and pass results through any remaining stages.
239
+ results = probe_stage.finish
240
+ if rest_blocks.empty?
241
+ results.each(&block)
57
242
  else
58
- pipeline.call(input_enum) { |value| emit_output(value) }
243
+ apply_pipeline(rest_blocks, results.each).each(&block)
59
244
  end
60
- ensure
61
- write_output(@output_buffer)
62
245
  end
63
246
 
64
- private
247
+ # Forks a worker process that reads `path`, builds a pipeline from `blocks`,
248
+ # and yields [pipeline, input_enum] to the caller's block for custom behavior.
249
+ # Returns [read_io, pid].
250
+ def spawn_worker(blocks, path)
251
+ read_io, write_io = IO.pipe
252
+ pid = fork do
253
+ read_io.close
254
+ @out = write_io
255
+ @output_buffer = +""
256
+ pipeline = Pipeline.new(*blocks)
257
+ input_enum = Enumerator.new do |y|
258
+ open_file(path) { |stream| each_stream_value(stream) { |v| y << v } }
259
+ end
260
+ worker_failed = false
261
+ begin
262
+ yield pipeline, input_enum
263
+ rescue => e
264
+ @err.puts "#{path}: #{e.message} (#{e.class})"
265
+ worker_failed = true
266
+ end
267
+ write_output(@output_buffer)
268
+ write_io.close
269
+ exit!(worker_failed ? 1 : 0)
270
+ end
271
+ write_io.close
272
+ [read_io, pid]
273
+ end
65
274
 
66
- def each_input_value
67
- return each_input_value_lax { |value| yield value } if @lax
275
+ # Runs a pool of up to `num_workers` concurrent workers across all input files.
276
+ # `spawner` is called with a file path and must return [read_io, pid].
277
+ # Yields each decoded JSON value from worker output frames. Returns the pids of all spawned workers.
278
+ def run_parallel_worker_pool(num_workers, spawner)
279
+ file_queue = @file_paths.dup
280
+ workers = {} # read_io => [reader, pid]
281
+ children = []
282
+
283
+ # Fill initial pool
284
+ while workers.size < num_workers && !file_queue.empty?
285
+ read_io, pid = spawner.call(file_queue.shift)
286
+ workers[read_io] = [ParallelFrameReader.new, pid]
287
+ children << pid
288
+ end
289
+
290
+ read_ios = workers.keys.dup
291
+
292
+ until read_ios.empty?
293
+ ready = IO.select(read_ios)
294
+ ready[0].each do |io|
295
+ reader = workers[io][0]
296
+ chunk = io.read_nonblock(65536, exception: false)
297
+ if chunk == :wait_readable
298
+ next
299
+ elsif chunk.nil?
300
+ raise IOError, "truncated parallel frame from worker" if reader.has_partial?
301
+ read_ios.delete(io)
302
+ io.close
303
+ workers.delete(io)
304
+
305
+ # Spawn next worker if files remain
306
+ unless file_queue.empty?
307
+ read_io, pid = spawner.call(file_queue.shift)
308
+ workers[read_io] = [ParallelFrameReader.new, pid]
309
+ children << pid
310
+ read_ios << read_io
311
+ end
312
+ else
313
+ reader.append(chunk)
314
+ reader.each_payload do |payload|
315
+ yield JSON.parse(payload)
316
+ end
317
+ end
318
+ end
319
+ end
320
+
321
+ children
322
+ end
323
+
324
+ def parallel_map_enum(map_blocks, num_workers)
325
+ children = nil
326
+ spawner = ->(path) do
327
+ spawn_worker(map_blocks, path) do |pipeline, input|
328
+ pipeline.call(input) { |value| emit_parallel_frame(value) }
329
+ end
330
+ end
331
+ Enumerator.new do |y|
332
+ children = run_parallel_worker_pool(num_workers, spawner) { |value| y << value }
333
+ ensure
334
+ wait_for_parallel_children(children) if children
335
+ end
336
+ end
337
+
338
+ def wait_for_parallel_children(children)
339
+ failed = false
340
+ children.each do |pid|
341
+ _, status = Process.waitpid2(pid)
342
+ failed = true unless status.success?
343
+ end
344
+ exit(1) if failed
345
+ end
68
346
 
69
- each_input_value_ndjson { |value| yield value }
347
+ def emit_parallel_frame(value)
348
+ payload = JSON.generate(value)
349
+ buffer_output([payload.bytesize].pack("N") << payload)
70
350
  end
71
351
 
72
- def each_input_value_ndjson
352
+ def each_input_value
73
353
  each_input do |source|
74
- source.each_line do |raw_line|
75
- line = raw_line.strip
76
- next if line.empty?
354
+ each_stream_value(source) { |value| yield value }
355
+ end
356
+ end
77
357
 
78
- yield JSON.parse(line)
79
- end
358
+ def each_stream_value(stream)
359
+ return each_stream_value_lax(stream) { |value| yield value } if @lax
360
+
361
+ stream.each_line do |line|
362
+ line.strip!
363
+ next if line.empty?
364
+ yield JSON.parse(line)
365
+ end
366
+ end
367
+
368
+ def open_file(path)
369
+ if path.end_with?(".gz")
370
+ Zlib::GzipReader.open(path) { |source| yield source }
371
+ else
372
+ File.open(path, "rb") { |source| yield source }
80
373
  end
81
374
  end
82
375
 
83
- def each_input_value_lax
376
+ def each_stream_value_lax(stream)
84
377
  require "oj"
85
- handler = Class.new(Oj::ScHandler) do
378
+ Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
379
+ rescue LoadError
380
+ raise "oj is required for --lax mode (gem install oj)"
381
+ rescue Oj::ParseError => e
382
+ raise JSON::ParserError, e.message
383
+ end
384
+
385
+ def streaming_json_handler_class
386
+ @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
86
387
  def initialize(&emit)
87
388
  @emit = emit
88
389
  end
@@ -94,13 +395,6 @@ module Jrf
94
395
  def array_append(array, value) = array << value
95
396
  def add_value(value) = @emit.call(value)
96
397
  end
97
- each_input do |source|
98
- Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
99
- end
100
- rescue LoadError
101
- raise "oj is required for --lax mode (gem install oj)"
102
- rescue Oj::ParseError => e
103
- raise JSON::ParserError, e.message
104
398
  end
105
399
 
106
400
  def dump_stages(stages)
@@ -109,8 +403,25 @@ module Jrf
109
403
  end
110
404
  end
111
405
 
112
- def each_input
113
- @inputs.each { |source| yield source }
406
+ def each_input(&block)
407
+ if @file_paths.empty?
408
+ with_error_handling("<stdin>") { block.call(@stdin) }
409
+ else
410
+ @file_paths.each do |path|
411
+ if path == "-"
412
+ with_error_handling("<stdin>") { block.call(@stdin) }
413
+ else
414
+ with_error_handling(path) { open_file(path, &block) }
415
+ end
416
+ end
417
+ end
418
+ end
419
+
420
+ def with_error_handling(name)
421
+ yield
422
+ rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => e
423
+ @err.puts "#{name}: #{e.message} (#{e.class})"
424
+ @input_errors = true
114
425
  end
115
426
 
116
427
  def emit_output(value)
@@ -171,7 +482,13 @@ module Jrf
171
482
  end
172
483
 
173
484
  def write_output(str)
174
- @out.syswrite(str)
485
+ return if str.empty?
486
+
487
+ total = 0
488
+ while total < str.bytesize
489
+ written = @out.syswrite(str.byteslice(total..))
490
+ total += written
491
+ end
175
492
  end
176
493
  end
177
494
  end
data/lib/jrf/cli.rb CHANGED
@@ -18,6 +18,7 @@ module Jrf
18
18
  --lax allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
19
19
  -o, --output FORMAT
20
20
  output format: json (default), pretty, tsv
21
+ -P N opportunistically parallelize across N workers
21
22
  -r, --require LIBRARY
22
23
  require LIBRARY before evaluating stages
23
24
  --no-jit do not enable YJIT, even when supported by the Ruby runtime
@@ -45,6 +46,7 @@ module Jrf
45
46
  verbose = false
46
47
  lax = false
47
48
  output_format = :json
49
+ parallel = 1
48
50
  jit = true
49
51
  required_libraries = []
50
52
  atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
@@ -54,6 +56,7 @@ module Jrf
54
56
  opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
55
57
  opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
56
58
  opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
59
+ opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
57
60
  opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
58
61
  opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
59
62
  opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
@@ -89,34 +92,20 @@ module Jrf
89
92
  enable_yjit if jit
90
93
  required_libraries.each { |library| require library }
91
94
 
92
- inputs = Enumerator.new do |y|
93
- if argv.empty?
94
- y << input
95
- else
96
- argv.each do |path|
97
- if path == "-"
98
- y << input
99
- elsif path.end_with?(".gz")
100
- require "zlib"
101
- Zlib::GzipReader.open(path) do |source|
102
- y << source
103
- end
104
- else
105
- File.open(path, "rb") do |source|
106
- y << source
107
- end
108
- end
109
- end
110
- end
111
- end
112
- Runner.new(
113
- inputs: inputs,
95
+ file_paths = argv.dup
96
+
97
+ runner = Runner.new(
98
+ input: file_paths.empty? ? input : file_paths,
114
99
  out: out,
115
100
  err: err,
116
101
  lax: lax,
117
102
  output_format: output_format,
118
103
  atomic_write_bytes: atomic_write_bytes
119
- ).run(expression, verbose: verbose)
104
+ )
105
+
106
+ runner.run(expression, parallel: parallel, verbose: verbose)
107
+
108
+ exit 1 if runner.input_errors?
120
109
  end
121
110
 
122
111
  def self.enable_yjit
data/lib/jrf/reducers.rb CHANGED
@@ -20,8 +20,37 @@ module Jrf
20
20
  end
21
21
  end
22
22
 
23
+ # A reducer whose partial accumulators can be merged across parallel workers.
24
+ #
25
+ # Contract:
26
+ # - `identity` is the neutral element for `merge_fn`: merge(identity, x) == x
27
+ # - `initial` is always set to `identity` (the accumulator starts from the neutral element)
28
+ # - Any bias (e.g. sum's `initial:` keyword) is applied in `finish_fn`, not in the starting accumulator
29
+ class DecomposableReduce < Reduce
30
+ attr_reader :merge_fn
31
+
32
+ def initialize(identity, merge:, finish_fn: nil, &step_fn)
33
+ super(identity, finish_fn: finish_fn, &step_fn)
34
+ @merge_fn = merge
35
+ end
36
+
37
+ # Returns the raw accumulator without applying finish_fn.
38
+ def partial
39
+ @acc
40
+ end
41
+
42
+ # Merges another partial accumulator into this one.
43
+ def merge_partial(other_acc)
44
+ @acc = @merge_fn.call(@acc, other_acc)
45
+ end
46
+ end
47
+
23
48
  def reduce(initial, finish: nil, &step_fn)
24
49
  Reduce.new(initial, finish_fn: finish, &step_fn)
25
50
  end
51
+
52
+ def decomposable_reduce(identity, merge:, finish: nil, &step_fn)
53
+ DecomposableReduce.new(identity, merge: merge, finish_fn: finish, &step_fn)
54
+ end
26
55
  end
27
56
  end
@@ -17,6 +17,7 @@ module Jrf
17
17
  spec.fetch(:value),
18
18
  initial: reducer_initial_value(spec.fetch(:initial)),
19
19
  finish: spec[:finish],
20
+ merge: spec[:merge],
20
21
  &spec.fetch(:step)
21
22
  )
22
23
  end
@@ -48,27 +49,38 @@ module Jrf
48
49
  end
49
50
 
50
51
  define_reducer(:sum) do |_ctx, value, initial: 0, block: nil|
51
- { value: value, initial: initial, step: ->(acc, v) { v.nil? ? acc : (acc + v) } }
52
+ step = ->(acc, v) { v.nil? ? acc : (acc + v) }
53
+ if initial.is_a?(Numeric)
54
+ # Numeric — decomposable. Bias applied once in finish.
55
+ finish = initial == 0 ? nil : ->(acc) { [acc + initial] }
56
+ { value: value, initial: 0, step: step, finish: finish, merge: ->(a, b) { a + b } }
57
+ else
58
+ # Non-numeric (e.g. string concat) — not decomposable.
59
+ { value: value, initial: initial, step: step }
60
+ end
52
61
  end
53
62
 
54
63
  define_reducer(:count) do |_ctx, value = MISSING, block: nil|
64
+ merge = ->(a, b) { a + b }
55
65
  if value.equal?(MISSING)
56
- { value: nil, initial: 0, step: ->(acc, _v) { acc + 1 } }
66
+ { value: nil, initial: 0, step: ->(acc, _v) { acc + 1 }, merge: merge }
57
67
  else
58
- { value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) } }
68
+ { value: value, initial: 0, step: ->(acc, v) { v.nil? ? acc : (acc + 1) }, merge: merge }
59
69
  end
60
70
  end
61
71
 
62
72
  define_reducer(:count_if) do |_ctx, condition, block: nil|
63
- { value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc } }
73
+ { value: condition, initial: 0, step: ->(acc, v) { v ? (acc + 1) : acc }, merge: ->(a, b) { a + b } }
64
74
  end
65
75
 
66
76
  define_reducer(:min) do |_ctx, value, block: nil|
67
- { value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v < acc ? v : acc) } }
77
+ min_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a < b ? a : b) }
78
+ { value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v < acc ? v : acc) }, merge: min_merge }
68
79
  end
69
80
 
70
81
  define_reducer(:max) do |_ctx, value, block: nil|
71
- { value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v > acc ? v : acc) } }
82
+ max_merge = ->(a, b) { a.nil? ? b : b.nil? ? a : (a > b ? a : b) }
83
+ { value: value, initial: nil, step: ->(acc, v) { v.nil? ? acc : (acc.nil? || v > acc ? v : acc) }, merge: max_merge }
72
84
  end
73
85
 
74
86
  define_reducer(:average) do |_ctx, value, block: nil|
@@ -82,7 +94,8 @@ module Jrf
82
94
  acc[0] += v
83
95
  acc[1] += 1
84
96
  acc
85
- }
97
+ },
98
+ merge: ->(a, b) { [a[0] + b[0], a[1] + b[1]] }
86
99
  }
87
100
  end
88
101
 
@@ -136,7 +149,7 @@ module Jrf
136
149
 
137
150
  define_reducer(:group) do |ctx, value = MISSING, block: nil|
138
151
  resolved_value = value.equal?(MISSING) ? ctx.send(:current_input) : value
139
- { value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v } }
152
+ { value: resolved_value, initial: -> { [] }, step: ->(acc, v) { acc << v }, merge: ->(a, b) { a + b } }
140
153
  end
141
154
 
142
155
  define_reducer(:percentile) do |ctx, value, percentage, block: nil|
data/lib/jrf/stage.rb CHANGED
@@ -51,13 +51,17 @@ module Jrf
51
51
  (@mode == :reducer) ? Control::DROPPED : result
52
52
  end
53
53
 
54
- def step_reduce(value, initial:, finish: nil, step_fn: nil, &step_block)
54
+ def step_reduce(value, initial:, finish: nil, merge: nil, step_fn: nil, &step_block)
55
55
  idx = @cursor
56
56
  step_fn ||= step_block
57
57
 
58
58
  if @reducers[idx].nil?
59
59
  finish_rows = finish || ->(acc) { [acc] }
60
- @reducers[idx] = Reducers.reduce(initial, finish: finish_rows, &step_fn)
60
+ @reducers[idx] = if merge
61
+ Reducers.decomposable_reduce(initial, merge: merge, finish: finish_rows, &step_fn)
62
+ else
63
+ Reducers.reduce(initial, finish: finish_rows, &step_fn)
64
+ end
61
65
  result = ReducerToken.new(idx)
62
66
  else
63
67
  result = Control::DROPPED
@@ -167,6 +171,32 @@ module Jrf
167
171
  end
168
172
  end
169
173
 
174
+ # Returns true if this stage is in reducer mode, has at least one reducer, and all of its reducers are DecomposableReduce instances,
175
+ # meaning partial accumulators from parallel workers can be merged.
176
+ def decomposable?
177
+ @mode == :reducer && @reducers.any? &&
178
+ @reducers.all? { |r| r.is_a?(Reducers::DecomposableReduce) }
179
+ end
180
+
181
+ # Returns an array of raw accumulator values, one per reducer.
182
+ def partial_accumulators
183
+ @reducers.map(&:partial)
184
+ end
185
+
186
+ # Replaces all reducer accumulators with the given values.
187
+ def replace_accumulators!(partials)
188
+ @reducers.each_with_index do |reducer, i|
189
+ reducer.instance_variable_set(:@acc, partials[i])
190
+ end
191
+ end
192
+
193
+ # Merges an array of partial accumulators (from another worker) into this stage's reducers.
194
+ def merge_partials!(other_partials)
195
+ @reducers.each_with_index do |reducer, i|
196
+ reducer.merge_partial(other_partials[i])
197
+ end
198
+ end
199
+
170
200
  private
171
201
 
172
202
  def with_scoped_reducers(reducer_list)
data/lib/jrf/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Jrf
4
- VERSION = "0.1.13"
4
+ VERSION = "0.1.15"
5
5
  end
@@ -0,0 +1,394 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "test_helper"
4
+
5
+ class CliParallelTest < JrfTestCase
6
+ def test_parallel_map_only
7
+ Dir.mktmpdir do |dir|
8
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
9
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
10
+
11
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '_["x"]', *ndjson_files(dir))
12
+ assert_success(status, stderr, "parallel map only")
13
+ assert_equal([1, 2, 3, 4], lines(stdout).map(&:to_i).sort, "parallel map only output")
14
+ assert_includes(stderr, "parallel: enabled workers=2 files=2 split=1/1", "parallel verbose summary")
15
+ end
16
+ end
17
+
18
+ def test_parallel_map_only_pretty_output
19
+ Dir.mktmpdir do |dir|
20
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}])
21
+ write_ndjson(dir, "b.ndjson", [{"x" => 2}])
22
+
23
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "pretty", '_["x"]', *ndjson_files(dir))
24
+ assert_success(status, stderr, "parallel pretty map only")
25
+ assert_equal(["1", "2"], stdout.lines.map(&:strip).reject(&:empty?).sort, "parallel pretty map only output")
26
+ end
27
+ end
28
+
29
+ def test_parallel_map_only_tsv_output
30
+ Dir.mktmpdir do |dir|
31
+ write_ndjson(dir, "a.ndjson", [{"a" => 1, "b" => 2}])
32
+ write_ndjson(dir, "b.ndjson", [{"a" => 3, "b" => 4}])
33
+
34
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "tsv", "_", *ndjson_files(dir))
35
+ assert_success(status, stderr, "parallel tsv map only")
36
+ assert_equal(["a\t1", "a\t3", "b\t2", "b\t4"], lines(stdout).sort, "parallel tsv map only output")
37
+ end
38
+ end
39
+
40
+ def test_parallel_map_reduce
41
+ Dir.mktmpdir do |dir|
42
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
43
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
44
+
45
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
46
+ assert_success(status, stderr, "parallel map reduce")
47
+ assert_equal(%w[10], lines(stdout), "parallel sum output")
48
+ end
49
+ end
50
+
51
+ def test_parallel_split_map_and_reduce
52
+ Dir.mktmpdir do |dir|
53
+ write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
54
+ write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
55
+
56
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
57
+ assert_success(status, stderr, "parallel split map+reduce")
58
+ assert_includes(stderr, "decompose=2/2", "select+sum decomposed")
59
+ assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
60
+ end
61
+ end
62
+
63
+ def test_parallel_group_by
64
+ Dir.mktmpdir do |dir|
65
+ write_ndjson(dir, "a.ndjson", [{"k" => "a", "v" => 1}, {"k" => "b", "v" => 2}])
66
+ write_ndjson(dir, "b.ndjson", [{"k" => "a", "v" => 3}, {"k" => "b", "v" => 4}])
67
+
68
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'group_by(_["k"]) { |r| sum(r["v"]) }', *ndjson_files(dir))
69
+ assert_success(status, stderr, "parallel group_by")
70
+ result = JSON.parse(lines(stdout).first)
71
+ assert_equal(4, result["a"], "parallel group_by a")
72
+ assert_equal(6, result["b"], "parallel group_by b")
73
+ end
74
+ end
75
+
76
+ def test_parallel_decomposable_reducer
77
+ Dir.mktmpdir do |dir|
78
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
79
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}])
80
+
81
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
82
+ assert_success(status, stderr, "parallel decomposable reducer")
83
+ assert_equal(%w[6], lines(stdout), "parallel decomposable reducer output")
84
+ assert_includes(stderr, "parallel: enabled", "parallel enabled for decomposable reducer")
85
+ assert_includes(stderr, "decompose=", "decompose mode indicated")
86
+ end
87
+ end
88
+
89
+ def test_parallel_with_gz_files
90
+ Dir.mktmpdir do |dir|
91
+ gz_path_a = File.join(dir, "a.ndjson.gz")
92
+ Zlib::GzipWriter.open(gz_path_a) { |io| io.write("{\"x\":10}\n{\"x\":20}\n") }
93
+ gz_path_b = File.join(dir, "b.ndjson.gz")
94
+ Zlib::GzipWriter.open(gz_path_b) { |io| io.write("{\"x\":30}\n") }
95
+
96
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', gz_path_a, gz_path_b)
97
+ assert_success(status, stderr, "parallel with gz")
98
+ assert_equal(%w[60], lines(stdout), "parallel with gz output")
99
+ end
100
+ end
101
+
102
+ def test_parallel_matches_serial_output
103
+ Dir.mktmpdir do |dir|
104
+ write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i, "g" => i % 3} })
105
+ write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i, "g" => i % 3} })
106
+
107
+ files = ndjson_files(dir)
108
+ expr = 'group_by(_["g"]) { |r| sum(r["v"]) }'
109
+
110
+ serial_stdout, serial_stderr, serial_status = Open3.capture3("./exe/jrf", expr, *files)
111
+ assert_success(serial_status, serial_stderr, "serial baseline")
112
+
113
+ parallel_stdout, parallel_stderr, parallel_status = Open3.capture3("./exe/jrf", "-P", "2", expr, *files)
114
+ assert_success(parallel_status, parallel_stderr, "parallel run")
115
+
116
+ assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel matches serial")
117
+ end
118
+ end
119
+
120
+ def test_parallel_worker_error_handling
121
+ Dir.mktmpdir do |dir|
122
+ good_path = File.join(dir, "a.ndjson")
123
+ File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")
124
+
125
+ # Create a truncated gz file (valid header, truncated body)
126
+ bad_gz_path = File.join(dir, "b.ndjson.gz")
127
+ full_gz = StringIO.new
128
+ Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
129
+ # Write only the first half to simulate truncation
130
+ File.binwrite(bad_gz_path, full_gz.string[0, full_gz.string.bytesize / 2])
131
+
132
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", '_["x"]', good_path, bad_gz_path)
133
+ assert_failure(status, "worker error causes non-zero exit")
134
+ assert_includes(stderr, bad_gz_path, "error message includes filename")
135
+ # Good file data should still be present
136
+ output_values = lines(stdout).map(&:to_i)
137
+ assert_includes(output_values, 1, "good file data preserved")
138
+ assert_includes(output_values, 2, "good file data preserved")
139
+ end
140
+ end
141
+
142
+ def test_parallel_requires_multiple_files
143
+ # With single file and -P, should still work (falls back to serial)
144
+ Dir.mktmpdir do |dir|
145
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
146
+
147
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
148
+ assert_success(status, stderr, "single file with -P")
149
+ assert_equal(%w[3], lines(stdout), "single file with -P output")
150
+ end
151
+ end
152
+
153
+ def test_parallel_select_then_sum
154
+ Dir.mktmpdir do |dir|
155
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
156
+ write_ndjson(dir, "b.ndjson", [{"x" => 40}, {"x" => 5}])
157
+
158
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
159
+ assert_success(status, stderr, "parallel select then sum")
160
+ assert_includes(stderr, "decompose=2/2", "select+sum fully decomposed in workers")
161
+ assert_equal(%w[60], lines(stdout), "parallel select then sum output")
162
+ end
163
+ end
164
+
165
+ def test_parallel_decomposable_multi_reducer
166
+ Dir.mktmpdir do |dir|
167
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
168
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
169
+
170
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"]), mx: max(_["x"])}', *ndjson_files(dir))
171
+ assert_success(status, stderr, "parallel multi reducer")
172
+ assert_includes(stderr, "decompose=", "multi reducer decomposed")
173
+ result = JSON.parse(lines(stdout).first)
174
+ assert_equal(10, result["s"], "sum")
175
+ assert_equal(4, result["n"], "count")
176
+ assert_equal(1, result["mn"], "min")
177
+ assert_equal(4, result["mx"], "max")
178
+ end
179
+ end
180
+
181
+ def test_parallel_decomposable_average
182
+ Dir.mktmpdir do |dir|
183
+ write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
184
+ write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
185
+
186
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'average(_["x"])', *ndjson_files(dir))
187
+ assert_success(status, stderr, "parallel average")
188
+ assert_includes(stderr, "decompose=", "average decomposed")
189
+ assert_equal(["25.0"], lines(stdout), "parallel average output")
190
+ end
191
+ end
192
+
193
+ def test_parallel_decomposable_group
194
+ Dir.mktmpdir do |dir|
195
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
196
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}])
197
+
198
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'group(_["x"])', *ndjson_files(dir))
199
+ assert_success(status, stderr, "parallel group")
200
+ assert_includes(stderr, "decompose=", "group decomposed")
201
+ result = JSON.parse(lines(stdout).first)
202
+ assert_equal([1, 2, 3], result.sort, "parallel group output")
203
+ end
204
+ end
205
+
206
+ def test_parallel_decomposable_sum_with_initial
207
+ Dir.mktmpdir do |dir|
208
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
209
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}])
210
+
211
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: 100)', *ndjson_files(dir))
212
+ assert_success(status, stderr, "sum with numeric initial")
213
+ assert_includes(stderr, "decompose=", "numeric initial decomposes")
214
+ assert_equal(%w[106], lines(stdout), "sum with initial output")
215
+ end
216
+ end
217
+
218
+ def test_parallel_sum_with_non_numeric_initial_falls_back
219
+ Dir.mktmpdir do |dir|
220
+ write_ndjson(dir, "a.ndjson", [{"x" => "a"}, {"x" => "b"}])
221
+ write_ndjson(dir, "b.ndjson", [{"x" => "c"}])
222
+
223
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"], initial: "")', *ndjson_files(dir))
224
+ assert_success(status, stderr, "sum with string initial")
225
+ assert_includes(stderr, "parallel: disabled", "non-numeric initial falls back to serial")
226
+ assert_equal(['"abc"'], lines(stdout), "sum with string initial output")
227
+ end
228
+ end
229
+
230
+ def test_sum_with_string_initial
231
+ Dir.mktmpdir do |dir|
232
+ write_ndjson(dir, "a.ndjson", [{"x" => "hello "}, {"x" => "world"}])
233
+
234
+ stdout, stderr, status = Open3.capture3("./exe/jrf", 'sum(_["x"], initial: "")', *ndjson_files(dir))
235
+ assert_success(status, stderr, "sum with string initial")
236
+ assert_equal(['"hello world"'], lines(stdout), "sum with string initial output")
237
+ end
238
+ end
239
+
240
+ def test_parallel_decomposable_reducer_then_passthrough
241
+ Dir.mktmpdir do |dir|
242
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
243
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
244
+
245
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"]) >> _ * 2', *ndjson_files(dir))
246
+ assert_success(status, stderr, "parallel decomposable then passthrough")
247
+ assert_includes(stderr, "decompose=", "reducer then passthrough decomposed")
248
+ assert_equal(%w[20], lines(stdout), "parallel decomposable then passthrough output")
249
+ end
250
+ end
251
+
252
+ def test_parallel_mixed_decomposable_reducers
253
+ Dir.mktmpdir do |dir|
254
+ write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
255
+ write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
256
+
257
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), average(_["x"]), min(_["x"]), max(_["x"]), count()]', *ndjson_files(dir))
258
+ assert_success(status, stderr, "mixed decomposable")
259
+ assert_includes(stderr, "decompose=", "mixed decomposable used decompose")
260
+ result = JSON.parse(lines(stdout).first)
261
+ assert_equal([100, 25.0, 10, 40, 4], result, "mixed decomposable output")
262
+ end
263
+ end
264
+
265
+ def test_parallel_mixed_decomposable_and_non_decomposable_falls_back
266
+ Dir.mktmpdir do |dir|
267
+ write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
268
+ write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
269
+
270
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '[sum(_["x"]), percentile(_["x"], 0.5)]', *ndjson_files(dir))
271
+ assert_success(status, stderr, "mixed with non-decomposable")
272
+ assert_includes(stderr, "parallel: disabled", "mixed with non-decomposable falls back to serial")
273
+ result = JSON.parse(lines(stdout).first)
274
+ assert_equal([100, 20], result, "mixed with non-decomposable output")
275
+ end
276
+ end
277
+
278
+ def test_parallel_select_sum_passthrough_decomposes
279
+ Dir.mktmpdir do |dir|
280
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}])
281
+ write_ndjson(dir, "b.ndjson", [{"x" => 40}])
282
+
283
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"]) >> _ * 2', *ndjson_files(dir))
284
+ assert_success(status, stderr, "select+sum+passthrough")
285
+ assert_includes(stderr, "decompose=2/3", "select+sum decomposed, passthrough in parent")
286
+ assert_equal(%w[120], lines(stdout), "select+sum+passthrough output")
287
+ end
288
+ end
289
+
290
+ def test_parallel_select_non_decomposable_uses_split
291
+ Dir.mktmpdir do |dir|
292
+ write_ndjson(dir, "a.ndjson", [{"x" => 3}, {"x" => 1}])
293
+ write_ndjson(dir, "b.ndjson", [{"x" => 2}])
294
+
295
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'select(_["x"] > 0) >> sort(_["x"]) >> _["x"]', *ndjson_files(dir))
296
+ assert_success(status, stderr, "select+sort uses split")
297
+ assert_includes(stderr, "split=1/3", "non-decomposable sort uses map-prefix split")
298
+ assert_equal([1, 2, 3], lines(stdout).map { |l| JSON.parse(l) }, "select+sort output")
299
+ end
300
+ end
301
+
302
+ def test_parallel_decomposable_with_empty_file
303
+ Dir.mktmpdir do |dir|
304
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
305
+ File.write(File.join(dir, "b.ndjson"), "")
306
+
307
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '{s: sum(_["x"]), n: count(), mn: min(_["x"])}', *ndjson_files(dir))
308
+ assert_success(status, stderr, "decomposable with empty file")
309
+ assert_includes(stderr, "decompose=", "decomposable with empty file used decompose")
310
+ result = JSON.parse(lines(stdout).first)
311
+ assert_equal(3, result["s"], "sum ignores empty file")
312
+ assert_equal(2, result["n"], "count ignores empty file")
313
+ assert_equal(1, result["mn"], "min ignores empty file")
314
+ end
315
+ end
316
+
317
+ def test_parallel_decomposable_all_files_empty
318
+ Dir.mktmpdir do |dir|
319
+ File.write(File.join(dir, "a.ndjson"), "")
320
+ File.write(File.join(dir, "b.ndjson"), "")
321
+
322
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
323
+ assert_success(status, stderr, "all files empty")
324
+ # All files empty means first_value is nil, so classify returns nil → serial fallback
325
+ assert_includes(stderr, "parallel: disabled", "all files empty falls back to serial")
326
+ assert_equal([], lines(stdout), "no output for empty input")
327
+ end
328
+ end
329
+
330
+ def test_parallel_non_decomposable_falls_back_to_serial
331
+ Dir.mktmpdir do |dir|
332
+ write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
333
+ write_ndjson(dir, "b.ndjson", [{"x" => 3}])
334
+
335
+ stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sort(_["x"]) >> _["x"]', *ndjson_files(dir))
336
+ assert_success(status, stderr, "non-decomposable serial fallback")
337
+ assert_equal([1, 2, 3], lines(stdout).map { |l| JSON.parse(l) }, "sort output")
338
+ assert_includes(stderr, "parallel: disabled", "non-decomposable falls back to serial")
339
+ end
340
+ end
341
+
342
+ def test_parallel_decomposable_matches_serial
343
+ Dir.mktmpdir do |dir|
344
+ write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i} })
345
+ write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i} })
346
+
347
+ files = ndjson_files(dir)
348
+ expr = '{s: sum(_["v"]), n: count(), mn: min(_["v"]), mx: max(_["v"]), avg: average(_["v"])}'
349
+
350
+ serial_stdout, serial_stderr, serial_status = Open3.capture3("./exe/jrf", expr, *files)
351
+ assert_success(serial_status, serial_stderr, "serial baseline")
352
+
353
+ parallel_stdout, parallel_stderr, parallel_status = Open3.capture3("./exe/jrf", "-v", "-P", "2", expr, *files)
354
+ assert_success(parallel_status, parallel_stderr, "parallel run")
355
+ assert_includes(parallel_stderr, "decompose=", "decomposable matches serial used decompose")
356
+
357
+ assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel decomposable matches serial")
358
+ end
359
+ end
360
+
361
+ def test_serial_error_includes_filename
362
+ Dir.mktmpdir do |dir|
363
+ good_path = File.join(dir, "a.ndjson")
364
+ File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")
365
+
366
+ bad_gz_path = File.join(dir, "b.ndjson.gz")
367
+ full_gz = StringIO.new
368
+ Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
369
+ File.binwrite(bad_gz_path, full_gz.string[0, full_gz.string.bytesize / 2])
370
+
371
+ good_path2 = File.join(dir, "c.ndjson")
372
+ File.write(good_path2, "{\"x\":3}\n")
373
+
374
+ stdout, stderr, status = Open3.capture3("./exe/jrf", '_["x"]', good_path, bad_gz_path, good_path2)
375
+ assert_failure(status, "serial error causes non-zero exit")
376
+ assert_includes(stderr, bad_gz_path, "serial error message includes filename")
377
+ refute_includes(stderr, "from ", "serial error does not include stacktrace")
378
+ # Data from good files should still be present
379
+ output_values = lines(stdout).map(&:to_i)
380
+ assert_includes(output_values, 1, "data before bad file preserved")
381
+ assert_includes(output_values, 3, "data after bad file preserved")
382
+ end
383
+ end
384
+
385
+ private
386
+
387
+ def write_ndjson(dir, name, rows)
388
+ File.write(File.join(dir, name), rows.map { |r| JSON.generate(r) + "\n" }.join)
389
+ end
390
+
391
+ def ndjson_files(dir)
392
+ Dir.glob(File.join(dir, "*.ndjson")).sort
393
+ end
394
+ end
@@ -106,24 +106,23 @@ class CliRunnerTest < JrfTestCase
106
106
 
107
107
  def test_runner_buffering_and_require_option
108
108
  threshold_input = StringIO.new((1..4).map { |i| "{\"foo\":\"#{'x' * 1020}\",\"i\":#{i}}\n" }.join)
109
- buffered_runner = RecordingRunner.new(inputs: [threshold_input], out: StringIO.new, err: StringIO.new)
109
+ buffered_runner = RecordingRunner.new(input: threshold_input, out: StringIO.new, err: StringIO.new)
110
110
  buffered_runner.run('_')
111
111
  expected_line = JSON.generate({"foo" => "x" * 1020, "i" => 1}) + "\n"
112
112
  assert_equal(2, buffered_runner.writes.length, "default atomic write limit buffers records until the configured threshold")
113
113
  assert_equal(expected_line.bytesize * 3, buffered_runner.writes.first.bytesize, "default atomic write limit flushes before the next record would exceed the threshold")
114
114
  assert_equal(expected_line.bytesize, buffered_runner.writes.last.bytesize, "final buffer flush emits the remaining record")
115
115
 
116
- small_limit_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":2}\n")], out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
116
+ small_limit_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":2}\n"), out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
117
117
  small_limit_runner.run('_["foo"]')
118
118
  assert_equal(["1\n", "2\n"], small_limit_runner.writes, "small atomic write limit emits oversized records directly")
119
119
 
120
- error_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":")], out: StringIO.new, err: StringIO.new)
121
- begin
122
- error_runner.run('_["foo"]')
123
- flunk("expected parse error for buffered flush test")
124
- rescue JSON::ParserError
125
- assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors escape")
126
- end
120
+ err_io = StringIO.new
121
+ error_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":"), out: StringIO.new, err: err_io)
122
+ error_runner.run('_["foo"]')
123
+ assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors")
124
+ assert_includes(err_io.string, "JSON::ParserError", "parse error reported to stderr")
125
+ assert(error_runner.input_errors?, "input_errors? is true after parse error")
127
126
 
128
127
  input_hello = <<~NDJSON
129
128
  {"hello":123}
@@ -648,7 +647,7 @@ class CliRunnerTest < JrfTestCase
648
647
  assert_equal(%w[9], lines(stdout), "lax trailing separator output")
649
648
 
650
649
  chunked_lax_out = RecordingRunner.new(
651
- inputs: [ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n")],
650
+ input: ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n"),
652
651
  out: StringIO.new,
653
652
  err: StringIO.new,
654
653
  lax: true
@@ -691,6 +690,7 @@ class CliRunnerTest < JrfTestCase
691
690
  assert_failure(status, "broken input should fail")
692
691
  assert_equal(%w[3], lines(stdout), "reducers flush before parse error")
693
692
  assert_includes(stderr, "JSON::ParserError")
693
+ refute_includes(stderr, "from ", "no stacktrace for parse errors")
694
694
  end
695
695
 
696
696
  def test_map
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jrf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - kazuho
@@ -75,6 +75,7 @@ files:
75
75
  - lib/jrf/row_context.rb
76
76
  - lib/jrf/stage.rb
77
77
  - lib/jrf/version.rb
78
+ - test/cli_parallel_test.rb
78
79
  - test/cli_runner_test.rb
79
80
  - test/library_api_test.rb
80
81
  - test/readme_examples_test.rb