jrf 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2862eaf6bd5f2486ea2c6aebf5caa4fbc2de56f419625bf8bb462392a3ea5dd9
4
- data.tar.gz: 3f29e7024f4e33606d78ad01ce4c45f37c9cd652ba94ac490866cd877368037a
3
+ metadata.gz: 78c1f6eb54e20d4dffbfe57f89a49d9e8ec9bbb2a9e118d911f2dec3c649f4ac
4
+ data.tar.gz: 63f43701422cfe200b7932a2177132f5e4e74e690960e71b88d6cc7b767e0b3c
5
5
  SHA512:
6
- metadata.gz: 04f55e0ea8c24f70126964beffbe80bee1800e1e210da2f96186bb8ebdf5542e5dfbab9c06b48624da4ec35912d02456561ec6c0d2c66c094de001ecf7f4096f
7
- data.tar.gz: '093821f35539be4561867b711664a31d3441052fe53e1c0f73489cd8b11fdf845bfb5573375f880ce07cd73ffc2f1d0514b55b760227c01ab515afd39d8ac08a'
6
+ metadata.gz: 152ebdc2322f9a8b6c0cad2cb303a093a45d5e0ecc17b519904e40e069a747b56e33f1ddd33f7f3efb32031d78808d05e32d93ab151572b973a1324f9e676e0b
7
+ data.tar.gz: 63c189a79b484777c25f5c1a7951d930fc2d110f3547216b2fd099469e57e7a062c0ec64ba2c7b0c3d7e88a6fb5f1f40d3b5ba6d1a0803acfc5253b00f43dfe8
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "json"
4
+ require "zlib"
4
5
  require_relative "../pipeline"
5
6
  require_relative "../pipeline_parser"
6
7
 
@@ -9,6 +10,7 @@ module Jrf
9
10
  class Runner
10
11
  RS_CHAR = "\x1e"
11
12
  DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
13
+ PARALLEL_FRAME_HEADER_BYTES = 4
12
14
 
13
15
  class RsNormalizer
14
16
  def initialize(input)
@@ -28,34 +30,87 @@ module Jrf
28
30
  end
29
31
  end
30
32
 
31
- def initialize(inputs:, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
32
- @inputs = inputs
33
# Incrementally decodes length-prefixed frames from a stream of byte chunks.
#
# A frame is a PARALLEL_FRAME_HEADER_BYTES-byte big-endian unsigned length
# (pack format "N") followed by exactly that many payload bytes. Chunks may
# split a frame at any byte position; unread bytes are kept until the rest of
# the frame has been appended.
class ParallelFrameReader
  def initialize
    # Keep the buffer binary: chunks are raw bytes and may end in the middle of
    # a multibyte character, so the buffer must never be treated as text.
    @buf = String.new(encoding: Encoding::BINARY)
    @offset = 0
  end

  # Adds raw bytes to the end of the buffer.
  def append(chunk)
    # Re-tag text-encoded chunks as bytes so appends can never raise
    # Encoding::CompatibilityError, whatever mix of chunks arrives.
    @buf << (chunk.encoding == Encoding::BINARY ? chunk : chunk.b)
  end

  # Yields every complete payload currently buffered, in arrival order.
  # Returns an Enumerator when called without a block.
  def each_payload
    return enum_for(:each_payload) unless block_given?

    while (payload = next_payload)
      yield payload
    end
  end

  # True when buffered bytes remain that have not been returned as a payload.
  def has_partial?
    @offset != @buf.bytesize
  end

  private

  # Returns the next complete payload, or nil when the buffer holds less than
  # one whole frame (in which case consumed bytes are dropped first).
  def next_payload
    available = @buf.bytesize - @offset
    return compact! if available < PARALLEL_FRAME_HEADER_BYTES

    payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
    frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
    return compact! if available < frame_len

    payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
    @offset += frame_len
    # Always hand payloads out with the same encoding tag, independent of which
    # chunks happened to contain non-ASCII bytes.
    payload.force_encoding(Encoding::UTF_8)
  end

  # Drops the bytes already consumed so the buffer does not grow without bound.
  # Always returns nil so callers can use it as their "no frame yet" result.
  def compact!
    if @offset > 0
      @buf = @buf.byteslice(@offset..) || String.new(encoding: Encoding::BINARY)
      @offset = 0
    end
    nil
  end
end
80
+
81
# Builds a runner over either a list of file paths or a single input stream.
#
# input:  an Array is kept as the list of paths to read and no stream is
#         stored; any other object is kept as the one stream to read and the
#         path list is left empty.
# out, err, lax, output_format, atomic_write_bytes: stored unchanged for use
#         by the other methods of this class.
def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
  @file_paths, @stdin = input.is_a?(Array) ? [input, nil] : [[], input]
  @out = out
  @err = err
  @lax = lax
  @output_format = output_format
  @atomic_write_bytes = atomic_write_bytes
  @output_buffer = +""
  # Set to true by the input error handler once any input fails.
  @input_errors = false
end
40
97
 
41
- def run(expression, verbose: false)
42
- parsed = PipelineParser.new(expression).parse
43
- stages = parsed[:stages]
44
- dump_stages(stages) if verbose
45
-
46
- blocks = stages.map { |stage|
47
- eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
48
- }
49
- pipeline = Pipeline.new(*blocks)
50
-
51
- input_enum = Enumerator.new { |y| each_input_value { |v| y << v } }
98
# Reports the value of the input-error flag (false until an input fails).
def input_errors? = @input_errors
52
101
 
102
+ def run(expression, parallel: 1, verbose: false)
103
+ blocks = build_stage_blocks(expression, verbose: verbose)
53
104
  if @output_format == :tsv
54
105
  values = []
55
- pipeline.call(input_enum) { |value| values << value }
106
+ process_values(blocks, parallel: parallel, verbose: verbose) do |value|
107
+ values << value
108
+ end
56
109
  emit_tsv(values)
57
110
  else
58
- pipeline.call(input_enum) { |value| emit_output(value) }
111
+ process_values(blocks, parallel: parallel, verbose: verbose) do |value|
112
+ emit_output(value)
113
+ end
59
114
  end
60
115
  ensure
61
116
  write_output(@output_buffer)
@@ -63,26 +118,205 @@ module Jrf
63
118
 
64
119
  private
65
120
 
66
- def each_input_value
67
- return each_input_value_lax { |value| yield value } if @lax
121
# Parses the pipeline expression and turns each stage's source into a proc.
#
# expression: the pipeline source text handed to PipelineParser.
# verbose:    when truthy, the parsed stages are passed to dump_stages first.
# Returns one proc per parsed stage, in stage order.
#
# NOTE: stage source is evaluated as Ruby code. The eval runs in this method's
# binding, so the local variable names below are visible to stage source and
# are deliberately left as they are.
def build_stage_blocks(expression, verbose:)
  parsed = PipelineParser.new(expression).parse
  stages = parsed[:stages]
  dump_stages(stages) if verbose
  stages.map do |stage|
    eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
  end
end
68
129
 
69
- each_input_value_ndjson { |value| yield value }
130
# Wraps a Pipeline built from the given stage blocks in an Enumerator.
#
# The pipeline is only invoked on input_enum when the returned Enumerator is
# iterated; every value the pipeline emits becomes one element.
def apply_pipeline(blocks, input_enum)
  stage_pipeline = Pipeline.new(*blocks)
  Enumerator.new do |yielder|
    stage_pipeline.call(input_enum) { |result| yielder << result }
  end
end
71
136
 
72
- def each_input_value_ndjson
73
- each_input do |source|
74
- source.each_line do |raw_line|
75
- line = raw_line.strip
76
- next if line.empty?
137
# Returns an Enumerator over every value produced by each_input_value.
# Nothing is read until the Enumerator is iterated.
def each_input_enum
  Enumerator.new do |yielder|
    each_input_value { |item| yielder << item }
  end
end
140
+
141
# Runs the stage blocks over all inputs and passes each result to the block.
#
# blocks:   stage procs, in pipeline order.
# parallel: requested worker count; values <= 1 mean serial execution.
# verbose:  when truthy, the parallel decision is reported via
#           dump_parallel_status.
#
# When parallel execution is possible, the leading stages (up to the split
# index) run in worker processes and the remaining stages run in this process
# over the workers' output. Otherwise the whole pipeline runs serially.
def process_values(blocks, parallel:, verbose:, &block)
  split_index = parallel_split_index(blocks, parallel)
  unless split_index
    dump_parallel_status("disabled", verbose: verbose)
    return apply_pipeline(blocks, each_input_enum).each(&block)
  end

  # Parallelize the longest map-only prefix; reducers stay in the parent.
  map_blocks = blocks[0...split_index]
  reduce_blocks = blocks[split_index..]
  dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
  input_enum = parallel_map_enum(map_blocks, parallel)
  (reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)).each(&block)
end

# Returns the number of leading stages to run in workers, or nil when the
# pipeline has to run serially.
def parallel_split_index(blocks, parallel)
  return nil if parallel <= 1 || @file_paths.length <= 1
  # "-" means stdin on the serial path, but the parallel path opens every
  # entry as a file by name, so a stdin entry forces serial execution.
  return nil if @file_paths.include?("-")
  # Workers are created with fork, which not every platform provides.
  return nil unless Process.respond_to?(:fork)

  split_index = classify_parallel_stages(blocks)
  split_index if split_index && split_index > 0
end
160
+
161
# Writes a "parallel: <status>" line to the error stream when verbose is set.
def dump_parallel_status(status, verbose:)
  return unless verbose

  @err.puts "parallel: #{status}"
end
164
+
165
# Finds how many leading stages come before the first reducer stage.
#
# The first value of the first file is used as a probe: each stage block is
# run on it in a pipeline of its own, and the stage's @mode is then inspected.
# Returns the index of the first stage whose mode is :reducer, blocks.length
# when no stage is a reducer, or nil when no probe value could be read (this
# includes a first value that is itself nil).
def classify_parallel_stages(blocks)
  # Read only the first row of the first file.
  probe_value = nil
  open_file(@file_paths.first) do |stream|
    each_stream_value(stream) do |value|
      probe_value = value
      break
    end
  end
  return nil if probe_value.nil?

  # Stages are probed one at a time, stopping at the first reducer.
  first_reducer = blocks.index do |block|
    probe_pipeline = Pipeline.new(block)
    probe_pipeline.call([probe_value]) { |_| }
    probed_stage = probe_pipeline.instance_variable_get(:@stages).first
    probed_stage.instance_variable_get(:@mode) == :reducer
  end

  first_reducer || blocks.length
end
190
+
191
# Forks a worker that runs the given stage blocks over one file and writes
# every result as a frame to a pipe.
#
# Returns [read_io, pid]: the read end of the worker's pipe and the child's
# process id. The caller is left with the read end only.
def spawn_parallel_worker(blocks, path)
  pipe_reader, pipe_writer = IO.pipe
  child_pid = fork do
    # Child: redirect output into the pipe and start with an empty buffer.
    pipe_reader.close
    @out = pipe_writer
    @output_buffer = +""
    pipeline = Pipeline.new(*blocks)
    source = Enumerator.new do |yielder|
      open_file(path) { |stream| each_stream_value(stream) { |item| yielder << item } }
    end
    failed =
      begin
        pipeline.call(source) { |value| emit_parallel_frame(value) }
        false
      rescue => e
        @err.puts "#{path}: #{e.message} (#{e.class})"
        true
      end
    # Frames produced before a failure are still flushed to the parent.
    write_output(@output_buffer)
    pipe_writer.close
    exit!(failed ? 1 : 0)
  end
  # Parent: drop the write end so only the child holds it.
  pipe_writer.close
  [pipe_reader, child_pid]
end
215
+
216
# Runs one worker per file, at most num_workers at a time, and yields every
# value the workers produce (each frame payload parsed as JSON).
#
# Returns the pids of all workers that were started; on a normal return the
# caller is responsible for waiting on them.
#
# Raises IOError when a worker's pipe ends in the middle of a frame. If this
# method is left by an exception or a non-local exit (for example the
# consumer stops early), the remaining pipes are closed and every started
# worker is terminated and reaped before control leaves, because no pid list
# reaches the caller in that case.
def run_parallel_worker_pool(blocks, num_workers)
  pending_paths = @file_paths.dup
  workers = {} # read_io => ParallelFrameReader
  children = []
  completed = false

  start_worker = lambda do
    read_io, pid = spawn_parallel_worker(blocks, pending_paths.shift)
    children << pid
    workers[read_io] = ParallelFrameReader.new
  end

  # Fill the initial pool.
  start_worker.call while workers.size < num_workers && !pending_paths.empty?

  until workers.empty?
    readable, = IO.select(workers.keys)
    readable.each do |io|
      reader = workers[io]
      chunk = io.read_nonblock(65536, exception: false)
      case chunk
      when :wait_readable
        next
      when nil
        # End of stream: the worker is done, so its slot can be reused.
        raise IOError, "truncated parallel frame from worker" if reader.has_partial?

        workers.delete(io)
        io.close
        start_worker.call unless pending_paths.empty?
      else
        reader.append(chunk)
        reader.each_payload { |payload| yield JSON.parse(payload) }
      end
    end
  end

  completed = true
  children
ensure
  unless completed
    workers.each_key { |io| io.close unless io.closed? }
    children.each do |pid|
      begin
        Process.kill("TERM", pid)
      rescue Errno::ESRCH
        # The worker is already gone; it is still reaped below.
      end
    end
    children.each do |pid|
      begin
        Process.waitpid(pid)
      rescue Errno::ECHILD
        # Nothing left to reap for this pid.
      end
    end
  end
end
261
+
262
# Returns an Enumerator over the values produced by the worker pool.
#
# The pool only starts when the Enumerator is iterated. Once the pool has
# returned its worker pids, they are handed to wait_for_parallel_children
# when the iteration ends.
def parallel_map_enum(map_blocks, num_workers)
  worker_pids = nil
  Enumerator.new do |yielder|
    begin
      worker_pids = run_parallel_worker_pool(map_blocks, num_workers) { |item| yielder << item }
    ensure
      wait_for_parallel_children(worker_pids) if worker_pids
    end
  end
end
270
+
271
# Waits for every given child pid, then exits the process with status 1 if
# any of them did not finish successfully.
def wait_for_parallel_children(children)
  # All children are waited on before the decision is made.
  outcomes = children.map { |pid| Process.waitpid2(pid).last.success? }
  exit(1) unless outcomes.all?
end
279
+
280
# Serializes value as JSON and queues it as one length-prefixed frame:
# a 4-byte big-endian length (pack format "N") followed by the JSON bytes.
#
# Raises ArgumentError when the JSON text does not fit a 4-byte length.
def emit_parallel_frame(value)
  # Work on bytes: the packed header is a binary string, and appending UTF-8
  # text with non-ASCII characters to a header containing a byte >= 0x80
  # (e.g. any payload of 128..255 bytes) raises Encoding::CompatibilityError.
  payload = JSON.generate(value).b
  if payload.bytesize > 0xFFFF_FFFF
    raise ArgumentError, "parallel frame payload too large (#{payload.bytesize} bytes)"
  end

  buffer_output([payload.bytesize].pack("N") << payload)
end
284
+
285
# Yields every value parsed from every input source, in input order.
def each_input_value
  each_input { |stream| each_stream_value(stream) { |item| yield item } }
end
82
290
 
83
- def each_input_value_lax
291
# Yields each JSON value read from stream.
#
# In lax mode the work is delegated to each_stream_value_lax. Otherwise the
# stream is read line by line: surrounding whitespace is stripped, blank
# lines are skipped, and every remaining line is parsed with JSON.parse
# (whose parse errors are not handled here).
def each_stream_value(stream)
  if @lax
    return each_stream_value_lax(stream) { |value| yield value }
  end

  stream.each_line do |raw_line|
    text = raw_line.strip
    yield JSON.parse(text) unless text.empty?
  end
end
300
+
301
# Opens path for reading and yields the open source to the block.
#
# Paths ending in ".gz" are opened through Zlib::GzipReader; all other paths
# are opened as binary files. The block form of both openers is used, so the
# source is closed when the block returns.
def open_file(path)
  return Zlib::GzipReader.open(path) { |source| yield source } if path.end_with?(".gz")

  File.open(path, "rb") { |source| yield source }
end
308
+
309
# Lax-mode reader: streams JSON values out of stream using Oj's callback
# parser, with the stream wrapped in an RsNormalizer.
#
# oj is loaded on first use. A missing oj gem is reported as a RuntimeError
# with an install hint, and Oj parse errors are re-raised as
# JSON::ParserError carrying the same message.
def each_stream_value_lax(stream)
  begin
    require "oj"
    handler = streaming_json_handler_class.new { |value| yield value }
    Oj.sc_parse(handler, RsNormalizer.new(stream))
  rescue LoadError
    raise "oj is required for --lax mode (gem install oj)"
  rescue Oj::ParseError => e
    raise JSON::ParserError, e.message
  end
end
317
+
318
+ def streaming_json_handler_class
319
+ @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
86
320
  def initialize(&emit)
87
321
  @emit = emit
88
322
  end
@@ -94,13 +328,6 @@ module Jrf
94
328
  def array_append(array, value) = array << value
95
329
  def add_value(value) = @emit.call(value)
96
330
  end
97
- each_input do |source|
98
- Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
99
- end
100
- rescue LoadError
101
- raise "oj is required for --lax mode (gem install oj)"
102
- rescue Oj::ParseError => e
103
- raise JSON::ParserError, e.message
104
331
  end
105
332
 
106
333
  def dump_stages(stages)
@@ -109,8 +336,25 @@ module Jrf
109
336
  end
110
337
  end
111
338
 
112
- def each_input
113
- @inputs.each { |source| yield source }
339
# Passes every input source to the block, one at a time.
#
# With no file paths, the stored stream is the only source. Otherwise each
# path is opened in order, and a path of "-" stands for the stored stream.
# Every source is processed inside with_error_handling, labelled with the
# path (or "<stdin>" for the stored stream).
def each_input(&block)
  return with_error_handling("<stdin>") { block.call(@stdin) } if @file_paths.empty?

  @file_paths.each do |path|
    next with_error_handling("<stdin>") { block.call(@stdin) } if path == "-"

    with_error_handling(path) { open_file(path, &block) }
  end
end
352
+
353
# Runs the block and converts input failures into a one-line report.
#
# IOError, SystemCallError, Zlib::GzipFile::Error and JSON::ParserError are
# caught: a "<name>: <message> (<class>)" line goes to the error stream and
# the input-error flag is set. Any other exception propagates to the caller.
def with_error_handling(name)
  begin
    yield
  rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => error
    @err.puts "#{name}: #{error.message} (#{error.class})"
    @input_errors = true
  end
end
115
359
 
116
360
  def emit_output(value)
@@ -171,7 +415,13 @@ module Jrf
171
415
  end
172
416
 
173
417
# Writes all of str to the output with syswrite, repeating the call with the
# unwritten tail until every byte has been accepted. Nothing is written for
# an empty string.
def write_output(str)
  size = str.bytesize
  sent = 0
  # syswrite may accept fewer bytes than offered, so resume from the byte
  # offset already written.
  sent += @out.syswrite(str.byteslice(sent..)) while sent < size
end
176
426
  end
177
427
  end
data/lib/jrf/cli.rb CHANGED
@@ -18,6 +18,7 @@ module Jrf
18
18
  --lax allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
19
19
  -o, --output FORMAT
20
20
  output format: json (default), pretty, tsv
21
+ -P N opportunistically parallelize the map-prefix across N workers
21
22
  -r, --require LIBRARY
22
23
  require LIBRARY before evaluating stages
23
24
  --no-jit do not enable YJIT, even when supported by the Ruby runtime
@@ -45,6 +46,7 @@ module Jrf
45
46
  verbose = false
46
47
  lax = false
47
48
  output_format = :json
49
+ parallel = 1
48
50
  jit = true
49
51
  required_libraries = []
50
52
  atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
@@ -54,6 +56,7 @@ module Jrf
54
56
  opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
55
57
  opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
56
58
  opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
59
+ opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
57
60
  opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
58
61
  opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
59
62
  opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
@@ -89,34 +92,20 @@ module Jrf
89
92
  enable_yjit if jit
90
93
  required_libraries.each { |library| require library }
91
94
 
92
- inputs = Enumerator.new do |y|
93
- if argv.empty?
94
- y << input
95
- else
96
- argv.each do |path|
97
- if path == "-"
98
- y << input
99
- elsif path.end_with?(".gz")
100
- require "zlib"
101
- Zlib::GzipReader.open(path) do |source|
102
- y << source
103
- end
104
- else
105
- File.open(path, "rb") do |source|
106
- y << source
107
- end
108
- end
109
- end
110
- end
111
- end
112
- Runner.new(
113
- inputs: inputs,
95
+ file_paths = argv.dup
96
+
97
+ runner = Runner.new(
98
+ input: file_paths.empty? ? input : file_paths,
114
99
  out: out,
115
100
  err: err,
116
101
  lax: lax,
117
102
  output_format: output_format,
118
103
  atomic_write_bytes: atomic_write_bytes
119
- ).run(expression, verbose: verbose)
104
+ )
105
+
106
+ runner.run(expression, parallel: parallel, verbose: verbose)
107
+
108
+ exit 1 if runner.input_errors?
120
109
  end
121
110
 
122
111
  def self.enable_yjit
data/lib/jrf/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Jrf
4
- VERSION = "0.1.13"
4
+ VERSION = "0.1.14"
5
5
  end
@@ -0,0 +1,195 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "test_helper"
4
+
5
# End-to-end tests for the -P option of the jrf executable. Every test writes
# its input files into a temporary directory and runs ./exe/jrf on them.
class CliParallelTest < JrfTestCase
  def test_parallel_map_only
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

      stdout, stderr, status = jrf_capture3("-v", "-P", "2", '_["x"]', *ndjson_files(dir))
      assert_success(status, stderr, "parallel map only")
      # Output order across files is not asserted, so compare sorted values.
      assert_equal([1, 2, 3, 4], lines(stdout).map(&:to_i).sort, "parallel map only output")
      assert_includes(stderr, "parallel: enabled workers=2 files=2 split=1/1", "parallel verbose summary")
    end
  end

  def test_parallel_map_only_pretty_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}])
      write_ndjson(dir, "b.ndjson", [{"x" => 2}])

      stdout, stderr, status = jrf_capture3("-P", "2", "-o", "pretty", '_["x"]', *ndjson_files(dir))
      assert_success(status, stderr, "parallel pretty map only")
      assert_equal(["1", "2"], stdout.lines.map(&:strip).reject(&:empty?).sort, "parallel pretty map only output")
    end
  end

  def test_parallel_map_only_tsv_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"a" => 1, "b" => 2}])
      write_ndjson(dir, "b.ndjson", [{"a" => 3, "b" => 4}])

      stdout, stderr, status = jrf_capture3("-P", "2", "-o", "tsv", "_", *ndjson_files(dir))
      assert_success(status, stderr, "parallel tsv map only")
      assert_equal(["a\t1", "a\t3", "b\t2", "b\t4"], lines(stdout).sort, "parallel tsv map only output")
    end
  end

  def test_parallel_map_reduce
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

      stdout, stderr, status = jrf_capture3("-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel map reduce")
      assert_equal(%w[10], lines(stdout), "parallel sum output")
    end
  end

  def test_parallel_split_map_and_reduce
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
      write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])

      stdout, stderr, status = jrf_capture3("-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel split map+reduce")
      assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
    end
  end

  def test_parallel_group_by
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"k" => "a", "v" => 1}, {"k" => "b", "v" => 2}])
      write_ndjson(dir, "b.ndjson", [{"k" => "a", "v" => 3}, {"k" => "b", "v" => 4}])

      stdout, stderr, status = jrf_capture3("-P", "2", 'group_by(_["k"]) { |r| sum(r["v"]) }', *ndjson_files(dir))
      assert_success(status, stderr, "parallel group_by")
      result = JSON.parse(lines(stdout).first)
      assert_equal(4, result["a"], "parallel group_by a")
      assert_equal(6, result["b"], "parallel group_by b")
    end
  end

  def test_parallel_all_reducers_falls_back_to_serial
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}])

      stdout, stderr, status = jrf_capture3("-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "all-reducer serial fallback")
      assert_equal(%w[6], lines(stdout), "all-reducer serial fallback output")
      assert_includes(stderr, "parallel: disabled", "parallel disabled summary")
    end
  end

  def test_parallel_with_gz_files
    Dir.mktmpdir do |dir|
      gz_path_a = File.join(dir, "a.ndjson.gz")
      Zlib::GzipWriter.open(gz_path_a) { |io| io.write("{\"x\":10}\n{\"x\":20}\n") }
      gz_path_b = File.join(dir, "b.ndjson.gz")
      Zlib::GzipWriter.open(gz_path_b) { |io| io.write("{\"x\":30}\n") }

      stdout, stderr, status = jrf_capture3("-P", "2", 'sum(_["x"])', gz_path_a, gz_path_b)
      assert_success(status, stderr, "parallel with gz")
      assert_equal(%w[60], lines(stdout), "parallel with gz output")
    end
  end

  def test_parallel_matches_serial_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i, "g" => i % 3} })
      write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i, "g" => i % 3} })

      files = ndjson_files(dir)
      expr = 'group_by(_["g"]) { |r| sum(r["v"]) }'

      serial_stdout, serial_stderr, serial_status = jrf_capture3(expr, *files)
      assert_success(serial_status, serial_stderr, "serial baseline")

      parallel_stdout, parallel_stderr, parallel_status = jrf_capture3("-P", "2", expr, *files)
      assert_success(parallel_status, parallel_stderr, "parallel run")

      assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel matches serial")
    end
  end

  def test_parallel_worker_error_handling
    Dir.mktmpdir do |dir|
      good_path = File.join(dir, "a.ndjson")
      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")

      bad_gz_path = File.join(dir, "b.ndjson.gz")
      write_truncated_gz(bad_gz_path)

      stdout, stderr, status = jrf_capture3("-P", "2", '_["x"]', good_path, bad_gz_path)
      assert_failure(status, "worker error causes non-zero exit")
      assert_includes(stderr, bad_gz_path, "error message includes filename")
      # Good file data should still be present
      output_values = lines(stdout).map(&:to_i)
      assert_includes(output_values, 1, "good file data preserved")
      assert_includes(output_values, 2, "good file data preserved")
    end
  end

  def test_parallel_requires_multiple_files
    # With single file and -P, should still work (falls back to serial)
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])

      stdout, stderr, status = jrf_capture3("-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "single file with -P")
      assert_equal(%w[3], lines(stdout), "single file with -P output")
    end
  end

  def test_parallel_select_then_sum
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
      write_ndjson(dir, "b.ndjson", [{"x" => 40}, {"x" => 5}])

      stdout, stderr, status = jrf_capture3("-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel select then sum")
      assert_equal(%w[60], lines(stdout), "parallel select then sum output")
    end
  end

  def test_serial_error_includes_filename
    Dir.mktmpdir do |dir|
      good_path = File.join(dir, "a.ndjson")
      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")

      bad_gz_path = File.join(dir, "b.ndjson.gz")
      write_truncated_gz(bad_gz_path)

      good_path2 = File.join(dir, "c.ndjson")
      File.write(good_path2, "{\"x\":3}\n")

      stdout, stderr, status = jrf_capture3('_["x"]', good_path, bad_gz_path, good_path2)
      assert_failure(status, "serial error causes non-zero exit")
      assert_includes(stderr, bad_gz_path, "serial error message includes filename")
      refute_includes(stderr, "from ", "serial error does not include stacktrace")
      # Data from good files should still be present
      output_values = lines(stdout).map(&:to_i)
      assert_includes(output_values, 1, "data before bad file preserved")
      assert_includes(output_values, 3, "data after bad file preserved")
    end
  end

  private

  # Runs ./exe/jrf with the given arguments; returns [stdout, stderr, status].
  def jrf_capture3(*args)
    Open3.capture3("./exe/jrf", *args)
  end

  # Writes rows to dir/name as newline-delimited JSON, one row per line.
  def write_ndjson(dir, name, rows)
    File.write(File.join(dir, name), rows.map { |r| JSON.generate(r) + "\n" }.join)
  end

  # Returns the sorted paths of the "*.ndjson" files in dir.
  def ndjson_files(dir)
    Dir.glob(File.join(dir, "*.ndjson")).sort
  end

  # Writes only the first half of a gzip stream to path, simulating a
  # truncated file (valid header, incomplete body).
  def write_truncated_gz(path)
    full_gz = StringIO.new
    Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
    File.binwrite(path, full_gz.string[0, full_gz.string.bytesize / 2])
  end
end
@@ -106,24 +106,23 @@ class CliRunnerTest < JrfTestCase
106
106
 
107
107
  def test_runner_buffering_and_require_option
108
108
  threshold_input = StringIO.new((1..4).map { |i| "{\"foo\":\"#{'x' * 1020}\",\"i\":#{i}}\n" }.join)
109
- buffered_runner = RecordingRunner.new(inputs: [threshold_input], out: StringIO.new, err: StringIO.new)
109
+ buffered_runner = RecordingRunner.new(input: threshold_input, out: StringIO.new, err: StringIO.new)
110
110
  buffered_runner.run('_')
111
111
  expected_line = JSON.generate({"foo" => "x" * 1020, "i" => 1}) + "\n"
112
112
  assert_equal(2, buffered_runner.writes.length, "default atomic write limit buffers records until the configured threshold")
113
113
  assert_equal(expected_line.bytesize * 3, buffered_runner.writes.first.bytesize, "default atomic write limit flushes before the next record would exceed the threshold")
114
114
  assert_equal(expected_line.bytesize, buffered_runner.writes.last.bytesize, "final buffer flush emits the remaining record")
115
115
 
116
- small_limit_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":2}\n")], out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
116
+ small_limit_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":2}\n"), out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
117
117
  small_limit_runner.run('_["foo"]')
118
118
  assert_equal(["1\n", "2\n"], small_limit_runner.writes, "small atomic write limit emits oversized records directly")
119
119
 
120
- error_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":")], out: StringIO.new, err: StringIO.new)
121
- begin
122
- error_runner.run('_["foo"]')
123
- flunk("expected parse error for buffered flush test")
124
- rescue JSON::ParserError
125
- assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors escape")
126
- end
120
+ err_io = StringIO.new
121
+ error_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":"), out: StringIO.new, err: err_io)
122
+ error_runner.run('_["foo"]')
123
+ assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors")
124
+ assert_includes(err_io.string, "JSON::ParserError", "parse error reported to stderr")
125
+ assert(error_runner.input_errors?, "input_errors? is true after parse error")
127
126
 
128
127
  input_hello = <<~NDJSON
129
128
  {"hello":123}
@@ -648,7 +647,7 @@ class CliRunnerTest < JrfTestCase
648
647
  assert_equal(%w[9], lines(stdout), "lax trailing separator output")
649
648
 
650
649
  chunked_lax_out = RecordingRunner.new(
651
- inputs: [ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n")],
650
+ input: ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n"),
652
651
  out: StringIO.new,
653
652
  err: StringIO.new,
654
653
  lax: true
@@ -691,6 +690,7 @@ class CliRunnerTest < JrfTestCase
691
690
  assert_failure(status, "broken input should fail")
692
691
  assert_equal(%w[3], lines(stdout), "reducers flush before parse error")
693
692
  assert_includes(stderr, "JSON::ParserError")
693
+ refute_includes(stderr, "from ", "no stacktrace for parse errors")
694
694
  end
695
695
 
696
696
  def test_map
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jrf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - kazuho
@@ -75,6 +75,7 @@ files:
75
75
  - lib/jrf/row_context.rb
76
76
  - lib/jrf/stage.rb
77
77
  - lib/jrf/version.rb
78
+ - test/cli_parallel_test.rb
78
79
  - test/cli_runner_test.rb
79
80
  - test/library_api_test.rb
80
81
  - test/readme_examples_test.rb