RubyGems - jrf - Versions diffs - 0.1.13 → 0.1.14 - Mend

jrf 0.1.13 → 0.1.14

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2862eaf6bd5f2486ea2c6aebf5caa4fbc2de56f419625bf8bb462392a3ea5dd9
-  data.tar.gz: 3f29e7024f4e33606d78ad01ce4c45f37c9cd652ba94ac490866cd877368037a
+  metadata.gz: 78c1f6eb54e20d4dffbfe57f89a49d9e8ec9bbb2a9e118d911f2dec3c649f4ac
+  data.tar.gz: 63f43701422cfe200b7932a2177132f5e4e74e690960e71b88d6cc7b767e0b3c
 SHA512:
-  metadata.gz: 04f55e0ea8c24f70126964beffbe80bee1800e1e210da2f96186bb8ebdf5542e5dfbab9c06b48624da4ec35912d02456561ec6c0d2c66c094de001ecf7f4096f
-  data.tar.gz: '093821f35539be4561867b711664a31d3441052fe53e1c0f73489cd8b11fdf845bfb5573375f880ce07cd73ffc2f1d0514b55b760227c01ab515afd39d8ac08a'
+  metadata.gz: 152ebdc2322f9a8b6c0cad2cb303a093a45d5e0ecc17b519904e40e069a747b56e33f1ddd33f7f3efb32031d78808d05e32d93ab151572b973a1324f9e676e0b
+  data.tar.gz: 63c189a79b484777c25f5c1a7951d930fc2d110f3547216b2fd099469e57e7a062c0ec64ba2c7b0c3d7e88a6fb5f1f40d3b5ba6d1a0803acfc5253b00f43dfe8

data/lib/jrf/cli/runner.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "json"
+require "zlib"
 require_relative "../pipeline"
 require_relative "../pipeline_parser"
@@ -9,6 +10,7 @@ module Jrf
     class Runner
       RS_CHAR = "\x1e"
       DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
+      PARALLEL_FRAME_HEADER_BYTES = 4
       class RsNormalizer
         def initialize(input)
@@ -28,34 +30,87 @@ module Jrf
         end
       end
-      def initialize(inputs:, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
-        @inputs = inputs
+      class ParallelFrameReader
+        def initialize
+          @buf = +""
+          @offset = 0
+        end
+        def append(chunk)
+          @buf << chunk
+        end
+        def each_payload
+          while (payload = next_payload)
+            yield payload
+          end
+        end
+        def has_partial?
+          @offset != @buf.bytesize
+        end
+        private
+        def next_payload
+          if @buf.bytesize - @offset < PARALLEL_FRAME_HEADER_BYTES
+            compact!
+            return nil
+          end
+          payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
+          frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
+          if @buf.bytesize - @offset < frame_len
+            compact!
+            return nil
+          end
+          payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
+          @offset += frame_len
+          payload
+        end
+        def compact!
+          if @offset > 0
+            @buf = @buf.byteslice(@offset..) || +""
+            @offset = 0
+          end
+        end
+      end
+      def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
+        if input.is_a?(Array)
+          @file_paths = input
+          @stdin = nil
+        else
+          @file_paths = []
+          @stdin = input
+        end
         @out = out
         @err = err
         @lax = lax
         @output_format = output_format
         @atomic_write_bytes = atomic_write_bytes
         @output_buffer = +""
+        @input_errors = false
       end
-      def run(expression, verbose: false)
-        parsed = PipelineParser.new(expression).parse
-        stages = parsed[:stages]
-        dump_stages(stages) if verbose
-        blocks = stages.map { |stage|
-          eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
-        }
-        pipeline = Pipeline.new(*blocks)
-        input_enum = Enumerator.new { |y| each_input_value { |v| y << v } }
+      def input_errors?
+        @input_errors
+      end
+      def run(expression, parallel: 1, verbose: false)
+        blocks = build_stage_blocks(expression, verbose: verbose)
         if @output_format == :tsv
           values = []
-          pipeline.call(input_enum) { |value| values << value }
+          process_values(blocks, parallel: parallel, verbose: verbose) do |value|
+            values << value
+          end
           emit_tsv(values)
         else
-          pipeline.call(input_enum) { |value| emit_output(value) }
+          process_values(blocks, parallel: parallel, verbose: verbose) do |value|
+            emit_output(value)
+          end
         end
       ensure
         write_output(@output_buffer)
@@ -63,26 +118,205 @@ module Jrf
       private
-      def each_input_value
-        return each_input_value_lax { |value| yield value } if @lax
+      def build_stage_blocks(expression, verbose:)
+        parsed = PipelineParser.new(expression).parse
+        stages = parsed[:stages]
+        dump_stages(stages) if verbose
+        stages.map { |stage|
+          eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
+        }
+      end
-        each_input_value_ndjson { |value| yield value }
+      def apply_pipeline(blocks, input_enum)
+        pipeline = Pipeline.new(*blocks)
+        Enumerator.new do |y|
+          pipeline.call(input_enum) { |value| y << value }
+        end
       end
-      def each_input_value_ndjson
-        each_input do |source|
-          source.each_line do |raw_line|
-            line = raw_line.strip
-            next if line.empty?
+      def each_input_enum
+        Enumerator.new { |y| each_input_value { |v| y << v } }
+      end
+      def process_values(blocks, parallel:, verbose:, &block)
+        if parallel <= 1 || @file_paths.length <= 1
+          dump_parallel_status("disabled", verbose: verbose)
+          return apply_pipeline(blocks, each_input_enum).each(&block)
+        end
+        # Parallelize the longest map-only prefix; reducers stay in the parent.
+        split_index = classify_parallel_stages(blocks)
+        if split_index.nil? || split_index == 0
+          dump_parallel_status("disabled", verbose: verbose)
+          return apply_pipeline(blocks, each_input_enum).each(&block)
+        end
+        map_blocks = blocks[0...split_index]
+        reduce_blocks = blocks[split_index..]
+        dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
+        input_enum = parallel_map_enum(map_blocks, parallel)
+        (reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)).each(&block)
+      end
+      def dump_parallel_status(status, verbose:)
+        @err.puts "parallel: #{status}" if verbose
+      end
+      def classify_parallel_stages(blocks)
+        # Read the first row from the first file to probe stage modes
+        first_value = nil
+        open_file(@file_paths.first) do |stream|
+          each_stream_value(stream) do |value|
+            first_value = value
+            break
+          end
+        end
+        return nil if first_value.nil?
+        # Run the value through each stage independently to classify
+        split_index = nil
+        blocks.each_with_index do |block, i|
+          probe_pipeline = Pipeline.new(block)
+          probe_pipeline.call([first_value]) { |_| }
+          stage = probe_pipeline.instance_variable_get(:@stages).first
+          if stage.instance_variable_get(:@mode) == :reducer
+            split_index = i
+            break
+          end
+        end
-            yield JSON.parse(line)
+        split_index || blocks.length
+      end
+      def spawn_parallel_worker(blocks, path)
+        read_io, write_io = IO.pipe
+        pid = fork do
+          read_io.close
+          @out = write_io
+          @output_buffer = +""
+          pipeline = Pipeline.new(*blocks)
+          input_enum = Enumerator.new do |y|
+            open_file(path) { |stream| each_stream_value(stream) { |v| y << v } }
+          end
+          worker_failed = false
+          begin
+            pipeline.call(input_enum) { |value| emit_parallel_frame(value) }
+          rescue => e
+            @err.puts "#{path}: #{e.message} (#{e.class})"
+            worker_failed = true
+          end
+          write_output(@output_buffer)
+          write_io.close
+          exit!(worker_failed ? 1 : 0)
+        end
+        write_io.close
+        [read_io, pid]
+      end
+      def run_parallel_worker_pool(blocks, num_workers)
+        file_queue = @file_paths.dup
+        workers = {} # read_io => [reader, pid]
+        children = []
+        # Fill initial pool
+        while workers.size < num_workers && !file_queue.empty?
+          read_io, pid = spawn_parallel_worker(blocks, file_queue.shift)
+          workers[read_io] = [ParallelFrameReader.new, pid]
+          children << pid
+        end
+        read_ios = workers.keys.dup
+        until read_ios.empty?
+          ready = IO.select(read_ios)
+          ready[0].each do |io|
+            reader = workers[io][0]
+            chunk = io.read_nonblock(65536, exception: false)
+            if chunk == :wait_readable
+              next
+            elsif chunk.nil?
+              raise IOError, "truncated parallel frame from worker" if reader.has_partial?
+              read_ios.delete(io)
+              io.close
+              workers.delete(io)
+              # Spawn next worker if files remain
+              unless file_queue.empty?
+                read_io, pid = spawn_parallel_worker(blocks, file_queue.shift)
+                workers[read_io] = [ParallelFrameReader.new, pid]
+                children << pid
+                read_ios << read_io
+              end
+            else
+              reader.append(chunk)
+              reader.each_payload do |payload|
+                yield JSON.parse(payload)
+              end
+            end
           end
         end
+        children
+      end
+      def parallel_map_enum(map_blocks, num_workers)
+        children = nil
+        Enumerator.new do |y|
+          children = run_parallel_worker_pool(map_blocks, num_workers) { |value| y << value }
+        ensure
+          wait_for_parallel_children(children) if children
+        end
+      end
+      def wait_for_parallel_children(children)
+        failed = false
+        children.each do |pid|
+          _, status = Process.waitpid2(pid)
+          failed = true unless status.success?
+        end
+        exit(1) if failed
+      end
+      def emit_parallel_frame(value)
+        payload = JSON.generate(value)
+        buffer_output([payload.bytesize].pack("N") << payload)
+      end
+      def each_input_value
+        each_input do |source|
+          each_stream_value(source) { |value| yield value }
+        end
       end
-      def each_input_value_lax
+      def each_stream_value(stream)
+        return each_stream_value_lax(stream) { |value| yield value } if @lax
+        stream.each_line do |raw_line|
+          line = raw_line.strip
+          next if line.empty?
+          yield JSON.parse(line)
+        end
+      end
+      def open_file(path)
+        if path.end_with?(".gz")
+          Zlib::GzipReader.open(path) { |source| yield source }
+        else
+          File.open(path, "rb") { |source| yield source }
+        end
+      end
+      def each_stream_value_lax(stream)
         require "oj"
-        handler = Class.new(Oj::ScHandler) do
+        Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
+      rescue LoadError
+        raise "oj is required for --lax mode (gem install oj)"
+      rescue Oj::ParseError => e
+        raise JSON::ParserError, e.message
+      end
+      def streaming_json_handler_class
+        @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
           def initialize(&emit)
             @emit = emit
           end
@@ -94,13 +328,6 @@ module Jrf
           def array_append(array, value) = array << value
           def add_value(value) = @emit.call(value)
         end
-        each_input do |source|
-          Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
-        end
-      rescue LoadError
-        raise "oj is required for --lax mode (gem install oj)"
-      rescue Oj::ParseError => e
-        raise JSON::ParserError, e.message
       end
       def dump_stages(stages)
@@ -109,8 +336,25 @@ module Jrf
         end
       end
-      def each_input
-        @inputs.each { |source| yield source }
+      def each_input(&block)
+        if @file_paths.empty?
+          with_error_handling("<stdin>") { block.call(@stdin) }
+        else
+          @file_paths.each do |path|
+            if path == "-"
+              with_error_handling("<stdin>") { block.call(@stdin) }
+            else
+              with_error_handling(path) { open_file(path, &block) }
+            end
+          end
+        end
+      end
+      def with_error_handling(name)
+        yield
+      rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => e
+        @err.puts "#{name}: #{e.message} (#{e.class})"
+        @input_errors = true
       end
       def emit_output(value)
@@ -171,7 +415,13 @@ module Jrf
       end
       def write_output(str)
-        @out.syswrite(str)
+        return if str.empty?
+        total = 0
+        while total < str.bytesize
+          written = @out.syswrite(str.byteslice(total..))
+          total += written
+        end
       end
     end
   end

data/lib/jrf/cli.rb CHANGED Viewed

@@ -18,6 +18,7 @@ module Jrf
         --lax          allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
         -o, --output FORMAT
                        output format: json (default), pretty, tsv
+        -P N           opportunistically parallelize the map-prefix across N workers
         -r, --require LIBRARY
                        require LIBRARY before evaluating stages
         --no-jit       do not enable YJIT, even when supported by the Ruby runtime
@@ -45,6 +46,7 @@ module Jrf
       verbose = false
       lax = false
       output_format = :json
+      parallel = 1
       jit = true
       required_libraries = []
       atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
@@ -54,6 +56,7 @@ module Jrf
           opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
           opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
           opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
+          opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
           opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
           opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
           opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
@@ -89,34 +92,20 @@ module Jrf
       enable_yjit if jit
       required_libraries.each { |library| require library }
-      inputs = Enumerator.new do |y|
-        if argv.empty?
-          y << input
-        else
-          argv.each do |path|
-            if path == "-"
-              y << input
-            elsif path.end_with?(".gz")
-              require "zlib"
-              Zlib::GzipReader.open(path) do |source|
-                y << source
-              end
-            else
-              File.open(path, "rb") do |source|
-                y << source
-              end
-            end
-          end
-        end
-      end
-      Runner.new(
-        inputs: inputs,
+      file_paths = argv.dup
+      runner = Runner.new(
+        input: file_paths.empty? ? input : file_paths,
         out: out,
         err: err,
         lax: lax,
         output_format: output_format,
         atomic_write_bytes: atomic_write_bytes
-      ).run(expression, verbose: verbose)
+      )
+      runner.run(expression, parallel: parallel, verbose: verbose)
+      exit 1 if runner.input_errors?
     end
     def self.enable_yjit

data/lib/jrf/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Jrf
-  VERSION = "0.1.13"
+  VERSION = "0.1.14"
 end

data/test/cli_parallel_test.rb ADDED Viewed

@@ -0,0 +1,195 @@
+# frozen_string_literal: true
+require_relative "test_helper"
+class CliParallelTest < JrfTestCase
+  def test_parallel_map_only
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '_["x"]', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel map only")
+      assert_equal([1, 2, 3, 4], lines(stdout).map(&:to_i).sort, "parallel map only output")
+      assert_includes(stderr, "parallel: enabled workers=2 files=2 split=1/1", "parallel verbose summary")
+    end
+  end
+  def test_parallel_map_only_pretty_output
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 2}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "pretty", '_["x"]', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel pretty map only")
+      assert_equal(["1", "2"], stdout.lines.map(&:strip).reject(&:empty?).sort, "parallel pretty map only output")
+    end
+  end
+  def test_parallel_map_only_tsv_output
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"a" => 1, "b" => 2}])
+      write_ndjson(dir, "b.ndjson", [{"a" => 3, "b" => 4}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "tsv", "_", *ndjson_files(dir))
+      assert_success(status, stderr, "parallel tsv map only")
+      assert_equal(["a\t1", "a\t3", "b\t2", "b\t4"], lines(stdout).sort, "parallel tsv map only output")
+    end
+  end
+  def test_parallel_map_reduce
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel map reduce")
+      assert_equal(%w[10], lines(stdout), "parallel sum output")
+    end
+  end
+  def test_parallel_split_map_and_reduce
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel split map+reduce")
+      assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
+    end
+  end
+  def test_parallel_group_by
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"k" => "a", "v" => 1}, {"k" => "b", "v" => 2}])
+      write_ndjson(dir, "b.ndjson", [{"k" => "a", "v" => 3}, {"k" => "b", "v" => 4}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'group_by(_["k"]) { |r| sum(r["v"]) }', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel group_by")
+      result = JSON.parse(lines(stdout).first)
+      assert_equal(4, result["a"], "parallel group_by a")
+      assert_equal(6, result["b"], "parallel group_by b")
+    end
+  end
+  def test_parallel_all_reducers_falls_back_to_serial
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 3}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
+      assert_success(status, stderr, "all-reducer serial fallback")
+      assert_equal(%w[6], lines(stdout), "all-reducer serial fallback output")
+      assert_includes(stderr, "parallel: disabled", "parallel disabled summary")
+    end
+  end
+  def test_parallel_with_gz_files
+    Dir.mktmpdir do |dir|
+      gz_path_a = File.join(dir, "a.ndjson.gz")
+      Zlib::GzipWriter.open(gz_path_a) { |io| io.write("{\"x\":10}\n{\"x\":20}\n") }
+      gz_path_b = File.join(dir, "b.ndjson.gz")
+      Zlib::GzipWriter.open(gz_path_b) { |io| io.write("{\"x\":30}\n") }
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', gz_path_a, gz_path_b)
+      assert_success(status, stderr, "parallel with gz")
+      assert_equal(%w[60], lines(stdout), "parallel with gz output")
+    end
+  end
+  def test_parallel_matches_serial_output
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i, "g" => i % 3} })
+      write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i, "g" => i % 3} })
+      files = ndjson_files(dir)
+      expr = 'group_by(_["g"]) { |r| sum(r["v"]) }'
+      serial_stdout, serial_stderr, serial_status = Open3.capture3("./exe/jrf", expr, *files)
+      assert_success(serial_status, serial_stderr, "serial baseline")
+      parallel_stdout, parallel_stderr, parallel_status = Open3.capture3("./exe/jrf", "-P", "2", expr, *files)
+      assert_success(parallel_status, parallel_stderr, "parallel run")
+      assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel matches serial")
+    end
+  end
+  def test_parallel_worker_error_handling
+    Dir.mktmpdir do |dir|
+      good_path = File.join(dir, "a.ndjson")
+      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")
+      # Create a truncated gz file (valid header, truncated body)
+      bad_gz_path = File.join(dir, "b.ndjson.gz")
+      full_gz = StringIO.new
+      Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
+      # Write only the first half to simulate truncation
+      File.binwrite(bad_gz_path, full_gz.string[0, full_gz.string.bytesize / 2])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", '_["x"]', good_path, bad_gz_path)
+      assert_failure(status, "worker error causes non-zero exit")
+      assert_includes(stderr, bad_gz_path, "error message includes filename")
+      # Good file data should still be present
+      output_values = lines(stdout).map(&:to_i)
+      assert_includes(output_values, 1, "good file data preserved")
+      assert_includes(output_values, 2, "good file data preserved")
+    end
+  end
+  def test_parallel_requires_multiple_files
+    # With single file and -P, should still work (falls back to serial)
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
+      assert_success(status, stderr, "single file with -P")
+      assert_equal(%w[3], lines(stdout), "single file with -P output")
+    end
+  end
+  def test_parallel_select_then_sum
+    Dir.mktmpdir do |dir|
+      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
+      write_ndjson(dir, "b.ndjson", [{"x" => 40}, {"x" => 5}])
+      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
+      assert_success(status, stderr, "parallel select then sum")
+      assert_equal(%w[60], lines(stdout), "parallel select then sum output")
+    end
+  end
+  def test_serial_error_includes_filename
+    Dir.mktmpdir do |dir|
+      good_path = File.join(dir, "a.ndjson")
+      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")
+      bad_gz_path = File.join(dir, "b.ndjson.gz")
+      full_gz = StringIO.new
+      Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
+      File.binwrite(bad_gz_path, full_gz.string[0, full_gz.string.bytesize / 2])
+      good_path2 = File.join(dir, "c.ndjson")
+      File.write(good_path2, "{\"x\":3}\n")
+      stdout, stderr, status = Open3.capture3("./exe/jrf", '_["x"]', good_path, bad_gz_path, good_path2)
+      assert_failure(status, "serial error causes non-zero exit")
+      assert_includes(stderr, bad_gz_path, "serial error message includes filename")
+      refute_includes(stderr, "from ", "serial error does not include stacktrace")
+      # Data from good files should still be present
+      output_values = lines(stdout).map(&:to_i)
+      assert_includes(output_values, 1, "data before bad file preserved")
+      assert_includes(output_values, 3, "data after bad file preserved")
+    end
+  end
+  private
+  def write_ndjson(dir, name, rows)
+    File.write(File.join(dir, name), rows.map { |r| JSON.generate(r) + "\n" }.join)
+  end
+  def ndjson_files(dir)
+    Dir.glob(File.join(dir, "*.ndjson")).sort
+  end
+end

data/test/cli_runner_test.rb CHANGED Viewed

@@ -106,24 +106,23 @@ class CliRunnerTest < JrfTestCase
   def test_runner_buffering_and_require_option
     threshold_input = StringIO.new((1..4).map { |i| "{\"foo\":\"#{'x' * 1020}\",\"i\":#{i}}\n" }.join)
-    buffered_runner = RecordingRunner.new(inputs: [threshold_input], out: StringIO.new, err: StringIO.new)
+    buffered_runner = RecordingRunner.new(input: threshold_input, out: StringIO.new, err: StringIO.new)
     buffered_runner.run('_')
     expected_line = JSON.generate({"foo" => "x" * 1020, "i" => 1}) + "\n"
     assert_equal(2, buffered_runner.writes.length, "default atomic write limit buffers records until the configured threshold")
     assert_equal(expected_line.bytesize * 3, buffered_runner.writes.first.bytesize, "default atomic write limit flushes before the next record would exceed the threshold")
     assert_equal(expected_line.bytesize, buffered_runner.writes.last.bytesize, "final buffer flush emits the remaining record")
-    small_limit_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":2}\n")], out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
+    small_limit_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":2}\n"), out: StringIO.new, err: StringIO.new, atomic_write_bytes: 1)
     small_limit_runner.run('_["foo"]')
     assert_equal(["1\n", "2\n"], small_limit_runner.writes, "small atomic write limit emits oversized records directly")
-    error_runner = RecordingRunner.new(inputs: [StringIO.new("{\"foo\":1}\n{\"foo\":")], out: StringIO.new, err: StringIO.new)
-    begin
-      error_runner.run('_["foo"]')
-      flunk("expected parse error for buffered flush test")
-    rescue JSON::ParserError
-      assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors escape")
-    end
+    err_io = StringIO.new
+    error_runner = RecordingRunner.new(input: StringIO.new("{\"foo\":1}\n{\"foo\":"), out: StringIO.new, err: err_io)
+    error_runner.run('_["foo"]')
+    assert_equal(["1\n"], error_runner.writes, "buffer flushes pending output before parse errors")
+    assert_includes(err_io.string, "JSON::ParserError", "parse error reported to stderr")
+    assert(error_runner.input_errors?, "input_errors? is true after parse error")
     input_hello = <<~NDJSON
       {"hello":123}
@@ -648,7 +647,7 @@ class CliRunnerTest < JrfTestCase
     assert_equal(%w[9], lines(stdout), "lax trailing separator output")
     chunked_lax_out = RecordingRunner.new(
-      inputs: [ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n")],
+      input: ChunkedSource.new("{\"foo\":1}\n\x1e{\"foo\":2}\n\t{\"foo\":3}\n"),
       out: StringIO.new,
       err: StringIO.new,
       lax: true
@@ -691,6 +690,7 @@ class CliRunnerTest < JrfTestCase
     assert_failure(status, "broken input should fail")
     assert_equal(%w[3], lines(stdout), "reducers flush before parse error")
     assert_includes(stderr, "JSON::ParserError")
+    refute_includes(stderr, "from ", "no stacktrace for parse errors")
   end
   def test_map

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jrf
 version: !ruby/object:Gem::Version
-  version: 0.1.13
+  version: 0.1.14
 platform: ruby
 authors:
 - kazuho
@@ -75,6 +75,7 @@ files:
 - lib/jrf/row_context.rb
 - lib/jrf/stage.rb
 - lib/jrf/version.rb
+- test/cli_parallel_test.rb
 - test/cli_runner_test.rb
 - test/library_api_test.rb
 - test/readme_examples_test.rb