RubyGems - jrf - Versions diffs - 0.1.12 → 0.1.14 - Mend

jrf 0.1.12 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/jrf.gemspec +2 -0
data/lib/jrf/cli/runner.rb +336 -34
data/lib/jrf/cli.rb +17 -27
data/lib/jrf/version.rb +1 -1
data/test/cli_parallel_test.rb +195 -0
data/test/cli_runner_test.rb +951 -0
data/test/library_api_test.rb +126 -0
data/test/readme_examples_test.rb +16 -0
data/test/test_helper.rb +118 -0
metadata +34 -2
data/test/jrf_test.rb +0 -1103

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ddf9bb5a12260eea615d0107dc7374dec2b5a3fe81c51791ea7e7ffea359d12f
-  data.tar.gz: d556b7d230185a9c397af45abaa08b912f485f3a1494bebdf86f2ef44be81c64
+  metadata.gz: 78c1f6eb54e20d4dffbfe57f89a49d9e8ec9bbb2a9e118d911f2dec3c649f4ac
+  data.tar.gz: 63f43701422cfe200b7932a2177132f5e4e74e690960e71b88d6cc7b767e0b3c
 SHA512:
-  metadata.gz: e7642e5e7c50e9b4da7f28bf906bf432ee7c6d377839af2ff2b73d299c044e08566876bd3d33d5492abe9432b58f0871df2ebb9d4e7ec45ffe02a495fad1e2fc
-  data.tar.gz: '0629e81b1c9cf8070fb0a1fcbe409cf026bb62f72da9855d32fe9f253552de51a21f663b616abdc3b474dd705d9067512838fe5f0e33293f423636a5ea18167f'
+  metadata.gz: 152ebdc2322f9a8b6c0cad2cb303a093a45d5e0ecc17b519904e40e069a747b56e33f1ddd33f7f3efb32031d78808d05e32d93ab151572b973a1324f9e676e0b
+  data.tar.gz: 63c189a79b484777c25f5c1a7951d930fc2d110f3547216b2fd099469e57e7a062c0ec64ba2c7b0c3d7e88a6fb5f1f40d3b5ba6d1a0803acfc5253b00f43dfe8

data/jrf.gemspec CHANGED Viewed

@@ -16,6 +16,8 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = ["jrf"]
   spec.add_dependency "oj", ">= 3.16"
+  spec.add_development_dependency "minitest", ">= 5.0"
+  spec.add_development_dependency "rake", ">= 13.0"
   spec.files = Dir.glob("{exe,lib,test}/*") + Dir.glob("lib/**/*") + %w[DESIGN.txt jrf.gemspec Gemfile Rakefile].select { |path| File.file?(path) }
 end

data/lib/jrf/cli/runner.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "json"
+require "zlib"
 require_relative "../pipeline"
 require_relative "../pipeline_parser"
@@ -9,6 +10,7 @@ module Jrf
     class Runner
       RS_CHAR = "\x1e"
       DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
+      PARALLEL_FRAME_HEADER_BYTES = 4
       class RsNormalizer
         def initialize(input)
@@ -28,56 +30,293 @@ module Jrf
         end
       end
-      def initialize(inputs:, out: $stdout, err: $stderr, lax: false, pretty: false, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
-        @inputs = inputs
+      class ParallelFrameReader
+        def initialize
+          @buf = +""
+          @offset = 0
+        end
+        def append(chunk)
+          @buf << chunk
+        end
+        def each_payload
+          while (payload = next_payload)
+            yield payload
+          end
+        end
+        def has_partial?
+          @offset != @buf.bytesize
+        end
+        private
+        def next_payload
+          if @buf.bytesize - @offset < PARALLEL_FRAME_HEADER_BYTES
+            compact!
+            return nil
+          end
+          payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
+          frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
+          if @buf.bytesize - @offset < frame_len
+            compact!
+            return nil
+          end
+          payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
+          @offset += frame_len
+          payload
+        end
+        def compact!
+          if @offset > 0
+            @buf = @buf.byteslice(@offset..) || +""
+            @offset = 0
+          end
+        end
+      end
+      def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
+        if input.is_a?(Array)
+          @file_paths = input
+          @stdin = nil
+        else
+          @file_paths = []
+          @stdin = input
+        end
         @out = out
         @err = err
         @lax = lax
-        @pretty = pretty
+        @output_format = output_format
         @atomic_write_bytes = atomic_write_bytes
         @output_buffer = +""
+        @input_errors = false
+      end
+      def input_errors?
+        @input_errors
+      end
+      def run(expression, parallel: 1, verbose: false)
+        blocks = build_stage_blocks(expression, verbose: verbose)
+        if @output_format == :tsv
+          values = []
+          process_values(blocks, parallel: parallel, verbose: verbose) do |value|
+            values << value
+          end
+          emit_tsv(values)
+        else
+          process_values(blocks, parallel: parallel, verbose: verbose) do |value|
+            emit_output(value)
+          end
+        end
+      ensure
+        write_output(@output_buffer)
       end
-      def run(expression, verbose: false)
+      private
+      def build_stage_blocks(expression, verbose:)
         parsed = PipelineParser.new(expression).parse
         stages = parsed[:stages]
         dump_stages(stages) if verbose
-        blocks = stages.map { |stage|
+        stages.map { |stage|
           eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
         }
+      end
+      def apply_pipeline(blocks, input_enum)
         pipeline = Pipeline.new(*blocks)
+        Enumerator.new do |y|
+          pipeline.call(input_enum) { |value| y << value }
+        end
+      end
+      def each_input_enum
+        Enumerator.new { |y| each_input_value { |v| y << v } }
+      end
-        input_enum = Enumerator.new { |y| each_input_value { |v| y << v } }
-        pipeline.call(input_enum) do |value|
-          emit_output(value)
+      def process_values(blocks, parallel:, verbose:, &block)
+        if parallel <= 1 || @file_paths.length <= 1
+          dump_parallel_status("disabled", verbose: verbose)
+          return apply_pipeline(blocks, each_input_enum).each(&block)
         end
-      ensure
-        write_output(@output_buffer)
+        # Parallelize the longest map-only prefix; reducers stay in the parent.
+        split_index = classify_parallel_stages(blocks)
+        if split_index.nil? || split_index == 0
+          dump_parallel_status("disabled", verbose: verbose)
+          return apply_pipeline(blocks, each_input_enum).each(&block)
+        end
+        map_blocks = blocks[0...split_index]
+        reduce_blocks = blocks[split_index..]
+        dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
+        input_enum = parallel_map_enum(map_blocks, parallel)
+        (reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)).each(&block)
       end
-      private
+      def dump_parallel_status(status, verbose:)
+        @err.puts "parallel: #{status}" if verbose
+      end
-      def each_input_value
-        return each_input_value_lax { |value| yield value } if @lax
+      def classify_parallel_stages(blocks)
+        # Read the first row from the first file to probe stage modes
+        first_value = nil
+        open_file(@file_paths.first) do |stream|
+          each_stream_value(stream) do |value|
+            first_value = value
+            break
+          end
+        end
+        return nil if first_value.nil?
-        each_input_value_ndjson { |value| yield value }
+        # Run the value through each stage independently to classify
+        split_index = nil
+        blocks.each_with_index do |block, i|
+          probe_pipeline = Pipeline.new(block)
+          probe_pipeline.call([first_value]) { |_| }
+          stage = probe_pipeline.instance_variable_get(:@stages).first
+          if stage.instance_variable_get(:@mode) == :reducer
+            split_index = i
+            break
+          end
+        end
+        split_index || blocks.length
       end
-      def each_input_value_ndjson
-        each_input do |source|
-          source.each_line do |raw_line|
-            line = raw_line.strip
-            next if line.empty?
+      def spawn_parallel_worker(blocks, path)
+        read_io, write_io = IO.pipe
+        pid = fork do
+          read_io.close
+          @out = write_io
+          @output_buffer = +""
+          pipeline = Pipeline.new(*blocks)
+          input_enum = Enumerator.new do |y|
+            open_file(path) { |stream| each_stream_value(stream) { |v| y << v } }
+          end
+          worker_failed = false
+          begin
+            pipeline.call(input_enum) { |value| emit_parallel_frame(value) }
+          rescue => e
+            @err.puts "#{path}: #{e.message} (#{e.class})"
+            worker_failed = true
+          end
+          write_output(@output_buffer)
+          write_io.close
+          exit!(worker_failed ? 1 : 0)
+        end
+        write_io.close
+        [read_io, pid]
+      end
-            yield JSON.parse(line)
+      def run_parallel_worker_pool(blocks, num_workers)
+        file_queue = @file_paths.dup
+        workers = {} # read_io => [reader, pid]
+        children = []
+        # Fill initial pool
+        while workers.size < num_workers && !file_queue.empty?
+          read_io, pid = spawn_parallel_worker(blocks, file_queue.shift)
+          workers[read_io] = [ParallelFrameReader.new, pid]
+          children << pid
+        end
+        read_ios = workers.keys.dup
+        until read_ios.empty?
+          ready = IO.select(read_ios)
+          ready[0].each do |io|
+            reader = workers[io][0]
+            chunk = io.read_nonblock(65536, exception: false)
+            if chunk == :wait_readable
+              next
+            elsif chunk.nil?
+              raise IOError, "truncated parallel frame from worker" if reader.has_partial?
+              read_ios.delete(io)
+              io.close
+              workers.delete(io)
+              # Spawn next worker if files remain
+              unless file_queue.empty?
+                read_io, pid = spawn_parallel_worker(blocks, file_queue.shift)
+                workers[read_io] = [ParallelFrameReader.new, pid]
+                children << pid
+                read_ios << read_io
+              end
+            else
+              reader.append(chunk)
+              reader.each_payload do |payload|
+                yield JSON.parse(payload)
+              end
+            end
           end
         end
+        children
+      end
+      def parallel_map_enum(map_blocks, num_workers)
+        children = nil
+        Enumerator.new do |y|
+          children = run_parallel_worker_pool(map_blocks, num_workers) { |value| y << value }
+        ensure
+          wait_for_parallel_children(children) if children
+        end
+      end
+      def wait_for_parallel_children(children)
+        failed = false
+        children.each do |pid|
+          _, status = Process.waitpid2(pid)
+          failed = true unless status.success?
+        end
+        exit(1) if failed
+      end
+      def emit_parallel_frame(value)
+        payload = JSON.generate(value)
+        buffer_output([payload.bytesize].pack("N") << payload)
+      end
+      def each_input_value
+        each_input do |source|
+          each_stream_value(source) { |value| yield value }
+        end
+      end
+      def each_stream_value(stream)
+        return each_stream_value_lax(stream) { |value| yield value } if @lax
+        stream.each_line do |raw_line|
+          line = raw_line.strip
+          next if line.empty?
+          yield JSON.parse(line)
+        end
       end
-      def each_input_value_lax
+      def open_file(path)
+        if path.end_with?(".gz")
+          Zlib::GzipReader.open(path) { |source| yield source }
+        else
+          File.open(path, "rb") { |source| yield source }
+        end
+      end
+      def each_stream_value_lax(stream)
         require "oj"
-        handler = Class.new(Oj::ScHandler) do
+        Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
+      rescue LoadError
+        raise "oj is required for --lax mode (gem install oj)"
+      rescue Oj::ParseError => e
+        raise JSON::ParserError, e.message
+      end
+      def streaming_json_handler_class
+        @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
           def initialize(&emit)
             @emit = emit
           end
@@ -89,13 +328,6 @@ module Jrf
           def array_append(array, value) = array << value
           def add_value(value) = @emit.call(value)
         end
-        each_input do |source|
-          Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
-        end
-      rescue LoadError
-        raise "oj is required for --lax mode (gem install oj)"
-      rescue Oj::ParseError => e
-        raise JSON::ParserError, e.message
       end
       def dump_stages(stages)
@@ -104,12 +336,76 @@ module Jrf
         end
       end
-      def each_input
-        @inputs.each { |source| yield source }
+      def each_input(&block)
+        if @file_paths.empty?
+          with_error_handling("<stdin>") { block.call(@stdin) }
+        else
+          @file_paths.each do |path|
+            if path == "-"
+              with_error_handling("<stdin>") { block.call(@stdin) }
+            else
+              with_error_handling(path) { open_file(path, &block) }
+            end
+          end
+        end
+      end
+      def with_error_handling(name)
+        yield
+      rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => e
+        @err.puts "#{name}: #{e.message} (#{e.class})"
+        @input_errors = true
       end
       def emit_output(value)
-        record = (@pretty ? JSON.pretty_generate(value) : JSON.generate(value)) << "\n"
+        record = (@output_format == :pretty ? JSON.pretty_generate(value) : JSON.generate(value)) << "\n"
+        buffer_output(record)
+      end
+      def emit_tsv(values)
+        rows = values.flat_map { |value| value_to_rows(value) }
+        rows.each do |row|
+          buffer_output(row.join("\t") << "\n")
+        end
+      end
+      def value_to_rows(value)
+        case value
+        when Hash
+          value.map { |k, v|
+            case v
+            when Array
+              [format_cell(k)] + v.map { |e| format_cell(e) }
+            else
+              [format_cell(k), format_cell(v)]
+            end
+          }
+        when Array
+          value.map { |row|
+            case row
+            when Array
+              row.map { |e| format_cell(e) }
+            else
+              [format_cell(row)]
+            end
+          }
+        else
+          [[format_cell(value)]]
+        end
+      end
+      def format_cell(value)
+        case value
+        when nil
+          "null"
+        when Numeric, String, true, false
+          value.to_s
+        else
+          JSON.generate(value)
+        end
+      end
+      def buffer_output(record)
         if @output_buffer.bytesize + record.bytesize <= @atomic_write_bytes
           @output_buffer << record
         else
@@ -119,7 +415,13 @@ module Jrf
       end
       def write_output(str)
-        @out.syswrite(str)
+        return if str.empty?
+        total = 0
+        while total < str.bytesize
+          written = @out.syswrite(str.byteslice(total..))
+          total += written
+        end
       end
     end
   end

data/lib/jrf/cli.rb CHANGED Viewed

@@ -16,7 +16,9 @@ module Jrf
       Options:
         -v, --verbose  print parsed stage expressions
         --lax          allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
-        -p, --pretty   pretty-print JSON output instead of compact NDJSON
+        -o, --output FORMAT
+                       output format: json (default), pretty, tsv
+        -P N           opportunistically parallelize the map-prefix across N workers
         -r, --require LIBRARY
                        require LIBRARY before evaluating stages
         --no-jit       do not enable YJIT, even when supported by the Ruby runtime
@@ -43,7 +45,8 @@ module Jrf
     def self.run(argv = ARGV, input: ARGF, out: $stdout, err: $stderr)
       verbose = false
       lax = false
-      pretty = false
+      output_format = :json
+      parallel = 1
       jit = true
       required_libraries = []
       atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
@@ -52,7 +55,8 @@ module Jrf
           opts.banner = USAGE
           opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
           opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
-          opts.on("-p", "--pretty", "pretty-print JSON output instead of compact NDJSON") { pretty = true }
+          opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
+          opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
           opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
           opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
           opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
@@ -88,34 +92,20 @@ module Jrf
       enable_yjit if jit
       required_libraries.each { |library| require library }
-      inputs = Enumerator.new do |y|
-        if argv.empty?
-          y << input
-        else
-          argv.each do |path|
-            if path == "-"
-              y << input
-            elsif path.end_with?(".gz")
-              require "zlib"
-              Zlib::GzipReader.open(path) do |source|
-                y << source
-              end
-            else
-              File.open(path, "rb") do |source|
-                y << source
-              end
-            end
-          end
-        end
-      end
-      Runner.new(
-        inputs: inputs,
+      file_paths = argv.dup
+      runner = Runner.new(
+        input: file_paths.empty? ? input : file_paths,
         out: out,
         err: err,
         lax: lax,
-        pretty: pretty,
+        output_format: output_format,
         atomic_write_bytes: atomic_write_bytes
-      ).run(expression, verbose: verbose)
+      )
+      runner.run(expression, parallel: parallel, verbose: verbose)
+      exit 1 if runner.input_errors?
     end
     def self.enable_yjit

data/lib/jrf/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Jrf
-  VERSION = "0.1.12"
+  VERSION = "0.1.14"
 end