RubyGems - jrf - Versions diffs - 0.1.17 → 0.1.18 - Mend

jrf 0.1.17 → 0.1.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (8)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2468eef61c2691e368b10cd5077ba3559430766d3a6001dea88da10793800ff6
-  data.tar.gz: ccce0fb9a0ff3c6e77b669d6908a42147431a0ade44c8365765ad84ca91a111b
+  metadata.gz: 64a9372251878badf67b869ceb4c75dde46411df8bda698afdb5bdd8463bfeb8
+  data.tar.gz: 6ca1ef73d871eb63739a5ad365bfbb2c9a83065a12749a9e9277cfa8a1549571
 SHA512:
-  metadata.gz: 8e11dd7b55c48f80164a3f8c7a1ca5fef51204b098382802144bd757da150b39c2f69d0b69f718ba70da08f0bd88cc19b2c86488040e2062b566bb093f3bca88
-  data.tar.gz: f43bdabcfa9728e5fbc7c4a4a8be1044fe4175a7650a9aee01b14d91e1fdee1ea3c58297c2bf6602e8bf32d16432cf34daf6eb8e9d9863d4e2882e2d662d0c3c
+  metadata.gz: 447e4a2f5ad0330ab7c815abd54bf9844bec4514c359ad63767b184831f5d8361796e02a5992f27a7b928fae57b3af6fa6a46fffa658c6f70f6053e11e6db7cb
+  data.tar.gz: d0908c86484886b68b685c52f148e98925f8027b9e9369eb984a74c2c5eb86810ab5428b50087fa45ddef317f5003e9b494e1206f8e23ef0260bc0e6c0e704ea

data/lib/jrf/cli/runner.rb CHANGED Viewed

@@ -1,35 +1,16 @@
 # frozen_string_literal: true
 require "json"
-require "zlib"
+require_relative "../input_reader"
 require_relative "../pipeline"
 require_relative "../pipeline_parser"
 module Jrf
   class CLI
     class Runner
-      RS_CHAR = "\x1e"
       DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
       PARALLEL_FRAME_HEADER_BYTES = 4
-      class RsNormalizer
-        def initialize(input)
-          @input = input
-        end
-        def read(length = nil, outbuf = nil)
-          chunk = @input.read(length)
-          return nil if chunk.nil?
-          chunk.tr!(RS_CHAR, "\n")
-          if outbuf
-            outbuf.replace(chunk)
-          else
-            chunk
-          end
-        end
-      end
       class ParallelFrameReader
         def initialize
           @buf = +""
@@ -353,46 +334,12 @@ module Jrf
         end
       end
-      def each_stream_value(stream)
-        return each_stream_value_lax(stream) { |value| yield value } if @lax
-        stream.each_line do |line|
-          line.strip!
-          next if line.empty?
-          yield JSON.parse(line)
-        end
-      end
-      def open_file(path)
-        if path.end_with?(".gz")
-          Zlib::GzipReader.open(path) { |source| yield source }
-        else
-          File.open(path, "rb") { |source| yield source }
-        end
+      def each_stream_value(stream, &block)
+        InputReader.each_value(stream, lax: @lax, &block)
       end
-      def each_stream_value_lax(stream)
-        require "oj"
-        Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
-      rescue LoadError
-        raise "oj is required for --lax mode (gem install oj)"
-      rescue Oj::ParseError => e
-        raise JSON::ParserError, e.message
-      end
-      def streaming_json_handler_class
-        @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
-          def initialize(&emit)
-            @emit = emit
-          end
-          def hash_start = {}
-          def hash_key(key) = key
-          def hash_set(hash, key, value) = hash[key] = value
-          def array_start = []
-          def array_append(array, value) = array << value
-          def add_value(value) = @emit.call(value)
-        end
+      def open_file(path, &block)
+        InputReader.open_path(path, &block)
       end
       def dump_stages(stages)

data/lib/jrf/input_reader.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+require "json"
+require "zlib"
+module Jrf
+  # File and stream input reading for jrf pipelines.
+  #
+  # Used by both the CLI runner and Pipeline#read to share gzip auto-detection,
+  # strict NDJSON parsing, and (lazily loaded) --lax multiline parsing.
+  module InputReader
+    RS_CHAR = "\x1e"
+    module_function
+    def open_path(path, &block)
+      if path.end_with?(".gz")
+        Zlib::GzipReader.open(path, &block)
+      else
+        File.open(path, "rb", &block)
+      end
+    end
+    def each_value(stream, lax: false, &block)
+      if lax
+        each_value_lax(stream, &block)
+      else
+        stream.each_line do |line|
+          line.strip!
+          next if line.empty?
+          block.call(JSON.parse(line))
+        end
+      end
+    end
+    def each_value_lax(stream, &block)
+      require "oj"
+      Oj.sc_parse(streaming_handler_class.new(&block), RsNormalizer.new(stream))
+    rescue LoadError
+      raise "oj is required for --lax mode (gem install oj)"
+    rescue Oj::ParseError => e
+      raise JSON::ParserError, e.message
+    end
+    def streaming_handler_class
+      @streaming_handler_class ||= Class.new(Oj::ScHandler) do
+        def initialize(&emit)
+          @emit = emit
+        end
+        def hash_start = {}
+        def hash_key(key) = key
+        def hash_set(hash, key, value) = hash[key] = value
+        def array_start = []
+        def array_append(array, value) = array << value
+        def add_value(value) = @emit.call(value)
+      end
+    end
+    # Translates JSON-SEQ record separators (RS, 0x1e) to newlines so the
+    # underlying Oj scanner sees a stream of whitespace-delimited values.
+    class RsNormalizer
+      def initialize(input)
+        @input = input
+      end
+      def read(length = nil, outbuf = nil)
+        chunk = @input.read(length)
+        return nil if chunk.nil?
+        chunk.tr!(RS_CHAR, "\n")
+        if outbuf
+          outbuf.replace(chunk)
+        else
+          chunk
+        end
+      end
+    end
+  end
+end

data/lib/jrf/pipeline.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative "control"
+require_relative "input_reader"
 require_relative "row_context"
 require_relative "stage"
@@ -9,8 +10,35 @@ module Jrf
     def initialize(*blocks)
       raise ArgumentError, "at least one stage block is required" if blocks.empty?
-      @ctx = RowContext.new
-      @stages = blocks.map { |block| Stage.new(@ctx, block, src: nil) }
+      @stages = blocks.map { |block| Stage.new(block, src: nil) }
+    end
+    # Run the pipeline on one or more files, mirroring how the CLI reads its
+    # file arguments: each path is opened (with .gz auto-decompression) and
+    # parsed as NDJSON. Pass +lax: true+ for multiline JSON / JSON-SEQ input.
+    #
+    # Without a block, returns an Array of output values; with a block, streams
+    # each output value to the block.
+    #
+    # @param paths [Array<String>] one or more file paths
+    # @param lax [Boolean] enable lax (multiline / whitespace-delimited) parsing
+    # @yieldparam value output value
+    # @return [Array, nil] output values (without block), or nil (with block)
+    # @example Build a lookup hash from one file, use it to filter another
+    #   lookup = Jrf.new(
+    #     proc { reduce({}) { |a, v| a[[v["tid"], v["conn"]]] = v["late_acked"]; a } }
+    #   ).read("conn_stats.ndjson").first
+    def read(*paths, lax: false, &on_output)
+      raise ArgumentError, "at least one path is required" if paths.empty?
+      input = Enumerator.new do |y|
+        paths.each do |path|
+          InputReader.open_path(path) do |stream|
+            InputReader.each_value(stream, lax: lax) { |value| y << value }
+          end
+        end
+      end
+      call(input, &on_output)
     end
     # Run the pipeline on an enumerable of input values.

data/lib/jrf/stage.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative "control"
+require_relative "row_context"
 require_relative "reducers"
 module Jrf
@@ -22,22 +23,23 @@ module Jrf
       end
     end
-    def initialize(ctx, block, src: nil)
-      @ctx = ctx
-      @block = block
+    def initialize(block, src: nil)
       @src = src
       @reducers = []
       @cursor = 0
       @template = nil
       @mode = nil # nil=unknown, :reducer, :passthrough
       @map_transforms = {}
+      @ctx = Class.new(RowContext) do
+        define_method(:__jrf_expr__, &block)
+      end.new
     end
     def call(input)
       @ctx.reset(input)
       @cursor = 0
       @ctx.__jrf_current_stage = self
-      result = @ctx.instance_eval(&@block)
+      result = @ctx.__jrf_expr__
       if @mode.nil?
         if @reducers.any?

data/lib/jrf/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Jrf
-  VERSION = "0.1.17"
+  VERSION = "0.1.18"
 end

data/test/library_api_test.rb CHANGED Viewed

@@ -113,9 +113,40 @@ class LibraryApiTest < JrfTestCase
     assert_equal([], j.call([]), "library empty input")
   end
+  def test_read_from_files
+    Dir.mktmpdir do |dir|
+      plain = File.join(dir, "a.ndjson")
+      File.write(plain, %({"a":1}\n{"a":2}\n\n{"a":3}\n))
+      j = Jrf.new(proc { _["a"] })
+      assert_equal([1, 2, 3], j.read(plain), "library read NDJSON returns array")
+      streamed = []
+      result = j.read(plain) { |v| streamed << v }
+      assert_nil(result, "library read with block returns nil")
+      assert_equal([1, 2, 3], streamed, "library read with block streams values")
+      reducer = Jrf.new(proc { sum(_["a"]) })
+      assert_equal([6], reducer.read(plain), "library read drives reducers to completion")
+      second = File.join(dir, "b.ndjson")
+      File.write(second, %({"a":10}\n{"a":20}\n))
+      assert_equal([1, 2, 3, 10, 20], j.read(plain, second), "library read concatenates multiple paths")
+      gz_path = File.join(dir, "c.ndjson.gz")
+      Zlib::GzipWriter.open(gz_path) { |gz| gz.write(%({"a":100}\n{"a":200}\n)) }
+      assert_equal([100, 200], j.read(gz_path), "library read auto-decompresses .gz")
+      lax_path = File.join(dir, "d.json")
+      File.write(lax_path, %({"a":1}\n{\n  "a": 2\n}\n))
+      assert_equal([1, 2], j.read(lax_path, lax: true), "library read supports lax multiline mode")
+      assert_raises(ArgumentError) { j.read }
+    end
+  end
   def test_stage_reduce_control_tokens
-    ctx = Jrf::RowContext.new
-    stage = Jrf::Stage.new(ctx, proc { })
+    stage = Jrf::Stage.new(proc { })
     first_token = stage.step_reduce(1, initial: 0) { |acc, v| acc + v }
     assert_equal(0, first_token.index, "step_reduce returns token while classifying reducer stage")
     stage.instance_variable_set(:@mode, :reducer)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jrf
 version: !ruby/object:Gem::Version
-  version: 0.1.17
+  version: 0.1.18
 platform: ruby
 authors:
 - kazuho
@@ -69,6 +69,7 @@ files:
 - lib/jrf/cli.rb
 - lib/jrf/cli/runner.rb
 - lib/jrf/control.rb
+- lib/jrf/input_reader.rb
 - lib/jrf/pipeline.rb
 - lib/jrf/pipeline_parser.rb
 - lib/jrf/reducers.rb