jrf 0.1.17 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2468eef61c2691e368b10cd5077ba3559430766d3a6001dea88da10793800ff6
4
- data.tar.gz: ccce0fb9a0ff3c6e77b669d6908a42147431a0ade44c8365765ad84ca91a111b
3
+ metadata.gz: 64a9372251878badf67b869ceb4c75dde46411df8bda698afdb5bdd8463bfeb8
4
+ data.tar.gz: 6ca1ef73d871eb63739a5ad365bfbb2c9a83065a12749a9e9277cfa8a1549571
5
5
  SHA512:
6
- metadata.gz: 8e11dd7b55c48f80164a3f8c7a1ca5fef51204b098382802144bd757da150b39c2f69d0b69f718ba70da08f0bd88cc19b2c86488040e2062b566bb093f3bca88
7
- data.tar.gz: f43bdabcfa9728e5fbc7c4a4a8be1044fe4175a7650a9aee01b14d91e1fdee1ea3c58297c2bf6602e8bf32d16432cf34daf6eb8e9d9863d4e2882e2d662d0c3c
6
+ metadata.gz: 447e4a2f5ad0330ab7c815abd54bf9844bec4514c359ad63767b184831f5d8361796e02a5992f27a7b928fae57b3af6fa6a46fffa658c6f70f6053e11e6db7cb
7
+ data.tar.gz: d0908c86484886b68b685c52f148e98925f8027b9e9369eb984a74c2c5eb86810ab5428b50087fa45ddef317f5003e9b494e1206f8e23ef0260bc0e6c0e704ea
data/lib/jrf/cli/runner.rb CHANGED
@@ -1,35 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "json"
4
- require "zlib"
4
+ require_relative "../input_reader"
5
5
  require_relative "../pipeline"
6
6
  require_relative "../pipeline_parser"
7
7
 
8
8
  module Jrf
9
9
  class CLI
10
10
  class Runner
11
- RS_CHAR = "\x1e"
12
11
  DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
13
12
  PARALLEL_FRAME_HEADER_BYTES = 4
14
13
 
15
- class RsNormalizer
16
- def initialize(input)
17
- @input = input
18
- end
19
-
20
- def read(length = nil, outbuf = nil)
21
- chunk = @input.read(length)
22
- return nil if chunk.nil?
23
-
24
- chunk.tr!(RS_CHAR, "\n")
25
- if outbuf
26
- outbuf.replace(chunk)
27
- else
28
- chunk
29
- end
30
- end
31
- end
32
-
33
14
  class ParallelFrameReader
34
15
  def initialize
35
16
  @buf = +""
@@ -353,46 +334,12 @@ module Jrf
353
334
  end
354
335
  end
355
336
 
356
- def each_stream_value(stream)
357
- return each_stream_value_lax(stream) { |value| yield value } if @lax
358
-
359
- stream.each_line do |line|
360
- line.strip!
361
- next if line.empty?
362
- yield JSON.parse(line)
363
- end
364
- end
365
-
366
- def open_file(path)
367
- if path.end_with?(".gz")
368
- Zlib::GzipReader.open(path) { |source| yield source }
369
- else
370
- File.open(path, "rb") { |source| yield source }
371
- end
337
+ def each_stream_value(stream, &block)
338
+ InputReader.each_value(stream, lax: @lax, &block)
372
339
  end
373
340
 
374
- def each_stream_value_lax(stream)
375
- require "oj"
376
- Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
377
- rescue LoadError
378
- raise "oj is required for --lax mode (gem install oj)"
379
- rescue Oj::ParseError => e
380
- raise JSON::ParserError, e.message
381
- end
382
-
383
- def streaming_json_handler_class
384
- @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
385
- def initialize(&emit)
386
- @emit = emit
387
- end
388
-
389
- def hash_start = {}
390
- def hash_key(key) = key
391
- def hash_set(hash, key, value) = hash[key] = value
392
- def array_start = []
393
- def array_append(array, value) = array << value
394
- def add_value(value) = @emit.call(value)
395
- end
341
+ def open_file(path, &block)
342
+ InputReader.open_path(path, &block)
396
343
  end
397
344
 
398
345
  def dump_stages(stages)
data/lib/jrf/input_reader.rb ADDED
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "zlib"
5
+
6
+ module Jrf
7
+ # File and stream input reading for jrf pipelines.
8
+ #
9
+ # Used by both the CLI runner and Pipeline#read to share gzip auto-detection,
10
+ # strict NDJSON parsing, and (lazily loaded) --lax multiline parsing.
11
+ module InputReader
12
+ RS_CHAR = "\x1e"
13
+
14
+ module_function
15
+
16
+ def open_path(path, &block)
17
+ if path.end_with?(".gz")
18
+ Zlib::GzipReader.open(path, &block)
19
+ else
20
+ File.open(path, "rb", &block)
21
+ end
22
+ end
23
+
24
+ def each_value(stream, lax: false, &block)
25
+ if lax
26
+ each_value_lax(stream, &block)
27
+ else
28
+ stream.each_line do |line|
29
+ line.strip!
30
+ next if line.empty?
31
+ block.call(JSON.parse(line))
32
+ end
33
+ end
34
+ end
35
+
36
+ def each_value_lax(stream, &block)
37
+ require "oj"
38
+ Oj.sc_parse(streaming_handler_class.new(&block), RsNormalizer.new(stream))
39
+ rescue LoadError
40
+ raise "oj is required for --lax mode (gem install oj)"
41
+ rescue Oj::ParseError => e
42
+ raise JSON::ParserError, e.message
43
+ end
44
+
45
+ def streaming_handler_class
46
+ @streaming_handler_class ||= Class.new(Oj::ScHandler) do
47
+ def initialize(&emit)
48
+ @emit = emit
49
+ end
50
+
51
+ def hash_start = {}
52
+ def hash_key(key) = key
53
+ def hash_set(hash, key, value) = hash[key] = value
54
+ def array_start = []
55
+ def array_append(array, value) = array << value
56
+ def add_value(value) = @emit.call(value)
57
+ end
58
+ end
59
+
60
+ # Translates JSON-SEQ record separators (RS, 0x1e) to newlines so the
61
+ # underlying Oj scanner sees a stream of whitespace-delimited values.
62
+ class RsNormalizer
63
+ def initialize(input)
64
+ @input = input
65
+ end
66
+
67
+ def read(length = nil, outbuf = nil)
68
+ chunk = @input.read(length)
69
+ return nil if chunk.nil?
70
+
71
+ chunk.tr!(RS_CHAR, "\n")
72
+ if outbuf
73
+ outbuf.replace(chunk)
74
+ else
75
+ chunk
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end
data/lib/jrf/pipeline.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "control"
4
+ require_relative "input_reader"
4
5
  require_relative "row_context"
5
6
  require_relative "stage"
6
7
 
@@ -9,8 +10,35 @@ module Jrf
9
10
  def initialize(*blocks)
10
11
  raise ArgumentError, "at least one stage block is required" if blocks.empty?
11
12
 
12
- @ctx = RowContext.new
13
- @stages = blocks.map { |block| Stage.new(@ctx, block, src: nil) }
13
+ @stages = blocks.map { |block| Stage.new(block, src: nil) }
14
+ end
15
+
16
+ # Run the pipeline on one or more files, mirroring how the CLI reads its
17
+ # file arguments: each path is opened (with .gz auto-decompression) and
18
+ # parsed as NDJSON. Pass +lax: true+ for multiline JSON / JSON-SEQ input.
19
+ #
20
+ # Without a block, returns an Array of output values; with a block, streams
21
+ # each output value to the block.
22
+ #
23
+ # @param paths [Array<String>] one or more file paths
24
+ # @param lax [Boolean] enable lax (multiline / whitespace-delimited) parsing
25
+ # @yieldparam value output value
26
+ # @return [Array, nil] output values (without block), or nil (with block)
27
+ # @example Build a lookup hash from one file, use it to filter another
28
+ # lookup = Jrf.new(
29
+ # proc { reduce({}) { |a, v| a[[v["tid"], v["conn"]]] = v["late_acked"]; a } }
30
+ # ).read("conn_stats.ndjson").first
31
+ def read(*paths, lax: false, &on_output)
32
+ raise ArgumentError, "at least one path is required" if paths.empty?
33
+
34
+ input = Enumerator.new do |y|
35
+ paths.each do |path|
36
+ InputReader.open_path(path) do |stream|
37
+ InputReader.each_value(stream, lax: lax) { |value| y << value }
38
+ end
39
+ end
40
+ end
41
+ call(input, &on_output)
14
42
  end
15
43
 
16
44
  # Run the pipeline on an enumerable of input values.
data/lib/jrf/stage.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "control"
4
+ require_relative "row_context"
4
5
  require_relative "reducers"
5
6
 
6
7
  module Jrf
@@ -22,22 +23,23 @@ module Jrf
22
23
  end
23
24
  end
24
25
 
25
- def initialize(ctx, block, src: nil)
26
- @ctx = ctx
27
- @block = block
26
+ def initialize(block, src: nil)
28
27
  @src = src
29
28
  @reducers = []
30
29
  @cursor = 0
31
30
  @template = nil
32
31
  @mode = nil # nil=unknown, :reducer, :passthrough
33
32
  @map_transforms = {}
33
+ @ctx = Class.new(RowContext) do
34
+ define_method(:__jrf_expr__, &block)
35
+ end.new
34
36
  end
35
37
 
36
38
  def call(input)
37
39
  @ctx.reset(input)
38
40
  @cursor = 0
39
41
  @ctx.__jrf_current_stage = self
40
- result = @ctx.instance_eval(&@block)
42
+ result = @ctx.__jrf_expr__
41
43
 
42
44
  if @mode.nil?
43
45
  if @reducers.any?
data/lib/jrf/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Jrf
4
- VERSION = "0.1.17"
4
+ VERSION = "0.1.18"
5
5
  end
@@ -113,9 +113,40 @@ class LibraryApiTest < JrfTestCase
113
113
  assert_equal([], j.call([]), "library empty input")
114
114
  end
115
115
 
116
+ def test_read_from_files
117
+ Dir.mktmpdir do |dir|
118
+ plain = File.join(dir, "a.ndjson")
119
+ File.write(plain, %({"a":1}\n{"a":2}\n\n{"a":3}\n))
120
+
121
+ j = Jrf.new(proc { _["a"] })
122
+ assert_equal([1, 2, 3], j.read(plain), "library read NDJSON returns array")
123
+
124
+ streamed = []
125
+ result = j.read(plain) { |v| streamed << v }
126
+ assert_nil(result, "library read with block returns nil")
127
+ assert_equal([1, 2, 3], streamed, "library read with block streams values")
128
+
129
+ reducer = Jrf.new(proc { sum(_["a"]) })
130
+ assert_equal([6], reducer.read(plain), "library read drives reducers to completion")
131
+
132
+ second = File.join(dir, "b.ndjson")
133
+ File.write(second, %({"a":10}\n{"a":20}\n))
134
+ assert_equal([1, 2, 3, 10, 20], j.read(plain, second), "library read concatenates multiple paths")
135
+
136
+ gz_path = File.join(dir, "c.ndjson.gz")
137
+ Zlib::GzipWriter.open(gz_path) { |gz| gz.write(%({"a":100}\n{"a":200}\n)) }
138
+ assert_equal([100, 200], j.read(gz_path), "library read auto-decompresses .gz")
139
+
140
+ lax_path = File.join(dir, "d.json")
141
+ File.write(lax_path, %({"a":1}\n{\n "a": 2\n}\n))
142
+ assert_equal([1, 2], j.read(lax_path, lax: true), "library read supports lax multiline mode")
143
+
144
+ assert_raises(ArgumentError) { j.read }
145
+ end
146
+ end
147
+
116
148
  def test_stage_reduce_control_tokens
117
- ctx = Jrf::RowContext.new
118
- stage = Jrf::Stage.new(ctx, proc { })
149
+ stage = Jrf::Stage.new(proc { })
119
150
  first_token = stage.step_reduce(1, initial: 0) { |acc, v| acc + v }
120
151
  assert_equal(0, first_token.index, "step_reduce returns token while classifying reducer stage")
121
152
  stage.instance_variable_set(:@mode, :reducer)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jrf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - kazuho
@@ -69,6 +69,7 @@ files:
69
69
  - lib/jrf/cli.rb
70
70
  - lib/jrf/cli/runner.rb
71
71
  - lib/jrf/control.rb
72
+ - lib/jrf/input_reader.rb
72
73
  - lib/jrf/pipeline.rb
73
74
  - lib/jrf/pipeline_parser.rb
74
75
  - lib/jrf/reducers.rb