jrf 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jrf/cli/runner.rb +5 -58
- data/lib/jrf/input_reader.rb +80 -0
- data/lib/jrf/pipeline.rb +30 -2
- data/lib/jrf/stage.rb +6 -4
- data/lib/jrf/version.rb +1 -1
- data/test/library_api_test.rb +33 -2
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 64a9372251878badf67b869ceb4c75dde46411df8bda698afdb5bdd8463bfeb8
|
|
4
|
+
data.tar.gz: 6ca1ef73d871eb63739a5ad365bfbb2c9a83065a12749a9e9277cfa8a1549571
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 447e4a2f5ad0330ab7c815abd54bf9844bec4514c359ad63767b184831f5d8361796e02a5992f27a7b928fae57b3af6fa6a46fffa658c6f70f6053e11e6db7cb
|
|
7
|
+
data.tar.gz: d0908c86484886b68b685c52f148e98925f8027b9e9369eb984a74c2c5eb86810ab5428b50087fa45ddef317f5003e9b494e1206f8e23ef0260bc0e6c0e704ea
|
data/lib/jrf/cli/runner.rb
CHANGED
|
@@ -1,35 +1,16 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "json"
|
|
4
|
-
|
|
4
|
+
require_relative "../input_reader"
|
|
5
5
|
require_relative "../pipeline"
|
|
6
6
|
require_relative "../pipeline_parser"
|
|
7
7
|
|
|
8
8
|
module Jrf
|
|
9
9
|
class CLI
|
|
10
10
|
class Runner
|
|
11
|
-
RS_CHAR = "\x1e"
|
|
12
11
|
DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
|
|
13
12
|
PARALLEL_FRAME_HEADER_BYTES = 4
|
|
14
13
|
|
|
15
|
-
class RsNormalizer
|
|
16
|
-
def initialize(input)
|
|
17
|
-
@input = input
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
def read(length = nil, outbuf = nil)
|
|
21
|
-
chunk = @input.read(length)
|
|
22
|
-
return nil if chunk.nil?
|
|
23
|
-
|
|
24
|
-
chunk.tr!(RS_CHAR, "\n")
|
|
25
|
-
if outbuf
|
|
26
|
-
outbuf.replace(chunk)
|
|
27
|
-
else
|
|
28
|
-
chunk
|
|
29
|
-
end
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
|
|
33
14
|
class ParallelFrameReader
|
|
34
15
|
def initialize
|
|
35
16
|
@buf = +""
|
|
@@ -353,46 +334,12 @@ module Jrf
|
|
|
353
334
|
end
|
|
354
335
|
end
|
|
355
336
|
|
|
356
|
-
def each_stream_value(stream)
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
stream.each_line do |line|
|
|
360
|
-
line.strip!
|
|
361
|
-
next if line.empty?
|
|
362
|
-
yield JSON.parse(line)
|
|
363
|
-
end
|
|
364
|
-
end
|
|
365
|
-
|
|
366
|
-
def open_file(path)
|
|
367
|
-
if path.end_with?(".gz")
|
|
368
|
-
Zlib::GzipReader.open(path) { |source| yield source }
|
|
369
|
-
else
|
|
370
|
-
File.open(path, "rb") { |source| yield source }
|
|
371
|
-
end
|
|
337
|
+
def each_stream_value(stream, &block)
|
|
338
|
+
InputReader.each_value(stream, lax: @lax, &block)
|
|
372
339
|
end
|
|
373
340
|
|
|
374
|
-
def
|
|
375
|
-
|
|
376
|
-
Oj.sc_parse(streaming_json_handler_class.new { |value| yield value }, RsNormalizer.new(stream))
|
|
377
|
-
rescue LoadError
|
|
378
|
-
raise "oj is required for --lax mode (gem install oj)"
|
|
379
|
-
rescue Oj::ParseError => e
|
|
380
|
-
raise JSON::ParserError, e.message
|
|
381
|
-
end
|
|
382
|
-
|
|
383
|
-
def streaming_json_handler_class
|
|
384
|
-
@streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
|
|
385
|
-
def initialize(&emit)
|
|
386
|
-
@emit = emit
|
|
387
|
-
end
|
|
388
|
-
|
|
389
|
-
def hash_start = {}
|
|
390
|
-
def hash_key(key) = key
|
|
391
|
-
def hash_set(hash, key, value) = hash[key] = value
|
|
392
|
-
def array_start = []
|
|
393
|
-
def array_append(array, value) = array << value
|
|
394
|
-
def add_value(value) = @emit.call(value)
|
|
395
|
-
end
|
|
341
|
+
def open_file(path, &block)
|
|
342
|
+
InputReader.open_path(path, &block)
|
|
396
343
|
end
|
|
397
344
|
|
|
398
345
|
def dump_stages(stages)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "zlib"
|
|
5
|
+
|
|
6
|
+
module Jrf
|
|
7
|
+
# File and stream input reading for jrf pipelines.
|
|
8
|
+
#
|
|
9
|
+
# Used by both the CLI runner and Pipeline#read to share gzip auto-detection,
|
|
10
|
+
# strict NDJSON parsing, and (lazily loaded) --lax multiline parsing.
|
|
11
|
+
module InputReader
|
|
12
|
+
RS_CHAR = "\x1e"
|
|
13
|
+
|
|
14
|
+
module_function
|
|
15
|
+
|
|
16
|
+
def open_path(path, &block)
|
|
17
|
+
if path.end_with?(".gz")
|
|
18
|
+
Zlib::GzipReader.open(path, &block)
|
|
19
|
+
else
|
|
20
|
+
File.open(path, "rb", &block)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def each_value(stream, lax: false, &block)
|
|
25
|
+
if lax
|
|
26
|
+
each_value_lax(stream, &block)
|
|
27
|
+
else
|
|
28
|
+
stream.each_line do |line|
|
|
29
|
+
line.strip!
|
|
30
|
+
next if line.empty?
|
|
31
|
+
block.call(JSON.parse(line))
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def each_value_lax(stream, &block)
|
|
37
|
+
require "oj"
|
|
38
|
+
Oj.sc_parse(streaming_handler_class.new(&block), RsNormalizer.new(stream))
|
|
39
|
+
rescue LoadError
|
|
40
|
+
raise "oj is required for --lax mode (gem install oj)"
|
|
41
|
+
rescue Oj::ParseError => e
|
|
42
|
+
raise JSON::ParserError, e.message
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def streaming_handler_class
|
|
46
|
+
@streaming_handler_class ||= Class.new(Oj::ScHandler) do
|
|
47
|
+
def initialize(&emit)
|
|
48
|
+
@emit = emit
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def hash_start = {}
|
|
52
|
+
def hash_key(key) = key
|
|
53
|
+
def hash_set(hash, key, value) = hash[key] = value
|
|
54
|
+
def array_start = []
|
|
55
|
+
def array_append(array, value) = array << value
|
|
56
|
+
def add_value(value) = @emit.call(value)
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Translates JSON-SEQ record separators (RS, 0x1e) to newlines so the
|
|
61
|
+
# underlying Oj scanner sees a stream of whitespace-delimited values.
|
|
62
|
+
class RsNormalizer
|
|
63
|
+
def initialize(input)
|
|
64
|
+
@input = input
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def read(length = nil, outbuf = nil)
|
|
68
|
+
chunk = @input.read(length)
|
|
69
|
+
return nil if chunk.nil?
|
|
70
|
+
|
|
71
|
+
chunk.tr!(RS_CHAR, "\n")
|
|
72
|
+
if outbuf
|
|
73
|
+
outbuf.replace(chunk)
|
|
74
|
+
else
|
|
75
|
+
chunk
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
data/lib/jrf/pipeline.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "control"
|
|
4
|
+
require_relative "input_reader"
|
|
4
5
|
require_relative "row_context"
|
|
5
6
|
require_relative "stage"
|
|
6
7
|
|
|
@@ -9,8 +10,35 @@ module Jrf
|
|
|
9
10
|
def initialize(*blocks)
|
|
10
11
|
raise ArgumentError, "at least one stage block is required" if blocks.empty?
|
|
11
12
|
|
|
12
|
-
@
|
|
13
|
-
|
|
13
|
+
@stages = blocks.map { |block| Stage.new(block, src: nil) }
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# Run the pipeline on one or more files, mirroring how the CLI reads its
|
|
17
|
+
# file arguments: each path is opened (with .gz auto-decompression) and
|
|
18
|
+
# parsed as NDJSON. Pass +lax: true+ for multiline JSON / JSON-SEQ input.
|
|
19
|
+
#
|
|
20
|
+
# Without a block, returns an Array of output values; with a block, streams
|
|
21
|
+
# each output value to the block.
|
|
22
|
+
#
|
|
23
|
+
# @param paths [Array<String>] one or more file paths
|
|
24
|
+
# @param lax [Boolean] enable lax (multiline / whitespace-delimited) parsing
|
|
25
|
+
# @yieldparam value output value
|
|
26
|
+
# @return [Array, nil] output values (without block), or nil (with block)
|
|
27
|
+
# @example Build a lookup hash from one file, use it to filter another
|
|
28
|
+
# lookup = Jrf.new(
|
|
29
|
+
# proc { reduce({}) { |a, v| a[[v["tid"], v["conn"]]] = v["late_acked"]; a } }
|
|
30
|
+
# ).read("conn_stats.ndjson").first
|
|
31
|
+
def read(*paths, lax: false, &on_output)
|
|
32
|
+
raise ArgumentError, "at least one path is required" if paths.empty?
|
|
33
|
+
|
|
34
|
+
input = Enumerator.new do |y|
|
|
35
|
+
paths.each do |path|
|
|
36
|
+
InputReader.open_path(path) do |stream|
|
|
37
|
+
InputReader.each_value(stream, lax: lax) { |value| y << value }
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
call(input, &on_output)
|
|
14
42
|
end
|
|
15
43
|
|
|
16
44
|
# Run the pipeline on an enumerable of input values.
|
data/lib/jrf/stage.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "control"
|
|
4
|
+
require_relative "row_context"
|
|
4
5
|
require_relative "reducers"
|
|
5
6
|
|
|
6
7
|
module Jrf
|
|
@@ -22,22 +23,23 @@ module Jrf
|
|
|
22
23
|
end
|
|
23
24
|
end
|
|
24
25
|
|
|
25
|
-
def initialize(ctx, block, src: nil)
|
|
26
|
-
@ctx = ctx
|
|
27
|
-
@block = block
|
|
26
|
+
def initialize(block, src: nil)
|
|
28
27
|
@src = src
|
|
29
28
|
@reducers = []
|
|
30
29
|
@cursor = 0
|
|
31
30
|
@template = nil
|
|
32
31
|
@mode = nil # nil=unknown, :reducer, :passthrough
|
|
33
32
|
@map_transforms = {}
|
|
33
|
+
@ctx = Class.new(RowContext) do
|
|
34
|
+
define_method(:__jrf_expr__, &block)
|
|
35
|
+
end.new
|
|
34
36
|
end
|
|
35
37
|
|
|
36
38
|
def call(input)
|
|
37
39
|
@ctx.reset(input)
|
|
38
40
|
@cursor = 0
|
|
39
41
|
@ctx.__jrf_current_stage = self
|
|
40
|
-
result = @ctx.
|
|
42
|
+
result = @ctx.__jrf_expr__
|
|
41
43
|
|
|
42
44
|
if @mode.nil?
|
|
43
45
|
if @reducers.any?
|
data/lib/jrf/version.rb
CHANGED
data/test/library_api_test.rb
CHANGED
|
@@ -113,9 +113,40 @@ class LibraryApiTest < JrfTestCase
|
|
|
113
113
|
assert_equal([], j.call([]), "library empty input")
|
|
114
114
|
end
|
|
115
115
|
|
|
116
|
+
def test_read_from_files
|
|
117
|
+
Dir.mktmpdir do |dir|
|
|
118
|
+
plain = File.join(dir, "a.ndjson")
|
|
119
|
+
File.write(plain, %({"a":1}\n{"a":2}\n\n{"a":3}\n))
|
|
120
|
+
|
|
121
|
+
j = Jrf.new(proc { _["a"] })
|
|
122
|
+
assert_equal([1, 2, 3], j.read(plain), "library read NDJSON returns array")
|
|
123
|
+
|
|
124
|
+
streamed = []
|
|
125
|
+
result = j.read(plain) { |v| streamed << v }
|
|
126
|
+
assert_nil(result, "library read with block returns nil")
|
|
127
|
+
assert_equal([1, 2, 3], streamed, "library read with block streams values")
|
|
128
|
+
|
|
129
|
+
reducer = Jrf.new(proc { sum(_["a"]) })
|
|
130
|
+
assert_equal([6], reducer.read(plain), "library read drives reducers to completion")
|
|
131
|
+
|
|
132
|
+
second = File.join(dir, "b.ndjson")
|
|
133
|
+
File.write(second, %({"a":10}\n{"a":20}\n))
|
|
134
|
+
assert_equal([1, 2, 3, 10, 20], j.read(plain, second), "library read concatenates multiple paths")
|
|
135
|
+
|
|
136
|
+
gz_path = File.join(dir, "c.ndjson.gz")
|
|
137
|
+
Zlib::GzipWriter.open(gz_path) { |gz| gz.write(%({"a":100}\n{"a":200}\n)) }
|
|
138
|
+
assert_equal([100, 200], j.read(gz_path), "library read auto-decompresses .gz")
|
|
139
|
+
|
|
140
|
+
lax_path = File.join(dir, "d.json")
|
|
141
|
+
File.write(lax_path, %({"a":1}\n{\n "a": 2\n}\n))
|
|
142
|
+
assert_equal([1, 2], j.read(lax_path, lax: true), "library read supports lax multiline mode")
|
|
143
|
+
|
|
144
|
+
assert_raises(ArgumentError) { j.read }
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
116
148
|
def test_stage_reduce_control_tokens
|
|
117
|
-
|
|
118
|
-
stage = Jrf::Stage.new(ctx, proc { })
|
|
149
|
+
stage = Jrf::Stage.new(proc { })
|
|
119
150
|
first_token = stage.step_reduce(1, initial: 0) { |acc, v| acc + v }
|
|
120
151
|
assert_equal(0, first_token.index, "step_reduce returns token while classifying reducer stage")
|
|
121
152
|
stage.instance_variable_set(:@mode, :reducer)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: jrf
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.17
|
|
4
|
+
version: 0.1.18
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- kazuho
|
|
@@ -69,6 +69,7 @@ files:
|
|
|
69
69
|
- lib/jrf/cli.rb
|
|
70
70
|
- lib/jrf/cli/runner.rb
|
|
71
71
|
- lib/jrf/control.rb
|
|
72
|
+
- lib/jrf/input_reader.rb
|
|
72
73
|
- lib/jrf/pipeline.rb
|
|
73
74
|
- lib/jrf/pipeline_parser.rb
|
|
74
75
|
- lib/jrf/reducers.rb
|