jrf 0.1.12 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ddf9bb5a12260eea615d0107dc7374dec2b5a3fe81c51791ea7e7ffea359d12f
4
- data.tar.gz: d556b7d230185a9c397af45abaa08b912f485f3a1494bebdf86f2ef44be81c64
3
+ metadata.gz: 78c1f6eb54e20d4dffbfe57f89a49d9e8ec9bbb2a9e118d911f2dec3c649f4ac
4
+ data.tar.gz: 63f43701422cfe200b7932a2177132f5e4e74e690960e71b88d6cc7b767e0b3c
5
5
  SHA512:
6
- metadata.gz: e7642e5e7c50e9b4da7f28bf906bf432ee7c6d377839af2ff2b73d299c044e08566876bd3d33d5492abe9432b58f0871df2ebb9d4e7ec45ffe02a495fad1e2fc
7
- data.tar.gz: '0629e81b1c9cf8070fb0a1fcbe409cf026bb62f72da9855d32fe9f253552de51a21f663b616abdc3b474dd705d9067512838fe5f0e33293f423636a5ea18167f'
6
+ metadata.gz: 152ebdc2322f9a8b6c0cad2cb303a093a45d5e0ecc17b519904e40e069a747b56e33f1ddd33f7f3efb32031d78808d05e32d93ab151572b973a1324f9e676e0b
7
+ data.tar.gz: 63c189a79b484777c25f5c1a7951d930fc2d110f3547216b2fd099469e57e7a062c0ec64ba2c7b0c3d7e88a6fb5f1f40d3b5ba6d1a0803acfc5253b00f43dfe8
data/jrf.gemspec CHANGED
@@ -16,6 +16,8 @@ Gem::Specification.new do |spec|
16
16
  spec.bindir = "exe"
17
17
  spec.executables = ["jrf"]
18
18
  spec.add_dependency "oj", ">= 3.16"
19
+ spec.add_development_dependency "minitest", ">= 5.0"
20
+ spec.add_development_dependency "rake", ">= 13.0"
19
21
 
20
22
  spec.files = Dir.glob("{exe,lib,test}/*") + Dir.glob("lib/**/*") + %w[DESIGN.txt jrf.gemspec Gemfile Rakefile].select { |path| File.file?(path) }
21
23
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "json"
4
+ require "zlib"
4
5
  require_relative "../pipeline"
5
6
  require_relative "../pipeline_parser"
6
7
 
@@ -9,6 +10,7 @@ module Jrf
9
10
  class Runner
10
11
  RS_CHAR = "\x1e"
11
12
  DEFAULT_OUTPUT_BUFFER_LIMIT = 4096
13
+ PARALLEL_FRAME_HEADER_BYTES = 4
12
14
 
13
15
  class RsNormalizer
14
16
  def initialize(input)
@@ -28,56 +30,293 @@ module Jrf
28
30
  end
29
31
  end
30
32
 
31
- def initialize(inputs:, out: $stdout, err: $stderr, lax: false, pretty: false, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
32
- @inputs = inputs
33
# Incremental decoder for the worker-to-parent wire format: every frame is a
# PARALLEL_FRAME_HEADER_BYTES-byte big-endian length ("N") followed by that
# many payload bytes. Chunks may split a frame at any byte.
class ParallelFrameReader
  def initialize
    # Binary buffer: all offsets below are byte offsets, and a chunk boundary
    # may fall inside a multi-byte character.
    @buf = String.new(encoding: Encoding::BINARY)
    @offset = 0
  end

  # Appends raw bytes read from a worker pipe. Non-binary chunks are
  # re-tagged as binary so mixed encodings can never raise on append.
  def append(chunk)
    @buf << (chunk.encoding == Encoding::BINARY ? chunk : chunk.b)
  end

  # Yields each complete payload currently buffered, in arrival order.
  # Returns an Enumerator when called without a block.
  def each_payload
    return enum_for(:each_payload) unless block_given?

    while (payload = next_payload)
      yield payload
    end
  end

  # True while buffered bytes remain that do not yet form a complete frame.
  def has_partial?
    @offset != @buf.bytesize
  end

  private

  # Returns the next complete payload, or nil (after compacting the buffer)
  # when the header or the body is still incomplete.
  def next_payload
    available = @buf.bytesize - @offset
    if available < PARALLEL_FRAME_HEADER_BYTES
      compact!
      return nil
    end

    payload_len = @buf.byteslice(@offset, PARALLEL_FRAME_HEADER_BYTES).unpack1("N")
    frame_len = PARALLEL_FRAME_HEADER_BYTES + payload_len
    if available < frame_len
      compact!
      return nil
    end

    payload = @buf.byteslice(@offset + PARALLEL_FRAME_HEADER_BYTES, payload_len)
    @offset += frame_len
    payload
  end

  # Drops the already-consumed prefix so the buffer does not grow without bound.
  def compact!
    return unless @offset > 0

    @buf = @buf.byteslice(@offset..) || String.new(encoding: Encoding::BINARY)
    @offset = 0
  end
end
80
+
81
# Builds a runner.
#
# input:              an Array of file paths ("-" stands for standard input),
#                     or a single readable stream treated as standard input
# out / err:          destinations for results and diagnostics
# lax:                selects the lax (Oj-based) stream parser
# output_format:      compared against :tsv and :pretty elsewhere in this class
# atomic_write_bytes: size limit used when buffering output records
def initialize(input: $stdin, out: $stdout, err: $stderr, lax: false, output_format: :json, atomic_write_bytes: DEFAULT_OUTPUT_BUFFER_LIMIT)
  if input.is_a?(Array)
    @file_paths = input
    # A "-" entry is read from @stdin by each_input; leaving this nil made
    # that path hand nil to the stream parser.
    @stdin = $stdin
  else
    @file_paths = []
    @stdin = input
  end
  @out = out
  @err = err
  @lax = lax
  @output_format = output_format
  @atomic_write_bytes = atomic_write_bytes
  @output_buffer = +""
  @input_errors = false
end
97
+
98
# Whether any input reported a read or parse error (set by with_error_handling).
def input_errors? = @input_errors
101
+
102
# Evaluates the expression over the inputs and writes the results.
# TSV output is collected in full before anything is emitted; the other
# formats stream value by value. Buffered output is always flushed on exit.
def run(expression, parallel: 1, verbose: false)
  stage_blocks = build_stage_blocks(expression, verbose: verbose)
  if @output_format != :tsv
    process_values(stage_blocks, parallel: parallel, verbose: verbose) { |value| emit_output(value) }
  else
    collected = []
    process_values(stage_blocks, parallel: parallel, verbose: verbose) { |value| collected << value }
    emit_tsv(collected)
  end
ensure
  write_output(@output_buffer)
end
40
118
 
41
- def run(expression, verbose: false)
119
+ private
120
+
121
# Parses the expression and compiles each stage's source into a proc.
# Local names are kept as-is on purpose: the stage source is eval'd in this
# binding, so the locals here are visible to it.
def build_stage_blocks(expression, verbose:)
  parsed = PipelineParser.new(expression).parse
  stages = parsed[:stages]
  dump_stages(stages) if verbose
  stages.map do |stage|
    eval("proc { #{stage[:src]} }", nil, "(jrf stage)", 1) # rubocop:disable Security/Eval
  end
end
129
+
130
# Wraps a Pipeline built from the blocks as an Enumerator over its results.
# The pipeline is constructed eagerly; it only runs when the result is enumerated.
def apply_pipeline(blocks, input_enum)
  pipeline = Pipeline.new(*blocks)
  Enumerator.new { |yielder| pipeline.call(input_enum) { |result| yielder << result } }
end
136
+
137
# Enumerator over every value that each_input_value yields.
def each_input_enum
  Enumerator.new do |yielder|
    each_input_value { |value| yielder << value }
  end
end
50
140
 
51
- input_enum = Enumerator.new { |y| each_input_value { |v| y << v } }
52
- pipeline.call(input_enum) do |value|
53
- emit_output(value)
141
# Streams every pipeline result to the given block, using worker processes
# only when that is safe.
#
# Stays sequential when parallel <= 1, when there is at most one input path,
# when any path is "-" (workers open their input by path, so standard input
# cannot be handed to them), or when the probe finds no map-only prefix.
def process_values(blocks, parallel:, verbose:, &block)
  split_index = nil
  if parallel > 1 && @file_paths.length > 1 && !@file_paths.include?("-")
    # Parallelize the longest map-only prefix; reducers stay in the parent.
    split_index = classify_parallel_stages(blocks)
  end

  if split_index.nil? || split_index == 0
    dump_parallel_status("disabled", verbose: verbose)
    return apply_pipeline(blocks, each_input_enum).each(&block)
  end

  map_blocks = blocks[0...split_index]
  reduce_blocks = blocks[split_index..]
  dump_parallel_status("enabled workers=#{parallel} files=#{@file_paths.length} split=#{split_index}/#{blocks.length}", verbose: verbose)
  input_enum = parallel_map_enum(map_blocks, parallel)
  results = reduce_blocks.empty? ? input_enum : apply_pipeline(reduce_blocks, input_enum)
  results.each(&block)
end
58
160
 
59
- private
161
# Writes the parallelization decision to the error stream in verbose mode.
def dump_parallel_status(status, verbose:)
  return unless verbose

  @err.puts "parallel: #{status}"
end
60
164
 
61
- def each_input_value
62
- return each_input_value_lax { |value| yield value } if @lax
165
# Finds where the map-only prefix of the pipeline ends.
#
# Reads the first value of the first file and runs it through each stage on
# its own, then inspects the mode the stage recorded.
#
# Returns the index of the first reducer stage, blocks.length when no stage
# is a reducer, or nil when no probe value is available. An unreadable or
# unparsable first file also yields nil, so the caller falls back to the
# sequential path (where each_input reports the error) instead of aborting.
def classify_parallel_stages(blocks)
  first_value = nil
  begin
    open_file(@file_paths.first) do |stream|
      each_stream_value(stream) do |value|
        first_value = value
        break
      end
    end
  rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError
    return nil
  end
  return nil if first_value.nil?

  reducer_index = blocks.find_index do |block|
    probe_pipeline = Pipeline.new(block)
    probe_pipeline.call([first_value]) { |_| }
    # NOTE(review): reaches into Pipeline internals (@stages / @mode) — confirm these stay stable.
    stage = probe_pipeline.instance_variable_get(:@stages).first
    stage.instance_variable_get(:@mode) == :reducer
  end

  reducer_index || blocks.length
end
66
190
 
67
- def each_input_value_ndjson
68
- each_input do |source|
69
- source.each_line do |raw_line|
70
- line = raw_line.strip
71
- next if line.empty?
191
# Forks a worker that runs the given blocks over one file and streams each
# result back through a pipe as a length-prefixed frame.
# Returns [read end of the pipe, worker pid].
def spawn_parallel_worker(blocks, path)
  read_io, write_io = IO.pipe
  pid = fork do
    # Child: redirect this runner's output to the pipe and start with an empty buffer.
    read_io.close
    @out = write_io
    @output_buffer = +""
    pipeline = Pipeline.new(*blocks)
    input_enum = Enumerator.new do |yielder|
      open_file(path) do |stream|
        each_stream_value(stream) { |value| yielder << value }
      end
    end
    exit_code = 0
    begin
      pipeline.call(input_enum) { |value| emit_parallel_frame(value) }
    rescue => e
      @err.puts "#{path}: #{e.message} (#{e.class})"
      exit_code = 1
    end
    # Flush what was framed before any failure, then leave via exit!.
    write_output(@output_buffer)
    write_io.close
    exit!(exit_code)
  end
  # Parent keeps only the read end.
  write_io.close
  [read_io, pid]
end
72
215
 
73
- yield JSON.parse(line)
216
# Runs one worker per input file, at most num_workers at a time, and yields
# every decoded value as it arrives (values from different files interleave).
#
# Returns the pids of all spawned workers; reaping them is left to the caller.
# If the method is left abnormally (truncated frame, parse error, or the
# consumer raising/breaking), it closes the remaining pipes and terminates
# and reaps the workers itself, because the pid list never reaches the caller.
#
# Raises IOError when a worker closes its pipe in the middle of a frame.
def run_parallel_worker_pool(blocks, num_workers)
  completed = false
  file_queue = @file_paths.dup
  workers = {} # read_io => [reader, pid]
  children = []

  # Spawns a worker for the next queued file and registers it.
  start_worker = lambda do
    read_io, pid = spawn_parallel_worker(blocks, file_queue.shift)
    workers[read_io] = [ParallelFrameReader.new, pid]
    children << pid
    read_io
  end

  # Fill initial pool
  start_worker.call while workers.size < num_workers && !file_queue.empty?
  read_ios = workers.keys

  until read_ios.empty?
    ready, = IO.select(read_ios)
    ready.each do |io|
      reader = workers[io][0]
      chunk = io.read_nonblock(65536, exception: false)
      case chunk
      when :wait_readable
        next
      when nil
        # EOF: the worker is done with its file.
        raise IOError, "truncated parallel frame from worker" if reader.has_partial?

        read_ios.delete(io)
        io.close
        workers.delete(io)
        # Spawn next worker if files remain
        read_ios << start_worker.call unless file_queue.empty?
      else
        reader.append(chunk)
        reader.each_payload { |payload| yield JSON.parse(payload) }
      end
    end
  end

  completed = true
  children
ensure
  unless completed
    workers.each_key { |io| io.close unless io.closed? }
    children.each do |pid|
      Process.kill("TERM", pid)
      Process.waitpid(pid)
    rescue Errno::ESRCH, Errno::ECHILD
      # Worker already gone or already reaped.
    end
  end
end
261
+
262
# Lazily runs the worker pool: nothing is spawned until the returned
# Enumerator is consumed. Once the pool has returned its pid list, the
# workers are waited for.
def parallel_map_enum(map_blocks, num_workers)
  children = nil
  Enumerator.new do |yielder|
    begin
      children = run_parallel_worker_pool(map_blocks, num_workers) { |value| yielder << value }
    ensure
      wait_for_parallel_children(children) if children
    end
  end
end
270
+
271
# Reaps every worker, then exits the process with status 1 if any of them
# did not finish successfully.
def wait_for_parallel_children(children)
  statuses = children.map { |pid| Process.waitpid2(pid).last }
  exit(1) unless statuses.all?(&:success?)
end
279
+
280
# Serializes one value as a wire frame (4-byte big-endian length + JSON bytes)
# and hands it to the output buffer.
#
# The frame is assembled from binary strings. The packed header is binary, so
# appending a UTF-8 payload containing non-ASCII characters raised
# Encoding::CompatibilityError whenever a header byte was >= 0x80
# (e.g. a payload of 128-255 bytes).
def emit_parallel_frame(value)
  payload = JSON.generate(value).b
  buffer_output([payload.bytesize].pack("N") << payload)
end
284
+
285
# Yields every parsed value from every input source, in input order.
def each_input_value
  each_input { |source| each_stream_value(source) { |value| yield value } }
end
290
+
291
# Yields each JSON value read from the stream.
# Lax mode delegates to each_stream_value_lax; otherwise every non-blank
# line is parsed as one JSON text.
def each_stream_value(stream)
  if @lax
    each_stream_value_lax(stream) { |value| yield value }
  else
    stream.each_line do |raw_line|
      text = raw_line.strip
      yield JSON.parse(text) unless text.empty?
    end
  end
end
77
300
 
78
- def each_input_value_lax
301
# Opens the path for reading and yields the stream; paths ending in ".gz"
# are read through Zlib::GzipReader, everything else as a binary file.
def open_file(path)
  return Zlib::GzipReader.open(path) { |source| yield source } if path.end_with?(".gz")

  File.open(path, "rb") { |source| yield source }
end
308
+
309
# Lax parsing: feeds the stream, wrapped in RsNormalizer, to Oj's streaming
# parser and yields every value the handler emits.
# Raises RuntimeError when oj cannot be loaded, and JSON::ParserError in
# place of Oj::ParseError.
def each_stream_value_lax(stream)
  require "oj"
  handler = streaming_json_handler_class.new { |value| yield value }
  Oj.sc_parse(handler, RsNormalizer.new(stream))
rescue LoadError
  raise "oj is required for --lax mode (gem install oj)"
rescue Oj::ParseError => error
  raise JSON::ParserError, error.message
end
317
+
318
+ def streaming_json_handler_class
319
+ @streaming_json_handler_class ||= Class.new(Oj::ScHandler) do
81
320
  def initialize(&emit)
82
321
  @emit = emit
83
322
  end
@@ -89,13 +328,6 @@ module Jrf
89
328
  def array_append(array, value) = array << value
90
329
  def add_value(value) = @emit.call(value)
91
330
  end
92
- each_input do |source|
93
- Oj.sc_parse(handler.new { |value| yield value }, RsNormalizer.new(source))
94
- end
95
- rescue LoadError
96
- raise "oj is required for --lax mode (gem install oj)"
97
- rescue Oj::ParseError => e
98
- raise JSON::ParserError, e.message
99
331
  end
100
332
 
101
333
  def dump_stages(stages)
@@ -104,12 +336,76 @@ module Jrf
104
336
  end
105
337
  end
106
338
 
107
- def each_input
108
- @inputs.each { |source| yield source }
339
# Yields one readable stream per input. With no file paths the stdin stream
# is the only input; otherwise "-" selects stdin and any other path is opened
# via open_file. Each input is wrapped in with_error_handling.
def each_input(&block)
  return with_error_handling("<stdin>") { block.call(@stdin) } if @file_paths.empty?

  @file_paths.each do |path|
    from_stdin = path == "-"
    with_error_handling(from_stdin ? "<stdin>" : path) do
      from_stdin ? block.call(@stdin) : open_file(path, &block)
    end
  end
end
352
+
353
# Runs the block; I/O, system-call, gzip and JSON parse errors are reported
# on the error stream under the given name and recorded in @input_errors
# instead of propagating. Any other exception passes through.
def with_error_handling(name)
  begin
    yield
  rescue IOError, SystemCallError, Zlib::GzipFile::Error, JSON::ParserError => error
    @err.puts "#{name}: #{error.message} (#{error.class})"
    @input_errors = true
  end
end
110
359
 
111
360
  def emit_output(value)
112
- record = (@pretty ? JSON.pretty_generate(value) : JSON.generate(value)) << "\n"
361
+ record = (@output_format == :pretty ? JSON.pretty_generate(value) : JSON.generate(value)) << "\n"
362
+ buffer_output(record)
363
+ end
364
+
365
# Converts all values to rows first, then buffers each row as one
# tab-separated, newline-terminated line.
def emit_tsv(values)
  values.flat_map { |value| value_to_rows(value) }.each do |cells|
    buffer_output(cells.join("\t") << "\n")
  end
end
371
+
372
# Turns one result value into TSV rows (arrays of formatted cells):
# - Hash:  one row per entry, the key followed by the value (an Array value
#          is spread over several cells)
# - Array: one row per element (an Array element becomes a multi-cell row)
# - other: a single one-cell row
def value_to_rows(value)
  case value
  when Hash
    value.map do |key, val|
      cells = val.is_a?(Array) ? val : [val]
      [format_cell(key), *cells.map { |cell| format_cell(cell) }]
    end
  when Array
    value.map do |row|
      (row.is_a?(Array) ? row : [row]).map { |cell| format_cell(cell) }
    end
  else
    [[format_cell(value)]]
  end
end
396
+
397
# Formats a single TSV cell: nil becomes "null", numbers, strings and
# booleans use to_s, and anything else is rendered as compact JSON.
def format_cell(value)
  return "null" if value.nil?

  scalar = value.is_a?(Numeric) || value.is_a?(String) || [true, false].include?(value)
  scalar ? value.to_s : JSON.generate(value)
end
407
+
408
+ def buffer_output(record)
113
409
  if @output_buffer.bytesize + record.bytesize <= @atomic_write_bytes
114
410
  @output_buffer << record
115
411
  else
@@ -119,7 +415,13 @@ module Jrf
119
415
  end
120
416
 
121
417
  def write_output(str)
122
- @out.syswrite(str)
418
+ return if str.empty?
419
+
420
+ total = 0
421
+ while total < str.bytesize
422
+ written = @out.syswrite(str.byteslice(total..))
423
+ total += written
424
+ end
123
425
  end
124
426
  end
125
427
  end
data/lib/jrf/cli.rb CHANGED
@@ -16,7 +16,9 @@ module Jrf
16
16
  Options:
17
17
  -v, --verbose print parsed stage expressions
18
18
  --lax allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)
19
- -p, --pretty pretty-print JSON output instead of compact NDJSON
19
+ -o, --output FORMAT
20
+ output format: json (default), pretty, tsv
21
+ -P N opportunistically parallelize the map-prefix across N workers
20
22
  -r, --require LIBRARY
21
23
  require LIBRARY before evaluating stages
22
24
  --no-jit do not enable YJIT, even when supported by the Ruby runtime
@@ -43,7 +45,8 @@ module Jrf
43
45
  def self.run(argv = ARGV, input: ARGF, out: $stdout, err: $stderr)
44
46
  verbose = false
45
47
  lax = false
46
- pretty = false
48
+ output_format = :json
49
+ parallel = 1
47
50
  jit = true
48
51
  required_libraries = []
49
52
  atomic_write_bytes = Runner::DEFAULT_OUTPUT_BUFFER_LIMIT
@@ -52,7 +55,8 @@ module Jrf
52
55
  opts.banner = USAGE
53
56
  opts.on("-v", "--verbose", "print parsed stage expressions") { verbose = true }
54
57
  opts.on("--lax", "allow multiline JSON texts; split inputs by whitespace (also detects JSON-SEQ RS 0x1e)") { lax = true }
55
- opts.on("-p", "--pretty", "pretty-print JSON output instead of compact NDJSON") { pretty = true }
58
+ opts.on("-o", "--output FORMAT", %w[json pretty tsv], "output format: json, pretty, tsv") { |fmt| output_format = fmt.to_sym }
59
+ opts.on("-P N", Integer, "opportunistically parallelize the map-prefix across N workers") { |n| parallel = n }
56
60
  opts.on("-r", "--require LIBRARY", "require LIBRARY before evaluating stages") { |library| required_libraries << library }
57
61
  opts.on("--no-jit", "do not enable YJIT, even when supported by the Ruby runtime") { jit = false }
58
62
  opts.on("--atomic-write-bytes N", Integer, "group short outputs into atomic writes of up to N bytes") do |value|
@@ -88,34 +92,20 @@ module Jrf
88
92
  enable_yjit if jit
89
93
  required_libraries.each { |library| require library }
90
94
 
91
- inputs = Enumerator.new do |y|
92
- if argv.empty?
93
- y << input
94
- else
95
- argv.each do |path|
96
- if path == "-"
97
- y << input
98
- elsif path.end_with?(".gz")
99
- require "zlib"
100
- Zlib::GzipReader.open(path) do |source|
101
- y << source
102
- end
103
- else
104
- File.open(path, "rb") do |source|
105
- y << source
106
- end
107
- end
108
- end
109
- end
110
- end
111
- Runner.new(
112
- inputs: inputs,
95
+ file_paths = argv.dup
96
+
97
+ runner = Runner.new(
98
+ input: file_paths.empty? ? input : file_paths,
113
99
  out: out,
114
100
  err: err,
115
101
  lax: lax,
116
- pretty: pretty,
102
+ output_format: output_format,
117
103
  atomic_write_bytes: atomic_write_bytes
118
- ).run(expression, verbose: verbose)
104
+ )
105
+
106
+ runner.run(expression, parallel: parallel, verbose: verbose)
107
+
108
+ exit 1 if runner.input_errors?
119
109
  end
120
110
 
121
111
  def self.enable_yjit
data/lib/jrf/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Jrf
4
- VERSION = "0.1.12"
4
+ VERSION = "0.1.14"
5
5
  end