jrf 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/jrf.gemspec +2 -0
- data/lib/jrf/cli/runner.rb +336 -34
- data/lib/jrf/cli.rb +17 -27
- data/lib/jrf/version.rb +1 -1
- data/test/cli_parallel_test.rb +195 -0
- data/test/cli_runner_test.rb +951 -0
- data/test/library_api_test.rb +126 -0
- data/test/readme_examples_test.rb +16 -0
- data/test/test_helper.rb +118 -0
- metadata +34 -2
- data/test/jrf_test.rb +0 -1103
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "test_helper"
|
|
4
|
+
|
|
5
|
+
# End-to-end tests that run the `./exe/jrf` executable with the `-P` option
# against several input files and inspect stdout, stderr and the exit status.
#
# Each test builds its fixtures in a fresh temporary directory. Where several
# files are processed with `-P`, map-only output is sorted before comparison,
# so these tests do not assert any particular output ordering.
class CliParallelTest < JrfTestCase
  # Map-only expression over two files with `-v -P 2`: every value is emitted
  # and the verbose summary on stderr reports parallel mode as enabled.
  def test_parallel_map_only
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", '_["x"]', *ndjson_files(dir))
      assert_success(status, stderr, "parallel map only")
      assert_equal([1, 2, 3, 4], lines(stdout).map(&:to_i).sort, "parallel map only output")
      assert_includes(stderr, "parallel: enabled workers=2 files=2 split=1/1", "parallel verbose summary")
    end
  end

  # Same map-only pipeline, rendered with `-o pretty`.
  def test_parallel_map_only_pretty_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}])
      write_ndjson(dir, "b.ndjson", [{"x" => 2}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "pretty", '_["x"]', *ndjson_files(dir))
      assert_success(status, stderr, "parallel pretty map only")
      assert_equal(["1", "2"], stdout.lines.map(&:strip).reject(&:empty?).sort, "parallel pretty map only output")
    end
  end

  # Identity expression rendered with `-o tsv`: one key/value line per field.
  def test_parallel_map_only_tsv_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"a" => 1, "b" => 2}])
      write_ndjson(dir, "b.ndjson", [{"a" => 3, "b" => 4}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", "-o", "tsv", "_", *ndjson_files(dir))
      assert_success(status, stderr, "parallel tsv map only")
      assert_equal(["a\t1", "a\t3", "b\t2", "b\t4"], lines(stdout).sort, "parallel tsv map only output")
    end
  end

  # A single `sum` over both files yields one combined total.
  def test_parallel_map_reduce
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}, {"x" => 4}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel map reduce")
      assert_equal(%w[10], lines(stdout), "parallel sum output")
    end
  end

  # A `select` stage followed by `sum`: only rows with x > 10 are totalled.
  def test_parallel_split_map_and_reduce
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 10}, {"x" => 20}])
      write_ndjson(dir, "b.ndjson", [{"x" => 30}, {"x" => 40}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel split map+reduce")
      assert_equal(%w[90], lines(stdout), "parallel split map+reduce output")
    end
  end

  # `group_by` with a per-group `sum` combines groups that span both files.
  def test_parallel_group_by
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"k" => "a", "v" => 1}, {"k" => "b", "v" => 2}])
      write_ndjson(dir, "b.ndjson", [{"k" => "a", "v" => 3}, {"k" => "b", "v" => 4}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'group_by(_["k"]) { |r| sum(r["v"]) }', *ndjson_files(dir))
      assert_success(status, stderr, "parallel group_by")
      result = JSON.parse(lines(stdout).first)
      assert_equal(4, result["a"], "parallel group_by a")
      assert_equal(6, result["b"], "parallel group_by b")
    end
  end

  # A pipeline made only of a reducer: the result is still correct and the
  # verbose summary on stderr reports parallel mode as disabled.
  def test_parallel_all_reducers_falls_back_to_serial
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])
      write_ndjson(dir, "b.ndjson", [{"x" => 3}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-v", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "all-reducer serial fallback")
      assert_equal(%w[6], lines(stdout), "all-reducer serial fallback output")
      assert_includes(stderr, "parallel: disabled", "parallel disabled summary")
    end
  end

  # Gzip-compressed inputs are accepted together with `-P`.
  def test_parallel_with_gz_files
    Dir.mktmpdir do |dir|
      gz_path_a = File.join(dir, "a.ndjson.gz")
      Zlib::GzipWriter.open(gz_path_a) { |io| io.write("{\"x\":10}\n{\"x\":20}\n") }
      gz_path_b = File.join(dir, "b.ndjson.gz")
      Zlib::GzipWriter.open(gz_path_b) { |io| io.write("{\"x\":30}\n") }

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', gz_path_a, gz_path_b)
      assert_success(status, stderr, "parallel with gz")
      assert_equal(%w[60], lines(stdout), "parallel with gz output")
    end
  end

  # Runs the same expression with and without `-P 2` and compares the parsed
  # JSON results.
  def test_parallel_matches_serial_output
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", (1..50).map { |i| {"v" => i, "g" => i % 3} })
      write_ndjson(dir, "b.ndjson", (51..100).map { |i| {"v" => i, "g" => i % 3} })

      files = ndjson_files(dir)
      expr = 'group_by(_["g"]) { |r| sum(r["v"]) }'

      serial_stdout, serial_stderr, serial_status = Open3.capture3("./exe/jrf", expr, *files)
      assert_success(serial_status, serial_stderr, "serial baseline")

      parallel_stdout, parallel_stderr, parallel_status = Open3.capture3("./exe/jrf", "-P", "2", expr, *files)
      assert_success(parallel_status, parallel_stderr, "parallel run")

      assert_equal(JSON.parse(serial_stdout), JSON.parse(parallel_stdout), "parallel matches serial")
    end
  end

  # One readable file plus one truncated gzip file under `-P 2`: the run exits
  # non-zero, stderr names the bad file, and the good file's rows are emitted.
  def test_parallel_worker_error_handling
    Dir.mktmpdir do |dir|
      good_path = File.join(dir, "a.ndjson")
      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")

      bad_gz_path = File.join(dir, "b.ndjson.gz")
      write_truncated_gz(bad_gz_path)

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", '_["x"]', good_path, bad_gz_path)
      assert_failure(status, "worker error causes non-zero exit")
      assert_includes(stderr, bad_gz_path, "error message includes filename")
      # Good file data should still be present
      output_values = lines(stdout).map(&:to_i)
      assert_includes(output_values, 1, "good file data preserved")
      assert_includes(output_values, 2, "good file data preserved")
    end
  end

  def test_parallel_requires_multiple_files
    # With single file and -P, should still work (falls back to serial)
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 2}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "single file with -P")
      assert_equal(%w[3], lines(stdout), "single file with -P output")
    end
  end

  # Filter then reduce across files of different lengths.
  def test_parallel_select_then_sum
    Dir.mktmpdir do |dir|
      write_ndjson(dir, "a.ndjson", [{"x" => 1}, {"x" => 20}, {"x" => 3}])
      write_ndjson(dir, "b.ndjson", [{"x" => 40}, {"x" => 5}])

      stdout, stderr, status = Open3.capture3("./exe/jrf", "-P", "2", 'select(_["x"] > 10) >> sum(_["x"])', *ndjson_files(dir))
      assert_success(status, stderr, "parallel select then sum")
      assert_equal(%w[60], lines(stdout), "parallel select then sum output")
    end
  end

  # Without `-P`: a truncated gzip file between two readable files makes the
  # run exit non-zero and name the bad file on stderr without a backtrace,
  # while rows from the files before and after it are still emitted.
  def test_serial_error_includes_filename
    Dir.mktmpdir do |dir|
      good_path = File.join(dir, "a.ndjson")
      File.write(good_path, "{\"x\":1}\n{\"x\":2}\n")

      bad_gz_path = File.join(dir, "b.ndjson.gz")
      write_truncated_gz(bad_gz_path)

      good_path2 = File.join(dir, "c.ndjson")
      File.write(good_path2, "{\"x\":3}\n")

      stdout, stderr, status = Open3.capture3("./exe/jrf", '_["x"]', good_path, bad_gz_path, good_path2)
      assert_failure(status, "serial error causes non-zero exit")
      assert_includes(stderr, bad_gz_path, "serial error message includes filename")
      # Match the shape of a Ruby backtrace line ("\tfrom file:line:in ...")
      # rather than the bare substring "from ", which could also occur in the
      # temp-file path or in the error message itself.
      refute_match(/^\s*from .+:\d+/, stderr, "serial error does not include stacktrace")
      # Data from good files should still be present
      output_values = lines(stdout).map(&:to_i)
      assert_includes(output_values, 1, "data before bad file preserved")
      assert_includes(output_values, 3, "data after bad file preserved")
    end
  end

  private

  # Writes +rows+ to <dir>/<name> as newline-delimited JSON, one document per
  # line.
  def write_ndjson(dir, name, rows)
    File.write(File.join(dir, name), rows.map { |r| JSON.generate(r) + "\n" }.join)
  end

  # Returns the sorted paths of the `*.ndjson` files directly inside +dir+.
  def ndjson_files(dir)
    Dir.glob(File.join(dir, "*.ndjson")).sort
  end

  # Creates a truncated gz file at +path+ (valid header, truncated body).
  def write_truncated_gz(path)
    full_gz = StringIO.new
    Zlib::GzipWriter.wrap(full_gz) { |io| io.write("{\"x\":10}\n" * 100) }
    # Write only the first half to simulate truncation
    File.binwrite(path, full_gz.string[0, full_gz.string.bytesize / 2])
  end
end
|