mahout 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
# frozen_string_literal: true

module Mahout
  # Runs pgbench-based PostgreSQL benchmarks through a command runner and
  # prints a human-readable report to $stdout.
  #
  # The runner must respond to +run(command, **opts)+ (returning an object with
  # +stdout+ and +success?+), +upload(content, path, **opts)+ and +dry_run?+.
  # The config must expose +pg_database+, +data_device+, +data_mount+,
  # +wal_device+, +wal_mount+ and +profile+.
  class Benchmark
    STANDARD_CLIENT_LEVELS = [1, 4, 8, 16, 32].freeze
    FAST_STANDARD_CLIENT_LEVELS = [1, 8, 32].freeze

    # Core count assumed when `nproc` fails or yields no usable number.
    DEFAULT_CORES = 4

    # Remote location of the custom pgbench script for the mixed workload.
    MIXED_SCRIPT_PATH = "/tmp/mahout-bench-mixed.sql"

    # Custom pgbench script: eight reads and three writes per transaction.
    MIXED_SCRIPT = <<~SQL
      \\set aid random(1, 100000 * :scale)
      \\set bid random(1, 1 * :scale)
      \\set tid random(1, 10 * :scale)
      \\set delta random(-5000, 5000)
      BEGIN;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
      UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;
      SELECT sum(abalance) FROM pgbench_accounts WHERE aid = :aid;
      UPDATE pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;
      UPDATE pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;
      END;
    SQL

    private_constant :DEFAULT_CORES, :MIXED_SCRIPT_PATH, :MIXED_SCRIPT

    # @param runner [#run, #upload, #dry_run?] executes commands on the target host
    # @param config [#pg_database, #data_device, #data_mount, #wal_device, #wal_mount, #profile]
    def initialize(runner:, config:)
      @runner = runner
      @config = config
    end

    # Initializes pgbench tables and runs the selected benchmark suite.
    #
    # @param scale [Integer] pgbench scale factor; in stress mode the default
    #   (100) is replaced by a value derived from shared_buffers
    # @param duration [Integer] seconds per test; in stress mode the default
    #   (60) is replaced by 600
    # @param clients [Integer, nil] client count; defaults to the core count
    #   (or max(cores * 2, 64) in stress mode)
    # @param fast [Boolean] use a small scale/duration and skip the slow tests
    # @param standardized [Boolean] run the fixed client-level matrix
    # @param stress [Boolean] run the long write-stress test only
    def call(scale: 100, duration: 60, clients: nil, fast: false, standardized: false, stress: false)
      if stress
        # The default values double as "not specified by the caller" sentinels.
        scale = nil if scale == 100
        duration = 600 if duration == 60
      elsif fast && standardized
        scale = 10
        duration = 20
      elsif fast
        scale = 10
        duration = 10
      end

      @cores = detect_cores
      @clients = clients || (stress ? [@cores * 2, 64].max : @cores)
      @duration = duration
      @db = @config.pg_database

      detect_hardware(@cores)

      scale ||= stress_scale if stress
      init_pgbench(scale)

      if stress
        run_stress(scale)
      elsif standardized
        run_standardized(fast: fast)
      else
        run_read_only
        run_write_heavy
        run_mixed

        unless fast
          run_wal_throughput
          run_checkpoint_impact
          run_connection_scaling
        end
      end
    end

    private

    # Creates and populates the pgbench tables at the given scale factor.
    def init_pgbench(scale)
      $stdout.puts("initializing pgbench (scale: #{scale})")
      @runner.run(
        "sudo -u postgres pgbench -i -s #{scale} -q #{@db}",
        sudo: false, timeout: 3600
      )
      $stdout.puts("")
    end

    # Runs the built-in select-only workload (pgbench -S).
    def run_read_only
      $stdout.puts("read-only (SELECT) -- #{@clients} clients, #{@duration}s")
      print_results(run_pgbench("-S"))
    end

    # Runs the default TPC-B-like workload.
    def run_write_heavy
      $stdout.puts("write-heavy (TPC-B) -- #{@clients} clients, #{@duration}s")
      print_results(run_pgbench(""))
    end

    # Runs the custom mixed read/write script and removes it afterwards.
    def run_mixed
      $stdout.puts("mixed (70% read, 30% write) -- #{@clients} clients, #{@duration}s")

      upload_mixed_script
      result = run_pgbench("-f #{MIXED_SCRIPT_PATH}")
      remove_mixed_script
      print_results(result)
    end

    # Measures how much WAL a single write-heavy run generates.
    def run_wal_throughput
      $stdout.puts("wal write throughput -- #{@clients} clients, #{@duration}s")
      $stdout.puts(" (measures WAL generation rate from write-heavy workload)")

      # In dry-run mode only the pgbench command is issued. In a real run the
      # workload executes exactly once, bracketed by the two LSN samples.
      if @runner.dry_run?
        run_pgbench("")
        return $stdout.puts("")
      end

      wal_before = get_wal_position
      result = run_pgbench("")
      wal_after = get_wal_position

      wal_mb = wal_megabytes(wal_before, wal_after)
      if wal_mb
        wal_rate = wal_mb / @duration
        $stdout.puts(" wal generated: #{wal_mb.round(1)}MB in #{@duration}s (#{wal_rate.round(1)}MB/s)")
      end

      $stdout.puts(" tps: #{extract_tps(result.stdout)}")
      $stdout.puts(" latency: #{extract_latency(result.stdout)}")

      wal_mount = @config.wal_mount || @config.data_mount
      disk_result = @runner.run(
        "df -h --output=target,size,used,avail,pcent #{wal_mount} | tail -1",
        allow_failure: true
      )
      $stdout.puts(" wal disk: #{disk_result.stdout.strip}") if disk_result.success?

      $stdout.puts("")
    end

    # Measures the TPS dip caused by a checkpoint forced in the middle of a
    # 30 second write-heavy run.
    def run_checkpoint_impact
      $stdout.puts("checkpoint impact -- measuring tps drop during forced checkpoint")

      # Flush dirty buffers first so the run starts from a clean baseline.
      @runner.run(
        "sudo -u postgres psql -c \"CHECKPOINT\"",
        sudo: false, allow_failure: true
      )
      sleep(2) unless @runner.dry_run?

      # The forced checkpoint is scheduled in the background so it fires while
      # pgbench is still running; its output is discarded so only pgbench
      # output reaches the parser.
      result = @runner.run(
        "(sleep 10 && sudo -u postgres psql -c \"CHECKPOINT\") >/dev/null 2>&1 & " \
        "sudo -u postgres pgbench -T 30 -c #{@clients} -j #{@clients} -P 1 #{@db} 2>&1",
        sudo: false, allow_failure: true, timeout: 120
      )
      return $stdout.puts("") if @runner.dry_run?

      samples = progress_tps(result.stdout)
      if samples.length >= 5
        print_checkpoint_dip(samples)
      else
        $stdout.puts(" tps: #{extract_tps(result.stdout)}")
      end

      $stdout.puts("")
    end

    # Prints avg/min/max TPS and the relative dip for per-second samples.
    def print_checkpoint_dip(samples)
      avg = samples.sum / samples.length
      min, max = samples.minmax
      # Guard against an all-zero run, which would otherwise print NaN.
      dip = max.zero? ? 0.0 : ((max - min) / max * 100).round(1)

      $stdout.puts(" avg tps: #{avg.round(0)}")
      $stdout.puts(" min tps: #{min.round(0)}")
      $stdout.puts(" max tps: #{max.round(0)}")
      $stdout.puts(" checkpoint dip: #{dip}%")
      print_checkpoint_assessment(dip)
    end

    # Prints a qualitative verdict for a checkpoint dip percentage.
    def print_checkpoint_assessment(dip)
      if dip < 10
        $stdout.puts(" assessment: minimal impact, storage handles checkpoints well")
      elsif dip < 25
        $stdout.puts(" assessment: normal for block storage with write-heavy workloads")
      elsif dip < 40
        $stdout.puts(" assessment: noticeable dip, queries may slow during checkpoints")
        $stdout.puts(" see README for tuning suggestions")
      else
        $stdout.puts(" assessment: significant dip, data device is the bottleneck")
        $stdout.puts(" see README for tuning suggestions")
      end
    end

    # Returns the current WAL write position as a byte offset, or nil when it
    # cannot be read or parsed.
    def get_wal_position
      result = @runner.run(
        "sudo -u postgres psql -tAc \"SELECT pg_current_wal_lsn()\"",
        sudo: false, allow_failure: true
      )
      return nil unless result.success?

      # An LSN is printed as two hex numbers "HIGH/LOW" (upper and lower 32 bits).
      parts = result.stdout.strip.split("/")
      return nil unless parts.length == 2

      (parts[0].to_i(16) << 32) + parts[1].to_i(16)
    end

    # Converts two WAL byte positions into megabytes written between them.
    # Returns nil when either position is unknown.
    def wal_megabytes(before, after)
      return nil unless before && after

      (after - before) / 1024.0 / 1024.0
    end

    # Runs the select-only workload at several client counts around @clients.
    def run_connection_scaling
      $stdout.puts("connection scaling (read-only, 15s per level)")

      levels = [1, @clients / 2, @clients, @clients * 2, @clients * 4].uniq.select(&:positive?)

      levels.each do |n|
        # No --log-prefix here: pgbench rejects it unless -l is also given,
        # and the per-transaction logs are not used by this report.
        result = @runner.run(
          "sudo -u postgres pgbench -S -T 15 -c #{n} -j #{[n, @clients].min} " \
          "--latency-limit=100 #{@db} 2>&1",
          sudo: false, allow_failure: true
        )
        next if @runner.dry_run?

        tps = extract_tps(result.stdout)
        latency = extract_latency(result.stdout)
        $stdout.puts(" #{n} clients: #{tps} tps, #{latency}")
      end

      $stdout.puts("")
    end

    # Picks a scale factor whose dataset is about three times shared_buffers,
    # clamped to 100..5000. Falls back to 500 when the setting cannot be read.
    def stress_scale
      result = @runner.run(
        "sudo -u postgres psql -tAc \"SELECT setting FROM pg_settings WHERE name = 'shared_buffers'\"",
        sudo: false, allow_failure: true
      )
      return 500 unless result.success?

      # The setting is treated as a count of 8kB pages.
      shared_buffers_8k = result.stdout.strip.to_i
      shared_buffers_mb = shared_buffers_8k * 8 / 1024
      target_mb = shared_buffers_mb * 3
      # Roughly 16MB of data per scale unit.
      scale = (target_mb / 16.0).ceil.clamp(100, 5000)

      $stdout.puts("shared_buffers: #{shared_buffers_mb}MB, target dataset: #{target_mb}MB, scale: #{scale}")
      scale
    end

    # Runs a long write-heavy test and reports TPS stability, degradation over
    # time, WAL volume and latency.
    def run_stress(scale)
      $stdout.puts("write stress -- #{@clients} clients, #{@duration}s, scale #{scale}")
      $stdout.puts("")

      @runner.run(
        "sudo -u postgres psql -c \"CHECKPOINT\"",
        sudo: false, allow_failure: true
      )
      sleep(2) unless @runner.dry_run?

      wal_before = get_wal_position

      result = @runner.run(
        "sudo -u postgres pgbench -T #{@duration} -c #{@clients} -j #{@cores} -P 1 #{@db} 2>&1",
        sudo: false, allow_failure: true, timeout: @duration + 120
      )
      return $stdout.puts("") if @runner.dry_run?

      wal_after = get_wal_position

      samples = progress_tps(result.stdout)
      if samples.length >= 10
        print_stress_statistics(samples)
      else
        # Latency is printed once below, for both branches.
        $stdout.puts("tps: #{extract_tps(result.stdout)}")
      end

      wal_mb = wal_megabytes(wal_before, wal_after)
      if wal_mb
        wal_rate = wal_mb / @duration
        $stdout.puts("")
        $stdout.puts("wal generated: #{wal_mb.round(0)}MB (#{wal_rate.round(1)}MB/s)")
      end

      p99 = extract_percentile(result.stdout)
      $stdout.puts("latency: #{extract_latency(result.stdout)}")
      $stdout.puts("p99: #{p99}") if p99
      $stdout.puts("")
    end

    # Prints distribution and degradation statistics for per-second TPS
    # samples. Expects at least four samples so both quarters are non-empty.
    def print_stress_statistics(samples)
      count = samples.length
      avg = samples.sum / count
      min, max = samples.minmax
      stddev = Math.sqrt(samples.sum { |v| (v - avg)**2 } / count)

      first_quarter = samples[0...(count / 4)]
      last_quarter = samples[(count * 3 / 4)..]
      first_avg = first_quarter.sum / first_quarter.length
      last_avg = last_quarter.sum / last_quarter.length
      # Guard against a zero baseline, which would otherwise print NaN.
      degradation = first_avg.zero? ? 0.0 : ((first_avg - last_avg) / first_avg * 100).round(1)

      sorted = samples.sort
      p50 = sorted[count / 2]
      p5 = sorted[(count * 0.05).to_i]
      p95 = sorted[(count * 0.95).to_i]

      $stdout.puts("tps avg: #{avg.round(0)}")
      $stdout.puts("tps min: #{min.round(0)}")
      $stdout.puts("tps max: #{max.round(0)}")
      $stdout.puts("tps stddev: #{stddev.round(0)}")
      $stdout.puts("tps p5: #{p5.round(0)}")
      $stdout.puts("tps p50: #{p50.round(0)}")
      $stdout.puts("tps p95: #{p95.round(0)}")
      $stdout.puts("")
      $stdout.puts("first 25% avg tps: #{first_avg.round(0)}")
      $stdout.puts("last 25% avg tps: #{last_avg.round(0)}")
      $stdout.puts("degradation: #{degradation}%")

      # A "dip" is a sample more than 30% below the one before it.
      dips = samples.each_cons(2).count { |a, b| b < a * 0.7 }
      $stdout.puts("checkpoint dips (>30% drop): #{dips}")
    end

    # Runs read-only, write-heavy and (unless fast) mixed workloads at each of
    # the fixed client levels.
    def run_standardized(fast: false)
      levels = fast ? FAST_STANDARD_CLIENT_LEVELS : STANDARD_CLIENT_LEVELS

      $stdout.puts("standardized benchmark#{fast ? " (fast)" : ""} -- #{levels.join(", ")} clients, #{@duration}s per test")
      $stdout.puts("")

      upload_mixed_script unless fast

      levels.each do |n|
        threads = [n, @cores].min

        $stdout.puts("#{n} clients")

        $stdout.puts(" read-only (SELECT)")
        print_results(run_pgbench_at("-S", n, threads), indent: 4)

        $stdout.puts(" write-heavy (TPC-B)")
        print_results(run_pgbench_at("", n, threads), indent: 4)

        next if fast

        $stdout.puts(" mixed (70/30)")
        print_results(run_pgbench_at("-f #{MIXED_SCRIPT_PATH}", n, threads), indent: 4)
      end

      remove_mixed_script unless fast
    end

    # Uploads the mixed-workload script to the target host.
    def upload_mixed_script
      @runner.upload(MIXED_SCRIPT, MIXED_SCRIPT_PATH, mode: "0644", owner: "postgres:postgres")
    end

    # Deletes the mixed-workload script from the target host (best effort).
    def remove_mixed_script
      @runner.run("rm -f #{MIXED_SCRIPT_PATH}", allow_failure: true)
    end

    # Prints a summary of the hardware and storage layout under test.
    def detect_hardware(cores)
      $stdout.puts("cores: #{cores}")

      ram = @runner.run("free -m | grep Mem | awk '{print $2}'", allow_failure: true)
      $stdout.puts("ram: #{ram.stdout.strip}MB") if ram.success?

      cpu = @runner.run("lscpu | grep 'Model name'", allow_failure: true)
      if cpu.success?
        model = cpu.stdout.strip.sub(/^Model name:\s*/, "")
        $stdout.puts("cpu: #{model}")
      end

      disk = @runner.run("lsblk -ndo MODEL #{@config.data_device}", allow_failure: true)
      disk_model = disk.stdout.strip if disk.success?
      $stdout.puts("disk: #{disk_model}") if disk_model && !disk_model.empty?

      $stdout.puts("data: #{@config.data_device} -> #{@config.data_mount}")
      if @config.wal_device
        $stdout.puts("wal: #{@config.wal_device} -> #{@config.wal_mount}")
      else
        $stdout.puts("wal: same as data")
      end

      $stdout.puts("profile: #{@config.profile}")
      $stdout.puts("")
    end

    # Runs pgbench for @duration seconds with explicit client/thread counts.
    def run_pgbench_at(extra_flags, clients, threads)
      @runner.run(
        "sudo -u postgres pgbench #{extra_flags} -T #{@duration} -c #{clients} -j #{threads} " \
        "-P 5 --latency-limit=100 #{@db} 2>&1",
        sudo: false, allow_failure: true, timeout: @duration + 60
      )
    end

    # Runs pgbench for @duration seconds with @clients clients and threads.
    def run_pgbench(extra_flags)
      @runner.run(
        "sudo -u postgres pgbench #{extra_flags} -T #{@duration} -c #{@clients} -j #{@clients} " \
        "-P 5 --latency-limit=100 #{@db} 2>&1",
        sudo: false, allow_failure: true, timeout: @duration + 60
      )
    end

    # Prints tps, latency and (when present) a percentile from pgbench output.
    # In dry-run mode only a blank line is printed.
    def print_results(result, indent: 2)
      return $stdout.puts("") if @runner.dry_run?

      pad = " " * indent
      output = result.stdout
      p99 = extract_percentile(output)

      $stdout.puts("#{pad}tps: #{extract_tps(output)}")
      $stdout.puts("#{pad}latency: #{extract_latency(output)}")
      $stdout.puts("#{pad}p99: #{p99}") if p99
      $stdout.puts("")
    end

    # Extracts per-interval TPS samples from pgbench -P progress lines, e.g.
    # "progress: 5.0 s, 1234.5 tps, lat 0.8 ms stddev 0.1". The "tps=<n>" form
    # is also accepted.
    def progress_tps(output)
      output.each_line.filter_map do |line|
        next unless line.start_with?("progress:")

        match = line.match(/([\d.]+)\s+tps\b/) || line.match(/tps=\s*([\d.]+)/)
        match && match[1].to_f
      end
    end

    # Returns the summary TPS as a string, preferring the figure that excludes
    # connection time; "n/a" when no summary line is present.
    def extract_tps(output)
      match = output.match(/tps = ([\d.]+).*excluding/i) || output.match(/tps = ([\d.]+)/i)
      match ? match[1] : "n/a"
    end

    # Returns "avg Xms, stddev Yms" (whichever parts are present) or "n/a".
    def extract_latency(output)
      avg = output.match(/latency average\s*=\s*([\d.]+)\s*ms/)
      stddev = output.match(/latency stddev\s*=\s*([\d.]+)\s*ms/)

      parts = []
      parts << "avg #{avg[1]}ms" if avg
      parts << "stddev #{stddev[1]}ms" if stddev
      parts.empty? ? "n/a" : parts.join(", ")
    end

    # Returns the first "<N>th percentile" latency found, or nil.
    def extract_percentile(output)
      match = output.match(/latency.*?(\d+)th percentile\s*=?\s*([\d.]+)\s*ms/i)
      return nil unless match

      "#{match[2]}ms"
    end

    # Returns the target's core count, falling back to DEFAULT_CORES when
    # `nproc` fails or prints nothing usable (e.g. in dry-run mode).
    def detect_cores
      result = @runner.run("nproc", allow_failure: true)
      return DEFAULT_CORES unless result.success?

      cores = result.stdout.strip.to_i
      cores.positive? ? cores : DEFAULT_CORES
    end
  end
end