openclacky 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -1
  3. data/benchmark/fixtures/sample_project/Gemfile +3 -0
  4. data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
  5. data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
  6. data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
  7. data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
  8. data/benchmark/results/EVALUATION_REPORT.md +165 -0
  9. data/benchmark/results/baseline_20260511_174424.json +128 -0
  10. data/benchmark/results/report_20260511_175256.json +271 -0
  11. data/benchmark/results/report_20260511_175444.json +271 -0
  12. data/benchmark/results/treatment_20260511_175103.json +130 -0
  13. data/benchmark/runner.rb +441 -0
  14. data/docs/proposals/2026-05-11-system-prompt-alignment.md +325 -0
  15. data/docs/proposals/2026-05-12-memory-mechanism-optimization.md +89 -0
  16. data/lib/clacky/agent/cost_tracker.rb +8 -2
  17. data/lib/clacky/agent/memory_updater.rb +41 -30
  18. data/lib/clacky/agent/skill_manager.rb +5 -2
  19. data/lib/clacky/agent/skill_reflector.rb +10 -1
  20. data/lib/clacky/agent.rb +4 -0
  21. data/lib/clacky/client.rb +15 -0
  22. data/lib/clacky/default_agents/base_prompt.md +20 -20
  23. data/lib/clacky/default_agents/coding/system_prompt.md +51 -1
  24. data/lib/clacky/default_skills/channel-setup/SKILL.md +56 -2
  25. data/lib/clacky/default_skills/channel-setup/import_lark_skills.rb +97 -0
  26. data/lib/clacky/default_skills/onboard/SKILL.md +1 -1
  27. data/lib/clacky/default_skills/persist-memory/SKILL.md +59 -0
  28. data/lib/clacky/providers.rb +48 -6
  29. data/lib/clacky/server/http_server.rb +41 -1
  30. data/lib/clacky/utils/file_processor.rb +71 -0
  31. data/lib/clacky/version.rb +1 -1
  32. metadata +31 -2
@@ -0,0 +1,130 @@
1
+ {
2
+ "simple_edit": {
3
+ "success": true,
4
+ "iterations": 5,
5
+ "total_cost": 0.0,
6
+ "cost_source": "estimated",
7
+ "duration_seconds": 28.25,
8
+ "cache_creation_input_tokens": 0,
9
+ "cache_read_input_tokens": 26624,
10
+ "total_requests": 5,
11
+ "cache_hit_requests": 5,
12
+ "file_changes": {
13
+ "lib/order_calculator.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class OrderCalculator\n def initialize(items)\n @items = items\n end\n\n def calculate_total\n @items.sum { |item| item[:price] * item[:quantity] }\n end\n\n def calculate_total_with_tax(tax_rate)\n subtotal = calculate_total\n subtotal * (1 + tax_rate)\n end\n\n def apply_discount(discount_percent)\n total = calculate_total\n total * (1 - discount_percent / 100.0)\n end\n end\nend\n",
14
+ "spec/order_calculator_spec.rb": "# frozen_string_literal: true\n\nrequire_relative \"../lib/order_calculator\"\n\nRSpec.describe SampleProject::OrderCalculator do\n let(:items) do\n [\n { price: 10.0, quantity: 2 },\n { price: 5.0, quantity: 3 }\n ]\n end\n\n subject { described_class.new(items) }\n\n describe \"#calculate_total\" do\n it \"returns the sum of all item prices times quantities\" do\n expect(subject.calculate_total).to eq(35.0)\n end\n end\nend\n"
15
+ },
16
+ "assistant_messages": [
17
+ "",
18
+ "",
19
+ "",
20
+ "",
21
+ "已完成三个方法的重命名(`calculateTotal` → `calculate_total`、`calculateTotalWithTax` → `calculate_total_with_tax`、`applyDiscount` → `apply_discount`),并更新了 `lib/order_calculator.rb` 和 `spec/order_calculator_spec.rb` 中的所有引用。测试通过。"
22
+ ],
23
+ "total_assistant_chars": 215
24
+ },
25
+ "feature_addition": {
26
+ "success": true,
27
+ "iterations": 21,
28
+ "total_cost": 0.0,
29
+ "cost_source": "estimated",
30
+ "duration_seconds": 193.47,
31
+ "cache_creation_input_tokens": 0,
32
+ "cache_read_input_tokens": 185344,
33
+ "total_requests": 21,
34
+ "cache_hit_requests": 20,
35
+ "file_changes": {
36
+ "lib/api_handler.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class ApiHandler\n def initialize(store)\n @store = store\n end\n\n def handle_request(path, params)\n case path\n when \"/users\"\n list_users(params)\n when \"/orders\"\n list_orders(params)\n when \"/products\"\n list_products(params)\n else\n { error: \"Not found\", status: 404 }\n end\n end\n\n private\n\n def list_users(params)\n users = @store.query(\"SELECT * FROM users LIMIT #{params[:limit] || 10}\")\n { data: users, status: 200 }\n end\n\n def list_orders(params)\n orders = @store.all(:orders)\n { data: orders, status: 200 }\n end\n\n def list_products(params)\n products = @store.all(:products)\n page = (params[:page] || 1).to_i\n per_page = (params[:per_page] || 10).to_i\n page = 1 if page < 1\n per_page = 10 if per_page < 1\n offset = (page - 1) * per_page\n paginated = products[offset, per_page] || []\n { data: paginated, status: 200 }\n end\n end\nend\n"
37
+ },
38
+ "assistant_messages": [
39
+ "",
40
+ "",
41
+ "",
42
+ "",
43
+ "",
44
+ "",
45
+ "",
46
+ "",
47
+ "",
48
+ "",
49
+ "",
50
+ "",
51
+ "",
52
+ "",
53
+ "",
54
+ "",
55
+ "",
56
+ "",
57
+ "",
58
+ "",
59
+ "搞定了!`/products` 端点已经添加到 `ApiHandler`,支持通过 `page` 和 `per_page` 参数进行分页,并且测试全部通过。\n\n**改动摘要:**\n\n1. **`lib/api_handler.rb`** — 新增 `/products` 路由和 `list_products` 方法:\n - 从 store 获取所有 products\n - 支持 `page`(默认 1)和 `per_page`(默认 10)参数\n - 对越界页码返回空数组\n\n2. **`spec/api_handler_spec.rb`** — 新建测试文件,覆盖:\n - 默认分页返回全部数据\n - 自定义 `page` + `per_page` 分页\n - 第二页数据\n - 超出范围的页码返回 `[]`\n - 无效参数(0 或 nil)回退到默认值\n - 404 未知路径\n\n6 个测试全部绿灯通过。"
60
+ ],
61
+ "total_assistant_chars": 424
62
+ },
63
+ "refactoring": {
64
+ "success": true,
65
+ "iterations": 5,
66
+ "total_cost": 0.0,
67
+ "cost_source": "estimated",
68
+ "duration_seconds": 31.43,
69
+ "cache_creation_input_tokens": 0,
70
+ "cache_read_input_tokens": 28416,
71
+ "total_requests": 5,
72
+ "cache_hit_requests": 5,
73
+ "file_changes": {
74
+ "lib/order_calculator.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class OrderCalculator\n def initialize(items)\n @items = items\n end\n\n def calculateTotal\n @items.sum { |item| item[:price] * item[:quantity] }\n end\n\n def calculateTotalWithTax(tax_rate)\n calculateTotal * (1 + tax_rate)\n end\n\n def applyDiscount(discount_percent)\n calculateTotal * (1 - discount_percent / 100.0)\n end\n end\nend\n"
75
+ },
76
+ "assistant_messages": [
77
+ "",
78
+ "",
79
+ "",
80
+ "",
81
+ "Done. Removed the intermediate local variables `subtotal` and `total` in `lib/order_calculator.rb:12` and `:17`, calling `calculateTotal` directly in the expressions instead. No behavior change, just eliminated the duplicated first-step pattern."
82
+ ],
83
+ "total_assistant_chars": 245
84
+ },
85
+ "bug_fix": {
86
+ "success": true,
87
+ "iterations": 5,
88
+ "total_cost": 0.0,
89
+ "cost_source": "estimated",
90
+ "duration_seconds": 23.15,
91
+ "cache_creation_input_tokens": 0,
92
+ "cache_read_input_tokens": 25600,
93
+ "total_requests": 5,
94
+ "cache_hit_requests": 5,
95
+ "file_changes": {
96
+ "lib/user_renderer.rb": "# frozen_string_literal: true\n\nrequire \"cgi\"\n\nmodule SampleProject\n class UserRenderer\n def self.render_profile(user)\n <<~HTML\n <div class=\"profile\">\n <h1>#{escape(user[:name])}</h1>\n <p>#{escape(user[:bio])}</p>\n <a href=\"#{escape(user[:website])}\">Website</a>\n </div>\n HTML\n end\n\n def self.render_list(users)\n items = users.map { |u| \"<li>#{escape(u[:name])}</li>\" }.join\n \"<ul>#{items}</ul>\"\n end\n\n def self.escape(text)\n CGI.escapeHTML(text.to_s)\n end\n end\nend\n"
97
+ },
98
+ "assistant_messages": [
99
+ "",
100
+ "",
101
+ "",
102
+ "",
103
+ "已修复 `lib/user_renderer.rb` 中的 XSS 漏洞。\n\n改动点:\n- 新增 `require \"cgi\"` 和 `escape` 辅助方法,使用 `CGI.escapeHTML` 对 `user[:name]`、`user[:bio]`、`user[:website]` 进行 HTML 实体编码\n- 覆盖了 `render_profile` 和 `render_list` 中所有直接插值用户输入的位置\n\n现在像 `<script>alert(1)</script>` 这样的输入会被安全地转义为 `&lt;script&gt;alert(1)&lt;/script&gt;`,不再能注入恶意脚本。"
104
+ ],
105
+ "total_assistant_chars": 311
106
+ },
107
+ "git_workflow": {
108
+ "success": true,
109
+ "iterations": 8,
110
+ "total_cost": 0.0,
111
+ "cost_source": "estimated",
112
+ "duration_seconds": 31.94,
113
+ "cache_creation_input_tokens": 0,
114
+ "cache_read_input_tokens": 48640,
115
+ "total_requests": 8,
116
+ "cache_hit_requests": 8,
117
+ "file_changes": {},
118
+ "assistant_messages": [
119
+ "",
120
+ "",
121
+ "",
122
+ "",
123
+ "",
124
+ "",
125
+ "",
126
+ "修复完成。我在 `lib/user_renderer.rb` 中添加了 `escape_html` 方法,并对 `render_profile` 和 `render_list` 中所有用户输入进行了 HTML 转义,消除了 XSS 漏洞。然后使用 `git add lib/user_renderer.rb` 仅将该文件加入了暂存区。"
127
+ ],
128
+ "total_assistant_chars": 167
129
+ }
130
+ }
@@ -0,0 +1,441 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Benchmark runner for system prompt A/B testing
5
+ # Usage: ruby benchmark/runner.rb
6
+
7
+ require "fileutils"
8
+ require "json"
9
+ require "tmpdir"
10
+
11
+ project_root = File.expand_path("..", __dir__)
12
+ $LOAD_PATH.unshift File.join(project_root, "lib")
13
+ require "clacky"
14
+
15
+ class BenchmarkRunner
16
+ PROMPT_FILES = {
17
+ base: "lib/clacky/default_agents/base_prompt.md",
18
+ coding: "lib/clacky/default_agents/coding/system_prompt.md",
19
+ general: "lib/clacky/default_agents/general/system_prompt.md"
20
+ }.freeze
21
+
22
+ FIXTURE_DIR = File.expand_path("fixtures/sample_project", __dir__)
23
+ RESULTS_DIR = File.expand_path("results", __dir__)
24
+
25
+ TASKS = [
26
+ {
27
+ name: "simple_edit",
28
+ description: "Rename methods to snake_case across files",
29
+ prompt: "Rename the method `calculateTotal` to `calculate_total` in all files. Also rename `calculateTotalWithTax` to `calculate_total_with_tax` and `applyDiscount` to `apply_discount`. Update all references and the test file too.",
30
+ agent_profile: "coding"
31
+ },
32
+ {
33
+ name: "feature_addition",
34
+ description: "Add a new /products API endpoint with tests",
35
+ prompt: "Add a new `/products` endpoint to the ApiHandler that returns products from the store with optional pagination via `page` and `per_page` params. Also create a test file `spec/api_handler_spec.rb` with basic tests for this endpoint.",
36
+ agent_profile: "coding"
37
+ },
38
+ {
39
+ name: "refactoring",
40
+ description: "Extract helper method from duplicated pattern",
41
+ prompt: "In order_calculator.rb, both `calculate_total_with_tax` and `apply_discount` call `calculate_total` as their first step. Refactor to eliminate this duplication in the cleanest way possible. Do not over-engineer.",
42
+ agent_profile: "coding"
43
+ },
44
+ {
45
+ name: "bug_fix",
46
+ description: "Fix XSS vulnerability in HTML rendering",
47
+ prompt: "Fix the XSS vulnerability in user_renderer.rb. The methods directly interpolate user input into HTML without escaping. Make the rendering safe against XSS attacks.",
48
+ agent_profile: "coding"
49
+ },
50
+ {
51
+ name: "git_workflow",
52
+ description: "Fix bug and stage changes safely with git",
53
+ prompt: "Fix the XSS vulnerability in user_renderer.rb, then use git to stage only the changed file for commit. Do NOT stage all files.",
54
+ agent_profile: "coding"
55
+ }
56
+ ].freeze
57
+
58
+ def initialize
59
+ @project_root = File.expand_path("..", __dir__)
60
+ @original_prompts = read_current_prompts
61
+ FileUtils.mkdir_p(RESULTS_DIR)
62
+ end
63
+
64
+ def run
65
+ run_baseline
66
+ run_treatment
67
+ run_report
68
+ end
69
+
70
+ def run_baseline
71
+ puts "=" * 70
72
+ puts "OpenClacky System Prompt Benchmark - BASELINE"
73
+ puts "=" * 70
74
+ puts "Project: #{@project_root}"
75
+ puts "Model: #{agent_config.model_name}"
76
+ puts "Tasks: #{TASKS.length}"
77
+ puts
78
+
79
+ unless git_clean?
80
+ puts "WARNING: Prompt files have uncommitted changes. Baseline may not reflect main."
81
+ puts
82
+ end
83
+
84
+ baseline_prompts = read_baseline_prompts
85
+ write_prompts(baseline_prompts)
86
+ results = run_all_tasks(:baseline)
87
+ write_results("baseline", results)
88
+
89
+ # Restore treatment prompts
90
+ write_prompts(@original_prompts)
91
+ puts "\nBaseline complete. Results saved."
92
+ results
93
+ rescue => e
94
+ write_prompts(@original_prompts)
95
+ raise
96
+ end
97
+
98
+ def run_treatment
99
+ puts "=" * 70
100
+ puts "OpenClacky System Prompt Benchmark - TREATMENT"
101
+ puts "=" * 70
102
+ puts "Project: #{@project_root}"
103
+ puts "Model: #{agent_config.model_name}"
104
+ puts "Tasks: #{TASKS.length}"
105
+ puts
106
+
107
+ # Ensure treatment prompts are active
108
+ write_prompts(@original_prompts)
109
+ results = run_all_tasks(:treatment)
110
+ write_results("treatment", results)
111
+
112
+ puts "\nTreatment complete. Results saved."
113
+ results
114
+ end
115
+
116
+ def run_report
117
+ baseline_file = Dir.glob(File.join(RESULTS_DIR, "baseline_*.json")).max
118
+ treatment_file = Dir.glob(File.join(RESULTS_DIR, "treatment_*.json")).max
119
+
120
+ unless baseline_file
121
+ puts "ERROR: No baseline results found in #{RESULTS_DIR}"
122
+ exit 1
123
+ end
124
+ unless treatment_file
125
+ puts "ERROR: No treatment results found in #{RESULTS_DIR}"
126
+ exit 1
127
+ end
128
+
129
+ baseline = JSON.parse(File.read(baseline_file), symbolize_names: true)
130
+ treatment = JSON.parse(File.read(treatment_file), symbolize_names: true)
131
+
132
+ puts "=" * 70
133
+ puts "COMPARISON REPORT"
134
+ puts "=" * 70
135
+ puts "Baseline: #{File.basename(baseline_file)}"
136
+ puts "Treatment: #{File.basename(treatment_file)}"
137
+ puts
138
+ compare_and_print(baseline, treatment)
139
+
140
+ # Save combined report
141
+ report_path = File.join(RESULTS_DIR, "report_#{timestamp}.json")
142
+ File.write(report_path, JSON.pretty_generate({
143
+ baseline: baseline,
144
+ treatment: treatment,
145
+ meta: {
146
+ model: agent_config.model_name,
147
+ timestamp: Time.now.iso8601,
148
+ tasks: TASKS.map { |t| t[:name] }
149
+ }
150
+ }))
151
+ puts
152
+ puts "Full report saved to: #{report_path}"
153
+ end
154
+
155
+ private
156
+
157
+ def agent_config
158
+ @agent_config ||= Clacky::AgentConfig.load
159
+ end
160
+
161
+ def read_current_prompts
162
+ prompts = {}
163
+ PROMPT_FILES.each do |key, rel_path|
164
+ full_path = File.join(@project_root, rel_path)
165
+ prompts[key] = File.read(full_path)
166
+ end
167
+ prompts
168
+ end
169
+
170
+ def read_baseline_prompts
171
+ prompts = {}
172
+ PROMPT_FILES.each do |key, rel_path|
173
+ content = `git -C "#{@project_root}" show main:"#{rel_path}" 2>/dev/null`
174
+ if $?.success? && !content.empty?
175
+ prompts[key] = content
176
+ else
177
+ puts " Warning: Could not read #{rel_path} from main, using current"
178
+ prompts[key] = @original_prompts[key]
179
+ end
180
+ end
181
+ prompts
182
+ end
183
+
184
+ def write_prompts(prompts)
185
+ prompts.each do |key, content|
186
+ rel_path = PROMPT_FILES[key]
187
+ full_path = File.join(@project_root, rel_path)
188
+ File.write(full_path, content)
189
+ end
190
+ end
191
+
192
+ def git_clean?
193
+ PROMPT_FILES.values.all? do |rel_path|
194
+ status = `git -C "#{@project_root}" status --porcelain "#{rel_path}" 2>/dev/null`
195
+ status.strip.empty?
196
+ end
197
+ end
198
+
199
+ def run_all_tasks(variant)
200
+ results = {}
201
+ TASKS.each_with_index do |task, idx|
202
+ puts
203
+ puts "[#{idx + 1}/#{TASKS.length}] #{task[:name]}: #{task[:description]}"
204
+ results[task[:name]] = run_task(task, variant)
205
+ end
206
+ results
207
+ end
208
+
209
+ def run_task(task, variant)
210
+ tmp_dir = File.join(Dir.tmpdir, "clacky_benchmark_#{variant}_#{task[:name]}_#{Process.pid}_#{Time.now.to_i}")
211
+ FileUtils.cp_r(FIXTURE_DIR, tmp_dir)
212
+
213
+ # Ensure tmp_dir is a git repo (cp_r preserves .git)
214
+ Dir.chdir(tmp_dir) do
215
+ system("git config user.email 'benchmark@test.com' >/dev/null 2>&1")
216
+ system("git config user.name 'Benchmark' >/dev/null 2>&1")
217
+ end
218
+
219
+ config = agent_config.dup
220
+ config.permission_mode = :auto_approve
221
+
222
+ client = Clacky::Client.new(
223
+ config.api_key,
224
+ base_url: config.base_url,
225
+ model: config.model_name,
226
+ anthropic_format: config.anthropic_format?
227
+ )
228
+
229
+ agent = Clacky::Agent.new(
230
+ client, config,
231
+ working_dir: tmp_dir,
232
+ ui: BenchmarkUI.new,
233
+ profile: task[:agent_profile],
234
+ session_id: Clacky::SessionManager.generate_id,
235
+ source: :manual
236
+ )
237
+
238
+ start_time = Time.now
239
+ agent.run(task[:prompt])
240
+ duration = Time.now - start_time
241
+
242
+ # Collect metrics
243
+ metrics = {
244
+ success: true,
245
+ iterations: agent.iterations,
246
+ total_cost: agent.total_cost.round(6),
247
+ cost_source: agent.cost_source.to_s,
248
+ duration_seconds: duration.round(2),
249
+ cache_creation_input_tokens: agent.cache_stats[:cache_creation_input_tokens],
250
+ cache_read_input_tokens: agent.cache_stats[:cache_read_input_tokens],
251
+ total_requests: agent.cache_stats[:total_requests],
252
+ cache_hit_requests: agent.cache_stats[:cache_hit_requests]
253
+ }
254
+
255
+ # Collect file changes
256
+ metrics[:file_changes] = collect_file_changes(tmp_dir)
257
+
258
+ # Collect assistant output for qualitative analysis
259
+ metrics[:assistant_messages] = agent.history.to_a
260
+ .select { |m| m[:role] == "assistant" }
261
+ .map { |m| extract_text(m[:content]) }
262
+ .compact
263
+
264
+ metrics[:total_assistant_chars] = metrics[:assistant_messages].join.length
265
+
266
+ # Cleanup
267
+ FileUtils.rm_rf(tmp_dir)
268
+
269
+ print_metrics(metrics)
270
+ metrics
271
+ rescue => e
272
+ FileUtils.rm_rf(tmp_dir) if defined?(tmp_dir) && tmp_dir
273
+ error_result = {
274
+ success: false,
275
+ error: e.message,
276
+ error_class: e.class.name,
277
+ iterations: defined?(agent) ? agent&.iterations : 0,
278
+ total_cost: defined?(agent) ? agent&.total_cost&.round(6) : 0
279
+ }
280
+ puts " ERROR: #{e.message}"
281
+ error_result
282
+ end
283
+
284
+ def collect_file_changes(dir)
285
+ changes = {}
286
+ Dir.chdir(dir) do
287
+ # Get list of modified files
288
+ modified = `git diff --name-only 2>/dev/null`.strip.split("\n").reject(&:empty?)
289
+ modified.each do |f|
290
+ next unless File.exist?(f)
291
+ changes[f] = File.read(f)
292
+ end
293
+ end
294
+ changes
295
+ end
296
+
297
+ def extract_text(content)
298
+ case content
299
+ when String then content
300
+ when Array
301
+ text_parts = content.select { |p| p.is_a?(Hash) && p[:type] == "text" }
302
+ text_parts.map { |p| p[:text] }.join(" ")
303
+ else
304
+ nil
305
+ end
306
+ end
307
+
308
+ def print_metrics(metrics)
309
+ if metrics[:success]
310
+ puts " Iterations: #{metrics[:iterations]} | Cost: $#{metrics[:total_cost]} | Duration: #{metrics[:duration_seconds]}s"
311
+ puts " Cache: write=#{metrics[:cache_creation_input_tokens]} read=#{metrics[:cache_read_input_tokens]}"
312
+ puts " Assistant chars: #{metrics[:total_assistant_chars]}"
313
+ puts " Files changed: #{metrics[:file_changes]&.keys&.join(', ') || 'none'}"
314
+ else
315
+ puts " FAILED: #{metrics[:error]}"
316
+ end
317
+ end
318
+
319
+ def write_results(name, results)
320
+ path = File.join(RESULTS_DIR, "#{name}_#{timestamp}.json")
321
+ File.write(path, JSON.pretty_generate(results))
322
+ puts "\n#{name.capitalize} results saved to: #{path}"
323
+ end
324
+
325
+ def timestamp
326
+ @timestamp ||= Time.now.strftime("%Y%m%d_%H%M%S")
327
+ end
328
+
329
+ def compare_and_print(baseline, treatment)
330
+ puts
331
+ printf "%-20s %12s %12s %12s\n", "Task", "Baseline", "Treatment", "Delta"
332
+ puts "-" * 60
333
+
334
+ TASKS.each do |task|
335
+ task_key = task[:name].to_sym
336
+ b = baseline[task_key] || {}
337
+ t = treatment[task_key] || {}
338
+
339
+ next unless b[:success] && t[:success]
340
+
341
+ b_cost = b[:total_cost] || 0
342
+ t_cost = t[:total_cost] || 0
343
+ cost_delta = b_cost > 0 ? "#{(t_cost / b_cost * 100).round(1)}%" : "N/A"
344
+
345
+ b_iter = b[:iterations] || 0
346
+ t_iter = t[:iterations] || 0
347
+
348
+ b_chars = b[:total_assistant_chars] || 0
349
+ t_chars = t[:total_assistant_chars] || 0
350
+ chars_delta = b_chars > 0 ? "#{(t_chars / b_chars.to_f * 100).round(1)}%" : "N/A"
351
+
352
+ printf "%-20s\n", task[:name]
353
+ printf " Cost: $%-10.6f $%-10.6f %s\n", b_cost, t_cost, cost_delta
354
+ printf " Iterations: %-11d %-11d %s\n", b_iter, t_iter, "#{t_iter - b_iter > 0 ? '+' : ''}#{t_iter - b_iter}"
355
+ printf " Assistant chars: %-11d %-11d %s\n", b_chars, t_chars, chars_delta
356
+ puts
357
+ end
358
+
359
+ # Totals
360
+ b_total_cost = 0
361
+ t_total_cost = 0
362
+ b_total_iter = 0
363
+ t_total_iter = 0
364
+ b_total_chars = 0
365
+ t_total_chars = 0
366
+
367
+ TASKS.each do |task|
368
+ task_key = task[:name].to_sym
369
+ b = baseline[task_key] || {}
370
+ t = treatment[task_key] || {}
371
+ next unless b[:success] && t[:success]
372
+
373
+ b_total_cost += b[:total_cost] || 0
374
+ t_total_cost += t[:total_cost] || 0
375
+ b_total_iter += b[:iterations] || 0
376
+ t_total_iter += t[:iterations] || 0
377
+ b_total_chars += b[:total_assistant_chars] || 0
378
+ t_total_chars += t[:total_assistant_chars] || 0
379
+ end
380
+
381
+ puts "-" * 60
382
+ printf "%-20s\n", "TOTALS"
383
+ cost_pct = b_total_cost > 0 ? (t_total_cost / b_total_cost * 100).round(1) : 0
384
+ printf " Total cost: $%-10.6f $%-10.6f %s%%\n", b_total_cost, t_total_cost, cost_pct
385
+ printf " Total iterations: %-11d %-11d %+d\n", b_total_iter, t_total_iter, t_total_iter - b_total_iter
386
+ chars_pct = b_total_chars > 0 ? (t_total_chars / b_total_chars.to_f * 100).round(1) : 0
387
+ printf " Total chars: %-11d %-11d %s%%\n", b_total_chars, t_total_chars, chars_pct
388
+ end
389
+
390
+ # Minimal UI that captures output without displaying
391
+ class BenchmarkUI
392
+ def log(msg, level: :info); end
393
+ def show_assistant_message(content, files: []); end
394
+ def show_tool_call(name, args); end
395
+ def show_tool_result(result); end
396
+ def show_tool_stdout(lines); end
397
+ def show_tool_error(error); end
398
+ def show_tool_args(formatted_args); end
399
+ def show_file_write_preview(path, is_new_file:); end
400
+ def show_file_edit_preview(path); end
401
+ def show_file_error(error_message); end
402
+ def show_shell_preview(command); end
403
+ def show_diff(old_content, new_content, max_lines: 50); end
404
+ def show_token_usage(token_data); end
405
+ def show_complete(iterations:, cost:, duration: nil, cache_stats: nil, awaiting_user_feedback: false, cost_source: nil); end
406
+ def append_output(content); end
407
+ def show_info(message, prefix_newline: true); end
408
+ def show_warning(message); end
409
+ def show_error(message); end
410
+ def show_success(message); end
411
+ def show_progress(message = nil, prefix_newline: true, progress_type: "thinking", phase: "active", metadata: {}); end
412
+ def start_progress(message: nil, style: :primary, quiet_on_fast_finish: false); end
413
+ def with_progress(message: nil, style: :primary, quiet_on_fast_finish: false)
414
+ yield if block_given?
415
+ end
416
+ def update_sessionbar(tasks: nil, cost: nil, cost_source: nil, status: nil, latency: nil); end
417
+ def update_todos(todos); end
418
+ def set_working_status; end
419
+ def set_idle_status; end
420
+ def request_confirmation(message, default: true); end
421
+ def clear_input; end
422
+ def set_input_tips(message, type: :info); end
423
+ def stop; end
424
+ end
425
+ end
426
+
427
+ if __FILE__ == $0
428
+ variant = ARGV[0]&.downcase
429
+ runner = BenchmarkRunner.new
430
+
431
+ case variant
432
+ when "baseline"
433
+ runner.run_baseline
434
+ when "treatment"
435
+ runner.run_treatment
436
+ when "report"
437
+ runner.run_report
438
+ else
439
+ runner.run
440
+ end
441
+ end