openclacky 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -1
- data/benchmark/fixtures/sample_project/Gemfile +3 -0
- data/benchmark/fixtures/sample_project/lib/api_handler.rb +32 -0
- data/benchmark/fixtures/sample_project/lib/order_calculator.rb +23 -0
- data/benchmark/fixtures/sample_project/lib/user_renderer.rb +20 -0
- data/benchmark/fixtures/sample_project/spec/order_calculator_spec.rb +20 -0
- data/benchmark/results/EVALUATION_REPORT.md +165 -0
- data/benchmark/results/baseline_20260511_174424.json +128 -0
- data/benchmark/results/report_20260511_175256.json +271 -0
- data/benchmark/results/report_20260511_175444.json +271 -0
- data/benchmark/results/treatment_20260511_175103.json +130 -0
- data/benchmark/runner.rb +441 -0
- data/docs/proposals/2026-05-11-system-prompt-alignment.md +325 -0
- data/docs/proposals/2026-05-12-memory-mechanism-optimization.md +89 -0
- data/lib/clacky/agent/cost_tracker.rb +8 -2
- data/lib/clacky/agent/memory_updater.rb +41 -30
- data/lib/clacky/agent/skill_manager.rb +5 -2
- data/lib/clacky/agent/skill_reflector.rb +10 -1
- data/lib/clacky/agent.rb +4 -0
- data/lib/clacky/client.rb +15 -0
- data/lib/clacky/default_agents/base_prompt.md +20 -20
- data/lib/clacky/default_agents/coding/system_prompt.md +51 -1
- data/lib/clacky/default_skills/channel-setup/SKILL.md +56 -2
- data/lib/clacky/default_skills/channel-setup/import_lark_skills.rb +97 -0
- data/lib/clacky/default_skills/onboard/SKILL.md +1 -1
- data/lib/clacky/default_skills/persist-memory/SKILL.md +59 -0
- data/lib/clacky/providers.rb +48 -6
- data/lib/clacky/server/http_server.rb +41 -1
- data/lib/clacky/utils/file_processor.rb +71 -0
- data/lib/clacky/version.rb +1 -1
- metadata +31 -2
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
{
|
|
2
|
+
"simple_edit": {
|
|
3
|
+
"success": true,
|
|
4
|
+
"iterations": 5,
|
|
5
|
+
"total_cost": 0.0,
|
|
6
|
+
"cost_source": "estimated",
|
|
7
|
+
"duration_seconds": 28.25,
|
|
8
|
+
"cache_creation_input_tokens": 0,
|
|
9
|
+
"cache_read_input_tokens": 26624,
|
|
10
|
+
"total_requests": 5,
|
|
11
|
+
"cache_hit_requests": 5,
|
|
12
|
+
"file_changes": {
|
|
13
|
+
"lib/order_calculator.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class OrderCalculator\n def initialize(items)\n @items = items\n end\n\n def calculate_total\n @items.sum { |item| item[:price] * item[:quantity] }\n end\n\n def calculate_total_with_tax(tax_rate)\n subtotal = calculate_total\n subtotal * (1 + tax_rate)\n end\n\n def apply_discount(discount_percent)\n total = calculate_total\n total * (1 - discount_percent / 100.0)\n end\n end\nend\n",
|
|
14
|
+
"spec/order_calculator_spec.rb": "# frozen_string_literal: true\n\nrequire_relative \"../lib/order_calculator\"\n\nRSpec.describe SampleProject::OrderCalculator do\n let(:items) do\n [\n { price: 10.0, quantity: 2 },\n { price: 5.0, quantity: 3 }\n ]\n end\n\n subject { described_class.new(items) }\n\n describe \"#calculate_total\" do\n it \"returns the sum of all item prices times quantities\" do\n expect(subject.calculate_total).to eq(35.0)\n end\n end\nend\n"
|
|
15
|
+
},
|
|
16
|
+
"assistant_messages": [
|
|
17
|
+
"",
|
|
18
|
+
"",
|
|
19
|
+
"",
|
|
20
|
+
"",
|
|
21
|
+
"已完成三个方法的重命名(`calculateTotal` → `calculate_total`、`calculateTotalWithTax` → `calculate_total_with_tax`、`applyDiscount` → `apply_discount`),并更新了 `lib/order_calculator.rb` 和 `spec/order_calculator_spec.rb` 中的所有引用。测试通过。"
|
|
22
|
+
],
|
|
23
|
+
"total_assistant_chars": 215
|
|
24
|
+
},
|
|
25
|
+
"feature_addition": {
|
|
26
|
+
"success": true,
|
|
27
|
+
"iterations": 21,
|
|
28
|
+
"total_cost": 0.0,
|
|
29
|
+
"cost_source": "estimated",
|
|
30
|
+
"duration_seconds": 193.47,
|
|
31
|
+
"cache_creation_input_tokens": 0,
|
|
32
|
+
"cache_read_input_tokens": 185344,
|
|
33
|
+
"total_requests": 21,
|
|
34
|
+
"cache_hit_requests": 20,
|
|
35
|
+
"file_changes": {
|
|
36
|
+
"lib/api_handler.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class ApiHandler\n def initialize(store)\n @store = store\n end\n\n def handle_request(path, params)\n case path\n when \"/users\"\n list_users(params)\n when \"/orders\"\n list_orders(params)\n when \"/products\"\n list_products(params)\n else\n { error: \"Not found\", status: 404 }\n end\n end\n\n private\n\n def list_users(params)\n users = @store.query(\"SELECT * FROM users LIMIT #{params[:limit] || 10}\")\n { data: users, status: 200 }\n end\n\n def list_orders(params)\n orders = @store.all(:orders)\n { data: orders, status: 200 }\n end\n\n def list_products(params)\n products = @store.all(:products)\n page = (params[:page] || 1).to_i\n per_page = (params[:per_page] || 10).to_i\n page = 1 if page < 1\n per_page = 10 if per_page < 1\n offset = (page - 1) * per_page\n paginated = products[offset, per_page] || []\n { data: paginated, status: 200 }\n end\n end\nend\n"
|
|
37
|
+
},
|
|
38
|
+
"assistant_messages": [
|
|
39
|
+
"",
|
|
40
|
+
"",
|
|
41
|
+
"",
|
|
42
|
+
"",
|
|
43
|
+
"",
|
|
44
|
+
"",
|
|
45
|
+
"",
|
|
46
|
+
"",
|
|
47
|
+
"",
|
|
48
|
+
"",
|
|
49
|
+
"",
|
|
50
|
+
"",
|
|
51
|
+
"",
|
|
52
|
+
"",
|
|
53
|
+
"",
|
|
54
|
+
"",
|
|
55
|
+
"",
|
|
56
|
+
"",
|
|
57
|
+
"",
|
|
58
|
+
"",
|
|
59
|
+
"搞定了!`/products` 端点已经添加到 `ApiHandler`,支持通过 `page` 和 `per_page` 参数进行分页,并且测试全部通过。\n\n**改动摘要:**\n\n1. **`lib/api_handler.rb`** — 新增 `/products` 路由和 `list_products` 方法:\n - 从 store 获取所有 products\n - 支持 `page`(默认 1)和 `per_page`(默认 10)参数\n - 对越界页码返回空数组\n\n2. **`spec/api_handler_spec.rb`** — 新建测试文件,覆盖:\n - 默认分页返回全部数据\n - 自定义 `page` + `per_page` 分页\n - 第二页数据\n - 超出范围的页码返回 `[]`\n - 无效参数(0 或 nil)回退到默认值\n - 404 未知路径\n\n6 个测试全部绿灯通过。"
|
|
60
|
+
],
|
|
61
|
+
"total_assistant_chars": 424
|
|
62
|
+
},
|
|
63
|
+
"refactoring": {
|
|
64
|
+
"success": true,
|
|
65
|
+
"iterations": 5,
|
|
66
|
+
"total_cost": 0.0,
|
|
67
|
+
"cost_source": "estimated",
|
|
68
|
+
"duration_seconds": 31.43,
|
|
69
|
+
"cache_creation_input_tokens": 0,
|
|
70
|
+
"cache_read_input_tokens": 28416,
|
|
71
|
+
"total_requests": 5,
|
|
72
|
+
"cache_hit_requests": 5,
|
|
73
|
+
"file_changes": {
|
|
74
|
+
"lib/order_calculator.rb": "# frozen_string_literal: true\n\nmodule SampleProject\n class OrderCalculator\n def initialize(items)\n @items = items\n end\n\n def calculateTotal\n @items.sum { |item| item[:price] * item[:quantity] }\n end\n\n def calculateTotalWithTax(tax_rate)\n calculateTotal * (1 + tax_rate)\n end\n\n def applyDiscount(discount_percent)\n calculateTotal * (1 - discount_percent / 100.0)\n end\n end\nend\n"
|
|
75
|
+
},
|
|
76
|
+
"assistant_messages": [
|
|
77
|
+
"",
|
|
78
|
+
"",
|
|
79
|
+
"",
|
|
80
|
+
"",
|
|
81
|
+
"Done. Removed the intermediate local variables `subtotal` and `total` in `lib/order_calculator.rb:12` and `:17`, calling `calculateTotal` directly in the expressions instead. No behavior change, just eliminated the duplicated first-step pattern."
|
|
82
|
+
],
|
|
83
|
+
"total_assistant_chars": 245
|
|
84
|
+
},
|
|
85
|
+
"bug_fix": {
|
|
86
|
+
"success": true,
|
|
87
|
+
"iterations": 5,
|
|
88
|
+
"total_cost": 0.0,
|
|
89
|
+
"cost_source": "estimated",
|
|
90
|
+
"duration_seconds": 23.15,
|
|
91
|
+
"cache_creation_input_tokens": 0,
|
|
92
|
+
"cache_read_input_tokens": 25600,
|
|
93
|
+
"total_requests": 5,
|
|
94
|
+
"cache_hit_requests": 5,
|
|
95
|
+
"file_changes": {
|
|
96
|
+
"lib/user_renderer.rb": "# frozen_string_literal: true\n\nrequire \"cgi\"\n\nmodule SampleProject\n class UserRenderer\n def self.render_profile(user)\n <<~HTML\n <div class=\"profile\">\n <h1>#{escape(user[:name])}</h1>\n <p>#{escape(user[:bio])}</p>\n <a href=\"#{escape(user[:website])}\">Website</a>\n </div>\n HTML\n end\n\n def self.render_list(users)\n items = users.map { |u| \"<li>#{escape(u[:name])}</li>\" }.join\n \"<ul>#{items}</ul>\"\n end\n\n def self.escape(text)\n CGI.escapeHTML(text.to_s)\n end\n end\nend\n"
|
|
97
|
+
},
|
|
98
|
+
"assistant_messages": [
|
|
99
|
+
"",
|
|
100
|
+
"",
|
|
101
|
+
"",
|
|
102
|
+
"",
|
|
103
|
+
"已修复 `lib/user_renderer.rb` 中的 XSS 漏洞。\n\n改动点:\n- 新增 `require \"cgi\"` 和 `escape` 辅助方法,使用 `CGI.escapeHTML` 对 `user[:name]`、`user[:bio]`、`user[:website]` 进行 HTML 实体编码\n- 覆盖了 `render_profile` 和 `render_list` 中所有直接插值用户输入的位置\n\n现在像 `<script>alert(1)</script>` 这样的输入会被安全地转义为 `<script>alert(1)</script>`,不再能注入恶意脚本。"
|
|
104
|
+
],
|
|
105
|
+
"total_assistant_chars": 311
|
|
106
|
+
},
|
|
107
|
+
"git_workflow": {
|
|
108
|
+
"success": true,
|
|
109
|
+
"iterations": 8,
|
|
110
|
+
"total_cost": 0.0,
|
|
111
|
+
"cost_source": "estimated",
|
|
112
|
+
"duration_seconds": 31.94,
|
|
113
|
+
"cache_creation_input_tokens": 0,
|
|
114
|
+
"cache_read_input_tokens": 48640,
|
|
115
|
+
"total_requests": 8,
|
|
116
|
+
"cache_hit_requests": 8,
|
|
117
|
+
"file_changes": {},
|
|
118
|
+
"assistant_messages": [
|
|
119
|
+
"",
|
|
120
|
+
"",
|
|
121
|
+
"",
|
|
122
|
+
"",
|
|
123
|
+
"",
|
|
124
|
+
"",
|
|
125
|
+
"",
|
|
126
|
+
"修复完成。我在 `lib/user_renderer.rb` 中添加了 `escape_html` 方法,并对 `render_profile` 和 `render_list` 中所有用户输入进行了 HTML 转义,消除了 XSS 漏洞。然后使用 `git add lib/user_renderer.rb` 仅将该文件加入了暂存区。"
|
|
127
|
+
],
|
|
128
|
+
"total_assistant_chars": 167
|
|
129
|
+
}
|
|
130
|
+
}
|
data/benchmark/runner.rb
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Benchmark runner for system prompt A/B testing
|
|
5
|
+
# Usage: ruby benchmark/runner.rb
|
|
6
|
+
|
|
7
|
+
require "fileutils"
|
|
8
|
+
require "json"
|
|
9
|
+
require "tmpdir"
|
|
10
|
+
|
|
11
|
+
project_root = File.expand_path("..", __dir__)
|
|
12
|
+
$LOAD_PATH.unshift File.join(project_root, "lib")
|
|
13
|
+
require "clacky"
|
|
14
|
+
|
|
15
|
+
class BenchmarkRunner
|
|
16
|
+
PROMPT_FILES = {
|
|
17
|
+
base: "lib/clacky/default_agents/base_prompt.md",
|
|
18
|
+
coding: "lib/clacky/default_agents/coding/system_prompt.md",
|
|
19
|
+
general: "lib/clacky/default_agents/general/system_prompt.md"
|
|
20
|
+
}.freeze
|
|
21
|
+
|
|
22
|
+
FIXTURE_DIR = File.expand_path("fixtures/sample_project", __dir__)
|
|
23
|
+
RESULTS_DIR = File.expand_path("results", __dir__)
|
|
24
|
+
|
|
25
|
+
TASKS = [
|
|
26
|
+
{
|
|
27
|
+
name: "simple_edit",
|
|
28
|
+
description: "Rename methods to snake_case across files",
|
|
29
|
+
prompt: "Rename the method `calculateTotal` to `calculate_total` in all files. Also rename `calculateTotalWithTax` to `calculate_total_with_tax` and `applyDiscount` to `apply_discount`. Update all references and the test file too.",
|
|
30
|
+
agent_profile: "coding"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
name: "feature_addition",
|
|
34
|
+
description: "Add a new /products API endpoint with tests",
|
|
35
|
+
prompt: "Add a new `/products` endpoint to the ApiHandler that returns products from the store with optional pagination via `page` and `per_page` params. Also create a test file `spec/api_handler_spec.rb` with basic tests for this endpoint.",
|
|
36
|
+
agent_profile: "coding"
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
name: "refactoring",
|
|
40
|
+
description: "Extract helper method from duplicated pattern",
|
|
41
|
+
prompt: "In order_calculator.rb, both `calculate_total_with_tax` and `apply_discount` call `calculate_total` as their first step. Refactor to eliminate this duplication in the cleanest way possible. Do not over-engineer.",
|
|
42
|
+
agent_profile: "coding"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
name: "bug_fix",
|
|
46
|
+
description: "Fix XSS vulnerability in HTML rendering",
|
|
47
|
+
prompt: "Fix the XSS vulnerability in user_renderer.rb. The methods directly interpolate user input into HTML without escaping. Make the rendering safe against XSS attacks.",
|
|
48
|
+
agent_profile: "coding"
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
name: "git_workflow",
|
|
52
|
+
description: "Fix bug and stage changes safely with git",
|
|
53
|
+
prompt: "Fix the XSS vulnerability in user_renderer.rb, then use git to stage only the changed file for commit. Do NOT stage all files.",
|
|
54
|
+
agent_profile: "coding"
|
|
55
|
+
}
|
|
56
|
+
].freeze
|
|
57
|
+
|
|
58
|
+
def initialize
|
|
59
|
+
@project_root = File.expand_path("..", __dir__)
|
|
60
|
+
@original_prompts = read_current_prompts
|
|
61
|
+
FileUtils.mkdir_p(RESULTS_DIR)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def run
|
|
65
|
+
run_baseline
|
|
66
|
+
run_treatment
|
|
67
|
+
run_report
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def run_baseline
|
|
71
|
+
puts "=" * 70
|
|
72
|
+
puts "OpenClacky System Prompt Benchmark - BASELINE"
|
|
73
|
+
puts "=" * 70
|
|
74
|
+
puts "Project: #{@project_root}"
|
|
75
|
+
puts "Model: #{agent_config.model_name}"
|
|
76
|
+
puts "Tasks: #{TASKS.length}"
|
|
77
|
+
puts
|
|
78
|
+
|
|
79
|
+
unless git_clean?
|
|
80
|
+
puts "WARNING: Prompt files have uncommitted changes. Baseline may not reflect main."
|
|
81
|
+
puts
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
baseline_prompts = read_baseline_prompts
|
|
85
|
+
write_prompts(baseline_prompts)
|
|
86
|
+
results = run_all_tasks(:baseline)
|
|
87
|
+
write_results("baseline", results)
|
|
88
|
+
|
|
89
|
+
# Restore treatment prompts
|
|
90
|
+
write_prompts(@original_prompts)
|
|
91
|
+
puts "\nBaseline complete. Results saved."
|
|
92
|
+
results
|
|
93
|
+
rescue => e
|
|
94
|
+
write_prompts(@original_prompts)
|
|
95
|
+
raise
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def run_treatment
|
|
99
|
+
puts "=" * 70
|
|
100
|
+
puts "OpenClacky System Prompt Benchmark - TREATMENT"
|
|
101
|
+
puts "=" * 70
|
|
102
|
+
puts "Project: #{@project_root}"
|
|
103
|
+
puts "Model: #{agent_config.model_name}"
|
|
104
|
+
puts "Tasks: #{TASKS.length}"
|
|
105
|
+
puts
|
|
106
|
+
|
|
107
|
+
# Ensure treatment prompts are active
|
|
108
|
+
write_prompts(@original_prompts)
|
|
109
|
+
results = run_all_tasks(:treatment)
|
|
110
|
+
write_results("treatment", results)
|
|
111
|
+
|
|
112
|
+
puts "\nTreatment complete. Results saved."
|
|
113
|
+
results
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def run_report
|
|
117
|
+
baseline_file = Dir.glob(File.join(RESULTS_DIR, "baseline_*.json")).max
|
|
118
|
+
treatment_file = Dir.glob(File.join(RESULTS_DIR, "treatment_*.json")).max
|
|
119
|
+
|
|
120
|
+
unless baseline_file
|
|
121
|
+
puts "ERROR: No baseline results found in #{RESULTS_DIR}"
|
|
122
|
+
exit 1
|
|
123
|
+
end
|
|
124
|
+
unless treatment_file
|
|
125
|
+
puts "ERROR: No treatment results found in #{RESULTS_DIR}"
|
|
126
|
+
exit 1
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
baseline = JSON.parse(File.read(baseline_file), symbolize_names: true)
|
|
130
|
+
treatment = JSON.parse(File.read(treatment_file), symbolize_names: true)
|
|
131
|
+
|
|
132
|
+
puts "=" * 70
|
|
133
|
+
puts "COMPARISON REPORT"
|
|
134
|
+
puts "=" * 70
|
|
135
|
+
puts "Baseline: #{File.basename(baseline_file)}"
|
|
136
|
+
puts "Treatment: #{File.basename(treatment_file)}"
|
|
137
|
+
puts
|
|
138
|
+
compare_and_print(baseline, treatment)
|
|
139
|
+
|
|
140
|
+
# Save combined report
|
|
141
|
+
report_path = File.join(RESULTS_DIR, "report_#{timestamp}.json")
|
|
142
|
+
File.write(report_path, JSON.pretty_generate({
|
|
143
|
+
baseline: baseline,
|
|
144
|
+
treatment: treatment,
|
|
145
|
+
meta: {
|
|
146
|
+
model: agent_config.model_name,
|
|
147
|
+
timestamp: Time.now.iso8601,
|
|
148
|
+
tasks: TASKS.map { |t| t[:name] }
|
|
149
|
+
}
|
|
150
|
+
}))
|
|
151
|
+
puts
|
|
152
|
+
puts "Full report saved to: #{report_path}"
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
private
|
|
156
|
+
|
|
157
|
+
def agent_config
|
|
158
|
+
@agent_config ||= Clacky::AgentConfig.load
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
def read_current_prompts
|
|
162
|
+
prompts = {}
|
|
163
|
+
PROMPT_FILES.each do |key, rel_path|
|
|
164
|
+
full_path = File.join(@project_root, rel_path)
|
|
165
|
+
prompts[key] = File.read(full_path)
|
|
166
|
+
end
|
|
167
|
+
prompts
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def read_baseline_prompts
|
|
171
|
+
prompts = {}
|
|
172
|
+
PROMPT_FILES.each do |key, rel_path|
|
|
173
|
+
content = `git -C "#{@project_root}" show main:"#{rel_path}" 2>/dev/null`
|
|
174
|
+
if $?.success? && !content.empty?
|
|
175
|
+
prompts[key] = content
|
|
176
|
+
else
|
|
177
|
+
puts " Warning: Could not read #{rel_path} from main, using current"
|
|
178
|
+
prompts[key] = @original_prompts[key]
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
prompts
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def write_prompts(prompts)
|
|
185
|
+
prompts.each do |key, content|
|
|
186
|
+
rel_path = PROMPT_FILES[key]
|
|
187
|
+
full_path = File.join(@project_root, rel_path)
|
|
188
|
+
File.write(full_path, content)
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def git_clean?
|
|
193
|
+
PROMPT_FILES.values.all? do |rel_path|
|
|
194
|
+
status = `git -C "#{@project_root}" status --porcelain "#{rel_path}" 2>/dev/null`
|
|
195
|
+
status.strip.empty?
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def run_all_tasks(variant)
|
|
200
|
+
results = {}
|
|
201
|
+
TASKS.each_with_index do |task, idx|
|
|
202
|
+
puts
|
|
203
|
+
puts "[#{idx + 1}/#{TASKS.length}] #{task[:name]}: #{task[:description]}"
|
|
204
|
+
results[task[:name]] = run_task(task, variant)
|
|
205
|
+
end
|
|
206
|
+
results
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
def run_task(task, variant)
|
|
210
|
+
tmp_dir = File.join(Dir.tmpdir, "clacky_benchmark_#{variant}_#{task[:name]}_#{Process.pid}_#{Time.now.to_i}")
|
|
211
|
+
FileUtils.cp_r(FIXTURE_DIR, tmp_dir)
|
|
212
|
+
|
|
213
|
+
# Ensure tmp_dir is a git repo (cp_r preserves .git)
|
|
214
|
+
Dir.chdir(tmp_dir) do
|
|
215
|
+
system("git config user.email 'benchmark@test.com' >/dev/null 2>&1")
|
|
216
|
+
system("git config user.name 'Benchmark' >/dev/null 2>&1")
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
config = agent_config.dup
|
|
220
|
+
config.permission_mode = :auto_approve
|
|
221
|
+
|
|
222
|
+
client = Clacky::Client.new(
|
|
223
|
+
config.api_key,
|
|
224
|
+
base_url: config.base_url,
|
|
225
|
+
model: config.model_name,
|
|
226
|
+
anthropic_format: config.anthropic_format?
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
agent = Clacky::Agent.new(
|
|
230
|
+
client, config,
|
|
231
|
+
working_dir: tmp_dir,
|
|
232
|
+
ui: BenchmarkUI.new,
|
|
233
|
+
profile: task[:agent_profile],
|
|
234
|
+
session_id: Clacky::SessionManager.generate_id,
|
|
235
|
+
source: :manual
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
start_time = Time.now
|
|
239
|
+
agent.run(task[:prompt])
|
|
240
|
+
duration = Time.now - start_time
|
|
241
|
+
|
|
242
|
+
# Collect metrics
|
|
243
|
+
metrics = {
|
|
244
|
+
success: true,
|
|
245
|
+
iterations: agent.iterations,
|
|
246
|
+
total_cost: agent.total_cost.round(6),
|
|
247
|
+
cost_source: agent.cost_source.to_s,
|
|
248
|
+
duration_seconds: duration.round(2),
|
|
249
|
+
cache_creation_input_tokens: agent.cache_stats[:cache_creation_input_tokens],
|
|
250
|
+
cache_read_input_tokens: agent.cache_stats[:cache_read_input_tokens],
|
|
251
|
+
total_requests: agent.cache_stats[:total_requests],
|
|
252
|
+
cache_hit_requests: agent.cache_stats[:cache_hit_requests]
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
# Collect file changes
|
|
256
|
+
metrics[:file_changes] = collect_file_changes(tmp_dir)
|
|
257
|
+
|
|
258
|
+
# Collect assistant output for qualitative analysis
|
|
259
|
+
metrics[:assistant_messages] = agent.history.to_a
|
|
260
|
+
.select { |m| m[:role] == "assistant" }
|
|
261
|
+
.map { |m| extract_text(m[:content]) }
|
|
262
|
+
.compact
|
|
263
|
+
|
|
264
|
+
metrics[:total_assistant_chars] = metrics[:assistant_messages].join.length
|
|
265
|
+
|
|
266
|
+
# Cleanup
|
|
267
|
+
FileUtils.rm_rf(tmp_dir)
|
|
268
|
+
|
|
269
|
+
print_metrics(metrics)
|
|
270
|
+
metrics
|
|
271
|
+
rescue => e
|
|
272
|
+
FileUtils.rm_rf(tmp_dir) if defined?(tmp_dir) && tmp_dir
|
|
273
|
+
error_result = {
|
|
274
|
+
success: false,
|
|
275
|
+
error: e.message,
|
|
276
|
+
error_class: e.class.name,
|
|
277
|
+
iterations: defined?(agent) ? agent&.iterations : 0,
|
|
278
|
+
total_cost: defined?(agent) ? agent&.total_cost&.round(6) : 0
|
|
279
|
+
}
|
|
280
|
+
puts " ERROR: #{e.message}"
|
|
281
|
+
error_result
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
def collect_file_changes(dir)
|
|
285
|
+
changes = {}
|
|
286
|
+
Dir.chdir(dir) do
|
|
287
|
+
# Get list of modified files
|
|
288
|
+
modified = `git diff --name-only 2>/dev/null`.strip.split("\n").reject(&:empty?)
|
|
289
|
+
modified.each do |f|
|
|
290
|
+
next unless File.exist?(f)
|
|
291
|
+
changes[f] = File.read(f)
|
|
292
|
+
end
|
|
293
|
+
end
|
|
294
|
+
changes
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
def extract_text(content)
|
|
298
|
+
case content
|
|
299
|
+
when String then content
|
|
300
|
+
when Array
|
|
301
|
+
text_parts = content.select { |p| p.is_a?(Hash) && p[:type] == "text" }
|
|
302
|
+
text_parts.map { |p| p[:text] }.join(" ")
|
|
303
|
+
else
|
|
304
|
+
nil
|
|
305
|
+
end
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
def print_metrics(metrics)
|
|
309
|
+
if metrics[:success]
|
|
310
|
+
puts " Iterations: #{metrics[:iterations]} | Cost: $#{metrics[:total_cost]} | Duration: #{metrics[:duration_seconds]}s"
|
|
311
|
+
puts " Cache: write=#{metrics[:cache_creation_input_tokens]} read=#{metrics[:cache_read_input_tokens]}"
|
|
312
|
+
puts " Assistant chars: #{metrics[:total_assistant_chars]}"
|
|
313
|
+
puts " Files changed: #{metrics[:file_changes]&.keys&.join(', ') || 'none'}"
|
|
314
|
+
else
|
|
315
|
+
puts " FAILED: #{metrics[:error]}"
|
|
316
|
+
end
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
def write_results(name, results)
|
|
320
|
+
path = File.join(RESULTS_DIR, "#{name}_#{timestamp}.json")
|
|
321
|
+
File.write(path, JSON.pretty_generate(results))
|
|
322
|
+
puts "\n#{name.capitalize} results saved to: #{path}"
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
def timestamp
|
|
326
|
+
@timestamp ||= Time.now.strftime("%Y%m%d_%H%M%S")
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
def compare_and_print(baseline, treatment)
|
|
330
|
+
puts
|
|
331
|
+
printf "%-20s %12s %12s %12s\n", "Task", "Baseline", "Treatment", "Delta"
|
|
332
|
+
puts "-" * 60
|
|
333
|
+
|
|
334
|
+
TASKS.each do |task|
|
|
335
|
+
task_key = task[:name].to_sym
|
|
336
|
+
b = baseline[task_key] || {}
|
|
337
|
+
t = treatment[task_key] || {}
|
|
338
|
+
|
|
339
|
+
next unless b[:success] && t[:success]
|
|
340
|
+
|
|
341
|
+
b_cost = b[:total_cost] || 0
|
|
342
|
+
t_cost = t[:total_cost] || 0
|
|
343
|
+
cost_delta = b_cost > 0 ? "#{(t_cost / b_cost * 100).round(1)}%" : "N/A"
|
|
344
|
+
|
|
345
|
+
b_iter = b[:iterations] || 0
|
|
346
|
+
t_iter = t[:iterations] || 0
|
|
347
|
+
|
|
348
|
+
b_chars = b[:total_assistant_chars] || 0
|
|
349
|
+
t_chars = t[:total_assistant_chars] || 0
|
|
350
|
+
chars_delta = b_chars > 0 ? "#{(t_chars / b_chars.to_f * 100).round(1)}%" : "N/A"
|
|
351
|
+
|
|
352
|
+
printf "%-20s\n", task[:name]
|
|
353
|
+
printf " Cost: $%-10.6f $%-10.6f %s\n", b_cost, t_cost, cost_delta
|
|
354
|
+
printf " Iterations: %-11d %-11d %s\n", b_iter, t_iter, "#{t_iter - b_iter > 0 ? '+' : ''}#{t_iter - b_iter}"
|
|
355
|
+
printf " Assistant chars: %-11d %-11d %s\n", b_chars, t_chars, chars_delta
|
|
356
|
+
puts
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
# Totals
|
|
360
|
+
b_total_cost = 0
|
|
361
|
+
t_total_cost = 0
|
|
362
|
+
b_total_iter = 0
|
|
363
|
+
t_total_iter = 0
|
|
364
|
+
b_total_chars = 0
|
|
365
|
+
t_total_chars = 0
|
|
366
|
+
|
|
367
|
+
TASKS.each do |task|
|
|
368
|
+
task_key = task[:name].to_sym
|
|
369
|
+
b = baseline[task_key] || {}
|
|
370
|
+
t = treatment[task_key] || {}
|
|
371
|
+
next unless b[:success] && t[:success]
|
|
372
|
+
|
|
373
|
+
b_total_cost += b[:total_cost] || 0
|
|
374
|
+
t_total_cost += t[:total_cost] || 0
|
|
375
|
+
b_total_iter += b[:iterations] || 0
|
|
376
|
+
t_total_iter += t[:iterations] || 0
|
|
377
|
+
b_total_chars += b[:total_assistant_chars] || 0
|
|
378
|
+
t_total_chars += t[:total_assistant_chars] || 0
|
|
379
|
+
end
|
|
380
|
+
|
|
381
|
+
puts "-" * 60
|
|
382
|
+
printf "%-20s\n", "TOTALS"
|
|
383
|
+
cost_pct = b_total_cost > 0 ? (t_total_cost / b_total_cost * 100).round(1) : 0
|
|
384
|
+
printf " Total cost: $%-10.6f $%-10.6f %s%%\n", b_total_cost, t_total_cost, cost_pct
|
|
385
|
+
printf " Total iterations: %-11d %-11d %+d\n", b_total_iter, t_total_iter, t_total_iter - b_total_iter
|
|
386
|
+
chars_pct = b_total_chars > 0 ? (t_total_chars / b_total_chars.to_f * 100).round(1) : 0
|
|
387
|
+
printf " Total chars: %-11d %-11d %s%%\n", b_total_chars, t_total_chars, chars_pct
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
# Minimal UI that captures output without displaying
|
|
391
|
+
class BenchmarkUI
|
|
392
|
+
def log(msg, level: :info); end
|
|
393
|
+
def show_assistant_message(content, files: []); end
|
|
394
|
+
def show_tool_call(name, args); end
|
|
395
|
+
def show_tool_result(result); end
|
|
396
|
+
def show_tool_stdout(lines); end
|
|
397
|
+
def show_tool_error(error); end
|
|
398
|
+
def show_tool_args(formatted_args); end
|
|
399
|
+
def show_file_write_preview(path, is_new_file:); end
|
|
400
|
+
def show_file_edit_preview(path); end
|
|
401
|
+
def show_file_error(error_message); end
|
|
402
|
+
def show_shell_preview(command); end
|
|
403
|
+
def show_diff(old_content, new_content, max_lines: 50); end
|
|
404
|
+
def show_token_usage(token_data); end
|
|
405
|
+
def show_complete(iterations:, cost:, duration: nil, cache_stats: nil, awaiting_user_feedback: false, cost_source: nil); end
|
|
406
|
+
def append_output(content); end
|
|
407
|
+
def show_info(message, prefix_newline: true); end
|
|
408
|
+
def show_warning(message); end
|
|
409
|
+
def show_error(message); end
|
|
410
|
+
def show_success(message); end
|
|
411
|
+
def show_progress(message = nil, prefix_newline: true, progress_type: "thinking", phase: "active", metadata: {}); end
|
|
412
|
+
def start_progress(message: nil, style: :primary, quiet_on_fast_finish: false); end
|
|
413
|
+
def with_progress(message: nil, style: :primary, quiet_on_fast_finish: false)
|
|
414
|
+
yield if block_given?
|
|
415
|
+
end
|
|
416
|
+
def update_sessionbar(tasks: nil, cost: nil, cost_source: nil, status: nil, latency: nil); end
|
|
417
|
+
def update_todos(todos); end
|
|
418
|
+
def set_working_status; end
|
|
419
|
+
def set_idle_status; end
|
|
420
|
+
def request_confirmation(message, default: true); end
|
|
421
|
+
def clear_input; end
|
|
422
|
+
def set_input_tips(message, type: :info); end
|
|
423
|
+
def stop; end
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
if __FILE__ == $0
|
|
428
|
+
variant = ARGV[0]&.downcase
|
|
429
|
+
runner = BenchmarkRunner.new
|
|
430
|
+
|
|
431
|
+
case variant
|
|
432
|
+
when "baseline"
|
|
433
|
+
runner.run_baseline
|
|
434
|
+
when "treatment"
|
|
435
|
+
runner.run_treatment
|
|
436
|
+
when "report"
|
|
437
|
+
runner.run_report
|
|
438
|
+
else
|
|
439
|
+
runner.run
|
|
440
|
+
end
|
|
441
|
+
end
|