caik-cli 0.1.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -7
- package/dist/api-6OX4ICXN.js +9 -0
- package/dist/auto-improve-skills-2COKTU5C.js +8 -0
- package/dist/autoresearch-Y7WW6L4O.js +24 -0
- package/dist/chunk-2YHUDOJL.js +54 -0
- package/dist/chunk-3TXNZINH.js +775 -0
- package/dist/chunk-5MHNQAV4.js +317 -0
- package/dist/chunk-7AIZTHHZ.js +152 -0
- package/dist/chunk-D4IM3YRX.js +166 -0
- package/dist/chunk-DJJHS7KK.js +62 -0
- package/dist/chunk-DKZBQRR3.js +91 -0
- package/dist/chunk-FLSHJZLC.js +613 -0
- package/dist/chunk-H2ZKCXMJ.js +202 -0
- package/dist/chunk-ILMOSMD3.js +83 -0
- package/dist/chunk-KYTHKH6V.js +79 -0
- package/dist/chunk-LTKHLRM4.js +272 -0
- package/dist/chunk-T32AEP3O.js +146 -0
- package/dist/chunk-T73Z5UMA.js +14437 -0
- package/dist/chunk-TFKT7V7H.js +1545 -0
- package/dist/chunk-US4CYDNS.js +524 -0
- package/dist/chunk-ZLRN7Q7C.js +27 -0
- package/dist/claude-code-6DF4YARB.js +8 -0
- package/dist/config-CS7734SA.js +24 -0
- package/dist/correction-classifier-TLPKRNLI.js +93 -0
- package/dist/cursor-Z4XXDCAM.js +8 -0
- package/dist/daemon/autoresearch-2MAEM2YI.js +272 -0
- package/dist/daemon/chunk-545XA5CB.js +77 -0
- package/dist/daemon/chunk-HEYFAUHL.js +90 -0
- package/dist/daemon/chunk-MLKGABMK.js +9 -0
- package/dist/daemon/chunk-NJICGNCK.js +150 -0
- package/dist/daemon/chunk-OD5NUFH2.js +181 -0
- package/dist/daemon/chunk-SM2FSXIP.js +60 -0
- package/dist/daemon/chunk-UMDJFPN6.js +163 -0
- package/dist/daemon/config-F7HE3JRY.js +23 -0
- package/dist/daemon/db-QEXVVTAL.js +15 -0
- package/dist/daemon/eval-generator-OR2FAYLB.js +316 -0
- package/dist/daemon/improver-TGEK6MPE.js +186 -0
- package/dist/daemon/llm-FUJ2TBYT.js +11 -0
- package/dist/daemon/nudge-detector-NFRHWZY6.js +140 -0
- package/dist/daemon/platform-7N3LQDIB.js +16381 -0
- package/dist/daemon/registry-FI4GTO3H.js +20 -0
- package/dist/daemon/server.js +356 -0
- package/dist/daemon/trace-store-T7XFGQSX.js +19 -0
- package/dist/daemon-UXYMG46V.js +85 -0
- package/dist/db-TLNRIXLK.js +18 -0
- package/dist/eval-generator-GGMRPO3K.js +21 -0
- package/dist/eval-runner-EF4K6T5Y.js +15 -0
- package/dist/index.js +8033 -568
- package/dist/llm-3UUZX6PX.js +12 -0
- package/dist/platform-52NREMBS.js +33 -0
- package/dist/repo-installer-K6ADOW3E.js +25 -0
- package/dist/setup-P744STZE.js +16 -0
- package/dist/test-loop-Y7QQE55P.js +127 -0
- package/dist/trace-store-FVLMNNDK.js +20 -0
- package/package.json +9 -3
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
getTraceCount,
|
|
4
|
+
getTraces
|
|
5
|
+
} from "./chunk-7AIZTHHZ.js";
|
|
6
|
+
import {
|
|
7
|
+
runSingleCase
|
|
8
|
+
} from "./chunk-D4IM3YRX.js";
|
|
9
|
+
import {
|
|
10
|
+
callAnthropic,
|
|
11
|
+
parseLLMJson
|
|
12
|
+
} from "./chunk-DJJHS7KK.js";
|
|
13
|
+
|
|
14
|
+
// src/daemon/eval-generator.ts
|
|
15
|
+
import { randomUUID } from "crypto";
|
|
16
|
+
/**
 * Hydrate a persisted `eval_cases` row into an in-memory eval case.
 *
 * The row stores the assertion payload as a JSON string in
 * `assertion_value`; the decoded payload's shape depends on
 * `assertion_type`. Unrecognized types degrade to a sentinel
 * `must_contain: "UNKNOWN"` assertion rather than throwing.
 *
 * @param {object} row - Raw row from the eval_cases table.
 * @returns {object} Eval case with a typed `assertion` object.
 */
function rowToEvalCase(row) {
  const payload = JSON.parse(row.assertion_value);
  // One builder per supported assertion type; each picks only the fields
  // that type uses out of the decoded payload.
  const builders = {
    must_contain: (v) => ({ type: "must_contain", pattern: v.pattern }),
    must_not_contain: (v) => ({ type: "must_not_contain", pattern: v.pattern }),
    max_output_length: (v) => ({ type: "max_output_length", tokens: v.tokens }),
    llm_judge: (v) => ({ type: "llm_judge", prompt: v.prompt }),
    code_check: (v) => ({ type: "code_check", fn: v.fn }),
    // Behavioral types additionally carry the scenario used to simulate the skill.
    behavioral_must_contain: (v) => ({ type: "behavioral_must_contain", scenario: v.scenario, pattern: v.pattern }),
    behavioral_must_not_contain: (v) => ({ type: "behavioral_must_not_contain", scenario: v.scenario, pattern: v.pattern }),
    behavioral_max_length: (v) => ({ type: "behavioral_max_length", scenario: v.scenario, tokens: v.tokens }),
    behavioral_judge: (v) => ({ type: "behavioral_judge", scenario: v.scenario, criteria: v.criteria }),
    behavioral_code_check: (v) => ({ type: "behavioral_code_check", scenario: v.scenario, fn: v.fn }),
  };
  const build = builders[row.assertion_type];
  // Fallback sentinel preserves the original default branch's behavior.
  const assertion = build ? build(payload) : { type: "must_contain", pattern: "UNKNOWN" };
  return {
    id: row.id,
    suiteSlug: row.suite_slug,
    traceId: row.trace_id ?? void 0,
    source: row.source,
    category: row.category,
    assertion,
    description: row.description
  };
}
|
|
64
|
+
/**
 * Load the eval suite (and all of its cases) for a skill slug.
 *
 * @param db - better-sqlite3 style database handle (prepare().get()/all()).
 * @param {string} slug - Skill identifier.
 * @returns The suite with hydrated cases, or null when no suite exists yet.
 */
function getEvalSuite(db, slug) {
  const suiteRow = db.prepare("SELECT * FROM eval_suites WHERE slug = ?").get(slug);
  if (!suiteRow) return null;
  const caseRows = db.prepare("SELECT * FROM eval_cases WHERE suite_slug = ? ORDER BY created_at ASC").all(slug);
  return {
    slug: suiteRow.slug,
    version: suiteRow.version,
    cases: caseRows.map(rowToEvalCase),
    // Validation metrics are optional; normalize SQL NULL to undefined.
    tpr: suiteRow.tpr ?? void 0,
    tnr: suiteRow.tnr ?? void 0,
    validationSampleSize: suiteRow.validation_sample_size ?? void 0,
    validatedAt: suiteRow.validated_at ?? void 0
  };
}
/**
 * Decide whether eval generation should run for a slug: requires at least
 * 3 correction traces, and either no suite yet or at least one correction
 * trace that has not been turned into a case.
 *
 * @returns {boolean} true when generation would have new traces to process.
 */
function shouldGenerateEvals(db, slug) {
  const { corrections } = getTraceCount(db, slug);
  if (corrections < 3) return false; // not enough signal yet
  const suite = getEvalSuite(db, slug);
  if (!suite) return true; // suite missing entirely: generate one
  const seenTraceIds = new Set(
    suite.cases.filter((c) => c.traceId).map((c) => c.traceId)
  );
  // Generate only if some correction trace is not yet covered by a case.
  const allCorrections = getTraces(db, slug, { kind: "correction" });
  return allCorrections.some((t) => !seenTraceIds.has(t.id));
}
|
|
90
|
+
// System prompt for generating STRUCTURAL eval assertions: these are
// checked directly against the artifact's text, so deterministic
// pattern/code checks are preferred over LLM judges.
var EVAL_GEN_SYSTEM_STRUCTURAL = `You generate eval test cases for Claude Code artifacts (skills, prompts, rules, subagents, commands).
Given a correction trace \u2014 what the user asked, what the skill produced, how the user corrected it \u2014
generate ONE eval assertion that would catch this failure pattern.

Prefer deterministic assertions over LLM judges:
- "must_contain" \u2014 regex pattern that MUST be present in the skill. assertion_value: {"pattern": "regex_here"}
- "must_not_contain" \u2014 regex pattern that must NOT be present. assertion_value: {"pattern": "regex_here"}
- "code_check" \u2014 JavaScript function body receiving 'content' argument. assertion_value: {"fn": "return content.includes('...')"}
- "llm_judge" \u2014 ONLY when pattern-based is impossible. assertion_value: {"prompt": "judge prompt here"}

Return ONLY a JSON object:
{"assertion_type": "...", "assertion_value": {...}, "category": "...", "description": "One line describing the failure"}`;
// System prompt for generating BEHAVIORAL eval assertions: the artifact is
// used as an LLM system prompt, a test scenario is sent as the user
// message, and the simulated output is what gets checked.
var EVAL_GEN_SYSTEM_BEHAVIORAL = `You generate BEHAVIORAL eval test cases for Claude Code artifacts (skills, prompts, rules, subagents, commands).
A behavioral eval works by: (1) using the artifact content as a system prompt for an LLM, (2) sending a test scenario as the user message, (3) checking the simulated output.

Given a correction trace, generate ONE behavioral eval assertion with a realistic test scenario.

Available types:
- "behavioral_must_contain" \u2014 simulated output MUST match regex. assertion_value: {"scenario": "user request that triggers the skill", "pattern": "regex_for_expected_output"}
- "behavioral_must_not_contain" \u2014 simulated output must NOT match. assertion_value: {"scenario": "...", "pattern": "regex_for_unwanted_output"}
- "behavioral_max_length" \u2014 output must be under N tokens. assertion_value: {"scenario": "...", "tokens": 200}
- "behavioral_code_check" \u2014 JS function on output string. assertion_value: {"scenario": "...", "fn": "return content.split('\\n').length < 20"}
- "behavioral_judge" \u2014 ONLY when pattern-based is impossible. assertion_value: {"scenario": "...", "criteria": "what good output looks like"}

The scenario should be a realistic user request that would expose the weakness found in the correction.
Prefer behavioral_must_contain and behavioral_must_not_contain over behavioral_judge.

Return ONLY a JSON object:
{"assertion_type": "behavioral_...", "assertion_value": {...}, "category": "...", "description": "One line describing what the output should/shouldn't do"}`;
|
|
119
|
+
/**
 * Classify a correction type: these three kinds are validated behaviorally
 * (by simulating the artifact) rather than structurally.
 * A missing/undefined type is never behavioral.
 */
function isBehavioralCorrection(correctionType) {
  const behavioralTypes = ["scope_adjusted", "format_changed", "output_modified"];
  return behavioralTypes.includes(correctionType ?? "");
}
|
|
122
|
+
/**
 * Turn one correction trace into a candidate eval case via a single LLM call.
 *
 * Behavioral correction types get a behavioral (simulate-and-check)
 * assertion; everything else gets a structural one. Returns null when the
 * trace carries no correction signal, when the LLM reply is missing
 * required fields or uses an unknown assertion type, or when the API call
 * fails (best-effort: errors are swallowed).
 *
 * @param {object} trace - Correction trace (slug, correctionType, toolName, ...).
 * @param {string} apiKey - Anthropic API key.
 * @param {string} [model] - Model used for generation.
 * @returns {Promise<{evalCase: object, llmCalls: number} | null>}
 */
async function generateEvalCase(trace, apiKey, model = "claude-haiku-4-5-20251001") {
  // No correction signal at all -> nothing to learn from.
  if (!trace.correctionType && !trace.correctionPrompt) return null;
  const useBehavioral = isBehavioralCorrection(trace.correctionType);
  const userMessage = `Correction trace:
- Skill slug: ${trace.slug}
- Correction type: ${trace.correctionType ?? "unknown"}
- User's correction message: ${trace.correctionPrompt ?? "(no message captured)"}
- Tool that was used: ${trace.toolName}
- Tool input: ${JSON.stringify(trace.toolInput, null, 2).slice(0, 2e3)}
- Tool response: ${JSON.stringify(trace.toolResponse, null, 2).slice(0, 2e3)}

Generate an eval assertion that would detect this failure pattern.`;
  // Coercion helpers mirror the defaults the original per-type handling applied.
  const asStr = (v) => String(v ?? "");
  const asTokens = (v) => Number(v ?? 500);
  const builders = {
    must_contain: (v) => ({ type: "must_contain", pattern: asStr(v.pattern) }),
    must_not_contain: (v) => ({ type: "must_not_contain", pattern: asStr(v.pattern) }),
    code_check: (v) => ({ type: "code_check", fn: asStr(v.fn) }),
    llm_judge: (v) => ({ type: "llm_judge", prompt: asStr(v.prompt) }),
    max_output_length: (v) => ({ type: "max_output_length", tokens: asTokens(v.tokens) }),
    behavioral_must_contain: (v) => ({ type: "behavioral_must_contain", scenario: asStr(v.scenario), pattern: asStr(v.pattern) }),
    behavioral_must_not_contain: (v) => ({ type: "behavioral_must_not_contain", scenario: asStr(v.scenario), pattern: asStr(v.pattern) }),
    behavioral_max_length: (v) => ({ type: "behavioral_max_length", scenario: asStr(v.scenario), tokens: asTokens(v.tokens) }),
    behavioral_judge: (v) => ({ type: "behavioral_judge", scenario: asStr(v.scenario), criteria: asStr(v.criteria) }),
    behavioral_code_check: (v) => ({ type: "behavioral_code_check", scenario: asStr(v.scenario), fn: asStr(v.fn) })
  };
  try {
    const result = await callAnthropic(apiKey, {
      model,
      system: useBehavioral ? EVAL_GEN_SYSTEM_BEHAVIORAL : EVAL_GEN_SYSTEM_STRUCTURAL,
      userMessage,
      maxTokens: 512
    });
    const parsed = parseLLMJson(result.text);
    // All four fields are required for a usable case.
    if (!parsed.assertion_type || !parsed.assertion_value || !parsed.category || !parsed.description) {
      return null;
    }
    const build = builders[parsed.assertion_type];
    if (!build) return null; // unknown assertion type from the model
    return {
      evalCase: {
        traceId: trace.id,
        source: "trace",
        category: parsed.category,
        assertion: build(parsed.assertion_value),
        description: parsed.description
      },
      llmCalls: 1
    };
  } catch {
    // Best-effort: a failed generation just yields no case.
    return null;
  }
}
|
|
195
|
+
/**
 * Serialize an assertion back to the JSON string stored in
 * eval_cases.assertion_value. Key order matches the original per-type
 * handling so stored payloads stay byte-identical.
 * Returns undefined for unknown assertion types.
 */
function serializeAssertionValue(assertion) {
  // Which payload fields each assertion type persists, in storage order.
  const FIELD_ORDER = {
    must_contain: ["pattern"],
    must_not_contain: ["pattern"],
    code_check: ["fn"],
    llm_judge: ["prompt"],
    max_output_length: ["tokens"],
    behavioral_must_contain: ["scenario", "pattern"],
    behavioral_must_not_contain: ["scenario", "pattern"],
    behavioral_code_check: ["scenario", "fn"],
    behavioral_judge: ["scenario", "criteria"],
    behavioral_max_length: ["scenario", "tokens"]
  };
  const keys = FIELD_ORDER[assertion.type];
  if (!keys) return void 0;
  return JSON.stringify(Object.fromEntries(keys.map((k) => [k, assertion[k]])));
}
|
|
217
|
+
// Cap on both stored eval cases per suite and LLM calls per generation run.
var MAX_EVAL_CASES = 20;
/**
 * Grow (or create) the eval suite for a slug from unprocessed correction
 * traces. One LLM call per trace, at most MAX_EVAL_CASES calls per run;
 * each new failure category yields at most one stored case. Oldest cases
 * are evicted once the suite exceeds MAX_EVAL_CASES. The suite version is
 * bumped on every run.
 *
 * @returns {Promise<{suite: object, llmCalls: number}>}
 */
async function generateEvalSuite(db, slug, apiKey, model = "claude-haiku-4-5-20251001") {
  const now = (/* @__PURE__ */ new Date()).toISOString();
  let llmCallsUsed = 0;
  let suite = getEvalSuite(db, slug);
  if (!suite) {
    // First run for this slug: create an empty version-1 suite row.
    db.prepare(
      "INSERT INTO eval_suites (slug, version, created_at, updated_at) VALUES (?, 1, ?, ?)"
    ).run(slug, now, now);
    suite = { slug, version: 1, cases: [] };
  }
  const seenTraceIds = new Set(
    suite.cases.filter((c) => c.traceId).map((c) => c.traceId)
  );
  const pendingTraces = getTraces(db, slug, { kind: "correction" }).filter(
    (t) => !seenTraceIds.has(t.id)
  );
  const coveredCategories = new Set(suite.cases.map((c) => c.category));
  for (const trace of pendingTraces) {
    // Stop once the suite is full or the per-run LLM budget is spent.
    if (suite.cases.length >= MAX_EVAL_CASES || llmCallsUsed >= MAX_EVAL_CASES) break;
    const generated = await generateEvalCase(trace, apiKey, model);
    if (!generated) continue;
    llmCallsUsed += generated.llmCalls;
    // Keep at most one case per failure category.
    if (coveredCategories.has(generated.evalCase.category)) continue;
    coveredCategories.add(generated.evalCase.category);
    const caseId = randomUUID();
    const assertionValue = serializeAssertionValue(generated.evalCase.assertion);
    db.prepare(
      `INSERT INTO eval_cases (id, suite_slug, trace_id, source, category, assertion_type, assertion_value, description, created_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    ).run(caseId, slug, trace.id, "trace", generated.evalCase.category, generated.evalCase.assertion.type, assertionValue, generated.evalCase.description, now);
    suite.cases.push({
      id: caseId,
      suiteSlug: slug,
      traceId: trace.id,
      source: "trace",
      category: generated.evalCase.category,
      assertion: generated.evalCase.assertion,
      description: generated.evalCase.description
    });
  }
  // Evict the oldest cases if the suite has grown past the cap.
  if (suite.cases.length > MAX_EVAL_CASES) {
    const overflow = suite.cases.length - MAX_EVAL_CASES;
    for (const stale of suite.cases.slice(0, overflow)) {
      db.prepare("DELETE FROM eval_cases WHERE id = ?").run(stale.id);
    }
    suite.cases = suite.cases.slice(overflow);
  }
  db.prepare("UPDATE eval_suites SET version = version + 1, updated_at = ? WHERE slug = ?").run(now, slug);
  suite.version++;
  return { suite, llmCalls: llmCallsUsed };
}
|
|
269
|
+
/**
 * Estimate how well the suite discriminates good from bad content.
 *
 * TPR: fraction of historical correction traces whose (known-bad) tool
 * response fails at least one case. TNR: 1 if the current skill content
 * passes every case, else 0 — a single-sample check, only run when
 * skillContent is provided. Results are persisted on the suite row.
 *
 * @returns {Promise<{tpr?: number, tnr?: number, sampleSize: number}>}
 */
async function validateJudges(db, slug, apiKey, skillContent, simulationModel) {
  const suite = getEvalSuite(db, slug);
  if (!suite || suite.cases.length === 0) {
    return { tpr: void 0, tnr: void 0, sampleSize: 0 };
  }
  const corrections = getTraces(db, slug, { kind: "correction" });
  // One shared simulation cache for the whole validation run.
  const simulationOptions = simulationModel ? { model: simulationModel, cache: /* @__PURE__ */ new Map() } : void 0;
  // True when at least one case flags the given content (short-circuits on
  // the first failure, like the original break).
  const anyCaseFails = async (content) => {
    for (const evalCase of suite.cases) {
      const outcome = await runSingleCase(content, evalCase, apiKey, simulationOptions);
      if (!outcome.passed) return true;
    }
    return false;
  };
  let truePositives = 0;
  for (const trace of corrections) {
    // A correction trace's tool response is treated as known-bad content.
    if (await anyCaseFails(JSON.stringify(trace.toolResponse))) truePositives++;
  }
  let tnr;
  if (skillContent) {
    tnr = (await anyCaseFails(skillContent)) ? 0 : 1;
  }
  const tpr = corrections.length > 0 ? truePositives / corrections.length : void 0;
  const sampleSize = corrections.length + (skillContent ? 1 : 0);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  db.prepare(
    "UPDATE eval_suites SET tpr = ?, tnr = ?, validation_sample_size = ?, validated_at = ? WHERE slug = ?"
  ).run(tpr ?? null, tnr ?? null, sampleSize, now, slug);
  return { tpr, tnr, sampleSize };
}
|
|
309
|
+
|
|
310
|
+
export {
|
|
311
|
+
getEvalSuite,
|
|
312
|
+
shouldGenerateEvals,
|
|
313
|
+
isBehavioralCorrection,
|
|
314
|
+
generateEvalCase,
|
|
315
|
+
generateEvalSuite,
|
|
316
|
+
validateJudges
|
|
317
|
+
};
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/daemon/trace-store.ts
|
|
4
|
+
import { randomUUID } from "crypto";
|
|
5
|
+
/**
 * Append a tool-call event to the per-session buffer table.
 * Tool input/response are stored as JSON text; success as 0/1.
 */
function bufferToolCall(db, sessionId, toolName, toolInput, toolResponse, success, slug) {
  const stmt = db.prepare(
    `INSERT INTO session_buffer (session_id, type, timestamp, tool_name, tool_input, tool_response, success, slug)
     VALUES (?, 'tool_call', ?, ?, ?, ?, ?, ?)`
  );
  stmt.run(
    sessionId,
    (/* @__PURE__ */ new Date()).toISOString(),
    toolName,
    JSON.stringify(toolInput),
    JSON.stringify(toolResponse),
    success ? 1 : 0, // SQLite stores booleans as integers
    slug
  );
}
/**
 * Append a user-prompt event; a prompt carrying a correctionType is
 * recorded as a 'correction' row, otherwise as a plain 'prompt' row.
 */
function bufferPrompt(db, sessionId, prompt, correctionType, slug) {
  const rowType = correctionType ? "correction" : "prompt";
  db.prepare(
    `INSERT INTO session_buffer (session_id, type, timestamp, prompt, correction_type, slug)
     VALUES (?, ?, ?, ?, ?, ?)`
  ).run(sessionId, rowType, (/* @__PURE__ */ new Date()).toISOString(), prompt, correctionType ?? null, slug ?? null);
}
|
|
26
|
+
/**
 * Reconstruct traces from a session's buffered events.
 *
 * Every 'correction' row becomes a correction trace, paired with the most
 * recent tool call seen for the same slug (or placeholder fields when none
 * exists). Successful tool calls are additionally sampled at ~20% into
 * 'success' traces, so positive examples are kept without recording every
 * call.
 */
function buildTraces(db, sessionId) {
  const rows = db.prepare("SELECT * FROM session_buffer WHERE session_id = ? ORDER BY id ASC").all(sessionId);
  const traces = [];
  const latestCallBySlug = /* @__PURE__ */ new Map();
  for (const row of rows) {
    if (row.type === "tool_call" && row.slug) {
      latestCallBySlug.set(row.slug, row);
    }
    if (row.type !== "correction" || !row.correction_type) continue;
    const slug = row.slug ?? "unknown";
    const pairedCall = latestCallBySlug.get(slug);
    traces.push({
      id: randomUUID(),
      sessionId,
      slug,
      timestamp: row.timestamp,
      kind: "correction",
      toolName: pairedCall?.tool_name ?? "unknown",
      toolInput: pairedCall?.tool_input ? JSON.parse(pairedCall.tool_input) : {},
      toolResponse: pairedCall?.tool_response ? JSON.parse(pairedCall.tool_response) : {},
      correctionType: row.correction_type,
      correctionPrompt: row.prompt ?? void 0
    });
  }
  // Sample ~1 in 5 successful calls as positive examples.
  for (const row of rows) {
    if (row.type !== "tool_call" || row.success !== 1 || !row.slug) continue;
    if (Math.random() >= 0.2) continue;
    traces.push({
      id: randomUUID(),
      sessionId,
      slug: row.slug,
      timestamp: row.timestamp,
      kind: "success",
      toolName: row.tool_name ?? "unknown",
      toolInput: row.tool_input ? JSON.parse(row.tool_input) : {},
      toolResponse: row.tool_response ? JSON.parse(row.tool_response) : {}
    });
  }
  return traces;
}
|
|
70
|
+
/**
 * Bulk-insert traces into the durable `traces` table inside a single
 * transaction, serializing tool input/response to JSON text and storing
 * optional fields as NULL.
 */
function persistTraces(db, traces) {
  const insert = db.prepare(
    `INSERT INTO traces (id, session_id, slug, timestamp, kind, tool_name, tool_input, tool_response,
      correction_type, correction_prompt, skill_content_hash, created_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  const writeAll = db.transaction((items) => {
    for (const t of items) {
      insert.run(
        t.id,
        t.sessionId,
        t.slug,
        t.timestamp,
        t.kind,
        t.toolName,
        JSON.stringify(t.toolInput),
        JSON.stringify(t.toolResponse),
        t.correctionType ?? null,
        t.correctionPrompt ?? null,
        t.skillContentHash ?? null,
        (/* @__PURE__ */ new Date()).toISOString()
      );
    }
  });
  writeAll(traces);
}
|
|
96
|
+
/**
 * Map a `traces` table row to its in-memory shape: camelCase keys, JSON
 * columns decoded (empty object when NULL), SQL NULLs normalized to
 * undefined.
 */
function rowToTrace(row) {
  return {
    id: row.id,
    sessionId: row.session_id,
    slug: row.slug,
    timestamp: row.timestamp,
    kind: row.kind,
    toolName: row.tool_name,
    toolInput: row.tool_input ? JSON.parse(row.tool_input) : {},
    toolResponse: row.tool_response ? JSON.parse(row.tool_response) : {},
    correctionType: row.correction_type ?? void 0,
    correctionPrompt: row.correction_prompt ?? void 0,
    skillContentHash: row.skill_content_hash ?? void 0
  };
}
/**
 * Fetch traces for a slug, newest first.
 * @param {object} [opts] - Optional filters: kind ('correction'|'success'), limit.
 */
function getTraces(db, slug, opts) {
  const clauses = ["SELECT * FROM traces WHERE slug = ?"];
  const params = [slug];
  if (opts?.kind) {
    clauses.push(" AND kind = ?");
    params.push(opts.kind);
  }
  clauses.push(" ORDER BY timestamp DESC");
  if (opts?.limit) {
    clauses.push(" LIMIT ?");
    params.push(opts.limit);
  }
  const rows = db.prepare(clauses.join("")).all(...params);
  return rows.map(rowToTrace);
}
|
|
126
|
+
/**
 * Count traces for a slug, broken down by kind. SUM over zero rows yields
 * SQL NULL, hence the ?? 0 normalization for the per-kind counts.
 * @returns {{total: number, corrections: number, successes: number}}
 */
function getTraceCount(db, slug) {
  const row = db.prepare(
    `SELECT
      COUNT(*) as total,
      SUM(CASE WHEN kind = 'correction' THEN 1 ELSE 0 END) as corrections,
      SUM(CASE WHEN kind = 'success' THEN 1 ELSE 0 END) as successes
    FROM traces WHERE slug = ?`
  ).get(slug);
  return {
    total: row.total,
    corrections: row.corrections ?? 0,
    successes: row.successes ?? 0
  };
}
/** Drop all buffered events for a session. */
function clearSessionBuffer(db, sessionId) {
  db.prepare("DELETE FROM session_buffer WHERE session_id = ?").run(sessionId);
}
|
|
143
|
+
|
|
144
|
+
export {
|
|
145
|
+
bufferToolCall,
|
|
146
|
+
bufferPrompt,
|
|
147
|
+
buildTraces,
|
|
148
|
+
persistTraces,
|
|
149
|
+
getTraces,
|
|
150
|
+
getTraceCount,
|
|
151
|
+
clearSessionBuffer
|
|
152
|
+
};
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
callAnthropic
|
|
4
|
+
} from "./chunk-DJJHS7KK.js";
|
|
5
|
+
|
|
6
|
+
// src/daemon/eval-runner.ts
|
|
7
|
+
import { createHash } from "crypto";
|
|
8
|
+
/**
 * Cache key for a (skill content, scenario) simulation pair: first 16 hex
 * chars of sha256 over the two strings joined by a NUL separator.
 */
function simCacheKey(skillContent, scenario) {
  const digest = createHash("sha256")
    .update(skillContent + "\0" + scenario)
    .digest("hex");
  return digest.slice(0, 16);
}
/**
 * Run the skill as a system prompt against a test scenario and return the
 * model's text output. Results are memoized in simOpts.cache so repeated
 * checks of the same pair cost no extra API calls.
 */
async function simulateSkill(skillContent, scenario, apiKey, simOpts) {
  const key = simCacheKey(skillContent, scenario);
  const hit = simOpts.cache.get(key);
  if (hit !== void 0) return hit;
  const result = await callAnthropic(apiKey, {
    model: simOpts.model,
    system: skillContent,
    userMessage: scenario,
    maxTokens: 2048
  });
  simOpts.cache.set(key, result.text);
  return result.text;
}
|
|
24
|
+
/**
 * Evaluate a regex presence/absence assertion against content.
 *
 * A leading "(?i)" prefix (inline case-insensitive flag, which JS RegExp
 * does not support) is translated to the "i" flag. Invalid patterns fail
 * closed with an explanatory reason.
 *
 * @param {string} content
 * @param {"must_contain"|"must_not_contain"} type
 * @param {string} pattern
 * @returns {{passed: boolean, reason: string}}
 */
function checkPatternAssertion(content, type, pattern) {
  try {
    let flags = "";
    let source = pattern;
    if (source.startsWith("(?i)")) {
      flags = "i";
      source = source.slice(4);
    }
    const matched = new RegExp(source, flags).test(content);
    if (type === "must_contain") {
      return matched
        ? { passed: true, reason: `Pattern "${pattern}" found` }
        : { passed: false, reason: `Pattern "${pattern}" not found in content` };
    }
    return matched
      ? { passed: false, reason: `Pattern "${pattern}" found in content (should be absent)` }
      : { passed: true, reason: `Pattern "${pattern}" correctly absent` };
  } catch (err) {
    return {
      passed: false,
      reason: `Invalid regex pattern: ${err instanceof Error ? err.message : String(err)}`
    };
  }
}
/**
 * Evaluate a code_check assertion: `fn` is a JS function body receiving a
 * `content` argument; a truthy return passes, any thrown error fails.
 * NOTE(review): this executes LLM-generated code via `new Function` with
 * no sandboxing — confirm that assertions only ever originate from the
 * user's own daemon.
 */
function checkCodeAssertion(content, fn) {
  try {
    const verdict = new Function("content", fn)(content);
    return verdict
      ? { passed: true, reason: "Code check passed" }
      : { passed: false, reason: "Code check returned false" };
  } catch (err) {
    return {
      passed: false,
      reason: `Code check error: ${err instanceof Error ? err.message : String(err)}`
    };
  }
}
|
|
58
|
+
/**
 * Ask an LLM judge whether content passes free-form criteria.
 *
 * Robustness fixes over the previous version: the model's reply is
 * stripped of markdown code fences before JSON.parse (the sibling
 * parseLLMJson does the same for generation replies, but this path parsed
 * raw text and failed whenever the model wrapped its JSON in ```json
 * fences), and the decoded verdict is validated to contain a boolean
 * `passed` before being trusted. Any failure (API, parse, shape) fails
 * closed with a reason.
 *
 * @param {string} skillContent - Content being judged.
 * @param {string} judgePrompt - Criteria the judge evaluates.
 * @param {string} apiKey - Anthropic API key.
 * @returns {Promise<{passed: boolean, reason: string}>}
 */
async function checkLLMJudge(skillContent, judgePrompt, apiKey) {
  try {
    const result = await callAnthropic(apiKey, {
      model: "claude-haiku-4-5-20251001",
      system: `You are an eval judge for Claude Code skills. Given a skill and a judge prompt, determine if the skill PASSES or FAILS the criteria. Respond with ONLY a JSON object: {"passed": true/false, "reason": "brief explanation"}`,
      userMessage: `Skill content:
<skill>
${skillContent}
</skill>

Judge criteria: ${judgePrompt}`,
      maxTokens: 256
    });
    // Models often wrap JSON in ``` fences despite instructions; strip them.
    let text = result.text.trim();
    if (text.startsWith("```")) {
      text = text.replace(/^```[a-z]*\n?/i, "").replace(/\n?```\s*$/, "");
    }
    const parsed = JSON.parse(text);
    // Don't trust an arbitrary object as a verdict: require a boolean.
    if (typeof parsed.passed !== "boolean") {
      return { passed: false, reason: `LLM judge returned malformed verdict: ${text.slice(0, 200)}` };
    }
    return { passed: parsed.passed, reason: String(parsed.reason ?? "") };
  } catch (err) {
    return {
      passed: false,
      reason: `LLM judge error: ${err instanceof Error ? err.message : String(err)}`
    };
  }
}
|
|
80
|
+
/**
 * Run one eval case against skill content.
 *
 * Structural assertions inspect the skill text directly; behavioral
 * assertions first simulate the skill against the case's scenario (which
 * requires simOpts) and then check the simulated output. Token limits are
 * estimated at ~4 characters per token.
 *
 * @returns {Promise<{passed: boolean, reason: string}>}
 */
async function runSingleCase(skillContent, evalCase, apiKey, simOpts) {
  const a = evalCase.assertion;
  // ── Structural (check SKILL.md text directly) ──────────────────
  if (a.type === "must_contain" || a.type === "must_not_contain") {
    return checkPatternAssertion(skillContent, a.type, a.pattern);
  }
  if (a.type === "code_check") {
    return checkCodeAssertion(skillContent, a.fn);
  }
  if (a.type === "llm_judge") {
    return checkLLMJudge(skillContent, a.prompt, apiKey);
  }
  if (a.type === "max_output_length") {
    const within = skillContent.length <= a.tokens * 4;
    return {
      passed: within,
      reason: within ? `Content length ${skillContent.length} within limit` : `Content length ${skillContent.length} exceeds token limit ${a.tokens}`
    };
  }
  // ── Behavioral (simulate skill, check output) ──────────────────
  const behavioralTypes = [
    "behavioral_must_contain",
    "behavioral_must_not_contain",
    "behavioral_max_length",
    "behavioral_judge",
    "behavioral_code_check"
  ];
  if (!behavioralTypes.includes(a.type)) {
    return { passed: false, reason: `Unknown assertion type` };
  }
  if (!simOpts) return { passed: false, reason: "No simulation config for behavioral eval" };
  const output = await simulateSkill(skillContent, a.scenario, apiKey, simOpts);
  switch (a.type) {
    case "behavioral_must_contain":
      return checkPatternAssertion(output, "must_contain", a.pattern);
    case "behavioral_must_not_contain":
      return checkPatternAssertion(output, "must_not_contain", a.pattern);
    case "behavioral_max_length": {
      const approxTokens = Math.round(output.length / 4);
      return {
        passed: approxTokens <= a.tokens,
        reason: approxTokens <= a.tokens ? `Simulated output ~${approxTokens} tokens, within limit ${a.tokens}` : `Simulated output ~${approxTokens} tokens, exceeds limit ${a.tokens}`
      };
    }
    case "behavioral_judge":
      return checkLLMJudge(output, a.criteria, apiKey);
    default:
      // behavioral_code_check
      return checkCodeAssertion(output, a.fn);
  }
}
|
|
131
|
+
/**
 * Run every case in a suite against skill content and aggregate results.
 *
 * An empty suite vacuously passes (passRate 1). lengthRatio compares the
 * candidate's length to the baseline's (1 when the baseline is empty).
 * Cases run sequentially; behavioral cases share one simulation cache.
 *
 * @returns {Promise<{passRate: number, passCount: number, totalCases: number, failedCaseIds: string[], lengthRatio: number}>}
 */
async function runEvalSuite(skillContent, suite, baselineContent, apiKey, simulationModel) {
  const lengthRatio = baselineContent.length > 0 ? skillContent.length / baselineContent.length : 1;
  if (suite.cases.length === 0) {
    return { passRate: 1, passCount: 0, totalCases: 0, failedCaseIds: [], lengthRatio };
  }
  const simOpts = simulationModel ? { model: simulationModel, cache: /* @__PURE__ */ new Map() } : void 0;
  const failedCaseIds = [];
  let passCount = 0;
  for (const evalCase of suite.cases) {
    const outcome = await runSingleCase(skillContent, evalCase, apiKey, simOpts);
    if (outcome.passed) {
      passCount++;
    } else {
      failedCaseIds.push(evalCase.id);
    }
  }
  return {
    passRate: passCount / suite.cases.length,
    passCount,
    totalCases: suite.cases.length,
    failedCaseIds,
    lengthRatio
  };
}
|
|
160
|
+
|
|
161
|
+
export {
|
|
162
|
+
checkPatternAssertion,
|
|
163
|
+
checkCodeAssertion,
|
|
164
|
+
runSingleCase,
|
|
165
|
+
runEvalSuite
|
|
166
|
+
};
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/daemon/llm.ts
|
|
4
|
+
import { readFileSync, existsSync } from "fs";
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
import { homedir } from "os";
|
|
7
|
+
// Anthropic Messages API endpoint.
var ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages";
/**
 * Resolve the Anthropic API key: the ANTHROPIC_API_KEY environment
 * variable wins; otherwise fall back to `anthropicApiKey` in
 * ~/.caik/config.json. Returns undefined when neither is available.
 */
function getAnthropicApiKey() {
  const fromEnv = process.env.ANTHROPIC_API_KEY;
  if (fromEnv) return fromEnv;
  try {
    const configPath = join(homedir(), ".caik", "config.json");
    if (existsSync(configPath)) {
      const config = JSON.parse(readFileSync(configPath, "utf-8"));
      if (typeof config.anthropicApiKey === "string") return config.anthropicApiKey;
    }
  } catch {
    // Unreadable/malformed config is deliberately treated as no config.
  }
  return void 0;
}
/**
 * Minimal Anthropic Messages API client (single user turn, no streaming).
 *
 * @param {string} apiKey - API key sent via the x-api-key header.
 * @param {{model: string, system: string, userMessage: string, maxTokens?: number}} opts
 * @returns {Promise<{text: string, inputTokens: number, outputTokens: number}>}
 * @throws {Error} on a non-2xx response or a reply with no text block.
 */
async function callAnthropic(apiKey, opts) {
  const res = await fetch(ANTHROPIC_API_URL, {
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      "content-type": "application/json"
    },
    body: JSON.stringify({
      model: opts.model,
      max_tokens: opts.maxTokens ?? 4096,
      system: opts.system,
      messages: [{ role: "user", content: opts.userMessage }]
    })
  });
  if (!res.ok) {
    const text = await res.text();
    throw new Error(`Anthropic API ${res.status}: ${text}`);
  }
  const data = await res.json();
  // The content array may hold multiple blocks; only the text block is used.
  const textBlock = data.content.find((b) => b.type === "text");
  if (!textBlock?.text) throw new Error("No text in Anthropic response");
  return {
    text: textBlock.text,
    inputTokens: data.usage?.input_tokens ?? 0,
    outputTokens: data.usage?.output_tokens ?? 0
  };
}
|
|
49
|
+
/**
 * Parse JSON from an LLM reply, tolerating a surrounding markdown code
 * fence (```json ... ```). Throws SyntaxError when the remainder is not
 * valid JSON.
 */
function parseLLMJson(text) {
  let candidate = text.trim();
  if (candidate.startsWith("```")) {
    // Drop the opening fence (optional language tag) and the closing fence.
    candidate = candidate.replace(/^```[a-z]*\n?/i, "").replace(/\n?```\s*$/, "");
  }
  return JSON.parse(candidate);
}
|
|
57
|
+
|
|
58
|
+
export {
|
|
59
|
+
getAnthropicApiKey,
|
|
60
|
+
callAnthropic,
|
|
61
|
+
parseLLMJson
|
|
62
|
+
};
|