caik-cli 0.1.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -7
- package/dist/api-6OX4ICXN.js +9 -0
- package/dist/auto-improve-skills-2COKTU5C.js +8 -0
- package/dist/autoresearch-Y7WW6L4O.js +24 -0
- package/dist/chunk-2YHUDOJL.js +54 -0
- package/dist/chunk-3TXNZINH.js +775 -0
- package/dist/chunk-5MHNQAV4.js +317 -0
- package/dist/chunk-7AIZTHHZ.js +152 -0
- package/dist/chunk-D4IM3YRX.js +166 -0
- package/dist/chunk-DJJHS7KK.js +62 -0
- package/dist/chunk-DKZBQRR3.js +91 -0
- package/dist/chunk-FLSHJZLC.js +613 -0
- package/dist/chunk-H2ZKCXMJ.js +202 -0
- package/dist/chunk-ILMOSMD3.js +83 -0
- package/dist/chunk-KYTHKH6V.js +79 -0
- package/dist/chunk-LTKHLRM4.js +272 -0
- package/dist/chunk-T32AEP3O.js +146 -0
- package/dist/chunk-T73Z5UMA.js +14437 -0
- package/dist/chunk-TFKT7V7H.js +1545 -0
- package/dist/chunk-US4CYDNS.js +524 -0
- package/dist/chunk-ZLRN7Q7C.js +27 -0
- package/dist/claude-code-6DF4YARB.js +8 -0
- package/dist/config-CS7734SA.js +24 -0
- package/dist/correction-classifier-TLPKRNLI.js +93 -0
- package/dist/cursor-Z4XXDCAM.js +8 -0
- package/dist/daemon/autoresearch-2MAEM2YI.js +272 -0
- package/dist/daemon/chunk-545XA5CB.js +77 -0
- package/dist/daemon/chunk-HEYFAUHL.js +90 -0
- package/dist/daemon/chunk-MLKGABMK.js +9 -0
- package/dist/daemon/chunk-NJICGNCK.js +150 -0
- package/dist/daemon/chunk-OD5NUFH2.js +181 -0
- package/dist/daemon/chunk-SM2FSXIP.js +60 -0
- package/dist/daemon/chunk-UMDJFPN6.js +163 -0
- package/dist/daemon/config-F7HE3JRY.js +23 -0
- package/dist/daemon/db-QEXVVTAL.js +15 -0
- package/dist/daemon/eval-generator-OR2FAYLB.js +316 -0
- package/dist/daemon/improver-TGEK6MPE.js +186 -0
- package/dist/daemon/llm-FUJ2TBYT.js +11 -0
- package/dist/daemon/nudge-detector-NFRHWZY6.js +140 -0
- package/dist/daemon/platform-7N3LQDIB.js +16381 -0
- package/dist/daemon/registry-FI4GTO3H.js +20 -0
- package/dist/daemon/server.js +356 -0
- package/dist/daemon/trace-store-T7XFGQSX.js +19 -0
- package/dist/daemon-UXYMG46V.js +85 -0
- package/dist/db-TLNRIXLK.js +18 -0
- package/dist/eval-generator-GGMRPO3K.js +21 -0
- package/dist/eval-runner-EF4K6T5Y.js +15 -0
- package/dist/index.js +8033 -568
- package/dist/llm-3UUZX6PX.js +12 -0
- package/dist/platform-52NREMBS.js +33 -0
- package/dist/repo-installer-K6ADOW3E.js +25 -0
- package/dist/setup-P744STZE.js +16 -0
- package/dist/test-loop-Y7QQE55P.js +127 -0
- package/dist/trace-store-FVLMNNDK.js +20 -0
- package/package.json +9 -3
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getTraceCount,
|
|
3
|
+
getTraces
|
|
4
|
+
} from "./chunk-NJICGNCK.js";
|
|
5
|
+
import {
|
|
6
|
+
runSingleCase
|
|
7
|
+
} from "./chunk-UMDJFPN6.js";
|
|
8
|
+
import {
|
|
9
|
+
callAnthropic,
|
|
10
|
+
parseLLMJson
|
|
11
|
+
} from "./chunk-SM2FSXIP.js";
|
|
12
|
+
import "./chunk-MLKGABMK.js";
|
|
13
|
+
|
|
14
|
+
// src/daemon/eval-generator.ts
|
|
15
|
+
import { randomUUID } from "crypto";
|
|
16
|
+
// Hydrate a persisted eval_cases row back into an in-memory eval case.
// The type-specific assertion payload is stored as a JSON string in
// row.assertion_value; which fields it carries depends on row.assertion_type.
function rowToEvalCase(row) {
  const value = JSON.parse(row.assertion_value);
  const builders = {
    must_contain: () => ({ type: "must_contain", pattern: value.pattern }),
    must_not_contain: () => ({ type: "must_not_contain", pattern: value.pattern }),
    max_output_length: () => ({ type: "max_output_length", tokens: value.tokens }),
    llm_judge: () => ({ type: "llm_judge", prompt: value.prompt }),
    code_check: () => ({ type: "code_check", fn: value.fn }),
    // Behavioral assertions additionally carry the scenario used to drive a simulation.
    behavioral_must_contain: () => ({ type: "behavioral_must_contain", scenario: value.scenario, pattern: value.pattern }),
    behavioral_must_not_contain: () => ({ type: "behavioral_must_not_contain", scenario: value.scenario, pattern: value.pattern }),
    behavioral_max_length: () => ({ type: "behavioral_max_length", scenario: value.scenario, tokens: value.tokens }),
    behavioral_judge: () => ({ type: "behavioral_judge", scenario: value.scenario, criteria: value.criteria }),
    behavioral_code_check: () => ({ type: "behavioral_code_check", scenario: value.scenario, fn: value.fn })
  };
  // Unknown types fall back to a sentinel must_contain, mirroring the stored default.
  const assertion = Object.hasOwn(builders, row.assertion_type)
    ? builders[row.assertion_type]()
    : { type: "must_contain", pattern: "UNKNOWN" };
  return {
    id: row.id,
    suiteSlug: row.suite_slug,
    traceId: row.trace_id ?? void 0,
    source: row.source,
    category: row.category,
    assertion,
    description: row.description
  };
}
|
|
64
|
+
// Load the eval suite for a slug together with all of its cases, or null
// when no suite row exists. Cases come back ordered by creation time.
function getEvalSuite(db, slug) {
  const suiteRow = db.prepare("SELECT * FROM eval_suites WHERE slug = ?").get(slug);
  if (!suiteRow) return null;
  const caseRows = db.prepare("SELECT * FROM eval_cases WHERE suite_slug = ? ORDER BY created_at ASC").all(slug);
  const suite = {
    slug: suiteRow.slug,
    version: suiteRow.version,
    cases: caseRows.map((caseRow) => rowToEvalCase(caseRow)),
    // Validation metrics are optional; normalize SQL NULLs to undefined.
    tpr: suiteRow.tpr ?? void 0,
    tnr: suiteRow.tnr ?? void 0,
    validationSampleSize: suiteRow.validation_sample_size ?? void 0,
    validatedAt: suiteRow.validated_at ?? void 0
  };
  return suite;
}
|
|
78
|
+
// Decide whether eval generation is worthwhile for a slug: requires at
// least 3 correction traces overall, and — when a suite already exists —
// at least one correction trace not yet turned into an eval case.
function shouldGenerateEvals(db, slug) {
  const counts = getTraceCount(db, slug);
  if (counts.corrections < 3) return false;
  const suite = getEvalSuite(db, slug);
  if (!suite) return true;
  // Trace IDs already consumed by existing cases.
  const seenTraceIds = new Set();
  for (const evalCase of suite.cases) {
    if (evalCase.traceId) seenTraceIds.add(evalCase.traceId);
  }
  const corrections = getTraces(db, slug, { kind: "correction" });
  return corrections.some((trace) => !seenTraceIds.has(trace.id));
}
|
|
90
|
+
// System prompt for generating STRUCTURAL eval assertions: checks that run
// directly against the artifact's text (regex / code-check / LLM judge).
var EVAL_GEN_SYSTEM_STRUCTURAL = `You generate eval test cases for Claude Code artifacts (skills, prompts, rules, subagents, commands).
Given a correction trace \u2014 what the user asked, what the skill produced, how the user corrected it \u2014
generate ONE eval assertion that would catch this failure pattern.

Prefer deterministic assertions over LLM judges:
- "must_contain" \u2014 regex pattern that MUST be present in the skill. assertion_value: {"pattern": "regex_here"}
- "must_not_contain" \u2014 regex pattern that must NOT be present. assertion_value: {"pattern": "regex_here"}
- "code_check" \u2014 JavaScript function body receiving 'content' argument. assertion_value: {"fn": "return content.includes('...')"}
- "llm_judge" \u2014 ONLY when pattern-based is impossible. assertion_value: {"prompt": "judge prompt here"}

Return ONLY a JSON object:
{"assertion_type": "...", "assertion_value": {...}, "category": "...", "description": "One line describing the failure"}`;
// System prompt for generating BEHAVIORAL eval assertions: the artifact is
// used as a system prompt, a test scenario is sent as the user message, and
// the assertion checks the simulated output.
var EVAL_GEN_SYSTEM_BEHAVIORAL = `You generate BEHAVIORAL eval test cases for Claude Code artifacts (skills, prompts, rules, subagents, commands).
A behavioral eval works by: (1) using the artifact content as a system prompt for an LLM, (2) sending a test scenario as the user message, (3) checking the simulated output.

Given a correction trace, generate ONE behavioral eval assertion with a realistic test scenario.

Available types:
- "behavioral_must_contain" \u2014 simulated output MUST match regex. assertion_value: {"scenario": "user request that triggers the skill", "pattern": "regex_for_expected_output"}
- "behavioral_must_not_contain" \u2014 simulated output must NOT match. assertion_value: {"scenario": "...", "pattern": "regex_for_unwanted_output"}
- "behavioral_max_length" \u2014 output must be under N tokens. assertion_value: {"scenario": "...", "tokens": 200}
- "behavioral_code_check" \u2014 JS function on output string. assertion_value: {"scenario": "...", "fn": "return content.split('\\n').length < 20"}
- "behavioral_judge" \u2014 ONLY when pattern-based is impossible. assertion_value: {"scenario": "...", "criteria": "what good output looks like"}

The scenario should be a realistic user request that would expose the weakness found in the correction.
Prefer behavioral_must_contain and behavioral_must_not_contain over behavioral_judge.

Return ONLY a JSON object:
{"assertion_type": "behavioral_...", "assertion_value": {...}, "category": "...", "description": "One line describing what the output should/shouldn't do"}`;
|
|
119
|
+
// Correction kinds that indicate the produced *output* (not the artifact
// text) was wrong and therefore call for a behavioral (simulated) eval.
var BEHAVIORAL_CORRECTION_TYPES = new Set(["scope_adjusted", "format_changed", "output_modified"]);
function isBehavioralCorrection(correctionType) {
  return BEHAVIORAL_CORRECTION_TYPES.has(correctionType ?? "");
}
|
|
122
|
+
// Map the LLM's parsed JSON onto a validated assertion object, coercing each
// field to the expected primitive type. Returns null for unrecognized
// assertion types so the caller can discard the generation instead of
// persisting junk. (Extracted so the dispatch is testable on its own and not
// buried inside the network call path.)
function buildGeneratedAssertion(parsed) {
  const value = parsed.assertion_value;
  switch (parsed.assertion_type) {
    case "must_contain":
      return { type: "must_contain", pattern: String(value.pattern ?? "") };
    case "must_not_contain":
      return { type: "must_not_contain", pattern: String(value.pattern ?? "") };
    case "code_check":
      return { type: "code_check", fn: String(value.fn ?? "") };
    case "llm_judge":
      return { type: "llm_judge", prompt: String(value.prompt ?? "") };
    case "max_output_length":
      return { type: "max_output_length", tokens: Number(value.tokens ?? 500) };
    // Behavioral types additionally carry the scenario that drives a simulated run.
    case "behavioral_must_contain":
      return { type: "behavioral_must_contain", scenario: String(value.scenario ?? ""), pattern: String(value.pattern ?? "") };
    case "behavioral_must_not_contain":
      return { type: "behavioral_must_not_contain", scenario: String(value.scenario ?? ""), pattern: String(value.pattern ?? "") };
    case "behavioral_max_length":
      return { type: "behavioral_max_length", scenario: String(value.scenario ?? ""), tokens: Number(value.tokens ?? 500) };
    case "behavioral_judge":
      return { type: "behavioral_judge", scenario: String(value.scenario ?? ""), criteria: String(value.criteria ?? "") };
    case "behavioral_code_check":
      return { type: "behavioral_code_check", scenario: String(value.scenario ?? ""), fn: String(value.fn ?? "") };
    default:
      return null;
  }
}
// Generate one eval case from a correction trace by asking the model for an
// assertion that would catch the observed failure. Behavioral correction
// types get the behavioral system prompt; everything else gets the
// structural one. Returns { evalCase, llmCalls } on success, or null when
// the trace carries no correction signal, the LLM response is incomplete or
// unrecognized, or the API call fails (best-effort: errors are swallowed).
async function generateEvalCase(trace, apiKey, model = "claude-haiku-4-5-20251001") {
  // Nothing to learn from a trace without any correction signal.
  if (!trace.correctionType && !trace.correctionPrompt) return null;
  const useBehavioral = isBehavioralCorrection(trace.correctionType);
  const userMessage = `Correction trace:
- Skill slug: ${trace.slug}
- Correction type: ${trace.correctionType ?? "unknown"}
- User's correction message: ${trace.correctionPrompt ?? "(no message captured)"}
- Tool that was used: ${trace.toolName}
- Tool input: ${JSON.stringify(trace.toolInput, null, 2).slice(0, 2e3)}
- Tool response: ${JSON.stringify(trace.toolResponse, null, 2).slice(0, 2e3)}

Generate an eval assertion that would detect this failure pattern.`;
  try {
    const result = await callAnthropic(apiKey, {
      model,
      system: useBehavioral ? EVAL_GEN_SYSTEM_BEHAVIORAL : EVAL_GEN_SYSTEM_STRUCTURAL,
      userMessage,
      maxTokens: 512
    });
    const parsed = parseLLMJson(result.text);
    // All four fields are required; a partial response is unusable.
    if (!parsed.assertion_type || !parsed.assertion_value || !parsed.category || !parsed.description) {
      return null;
    }
    const assertion = buildGeneratedAssertion(parsed);
    if (!assertion) return null;
    return {
      evalCase: {
        traceId: trace.id,
        source: "trace",
        category: parsed.category,
        assertion,
        description: parsed.description
      },
      llmCalls: 1
    };
  } catch {
    // Deliberate best-effort: generation failures must not break the daemon loop.
    return null;
  }
}
|
|
195
|
+
// Serialize the type-specific payload of an assertion to the JSON string
// stored in eval_cases.assertion_value. The `type` field itself is persisted
// in a separate column, so it is deliberately omitted here.
function serializeAssertionValue(assertion) {
  const { type } = assertion;
  if (type === "must_contain" || type === "must_not_contain") {
    return JSON.stringify({ pattern: assertion.pattern });
  }
  if (type === "code_check") {
    return JSON.stringify({ fn: assertion.fn });
  }
  if (type === "llm_judge") {
    return JSON.stringify({ prompt: assertion.prompt });
  }
  if (type === "max_output_length") {
    return JSON.stringify({ tokens: assertion.tokens });
  }
  if (type === "behavioral_must_contain" || type === "behavioral_must_not_contain") {
    return JSON.stringify({ scenario: assertion.scenario, pattern: assertion.pattern });
  }
  if (type === "behavioral_code_check") {
    return JSON.stringify({ scenario: assertion.scenario, fn: assertion.fn });
  }
  if (type === "behavioral_judge") {
    return JSON.stringify({ scenario: assertion.scenario, criteria: assertion.criteria });
  }
  if (type === "behavioral_max_length") {
    return JSON.stringify({ scenario: assertion.scenario, tokens: assertion.tokens });
  }
  // Unknown types yield undefined, matching the original switch's fall-through.
  return void 0;
}
|
|
217
|
+
// Hard cap on eval cases per suite; also bounds LLM spend per generation run.
var MAX_EVAL_CASES = 20;
// Incrementally grow the eval suite for a slug from unprocessed correction
// traces: creates the suite row on first run, generates at most one case per
// trace (skipping categories already covered), trims the suite oldest-first
// to MAX_EVAL_CASES, and bumps the suite version. Returns the updated suite
// plus the number of LLM calls spent.
async function generateEvalSuite(db, slug, apiKey, model = "claude-haiku-4-5-20251001") {
  const now = (/* @__PURE__ */ new Date()).toISOString();
  let totalLLMCalls = 0;
  let suite = getEvalSuite(db, slug);
  if (!suite) {
    // First run for this slug: create an empty version-1 suite.
    db.prepare(
      "INSERT INTO eval_suites (slug, version, created_at, updated_at) VALUES (?, 1, ?, ?)"
    ).run(slug, now, now);
    suite = { slug, version: 1, cases: [] };
  }
  // Only correction traces that have not already produced a case are new work.
  const processedTraceIds = new Set(
    suite.cases.filter((c) => c.traceId).map((c) => c.traceId)
  );
  const correctionTraces = getTraces(db, slug, { kind: "correction" });
  const unprocessed = correctionTraces.filter((t) => !processedTraceIds.has(t.id));
  const existingCategories = new Set(suite.cases.map((c) => c.category));
  for (const trace of unprocessed) {
    // Two independent budgets: suite size and LLM spend for this run.
    if (suite.cases.length >= MAX_EVAL_CASES) break;
    if (totalLLMCalls >= MAX_EVAL_CASES) break;
    const result = await generateEvalCase(trace, apiKey, model);
    if (!result) continue;
    totalLLMCalls += result.llmCalls;
    // One case per category: a duplicate still consumed an LLM call but is discarded.
    if (existingCategories.has(result.evalCase.category)) continue;
    existingCategories.add(result.evalCase.category);
    const caseId = randomUUID();
    const assertionValue = serializeAssertionValue(result.evalCase.assertion);
    db.prepare(
      `INSERT INTO eval_cases (id, suite_slug, trace_id, source, category, assertion_type, assertion_value, description, created_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    ).run(caseId, slug, trace.id, "trace", result.evalCase.category, result.evalCase.assertion.type, assertionValue, result.evalCase.description, now);
    suite.cases.push({
      id: caseId,
      suiteSlug: slug,
      traceId: trace.id,
      source: "trace",
      category: result.evalCase.category,
      assertion: result.evalCase.assertion,
      description: result.evalCase.description
    });
  }
  // If a pre-existing suite was already over the cap, evict the oldest cases
  // (cases are loaded in created_at ASC order, so the front of the array is oldest).
  if (suite.cases.length > MAX_EVAL_CASES) {
    const toRemove = suite.cases.slice(0, suite.cases.length - MAX_EVAL_CASES);
    for (const c of toRemove) {
      db.prepare("DELETE FROM eval_cases WHERE id = ?").run(c.id);
    }
    suite.cases = suite.cases.slice(suite.cases.length - MAX_EVAL_CASES);
  }
  // Version is bumped unconditionally, even when no new case was added.
  db.prepare("UPDATE eval_suites SET version = version + 1, updated_at = ? WHERE slug = ?").run(now, slug);
  suite.version++;
  return { suite, llmCalls: totalLLMCalls };
}
|
|
269
|
+
// Estimate the discriminative quality of a suite's assertions against
// historical data and persist the result on the eval_suites row.
//   TPR: fraction of known-bad correction traces (their stringified tool
//        responses) that at least one case flags as failing.
//   TNR: 1 or 0 depending on whether skillContent passes every case; only
//        computed when skillContent is provided. NOTE(review): skillContent
//        is presumed to be known-good current content — single-sample TNR.
// Returns { tpr, tnr, sampleSize }; tpr/tnr are undefined when not computable.
async function validateJudges(db, slug, apiKey, skillContent, simulationModel) {
  const suite = getEvalSuite(db, slug);
  if (!suite || suite.cases.length === 0) {
    return { tpr: void 0, tnr: void 0, sampleSize: 0 };
  }
  const corrections = getTraces(db, slug, { kind: "correction" });
  // A shared cache lets behavioral cases reuse simulated outputs across cases.
  const simulationOptions = simulationModel ? { model: simulationModel, cache: /* @__PURE__ */ new Map() } : void 0;
  let truePositives = 0;
  for (const trace of corrections) {
    const badContent = JSON.stringify(trace.toolResponse);
    let anyFailed = false;
    for (const evalCase of suite.cases) {
      const result = await runSingleCase(badContent, evalCase, apiKey, simulationOptions);
      if (!result.passed) {
        // One failing case is enough to count this bad trace as detected.
        anyFailed = true;
        break;
      }
    }
    if (anyFailed) truePositives++;
  }
  let tnr;
  if (skillContent) {
    let allPassed = true;
    for (const evalCase of suite.cases) {
      const result = await runSingleCase(skillContent, evalCase, apiKey, simulationOptions);
      if (!result.passed) {
        allPassed = false;
        break;
      }
    }
    tnr = allPassed ? 1 : 0;
  }
  const tpr = corrections.length > 0 ? truePositives / corrections.length : void 0;
  const sampleSize = corrections.length + (skillContent ? 1 : 0);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  db.prepare(
    "UPDATE eval_suites SET tpr = ?, tnr = ?, validation_sample_size = ?, validated_at = ? WHERE slug = ?"
  ).run(tpr ?? null, tnr ?? null, sampleSize, now, slug);
  return { tpr, tnr, sampleSize };
}
|
|
309
|
+
export {
|
|
310
|
+
generateEvalCase,
|
|
311
|
+
generateEvalSuite,
|
|
312
|
+
getEvalSuite,
|
|
313
|
+
isBehavioralCorrection,
|
|
314
|
+
shouldGenerateEvals,
|
|
315
|
+
validateJudges
|
|
316
|
+
};
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import {
|
|
2
|
+
findRegistryEntry,
|
|
3
|
+
upsertRegistryEntry
|
|
4
|
+
} from "./chunk-HEYFAUHL.js";
|
|
5
|
+
import {
|
|
6
|
+
readConfig
|
|
7
|
+
} from "./chunk-545XA5CB.js";
|
|
8
|
+
import "./chunk-MLKGABMK.js";
|
|
9
|
+
|
|
10
|
+
// src/daemon/improver.ts
|
|
11
|
+
import { readFileSync, writeFileSync, mkdirSync, existsSync, appendFileSync } from "fs";
|
|
12
|
+
import { join, dirname } from "path";
|
|
13
|
+
import { homedir } from "os";
|
|
14
|
+
// Anthropic Messages API endpoint used for improvement generation.
var ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages";
var IMPROVEMENT_MODEL = "claude-haiku-4-5-20251001";
// CAIK_DEMO=1 lowers the trigger thresholds so improvements fire immediately.
var isDemo = process.env.CAIK_DEMO === "1";
var MIN_OBSERVATIONS = isDemo ? 1 : 5;
var MIN_CORRECTION_RATE = isDemo ? 0 : 0.3;
// Guardrail: reject improved content whose length drifts beyond [0.5x, 2x]
// of the original, as a cheap sanity check on the LLM output.
var MIN_LENGTH_RATIO = 0.5;
var MAX_LENGTH_RATIO = 2;
|
|
21
|
+
// Absolute path of the append-only improvement log under the user's home.
function getLogPath() {
  const caikDir = join(homedir(), ".caik");
  return join(caikDir, "improvement.log");
}
|
|
24
|
+
// Append a timestamped line to the improvement log, creating ~/.caik on
// first use. Failures are intentionally swallowed: logging must never break
// the improvement pipeline.
function log(message) {
  const stamped = `[${new Date().toISOString()}] ${message}\n`;
  try {
    const logPath = getLogPath();
    const logDir = dirname(logPath);
    if (!existsSync(logDir)) mkdirSync(logDir, { recursive: true });
    appendFileSync(logPath, stamped, "utf-8");
  } catch {
    // best-effort only
  }
}
|
|
35
|
+
// Resolve the Anthropic API key: the ANTHROPIC_API_KEY environment variable
// wins; otherwise fall back to the anthropicApiKey field of
// ~/.caik/config.json. Returns undefined when neither source yields a string.
function getAnthropicApiKey() {
  const fromEnv = process.env.ANTHROPIC_API_KEY;
  if (fromEnv) return fromEnv;
  try {
    const configPath = join(homedir(), ".caik", "config.json");
    if (existsSync(configPath)) {
      const config = JSON.parse(readFileSync(configPath, "utf-8"));
      if (typeof config.anthropicApiKey === "string") return config.anthropicApiKey;
    }
  } catch {
    // unreadable or invalid config falls through to undefined
  }
  return void 0;
}
|
|
47
|
+
// Replace every occurrence of the user's home directory with "~/..." so
// uploaded content does not leak local usernames or paths.
function stripPii(content) {
  return content.split(homedir()).join("~/...");
}
|
|
51
|
+
// Aggregate observation records into summary statistics: total count, the
// fraction that carried a correction (correctionType !== undefined), and a
// histogram of correction types.
function computeStats(observations) {
  const total = observations.length;
  const correctionCounts = {};
  let correctionTotal = 0;
  for (const obs of observations) {
    if (obs.correctionType === void 0) continue;
    correctionTotal += 1;
    correctionCounts[obs.correctionType] = (correctionCounts[obs.correctionType] ?? 0) + 1;
  }
  const correctionRate = total > 0 ? correctionTotal / total : 0;
  return { total, correctionRate, correctionCounts };
}
|
|
62
|
+
// Ask the improvement model for a revised artifact: builds a prompt from the
// current content plus the observation summary, POSTs it to the Anthropic
// Messages API, and returns the first text block of the response.
// Throws on a non-2xx response or a response without text content.
async function callAnthropic(apiKey, currentContent, stats) {
  const rankedTypes = Object.entries(stats.correctionCounts).sort(([, a], [, b]) => b - a);
  const correctionSummary = rankedTypes.map(([type, count]) => `- ${type}: ${count} occurrences`).join("\n");
  const userMessage = `Here is the current artifact content:

<skill>
${currentContent}
</skill>

Observation summary:
- Total observations: ${stats.total}
- Correction rate: ${(stats.correctionRate * 100).toFixed(1)}%
- Correction types:
${correctionSummary}

Improve this artifact to reduce the correction rate. Focus on the most frequent correction types.`;
  const response = await fetch(ANTHROPIC_API_URL, {
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      "content-type": "application/json"
    },
    body: JSON.stringify({
      model: IMPROVEMENT_MODEL,
      max_tokens: 4096,
      system: "You are improving a Claude Code artifact based on observed failure patterns. Make minimal, targeted changes. Preserve the artifact's core purpose and structure. Return ONLY the improved content, no explanation.",
      messages: [{ role: "user", content: userMessage }]
    })
  });
  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Anthropic API ${response.status}: ${errorText}`);
  }
  const data = await response.json();
  const textBlock = data.content.find((block) => block.type === "text");
  if (!textBlock?.text) throw new Error("No text in Anthropic response");
  return textBlock.text;
}
|
|
101
|
+
// Snapshot the pre-improvement content under ~/.caik/versions/<slug>/ using
// a filesystem-safe ISO-timestamp filename, so improvements can be rolled back.
function versionOldContent(slug, content) {
  const versionsDir = join(homedir(), ".caik", "versions", slug);
  mkdirSync(versionsDir, { recursive: true });
  // ':' and '.' are invalid/awkward in filenames on some platforms.
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  writeFileSync(join(versionsDir, `${stamp}.md`), content, "utf-8");
}
|
|
107
|
+
// Fire-and-forget telemetry about a proposed improvement, scoped by the
// user's configured contribution level:
//   none        -> nothing is sent
//   contributor -> stats only (correction distribution + rate)
//   collective  -> stats plus the PII-stripped improved content
// Network failures are logged but never propagated.
function reportContribution(slug, improvementType, stats, improvedContent) {
  const config = readConfig();
  const level = config.contributionLevel ?? "contributor";
  if (level === "none") return;
  const apiUrl = config.apiUrl ?? "https://www.caik.dev";
  const apiKey = config.apiKey;
  const payload = { slug, improved: true, type: improvementType };
  if (level === "contributor" || level === "collective") {
    payload.correctionDistribution = stats.correctionCounts;
    payload.correctionRate = stats.correctionRate;
  }
  if (level === "collective" && improvedContent) {
    payload.improvedContent = stripPii(improvedContent);
  }
  const headers = { "content-type": "application/json" };
  if (apiKey) headers.authorization = `Bearer ${apiKey}`;
  fetch(`${apiUrl}/api/v1/improvements`, {
    method: "POST",
    headers,
    body: JSON.stringify(payload)
  }).catch((err) => {
    log(`Contribution report failed: ${err instanceof Error ? err.message : String(err)}`);
  });
}
|
|
132
|
+
// End-to-end improvement pass for one artifact slug:
//   1. locate the artifact via the registry and read its current content;
//   2. bail out unless observation/correction thresholds are met;
//   3. ask the LLM for improved content and sanity-check its length;
//   4. version the old content, write the proposal next to the artifact as
//      SKILL.proposed.md (never overwriting the live file), mark the registry
//      entry pending, and report the contribution.
// Never throws: all failures are logged and reported in the result object.
async function runImprovement(slug, observations) {
  try {
    const entry = findRegistryEntry(slug);
    if (!entry) {
      return { slug, improved: false, error: "Not found in registry" };
    }
    // Convention: the first registered file is the primary artifact file.
    const skillPath = entry.files[0];
    if (!skillPath || !existsSync(skillPath)) {
      return { slug, improved: false, error: "Skill file not found" };
    }
    const currentContent = readFileSync(skillPath, "utf-8");
    const slugObs = observations.filter((o) => o.slug === slug);
    const stats = computeStats(slugObs);
    // Not enough signal yet: silently skip (not an error).
    if (stats.total < MIN_OBSERVATIONS || stats.correctionRate < MIN_CORRECTION_RATE) {
      return { slug, improved: false };
    }
    const apiKey = getAnthropicApiKey();
    if (!apiKey) {
      log(`No Anthropic API key available for improvement of ${slug}`);
      return { slug, improved: false, error: "No API key" };
    }
    const improvedContent = await callAnthropic(apiKey, currentContent, stats);
    // Guardrail: reject output whose length drifted too far from the original.
    const ratio = improvedContent.length / currentContent.length;
    if (ratio < MIN_LENGTH_RATIO || ratio > MAX_LENGTH_RATIO) {
      log(`Rejected improvement for ${slug}: length ratio ${ratio.toFixed(2)} out of bounds`);
      return { slug, improved: false, error: `Length ratio ${ratio.toFixed(2)} out of bounds` };
    }
    versionOldContent(slug, currentContent);
    // Write a proposal file beside the artifact; the live SKILL.md is untouched.
    const proposedPath = skillPath.replace(/SKILL\.md$/, "SKILL.proposed.md");
    writeFileSync(proposedPath, improvedContent, "utf-8");
    // Label the improvement by the most frequent correction type.
    const topCorrection = Object.entries(stats.correctionCounts).sort(([, a], [, b]) => b - a)[0]?.[0] ?? "general";
    const improvementType = `correction_fix:${topCorrection}`;
    const now = (/* @__PURE__ */ new Date()).toISOString();
    const updatedEntry = {
      ...entry,
      updatedAt: now,
      pendingImprovement: true,
      improvementLog: [
        ...entry.improvementLog ?? [],
        { ts: now, type: improvementType }
      ]
    };
    upsertRegistryEntry(updatedEntry);
    reportContribution(slug, improvementType, stats, improvedContent);
    log(`Proposed improvement for ${slug}: ${improvementType}`);
    return { slug, improved: true, improvementType };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log(`Improvement failed for ${slug}: ${message}`);
    return { slug, improved: false, error: message };
  }
}
|
|
184
|
+
export {
|
|
185
|
+
runImprovement
|
|
186
|
+
};
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import "./chunk-MLKGABMK.js";
|
|
2
|
+
|
|
3
|
+
// src/daemon/nudge-detector.ts
|
|
4
|
+
import { readFileSync, writeFileSync, readdirSync, existsSync, mkdirSync } from "fs";
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
import { homedir } from "os";
|
|
7
|
+
// On-disk location of pending contribution nudges.
var NUDGE_PATH = join(homedir(), ".caik", "contribution-nudges.json");
// Built-in agent tools that should never produce a "share this" nudge.
var AGENT_PRIMITIVES = /* @__PURE__ */ new Set([
  "Read",
  "Write",
  "Edit",
  "Bash",
  "Grep",
  "Glob",
  "Agent",
  "ToolSearch",
  "WebFetch",
  "WebSearch",
  "NotebookEdit",
  "TaskCreate",
  "TaskUpdate",
  "TaskGet",
  "TaskList",
  "TaskOutput",
  "TaskStop",
  "Skill",
  "SendMessage",
  "EnterPlanMode",
  "ExitPlanMode",
  "AskUserQuestion",
  "EnterWorktree",
  "ExitWorktree"
]);
// Thresholds a tool must clear before the user is nudged to share it.
var MIN_USES = 10;
var MIN_SESSIONS = 3;
var MIN_SUCCESS_RATE = 0.8;
// At most this many nudges per detection pass.
var MAX_NUDGES = 3;
// A tool is not re-nudged within this window (7 days in milliseconds).
var DEDUP_WINDOW_MS = 7 * 24 * 60 * 60 * 1e3;
|
|
39
|
+
// True for caik's own tools (MCP-prefixed server names or plain "caik*"),
// which are excluded from contribution nudges.
function isCaikTool(tool) {
  const caikPrefixes = ["mcp__caik__", "mcp__caik-dev__", "caik"];
  return caikPrefixes.some((prefix) => tool.startsWith(prefix));
}
|
|
42
|
+
// Human-friendly name for a tool: an MCP tool ("mcp__server__tool") displays
// as its server segment; anything else displays as-is.
function extractDisplayName(tool) {
  if (!tool.startsWith("mcp__")) return tool;
  const [, serverName] = tool.split("__");
  return serverName ?? tool;
}
|
|
49
|
+
// Scan observation .jsonl files and surface external tools the user relies
// on heavily but has not contributed: filters out agent primitives, caik's
// own tools, and already-registered slugs; keeps tools with at least
// MIN_USES uses across MIN_SESSIONS sessions at MIN_SUCCESS_RATE success;
// returns the top MAX_NUDGES candidates sorted by use count.
function detectContributionNudges(observationsDir, registrySlugs) {
  if (!existsSync(observationsDir)) return [];
  let files;
  try {
    files = readdirSync(observationsDir).filter((f) => f.endsWith(".jsonl"));
  } catch {
    return [];
  }
  // Per-tool aggregate: uses, distinct sessions, successes, last-used time.
  const toolStats = /* @__PURE__ */ new Map();
  for (const file of files) {
    const filePath = join(observationsDir, file);
    let lines;
    try {
      const content = readFileSync(filePath, "utf-8").trim();
      if (!content) continue;
      lines = content.split("\n").filter(Boolean);
    } catch {
      // unreadable file: skip it rather than abort the whole scan
      continue;
    }
    for (const line of lines) {
      let obs;
      try {
        obs = JSON.parse(line);
      } catch {
        // tolerate partially-written JSONL lines
        continue;
      }
      const tool = obs.tool;
      if (!tool) continue;
      if (AGENT_PRIMITIVES.has(tool)) continue;
      if (isCaikTool(tool)) continue;
      // Already registered: nothing to nudge about.
      if (registrySlugs.has(obs.slug)) continue;
      let stats = toolStats.get(tool);
      if (!stats) {
        stats = { tool, uses: 0, sessions: /* @__PURE__ */ new Set(), successes: 0, lastUsed: obs.timestamp };
        toolStats.set(tool, stats);
      }
      stats.uses++;
      if (obs.sessionId) stats.sessions.add(obs.sessionId);
      if (obs.success) stats.successes++;
      // ISO-8601 timestamps compare correctly as strings.
      if (obs.timestamp > stats.lastUsed) stats.lastUsed = obs.timestamp;
    }
  }
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const candidates = [];
  for (const stats of toolStats.values()) {
    if (stats.uses < MIN_USES) continue;
    if (stats.sessions.size < MIN_SESSIONS) continue;
    const successRate = stats.uses > 0 ? stats.successes / stats.uses : 0;
    if (successRate < MIN_SUCCESS_RATE) continue;
    const isMcp = stats.tool.startsWith("mcp__");
    candidates.push({
      type: isMcp ? "share_mcp_server" : "share_skill",
      toolName: stats.tool,
      displayName: extractDisplayName(stats.tool),
      uses: stats.uses,
      sessions: stats.sessions.size,
      // round to two decimal places
      successRate: Math.round(successRate * 100) / 100,
      lastUsed: stats.lastUsed,
      createdAt: now
    });
  }
  candidates.sort((a, b) => b.uses - a.uses);
  return candidates.slice(0, MAX_NUDGES);
}
|
|
113
|
+
// Load persisted nudges; any read or parse failure (including a missing
// file) yields an empty list.
function readNudgeFile() {
  try {
    const raw = readFileSync(NUDGE_PATH, "utf-8");
    return JSON.parse(raw);
  } catch {
    return [];
  }
}
|
|
120
|
+
// Merge new nudges into the nudge file: drops any tool already nudged within
// DEDUP_WINDOW_MS, expires stale stored nudges, and caps the stored list at
// five entries. No-op when every candidate was recently nudged.
function writeNudgeFile(nudges) {
  const cutoff = new Date(Date.now() - DEDUP_WINDOW_MS).toISOString();
  const existing = readNudgeFile();
  const stillFresh = existing.filter((n) => n.createdAt > cutoff);
  const recentlyNudged = new Set(stillFresh.map((n) => n.toolName));
  const newNudges = nudges.filter((n) => !recentlyNudged.has(n.toolName));
  if (newNudges.length === 0) return;
  const merged = [...stillFresh, ...newNudges].slice(0, 5);
  const caikDir = join(homedir(), ".caik");
  // mode 448 === 0o700: the nudge directory is private to the user.
  if (!existsSync(caikDir)) mkdirSync(caikDir, { recursive: true, mode: 448 });
  writeFileSync(NUDGE_PATH, JSON.stringify(merged, null, 2), "utf-8");
}
|
|
136
|
+
export {
|
|
137
|
+
detectContributionNudges,
|
|
138
|
+
readNudgeFile,
|
|
139
|
+
writeNudgeFile
|
|
140
|
+
};
|