vskill 0.2.85 → 0.2.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -3
- package/dist/commands/eval/run.js +2 -3
- package/dist/commands/eval/run.js.map +1 -1
- package/dist/eval/__tests__/activation-tester.test.js +106 -0
- package/dist/eval/__tests__/activation-tester.test.js.map +1 -1
- package/dist/eval/__tests__/prompt-builder.test.js +53 -1
- package/dist/eval/__tests__/prompt-builder.test.js.map +1 -1
- package/dist/eval/activation-tester.d.ts +8 -2
- package/dist/eval/activation-tester.js +48 -2
- package/dist/eval/activation-tester.js.map +1 -1
- package/dist/eval/benchmark-history.d.ts +4 -4
- package/dist/eval/benchmark-history.js.map +1 -1
- package/dist/eval/benchmark.d.ts +6 -1
- package/dist/eval/benchmark.js.map +1 -1
- package/dist/eval/comparator.js +13 -5
- package/dist/eval/comparator.js.map +1 -1
- package/dist/eval/prompt-builder.d.ts +7 -0
- package/dist/eval/prompt-builder.js +26 -2
- package/dist/eval/prompt-builder.js.map +1 -1
- package/dist/eval-server/api-routes.js +14 -11
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/eval-server.js +2 -1
- package/dist/eval-server/eval-server.js.map +1 -1
- package/dist/eval-server/improve-routes.js +20 -0
- package/dist/eval-server/improve-routes.js.map +1 -1
- package/dist/eval-server/model-compare-routes.js +49 -4
- package/dist/eval-server/model-compare-routes.js.map +1 -1
- package/dist/eval-ui/assets/index-CRZR_1WI.js +69 -0
- package/dist/eval-ui/assets/index-Cw1cSOQk.css +1 -0
- package/dist/eval-ui/index.html +3 -3
- package/dist/index.js +12 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-9X1GJMu6.js +0 -88
- package/dist/eval-ui/assets/index-C9iDWI2Z.css +0 -1
package/README.md
CHANGED
|
@@ -265,14 +265,81 @@ Think of it like testing a recipe book: you don't cook the food, you check wheth
|
|
|
265
265
|
|
|
266
266
|
The **A/B comparison** randomly shuffles outputs as "Response A" and "Response B" before scoring, so the judge can't tell which used the skill. Each response is scored on content (1-5) and structure (1-5). The delta between skill and baseline averages produces a verdict: EFFECTIVE, MARGINAL, INEFFECTIVE, or DEGRADING.
|
|
267
267
|
|
|
268
|
+
### Unit testing vs integration testing
|
|
269
|
+
|
|
270
|
+
Skill evals are **unit tests** — they verify the skill's teaching quality in isolation, without calling external tools or APIs. This is a deliberate design choice:
|
|
271
|
+
|
|
272
|
+
| | Unit Tests (current) | Integration Tests |
|
|
273
|
+
|:---|:---|:---|
|
|
274
|
+
| **What** | Does the SKILL.md teach the right workflow? | Does the end-to-end tool execution work? |
|
|
275
|
+
| **Speed** | ~30s per case | ~3min per case |
|
|
276
|
+
| **Infrastructure** | None — any LLM provider | Real MCP servers, auth tokens, test data |
|
|
277
|
+
| **CI/CD** | Runs anywhere | Needs secrets, test workspaces |
|
|
278
|
+
| **Flakiness** | Low (deterministic text) | High (external APIs, rate limits) |
|
|
279
|
+
| **Coverage** | Workflow, tool selection, formatting, parameters | API compatibility, auth, error recovery |
|
|
280
|
+
|
|
281
|
+
**Why unit tests are sufficient for most skills:** The eval doesn't test whether Slack's API works — it tests whether your SKILL.md correctly teaches an LLM to use `slack_search_channels` before `slack_read_channel`, to use `thread_ts` for replies, and to format messages with `*bold*` instead of `**bold**`. If the teaching is correct, the execution follows.
|
|
282
|
+
|
|
283
|
+
#### MCP-dependent skills (Slack, GitHub, Linear, etc.)
|
|
284
|
+
|
|
285
|
+
Skills that reference MCP tools automatically get **simulation mode** during evals. The eval system detects MCP tool references in your SKILL.md and instructs the LLM to demonstrate the complete workflow with simulated tool responses. This means your assertions can test tool selection, parameter correctness, and workflow order — even without a real MCP connection.
|
|
286
|
+
|
|
287
|
+
```
|
|
288
|
+
Standard skill eval: MCP skill eval (automatic):
|
|
289
|
+
┌──────────┐ ┌──────────┐
|
|
290
|
+
│ SKILL.md │ → system prompt │ SKILL.md │ → system prompt
|
|
291
|
+
└──────────┘ └──────────┘ + simulation instructions
|
|
292
|
+
↓ ↓
|
|
293
|
+
┌──────────┐ ┌──────────┐
|
|
294
|
+
│ LLM │ → text response │ LLM │ → simulated workflow
|
|
295
|
+
└──────────┘ └──────────┘ (tool calls + mock responses)
|
|
296
|
+
↓ ↓
|
|
297
|
+
┌──────────┐ ┌──────────┐
|
|
298
|
+
│ Judge │ → pass/fail │ Judge │ → pass/fail
|
|
299
|
+
└──────────┘ └──────────┘
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
No configuration needed — if your SKILL.md mentions `slack_*`, `github_*`, `linear_*`, or `gws_*` tools, simulation mode activates automatically.
|
|
303
|
+
|
|
304
|
+
#### Activation testing
|
|
305
|
+
|
|
306
|
+
Skills can also include trigger accuracy tests in `evals/activation-prompts.json`:
|
|
307
|
+
|
|
308
|
+
```json
|
|
309
|
+
{
|
|
310
|
+
"prompts": [
|
|
311
|
+
{ "prompt": "check what's new in #engineering", "expected": "should_activate" },
|
|
312
|
+
{ "prompt": "send an email to the team", "expected": "should_not_activate" }
|
|
313
|
+
]
|
|
314
|
+
}
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
This tests whether your skill's `description` field in SKILL.md causes the skill to trigger only on the prompts it should (precision) and not miss relevant ones (recall). Results show TP/TN/FP/FN classification with precision, recall, and reliability metrics.
|
|
318
|
+
|
|
319
|
+
#### Cross-model testing
|
|
320
|
+
|
|
321
|
+
The eval system supports Claude (via the CLI or the Anthropic API) and Ollama. Testing across models reveals:
|
|
322
|
+
- Whether your skill helps **weaker models** (Llama, Qwen) follow complex workflows
|
|
323
|
+
- Whether base model improvements have made a skill **unnecessary**
|
|
324
|
+
- Whether your simulation instructions are **clear enough** for smaller models
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# Test with Opus (high-end)
|
|
328
|
+
VSKILL_EVAL_MODEL=opus npx vskill eval run my-skill
|
|
329
|
+
|
|
330
|
+
# Test with Ollama (open-source)
|
|
331
|
+
VSKILL_EVAL_PROVIDER=ollama VSKILL_EVAL_MODEL=llama3.1:8b npx vskill eval run my-skill
|
|
332
|
+
```
|
|
333
|
+
|
|
268
334
|
### Directory structure
|
|
269
335
|
|
|
270
336
|
```
|
|
271
337
|
your-skill/
|
|
272
|
-
├── SKILL.md
|
|
338
|
+
├── SKILL.md # The skill definition
|
|
273
339
|
└── evals/
|
|
274
|
-
├── evals.json
|
|
275
|
-
|
|
340
|
+
├── evals.json # Test cases + assertions
|
|
341
|
+
├── activation-prompts.json # Trigger accuracy tests (optional)
|
|
342
|
+
└── benchmark.json # Latest benchmark results (auto-generated)
|
|
276
343
|
```
|
|
277
344
|
|
|
278
345
|
### evals.json format
|
|
@@ -8,6 +8,7 @@ import { createLlmClient } from "../../eval/llm.js";
|
|
|
8
8
|
import { judgeAssertion } from "../../eval/judge.js";
|
|
9
9
|
import { writeBenchmark } from "../../eval/benchmark.js";
|
|
10
10
|
import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
|
|
11
|
+
import { buildEvalSystemPrompt } from "../../eval/prompt-builder.js";
|
|
11
12
|
export async function runEvalRun(skillDir) {
|
|
12
13
|
// Load and validate evals.json
|
|
13
14
|
let evalsFile;
|
|
@@ -39,9 +40,7 @@ export async function runEvalRun(skillDir) {
|
|
|
39
40
|
else {
|
|
40
41
|
console.error(yellow(`Warning: No SKILL.md found at ${skillMdPath} — running evals without skill content`));
|
|
41
42
|
}
|
|
42
|
-
const systemPrompt = skillContent
|
|
43
|
-
? `You are an AI assistant with the following skill loaded. Use this skill's knowledge to answer the user's question.\n\n---\n${skillContent}\n---`
|
|
44
|
-
: "You are an AI assistant. Answer the user's question.";
|
|
43
|
+
const systemPrompt = buildEvalSystemPrompt(skillContent);
|
|
45
44
|
const client = createLlmClient();
|
|
46
45
|
const model = client.model;
|
|
47
46
|
const total = evalsFile.evals.length;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAEzD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAEzD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAC7E,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAErE,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC/C,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,EAAE,CAAC;YAC9C,IAAI,QAAQ,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;gBACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,QAAQ,mBAAmB,CAAC,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,8CAA8C;IAC9C,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC5B,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACpD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,iCAAiC,WAAW,wCAAwC,CAAC,CAAC,CAAC;IAC9G,CAAC;IAED,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IAEzD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;IAC3B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC;IACrC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,aAAa,KAAK,MAAM,KAAK,aAAa,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;IACrF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAEtE,MAAM,cAAc,GAAoB,EAAE,CAAC;IAC3C
,MAAM,SAAS,GAAe,EAAE,CAAC;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpC,IAAI,CAAC;YACH,6BAA6B;YAC7B,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,KAAK,KAAK,QAAQ,CAAC,IAAI,kBAAkB,CAAC,CAAC,CAAC;YAClF,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YACvE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,YAAY,QAAQ,CAAC,UAAU,CAAC,MAAM,gBAAgB,CAAC,CAAC,CAAC;YAElF,+BAA+B;YAC/B,MAAM,gBAAgB,GAAG,EAAE,CAAC;YAC5B,IAAI,SAAS,GAAG,CAAC,CAAC;YAElB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,SAAS,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBACvE,gBAAgB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAC9B,IAAI,MAAM,CAAC,IAAI;oBAAE,SAAS,EAAE,CAAC;gBAE7B,MAAM,aAAa,GACjB,SAAS,CAAC,IAAI,CAAC,MAAM,GAAG,EAAE;oBACxB,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK;oBACrC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC;gBAErB,SAAS,CAAC,IAAI,CAAC;oBACb,QAAQ,CAAC,IAAI;oBACb,SAAS,CAAC,EAAE;oBACZ,aAAa;oBACb,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC;iBAC1C,CAAC,CAAC;YACL,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAC7C,CAAC,CAAC,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM;gBACxC,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,SAAS,GAAG,SAAS,KAAK,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC;YAC3D,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,SAAS,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,SAAS,CAAC,CAAC,CAAC;YAEpG,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBACnC,aAAa,EAAE,IAAI;gBACnB,SAAS,EAAE,QAAQ;gBACnB,UAAU,EAAE,gBAAgB;aAC7B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC9B,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,OAAO;gBACf,aAAa,EAAG,GAAa,CAAC,OAAO;gBACrC,SAAS
,EAAE,CAAC;gBACZ,UAAU,EAAE,EAAE;aACf,CAAC,CAAC;YAEH,SAAS,CAAC,IAAI,CAAC;gBACb,QAAQ,CAAC,IAAI;gBACb,GAAG;gBACH,GAAG,CAAC,SAAS,GAAI,GAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACpD,MAAM,CAAC,OAAO,CAAC;aAChB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC;IAEvC,kBAAkB;IAClB,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,MAAM,CAAC;IACzE,OAAO,CAAC,GAAG,CACT,KAAK,KAAK,CAAC,GAAG,MAAM,SAAS,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,EAAE,CACrI,CAAC;IAEF,uBAAuB;IACvB,MAAM,SAAS,GAAoB;QACjC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,KAAK;QACL,UAAU,EAAE,SAAS,CAAC,UAAU;QAChC,KAAK,EAAE,cAAc;KACtB,CAAC;IAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,uBAAuB,CAAC,CAAC,CAAC;AAC9E,CAAC"}
|
|
@@ -94,4 +94,110 @@ describe("testActivation", () => {
|
|
|
94
94
|
expect(summary.reliability).toBe(0);
|
|
95
95
|
});
|
|
96
96
|
});
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Auto-classification tests
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
const SKILL_META = {
|
|
101
|
+
name: "slack-messaging",
|
|
102
|
+
tags: ["slack", "messaging", "channels", "threads"],
|
|
103
|
+
};
|
|
104
|
+
describe("testActivation — auto-classification", () => {
|
|
105
|
+
it("auto-classifies prompts using skill name and tags", async () => {
|
|
106
|
+
const client = mockClient([
|
|
107
|
+
// Phase 1: classify "send a message in #general" → related
|
|
108
|
+
JSON.stringify({ related: true }),
|
|
109
|
+
// Phase 1: classify "I built my test" → not related
|
|
110
|
+
JSON.stringify({ related: false }),
|
|
111
|
+
// Phase 2: evaluate "send a message in #general" → activate
|
|
112
|
+
JSON.stringify({ activate: true, confidence: "high", reasoning: "Slack messaging" }),
|
|
113
|
+
// Phase 2: evaluate "I built my test" → no activate
|
|
114
|
+
JSON.stringify({ activate: false, confidence: "high", reasoning: "Software testing" }),
|
|
115
|
+
]);
|
|
116
|
+
const prompts = [
|
|
117
|
+
{ prompt: "send a message in #general", expected: "auto" },
|
|
118
|
+
{ prompt: "I built my test", expected: "auto" },
|
|
119
|
+
];
|
|
120
|
+
const summary = await testActivation("Slack messaging skill desc", prompts, client, undefined, SKILL_META);
|
|
121
|
+
expect(summary.tp).toBe(1);
|
|
122
|
+
expect(summary.tn).toBe(1);
|
|
123
|
+
expect(summary.fp).toBe(0);
|
|
124
|
+
expect(summary.fn).toBe(0);
|
|
125
|
+
expect(summary.autoClassifiedCount).toBe(2);
|
|
126
|
+
expect(summary.results[0].autoClassified).toBe(true);
|
|
127
|
+
expect(summary.results[0].expected).toBe("should_activate");
|
|
128
|
+
expect(summary.results[1].autoClassified).toBe(true);
|
|
129
|
+
expect(summary.results[1].expected).toBe("should_not_activate");
|
|
130
|
+
});
|
|
131
|
+
it("preserves manual labels when mixed with auto", async () => {
|
|
132
|
+
const client = mockClient([
|
|
133
|
+
// Phase 1: only the auto prompt gets classified → not related
|
|
134
|
+
JSON.stringify({ related: false }),
|
|
135
|
+
// Phase 2: evaluate all three
|
|
136
|
+
JSON.stringify({ activate: true, confidence: "high", reasoning: "Yes" }),
|
|
137
|
+
JSON.stringify({ activate: false, confidence: "high", reasoning: "No" }),
|
|
138
|
+
JSON.stringify({ activate: false, confidence: "high", reasoning: "No" }),
|
|
139
|
+
]);
|
|
140
|
+
const prompts = [
|
|
141
|
+
{ prompt: "send a slack message", expected: "should_activate" },
|
|
142
|
+
{ prompt: "write a poem", expected: "should_not_activate" },
|
|
143
|
+
{ prompt: "some unlabeled prompt", expected: "auto" },
|
|
144
|
+
];
|
|
145
|
+
const summary = await testActivation("desc", prompts, client, undefined, SKILL_META);
|
|
146
|
+
expect(summary.autoClassifiedCount).toBe(1);
|
|
147
|
+
expect(summary.results[0].autoClassified).toBe(false);
|
|
148
|
+
expect(summary.results[0].expected).toBe("should_activate");
|
|
149
|
+
expect(summary.results[1].autoClassified).toBe(false);
|
|
150
|
+
expect(summary.results[1].expected).toBe("should_not_activate");
|
|
151
|
+
expect(summary.results[2].autoClassified).toBe(true);
|
|
152
|
+
expect(summary.results[2].expected).toBe("should_not_activate");
|
|
153
|
+
});
|
|
154
|
+
it("falls back to should_activate when auto without meta", async () => {
|
|
155
|
+
const client = mockClient([
|
|
156
|
+
// No Phase 1 call — straight to Phase 2
|
|
157
|
+
JSON.stringify({ activate: true, confidence: "high", reasoning: "Yes" }),
|
|
158
|
+
]);
|
|
159
|
+
const prompts = [
|
|
160
|
+
{ prompt: "some prompt", expected: "auto" },
|
|
161
|
+
];
|
|
162
|
+
// No meta passed
|
|
163
|
+
const summary = await testActivation("desc", prompts, client);
|
|
164
|
+
expect(summary.results[0].expected).toBe("should_activate");
|
|
165
|
+
expect(summary.results[0].autoClassified).toBe(true);
|
|
166
|
+
expect(summary.results[0].classification).toBe("TP");
|
|
167
|
+
});
|
|
168
|
+
it("falls back to should_activate when classification LLM call fails", async () => {
|
|
169
|
+
let callCount = 0;
|
|
170
|
+
const client = {
|
|
171
|
+
model: "test",
|
|
172
|
+
generate: vi.fn(async () => {
|
|
173
|
+
callCount++;
|
|
174
|
+
if (callCount === 1)
|
|
175
|
+
throw new Error("classification failed");
|
|
176
|
+
// Phase 2: normal response
|
|
177
|
+
return {
|
|
178
|
+
text: JSON.stringify({ activate: true, confidence: "high", reasoning: "yes" }),
|
|
179
|
+
durationMs: 100, inputTokens: null, outputTokens: null,
|
|
180
|
+
};
|
|
181
|
+
}),
|
|
182
|
+
};
|
|
183
|
+
const prompts = [
|
|
184
|
+
{ prompt: "some prompt", expected: "auto" },
|
|
185
|
+
];
|
|
186
|
+
const summary = await testActivation("desc", prompts, client, undefined, SKILL_META);
|
|
187
|
+
expect(summary.results[0].expected).toBe("should_activate");
|
|
188
|
+
expect(summary.results[0].autoClassified).toBe(true);
|
|
189
|
+
expect(summary.results[0].classification).toBe("TP");
|
|
190
|
+
});
|
|
191
|
+
it("existing tests with explicit expected still work without meta", async () => {
|
|
192
|
+
const client = mockClient([
|
|
193
|
+
JSON.stringify({ activate: true, confidence: "high", reasoning: "Yes" }),
|
|
194
|
+
JSON.stringify({ activate: false, confidence: "high", reasoning: "No" }),
|
|
195
|
+
]);
|
|
196
|
+
const summary = await testActivation("desc", PROMPTS, client);
|
|
197
|
+
// No auto-classification, backward compatible
|
|
198
|
+
expect(summary.autoClassifiedCount).toBe(0);
|
|
199
|
+
expect(summary.results[0].autoClassified).toBe(false);
|
|
200
|
+
expect(summary.results[1].autoClassified).toBe(false);
|
|
201
|
+
});
|
|
202
|
+
});
|
|
97
203
|
//# sourceMappingURL=activation-tester.test.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"activation-tester.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/activation-tester.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAIzD,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;YACzB,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC;YAClC,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;QAC1E,CAAC,CAAC;KACH,CAAC;AACJ,CAAC;AAED,MAAM,OAAO,GAAuB;IAClC,EAAE,MAAM,EAAE,wBAAwB,EAAE,QAAQ,EAAE,iBAAiB,EAAE;IACjE,EAAE,MAAM,EAAE,sBAAsB,EAAE,QAAQ,EAAE,qBAAqB,EAAE;CACpE,CAAC;AAEF,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;QACxE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,cAAc,EAAE,CAAC;YACjF,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,aAAa,EAAE,CAAC;SAClF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,wBAAwB,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAChF,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YAC1E,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS,EAAE,mBAAmB,EAAE,CAAC;SACtF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,
EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACpC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;YAC5E,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC;SAC9E,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,KAAK,IAAI,EAAE;QAC7C,MAAM,MAAM,GAAc;YACxB,KAAK,EAAE,MAAM;YACb,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;gBACzB,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;YACjC,CAAC,CAAC;SACH,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,8EAA8E;QAC9E,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YACxE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,MAAM,OAAO,GAAU,EAAE,CAAC;QAC1B,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACtE,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,C
AAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,yFAAyF;YACzF,iFAAiF;SAClF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;QACzD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
1
|
+
{"version":3,"file":"activation-tester.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/activation-tester.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAIzD,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;YACzB,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC;YAClC,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;QAC1E,CAAC,CAAC;KACH,CAAC;AACJ,CAAC;AAED,MAAM,OAAO,GAAuB;IAClC,EAAE,MAAM,EAAE,wBAAwB,EAAE,QAAQ,EAAE,iBAAiB,EAAE;IACjE,EAAE,MAAM,EAAE,sBAAsB,EAAE,QAAQ,EAAE,qBAAqB,EAAE;CACpE,CAAC;AAEF,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;QACxE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,cAAc,EAAE,CAAC;YACjF,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,aAAa,EAAE,CAAC;SAClF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,wBAAwB,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAChF,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YAC1E,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,KAAK,EAAE,SAAS,EAAE,mBAAmB,EAAE,CAAC;SACtF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,
EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACpC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;YAC5E,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC;SAC9E,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,KAAK,IAAI,EAAE;QAC7C,MAAM,MAAM,GAAc;YACxB,KAAK,EAAE,MAAM;YACb,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;gBACzB,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;YACjC,CAAC,CAAC;SACH,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,8EAA8E;QAC9E,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YACxE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,MAAM,OAAO,GAAU,EAAE,CAAC;QAC1B,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACtE,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,C
AAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,yFAAyF;YACzF,iFAAiF;SAClF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9D,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;QACzD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,MAAM,UAAU,GAAc;IAC5B,IAAI,EAAE,iBAAiB;IACvB,IAAI,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC;CACpD,CAAC;AAEF,QAAQ,CAAC,sCAAsC,EAAE,GAAG,EAAE;IACpD,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,2DAA2D;YAC3D,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;YACjC,oDAAoD;YACpD,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAClC,4DAA4D;YAC5D,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,iBAAiB,EAAE,CAAC;YACpF,oDAAoD;YACpD,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC;SACvF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAuB;YAClC,EAAE,MAAM,EAAE,4BAA4B,EAAE,QAAQ,EAAE,MAAM,EAAE;YAC1D,EAAE,MAAM,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,EAAE;SAChD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,4BAA4B,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAE3G,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAA
C,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,MAAM,CAAC,OAAO,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAC5D,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,8DAA8D;YAC9D,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAClC,8BAA8B;YAC9B,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YACxE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;YACxE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,MAAM,OAAO,GAAuB;YAClC,EAAE,MAAM,EAAE,sBAAsB,EAAE,QAAQ,EAAE,iBAAiB,EAAE;YAC/D,EAAE,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,qBAAqB,EAAE;YAC3D,EAAE,MAAM,EAAE,uBAAuB,EAAE,QAAQ,EAAE,MAAM,EAAE;SACtD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAErF,MAAM,CAAC,OAAO,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAC5D,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAChE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACx
B,wCAAwC;YACxC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,MAAM,OAAO,GAAuB;YAClC,EAAE,MAAM,EAAE,aAAa,EAAE,QAAQ,EAAE,MAAM,EAAE;SAC5C,CAAC;QAEF,iBAAiB;QACjB,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAE9D,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAC5D,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kEAAkE,EAAE,KAAK,IAAI,EAAE;QAChF,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAc;YACxB,KAAK,EAAE,MAAM;YACb,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;gBACzB,SAAS,EAAE,CAAC;gBACZ,IAAI,SAAS,KAAK,CAAC;oBAAE,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;gBAC9D,2BAA2B;gBAC3B,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;oBAC9E,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI;iBACvD,CAAC;YACJ,CAAC,CAAC;SACH,CAAC;QAEF,MAAM,OAAO,GAAuB;YAClC,EAAE,MAAM,EAAE,aAAa,EAAE,QAAQ,EAAE,MAAM,EAAE;SAC5C,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAErF,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAC5D,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+DAA+D,EAAE,KAAK,IAAI,EAAE;QAC7E,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;YACxE,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAE9D,8CAA8C;QAC9C,MAAM,CAAC,OAAO,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,OAAO,CAAC,OAA
O,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtD,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { buildEvalInitPrompt, parseGeneratedEvals, } from "../prompt-builder.js";
|
|
2
|
+
import { buildEvalInitPrompt, buildEvalSystemPrompt, buildBaselineSystemPrompt, parseGeneratedEvals, } from "../prompt-builder.js";
|
|
3
3
|
// ---------------------------------------------------------------------------
|
|
4
4
|
// buildEvalInitPrompt
|
|
5
5
|
// ---------------------------------------------------------------------------
|
|
@@ -26,6 +26,58 @@ describe("buildEvalInitPrompt", () => {
|
|
|
26
26
|
});
|
|
27
27
|
});
|
|
28
28
|
// ---------------------------------------------------------------------------
|
|
29
|
+
// buildEvalSystemPrompt (MCP-aware)
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
describe("buildEvalSystemPrompt", () => {
|
|
32
|
+
it("returns standard prompt for non-MCP skills", () => {
|
|
33
|
+
const result = buildEvalSystemPrompt("# My Skill\nDoes text processing.");
|
|
34
|
+
expect(result).toContain("You are an AI assistant enhanced with the following skill");
|
|
35
|
+
expect(result).toContain("# My Skill");
|
|
36
|
+
expect(result).not.toContain("Evaluation Mode");
|
|
37
|
+
});
|
|
38
|
+
it("returns simulation prompt for Slack MCP skill", () => {
|
|
39
|
+
const slackSkill = "Use slack_send_message and slack_read_channel to interact with Slack.";
|
|
40
|
+
const result = buildEvalSystemPrompt(slackSkill);
|
|
41
|
+
expect(result).toContain("Evaluation Mode");
|
|
42
|
+
expect(result).toContain("slack_send_message");
|
|
43
|
+
expect(result).toContain("slack_read_channel");
|
|
44
|
+
expect(result).toContain("Slack");
|
|
45
|
+
expect(result).toContain("Do NOT");
|
|
46
|
+
});
|
|
47
|
+
it("returns simulation prompt for GitHub MCP skill", () => {
|
|
48
|
+
const githubSkill = "Use github_create_pr to create pull requests.";
|
|
49
|
+
const result = buildEvalSystemPrompt(githubSkill);
|
|
50
|
+
expect(result).toContain("Evaluation Mode");
|
|
51
|
+
expect(result).toContain("github_create_pr");
|
|
52
|
+
expect(result).toContain("GitHub");
|
|
53
|
+
});
|
|
54
|
+
it("lists multiple MCP servers when skill uses several", () => {
|
|
55
|
+
const multiSkill = "Use slack_send_message for chat and github_create_pr for PRs and drive_list_files for docs.";
|
|
56
|
+
const result = buildEvalSystemPrompt(multiSkill);
|
|
57
|
+
expect(result).toContain("Slack");
|
|
58
|
+
expect(result).toContain("GitHub");
|
|
59
|
+
expect(result).toContain("Google Workspace");
|
|
60
|
+
});
|
|
61
|
+
it("returns baseline prompt for empty content", () => {
|
|
62
|
+
expect(buildEvalSystemPrompt("")).toBe("You are a helpful AI assistant.");
|
|
63
|
+
});
|
|
64
|
+
it("includes simulation instructions that prevent tool-unavailable responses", () => {
|
|
65
|
+
const skill = "Use slack_send_message to send messages.";
|
|
66
|
+
const result = buildEvalSystemPrompt(skill);
|
|
67
|
+
expect(result).toContain("Do NOT");
|
|
68
|
+
expect(result).toContain("tools are unavailable");
|
|
69
|
+
expect(result).toContain("Walk through each tool call step by step");
|
|
70
|
+
});
|
|
71
|
+
});
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// buildBaselineSystemPrompt
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
describe("buildBaselineSystemPrompt", () => {
|
|
76
|
+
it("returns baseline prompt", () => {
|
|
77
|
+
expect(buildBaselineSystemPrompt()).toBe("You are a helpful AI assistant.");
|
|
78
|
+
});
|
|
79
|
+
});
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
29
81
|
// parseGeneratedEvals
|
|
30
82
|
// ---------------------------------------------------------------------------
|
|
31
83
|
describe("parseGeneratedEvals", () => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt-builder.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/prompt-builder.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sBAAsB,CAAC;AAE9B,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,MAAM,YAAY,GAAG,6CAA6C,CAAC;IAEnE,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,wBAAwB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,GAAG,GAAG;;;;;;;;;;;;;;;;;;;;uBAoBO,CAAC;QAEpB,MAAM,MAAM,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,6CAA6C,CAAC;QAC1D,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,gCAAgC,CAAC;QAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EA
AE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,wCAAwC,CAAC;QACrD,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,sBAAsB;IAC1E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
1
|
+
{"version":3,"file":"prompt-builder.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/prompt-builder.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,mBAAmB,EACnB,qBAAqB,EACrB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,sBAAsB,CAAC;AAE9B,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,MAAM,YAAY,GAAG,6CAA6C,CAAC;IAEnE,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,wBAAwB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,oCAAoC;AACpC,8EAA8E;AAE9E,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,MAAM,GAAG,qBAAqB,CAAC,mCAAmC,CAAC,CAAC;QAC1E,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,2DAA2D,CAAC,CAAC;QACtF,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,UAAU,GAAG,uEAAuE,CAAC;QAC3F,MAAM,MAAM,GAAG,qBAAqB,CAAC,UAAU,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,
CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,WAAW,GAAG,+CAA+C,CAAC;QACpE,MAAM,MAAM,GAAG,qBAAqB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,GAAG,EAAE;QAC5D,MAAM,UAAU,GAAG,6FAA6F,CAAC;QACjH,MAAM,MAAM,GAAG,qBAAqB,CAAC,UAAU,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2CAA2C,EAAE,GAAG,EAAE;QACnD,MAAM,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IAC5E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0EAA0E,EAAE,GAAG,EAAE;QAClF,MAAM,KAAK,GAAG,0CAA0C,CAAC;QACzD,MAAM,MAAM,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,uBAAuB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,0CAA0C,CAAC,CAAC;IACvE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,yBAAyB,EAAE,GAAG,EAAE;QACjC,MAAM,CAAC,yBAAyB,EAAE,CAAC,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IAC9E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,GAAG,GAAG;;;;;;;;;;;;;;;;;;;;uBAoBO,CAAC;QAEpB,MAAM,MAAM,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,6CAA6C,CAAC;QAC1D,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CA
A0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,gCAAgC,CAAC;QAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,wCAAwC,CAAC;QACrD,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,sBAAsB;IAC1E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import type { LlmClient } from "./llm.js";
|
|
2
|
+
export interface SkillMeta {
|
|
3
|
+
name: string;
|
|
4
|
+
tags: string[];
|
|
5
|
+
}
|
|
2
6
|
export interface ActivationPrompt {
|
|
3
7
|
prompt: string;
|
|
4
|
-
expected: "should_activate" | "should_not_activate";
|
|
8
|
+
expected: "should_activate" | "should_not_activate" | "auto";
|
|
5
9
|
}
|
|
6
10
|
export interface ActivationResult {
|
|
7
11
|
prompt: string;
|
|
@@ -10,6 +14,7 @@ export interface ActivationResult {
|
|
|
10
14
|
confidence: "high" | "medium" | "low";
|
|
11
15
|
reasoning: string;
|
|
12
16
|
classification: "TP" | "TN" | "FP" | "FN";
|
|
17
|
+
autoClassified?: boolean;
|
|
13
18
|
}
|
|
14
19
|
export interface ActivationSummary {
|
|
15
20
|
results: ActivationResult[];
|
|
@@ -21,5 +26,6 @@ export interface ActivationSummary {
|
|
|
21
26
|
tn: number;
|
|
22
27
|
fp: number;
|
|
23
28
|
fn: number;
|
|
29
|
+
autoClassifiedCount: number;
|
|
24
30
|
}
|
|
25
|
-
export declare function testActivation(skillDescription: string, prompts: ActivationPrompt[], client: LlmClient, onResult?: (result: ActivationResult) => void): Promise<ActivationSummary>;
|
|
31
|
+
export declare function testActivation(skillDescription: string, prompts: ActivationPrompt[], client: LlmClient, onResult?: (result: ActivationResult) => void, meta?: SkillMeta): Promise<ActivationSummary>;
|
|
@@ -14,9 +14,52 @@ Respond with ONLY valid JSON:
|
|
|
14
14
|
"confidence": "high" | "medium" | "low",
|
|
15
15
|
"reasoning": "brief explanation"
|
|
16
16
|
}`;
|
|
17
|
-
|
|
18
|
-
|
|
17
|
+
const CLASSIFY_SYSTEM_PROMPT = `You decide if a user prompt is related to a specific AI skill.
|
|
18
|
+
Given the skill name and tags, determine if the user prompt is something this skill should handle.
|
|
19
|
+
Respond with ONLY valid JSON:
|
|
20
|
+
{"related": true/false}`;
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Phase 1: Auto-classify expected behavior from skill name + tags
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
async function classifyExpectation(meta, prompt, client) {
|
|
25
|
+
try {
|
|
26
|
+
const userPrompt = `Skill: ${meta.name}\nTags: ${meta.tags.join(", ")}\n\nUser prompt: ${prompt}`;
|
|
27
|
+
const { text } = await client.generate(CLASSIFY_SYSTEM_PROMPT, userPrompt);
|
|
28
|
+
const jsonMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/) || [null, text];
|
|
29
|
+
const json = JSON.parse(jsonMatch[1].trim());
|
|
30
|
+
return json.related ? "should_activate" : "should_not_activate";
|
|
31
|
+
}
|
|
32
|
+
catch {
|
|
33
|
+
return "should_activate";
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
async function resolvePrompts(prompts, client, meta) {
|
|
37
|
+
const resolved = [];
|
|
19
38
|
for (const p of prompts) {
|
|
39
|
+
if (p.expected === "auto") {
|
|
40
|
+
if (meta) {
|
|
41
|
+
const expected = await classifyExpectation(meta, p.prompt, client);
|
|
42
|
+
resolved.push({ prompt: p.prompt, expected, autoClassified: true });
|
|
43
|
+
}
|
|
44
|
+
else {
|
|
45
|
+
resolved.push({ prompt: p.prompt, expected: "should_activate", autoClassified: true });
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
resolved.push({ prompt: p.prompt, expected: p.expected, autoClassified: false });
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
return resolved;
|
|
53
|
+
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Phase 2: Evaluate activation against skill description
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
export async function testActivation(skillDescription, prompts, client, onResult, meta) {
|
|
58
|
+
// Phase 1: resolve auto expectations
|
|
59
|
+
const resolved = await resolvePrompts(prompts, client, meta);
|
|
60
|
+
// Phase 2: evaluate each prompt against description
|
|
61
|
+
const results = [];
|
|
62
|
+
for (const p of resolved) {
|
|
20
63
|
const userPrompt = `## Skill Description
|
|
21
64
|
${skillDescription}
|
|
22
65
|
|
|
@@ -40,6 +83,7 @@ Would this user prompt trigger this skill?`;
|
|
|
40
83
|
confidence,
|
|
41
84
|
reasoning: String(json.reasoning || ""),
|
|
42
85
|
classification,
|
|
86
|
+
autoClassified: p.autoClassified,
|
|
43
87
|
};
|
|
44
88
|
results.push(result);
|
|
45
89
|
onResult?.(result);
|
|
@@ -52,6 +96,7 @@ Would this user prompt trigger this skill?`;
|
|
|
52
96
|
confidence: "low",
|
|
53
97
|
reasoning: `Error: ${err instanceof Error ? err.message : String(err)}`,
|
|
54
98
|
classification: p.expected === "should_activate" ? "FN" : "TN",
|
|
99
|
+
autoClassified: p.autoClassified,
|
|
55
100
|
};
|
|
56
101
|
results.push(result);
|
|
57
102
|
onResult?.(result);
|
|
@@ -84,6 +129,7 @@ function computeSummary(results) {
|
|
|
84
129
|
tn,
|
|
85
130
|
fp,
|
|
86
131
|
fn,
|
|
132
|
+
autoClassifiedCount: results.filter((r) => r.autoClassified).length,
|
|
87
133
|
};
|
|
88
134
|
}
|
|
89
135
|
//# sourceMappingURL=activation-tester.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"activation-tester.js","sourceRoot":"","sources":["../../src/eval/activation-tester.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4EAA4E;AAC5E,8EAA8E;
|
|
1
|
+
{"version":3,"file":"activation-tester.js","sourceRoot":"","sources":["../../src/eval/activation-tester.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4EAA4E;AAC5E,8EAA8E;AAqC9E,MAAM,wBAAwB,GAAG;;;;;;;;;;;;EAY/B,CAAC;AAEH,MAAM,sBAAsB,GAAG;;;wBAGP,CAAC;AAEzB,8EAA8E;AAC9E,kEAAkE;AAClE,8EAA8E;AAE9E,KAAK,UAAU,mBAAmB,CAChC,IAAe,EACf,MAAc,EACd,MAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,UAAU,IAAI,CAAC,IAAI,WAAW,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,oBAAoB,MAAM,EAAE,CAAC;QAClG,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,sBAAsB,EAAE,UAAU,CAAC,CAAC;QAC3E,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAC7E,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC,CAAC;QAC9C,OAAO,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,qBAAqB,CAAC;IAClE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,iBAAiB,CAAC;IAC3B,CAAC;AACH,CAAC;AAYD,KAAK,UAAU,cAAc,CAC3B,OAA2B,EAC3B,MAAiB,EACjB,IAAgB;IAEhB,MAAM,QAAQ,GAAqB,EAAE,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,QAAQ,KAAK,MAAM,EAAE,CAAC;YAC1B,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;gBACnE,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC;YACtE,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,iBAAiB,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC;YACzF,CAAC;QACH,CAAC;aAAM,CAAC;YACN,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,cAAc,EAAE,KAAK,EAAE,CAAC,CAAC;QACnF,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,8EAA8E;AAC9E,yDAAyD;AACzD,8EAA8E;AAE9E,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,gBAAwB,EACxB,OAA2B,EAC3B,MAAiB,EACjB,QAA6C,EAC7C,IAAgB;IAEhB,qCAAqC;IACrC,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,CAAC,CAAC;IAE7D,oDAAoD;IACpD,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,MAAM,UAAU,GAAG;EACrB,gBAAgB;;;EAGhB,CAAC,CAAC,MAAM;;2CAEiC,CAAC;QAExC,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,E
AAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,wBAAwB,EAAE,UAAU,CAAC,CAAC;YACvF,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;YACrF,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC,CAAC;YAE9C,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;YACjC,MAAM,UAAU,GAAG,CAAC,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC;gBACpE,CAAC,CAAE,IAAI,CAAC,UAAwC;gBAChD,CAAC,CAAC,KAAK,CAAC;YAEV,MAAM,cAAc,GAAG,cAAc,CAAC,CAAC,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;YAE5D,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ;gBACR,UAAU;gBACV,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;gBACvC,cAAc;gBACd,cAAc,EAAE,CAAC,CAAC,cAAc;aACjC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ,EAAE,KAAK;gBACf,UAAU,EAAE,KAAK;gBACjB,SAAS,EAAE,UAAU,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;gBACvE,cAAc,EAAE,CAAC,CAAC,QAAQ,KAAK,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI;gBAC9D,cAAc,EAAE,CAAC,CAAC,cAAc;aACjC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC,OAAO,CAAC,CAAC;AACjC,CAAC;AAED,SAAS,cAAc,CACrB,QAAmD,EACnD,MAAe;IAEf,IAAI,QAAQ,KAAK,iBAAiB,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IAC1D,IAAI,QAAQ,KAAK,iBAAiB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC3D,IAAI,QAAQ,KAAK,qBAAqB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC/D,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CAAC,OAA2B;IACjD,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,
CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,OAAO;QACL,OAAO;QACP,SAAS,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,WAAW,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9C,KAAK;QACL,EAAE;QACF,EAAE;QACF,EAAE;QACF,EAAE;QACF,mBAAmB,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,MAAM;KACpE,CAAC;AACJ,CAAC"}
|
|
@@ -5,7 +5,7 @@ export interface HistorySummary {
|
|
|
5
5
|
model: string;
|
|
6
6
|
skillName: string;
|
|
7
7
|
passRate: number;
|
|
8
|
-
type: "benchmark" | "comparison" | "baseline";
|
|
8
|
+
type: "benchmark" | "comparison" | "baseline" | "model-compare" | "improve";
|
|
9
9
|
caseCount: number;
|
|
10
10
|
totalDurationMs: number;
|
|
11
11
|
totalTokens: number | null;
|
|
@@ -14,14 +14,14 @@ export interface HistorySummary {
|
|
|
14
14
|
}
|
|
15
15
|
export interface HistoryFilter {
|
|
16
16
|
model?: string;
|
|
17
|
-
type?: "benchmark" | "comparison" | "baseline";
|
|
17
|
+
type?: "benchmark" | "comparison" | "baseline" | "model-compare" | "improve";
|
|
18
18
|
from?: string;
|
|
19
19
|
to?: string;
|
|
20
20
|
}
|
|
21
21
|
export interface CaseHistoryEntry {
|
|
22
22
|
timestamp: string;
|
|
23
23
|
model: string;
|
|
24
|
-
type: "benchmark" | "comparison" | "baseline";
|
|
24
|
+
type: "benchmark" | "comparison" | "baseline" | "model-compare" | "improve";
|
|
25
25
|
provider?: string;
|
|
26
26
|
pass_rate: number;
|
|
27
27
|
durationMs?: number;
|
|
@@ -39,7 +39,7 @@ export interface RegressionEntry {
|
|
|
39
39
|
change: "regression" | "improvement";
|
|
40
40
|
}
|
|
41
41
|
export declare function writeHistoryEntry(skillDir: string, result: BenchmarkResult & {
|
|
42
|
-
type?: "benchmark" | "comparison" | "baseline";
|
|
42
|
+
type?: "benchmark" | "comparison" | "baseline" | "model-compare" | "improve";
|
|
43
43
|
}): Promise<string>;
|
|
44
44
|
export declare function deleteHistoryEntry(skillDir: string, timestamp: string): Promise<boolean>;
|
|
45
45
|
export declare function listHistory(skillDir: string, filter?: HistoryFilter): Promise<HistorySummary[]>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AA6ChD,SAAS,mBAAmB,CAAC,GAAW;IACtC,OAAO,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,0CAA0C;IAC1C,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAC3C,yDAAyD;IACzD,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,WAAW,CAAC,CAAC;AAC7D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAA4E;IAE5E,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAE3D,uDAAuD;IACvD,MAAM,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,MAAsB;IAEtB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IAE1E,2EAA2E;IAC3E,IAAI,MAAM,EAAE,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,CAAC;QAC/B,MAAM,QAA
Q,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC5E,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtE,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACjC,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YACpC,IAAI,QAAQ,IAAI,EAAE,GAAG,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAC5C,IAAI,MAAM,IAAI,EAAE,GAAG,MAAM;gBAAE,OAAO,KAAK,CAAC;YACxC,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,MAAM,SAAS,GAAI,IAAI,CAAC,IAA+B,IAAI,WAAW,CAAC;YAEvE,gCAAgC;YAChC,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAC3D,IAAI,MAAM,EAAE,IAAI,IAAI,SAAS,KAAK,MAAM,CAAC,IAAI;gBAAE,SAAS;YAExD,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACpF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAC3D,CAAC,CACF,CAAC;YACF,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAC3D,MAAM,WAAW,GAAG,SAAS;gBAC3B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACrD,CAAC,CAAC,IAAI,CAAC;YAET,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,SAAS,EAAE,IAAI,CAAC,UAAU;gBAC1B,QAAQ,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACtE,IAAI,EAAE,SAAS;gBACf,SAAS,EAAE,
IAAI,CAAC,KAAK,CAAC,MAAM;gBAC5B,eAAe;gBACf,WAAW;gBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;aACtB,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAc,EACd,MAA2B;IAE3B,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,OAAO,GAAuB,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC;QAC7E,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAE3D,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC;YAClE,IAAI,CAAC,YAAY;gBAAE,SAAS;YAE5B,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAG,IAAI,CAAC,IAAiC,IAAI,WAAW;gBAC5D,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,SAAS,EAAE,YAAY,CAAC,SAAS;gBACjC,UAAU,EAAE,YAAY,CAAC,UAAU;gBACnC,MAAM,EAAE,YAAY,CAAC,MAAM;gBAC3B,WAAW,EAAE,YAAY,CAAC,WAAW;gBACrC,YAAY,EAAE,YAAY,CAAC,YAAY;gBACvC,UAAU,EAAE,YAAY,CAAC,UAAU;aACpC,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAyBD,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,C
AAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,cAAc,EAAE,EAAE,EAAE,UAAU,EAAE,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;IAC/E,CAAC;IAED,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElE,qDAAqD;IACrD,MAAM,YAAY,GAAG,IAAI,GAAG,EAAyG,CAAC;IACtI,qBAAqB;IACrB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAA4E,CAAC;IACrG,eAAe;IACf,MAAM,WAAW,GAA+B,EAAE,CAAC;IAEnD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YACxE,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,IAAI,WAAW,CAAC;YAC3C,IAAI,SAAS,KAAK,WAAW;gBAAE,SAAS,CAAC,sCAAsC;YAE/E,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACvG,MAAM,QAAQ,GAAG,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9E,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAEhF,QAAQ;YACR,WAAW,CAAC,IAAI,CAAC;gBACf,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ;gBACR,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;YAEH,cAAc;YACd,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,eAAe,EAAE,CAAC,EAAE,CAAC;YAC/F,QAAQ,CAAC,IAAI,EAAE,CAAC;YAChB,QAAQ,CAAC,aAAa,IAAI,QAAQ,CAAC;YACnC,QAAQ,CAAC,eAAe,IAAI,eAAe,CAAC;YAC5C,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;YAEnC,sBAAsB;YACtB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC3B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;oBAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAA
C,EAAE,EAAE,CAAC;oBACnC,MAAM,IAAI,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC;oBAChI,IAAI,CAAC,KAAK,EAAE,CAAC;oBACb,IAAI,CAAC,CAAC,IAAI;wBAAE,IAAI,CAAC,MAAM,EAAE,CAAC;oBAC1B,mBAAmB;oBACnB,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;oBACnB,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,CAAC;SACrD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,QAAQ,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9C,SAAS,EAAE,CAAC,CAAC,KAAK;QAClB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;KACrB,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,yBAAyB;IAErE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;SAC9C,GAAG,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACpB,KAAK;QACL,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,WAAW,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACtD,aAAa,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;KAC3D,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC,CAAC;IAEjD,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM;QAC7B,cAAc;QACd,UAAU;QACV,WAAW;KACZ,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,OAAwB,EACxB,QAAyB;IAEzB,MAAM,WAAW,GAAsB,EAAE,CAAC;IAE1C,sEAAsE;IACtE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC3C,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAA
I,OAAO,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,IAAI,KAAK,SAAS;gBAAE,SAAS,CAAC,sBAAsB;YAExD,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpB,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,IAAI;oBACpB,aAAa,EAAE,KAAK;oBACpB,MAAM,EAAE,YAAY;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,KAAK;oBACrB,aAAa,EAAE,IAAI;oBACnB,MAAM,EAAE,aAAa;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
|
|
1
|
+
{"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AA6ChD,SAAS,mBAAmB,CAAC,GAAW;IACtC,OAAO,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,0CAA0C;IAC1C,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAC3C,yDAAyD;IACzD,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,WAAW,CAAC,CAAC;AAC7D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAA0G;IAE1G,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAE3D,uDAAuD;IACvD,MAAM,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,MAAsB;IAEtB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IAE1E,2EAA2E;IAC3E,IAAI,MAAM,EAAE,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,CAAC;QAC/B,MAAM,QAA
Q,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC5E,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtE,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACjC,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YACpC,IAAI,QAAQ,IAAI,EAAE,GAAG,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAC5C,IAAI,MAAM,IAAI,EAAE,GAAG,MAAM;gBAAE,OAAO,KAAK,CAAC;YACxC,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,MAAM,SAAS,GAAI,IAAI,CAAC,IAA+B,IAAI,WAAW,CAAC;YAEvE,gCAAgC;YAChC,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAC3D,IAAI,MAAM,EAAE,IAAI,IAAI,SAAS,KAAK,MAAM,CAAC,IAAI;gBAAE,SAAS;YAExD,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACpF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAC3D,CAAC,CACF,CAAC;YACF,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAC3D,MAAM,WAAW,GAAG,SAAS;gBAC3B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACrD,CAAC,CAAC,IAAI,CAAC;YAET,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,SAAS,EAAE,IAAI,CAAC,UAAU;gBAC1B,QAAQ,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACtE,IAAI,EAAE,SAAS;gBACf,SAAS,EAAE,
IAAI,CAAC,KAAK,CAAC,MAAM;gBAC5B,eAAe;gBACf,WAAW;gBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;aACtB,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAc,EACd,MAA2B;IAE3B,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,OAAO,GAAuB,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC;QAC7E,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAE3D,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC;YAClE,IAAI,CAAC,YAAY;gBAAE,SAAS;YAE5B,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAG,IAAI,CAAC,IAAiC,IAAI,WAAW;gBAC5D,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,SAAS,EAAE,YAAY,CAAC,SAAS;gBACjC,UAAU,EAAE,YAAY,CAAC,UAAU;gBACnC,MAAM,EAAE,YAAY,CAAC,MAAM;gBAC3B,WAAW,EAAE,YAAY,CAAC,WAAW;gBACrC,YAAY,EAAE,YAAY,CAAC,YAAY;gBACvC,UAAU,EAAE,YAAY,CAAC,UAAU;aACpC,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAyBD,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,C
AAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,cAAc,EAAE,EAAE,EAAE,UAAU,EAAE,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;IAC/E,CAAC;IAED,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElE,qDAAqD;IACrD,MAAM,YAAY,GAAG,IAAI,GAAG,EAAyG,CAAC;IACtI,qBAAqB;IACrB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAA4E,CAAC;IACrG,eAAe;IACf,MAAM,WAAW,GAA+B,EAAE,CAAC;IAEnD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YACxE,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,IAAI,WAAW,CAAC;YAC3C,IAAI,SAAS,KAAK,WAAW;gBAAE,SAAS,CAAC,sCAAsC;YAE/E,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACvG,MAAM,QAAQ,GAAG,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9E,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAEhF,QAAQ;YACR,WAAW,CAAC,IAAI,CAAC;gBACf,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ;gBACR,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;YAEH,cAAc;YACd,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,eAAe,EAAE,CAAC,EAAE,CAAC;YAC/F,QAAQ,CAAC,IAAI,EAAE,CAAC;YAChB,QAAQ,CAAC,aAAa,IAAI,QAAQ,CAAC;YACnC,QAAQ,CAAC,eAAe,IAAI,eAAe,CAAC;YAC5C,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;YAEnC,sBAAsB;YACtB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC3B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;oBAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAA
C,EAAE,EAAE,CAAC;oBACnC,MAAM,IAAI,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC;oBAChI,IAAI,CAAC,KAAK,EAAE,CAAC;oBACb,IAAI,CAAC,CAAC,IAAI;wBAAE,IAAI,CAAC,MAAM,EAAE,CAAC;oBAC1B,mBAAmB;oBACnB,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;oBACnB,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,CAAC;SACrD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,QAAQ,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9C,SAAS,EAAE,CAAC,CAAC,KAAK;QAClB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;KACrB,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,yBAAyB;IAErE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;SAC9C,GAAG,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACpB,KAAK;QACL,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,WAAW,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACtD,aAAa,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;KAC3D,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC,CAAC;IAEjD,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM;QAC7B,cAAc;QACd,UAAU;QACV,WAAW;KACZ,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,OAAwB,EACxB,QAAyB;IAEzB,MAAM,WAAW,GAAsB,EAAE,CAAC;IAE1C,sEAAsE;IACtE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC3C,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAA
I,OAAO,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,IAAI,KAAK,SAAS;gBAAE,SAAS,CAAC,sBAAsB;YAExD,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpB,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,IAAI;oBACpB,aAAa,EAAE,KAAK;oBACpB,MAAM,EAAE,YAAY;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,KAAK;oBACrB,aAAa,EAAE,IAAI;oBACnB,MAAM,EAAE,aAAa;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
|
package/dist/eval/benchmark.d.ts
CHANGED
|
@@ -39,7 +39,7 @@ export interface BenchmarkResult {
|
|
|
39
39
|
skill_name: string;
|
|
40
40
|
cases: BenchmarkCase[];
|
|
41
41
|
overall_pass_rate?: number;
|
|
42
|
-
type?: "benchmark" | "comparison" | "baseline";
|
|
42
|
+
type?: "benchmark" | "comparison" | "baseline" | "model-compare" | "improve";
|
|
43
43
|
provider?: string;
|
|
44
44
|
totalDurationMs?: number;
|
|
45
45
|
totalInputTokens?: number | null;
|
|
@@ -53,6 +53,11 @@ export interface BenchmarkResult {
|
|
|
53
53
|
baselineRubricAvg: number;
|
|
54
54
|
delta: number;
|
|
55
55
|
};
|
|
56
|
+
improve?: {
|
|
57
|
+
original: string;
|
|
58
|
+
improved: string;
|
|
59
|
+
reasoning: string;
|
|
60
|
+
};
|
|
56
61
|
}
|
|
57
62
|
export declare function writeBenchmark(skillDir: string, result: BenchmarkResult): Promise<void>;
|
|
58
63
|
export declare function readBenchmark(skillDir: string): Promise<BenchmarkResult | null>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAmEjC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAClD,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC3D,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|