@su-record/vibe 2.7.10 → 2.7.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +37 -37
- package/CLAUDE.md +126 -222
- package/LICENSE +21 -21
- package/README.md +580 -580
- package/agents/architect-low.md +41 -41
- package/agents/architect-medium.md +59 -59
- package/agents/architect.md +80 -80
- package/agents/build-error-resolver.md +115 -115
- package/agents/compounder.md +261 -261
- package/agents/diagrammer.md +178 -178
- package/agents/docs/api-documenter.md +99 -99
- package/agents/docs/changelog-writer.md +93 -93
- package/agents/e2e-tester.md +266 -266
- package/agents/explorer-low.md +42 -42
- package/agents/explorer-medium.md +59 -59
- package/agents/explorer.md +48 -48
- package/agents/implementer-low.md +43 -43
- package/agents/implementer-medium.md +52 -52
- package/agents/implementer.md +54 -54
- package/agents/junior-mentor.md +141 -141
- package/agents/planning/requirements-analyst.md +84 -84
- package/agents/planning/ux-advisor.md +83 -83
- package/agents/qa/acceptance-tester.md +86 -86
- package/agents/qa/edge-case-finder.md +93 -93
- package/agents/refactor-cleaner.md +143 -143
- package/agents/research/best-practices-agent.md +199 -199
- package/agents/research/codebase-patterns-agent.md +157 -157
- package/agents/research/framework-docs-agent.md +188 -188
- package/agents/research/security-advisory-agent.md +213 -213
- package/agents/review/architecture-reviewer.md +107 -107
- package/agents/review/complexity-reviewer.md +116 -116
- package/agents/review/data-integrity-reviewer.md +88 -88
- package/agents/review/git-history-reviewer.md +103 -103
- package/agents/review/performance-reviewer.md +86 -86
- package/agents/review/python-reviewer.md +150 -150
- package/agents/review/rails-reviewer.md +139 -139
- package/agents/review/react-reviewer.md +144 -144
- package/agents/review/security-reviewer.md +80 -80
- package/agents/review/simplicity-reviewer.md +140 -140
- package/agents/review/test-coverage-reviewer.md +116 -116
- package/agents/review/typescript-reviewer.md +127 -127
- package/agents/searcher.md +54 -54
- package/agents/simplifier.md +120 -120
- package/agents/tester.md +49 -49
- package/agents/ui/ui-a11y-auditor.md +93 -93
- package/agents/ui/ui-antipattern-detector.md +94 -94
- package/agents/ui/ui-dataviz-advisor.md +69 -69
- package/agents/ui/ui-design-system-gen.md +57 -57
- package/agents/ui/ui-industry-analyzer.md +49 -49
- package/agents/ui/ui-layout-architect.md +65 -65
- package/agents/ui/ui-stack-implementer.md +68 -68
- package/agents/ui/ux-compliance-reviewer.md +81 -81
- package/agents/ui-previewer.md +260 -260
- package/commands/vibe.run.md +83 -0
- package/commands/vibe.spec.review.md +558 -558
- package/commands/vibe.utils.md +413 -413
- package/commands/vibe.voice.md +79 -79
- package/dist/cli/auth.d.ts +1 -1
- package/dist/cli/auth.d.ts.map +1 -1
- package/dist/cli/auth.js +15 -7
- package/dist/cli/auth.js.map +1 -1
- package/dist/cli/collaborator.js +52 -52
- package/dist/cli/commands/evolution.js +12 -12
- package/dist/cli/commands/index.d.ts +1 -0
- package/dist/cli/commands/index.d.ts.map +1 -1
- package/dist/cli/commands/index.js +1 -0
- package/dist/cli/commands/index.js.map +1 -1
- package/dist/cli/commands/info.d.ts.map +1 -1
- package/dist/cli/commands/info.js +62 -56
- package/dist/cli/commands/info.js.map +1 -1
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +9 -6
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/remove.js +14 -14
- package/dist/cli/commands/sentinel.js +27 -27
- package/dist/cli/commands/skills.d.ts +13 -0
- package/dist/cli/commands/skills.d.ts.map +1 -0
- package/dist/cli/commands/skills.js +83 -0
- package/dist/cli/commands/skills.js.map +1 -0
- package/dist/cli/commands/slack.js +10 -10
- package/dist/cli/commands/telegram.js +12 -12
- package/dist/cli/commands/update.d.ts.map +1 -1
- package/dist/cli/commands/update.js +3 -0
- package/dist/cli/commands/update.js.map +1 -1
- package/dist/cli/detect.js +32 -32
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +64 -47
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/llm/claude-commands.js +16 -16
- package/dist/cli/llm/config.js +18 -18
- package/dist/cli/llm/gemini-commands.js +47 -47
- package/dist/cli/llm/gpt-commands.js +19 -19
- package/dist/cli/llm/help.js +21 -21
- package/dist/cli/postinstall/constants.d.ts +8 -0
- package/dist/cli/postinstall/constants.d.ts.map +1 -1
- package/dist/cli/postinstall/constants.js +33 -0
- package/dist/cli/postinstall/constants.js.map +1 -1
- package/dist/cli/postinstall/cursor-agents.js +32 -32
- package/dist/cli/postinstall/cursor-rules.js +83 -83
- package/dist/cli/postinstall/cursor-skills.js +743 -743
- package/dist/cli/postinstall/index.d.ts +1 -1
- package/dist/cli/postinstall/index.d.ts.map +1 -1
- package/dist/cli/postinstall/index.js +1 -1
- package/dist/cli/postinstall/index.js.map +1 -1
- package/dist/cli/setup/ProjectSetup.d.ts.map +1 -1
- package/dist/cli/setup/ProjectSetup.js +5 -0
- package/dist/cli/setup/ProjectSetup.js.map +1 -1
- package/dist/cli/setup/Provisioner.js +42 -42
- package/dist/cli/types.d.ts +1 -0
- package/dist/cli/types.d.ts.map +1 -1
- package/dist/infra/lib/DeepInit.js +24 -24
- package/dist/infra/lib/IterationTracker.js +11 -11
- package/dist/infra/lib/PythonParser.js +108 -108
- package/dist/infra/lib/ReviewRace.js +96 -96
- package/dist/infra/lib/SkillFrontmatter.js +28 -28
- package/dist/infra/lib/SkillQualityGate.js +9 -9
- package/dist/infra/lib/SkillRepository.js +159 -159
- package/dist/infra/lib/UltraQA.js +99 -99
- package/dist/infra/lib/autonomy/AuditStore.js +41 -41
- package/dist/infra/lib/autonomy/ConfirmationStore.js +30 -30
- package/dist/infra/lib/autonomy/EventOutbox.js +38 -38
- package/dist/infra/lib/autonomy/PolicyEngine.js +18 -18
- package/dist/infra/lib/autonomy/SecuritySentinel.js +1 -1
- package/dist/infra/lib/autonomy/SuggestionStore.js +33 -33
- package/dist/infra/lib/embedding/VectorStore.js +22 -22
- package/dist/infra/lib/evolution/AgentAnalyzer.js +10 -10
- package/dist/infra/lib/evolution/DescriptionOptimizer.d.ts +79 -0
- package/dist/infra/lib/evolution/DescriptionOptimizer.d.ts.map +1 -0
- package/dist/infra/lib/evolution/DescriptionOptimizer.js +259 -0
- package/dist/infra/lib/evolution/DescriptionOptimizer.js.map +1 -0
- package/dist/infra/lib/evolution/GenerationRegistry.js +36 -36
- package/dist/infra/lib/evolution/InsightStore.js +90 -90
- package/dist/infra/lib/evolution/RollbackManager.js +5 -5
- package/dist/infra/lib/evolution/SkillBenchmark.d.ts +81 -0
- package/dist/infra/lib/evolution/SkillBenchmark.d.ts.map +1 -0
- package/dist/infra/lib/evolution/SkillBenchmark.js +233 -0
- package/dist/infra/lib/evolution/SkillBenchmark.js.map +1 -0
- package/dist/infra/lib/evolution/SkillClassifier.d.ts +35 -0
- package/dist/infra/lib/evolution/SkillClassifier.d.ts.map +1 -0
- package/dist/infra/lib/evolution/SkillClassifier.js +167 -0
- package/dist/infra/lib/evolution/SkillClassifier.js.map +1 -0
- package/dist/infra/lib/evolution/SkillEvalRunner.d.ts +102 -0
- package/dist/infra/lib/evolution/SkillEvalRunner.d.ts.map +1 -0
- package/dist/infra/lib/evolution/SkillEvalRunner.js +256 -0
- package/dist/infra/lib/evolution/SkillEvalRunner.js.map +1 -0
- package/dist/infra/lib/evolution/SkillGapDetector.js +10 -10
- package/dist/infra/lib/evolution/UsageTracker.js +28 -28
- package/dist/infra/lib/evolution/__tests__/eval.test.d.ts +2 -0
- package/dist/infra/lib/evolution/__tests__/eval.test.d.ts.map +1 -0
- package/dist/infra/lib/evolution/__tests__/eval.test.js +539 -0
- package/dist/infra/lib/evolution/__tests__/eval.test.js.map +1 -0
- package/dist/infra/lib/evolution/index.d.ts +8 -0
- package/dist/infra/lib/evolution/index.d.ts.map +1 -1
- package/dist/infra/lib/evolution/index.js +5 -0
- package/dist/infra/lib/evolution/index.js.map +1 -1
- package/dist/infra/lib/gemini/constants.js +14 -14
- package/dist/infra/lib/gemini/orchestration.js +5 -5
- package/dist/infra/lib/gpt/oauth.js +44 -44
- package/dist/infra/lib/gpt/orchestration.js +4 -4
- package/dist/infra/lib/memory/KnowledgeGraph.js +4 -4
- package/dist/infra/lib/memory/MemorySearch.js +57 -57
- package/dist/infra/lib/memory/MemoryStorage.js +181 -181
- package/dist/infra/lib/memory/ObservationStore.js +28 -28
- package/dist/infra/lib/memory/ReflectionStore.js +30 -30
- package/dist/infra/lib/memory/SessionRAGRetriever.js +7 -7
- package/dist/infra/lib/memory/SessionRAGStore.js +225 -225
- package/dist/infra/lib/memory/SessionSummarizer.js +9 -9
- package/dist/infra/orchestrator/AgentManager.js +12 -12
- package/dist/infra/orchestrator/AgentRegistry.js +65 -65
- package/dist/infra/orchestrator/MultiLlmResearch.js +8 -8
- package/dist/infra/orchestrator/SwarmOrchestrator.test.js +16 -16
- package/dist/infra/orchestrator/parallelResearch.js +24 -24
- package/dist/tools/convention/analyzeComplexity.test.js +115 -115
- package/dist/tools/convention/validateCodeQuality.test.js +104 -104
- package/dist/tools/memory/createMemoryTimeline.js +10 -10
- package/dist/tools/memory/getMemoryGraph.js +12 -12
- package/dist/tools/memory/getSessionContext.js +9 -9
- package/dist/tools/memory/linkMemories.js +14 -14
- package/dist/tools/memory/listMemories.js +4 -4
- package/dist/tools/memory/recallMemory.js +4 -4
- package/dist/tools/memory/saveMemory.js +4 -4
- package/dist/tools/memory/searchMemoriesAdvanced.js +23 -23
- package/dist/tools/semantic/analyzeDependencyGraph.js +12 -12
- package/dist/tools/semantic/astGrep.test.js +6 -6
- package/dist/tools/spec/prdParser.test.js +171 -171
- package/dist/tools/spec/specGenerator.js +169 -169
- package/dist/tools/spec/traceabilityMatrix.js +64 -64
- package/dist/tools/spec/traceabilityMatrix.test.js +28 -28
- package/hooks/gemini-hooks.json +73 -73
- package/hooks/hooks.json +137 -137
- package/hooks/scripts/code-check.js +70 -70
- package/hooks/scripts/context-save.js +212 -212
- package/hooks/scripts/hud-status.js +291 -291
- package/hooks/scripts/keyword-detector.js +214 -214
- package/hooks/scripts/llm-orchestrate.js +646 -646
- package/hooks/scripts/post-edit.js +32 -32
- package/hooks/scripts/pre-tool-guard.js +125 -125
- package/hooks/scripts/prompt-dispatcher.js +185 -185
- package/hooks/scripts/sentinel-guard.js +104 -104
- package/hooks/scripts/session-start.js +106 -106
- package/hooks/scripts/stop-notify.js +209 -209
- package/hooks/scripts/utils.js +100 -100
- package/languages/csharp-unity.md +515 -515
- package/languages/gdscript-godot.md +470 -470
- package/languages/ruby-rails.md +489 -489
- package/languages/typescript-angular.md +433 -433
- package/languages/typescript-astro.md +416 -416
- package/languages/typescript-electron.md +406 -406
- package/languages/typescript-nestjs.md +524 -524
- package/languages/typescript-svelte.md +407 -407
- package/languages/typescript-tauri.md +365 -365
- package/package.json +121 -121
- package/skills/agents-md/SKILL.md +120 -120
- package/skills/arch-guard/SKILL.md +180 -0
- package/skills/brand-assets/SKILL.md +146 -146
- package/skills/capability-loop/SKILL.md +167 -0
- package/skills/characterization-test/SKILL.md +206 -206
- package/skills/commerce-patterns/SKILL.md +59 -59
- package/skills/commit-push-pr/SKILL.md +75 -75
- package/skills/context7-usage/SKILL.md +105 -105
- package/skills/core-capabilities/SKILL.md +48 -48
- package/skills/e2e-commerce/SKILL.md +57 -57
- package/skills/exec-plan/SKILL.md +147 -0
- package/skills/frontend-design/SKILL.md +73 -73
- package/skills/git-worktree/SKILL.md +72 -72
- package/skills/handoff/SKILL.md +109 -109
- package/skills/parallel-research/SKILL.md +87 -87
- package/skills/priority-todos/SKILL.md +63 -63
- package/skills/seo-checklist/SKILL.md +57 -57
- package/skills/techdebt/SKILL.md +122 -122
- package/skills/tool-fallback/SKILL.md +103 -103
- package/skills/typescript-advanced-types/SKILL.md +65 -65
- package/skills/ui-ux-pro-max/SKILL.md +206 -206
- package/skills/vercel-react-best-practices/SKILL.md +59 -59
- package/skills/video-production/SKILL.md +51 -51
- package/vibe/config.json +29 -29
- package/vibe/constitution.md +227 -227
- package/vibe/rules/principles/communication-guide.md +98 -98
- package/vibe/rules/principles/development-philosophy.md +52 -52
- package/vibe/rules/principles/quick-start.md +102 -102
- package/vibe/rules/quality/bdd-contract-testing.md +393 -393
- package/vibe/rules/quality/checklist.md +276 -276
- package/vibe/rules/quality/performance.md +236 -236
- package/vibe/rules/quality/testing-strategy.md +440 -440
- package/vibe/rules/standards/anti-patterns.md +541 -541
- package/vibe/rules/standards/code-structure.md +291 -291
- package/vibe/rules/standards/complexity-metrics.md +313 -313
- package/vibe/rules/standards/git-workflow.md +237 -237
- package/vibe/rules/standards/naming-conventions.md +198 -198
- package/vibe/rules/standards/security.md +305 -305
- package/vibe/rules/writing/document-style.md +74 -74
- package/vibe/setup.sh +31 -31
- package/vibe/templates/constitution-template.md +252 -252
- package/vibe/templates/contract-backend-template.md +526 -526
- package/vibe/templates/contract-frontend-template.md +599 -599
- package/vibe/templates/feature-template.md +96 -96
- package/vibe/templates/spec-template.md +221 -221
- package/vibe/ui-ux-data/charts.csv +26 -26
- package/vibe/ui-ux-data/colors.csv +97 -97
- package/vibe/ui-ux-data/icons.csv +101 -101
- package/vibe/ui-ux-data/landing.csv +31 -31
- package/vibe/ui-ux-data/products.csv +96 -96
- package/vibe/ui-ux-data/react-performance.csv +45 -45
- package/vibe/ui-ux-data/stacks/astro.csv +54 -54
- package/vibe/ui-ux-data/stacks/flutter.csv +53 -53
- package/vibe/ui-ux-data/stacks/html-tailwind.csv +56 -56
- package/vibe/ui-ux-data/stacks/jetpack-compose.csv +53 -53
- package/vibe/ui-ux-data/stacks/nextjs.csv +53 -53
- package/vibe/ui-ux-data/stacks/nuxt-ui.csv +51 -51
- package/vibe/ui-ux-data/stacks/nuxtjs.csv +59 -59
- package/vibe/ui-ux-data/stacks/react-native.csv +52 -52
- package/vibe/ui-ux-data/stacks/react.csv +54 -54
- package/vibe/ui-ux-data/stacks/shadcn.csv +61 -61
- package/vibe/ui-ux-data/stacks/svelte.csv +54 -54
- package/vibe/ui-ux-data/stacks/swiftui.csv +51 -51
- package/vibe/ui-ux-data/stacks/vue.csv +50 -50
- package/vibe/ui-ux-data/styles.csv +68 -68
- package/vibe/ui-ux-data/typography.csv +57 -57
- package/vibe/ui-ux-data/ui-reasoning.csv +101 -101
- package/vibe/ui-ux-data/ux-guidelines.csv +99 -99
- package/vibe/ui-ux-data/version.json +31 -31
- package/vibe/ui-ux-data/web-interface.csv +31 -31
|
@@ -11,20 +11,20 @@ export class UsageTracker {
|
|
|
11
11
|
this.initializeTables();
|
|
12
12
|
}
|
|
13
13
|
initializeTables() {
|
|
14
|
-
this.db.exec(`
|
|
15
|
-
CREATE TABLE IF NOT EXISTS usage_events (
|
|
16
|
-
id TEXT PRIMARY KEY,
|
|
17
|
-
generationId TEXT NOT NULL,
|
|
18
|
-
sessionId TEXT,
|
|
19
|
-
matchedPrompt TEXT,
|
|
20
|
-
feedback TEXT CHECK(feedback IN ('positive','negative','neutral') OR feedback IS NULL),
|
|
21
|
-
createdAt TEXT NOT NULL
|
|
22
|
-
);
|
|
23
|
-
|
|
24
|
-
CREATE INDEX IF NOT EXISTS idx_ue_gen ON usage_events(generationId);
|
|
25
|
-
CREATE INDEX IF NOT EXISTS idx_ue_session ON usage_events(sessionId);
|
|
26
|
-
CREATE INDEX IF NOT EXISTS idx_ue_feedback ON usage_events(feedback);
|
|
27
|
-
CREATE INDEX IF NOT EXISTS idx_ue_created ON usage_events(createdAt);
|
|
14
|
+
this.db.exec(`
|
|
15
|
+
CREATE TABLE IF NOT EXISTS usage_events (
|
|
16
|
+
id TEXT PRIMARY KEY,
|
|
17
|
+
generationId TEXT NOT NULL,
|
|
18
|
+
sessionId TEXT,
|
|
19
|
+
matchedPrompt TEXT,
|
|
20
|
+
feedback TEXT CHECK(feedback IN ('positive','negative','neutral') OR feedback IS NULL),
|
|
21
|
+
createdAt TEXT NOT NULL
|
|
22
|
+
);
|
|
23
|
+
|
|
24
|
+
CREATE INDEX IF NOT EXISTS idx_ue_gen ON usage_events(generationId);
|
|
25
|
+
CREATE INDEX IF NOT EXISTS idx_ue_session ON usage_events(sessionId);
|
|
26
|
+
CREATE INDEX IF NOT EXISTS idx_ue_feedback ON usage_events(feedback);
|
|
27
|
+
CREATE INDEX IF NOT EXISTS idx_ue_created ON usage_events(createdAt);
|
|
28
28
|
`);
|
|
29
29
|
}
|
|
30
30
|
/**
|
|
@@ -33,9 +33,9 @@ export class UsageTracker {
|
|
|
33
33
|
recordUsage(generationId, sessionId, matchedPrompt) {
|
|
34
34
|
const id = `ue-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
|
|
35
35
|
const now = new Date().toISOString();
|
|
36
|
-
this.db.prepare(`
|
|
37
|
-
INSERT INTO usage_events (id, generationId, sessionId, matchedPrompt, createdAt)
|
|
38
|
-
VALUES (?, ?, ?, ?, ?)
|
|
36
|
+
this.db.prepare(`
|
|
37
|
+
INSERT INTO usage_events (id, generationId, sessionId, matchedPrompt, createdAt)
|
|
38
|
+
VALUES (?, ?, ?, ?, ?)
|
|
39
39
|
`).run(id, generationId, sessionId || null, matchedPrompt || null, now);
|
|
40
40
|
// Also increment usage count in generations table
|
|
41
41
|
this.registry.incrementUsage(generationId);
|
|
@@ -45,8 +45,8 @@ export class UsageTracker {
|
|
|
45
45
|
* Record explicit user feedback for a usage event
|
|
46
46
|
*/
|
|
47
47
|
setFeedback(eventId, feedback) {
|
|
48
|
-
const result = this.db.prepare(`
|
|
49
|
-
UPDATE usage_events SET feedback = ? WHERE id = ?
|
|
48
|
+
const result = this.db.prepare(`
|
|
49
|
+
UPDATE usage_events SET feedback = ? WHERE id = ?
|
|
50
50
|
`).run(feedback, eventId);
|
|
51
51
|
return result.changes > 0;
|
|
52
52
|
}
|
|
@@ -54,8 +54,8 @@ export class UsageTracker {
|
|
|
54
54
|
* Record explicit feedback for a generation (latest event)
|
|
55
55
|
*/
|
|
56
56
|
setFeedbackForGeneration(generationId, feedback) {
|
|
57
|
-
const event = this.db.prepare(`
|
|
58
|
-
SELECT id FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC LIMIT 1
|
|
57
|
+
const event = this.db.prepare(`
|
|
58
|
+
SELECT id FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC LIMIT 1
|
|
59
59
|
`).get(generationId);
|
|
60
60
|
if (!event)
|
|
61
61
|
return false;
|
|
@@ -76,9 +76,9 @@ export class UsageTracker {
|
|
|
76
76
|
feedback = 'neutral';
|
|
77
77
|
}
|
|
78
78
|
// Only apply to events without explicit feedback
|
|
79
|
-
const result = this.db.prepare(`
|
|
80
|
-
UPDATE usage_events SET feedback = ?
|
|
81
|
-
WHERE sessionId = ? AND feedback IS NULL
|
|
79
|
+
const result = this.db.prepare(`
|
|
80
|
+
UPDATE usage_events SET feedback = ?
|
|
81
|
+
WHERE sessionId = ? AND feedback IS NULL
|
|
82
82
|
`).run(feedback, sessionId);
|
|
83
83
|
return result.changes;
|
|
84
84
|
}
|
|
@@ -86,8 +86,8 @@ export class UsageTracker {
|
|
|
86
86
|
* Get usage events for a generation
|
|
87
87
|
*/
|
|
88
88
|
getByGeneration(generationId) {
|
|
89
|
-
const rows = this.db.prepare(`
|
|
90
|
-
SELECT * FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC
|
|
89
|
+
const rows = this.db.prepare(`
|
|
90
|
+
SELECT * FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC
|
|
91
91
|
`).all(generationId);
|
|
92
92
|
return rows.map(this.rowToEvent);
|
|
93
93
|
}
|
|
@@ -138,8 +138,8 @@ export class UsageTracker {
|
|
|
138
138
|
* Get total usage count for a generation
|
|
139
139
|
*/
|
|
140
140
|
getUsageCount(generationId) {
|
|
141
|
-
const result = this.db.prepare(`
|
|
142
|
-
SELECT COUNT(*) as cnt FROM usage_events WHERE generationId = ?
|
|
141
|
+
const result = this.db.prepare(`
|
|
142
|
+
SELECT COUNT(*) as cnt FROM usage_events WHERE generationId = ?
|
|
143
143
|
`).get(generationId);
|
|
144
144
|
return result.cnt;
|
|
145
145
|
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.test.d.ts","sourceRoot":"","sources":["../../../../../src/infra/lib/evolution/__tests__/eval.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import { join } from 'path';
|
|
3
|
+
import { tmpdir } from 'os';
|
|
4
|
+
import { rmSync } from 'fs';
|
|
5
|
+
import { MemoryStorage } from '../../memory/MemoryStorage.js';
|
|
6
|
+
import { SkillEvalRunner } from '../SkillEvalRunner.js';
|
|
7
|
+
import { SkillBenchmark } from '../SkillBenchmark.js';
|
|
8
|
+
import { SkillClassifier } from '../SkillClassifier.js';
|
|
9
|
+
import { DescriptionOptimizer } from '../DescriptionOptimizer.js';
|
|
10
|
+
// ─── SkillEvalRunner ─────────────────────────────────────────────────────────
|
|
11
|
+
describe('SkillEvalRunner', () => {
|
|
12
|
+
let storage;
|
|
13
|
+
let runner;
|
|
14
|
+
let testDir;
|
|
15
|
+
beforeEach(() => {
|
|
16
|
+
testDir = join(tmpdir(), `eval-runner-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
|
17
|
+
storage = new MemoryStorage(testDir);
|
|
18
|
+
runner = new SkillEvalRunner(storage);
|
|
19
|
+
});
|
|
20
|
+
afterEach(() => {
|
|
21
|
+
storage.close();
|
|
22
|
+
try {
|
|
23
|
+
rmSync(testDir, { recursive: true, force: true });
|
|
24
|
+
}
|
|
25
|
+
catch { /* ignore */ }
|
|
26
|
+
});
|
|
27
|
+
it('should create eval set and retrieve cases', () => {
|
|
28
|
+
const cases = runner.createEvalSet({
|
|
29
|
+
skillName: 'csv-analyzer',
|
|
30
|
+
evals: [
|
|
31
|
+
{
|
|
32
|
+
prompt: 'Analyze this CSV file and generate a summary',
|
|
33
|
+
expectedOutput: 'A statistical summary of the CSV data',
|
|
34
|
+
files: ['data.csv'],
|
|
35
|
+
assertions: [
|
|
36
|
+
{ description: 'Contains row count', type: 'contains', value: 'rows' },
|
|
37
|
+
{ description: 'Contains column info', type: 'contains', value: 'columns' },
|
|
38
|
+
],
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
prompt: 'Parse the CSV and find outliers',
|
|
42
|
+
expectedOutput: 'List of outlier values',
|
|
43
|
+
},
|
|
44
|
+
],
|
|
45
|
+
});
|
|
46
|
+
expect(cases).toHaveLength(2);
|
|
47
|
+
expect(cases[0].skillName).toBe('csv-analyzer');
|
|
48
|
+
expect(cases[0].assertions).toHaveLength(2);
|
|
49
|
+
expect(cases[0].files).toEqual(['data.csv']);
|
|
50
|
+
expect(cases[1].assertions).toHaveLength(0);
|
|
51
|
+
const retrieved = runner.getEvalCases('csv-analyzer');
|
|
52
|
+
expect(retrieved).toHaveLength(2);
|
|
53
|
+
});
|
|
54
|
+
it('should start and complete eval runs', () => {
|
|
55
|
+
const cases = runner.createEvalSet({
|
|
56
|
+
skillName: 'test-skill',
|
|
57
|
+
evals: [{ prompt: 'Test prompt', expectedOutput: 'Expected output' }],
|
|
58
|
+
});
|
|
59
|
+
const runId = runner.startRun(cases[0].id, 'test-skill', 'with_skill');
|
|
60
|
+
expect(runId).toBeTruthy();
|
|
61
|
+
runner.completeRun(runId, 'Generated output with rows and columns', [
|
|
62
|
+
{ assertionId: 'a1', description: 'Has content', passed: true, evidence: 'Output is non-empty' },
|
|
63
|
+
], 1500, 5000);
|
|
64
|
+
const runs = runner.getRunsForEval(cases[0].id);
|
|
65
|
+
expect(runs).toHaveLength(1);
|
|
66
|
+
expect(runs[0].status).toBe('passed');
|
|
67
|
+
expect(runs[0].durationMs).toBe(1500);
|
|
68
|
+
expect(runs[0].tokenCount).toBe(5000);
|
|
69
|
+
});
|
|
70
|
+
it('should mark run as failed when assertions fail', () => {
|
|
71
|
+
const cases = runner.createEvalSet({
|
|
72
|
+
skillName: 'test-skill',
|
|
73
|
+
evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
|
|
74
|
+
});
|
|
75
|
+
const runId = runner.startRun(cases[0].id, 'test-skill', 'with_skill');
|
|
76
|
+
runner.completeRun(runId, 'Bad output', [
|
|
77
|
+
{ assertionId: 'a1', description: 'Has rows', passed: false, evidence: 'Missing' },
|
|
78
|
+
{ assertionId: 'a2', description: 'Has cols', passed: true, evidence: 'Present' },
|
|
79
|
+
], 1000, 3000);
|
|
80
|
+
const runs = runner.getRunsForEval(cases[0].id);
|
|
81
|
+
expect(runs[0].status).toBe('failed');
|
|
82
|
+
});
|
|
83
|
+
it('should handle error runs', () => {
|
|
84
|
+
const cases = runner.createEvalSet({
|
|
85
|
+
skillName: 'test-skill',
|
|
86
|
+
evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
|
|
87
|
+
});
|
|
88
|
+
const runId = runner.startRun(cases[0].id, 'test-skill', 'baseline');
|
|
89
|
+
runner.failRun(runId, 'Connection timeout');
|
|
90
|
+
const runs = runner.getRunsForEval(cases[0].id);
|
|
91
|
+
expect(runs[0].status).toBe('error');
|
|
92
|
+
expect(runs[0].output).toBe('Connection timeout');
|
|
93
|
+
});
|
|
94
|
+
it('should grade output against assertions', () => {
|
|
95
|
+
const assertions = [
|
|
96
|
+
{ id: 'a1', description: 'Contains summary', type: 'contains', value: 'summary' },
|
|
97
|
+
{ id: 'a2', description: 'No errors', type: 'not_contains', value: 'error' },
|
|
98
|
+
{ id: 'a3', description: 'Has number', type: 'matches_regex', value: '\\d+' },
|
|
99
|
+
{ id: 'a4', description: 'Custom check', type: 'custom', value: 'quality > 8' },
|
|
100
|
+
];
|
|
101
|
+
const grades = runner.gradeOutput('Here is the summary: 42 items found', assertions);
|
|
102
|
+
expect(grades).toHaveLength(4);
|
|
103
|
+
expect(grades[0].passed).toBe(true); // contains 'summary'
|
|
104
|
+
expect(grades[1].passed).toBe(true); // not contains 'error'
|
|
105
|
+
expect(grades[2].passed).toBe(true); // matches \d+
|
|
106
|
+
expect(grades[3].passed).toBe(false); // custom always false without external grading
|
|
107
|
+
});
|
|
108
|
+
it('should grade failing contains assertion', () => {
|
|
109
|
+
const assertions = [
|
|
110
|
+
{ id: 'a1', description: 'Contains missing word', type: 'contains', value: 'nonexistent' },
|
|
111
|
+
];
|
|
112
|
+
const grades = runner.gradeOutput('Some output text', assertions);
|
|
113
|
+
expect(grades[0].passed).toBe(false);
|
|
114
|
+
expect(grades[0].evidence).toContain('does not contain');
|
|
115
|
+
});
|
|
116
|
+
it('should grade failing not_contains assertion', () => {
|
|
117
|
+
const assertions = [
|
|
118
|
+
{ id: 'a1', description: 'No errors', type: 'not_contains', value: 'error' },
|
|
119
|
+
];
|
|
120
|
+
const grades = runner.gradeOutput('An error occurred', assertions);
|
|
121
|
+
expect(grades[0].passed).toBe(false);
|
|
122
|
+
expect(grades[0].evidence).toContain('unexpectedly contains');
|
|
123
|
+
});
|
|
124
|
+
it('should handle invalid regex gracefully', () => {
|
|
125
|
+
const assertions = [
|
|
126
|
+
{ id: 'a1', description: 'Bad regex', type: 'matches_regex', value: '[invalid' },
|
|
127
|
+
];
|
|
128
|
+
const grades = runner.gradeOutput('test', assertions);
|
|
129
|
+
expect(grades[0].passed).toBe(false);
|
|
130
|
+
expect(grades[0].evidence).toContain('Invalid regex');
|
|
131
|
+
});
|
|
132
|
+
it('should get latest runs grouped by eval and variant', () => {
|
|
133
|
+
const cases = runner.createEvalSet({
|
|
134
|
+
skillName: 'grouped-skill',
|
|
135
|
+
evals: [
|
|
136
|
+
{ prompt: 'Eval 1', expectedOutput: 'Expected 1' },
|
|
137
|
+
{ prompt: 'Eval 2', expectedOutput: 'Expected 2' },
|
|
138
|
+
],
|
|
139
|
+
});
|
|
140
|
+
// Run both variants for eval 1
|
|
141
|
+
const wsRun = runner.startRun(cases[0].id, 'grouped-skill', 'with_skill');
|
|
142
|
+
runner.completeRun(wsRun, 'Output ws', [], 100, 500);
|
|
143
|
+
const blRun = runner.startRun(cases[0].id, 'grouped-skill', 'baseline');
|
|
144
|
+
runner.completeRun(blRun, 'Output bl', [], 200, 600);
|
|
145
|
+
// Only with_skill for eval 2
|
|
146
|
+
const wsRun2 = runner.startRun(cases[1].id, 'grouped-skill', 'with_skill');
|
|
147
|
+
runner.completeRun(wsRun2, 'Output ws2', [], 150, 550);
|
|
148
|
+
const grouped = runner.getLatestRuns('grouped-skill');
|
|
149
|
+
expect(grouped.size).toBe(2);
|
|
150
|
+
const eval1 = grouped.get(cases[0].id);
|
|
151
|
+
expect(eval1?.withSkill).not.toBeNull();
|
|
152
|
+
expect(eval1?.baseline).not.toBeNull();
|
|
153
|
+
const eval2 = grouped.get(cases[1].id);
|
|
154
|
+
expect(eval2?.withSkill).not.toBeNull();
|
|
155
|
+
expect(eval2?.baseline).toBeNull();
|
|
156
|
+
});
|
|
157
|
+
it('should delete eval set and associated runs', () => {
|
|
158
|
+
runner.createEvalSet({
|
|
159
|
+
skillName: 'to-delete',
|
|
160
|
+
evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
|
|
161
|
+
});
|
|
162
|
+
const cases = runner.getEvalCases('to-delete');
|
|
163
|
+
runner.startRun(cases[0].id, 'to-delete', 'with_skill');
|
|
164
|
+
const deleted = runner.deleteEvalSet('to-delete');
|
|
165
|
+
expect(deleted).toBe(1);
|
|
166
|
+
expect(runner.getEvalCases('to-delete')).toHaveLength(0);
|
|
167
|
+
});
|
|
168
|
+
it('should get eval case by ID', () => {
|
|
169
|
+
const cases = runner.createEvalSet({
|
|
170
|
+
skillName: 'by-id-test',
|
|
171
|
+
evals: [{ prompt: 'Specific prompt', expectedOutput: 'Specific output' }],
|
|
172
|
+
});
|
|
173
|
+
const found = runner.getEvalCase(cases[0].id);
|
|
174
|
+
expect(found).not.toBeNull();
|
|
175
|
+
expect(found.prompt).toBe('Specific prompt');
|
|
176
|
+
const notFound = runner.getEvalCase('nonexistent');
|
|
177
|
+
expect(notFound).toBeNull();
|
|
178
|
+
});
|
|
179
|
+
});
|
|
180
|
+
// ─── SkillBenchmark ──────────────────────────────────────────────────────────
|
|
181
|
+
describe('SkillBenchmark', () => {
|
|
182
|
+
let storage;
|
|
183
|
+
let runner;
|
|
184
|
+
let benchmark;
|
|
185
|
+
let testDir;
|
|
186
|
+
beforeEach(() => {
|
|
187
|
+
testDir = join(tmpdir(), `eval-bench-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
|
188
|
+
storage = new MemoryStorage(testDir);
|
|
189
|
+
runner = new SkillEvalRunner(storage);
|
|
190
|
+
benchmark = new SkillBenchmark(storage);
|
|
191
|
+
});
|
|
192
|
+
afterEach(() => {
|
|
193
|
+
storage.close();
|
|
194
|
+
try {
|
|
195
|
+
rmSync(testDir, { recursive: true, force: true });
|
|
196
|
+
}
|
|
197
|
+
catch { /* ignore */ }
|
|
198
|
+
});
|
|
199
|
+
function setupSkillWithRuns(skillName, wsPass, blPass) {
|
|
200
|
+
const cases = runner.createEvalSet({
|
|
201
|
+
skillName,
|
|
202
|
+
evals: [
|
|
203
|
+
{ prompt: 'Eval A', expectedOutput: 'Expected A', assertions: [{ description: 'Check A', type: 'contains', value: 'result' }] },
|
|
204
|
+
{ prompt: 'Eval B', expectedOutput: 'Expected B', assertions: [{ description: 'Check B', type: 'contains', value: 'data' }] },
|
|
205
|
+
],
|
|
206
|
+
});
|
|
207
|
+
for (const evalCase of cases) {
|
|
208
|
+
const wsId = runner.startRun(evalCase.id, skillName, 'with_skill');
|
|
209
|
+
const wsOutput = wsPass ? 'Here is the result with data' : 'Incomplete output';
|
|
210
|
+
const wsGrades = runner.gradeOutput(wsOutput, evalCase.assertions);
|
|
211
|
+
runner.completeRun(wsId, wsOutput, wsGrades, 1200, 4500);
|
|
212
|
+
const blId = runner.startRun(evalCase.id, skillName, 'baseline');
|
|
213
|
+
const blOutput = blPass ? 'Here is the result with data' : 'No useful output';
|
|
214
|
+
const blGrades = runner.gradeOutput(blOutput, evalCase.assertions);
|
|
215
|
+
runner.completeRun(blId, blOutput, blGrades, 1500, 5000);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
it('should aggregate benchmark results', () => {
|
|
219
|
+
setupSkillWithRuns('bench-test', true, false);
|
|
220
|
+
const result = benchmark.aggregate('bench-test');
|
|
221
|
+
expect(result.skillName).toBe('bench-test');
|
|
222
|
+
expect(result.iteration).toBe(1);
|
|
223
|
+
expect(result.summary.totalEvals).toBe(2);
|
|
224
|
+
expect(result.summary.withSkill.passRate).toBe(1.0);
|
|
225
|
+
expect(result.summary.baseline.passRate).toBe(0);
|
|
226
|
+
expect(result.summary.delta.passRateDelta).toBe(1.0);
|
|
227
|
+
expect(result.evalBreakdowns).toHaveLength(2);
|
|
228
|
+
});
|
|
229
|
+
// Two aggregate() calls for the same skill should yield two history entries
// numbered 1 and 2, in that order.
it('should track benchmark history across iterations', () => {
    const skill = 'history-test';

    setupSkillWithRuns(skill, true, false);
    benchmark.aggregate(skill);

    // Second iteration with improved baseline
    setupSkillWithRuns(skill, true, true);
    benchmark.aggregate(skill);

    const entries = benchmark.getHistory(skill);
    expect(entries).toHaveLength(2);
    [1, 2].forEach((expectedIteration, position) => {
        expect(entries[position].iteration).toBe(expectedIteration);
    });
});
|
|
240
|
+
// getLatest() returns the most recent aggregate for a known skill and null
// for a skill that has never been aggregated.
it('should get latest benchmark', () => {
    const skill = 'latest-test';
    setupSkillWithRuns(skill, true, false);
    benchmark.aggregate(skill);

    const newest = benchmark.getLatest(skill);
    expect(newest).not.toBeNull();
    expect(newest.iteration).toBe(1);

    // No benchmark was ever recorded under this name.
    expect(benchmark.getLatest('nonexistent')).toBeNull();
});
|
|
249
|
+
// After two aggregated iterations, compare(1, 2) must populate both sides
// of the comparison as well as the improvement field.
it('should compare two iterations', () => {
    const skill = 'compare-test';
    for (const baselinePasses of [false, true]) {
        setupSkillWithRuns(skill, true, baselinePasses);
        benchmark.aggregate(skill);
    }

    const diff = benchmark.compare(skill, 1, 2);
    for (const field of ['iterationA', 'iterationB', 'improvement']) {
        expect(diff[field]).not.toBeNull();
    }
});
|
|
259
|
+
// Comparing iterations of a skill with no recorded benchmarks yields null
// for both sides and for the improvement field.
it('should handle compare with missing iteration', () => {
    const diff = benchmark.compare('missing', 1, 2);
    for (const field of ['iterationA', 'iterationB', 'improvement']) {
        expect(diff[field]).toBeNull();
    }
});
|
|
265
|
+
// The formatted report must carry the skill title plus the three section
// labels checked below.
it('should format benchmark report as markdown', () => {
    setupSkillWithRuns('report-test', true, false);
    const aggregated = benchmark.aggregate('report-test');
    const markdown = benchmark.formatReport(aggregated);

    const expectedFragments = [
        '# Benchmark: report-test',
        'Pass Rate',
        'Mean Duration',
        'Per-Eval Breakdown',
    ];
    for (const fragment of expectedFragments) {
        expect(markdown).toContain(fragment);
    }
});
|
|
274
|
+
// Runs three eval cases with distinct duration/token figures so that the
// aggregated with_skill standard deviations must be strictly positive.
it('should compute stddev for duration and tokens', () => {
    const skill = 'stddev-test';
    const cases = runner.createEvalSet({
        skillName: skill,
        evals: ['A', 'B', 'C'].map((label) => ({ prompt: label, expectedOutput: label })),
    });

    // Varying durations and tokens
    const durations = [1000, 2000, 3000];
    const tokens = [4000, 5000, 6000];

    for (let idx = 0; idx < cases.length; idx += 1) {
        const caseId = cases[idx].id;
        const duration = durations[idx];
        const tokenCount = tokens[idx];

        const withSkillRun = runner.startRun(caseId, skill, 'with_skill');
        runner.completeRun(withSkillRun, 'output', [], duration, tokenCount);

        // Baseline runs are recorded with both figures offset by 500.
        const baselineRun = runner.startRun(caseId, skill, 'baseline');
        runner.completeRun(baselineRun, 'output', [], duration + 500, tokenCount + 500);
    }

    const aggregated = benchmark.aggregate(skill);
    const withSkillStats = aggregated.summary.withSkill;
    expect(withSkillStats.stddevDurationMs).toBeGreaterThan(0);
    expect(withSkillStats.stddevTokens).toBeGreaterThan(0);
});
|
|
296
|
+
});
|
|
297
|
+
// ─── SkillClassifier ─────────────────────────────────────────────────────────
|
|
298
|
+
describe('SkillClassifier', () => {
    let storage;
    let runner;
    let benchmarkObj;
    let classifier;
    let testDir;

    // Every test gets a fresh storage instance created from a uniquely named
    // path under the OS temp directory, shared by the runner, benchmark and
    // classifier under test.
    beforeEach(() => {
        testDir = join(tmpdir(), `eval-class-${Date.now()}-${Math.random().toString(36).slice(2)}`);
        storage = new MemoryStorage(testDir);
        runner = new SkillEvalRunner(storage);
        benchmarkObj = new SkillBenchmark(storage);
        classifier = new SkillClassifier(storage);
    });

    afterEach(() => {
        try {
            storage.close();
        }
        finally {
            // Remove the temp directory even when close() throws, so a failing
            // teardown does not leave directories behind. The close() error
            // still propagates after cleanup.
            try {
                rmSync(testDir, { recursive: true, force: true });
            }
            catch { /* best-effort cleanup: ignore */ }
        }
    });

    /**
     * Records one benchmark iteration for `skillName` made of 10 eval cases.
     *
     * Each case asserts that the output contains 'pass'. The first
     * `round(rate * 10)` cases of each configuration emit 'pass' and the rest
     * emit 'fail', which fixes the pass count for with_skill and baseline
     * runs. The runs are then aggregated through `benchmarkObj`.
     *
     * @param {string} skillName - Skill the eval set and runs are recorded under.
     * @param {number} wsPassRate - Target with_skill pass rate in [0, 1].
     * @param {number} blPassRate - Target baseline pass rate in [0, 1].
     */
    function createBenchmark(skillName, wsPassRate, blPassRate) {
        const totalEvals = 10;
        const wsPassCount = Math.round(wsPassRate * totalEvals);
        const blPassCount = Math.round(blPassRate * totalEvals);
        const evals = Array.from({ length: totalEvals }, (_, i) => ({
            prompt: `Eval ${i}`,
            expectedOutput: `Expected ${i}`,
            assertions: [{ description: 'Check', type: 'contains', value: 'pass' }],
        }));
        const cases = runner.createEvalSet({ skillName, evals });
        for (let i = 0; i < cases.length; i++) {
            const wsId = runner.startRun(cases[i].id, skillName, 'with_skill');
            const wsOutput = i < wsPassCount ? 'pass' : 'fail';
            const wsGrades = runner.gradeOutput(wsOutput, cases[i].assertions);
            runner.completeRun(wsId, wsOutput, wsGrades, 1000, 5000);
            const blId = runner.startRun(cases[i].id, skillName, 'baseline');
            const blOutput = i < blPassCount ? 'pass' : 'fail';
            const blGrades = runner.gradeOutput(blOutput, cases[i].assertions);
            runner.completeRun(blId, blOutput, blGrades, 1200, 5500);
        }
        benchmarkObj.aggregate(skillName);
    }

    it('should classify as unknown when no benchmarks exist', () => {
        const result = classifier.classify('nonexistent');
        expect(result.category).toBe('unknown');
        expect(result.confidence).toBe(0);
        expect(result.trend).toBe('insufficient_data');
    });

    it('should classify as capability_uplift when baseline is high', () => {
        createBenchmark('cap-uplift', 0.9, 0.8);
        const result = classifier.classify('cap-uplift');
        expect(result.category).toBe('capability_uplift');
        expect(result.baselinePassRate).toBeGreaterThanOrEqual(0.7);
    });

    it('should classify as encoded_preference when baseline is low and gap is large', () => {
        createBenchmark('enc-pref', 0.9, 0.1);
        const result = classifier.classify('enc-pref');
        expect(result.category).toBe('encoded_preference');
        expect(result.baselinePassRate).toBeLessThanOrEqual(0.3);
        expect(result.withSkillPassRate).toBeGreaterThan(result.baselinePassRate);
    });

    it('should detect converging trend as capability_uplift', () => {
        // First benchmark: large gap
        createBenchmark('converge', 0.9, 0.2);
        // Second benchmark: gap shrinks
        createBenchmark('converge', 0.9, 0.7);
        const result = classifier.classify('converge');
        expect(result.trend).toBe('converging');
    });

    it('should detect stable trend as encoded_preference', () => {
        // Both benchmarks: consistent gap
        createBenchmark('stable', 0.9, 0.1);
        createBenchmark('stable', 0.9, 0.1);
        const result = classifier.classify('stable');
        expect(result.trend).toBe('stable');
    });

    it('should classify from explicit rates', () => {
        // No benchmark is recorded here; the rates are passed in directly.
        const result = classifier.classifyFromRates('test-skill', 0.95, 0.1);
        expect(result.category).toBe('encoded_preference');
        expect(result.skillName).toBe('test-skill');
    });

    it('should detect becoming obsolete', () => {
        createBenchmark('obsolete', 0.9, 0.85);
        const result = classifier.isBecomingObsolete('obsolete');
        expect(result.obsolete).toBe(true);
        expect(result.reason).toContain('well without the skill');
    });

    it('should not flag non-obsolete skills', () => {
        createBenchmark('healthy', 0.9, 0.1);
        const result = classifier.isBecomingObsolete('healthy');
        expect(result.obsolete).toBe(false);
    });
});
|
|
391
|
+
// ─── DescriptionOptimizer ────────────────────────────────────────────────────
|
|
392
|
+
describe('DescriptionOptimizer', () => {
    let storage;
    let optimizer;
    let testDir;

    // Every test gets a fresh storage instance created from a uniquely named
    // path under the OS temp directory.
    beforeEach(() => {
        testDir = join(tmpdir(), `eval-opt-${Date.now()}-${Math.random().toString(36).slice(2)}`);
        storage = new MemoryStorage(testDir);
        optimizer = new DescriptionOptimizer(storage);
    });

    afterEach(() => {
        try {
            storage.close();
        }
        finally {
            // Remove the temp directory even when close() throws, so a failing
            // teardown does not leave directories behind. The close() error
            // still propagates after cleanup.
            try {
                rmSync(testDir, { recursive: true, force: true });
            }
            catch { /* best-effort cleanup: ignore */ }
        }
    });

    it('should split eval set into train/test with stratification', () => {
        // Four positive and four negative trigger queries.
        const queries = [
            { query: 'analyze this csv file', shouldTrigger: true },
            { query: 'parse my spreadsheet data', shouldTrigger: true },
            { query: 'generate a chart from data', shouldTrigger: true },
            { query: 'create csv summary', shouldTrigger: true },
            { query: 'write me an email', shouldTrigger: false },
            { query: 'fix this bug in the auth module', shouldTrigger: false },
            { query: 'deploy the application', shouldTrigger: false },
            { query: 'review my pull request', shouldTrigger: false },
        ];
        const { train, test } = optimizer.splitEvalSet(queries);
        // Both sets should have queries
        expect(train.length).toBeGreaterThan(0);
        expect(test.length).toBeGreaterThan(0);
        // Combined should cover all queries
        expect(train.length + test.length).toBe(queries.length);
        // Both sets should have both types
        expect(train.some(q => q.shouldTrigger)).toBe(true);
        expect(train.some(q => !q.shouldTrigger)).toBe(true);
        expect(test.some(q => q.shouldTrigger)).toBe(true);
        expect(test.some(q => !q.shouldTrigger)).toBe(true);
    });

    it('should evaluate description against trigger queries', () => {
        const description = 'Analyze CSV files, parse spreadsheet data, generate statistical summaries';
        const queries = [
            { query: 'analyze this csv file and show me stats', shouldTrigger: true },
            { query: 'write a blog post about cooking', shouldTrigger: false },
        ];
        const results = optimizer.evaluateDescription(description, queries);
        // Only result count and the echoed shouldTrigger flags are checked;
        // the actual trigger decision is not asserted here.
        expect(results).toHaveLength(2);
        expect(results[0].shouldTrigger).toBe(true);
        expect(results[1].shouldTrigger).toBe(false);
    });

    it('should score results correctly', () => {
        const allCorrect = [
            { query: 'q1', shouldTrigger: true, didTrigger: true, triggerRate: 0.5, correct: true },
            { query: 'q2', shouldTrigger: false, didTrigger: false, triggerRate: 0.0, correct: true },
        ];
        expect(optimizer.scoreResults(allCorrect)).toBe(1.0);
        const halfCorrect = [
            { query: 'q1', shouldTrigger: true, didTrigger: true, triggerRate: 0.5, correct: true },
            { query: 'q2', shouldTrigger: false, didTrigger: true, triggerRate: 0.3, correct: false },
        ];
        expect(optimizer.scoreResults(halfCorrect)).toBe(0.5);
        // An empty result list scores 0.
        expect(optimizer.scoreResults([])).toBe(0);
    });

    it('should suggest improvements for false negatives', () => {
        const description = 'Process data files';
        // The query should have triggered but did not.
        const failedResults = [
            { query: 'analyze csv spreadsheet', shouldTrigger: true, didTrigger: false, triggerRate: 0.05, correct: false },
        ];
        const improved = optimizer.suggestImprovement(description, failedResults);
        expect(improved).not.toBe(description);
        expect(improved.length).toBeGreaterThan(description.length);
    });

    it('should suggest improvements for false positives', () => {
        const description = 'Analyze data and generate reports from spreadsheets';
        // The query triggered although it should not have.
        const failedResults = [
            { query: 'generate random passwords for security testing', shouldTrigger: false, didTrigger: true, triggerRate: 0.2, correct: false },
        ];
        const improved = optimizer.suggestImprovement(description, failedResults);
        expect(improved).not.toBe(description);
        expect(improved).toContain('Does NOT');
    });

    it('should return original description when no failures', () => {
        const description = 'Perfect description';
        const improved = optimizer.suggestImprovement(description, []);
        expect(improved).toBe(description);
    });

    it('should run full optimization loop', () => {
        const queries = [
            { query: 'analyze this csv file and create a statistical report', shouldTrigger: true },
            { query: 'parse my data spreadsheet and find patterns', shouldTrigger: true },
            { query: 'summarize the csv columns with averages', shouldTrigger: true },
            { query: 'generate csv from database export', shouldTrigger: true },
            { query: 'help me write a novel', shouldTrigger: false },
            { query: 'fix the authentication bug', shouldTrigger: false },
            { query: 'deploy to production servers', shouldTrigger: false },
            { query: 'review this pull request code', shouldTrigger: false },
        ];
        const result = optimizer.optimize('csv-analyzer', 'Analyze CSV files', queries, 3);
        expect(result.skillName).toBe('csv-analyzer');
        expect(result.originalDescription).toBe('Analyze CSV files');
        expect(result.candidates.length).toBeGreaterThan(0);
        expect(result.candidates.length).toBeLessThanOrEqual(3);
        // Best description should be selected by test score
        expect(result.bestDescription).toBeTruthy();
    });

    it('should persist and retrieve optimization history', () => {
        const queries = [
            { query: 'analyze csv data', shouldTrigger: true },
            { query: 'write a poem', shouldTrigger: false },
            { query: 'parse spreadsheet', shouldTrigger: true },
            { query: 'cook dinner recipe', shouldTrigger: false },
        ];
        optimizer.optimize('persist-test', 'Initial description', queries, 2);
        // A single optimize() call should leave exactly one history entry.
        const history = optimizer.getHistory('persist-test');
        expect(history).toHaveLength(1);
        const latest = optimizer.getLatest('persist-test');
        expect(latest).not.toBeNull();
        expect(latest.skillName).toBe('persist-test');
    });

    it('should evaluate candidate on both train and test sets', () => {
        const train = [
            { query: 'analyze csv file', shouldTrigger: true },
            { query: 'write poetry', shouldTrigger: false },
        ];
        const test = [
            { query: 'parse data spreadsheet', shouldTrigger: true },
            { query: 'fix security bug', shouldTrigger: false },
        ];
        const candidate = optimizer.evaluateCandidate('Analyze CSV data files', train, test, 1);
        expect(candidate.iteration).toBe(1);
        // Scores are only range-checked, not pinned to exact values.
        expect(candidate.trainScore).toBeGreaterThanOrEqual(0);
        expect(candidate.trainScore).toBeLessThanOrEqual(1);
        expect(candidate.testScore).toBeGreaterThanOrEqual(0);
        expect(candidate.testScore).toBeLessThanOrEqual(1);
        expect(candidate.results).toHaveLength(4); // train + test
    });

    it('should stop optimization early on perfect score', () => {
        // Simple case where description already matches perfectly
        const queries = [
            { query: 'csv analysis report with statistics', shouldTrigger: true },
            { query: 'completely unrelated cooking recipe topic', shouldTrigger: false },
        ];
        const result = optimizer.optimize('perfect-test', 'CSV analysis and statistics reporting tool', queries, 5);
        // Should stop before max iterations if already perfect
        // NOTE(review): this bound equals the cap passed to optimize() (assuming
        // the 4th argument is the maximum iteration count), so it also holds
        // without early stopping. Tighten it once the expected candidate count
        // for a perfect first score is confirmed.
        expect(result.candidates.length).toBeLessThanOrEqual(5);
    });
});
|
|
539
|
+
//# sourceMappingURL=eval.test.js.map
|