@su-record/vibe 2.7.18 → 2.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +37 -37
- package/CLAUDE.md +153 -153
- package/LICENSE +21 -21
- package/README.md +451 -449
- package/agents/architect-low.md +41 -41
- package/agents/architect-medium.md +59 -59
- package/agents/architect.md +80 -80
- package/agents/build-error-resolver.md +115 -115
- package/agents/compounder.md +261 -261
- package/agents/diagrammer.md +178 -178
- package/agents/docs/api-documenter.md +99 -99
- package/agents/docs/changelog-writer.md +93 -93
- package/agents/e2e-tester.md +294 -294
- package/agents/event/event-comms.md +78 -0
- package/agents/event/event-content.md +68 -0
- package/agents/event/event-image.md +95 -0
- package/agents/event/event-ops.md +84 -0
- package/agents/event/event-scheduler.md +69 -0
- package/agents/event/event-speaker.md +86 -0
- package/agents/explorer-low.md +42 -42
- package/agents/explorer-medium.md +59 -59
- package/agents/explorer.md +48 -48
- package/agents/implementer-low.md +43 -43
- package/agents/implementer-medium.md +52 -52
- package/agents/implementer.md +54 -54
- package/agents/junior-mentor.md +141 -141
- package/agents/planning/requirements-analyst.md +84 -84
- package/agents/planning/ux-advisor.md +83 -83
- package/agents/qa/acceptance-tester.md +86 -86
- package/agents/qa/edge-case-finder.md +93 -93
- package/agents/refactor-cleaner.md +143 -143
- package/agents/research/best-practices-agent.md +199 -199
- package/agents/research/codebase-patterns-agent.md +157 -157
- package/agents/research/framework-docs-agent.md +188 -188
- package/agents/research/security-advisory-agent.md +213 -213
- package/agents/review/architecture-reviewer.md +107 -107
- package/agents/review/complexity-reviewer.md +116 -116
- package/agents/review/data-integrity-reviewer.md +88 -88
- package/agents/review/git-history-reviewer.md +103 -103
- package/agents/review/performance-reviewer.md +86 -86
- package/agents/review/python-reviewer.md +150 -150
- package/agents/review/rails-reviewer.md +139 -139
- package/agents/review/react-reviewer.md +144 -144
- package/agents/review/security-reviewer.md +80 -80
- package/agents/review/simplicity-reviewer.md +140 -140
- package/agents/review/test-coverage-reviewer.md +116 -116
- package/agents/review/typescript-reviewer.md +127 -127
- package/agents/searcher.md +54 -54
- package/agents/simplifier.md +120 -120
- package/agents/tester.md +49 -49
- package/agents/ui/ui-a11y-auditor.md +93 -93
- package/agents/ui/ui-antipattern-detector.md +94 -94
- package/agents/ui/ui-dataviz-advisor.md +69 -69
- package/agents/ui/ui-design-system-gen.md +57 -57
- package/agents/ui/ui-industry-analyzer.md +49 -49
- package/agents/ui/ui-layout-architect.md +65 -65
- package/agents/ui/ui-stack-implementer.md +68 -68
- package/agents/ui/ux-compliance-reviewer.md +81 -81
- package/agents/ui-previewer.md +258 -258
- package/commands/vibe.analyze.md +379 -379
- package/commands/vibe.event.md +163 -0
- package/commands/vibe.review.md +607 -607
- package/commands/vibe.run.md +2217 -2124
- package/commands/vibe.spec.md +1195 -1195
- package/commands/vibe.spec.review.md +569 -569
- package/commands/vibe.trace.md +50 -0
- package/commands/vibe.utils.md +413 -413
- package/commands/vibe.verify.md +484 -484
- package/dist/__tests__/architecture.test.d.ts +2 -0
- package/dist/__tests__/architecture.test.d.ts.map +1 -0
- package/dist/__tests__/architecture.test.js +207 -0
- package/dist/__tests__/architecture.test.js.map +1 -0
- package/dist/cli/auth.js +3 -3
- package/dist/cli/auth.js.map +1 -1
- package/dist/cli/collaborator.js +52 -52
- package/dist/cli/commands/evolution.js +12 -12
- package/dist/cli/commands/info.d.ts.map +1 -1
- package/dist/cli/commands/info.js +45 -81
- package/dist/cli/commands/info.js.map +1 -1
- package/dist/cli/commands/init.js +5 -5
- package/dist/cli/commands/remove.js +14 -14
- package/dist/cli/commands/sentinel.js +27 -27
- package/dist/cli/commands/skills.js +5 -5
- package/dist/cli/commands/slack.js +10 -10
- package/dist/cli/commands/telegram.js +12 -12
- package/dist/cli/detect.d.ts.map +1 -1
- package/dist/cli/detect.js +55 -32
- package/dist/cli/detect.js.map +1 -1
- package/dist/cli/index.d.ts +1 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +52 -52
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/llm/claude-commands.js +16 -16
- package/dist/cli/llm/config.js +18 -18
- package/dist/cli/llm/gemini-commands.js +16 -16
- package/dist/cli/llm/gpt-commands.js +19 -19
- package/dist/cli/llm/help.js +21 -21
- package/dist/cli/postinstall/constants.d.ts.map +1 -1
- package/dist/cli/postinstall/constants.js +24 -0
- package/dist/cli/postinstall/constants.js.map +1 -1
- package/dist/cli/postinstall/cursor-agents.js +32 -32
- package/dist/cli/postinstall/cursor-rules.js +83 -83
- package/dist/cli/postinstall/cursor-skills.js +743 -743
- package/dist/cli/setup/Provisioner.js +42 -42
- package/dist/infra/lib/AutomationLevel.d.ts +48 -0
- package/dist/infra/lib/AutomationLevel.d.ts.map +1 -0
- package/dist/infra/lib/AutomationLevel.js +157 -0
- package/dist/infra/lib/AutomationLevel.js.map +1 -0
- package/dist/infra/lib/DecisionTracer.d.ts +81 -0
- package/dist/infra/lib/DecisionTracer.d.ts.map +1 -0
- package/dist/infra/lib/DecisionTracer.js +135 -0
- package/dist/infra/lib/DecisionTracer.js.map +1 -0
- package/dist/infra/lib/DeepInit.js +24 -24
- package/dist/infra/lib/InteractiveCheckpoint.d.ts +75 -0
- package/dist/infra/lib/InteractiveCheckpoint.d.ts.map +1 -0
- package/dist/infra/lib/InteractiveCheckpoint.js +179 -0
- package/dist/infra/lib/InteractiveCheckpoint.js.map +1 -0
- package/dist/infra/lib/IterationTracker.d.ts +44 -0
- package/dist/infra/lib/IterationTracker.d.ts.map +1 -1
- package/dist/infra/lib/IterationTracker.js +267 -12
- package/dist/infra/lib/IterationTracker.js.map +1 -1
- package/dist/infra/lib/LoopBreaker.d.ts +56 -0
- package/dist/infra/lib/LoopBreaker.d.ts.map +1 -0
- package/dist/infra/lib/LoopBreaker.js +109 -0
- package/dist/infra/lib/LoopBreaker.js.map +1 -0
- package/dist/infra/lib/PythonParser.js +108 -108
- package/dist/infra/lib/ReviewRace.js +96 -96
- package/dist/infra/lib/SkillFrontmatter.js +28 -28
- package/dist/infra/lib/SkillQualityGate.js +9 -9
- package/dist/infra/lib/SkillRepository.js +159 -159
- package/dist/infra/lib/UltraQA.js +99 -99
- package/dist/infra/lib/VerificationLoop.d.ts +105 -0
- package/dist/infra/lib/VerificationLoop.d.ts.map +1 -0
- package/dist/infra/lib/VerificationLoop.js +189 -0
- package/dist/infra/lib/VerificationLoop.js.map +1 -0
- package/dist/infra/lib/__tests__/AutomationLevel.test.d.ts +2 -0
- package/dist/infra/lib/__tests__/AutomationLevel.test.d.ts.map +1 -0
- package/dist/infra/lib/__tests__/AutomationLevel.test.js +297 -0
- package/dist/infra/lib/__tests__/AutomationLevel.test.js.map +1 -0
- package/dist/infra/lib/__tests__/DecisionTracer.test.d.ts +2 -0
- package/dist/infra/lib/__tests__/DecisionTracer.test.d.ts.map +1 -0
- package/dist/infra/lib/__tests__/DecisionTracer.test.js +274 -0
- package/dist/infra/lib/__tests__/DecisionTracer.test.js.map +1 -0
- package/dist/infra/lib/__tests__/InteractiveCheckpoint.test.d.ts +2 -0
- package/dist/infra/lib/__tests__/InteractiveCheckpoint.test.d.ts.map +1 -0
- package/dist/infra/lib/__tests__/InteractiveCheckpoint.test.js +350 -0
- package/dist/infra/lib/__tests__/InteractiveCheckpoint.test.js.map +1 -0
- package/dist/infra/lib/__tests__/LoopBreaker.test.d.ts +2 -0
- package/dist/infra/lib/__tests__/LoopBreaker.test.d.ts.map +1 -0
- package/dist/infra/lib/__tests__/LoopBreaker.test.js +340 -0
- package/dist/infra/lib/__tests__/LoopBreaker.test.js.map +1 -0
- package/dist/infra/lib/__tests__/VerificationLoop.test.d.ts +2 -0
- package/dist/infra/lib/__tests__/VerificationLoop.test.d.ts.map +1 -0
- package/dist/infra/lib/__tests__/VerificationLoop.test.js +486 -0
- package/dist/infra/lib/__tests__/VerificationLoop.test.js.map +1 -0
- package/dist/infra/lib/autonomy/AuditStore.js +41 -41
- package/dist/infra/lib/autonomy/ConfirmationStore.js +30 -30
- package/dist/infra/lib/autonomy/EventOutbox.js +38 -38
- package/dist/infra/lib/autonomy/PolicyEngine.d.ts +3 -3
- package/dist/infra/lib/autonomy/PolicyEngine.js +18 -18
- package/dist/infra/lib/autonomy/SecuritySentinel.js +1 -1
- package/dist/infra/lib/autonomy/SuggestionStore.js +33 -33
- package/dist/infra/lib/embedding/VectorStore.js +22 -22
- package/dist/infra/lib/embedding/__tests__/EmbeddingProvider.test.js +4 -0
- package/dist/infra/lib/embedding/__tests__/EmbeddingProvider.test.js.map +1 -1
- package/dist/infra/lib/evolution/AgentAnalyzer.js +10 -10
- package/dist/infra/lib/evolution/DeprecationDetector.d.ts +68 -0
- package/dist/infra/lib/evolution/DeprecationDetector.d.ts.map +1 -0
- package/dist/infra/lib/evolution/DeprecationDetector.js +207 -0
- package/dist/infra/lib/evolution/DeprecationDetector.js.map +1 -0
- package/dist/infra/lib/evolution/DescriptionOptimizer.js +21 -21
- package/dist/infra/lib/evolution/GenerationRegistry.js +36 -36
- package/dist/infra/lib/evolution/InsightStore.js +90 -90
- package/dist/infra/lib/evolution/ParityTester.d.ts +74 -0
- package/dist/infra/lib/evolution/ParityTester.d.ts.map +1 -0
- package/dist/infra/lib/evolution/ParityTester.js +238 -0
- package/dist/infra/lib/evolution/ParityTester.js.map +1 -0
- package/dist/infra/lib/evolution/RollbackManager.js +5 -5
- package/dist/infra/lib/evolution/SkillBenchmark.js +23 -23
- package/dist/infra/lib/evolution/SkillEvalRunner.js +50 -50
- package/dist/infra/lib/evolution/SkillGapDetector.js +10 -10
- package/dist/infra/lib/evolution/UsageTracker.js +28 -28
- package/dist/infra/lib/evolution/__tests__/deprecation.test.d.ts +2 -0
- package/dist/infra/lib/evolution/__tests__/deprecation.test.d.ts.map +1 -0
- package/dist/infra/lib/evolution/__tests__/deprecation.test.js +251 -0
- package/dist/infra/lib/evolution/__tests__/deprecation.test.js.map +1 -0
- package/dist/infra/lib/evolution/__tests__/parity.test.d.ts +2 -0
- package/dist/infra/lib/evolution/__tests__/parity.test.d.ts.map +1 -0
- package/dist/infra/lib/evolution/__tests__/parity.test.js +319 -0
- package/dist/infra/lib/evolution/__tests__/parity.test.js.map +1 -0
- package/dist/infra/lib/evolution/index.d.ts +4 -0
- package/dist/infra/lib/evolution/index.d.ts.map +1 -1
- package/dist/infra/lib/evolution/index.js +3 -0
- package/dist/infra/lib/evolution/index.js.map +1 -1
- package/dist/infra/lib/gemini/orchestration.js +5 -5
- package/dist/infra/lib/gpt/orchestration.js +4 -4
- package/dist/infra/lib/gpt/specializations.d.ts +1 -1
- package/dist/infra/lib/gpt/specializations.js +1 -1
- package/dist/infra/lib/memory/KnowledgeGraph.js +4 -4
- package/dist/infra/lib/memory/MemorySearch.js +57 -57
- package/dist/infra/lib/memory/MemoryStorage.js +181 -181
- package/dist/infra/lib/memory/ObservationStore.js +28 -28
- package/dist/infra/lib/memory/ReflectionStore.js +30 -30
- package/dist/infra/lib/memory/SessionRAGRetriever.js +7 -7
- package/dist/infra/lib/memory/SessionRAGStore.js +225 -225
- package/dist/infra/lib/memory/SessionSummarizer.js +9 -9
- package/dist/infra/lib/telemetry/SkillTelemetry.d.ts +6 -0
- package/dist/infra/lib/telemetry/SkillTelemetry.d.ts.map +1 -1
- package/dist/infra/lib/telemetry/SkillTelemetry.js +11 -0
- package/dist/infra/lib/telemetry/SkillTelemetry.js.map +1 -1
- package/dist/infra/orchestrator/AgentManager.js +12 -12
- package/dist/infra/orchestrator/AgentRegistry.js +65 -65
- package/dist/infra/orchestrator/BackgroundManager.d.ts.map +1 -1
- package/dist/infra/orchestrator/BackgroundManager.js +2 -0
- package/dist/infra/orchestrator/BackgroundManager.js.map +1 -1
- package/dist/infra/orchestrator/MultiLlmResearch.js +8 -8
- package/dist/infra/orchestrator/PhasePipeline.js +1 -1
- package/dist/infra/orchestrator/PhasePipeline.js.map +1 -1
- package/dist/infra/orchestrator/SwarmOrchestrator.test.js +16 -16
- package/dist/infra/orchestrator/parallelResearch.js +24 -24
- package/dist/tools/convention/analyzeComplexity.test.js +115 -115
- package/dist/tools/convention/validateCodeQuality.test.js +104 -104
- package/dist/tools/index.d.ts +16 -19
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +15 -27
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/memory/createMemoryTimeline.js +10 -10
- package/dist/tools/memory/getMemoryGraph.js +12 -12
- package/dist/tools/memory/getSessionContext.js +9 -9
- package/dist/tools/memory/linkMemories.js +14 -14
- package/dist/tools/memory/listMemories.js +4 -4
- package/dist/tools/memory/recallMemory.js +4 -4
- package/dist/tools/memory/saveMemory.js +4 -4
- package/dist/tools/memory/searchMemoriesAdvanced.js +23 -23
- package/dist/tools/memory/startSession.js +1 -1
- package/dist/tools/memory/startSession.js.map +1 -1
- package/dist/tools/semantic/analyzeDependencyGraph.js +12 -12
- package/dist/tools/semantic/astGrep.test.js +6 -6
- package/dist/tools/spec/index.d.ts +0 -4
- package/dist/tools/spec/index.d.ts.map +1 -1
- package/dist/tools/spec/index.js +0 -4
- package/dist/tools/spec/index.js.map +1 -1
- package/dist/tools/spec/prdParser.test.js +171 -171
- package/dist/tools/spec/specGenerator.js +169 -169
- package/dist/tools/spec/traceabilityMatrix.js +64 -64
- package/dist/tools/spec/traceabilityMatrix.test.js +28 -28
- package/hooks/gemini-hooks.json +73 -73
- package/hooks/hooks.json +137 -137
- package/hooks/scripts/code-check.js +77 -77
- package/hooks/scripts/context-save.js +212 -212
- package/hooks/scripts/evolution-engine.js +69 -0
- package/hooks/scripts/hud-status.js +291 -291
- package/hooks/scripts/keyword-detector.js +214 -214
- package/hooks/scripts/llm-orchestrate.js +475 -475
- package/hooks/scripts/post-edit.js +32 -32
- package/hooks/scripts/pre-tool-guard.js +125 -125
- package/hooks/scripts/prompt-dispatcher.js +185 -185
- package/hooks/scripts/sentinel-guard.js +104 -104
- package/hooks/scripts/session-start.js +106 -106
- package/hooks/scripts/skill-injector.js +83 -0
- package/hooks/scripts/stop-notify.js +209 -209
- package/hooks/scripts/utils.js +100 -100
- package/languages/csharp-unity.md +515 -515
- package/languages/gdscript-godot.md +470 -470
- package/languages/ruby-rails.md +489 -489
- package/languages/typescript-angular.md +433 -433
- package/languages/typescript-astro.md +416 -416
- package/languages/typescript-electron.md +406 -406
- package/languages/typescript-nestjs.md +524 -524
- package/languages/typescript-svelte.md +407 -407
- package/languages/typescript-tauri.md +365 -365
- package/package.json +101 -123
- package/skills/agents-md/SKILL.md +120 -120
- package/skills/arch-guard/SKILL.md +180 -180
- package/skills/brand-assets/SKILL.md +146 -146
- package/skills/capability-loop/SKILL.md +167 -167
- package/skills/characterization-test/SKILL.md +206 -206
- package/skills/commerce-patterns/SKILL.md +63 -63
- package/skills/commit-push-pr/SKILL.md +75 -75
- package/skills/context7-usage/SKILL.md +105 -105
- package/skills/core-capabilities/SKILL.md +13 -13
- package/skills/e2e-commerce/SKILL.md +61 -61
- package/skills/event-comms/SKILL.md +161 -0
- package/skills/event-ops/SKILL.md +197 -0
- package/skills/event-planning/SKILL.md +131 -0
- package/skills/exec-plan/SKILL.md +147 -147
- package/skills/frontend-design/SKILL.md +12 -12
- package/skills/git-worktree/SKILL.md +72 -72
- package/skills/handoff/SKILL.md +109 -109
- package/skills/parallel-research/SKILL.md +87 -87
- package/skills/priority-todos/SKILL.md +63 -63
- package/skills/seo-checklist/SKILL.md +57 -57
- package/skills/techdebt/SKILL.md +122 -122
- package/skills/tool-fallback/SKILL.md +103 -103
- package/skills/typescript-advanced-types/SKILL.md +66 -66
- package/skills/ui-ux-pro-max/SKILL.md +221 -221
- package/skills/vercel-react-best-practices/SKILL.md +59 -59
- package/skills/video-production/SKILL.md +51 -51
- package/vibe/config.json +29 -29
- package/vibe/constitution.md +227 -227
- package/vibe/rules/principles/communication-guide.md +98 -98
- package/vibe/rules/principles/development-philosophy.md +52 -52
- package/vibe/rules/principles/quick-start.md +102 -102
- package/vibe/rules/quality/bdd-contract-testing.md +393 -393
- package/vibe/rules/quality/checklist.md +276 -276
- package/vibe/rules/quality/performance.md +236 -236
- package/vibe/rules/quality/testing-strategy.md +440 -440
- package/vibe/rules/standards/anti-patterns.md +541 -541
- package/vibe/rules/standards/code-structure.md +291 -291
- package/vibe/rules/standards/complexity-metrics.md +313 -313
- package/vibe/rules/standards/git-workflow.md +237 -237
- package/vibe/rules/standards/naming-conventions.md +198 -198
- package/vibe/rules/standards/security.md +305 -305
- package/vibe/rules/writing/document-style.md +74 -74
- package/vibe/setup.sh +31 -31
- package/vibe/templates/constitution-template.md +252 -252
- package/vibe/templates/contract-backend-template.md +526 -526
- package/vibe/templates/contract-frontend-template.md +599 -599
- package/vibe/templates/feature-template.md +96 -96
- package/vibe/templates/spec-template.md +221 -221
- package/vibe/ui-ux-data/charts.csv +26 -26
- package/vibe/ui-ux-data/colors.csv +97 -97
- package/vibe/ui-ux-data/icons.csv +101 -101
- package/vibe/ui-ux-data/landing.csv +31 -31
- package/vibe/ui-ux-data/products.csv +96 -96
- package/vibe/ui-ux-data/react-performance.csv +45 -45
- package/vibe/ui-ux-data/stacks/astro.csv +54 -54
- package/vibe/ui-ux-data/stacks/flutter.csv +53 -53
- package/vibe/ui-ux-data/stacks/html-tailwind.csv +56 -56
- package/vibe/ui-ux-data/stacks/jetpack-compose.csv +53 -53
- package/vibe/ui-ux-data/stacks/nextjs.csv +53 -53
- package/vibe/ui-ux-data/stacks/nuxt-ui.csv +51 -51
- package/vibe/ui-ux-data/stacks/nuxtjs.csv +59 -59
- package/vibe/ui-ux-data/stacks/react-native.csv +52 -52
- package/vibe/ui-ux-data/stacks/react.csv +54 -54
- package/vibe/ui-ux-data/stacks/shadcn.csv +61 -61
- package/vibe/ui-ux-data/stacks/svelte.csv +54 -54
- package/vibe/ui-ux-data/stacks/swiftui.csv +51 -51
- package/vibe/ui-ux-data/stacks/vue.csv +50 -50
- package/vibe/ui-ux-data/styles.csv +68 -68
- package/vibe/ui-ux-data/typography.csv +57 -57
- package/vibe/ui-ux-data/ui-reasoning.csv +101 -101
- package/vibe/ui-ux-data/ux-guidelines.csv +99 -99
- package/vibe/ui-ux-data/version.json +31 -31
- package/vibe/ui-ux-data/web-interface.csv +31 -31
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { MemoryStorage } from '../memory/MemoryStorage.js';
|
|
2
|
+
/** A model version registered for parity testing. */
export interface ModelVersion {
    /** Identifier of the model version. */
    id: string;
    /** Display name of the model version. */
    name: string;
    /** Registration time, as a string timestamp. */
    registeredAt: string;
}
|
|
7
|
+
/** Outcome of comparing an old and a new model's baselines for one skill. */
export interface ParityTestResult {
    /** Identifier of this parity test run. */
    id: string;
    /** Skill the test was run for. */
    skillName: string;
    /** Identifier of the older model. */
    oldModel: string;
    /** Identifier of the newer model. */
    newModel: string;
    /** Pass rate of the old model's baseline. */
    oldBaselinePassRate: number;
    /** Pass rate of the new model's baseline (skill not applied). */
    newBaselinePassRate: number;
    /** Pass rate with the skill applied; used as the reference value. */
    withSkillPassRate: number;
    /** Closeness of the new baseline to the with-skill rate, from 0 to 1 (1 = identical). */
    parityScore: number;
    /** True when the skill appears to be becoming obsolete. */
    obsoleteCandidate: boolean;
    /** Per-eval breakdown of the comparison. */
    evalComparisons: Array<EvalComparison>;
    /** Time the test result was produced, as a string timestamp. */
    timestamp: string;
}
|
|
26
|
+
/** Pass/fail status of a single eval case across the three compared runs. */
export interface EvalComparison {
    /** Identifier of the eval case. */
    evalId: string;
    /** Prompt text associated with the eval case. */
    prompt: string;
    /** Whether the old model's baseline passed this eval. */
    oldBaselinePassed: boolean;
    /** Whether the new model's baseline passed this eval. */
    newBaselinePassed: boolean;
    /** Whether the with-skill run passed this eval. */
    withSkillPassed: boolean;
    /** True when the new model's baseline improved over the old one. */
    improved: boolean;
}
|
|
35
|
+
/** Tuning constants for parity testing; every field is a read-only literal. */
export declare const PARITY_THRESHOLDS: {
    /** A new baseline at or above this fraction of the with-skill rate marks an obsolete candidate. */
    readonly OBSOLESCENCE_RATIO: 0.85;
    /** Smallest baseline improvement that counts as significant. */
    readonly MIN_IMPROVEMENT: 0.1;
    /** Fewest eval cases needed for a reliable parity test. */
    readonly MIN_EVAL_CASES: 3;
};
|
|
43
|
+
/**
 * Records per-model eval baselines for skills and compares them across model
 * versions to flag skills that may have become obsolete.
 */
export declare class ParityTester {
    private db;
    constructor(storage: MemoryStorage);
    private initializeTables;
    /** Registers a model version and returns its record. */
    registerModel(id: string, name: string): ModelVersion;
    /** Returns every registered model version. */
    getModels(): Array<ModelVersion>;
    /** Stores baseline eval results of one model for one skill. */
    recordModelBaseline(skillName: string, modelId: string, evalResults: {
        evalId: string;
        passed: boolean;
        output: string;
        durationMs: number;
        tokenCount: number;
        prompt?: string;
    }[]): void;
    /**
     * Runs a parity test from baseline data already stored in model_baseline_results.
     * The with-skill reference rows are read from the same table, stored under
     * the model id 'with_skill'.
     */
    runParityTest(skillName: string, oldModel: string, newModel: string): ParityTestResult;
    /** Returns the parity test history of a skill. */
    getHistory(skillName: string): Array<ParityTestResult>;
    /** Returns the most recent parity test of a skill, or null when there is none. */
    getLatest(skillName: string): null | ParityTestResult;
    /** Renders a parity test result as a markdown report. */
    formatReport(result: ParityTestResult): string;
    private getBaselineRows;
    private buildComparisons;
}
|
|
74
|
+
//# sourceMappingURL=ParityTester.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ParityTester.d.ts","sourceRoot":"","sources":["../../../../src/infra/lib/evolution/ParityTester.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAE3D,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,gBAAgB;IAC/B,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,qCAAqC;IACrC,mBAAmB,EAAE,MAAM,CAAC;IAC5B,qDAAqD;IACrD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,uCAAuC;IACvC,iBAAiB,EAAE,MAAM,CAAC;IAC1B,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAC;IACpB,6CAA6C;IAC7C,iBAAiB,EAAE,OAAO,CAAC;IAC3B,mCAAmC;IACnC,eAAe,EAAE,cAAc,EAAE,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,OAAO,CAAC;IAC3B,iBAAiB,EAAE,OAAO,CAAC;IAC3B,eAAe,EAAE,OAAO,CAAC;IACzB,+CAA+C;IAC/C,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,eAAO,MAAM,iBAAiB;IAC5B,uEAAuE;;IAEvE,8DAA8D;;IAE9D,kDAAkD;;CAE1C,CAAC;AAqCX,qBAAa,YAAY;IACvB,OAAO,CAAC,EAAE,CAA2C;gBAEzC,OAAO,EAAE,aAAa;IAKlC,OAAO,CAAC,gBAAgB;IA0CxB,+BAA+B;IACxB,aAAa,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,YAAY;IAS5D,gCAAgC;IACzB,SAAS,IAAI,YAAY,EAAE;IAOlC,wDAAwD;IACjD,mBAAmB,CACxB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,EACf,WAAW,EAAE,KAAK,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,GAC/H,IAAI;IAiBP;;;OAGG;IACI,aAAa,CAClB,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,MAAM,GACf,gBAAgB;IAqCnB,0CAA0C;IACnC,UAAU,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,EAAE;IAOxD,6BAA6B;IACtB,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI;IAO5D,4CAA4C;IACrC,YAAY,CAAC,MAAM,EAAE,gBAAgB,GAAG,MAAM;IA8CrD,OAAO,CAAC,eAAe;IAQvB,OAAO,CAAC,gBAAgB;CAkCzB"}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
// ParityTester — Model version parity testing for skill obsolescence detection
|
|
2
|
+
//
|
|
3
|
+
// 모델 업그레이드 시:
|
|
4
|
+
// 1. 기존 eval 케이스로 새 모델의 baseline(스킬 없이) 성능 측정
|
|
5
|
+
// 2. 이전 모델 baseline과 비교
|
|
6
|
+
// 3. 새 모델 baseline이 기존 with-skill 수준에 근접하면 → deprecation 후보
|
|
7
|
+
import { randomUUID } from 'crypto';
|
|
8
|
+
/**
 * Tuning constants for parity testing.
 *
 * Frozen so the shared object cannot be altered at runtime by an accidental
 * assignment; the values themselves are unchanged.
 */
export const PARITY_THRESHOLDS = Object.freeze({
    /** New baseline >= this fraction of with-skill → obsolete candidate */
    OBSOLESCENCE_RATIO: 0.85,
    /** Minimum improvement in baseline to consider significant */
    MIN_IMPROVEMENT: 0.1,
    /** Minimum eval cases for reliable parity test */
    MIN_EVAL_CASES: 3,
});
|
|
16
|
+
/**
 * Model-version parity testing for skill obsolescence detection.
 *
 * Eval results are stored per (skill, model) in `model_baseline_results`.
 * A parity test compares the old model's baseline, the new model's baseline
 * and the with-skill reference (rows stored under the model id 'with_skill'),
 * then persists the outcome in `parity_tests`.
 */
export class ParityTester {
    /** Database handle returned by `storage.getDatabase()`. */
    db;

    /**
     * @param {object} storage - Object whose `getDatabase()` returns the
     *   database handle used for all queries. The tables this class needs are
     *   created on construction if they do not exist yet.
     */
    constructor(storage) {
        this.db = storage.getDatabase();
        this.initializeTables();
    }

    /** Create the tables and indexes used by this class (no-op when present). */
    initializeTables() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS model_versions (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        registeredAt TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS model_baseline_results (
        id TEXT PRIMARY KEY,
        skillName TEXT NOT NULL,
        modelId TEXT NOT NULL,
        evalId TEXT NOT NULL,
        prompt TEXT NOT NULL DEFAULT '',
        passed INTEGER NOT NULL DEFAULT 0,
        output TEXT NOT NULL DEFAULT '',
        durationMs INTEGER NOT NULL DEFAULT 0,
        tokenCount INTEGER NOT NULL DEFAULT 0,
        createdAt TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_mbr_skill_model ON model_baseline_results(skillName, modelId);
      CREATE INDEX IF NOT EXISTS idx_mbr_eval ON model_baseline_results(evalId);

      CREATE TABLE IF NOT EXISTS parity_tests (
        id TEXT PRIMARY KEY,
        skillName TEXT NOT NULL,
        oldModel TEXT NOT NULL,
        newModel TEXT NOT NULL,
        oldBaselinePassRate REAL NOT NULL,
        newBaselinePassRate REAL NOT NULL,
        withSkillPassRate REAL NOT NULL,
        parityScore REAL NOT NULL,
        obsoleteCandidate INTEGER NOT NULL DEFAULT 0,
        evalComparisons TEXT NOT NULL DEFAULT '[]',
        createdAt TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_pt_skill ON parity_tests(skillName);
    `);
    }

    /**
     * Register a model version. Re-registering an existing id replaces the
     * stored row (INSERT OR REPLACE), including its registration time.
     *
     * @param {string} id - Model identifier.
     * @param {string} name - Display name.
     * @returns {{id: string, name: string, registeredAt: string}} The stored record.
     */
    registerModel(id, name) {
        const now = new Date().toISOString();
        this.db.prepare(`
      INSERT OR REPLACE INTO model_versions (id, name, registeredAt)
      VALUES (?, ?, ?)
    `).run(id, name, now);
        return { id, name, registeredAt: now };
    }

    /**
     * Get all registered models, oldest registration first.
     *
     * @returns {Array<{id: string, name: string, registeredAt: string}>}
     */
    getModels() {
        const rows = this.db.prepare(`
      SELECT * FROM model_versions ORDER BY registeredAt ASC
    `).all();
        return rows.map(r => ({ id: r.id, name: r.name, registeredAt: r.registeredAt }));
    }

    /**
     * Record baseline eval results for a specific model. All rows of one call
     * are inserted in a single transaction and share one `createdAt` value.
     *
     * @param {string} skillName - Skill the results belong to.
     * @param {string} modelId - Model the results were produced with.
     * @param {Array<{evalId: string, passed: boolean, output: string, durationMs: number, tokenCount: number, prompt?: string}>} evalResults
     * @returns {void}
     */
    recordModelBaseline(skillName, modelId, evalResults) {
        const now = new Date().toISOString();
        const insertStmt = this.db.prepare(`
      INSERT INTO model_baseline_results (id, skillName, modelId, evalId, prompt, passed, output, durationMs, tokenCount, createdAt)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);
        const insertMany = this.db.transaction(() => {
            for (const r of evalResults) {
                const id = `mbr-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
                // A missing prompt is stored as '', `passed` as 1/0.
                insertStmt.run(id, skillName, modelId, r.evalId, r.prompt ?? '', r.passed ? 1 : 0, r.output, r.durationMs, r.tokenCount, now);
            }
        });
        insertMany();
    }

    /**
     * Run a parity test by reading existing baseline data from model_baseline_results.
     * The with-skill reference data is read from the same table, stored under
     * the model id 'with_skill'.
     *
     * Pass rates are computed over every stored row for the (skill, model)
     * pair, while the per-eval comparison uses the last stored row per evalId.
     * The result is persisted to `parity_tests` before it is returned.
     *
     * @param {string} skillName - Skill to test.
     * @param {string} oldModel - Model id of the previous model.
     * @param {string} newModel - Model id of the new model.
     * @returns {object} The parity test result, including per-eval comparisons.
     */
    runParityTest(skillName, oldModel, newModel) {
        const oldRows = this.getBaselineRows(skillName, oldModel);
        const newRows = this.getBaselineRows(skillName, newModel);
        const withSkillRows = this.getBaselineRows(skillName, 'with_skill');
        const comparisons = this.buildComparisons(oldRows, newRows, withSkillRows);
        const oldBaselinePassRate = computePassRate(oldRows);
        const newBaselinePassRate = computePassRate(newRows);
        const withSkillPassRate = computePassRate(withSkillRows);
        // Ratio of new baseline to with-skill rate, capped at 1. When the
        // with-skill rate is 0 the ratio is undefined: score 1 if the new
        // baseline passes anything, otherwise 0.
        const parityScore = withSkillPassRate > 0
            ? Math.min(1.0, newBaselinePassRate / withSkillPassRate)
            : (newBaselinePassRate > 0 ? 1.0 : 0.0);
        const obsoleteCandidate = parityScore >= PARITY_THRESHOLDS.OBSOLESCENCE_RATIO;
        const id = `parity-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
        const now = new Date().toISOString();
        this.db.prepare(`
      INSERT INTO parity_tests (id, skillName, oldModel, newModel, oldBaselinePassRate, newBaselinePassRate, withSkillPassRate, parityScore, obsoleteCandidate, evalComparisons, createdAt)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `).run(id, skillName, oldModel, newModel, oldBaselinePassRate, newBaselinePassRate, withSkillPassRate, parityScore, obsoleteCandidate ? 1 : 0, JSON.stringify(comparisons), now);
        return {
            id,
            skillName,
            oldModel,
            newModel,
            oldBaselinePassRate,
            newBaselinePassRate,
            withSkillPassRate,
            parityScore,
            obsoleteCandidate,
            evalComparisons: comparisons,
            timestamp: now,
        };
    }

    /**
     * Get parity test history for a skill, oldest first.
     *
     * @param {string} skillName
     * @returns {object[]} Stored parity test results.
     */
    getHistory(skillName) {
        // createdAt only has millisecond resolution; rowid breaks ties so that
        // tests stored within the same millisecond keep their insertion order.
        const rows = this.db.prepare(`
      SELECT * FROM parity_tests WHERE skillName = ? ORDER BY createdAt ASC, rowid ASC
    `).all(skillName);
        return rows.map(rowToParityResult);
    }

    /**
     * Get latest parity test for a skill.
     *
     * @param {string} skillName
     * @returns {object|null} The most recent result, or null when none is stored.
     */
    getLatest(skillName) {
        // rowid tiebreaker: without it, "latest" is arbitrary for tests stored
        // within the same millisecond.
        const row = this.db.prepare(`
      SELECT * FROM parity_tests WHERE skillName = ? ORDER BY createdAt DESC, rowid DESC LIMIT 1
    `).get(skillName);
        return row ? rowToParityResult(row) : null;
    }

    /**
     * Format parity test as markdown report: a summary table, a per-eval table
     * when comparisons exist, and a recommendation section when the skill is
     * an obsolete candidate.
     *
     * @param {object} result - A parity test result.
     * @returns {string} Markdown text.
     */
    formatReport(result) {
        const lines = [
            `# Parity Report: ${result.skillName}`,
            '',
            `**Timestamp**: ${result.timestamp}`,
            `**Old Model**: ${result.oldModel}`,
            `**New Model**: ${result.newModel}`,
            '',
            '## Summary',
            '',
            '| Metric | Value |',
            '|--------|-------|',
            `| Old Baseline Pass Rate | ${pct(result.oldBaselinePassRate)} |`,
            `| New Baseline Pass Rate | ${pct(result.newBaselinePassRate)} |`,
            `| With-Skill Pass Rate | ${pct(result.withSkillPassRate)} |`,
            `| Parity Score | ${result.parityScore.toFixed(3)} |`,
            `| Obsolete Candidate | ${result.obsoleteCandidate ? 'YES' : 'No'} |`,
            '',
        ];
        if (result.evalComparisons.length > 0) {
            lines.push('## Per-Eval Comparison', '');
            lines.push('| Eval ID | Old Baseline | New Baseline | With Skill | Improved |');
            lines.push('|---------|-------------|--------------|------------|----------|');
            for (const c of result.evalComparisons) {
                lines.push(`| ${c.evalId} | ${c.oldBaselinePassed ? 'PASS' : 'FAIL'} | ${c.newBaselinePassed ? 'PASS' : 'FAIL'} | ${c.withSkillPassed ? 'PASS' : 'FAIL'} | ${c.improved ? 'Yes' : 'No'} |`);
            }
            lines.push('');
        }
        if (result.obsoleteCandidate) {
            lines.push('## Recommendation', '', `The new model (${result.newModel}) baseline achieves ${pct(result.newBaselinePassRate)} pass rate,`, `reaching ${pct(result.parityScore)} of the with-skill pass rate (${pct(result.withSkillPassRate)}).`, 'This skill is a **deprecation candidate** — consider retiring it.', '');
        }
        return lines.join('\n');
    }

    /**
     * Load all stored eval rows for a (skill, model) pair, oldest first.
     *
     * @param {string} skillName
     * @param {string} modelId
     * @returns {object[]} Raw `model_baseline_results` rows.
     */
    getBaselineRows(skillName, modelId) {
        // Rows of one recordModelBaseline() call share a createdAt value, so
        // rowid is needed to make the order (and therefore which row counts
        // as "last" per evalId in buildComparisons) deterministic.
        return this.db.prepare(`
      SELECT * FROM model_baseline_results
      WHERE skillName = ? AND modelId = ?
      ORDER BY createdAt ASC, rowid ASC
    `).all(skillName, modelId);
    }

    /**
     * Build one comparison entry per evalId seen in any of the three row sets.
     * When a set holds several rows for the same evalId, the last one wins.
     * An eval missing from a set counts as not passed for that set.
     *
     * @param {object[]} oldRows - Old-model baseline rows.
     * @param {object[]} newRows - New-model baseline rows.
     * @param {object[]} withSkillRows - With-skill reference rows.
     * @returns {Array<{evalId: string, prompt: string, oldBaselinePassed: boolean, newBaselinePassed: boolean, withSkillPassed: boolean, improved: boolean}>}
     */
    buildComparisons(oldRows, newRows, withSkillRows) {
        const allEvalIds = new Set([
            ...oldRows.map(r => r.evalId),
            ...newRows.map(r => r.evalId),
            ...withSkillRows.map(r => r.evalId),
        ]);
        const oldByEval = new Map(oldRows.map(r => [r.evalId, r]));
        const newByEval = new Map(newRows.map(r => [r.evalId, r]));
        const wsById = new Map(withSkillRows.map(r => [r.evalId, r]));
        return Array.from(allEvalIds).map(evalId => {
            const oldRow = oldByEval.get(evalId);
            const newRow = newByEval.get(evalId);
            const wsRow = wsById.get(evalId);
            const oldBaselinePassed = oldRow?.passed === 1;
            const newBaselinePassed = newRow?.passed === 1;
            const withSkillPassed = wsRow?.passed === 1;
            // Stored prompts are never null (a missing prompt is stored as ''),
            // so `||` is required here: an empty prompt must fall through to
            // the next row and finally to the evalId itself.
            const prompt = newRow?.prompt || oldRow?.prompt || wsRow?.prompt || evalId;
            return {
                evalId,
                prompt,
                oldBaselinePassed,
                newBaselinePassed,
                withSkillPassed,
                improved: newBaselinePassed && !oldBaselinePassed,
            };
        });
    }
}
|
|
213
|
+
// --- Utility functions ---
|
|
214
|
+
/**
 * Compute the fraction of rows that passed.
 *
 * A row counts as passed only when its `passed` field is strictly the
 * number 1; any other value (0, true, '1', undefined) does not count.
 *
 * @param {Array<{passed: number}>} rows - Result rows to tally.
 * @returns {number} Passed rows divided by total rows, or 0 for an empty list.
 */
function computePassRate(rows) {
    const total = rows.length;
    if (total === 0) {
        // Avoid 0/0 (NaN) when there is nothing to tally.
        return 0;
    }
    const passCount = rows.reduce((count, row) => (row.passed === 1 ? count + 1 : count), 0);
    return passCount / total;
}
|
|
220
|
+
/**
 * Format a ratio as a percentage string with one decimal place.
 *
 * @param {number} value - Ratio to format (e.g. 0.5 for fifty percent).
 * @returns {string} The value times 100, fixed to one decimal, followed by `%`.
 */
function pct(value) {
    const scaled = value * 100;
    const rounded = scaled.toFixed(1);
    return `${rounded}%`;
}
|
|
223
|
+
/**
 * Convert a stored parity-result row into the result object used by callers.
 *
 * Most fields are copied through unchanged. Three are transformed:
 * `obsoleteCandidate` becomes a boolean (true only when the stored value
 * is strictly 1), `evalComparisons` is parsed from its JSON text, and
 * `createdAt` is exposed as `timestamp`.
 *
 * @param {object} row - Row carrying the fields read below.
 * @returns {object} The parity result.
 * @throws {SyntaxError} If `row.evalComparisons` is not valid JSON.
 */
function rowToParityResult(row) {
    const {
        id,
        skillName,
        oldModel,
        newModel,
        oldBaselinePassRate,
        newBaselinePassRate,
        withSkillPassRate,
        parityScore,
    } = row;
    const obsoleteCandidate = row.obsoleteCandidate === 1;
    const evalComparisons = JSON.parse(row.evalComparisons);
    return {
        id,
        skillName,
        oldModel,
        newModel,
        oldBaselinePassRate,
        newBaselinePassRate,
        withSkillPassRate,
        parityScore,
        obsoleteCandidate,
        evalComparisons,
        timestamp: row.createdAt,
    };
}
|
|
238
|
+
//# sourceMappingURL=ParityTester.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ParityTester.js","sourceRoot":"","sources":["../../../../src/infra/lib/evolution/ParityTester.ts"],"names":[],"mappings":"AAAA,+EAA+E;AAC/E,EAAE;AACF,cAAc;AACd,8CAA8C;AAC9C,wBAAwB;AACxB,4DAA4D;AAE5D,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAuCpC,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,uEAAuE;IACvE,kBAAkB,EAAE,IAAI;IACxB,8DAA8D;IAC9D,eAAe,EAAE,GAAG;IACpB,kDAAkD;IAClD,cAAc,EAAE,CAAC;CACT,CAAC;AAqCX,MAAM,OAAO,YAAY;IACf,EAAE,CAA2C;IAErD,YAAY,OAAsB;QAChC,IAAI,CAAC,EAAE,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;QAChC,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAC1B,CAAC;IAEO,gBAAgB;QACtB,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAsCZ,CAAC,CAAC;IACL,CAAC;IAED,+BAA+B;IACxB,aAAa,CAAC,EAAU,EAAE,IAAY;QAC3C,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;;KAGf,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;QACtB,OAAO,EAAE,EAAE,EAAE,IAAI,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC;IACzC,CAAC;IAED,gCAAgC;IACzB,SAAS;QACd,MAAM,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;KAE5B,CAAC,CAAC,GAAG,EAAuB,CAAC;QAC9B,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,YAAY,EAAE,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC;IACnF,CAAC;IAED,wDAAwD;IACjD,mBAAmB,CACxB,SAAiB,EACjB,OAAe,EACf,WAAgI;QAEhI,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACrC,MAAM,UAAU,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;;KAGlC,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,IAAI,CAAC,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;YAC1C,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;gBAC5B,MAAM,EAAE,GAAG,OAAO,IAAI,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;gBAC1F,UAAU,CAAC,GAAG,CAAC,EAAE,EAAE,SAAS,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;YAChI,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,UAAU,EAAE,CAAC;IACf,CAAC;IAED;;;OAGG;IACI,aAAa,CAClB,SA
AiB,EACjB,QAAgB,EAChB,QAAgB;QAEhB,MAAM,OAAO,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC1D,MAAM,OAAO,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC1D,MAAM,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;QAEpE,MAAM,WAAW,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;QAC3E,MAAM,mBAAmB,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;QACrD,MAAM,mBAAmB,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;QACrD,MAAM,iBAAiB,GAAG,eAAe,CAAC,aAAa,CAAC,CAAC;QACzD,MAAM,WAAW,GAAG,iBAAiB,GAAG,CAAC;YACvC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,mBAAmB,GAAG,iBAAiB,CAAC;YACxD,CAAC,CAAC,CAAC,mBAAmB,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM,iBAAiB,GAAG,WAAW,IAAI,iBAAiB,CAAC,kBAAkB,CAAC;QAE9E,MAAM,EAAE,GAAG,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;QAC7F,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAErC,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;;KAGf,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,iBAAiB,EAAE,WAAW,EAAE,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,EAAE,GAAG,CAAC,CAAC;QAEjL,OAAO;YACL,EAAE;YACF,SAAS;YACT,QAAQ;YACR,QAAQ;YACR,mBAAmB;YACnB,mBAAmB;YACnB,iBAAiB;YACjB,WAAW;YACX,iBAAiB;YACjB,eAAe,EAAE,WAAW;YAC5B,SAAS,EAAE,GAAG;SACf,CAAC;IACJ,CAAC;IAED,0CAA0C;IACnC,UAAU,CAAC,SAAiB;QACjC,MAAM,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;KAE5B,CAAC,CAAC,GAAG,CAAC,SAAS,CAAoB,CAAC;QACrC,OAAO,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC;IACrC,CAAC;IAED,6BAA6B;IACtB,SAAS,CAAC,SAAiB;QAChC,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;KAE3B,CAAC,CAAC,GAAG,CAAC,SAAS,CAA8B,CAAC;QAC/C,OAAO,GAAG,CAAC,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC7C,CAAC;IAED,4CAA4C;IACrC,YAAY,CAAC,MAAwB;QAC1C,MAAM,KAAK,GAAa;YACtB,oBAAoB,MAAM,CAAC,SAAS,EAAE;YACtC,EAAE;YACF,kBAAkB,MAAM,CAAC,SAAS,EAAE;YACpC,kBAAkB,MAAM,CAAC,QAAQ,EAAE;YACnC,kBAAkB,MAAM,CAAC,QAAQ,EAAE;YACnC,EAAE;YACF,YAAY;YACZ,EAAE;YACF,oBAAoB;YACpB,oB
AAoB;YACpB,8BAA8B,GAAG,CAAC,MAAM,CAAC,mBAAmB,CAAC,IAAI;YACjE,8BAA8B,GAAG,CAAC,MAAM,CAAC,mBAAmB,CAAC,IAAI;YACjE,4BAA4B,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC,IAAI;YAC7D,oBAAoB,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YACrD,0BAA0B,MAAM,CAAC,iBAAiB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,IAAI;YACrE,EAAE;SACH,CAAC;QAEF,IAAI,MAAM,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtC,KAAK,CAAC,IAAI,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;YACzC,KAAK,CAAC,IAAI,CAAC,mEAAmE,CAAC,CAAC;YAChF,KAAK,CAAC,IAAI,CAAC,kEAAkE,CAAC,CAAC;YAC/E,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;gBACvC,KAAK,CAAC,IAAI,CACR,KAAK,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,IAAI,CAChL,CAAC;YACJ,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjB,CAAC;QAED,IAAI,MAAM,CAAC,iBAAiB,EAAE,CAAC;YAC7B,KAAK,CAAC,IAAI,CACR,mBAAmB,EACnB,EAAE,EACF,kBAAkB,MAAM,CAAC,QAAQ,uBAAuB,GAAG,CAAC,MAAM,CAAC,mBAAmB,CAAC,aAAa,EACpG,YAAY,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,iCAAiC,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC,IAAI,EACrG,mEAAmE,EACnE,EAAE,CACH,CAAC;QACJ,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAEO,eAAe,CAAC,SAAiB,EAAE,OAAe;QACxD,OAAO,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC;;;;KAItB,CAAC,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAuB,CAAC;IACnD,CAAC;IAEO,gBAAgB,CACtB,OAA2B,EAC3B,OAA2B,EAC3B,aAAiC;QAEjC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;YACzB,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YAC7B,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YAC7B,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;SACpC,CAAC,CAAC;QAEH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,aAA
a,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAE9D,OAAO,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE;YACzC,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACrC,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACjC,MAAM,iBAAiB,GAAG,MAAM,EAAE,MAAM,KAAK,CAAC,CAAC;YAC/C,MAAM,iBAAiB,GAAG,MAAM,EAAE,MAAM,KAAK,CAAC,CAAC;YAC/C,MAAM,eAAe,GAAG,KAAK,EAAE,MAAM,KAAK,CAAC,CAAC;YAC5C,MAAM,MAAM,GAAG,MAAM,EAAE,MAAM,IAAI,MAAM,EAAE,MAAM,IAAI,KAAK,EAAE,MAAM,IAAI,MAAM,CAAC;YAE3E,OAAO;gBACL,MAAM;gBACN,MAAM;gBACN,iBAAiB;gBACjB,iBAAiB;gBACjB,eAAe;gBACf,QAAQ,EAAE,iBAAiB,IAAI,CAAC,iBAAiB;aAClD,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAED,4BAA4B;AAE5B,SAAS,eAAe,CAAC,IAAwB;IAC/C,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;IACvD,OAAO,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;AAC9B,CAAC;AAED,SAAS,GAAG,CAAC,KAAa;IACxB,OAAO,GAAG,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;AACxC,CAAC;AAED,SAAS,iBAAiB,CAAC,GAAkB;IAC3C,OAAO;QACL,EAAE,EAAE,GAAG,CAAC,EAAE;QACV,SAAS,EAAE,GAAG,CAAC,SAAS;QACxB,QAAQ,EAAE,GAAG,CAAC,QAAQ;QACtB,QAAQ,EAAE,GAAG,CAAC,QAAQ;QACtB,mBAAmB,EAAE,GAAG,CAAC,mBAAmB;QAC5C,mBAAmB,EAAE,GAAG,CAAC,mBAAmB;QAC5C,iBAAiB,EAAE,GAAG,CAAC,iBAAiB;QACxC,WAAW,EAAE,GAAG,CAAC,WAAW;QAC5B,iBAAiB,EAAE,GAAG,CAAC,iBAAiB,KAAK,CAAC;QAC9C,eAAe,EAAE,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,eAAe,CAAC;QAChD,SAAS,EAAE,GAAG,CAAC,SAAS;KACzB,CAAC;AACJ,CAAC"}
|
|
@@ -78,14 +78,14 @@ export class RollbackManager {
|
|
|
78
78
|
let disabled = 0;
|
|
79
79
|
const transaction = this.db.transaction(() => {
|
|
80
80
|
// Update all non-disabled/non-deleted in DB
|
|
81
|
-
const result = this.db.prepare(`
|
|
82
|
-
UPDATE generations SET status = 'disabled', updatedAt = ?
|
|
83
|
-
WHERE status IN ('draft', 'testing', 'active')
|
|
81
|
+
const result = this.db.prepare(`
|
|
82
|
+
UPDATE generations SET status = 'disabled', updatedAt = ?
|
|
83
|
+
WHERE status IN ('draft', 'testing', 'active')
|
|
84
84
|
`).run(new Date().toISOString());
|
|
85
85
|
disabled = result.changes;
|
|
86
86
|
// Rename all active files
|
|
87
|
-
const activeGens = this.db.prepare(`
|
|
88
|
-
SELECT filePath FROM generations WHERE status = 'disabled' AND filePath IS NOT NULL
|
|
87
|
+
const activeGens = this.db.prepare(`
|
|
88
|
+
SELECT filePath FROM generations WHERE status = 'disabled' AND filePath IS NOT NULL
|
|
89
89
|
`).all();
|
|
90
90
|
for (const gen of activeGens) {
|
|
91
91
|
if (gen.filePath && existsSync(gen.filePath) && !gen.filePath.endsWith('.disabled')) {
|
|
@@ -16,18 +16,18 @@ export class SkillBenchmark {
|
|
|
16
16
|
this.initializeTables();
|
|
17
17
|
}
|
|
18
18
|
initializeTables() {
|
|
19
|
-
this.db.exec(`
|
|
20
|
-
CREATE TABLE IF NOT EXISTS skill_benchmarks (
|
|
21
|
-
id TEXT PRIMARY KEY,
|
|
22
|
-
skillName TEXT NOT NULL,
|
|
23
|
-
iteration INTEGER NOT NULL,
|
|
24
|
-
summary TEXT NOT NULL,
|
|
25
|
-
evalBreakdowns TEXT NOT NULL,
|
|
26
|
-
createdAt TEXT NOT NULL
|
|
27
|
-
);
|
|
28
|
-
|
|
29
|
-
CREATE INDEX IF NOT EXISTS idx_sb_skill ON skill_benchmarks(skillName);
|
|
30
|
-
CREATE INDEX IF NOT EXISTS idx_sb_iter ON skill_benchmarks(skillName, iteration);
|
|
19
|
+
this.db.exec(`
|
|
20
|
+
CREATE TABLE IF NOT EXISTS skill_benchmarks (
|
|
21
|
+
id TEXT PRIMARY KEY,
|
|
22
|
+
skillName TEXT NOT NULL,
|
|
23
|
+
iteration INTEGER NOT NULL,
|
|
24
|
+
summary TEXT NOT NULL,
|
|
25
|
+
evalBreakdowns TEXT NOT NULL,
|
|
26
|
+
createdAt TEXT NOT NULL
|
|
27
|
+
);
|
|
28
|
+
|
|
29
|
+
CREATE INDEX IF NOT EXISTS idx_sb_skill ON skill_benchmarks(skillName);
|
|
30
|
+
CREATE INDEX IF NOT EXISTS idx_sb_iter ON skill_benchmarks(skillName, iteration);
|
|
31
31
|
`);
|
|
32
32
|
}
|
|
33
33
|
/**
|
|
@@ -74,9 +74,9 @@ export class SkillBenchmark {
|
|
|
74
74
|
};
|
|
75
75
|
const id = `bench-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
|
|
76
76
|
const now = new Date().toISOString();
|
|
77
|
-
this.db.prepare(`
|
|
78
|
-
INSERT INTO skill_benchmarks (id, skillName, iteration, summary, evalBreakdowns, createdAt)
|
|
79
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
77
|
+
this.db.prepare(`
|
|
78
|
+
INSERT INTO skill_benchmarks (id, skillName, iteration, summary, evalBreakdowns, createdAt)
|
|
79
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
80
80
|
`).run(id, skillName, iteration, JSON.stringify(summary), JSON.stringify(breakdowns), now);
|
|
81
81
|
return { id, skillName, iteration, timestamp: now, summary, evalBreakdowns: breakdowns };
|
|
82
82
|
}
|
|
@@ -84,8 +84,8 @@ export class SkillBenchmark {
|
|
|
84
84
|
* Get benchmark history for a skill
|
|
85
85
|
*/
|
|
86
86
|
getHistory(skillName) {
|
|
87
|
-
const rows = this.db.prepare(`
|
|
88
|
-
SELECT * FROM skill_benchmarks WHERE skillName = ? ORDER BY iteration ASC
|
|
87
|
+
const rows = this.db.prepare(`
|
|
88
|
+
SELECT * FROM skill_benchmarks WHERE skillName = ? ORDER BY iteration ASC
|
|
89
89
|
`).all(skillName);
|
|
90
90
|
return rows.map(this.rowToBenchmark);
|
|
91
91
|
}
|
|
@@ -93,8 +93,8 @@ export class SkillBenchmark {
|
|
|
93
93
|
* Get the latest benchmark for a skill
|
|
94
94
|
*/
|
|
95
95
|
getLatest(skillName) {
|
|
96
|
-
const row = this.db.prepare(`
|
|
97
|
-
SELECT * FROM skill_benchmarks WHERE skillName = ? ORDER BY iteration DESC LIMIT 1
|
|
96
|
+
const row = this.db.prepare(`
|
|
97
|
+
SELECT * FROM skill_benchmarks WHERE skillName = ? ORDER BY iteration DESC LIMIT 1
|
|
98
98
|
`).get(skillName);
|
|
99
99
|
return row ? this.rowToBenchmark(row) : null;
|
|
100
100
|
}
|
|
@@ -158,14 +158,14 @@ export class SkillBenchmark {
|
|
|
158
158
|
return lines.join('\n');
|
|
159
159
|
}
|
|
160
160
|
getBenchmarkByIteration(skillName, iteration) {
|
|
161
|
-
const row = this.db.prepare(`
|
|
162
|
-
SELECT * FROM skill_benchmarks WHERE skillName = ? AND iteration = ?
|
|
161
|
+
const row = this.db.prepare(`
|
|
162
|
+
SELECT * FROM skill_benchmarks WHERE skillName = ? AND iteration = ?
|
|
163
163
|
`).get(skillName, iteration);
|
|
164
164
|
return row ? this.rowToBenchmark(row) : null;
|
|
165
165
|
}
|
|
166
166
|
getNextIteration(skillName) {
|
|
167
|
-
const row = this.db.prepare(`
|
|
168
|
-
SELECT MAX(iteration) as maxIter FROM skill_benchmarks WHERE skillName = ?
|
|
167
|
+
const row = this.db.prepare(`
|
|
168
|
+
SELECT MAX(iteration) as maxIter FROM skill_benchmarks WHERE skillName = ?
|
|
169
169
|
`).get(skillName);
|
|
170
170
|
return (row.maxIter ?? 0) + 1;
|
|
171
171
|
}
|
|
@@ -14,36 +14,36 @@ export class SkillEvalRunner {
|
|
|
14
14
|
this.initializeTables();
|
|
15
15
|
}
|
|
16
16
|
initializeTables() {
|
|
17
|
-
this.db.exec(`
|
|
18
|
-
CREATE TABLE IF NOT EXISTS skill_eval_cases (
|
|
19
|
-
id TEXT PRIMARY KEY,
|
|
20
|
-
skillName TEXT NOT NULL,
|
|
21
|
-
prompt TEXT NOT NULL,
|
|
22
|
-
expectedOutput TEXT NOT NULL,
|
|
23
|
-
files TEXT DEFAULT '[]',
|
|
24
|
-
assertions TEXT DEFAULT '[]',
|
|
25
|
-
createdAt TEXT NOT NULL,
|
|
26
|
-
updatedAt TEXT NOT NULL
|
|
27
|
-
);
|
|
28
|
-
|
|
29
|
-
CREATE INDEX IF NOT EXISTS idx_sec_skill ON skill_eval_cases(skillName);
|
|
30
|
-
|
|
31
|
-
CREATE TABLE IF NOT EXISTS skill_eval_runs (
|
|
32
|
-
id TEXT PRIMARY KEY,
|
|
33
|
-
evalId TEXT NOT NULL,
|
|
34
|
-
skillName TEXT NOT NULL,
|
|
35
|
-
variant TEXT NOT NULL CHECK(variant IN ('with_skill','baseline')),
|
|
36
|
-
status TEXT NOT NULL DEFAULT 'pending' CHECK(status IN ('pending','running','passed','failed','error')),
|
|
37
|
-
output TEXT DEFAULT '',
|
|
38
|
-
grades TEXT DEFAULT '[]',
|
|
39
|
-
durationMs INTEGER DEFAULT 0,
|
|
40
|
-
tokenCount INTEGER DEFAULT 0,
|
|
41
|
-
createdAt TEXT NOT NULL
|
|
42
|
-
);
|
|
43
|
-
|
|
44
|
-
CREATE INDEX IF NOT EXISTS idx_ser_eval ON skill_eval_runs(evalId);
|
|
45
|
-
CREATE INDEX IF NOT EXISTS idx_ser_skill ON skill_eval_runs(skillName);
|
|
46
|
-
CREATE INDEX IF NOT EXISTS idx_ser_variant ON skill_eval_runs(variant);
|
|
17
|
+
this.db.exec(`
|
|
18
|
+
CREATE TABLE IF NOT EXISTS skill_eval_cases (
|
|
19
|
+
id TEXT PRIMARY KEY,
|
|
20
|
+
skillName TEXT NOT NULL,
|
|
21
|
+
prompt TEXT NOT NULL,
|
|
22
|
+
expectedOutput TEXT NOT NULL,
|
|
23
|
+
files TEXT DEFAULT '[]',
|
|
24
|
+
assertions TEXT DEFAULT '[]',
|
|
25
|
+
createdAt TEXT NOT NULL,
|
|
26
|
+
updatedAt TEXT NOT NULL
|
|
27
|
+
);
|
|
28
|
+
|
|
29
|
+
CREATE INDEX IF NOT EXISTS idx_sec_skill ON skill_eval_cases(skillName);
|
|
30
|
+
|
|
31
|
+
CREATE TABLE IF NOT EXISTS skill_eval_runs (
|
|
32
|
+
id TEXT PRIMARY KEY,
|
|
33
|
+
evalId TEXT NOT NULL,
|
|
34
|
+
skillName TEXT NOT NULL,
|
|
35
|
+
variant TEXT NOT NULL CHECK(variant IN ('with_skill','baseline')),
|
|
36
|
+
status TEXT NOT NULL DEFAULT 'pending' CHECK(status IN ('pending','running','passed','failed','error')),
|
|
37
|
+
output TEXT DEFAULT '',
|
|
38
|
+
grades TEXT DEFAULT '[]',
|
|
39
|
+
durationMs INTEGER DEFAULT 0,
|
|
40
|
+
tokenCount INTEGER DEFAULT 0,
|
|
41
|
+
createdAt TEXT NOT NULL
|
|
42
|
+
);
|
|
43
|
+
|
|
44
|
+
CREATE INDEX IF NOT EXISTS idx_ser_eval ON skill_eval_runs(evalId);
|
|
45
|
+
CREATE INDEX IF NOT EXISTS idx_ser_skill ON skill_eval_runs(skillName);
|
|
46
|
+
CREATE INDEX IF NOT EXISTS idx_ser_variant ON skill_eval_runs(variant);
|
|
47
47
|
`);
|
|
48
48
|
}
|
|
49
49
|
/**
|
|
@@ -52,9 +52,9 @@ export class SkillEvalRunner {
|
|
|
52
52
|
createEvalSet(input) {
|
|
53
53
|
const cases = [];
|
|
54
54
|
const now = new Date().toISOString();
|
|
55
|
-
const insertStmt = this.db.prepare(`
|
|
56
|
-
INSERT INTO skill_eval_cases (id, skillName, prompt, expectedOutput, files, assertions, createdAt, updatedAt)
|
|
57
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
55
|
+
const insertStmt = this.db.prepare(`
|
|
56
|
+
INSERT INTO skill_eval_cases (id, skillName, prompt, expectedOutput, files, assertions, createdAt, updatedAt)
|
|
57
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
58
58
|
`);
|
|
59
59
|
const insertMany = this.db.transaction((evals) => {
|
|
60
60
|
for (const evalCase of evals) {
|
|
@@ -83,8 +83,8 @@ export class SkillEvalRunner {
|
|
|
83
83
|
* Get all eval cases for a skill
|
|
84
84
|
*/
|
|
85
85
|
getEvalCases(skillName) {
|
|
86
|
-
const rows = this.db.prepare(`
|
|
87
|
-
SELECT * FROM skill_eval_cases WHERE skillName = ? ORDER BY createdAt ASC
|
|
86
|
+
const rows = this.db.prepare(`
|
|
87
|
+
SELECT * FROM skill_eval_cases WHERE skillName = ? ORDER BY createdAt ASC
|
|
88
88
|
`).all(skillName);
|
|
89
89
|
return rows.map(this.rowToEvalCase);
|
|
90
90
|
}
|
|
@@ -92,8 +92,8 @@ export class SkillEvalRunner {
|
|
|
92
92
|
* Get a single eval case by ID
|
|
93
93
|
*/
|
|
94
94
|
getEvalCase(evalId) {
|
|
95
|
-
const row = this.db.prepare(`
|
|
96
|
-
SELECT * FROM skill_eval_cases WHERE id = ?
|
|
95
|
+
const row = this.db.prepare(`
|
|
96
|
+
SELECT * FROM skill_eval_cases WHERE id = ?
|
|
97
97
|
`).get(evalId);
|
|
98
98
|
return row ? this.rowToEvalCase(row) : null;
|
|
99
99
|
}
|
|
@@ -103,9 +103,9 @@ export class SkillEvalRunner {
|
|
|
103
103
|
startRun(evalId, skillName, variant) {
|
|
104
104
|
const id = `run-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
|
|
105
105
|
const now = new Date().toISOString();
|
|
106
|
-
this.db.prepare(`
|
|
107
|
-
INSERT INTO skill_eval_runs (id, evalId, skillName, variant, status, createdAt)
|
|
108
|
-
VALUES (?, ?, ?, ?, 'running', ?)
|
|
106
|
+
this.db.prepare(`
|
|
107
|
+
INSERT INTO skill_eval_runs (id, evalId, skillName, variant, status, createdAt)
|
|
108
|
+
VALUES (?, ?, ?, ?, 'running', ?)
|
|
109
109
|
`).run(id, evalId, skillName, variant, now);
|
|
110
110
|
return id;
|
|
111
111
|
}
|
|
@@ -115,18 +115,18 @@ export class SkillEvalRunner {
|
|
|
115
115
|
completeRun(runId, output, grades, durationMs, tokenCount) {
|
|
116
116
|
const allPassed = grades.length === 0 || grades.every(g => g.passed);
|
|
117
117
|
const status = allPassed ? 'passed' : 'failed';
|
|
118
|
-
this.db.prepare(`
|
|
119
|
-
UPDATE skill_eval_runs
|
|
120
|
-
SET status = ?, output = ?, grades = ?, durationMs = ?, tokenCount = ?
|
|
121
|
-
WHERE id = ?
|
|
118
|
+
this.db.prepare(`
|
|
119
|
+
UPDATE skill_eval_runs
|
|
120
|
+
SET status = ?, output = ?, grades = ?, durationMs = ?, tokenCount = ?
|
|
121
|
+
WHERE id = ?
|
|
122
122
|
`).run(status, output, JSON.stringify(grades), durationMs, tokenCount, runId);
|
|
123
123
|
}
|
|
124
124
|
/**
|
|
125
125
|
* Mark a run as errored
|
|
126
126
|
*/
|
|
127
127
|
failRun(runId, errorMessage) {
|
|
128
|
-
this.db.prepare(`
|
|
129
|
-
UPDATE skill_eval_runs SET status = 'error', output = ? WHERE id = ?
|
|
128
|
+
this.db.prepare(`
|
|
129
|
+
UPDATE skill_eval_runs SET status = 'error', output = ? WHERE id = ?
|
|
130
130
|
`).run(errorMessage, runId);
|
|
131
131
|
}
|
|
132
132
|
/**
|
|
@@ -181,8 +181,8 @@ export class SkillEvalRunner {
|
|
|
181
181
|
* Get all runs for an eval case
|
|
182
182
|
*/
|
|
183
183
|
getRunsForEval(evalId) {
|
|
184
|
-
const rows = this.db.prepare(`
|
|
185
|
-
SELECT * FROM skill_eval_runs WHERE evalId = ? ORDER BY createdAt DESC
|
|
184
|
+
const rows = this.db.prepare(`
|
|
185
|
+
SELECT * FROM skill_eval_runs WHERE evalId = ? ORDER BY createdAt DESC
|
|
186
186
|
`).all(evalId);
|
|
187
187
|
return rows.map(this.rowToRunResult);
|
|
188
188
|
}
|
|
@@ -190,8 +190,8 @@ export class SkillEvalRunner {
|
|
|
190
190
|
* Get all runs for a skill
|
|
191
191
|
*/
|
|
192
192
|
getRunsForSkill(skillName) {
|
|
193
|
-
const rows = this.db.prepare(`
|
|
194
|
-
SELECT * FROM skill_eval_runs WHERE skillName = ? ORDER BY createdAt DESC
|
|
193
|
+
const rows = this.db.prepare(`
|
|
194
|
+
SELECT * FROM skill_eval_runs WHERE skillName = ? ORDER BY createdAt DESC
|
|
195
195
|
`).all(skillName);
|
|
196
196
|
return rows.map(this.rowToRunResult);
|
|
197
197
|
}
|
|
@@ -17,9 +17,9 @@ export class SkillGapDetector {
|
|
|
17
17
|
const truncated = prompt.slice(0, 200);
|
|
18
18
|
const normalized = truncated.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
19
19
|
try {
|
|
20
|
-
this.db.prepare(`
|
|
21
|
-
INSERT INTO skill_gaps (id, prompt, normalizedPrompt, sessionId, createdAt)
|
|
22
|
-
VALUES (?, ?, ?, ?, ?)
|
|
20
|
+
this.db.prepare(`
|
|
21
|
+
INSERT INTO skill_gaps (id, prompt, normalizedPrompt, sessionId, createdAt)
|
|
22
|
+
VALUES (?, ?, ?, ?, ?)
|
|
23
23
|
`).run(id, truncated, normalized, sessionId || null, new Date().toISOString());
|
|
24
24
|
}
|
|
25
25
|
catch {
|
|
@@ -33,13 +33,13 @@ export class SkillGapDetector {
|
|
|
33
33
|
const result = { newGaps: [], totalClusters: 0 };
|
|
34
34
|
try {
|
|
35
35
|
// Cluster by normalizedPrompt
|
|
36
|
-
const clusters = this.db.prepare(`
|
|
37
|
-
SELECT normalizedPrompt, COUNT(*) as count, GROUP_CONCAT(prompt, '|||') as prompts
|
|
38
|
-
FROM skill_gaps
|
|
39
|
-
GROUP BY normalizedPrompt
|
|
40
|
-
HAVING count >= 3
|
|
41
|
-
ORDER BY count DESC
|
|
42
|
-
LIMIT ?
|
|
36
|
+
const clusters = this.db.prepare(`
|
|
37
|
+
SELECT normalizedPrompt, COUNT(*) as count, GROUP_CONCAT(prompt, '|||') as prompts
|
|
38
|
+
FROM skill_gaps
|
|
39
|
+
GROUP BY normalizedPrompt
|
|
40
|
+
HAVING count >= 3
|
|
41
|
+
ORDER BY count DESC
|
|
42
|
+
LIMIT ?
|
|
43
43
|
`).all(limit);
|
|
44
44
|
result.totalClusters = clusters.length;
|
|
45
45
|
for (const cluster of clusters) {
|