@su-record/vibe 2.7.10 → 2.7.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. package/.env.example +37 -37
  2. package/CLAUDE.md +126 -222
  3. package/LICENSE +21 -21
  4. package/README.md +580 -580
  5. package/agents/architect-low.md +41 -41
  6. package/agents/architect-medium.md +59 -59
  7. package/agents/architect.md +80 -80
  8. package/agents/build-error-resolver.md +115 -115
  9. package/agents/compounder.md +261 -261
  10. package/agents/diagrammer.md +178 -178
  11. package/agents/docs/api-documenter.md +99 -99
  12. package/agents/docs/changelog-writer.md +93 -93
  13. package/agents/e2e-tester.md +266 -266
  14. package/agents/explorer-low.md +42 -42
  15. package/agents/explorer-medium.md +59 -59
  16. package/agents/explorer.md +48 -48
  17. package/agents/implementer-low.md +43 -43
  18. package/agents/implementer-medium.md +52 -52
  19. package/agents/implementer.md +54 -54
  20. package/agents/junior-mentor.md +141 -141
  21. package/agents/planning/requirements-analyst.md +84 -84
  22. package/agents/planning/ux-advisor.md +83 -83
  23. package/agents/qa/acceptance-tester.md +86 -86
  24. package/agents/qa/edge-case-finder.md +93 -93
  25. package/agents/refactor-cleaner.md +143 -143
  26. package/agents/research/best-practices-agent.md +199 -199
  27. package/agents/research/codebase-patterns-agent.md +157 -157
  28. package/agents/research/framework-docs-agent.md +188 -188
  29. package/agents/research/security-advisory-agent.md +213 -213
  30. package/agents/review/architecture-reviewer.md +107 -107
  31. package/agents/review/complexity-reviewer.md +116 -116
  32. package/agents/review/data-integrity-reviewer.md +88 -88
  33. package/agents/review/git-history-reviewer.md +103 -103
  34. package/agents/review/performance-reviewer.md +86 -86
  35. package/agents/review/python-reviewer.md +150 -150
  36. package/agents/review/rails-reviewer.md +139 -139
  37. package/agents/review/react-reviewer.md +144 -144
  38. package/agents/review/security-reviewer.md +80 -80
  39. package/agents/review/simplicity-reviewer.md +140 -140
  40. package/agents/review/test-coverage-reviewer.md +116 -116
  41. package/agents/review/typescript-reviewer.md +127 -127
  42. package/agents/searcher.md +54 -54
  43. package/agents/simplifier.md +120 -120
  44. package/agents/tester.md +49 -49
  45. package/agents/ui/ui-a11y-auditor.md +93 -93
  46. package/agents/ui/ui-antipattern-detector.md +94 -94
  47. package/agents/ui/ui-dataviz-advisor.md +69 -69
  48. package/agents/ui/ui-design-system-gen.md +57 -57
  49. package/agents/ui/ui-industry-analyzer.md +49 -49
  50. package/agents/ui/ui-layout-architect.md +65 -65
  51. package/agents/ui/ui-stack-implementer.md +68 -68
  52. package/agents/ui/ux-compliance-reviewer.md +81 -81
  53. package/agents/ui-previewer.md +260 -260
  54. package/commands/vibe.run.md +83 -0
  55. package/commands/vibe.spec.review.md +558 -558
  56. package/commands/vibe.utils.md +413 -413
  57. package/commands/vibe.voice.md +79 -79
  58. package/dist/cli/auth.d.ts +1 -1
  59. package/dist/cli/auth.d.ts.map +1 -1
  60. package/dist/cli/auth.js +15 -7
  61. package/dist/cli/auth.js.map +1 -1
  62. package/dist/cli/collaborator.js +52 -52
  63. package/dist/cli/commands/evolution.js +12 -12
  64. package/dist/cli/commands/index.d.ts +1 -0
  65. package/dist/cli/commands/index.d.ts.map +1 -1
  66. package/dist/cli/commands/index.js +1 -0
  67. package/dist/cli/commands/index.js.map +1 -1
  68. package/dist/cli/commands/info.d.ts.map +1 -1
  69. package/dist/cli/commands/info.js +62 -56
  70. package/dist/cli/commands/info.js.map +1 -1
  71. package/dist/cli/commands/init.d.ts.map +1 -1
  72. package/dist/cli/commands/init.js +9 -6
  73. package/dist/cli/commands/init.js.map +1 -1
  74. package/dist/cli/commands/remove.js +14 -14
  75. package/dist/cli/commands/sentinel.js +27 -27
  76. package/dist/cli/commands/skills.d.ts +13 -0
  77. package/dist/cli/commands/skills.d.ts.map +1 -0
  78. package/dist/cli/commands/skills.js +83 -0
  79. package/dist/cli/commands/skills.js.map +1 -0
  80. package/dist/cli/commands/slack.js +10 -10
  81. package/dist/cli/commands/telegram.js +12 -12
  82. package/dist/cli/commands/update.d.ts.map +1 -1
  83. package/dist/cli/commands/update.js +3 -0
  84. package/dist/cli/commands/update.js.map +1 -1
  85. package/dist/cli/detect.js +32 -32
  86. package/dist/cli/index.d.ts.map +1 -1
  87. package/dist/cli/index.js +64 -47
  88. package/dist/cli/index.js.map +1 -1
  89. package/dist/cli/llm/claude-commands.js +16 -16
  90. package/dist/cli/llm/config.js +18 -18
  91. package/dist/cli/llm/gemini-commands.js +47 -47
  92. package/dist/cli/llm/gpt-commands.js +19 -19
  93. package/dist/cli/llm/help.js +21 -21
  94. package/dist/cli/postinstall/constants.d.ts +8 -0
  95. package/dist/cli/postinstall/constants.d.ts.map +1 -1
  96. package/dist/cli/postinstall/constants.js +33 -0
  97. package/dist/cli/postinstall/constants.js.map +1 -1
  98. package/dist/cli/postinstall/cursor-agents.js +32 -32
  99. package/dist/cli/postinstall/cursor-rules.js +83 -83
  100. package/dist/cli/postinstall/cursor-skills.js +743 -743
  101. package/dist/cli/postinstall/index.d.ts +1 -1
  102. package/dist/cli/postinstall/index.d.ts.map +1 -1
  103. package/dist/cli/postinstall/index.js +1 -1
  104. package/dist/cli/postinstall/index.js.map +1 -1
  105. package/dist/cli/setup/ProjectSetup.d.ts.map +1 -1
  106. package/dist/cli/setup/ProjectSetup.js +5 -0
  107. package/dist/cli/setup/ProjectSetup.js.map +1 -1
  108. package/dist/cli/setup/Provisioner.js +42 -42
  109. package/dist/cli/types.d.ts +1 -0
  110. package/dist/cli/types.d.ts.map +1 -1
  111. package/dist/infra/lib/DeepInit.js +24 -24
  112. package/dist/infra/lib/IterationTracker.js +11 -11
  113. package/dist/infra/lib/PythonParser.js +108 -108
  114. package/dist/infra/lib/ReviewRace.js +96 -96
  115. package/dist/infra/lib/SkillFrontmatter.js +28 -28
  116. package/dist/infra/lib/SkillQualityGate.js +9 -9
  117. package/dist/infra/lib/SkillRepository.js +159 -159
  118. package/dist/infra/lib/UltraQA.js +99 -99
  119. package/dist/infra/lib/autonomy/AuditStore.js +41 -41
  120. package/dist/infra/lib/autonomy/ConfirmationStore.js +30 -30
  121. package/dist/infra/lib/autonomy/EventOutbox.js +38 -38
  122. package/dist/infra/lib/autonomy/PolicyEngine.js +18 -18
  123. package/dist/infra/lib/autonomy/SecuritySentinel.js +1 -1
  124. package/dist/infra/lib/autonomy/SuggestionStore.js +33 -33
  125. package/dist/infra/lib/embedding/VectorStore.js +22 -22
  126. package/dist/infra/lib/evolution/AgentAnalyzer.js +10 -10
  127. package/dist/infra/lib/evolution/DescriptionOptimizer.d.ts +79 -0
  128. package/dist/infra/lib/evolution/DescriptionOptimizer.d.ts.map +1 -0
  129. package/dist/infra/lib/evolution/DescriptionOptimizer.js +259 -0
  130. package/dist/infra/lib/evolution/DescriptionOptimizer.js.map +1 -0
  131. package/dist/infra/lib/evolution/GenerationRegistry.js +36 -36
  132. package/dist/infra/lib/evolution/InsightStore.js +90 -90
  133. package/dist/infra/lib/evolution/RollbackManager.js +5 -5
  134. package/dist/infra/lib/evolution/SkillBenchmark.d.ts +81 -0
  135. package/dist/infra/lib/evolution/SkillBenchmark.d.ts.map +1 -0
  136. package/dist/infra/lib/evolution/SkillBenchmark.js +233 -0
  137. package/dist/infra/lib/evolution/SkillBenchmark.js.map +1 -0
  138. package/dist/infra/lib/evolution/SkillClassifier.d.ts +35 -0
  139. package/dist/infra/lib/evolution/SkillClassifier.d.ts.map +1 -0
  140. package/dist/infra/lib/evolution/SkillClassifier.js +167 -0
  141. package/dist/infra/lib/evolution/SkillClassifier.js.map +1 -0
  142. package/dist/infra/lib/evolution/SkillEvalRunner.d.ts +102 -0
  143. package/dist/infra/lib/evolution/SkillEvalRunner.d.ts.map +1 -0
  144. package/dist/infra/lib/evolution/SkillEvalRunner.js +256 -0
  145. package/dist/infra/lib/evolution/SkillEvalRunner.js.map +1 -0
  146. package/dist/infra/lib/evolution/SkillGapDetector.js +10 -10
  147. package/dist/infra/lib/evolution/UsageTracker.js +28 -28
  148. package/dist/infra/lib/evolution/__tests__/eval.test.d.ts +2 -0
  149. package/dist/infra/lib/evolution/__tests__/eval.test.d.ts.map +1 -0
  150. package/dist/infra/lib/evolution/__tests__/eval.test.js +539 -0
  151. package/dist/infra/lib/evolution/__tests__/eval.test.js.map +1 -0
  152. package/dist/infra/lib/evolution/index.d.ts +8 -0
  153. package/dist/infra/lib/evolution/index.d.ts.map +1 -1
  154. package/dist/infra/lib/evolution/index.js +5 -0
  155. package/dist/infra/lib/evolution/index.js.map +1 -1
  156. package/dist/infra/lib/gemini/constants.js +14 -14
  157. package/dist/infra/lib/gemini/orchestration.js +5 -5
  158. package/dist/infra/lib/gpt/oauth.js +44 -44
  159. package/dist/infra/lib/gpt/orchestration.js +4 -4
  160. package/dist/infra/lib/memory/KnowledgeGraph.js +4 -4
  161. package/dist/infra/lib/memory/MemorySearch.js +57 -57
  162. package/dist/infra/lib/memory/MemoryStorage.js +181 -181
  163. package/dist/infra/lib/memory/ObservationStore.js +28 -28
  164. package/dist/infra/lib/memory/ReflectionStore.js +30 -30
  165. package/dist/infra/lib/memory/SessionRAGRetriever.js +7 -7
  166. package/dist/infra/lib/memory/SessionRAGStore.js +225 -225
  167. package/dist/infra/lib/memory/SessionSummarizer.js +9 -9
  168. package/dist/infra/orchestrator/AgentManager.js +12 -12
  169. package/dist/infra/orchestrator/AgentRegistry.js +65 -65
  170. package/dist/infra/orchestrator/MultiLlmResearch.js +8 -8
  171. package/dist/infra/orchestrator/SwarmOrchestrator.test.js +16 -16
  172. package/dist/infra/orchestrator/parallelResearch.js +24 -24
  173. package/dist/tools/convention/analyzeComplexity.test.js +115 -115
  174. package/dist/tools/convention/validateCodeQuality.test.js +104 -104
  175. package/dist/tools/memory/createMemoryTimeline.js +10 -10
  176. package/dist/tools/memory/getMemoryGraph.js +12 -12
  177. package/dist/tools/memory/getSessionContext.js +9 -9
  178. package/dist/tools/memory/linkMemories.js +14 -14
  179. package/dist/tools/memory/listMemories.js +4 -4
  180. package/dist/tools/memory/recallMemory.js +4 -4
  181. package/dist/tools/memory/saveMemory.js +4 -4
  182. package/dist/tools/memory/searchMemoriesAdvanced.js +23 -23
  183. package/dist/tools/semantic/analyzeDependencyGraph.js +12 -12
  184. package/dist/tools/semantic/astGrep.test.js +6 -6
  185. package/dist/tools/spec/prdParser.test.js +171 -171
  186. package/dist/tools/spec/specGenerator.js +169 -169
  187. package/dist/tools/spec/traceabilityMatrix.js +64 -64
  188. package/dist/tools/spec/traceabilityMatrix.test.js +28 -28
  189. package/hooks/gemini-hooks.json +73 -73
  190. package/hooks/hooks.json +137 -137
  191. package/hooks/scripts/code-check.js +70 -70
  192. package/hooks/scripts/context-save.js +212 -212
  193. package/hooks/scripts/hud-status.js +291 -291
  194. package/hooks/scripts/keyword-detector.js +214 -214
  195. package/hooks/scripts/llm-orchestrate.js +646 -646
  196. package/hooks/scripts/post-edit.js +32 -32
  197. package/hooks/scripts/pre-tool-guard.js +125 -125
  198. package/hooks/scripts/prompt-dispatcher.js +185 -185
  199. package/hooks/scripts/sentinel-guard.js +104 -104
  200. package/hooks/scripts/session-start.js +106 -106
  201. package/hooks/scripts/stop-notify.js +209 -209
  202. package/hooks/scripts/utils.js +100 -100
  203. package/languages/csharp-unity.md +515 -515
  204. package/languages/gdscript-godot.md +470 -470
  205. package/languages/ruby-rails.md +489 -489
  206. package/languages/typescript-angular.md +433 -433
  207. package/languages/typescript-astro.md +416 -416
  208. package/languages/typescript-electron.md +406 -406
  209. package/languages/typescript-nestjs.md +524 -524
  210. package/languages/typescript-svelte.md +407 -407
  211. package/languages/typescript-tauri.md +365 -365
  212. package/package.json +121 -121
  213. package/skills/agents-md/SKILL.md +120 -120
  214. package/skills/arch-guard/SKILL.md +180 -0
  215. package/skills/brand-assets/SKILL.md +146 -146
  216. package/skills/capability-loop/SKILL.md +167 -0
  217. package/skills/characterization-test/SKILL.md +206 -206
  218. package/skills/commerce-patterns/SKILL.md +59 -59
  219. package/skills/commit-push-pr/SKILL.md +75 -75
  220. package/skills/context7-usage/SKILL.md +105 -105
  221. package/skills/core-capabilities/SKILL.md +48 -48
  222. package/skills/e2e-commerce/SKILL.md +57 -57
  223. package/skills/exec-plan/SKILL.md +147 -0
  224. package/skills/frontend-design/SKILL.md +73 -73
  225. package/skills/git-worktree/SKILL.md +72 -72
  226. package/skills/handoff/SKILL.md +109 -109
  227. package/skills/parallel-research/SKILL.md +87 -87
  228. package/skills/priority-todos/SKILL.md +63 -63
  229. package/skills/seo-checklist/SKILL.md +57 -57
  230. package/skills/techdebt/SKILL.md +122 -122
  231. package/skills/tool-fallback/SKILL.md +103 -103
  232. package/skills/typescript-advanced-types/SKILL.md +65 -65
  233. package/skills/ui-ux-pro-max/SKILL.md +206 -206
  234. package/skills/vercel-react-best-practices/SKILL.md +59 -59
  235. package/skills/video-production/SKILL.md +51 -51
  236. package/vibe/config.json +29 -29
  237. package/vibe/constitution.md +227 -227
  238. package/vibe/rules/principles/communication-guide.md +98 -98
  239. package/vibe/rules/principles/development-philosophy.md +52 -52
  240. package/vibe/rules/principles/quick-start.md +102 -102
  241. package/vibe/rules/quality/bdd-contract-testing.md +393 -393
  242. package/vibe/rules/quality/checklist.md +276 -276
  243. package/vibe/rules/quality/performance.md +236 -236
  244. package/vibe/rules/quality/testing-strategy.md +440 -440
  245. package/vibe/rules/standards/anti-patterns.md +541 -541
  246. package/vibe/rules/standards/code-structure.md +291 -291
  247. package/vibe/rules/standards/complexity-metrics.md +313 -313
  248. package/vibe/rules/standards/git-workflow.md +237 -237
  249. package/vibe/rules/standards/naming-conventions.md +198 -198
  250. package/vibe/rules/standards/security.md +305 -305
  251. package/vibe/rules/writing/document-style.md +74 -74
  252. package/vibe/setup.sh +31 -31
  253. package/vibe/templates/constitution-template.md +252 -252
  254. package/vibe/templates/contract-backend-template.md +526 -526
  255. package/vibe/templates/contract-frontend-template.md +599 -599
  256. package/vibe/templates/feature-template.md +96 -96
  257. package/vibe/templates/spec-template.md +221 -221
  258. package/vibe/ui-ux-data/charts.csv +26 -26
  259. package/vibe/ui-ux-data/colors.csv +97 -97
  260. package/vibe/ui-ux-data/icons.csv +101 -101
  261. package/vibe/ui-ux-data/landing.csv +31 -31
  262. package/vibe/ui-ux-data/products.csv +96 -96
  263. package/vibe/ui-ux-data/react-performance.csv +45 -45
  264. package/vibe/ui-ux-data/stacks/astro.csv +54 -54
  265. package/vibe/ui-ux-data/stacks/flutter.csv +53 -53
  266. package/vibe/ui-ux-data/stacks/html-tailwind.csv +56 -56
  267. package/vibe/ui-ux-data/stacks/jetpack-compose.csv +53 -53
  268. package/vibe/ui-ux-data/stacks/nextjs.csv +53 -53
  269. package/vibe/ui-ux-data/stacks/nuxt-ui.csv +51 -51
  270. package/vibe/ui-ux-data/stacks/nuxtjs.csv +59 -59
  271. package/vibe/ui-ux-data/stacks/react-native.csv +52 -52
  272. package/vibe/ui-ux-data/stacks/react.csv +54 -54
  273. package/vibe/ui-ux-data/stacks/shadcn.csv +61 -61
  274. package/vibe/ui-ux-data/stacks/svelte.csv +54 -54
  275. package/vibe/ui-ux-data/stacks/swiftui.csv +51 -51
  276. package/vibe/ui-ux-data/stacks/vue.csv +50 -50
  277. package/vibe/ui-ux-data/styles.csv +68 -68
  278. package/vibe/ui-ux-data/typography.csv +57 -57
  279. package/vibe/ui-ux-data/ui-reasoning.csv +101 -101
  280. package/vibe/ui-ux-data/ux-guidelines.csv +99 -99
  281. package/vibe/ui-ux-data/version.json +31 -31
  282. package/vibe/ui-ux-data/web-interface.csv +31 -31
@@ -11,20 +11,20 @@ export class UsageTracker {
11
11
  this.initializeTables();
12
12
  }
13
13
  initializeTables() {
14
- this.db.exec(`
15
- CREATE TABLE IF NOT EXISTS usage_events (
16
- id TEXT PRIMARY KEY,
17
- generationId TEXT NOT NULL,
18
- sessionId TEXT,
19
- matchedPrompt TEXT,
20
- feedback TEXT CHECK(feedback IN ('positive','negative','neutral') OR feedback IS NULL),
21
- createdAt TEXT NOT NULL
22
- );
23
-
24
- CREATE INDEX IF NOT EXISTS idx_ue_gen ON usage_events(generationId);
25
- CREATE INDEX IF NOT EXISTS idx_ue_session ON usage_events(sessionId);
26
- CREATE INDEX IF NOT EXISTS idx_ue_feedback ON usage_events(feedback);
27
- CREATE INDEX IF NOT EXISTS idx_ue_created ON usage_events(createdAt);
14
+ this.db.exec(`
15
+ CREATE TABLE IF NOT EXISTS usage_events (
16
+ id TEXT PRIMARY KEY,
17
+ generationId TEXT NOT NULL,
18
+ sessionId TEXT,
19
+ matchedPrompt TEXT,
20
+ feedback TEXT CHECK(feedback IN ('positive','negative','neutral') OR feedback IS NULL),
21
+ createdAt TEXT NOT NULL
22
+ );
23
+
24
+ CREATE INDEX IF NOT EXISTS idx_ue_gen ON usage_events(generationId);
25
+ CREATE INDEX IF NOT EXISTS idx_ue_session ON usage_events(sessionId);
26
+ CREATE INDEX IF NOT EXISTS idx_ue_feedback ON usage_events(feedback);
27
+ CREATE INDEX IF NOT EXISTS idx_ue_created ON usage_events(createdAt);
28
28
  `);
29
29
  }
30
30
  /**
@@ -33,9 +33,9 @@ export class UsageTracker {
33
33
  recordUsage(generationId, sessionId, matchedPrompt) {
34
34
  const id = `ue-${Date.now().toString(36)}-${randomUUID().replace(/-/g, '').slice(0, 8)}`;
35
35
  const now = new Date().toISOString();
36
- this.db.prepare(`
37
- INSERT INTO usage_events (id, generationId, sessionId, matchedPrompt, createdAt)
38
- VALUES (?, ?, ?, ?, ?)
36
+ this.db.prepare(`
37
+ INSERT INTO usage_events (id, generationId, sessionId, matchedPrompt, createdAt)
38
+ VALUES (?, ?, ?, ?, ?)
39
39
  `).run(id, generationId, sessionId || null, matchedPrompt || null, now);
40
40
  // Also increment usage count in generations table
41
41
  this.registry.incrementUsage(generationId);
@@ -45,8 +45,8 @@ export class UsageTracker {
45
45
  * Record explicit user feedback for a usage event
46
46
  */
47
47
  setFeedback(eventId, feedback) {
48
- const result = this.db.prepare(`
49
- UPDATE usage_events SET feedback = ? WHERE id = ?
48
+ const result = this.db.prepare(`
49
+ UPDATE usage_events SET feedback = ? WHERE id = ?
50
50
  `).run(feedback, eventId);
51
51
  return result.changes > 0;
52
52
  }
@@ -54,8 +54,8 @@ export class UsageTracker {
54
54
  * Record explicit feedback for a generation (latest event)
55
55
  */
56
56
  setFeedbackForGeneration(generationId, feedback) {
57
- const event = this.db.prepare(`
58
- SELECT id FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC LIMIT 1
57
+ const event = this.db.prepare(`
58
+ SELECT id FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC LIMIT 1
59
59
  `).get(generationId);
60
60
  if (!event)
61
61
  return false;
@@ -76,9 +76,9 @@ export class UsageTracker {
76
76
  feedback = 'neutral';
77
77
  }
78
78
  // Only apply to events without explicit feedback
79
- const result = this.db.prepare(`
80
- UPDATE usage_events SET feedback = ?
81
- WHERE sessionId = ? AND feedback IS NULL
79
+ const result = this.db.prepare(`
80
+ UPDATE usage_events SET feedback = ?
81
+ WHERE sessionId = ? AND feedback IS NULL
82
82
  `).run(feedback, sessionId);
83
83
  return result.changes;
84
84
  }
@@ -86,8 +86,8 @@ export class UsageTracker {
86
86
  * Get usage events for a generation
87
87
  */
88
88
  getByGeneration(generationId) {
89
- const rows = this.db.prepare(`
90
- SELECT * FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC
89
+ const rows = this.db.prepare(`
90
+ SELECT * FROM usage_events WHERE generationId = ? ORDER BY createdAt DESC
91
91
  `).all(generationId);
92
92
  return rows.map(this.rowToEvent);
93
93
  }
@@ -138,8 +138,8 @@ export class UsageTracker {
138
138
  * Get total usage count for a generation
139
139
  */
140
140
  getUsageCount(generationId) {
141
- const result = this.db.prepare(`
142
- SELECT COUNT(*) as cnt FROM usage_events WHERE generationId = ?
141
+ const result = this.db.prepare(`
142
+ SELECT COUNT(*) as cnt FROM usage_events WHERE generationId = ?
143
143
  `).get(generationId);
144
144
  return result.cnt;
145
145
  }
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=eval.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval.test.d.ts","sourceRoot":"","sources":["../../../../../src/infra/lib/evolution/__tests__/eval.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,539 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import { join } from 'path';
3
+ import { tmpdir } from 'os';
4
+ import { rmSync } from 'fs';
5
+ import { MemoryStorage } from '../../memory/MemoryStorage.js';
6
+ import { SkillEvalRunner } from '../SkillEvalRunner.js';
7
+ import { SkillBenchmark } from '../SkillBenchmark.js';
8
+ import { SkillClassifier } from '../SkillClassifier.js';
9
+ import { DescriptionOptimizer } from '../DescriptionOptimizer.js';
10
+ // ─── SkillEvalRunner ─────────────────────────────────────────────────────────
11
+ describe('SkillEvalRunner', () => {
12
+ let storage;
13
+ let runner;
14
+ let testDir;
15
+ beforeEach(() => {
16
+ testDir = join(tmpdir(), `eval-runner-${Date.now()}-${Math.random().toString(36).slice(2)}`);
17
+ storage = new MemoryStorage(testDir);
18
+ runner = new SkillEvalRunner(storage);
19
+ });
20
+ afterEach(() => {
21
+ storage.close();
22
+ try {
23
+ rmSync(testDir, { recursive: true, force: true });
24
+ }
25
+ catch { /* ignore */ }
26
+ });
27
+ it('should create eval set and retrieve cases', () => {
28
+ const cases = runner.createEvalSet({
29
+ skillName: 'csv-analyzer',
30
+ evals: [
31
+ {
32
+ prompt: 'Analyze this CSV file and generate a summary',
33
+ expectedOutput: 'A statistical summary of the CSV data',
34
+ files: ['data.csv'],
35
+ assertions: [
36
+ { description: 'Contains row count', type: 'contains', value: 'rows' },
37
+ { description: 'Contains column info', type: 'contains', value: 'columns' },
38
+ ],
39
+ },
40
+ {
41
+ prompt: 'Parse the CSV and find outliers',
42
+ expectedOutput: 'List of outlier values',
43
+ },
44
+ ],
45
+ });
46
+ expect(cases).toHaveLength(2);
47
+ expect(cases[0].skillName).toBe('csv-analyzer');
48
+ expect(cases[0].assertions).toHaveLength(2);
49
+ expect(cases[0].files).toEqual(['data.csv']);
50
+ expect(cases[1].assertions).toHaveLength(0);
51
+ const retrieved = runner.getEvalCases('csv-analyzer');
52
+ expect(retrieved).toHaveLength(2);
53
+ });
54
+ it('should start and complete eval runs', () => {
55
+ const cases = runner.createEvalSet({
56
+ skillName: 'test-skill',
57
+ evals: [{ prompt: 'Test prompt', expectedOutput: 'Expected output' }],
58
+ });
59
+ const runId = runner.startRun(cases[0].id, 'test-skill', 'with_skill');
60
+ expect(runId).toBeTruthy();
61
+ runner.completeRun(runId, 'Generated output with rows and columns', [
62
+ { assertionId: 'a1', description: 'Has content', passed: true, evidence: 'Output is non-empty' },
63
+ ], 1500, 5000);
64
+ const runs = runner.getRunsForEval(cases[0].id);
65
+ expect(runs).toHaveLength(1);
66
+ expect(runs[0].status).toBe('passed');
67
+ expect(runs[0].durationMs).toBe(1500);
68
+ expect(runs[0].tokenCount).toBe(5000);
69
+ });
70
+ it('should mark run as failed when assertions fail', () => {
71
+ const cases = runner.createEvalSet({
72
+ skillName: 'test-skill',
73
+ evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
74
+ });
75
+ const runId = runner.startRun(cases[0].id, 'test-skill', 'with_skill');
76
+ runner.completeRun(runId, 'Bad output', [
77
+ { assertionId: 'a1', description: 'Has rows', passed: false, evidence: 'Missing' },
78
+ { assertionId: 'a2', description: 'Has cols', passed: true, evidence: 'Present' },
79
+ ], 1000, 3000);
80
+ const runs = runner.getRunsForEval(cases[0].id);
81
+ expect(runs[0].status).toBe('failed');
82
+ });
83
+ it('should handle error runs', () => {
84
+ const cases = runner.createEvalSet({
85
+ skillName: 'test-skill',
86
+ evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
87
+ });
88
+ const runId = runner.startRun(cases[0].id, 'test-skill', 'baseline');
89
+ runner.failRun(runId, 'Connection timeout');
90
+ const runs = runner.getRunsForEval(cases[0].id);
91
+ expect(runs[0].status).toBe('error');
92
+ expect(runs[0].output).toBe('Connection timeout');
93
+ });
94
+ it('should grade output against assertions', () => {
95
+ const assertions = [
96
+ { id: 'a1', description: 'Contains summary', type: 'contains', value: 'summary' },
97
+ { id: 'a2', description: 'No errors', type: 'not_contains', value: 'error' },
98
+ { id: 'a3', description: 'Has number', type: 'matches_regex', value: '\\d+' },
99
+ { id: 'a4', description: 'Custom check', type: 'custom', value: 'quality > 8' },
100
+ ];
101
+ const grades = runner.gradeOutput('Here is the summary: 42 items found', assertions);
102
+ expect(grades).toHaveLength(4);
103
+ expect(grades[0].passed).toBe(true); // contains 'summary'
104
+ expect(grades[1].passed).toBe(true); // not contains 'error'
105
+ expect(grades[2].passed).toBe(true); // matches \d+
106
+ expect(grades[3].passed).toBe(false); // custom always false without external grading
107
+ });
108
+ it('should grade failing contains assertion', () => {
109
+ const assertions = [
110
+ { id: 'a1', description: 'Contains missing word', type: 'contains', value: 'nonexistent' },
111
+ ];
112
+ const grades = runner.gradeOutput('Some output text', assertions);
113
+ expect(grades[0].passed).toBe(false);
114
+ expect(grades[0].evidence).toContain('does not contain');
115
+ });
116
+ it('should grade failing not_contains assertion', () => {
117
+ const assertions = [
118
+ { id: 'a1', description: 'No errors', type: 'not_contains', value: 'error' },
119
+ ];
120
+ const grades = runner.gradeOutput('An error occurred', assertions);
121
+ expect(grades[0].passed).toBe(false);
122
+ expect(grades[0].evidence).toContain('unexpectedly contains');
123
+ });
124
+ it('should handle invalid regex gracefully', () => {
125
+ const assertions = [
126
+ { id: 'a1', description: 'Bad regex', type: 'matches_regex', value: '[invalid' },
127
+ ];
128
+ const grades = runner.gradeOutput('test', assertions);
129
+ expect(grades[0].passed).toBe(false);
130
+ expect(grades[0].evidence).toContain('Invalid regex');
131
+ });
132
+ it('should get latest runs grouped by eval and variant', () => {
133
+ const cases = runner.createEvalSet({
134
+ skillName: 'grouped-skill',
135
+ evals: [
136
+ { prompt: 'Eval 1', expectedOutput: 'Expected 1' },
137
+ { prompt: 'Eval 2', expectedOutput: 'Expected 2' },
138
+ ],
139
+ });
140
+ // Run both variants for eval 1
141
+ const wsRun = runner.startRun(cases[0].id, 'grouped-skill', 'with_skill');
142
+ runner.completeRun(wsRun, 'Output ws', [], 100, 500);
143
+ const blRun = runner.startRun(cases[0].id, 'grouped-skill', 'baseline');
144
+ runner.completeRun(blRun, 'Output bl', [], 200, 600);
145
+ // Only with_skill for eval 2
146
+ const wsRun2 = runner.startRun(cases[1].id, 'grouped-skill', 'with_skill');
147
+ runner.completeRun(wsRun2, 'Output ws2', [], 150, 550);
148
+ const grouped = runner.getLatestRuns('grouped-skill');
149
+ expect(grouped.size).toBe(2);
150
+ const eval1 = grouped.get(cases[0].id);
151
+ expect(eval1?.withSkill).not.toBeNull();
152
+ expect(eval1?.baseline).not.toBeNull();
153
+ const eval2 = grouped.get(cases[1].id);
154
+ expect(eval2?.withSkill).not.toBeNull();
155
+ expect(eval2?.baseline).toBeNull();
156
+ });
157
+ it('should delete eval set and associated runs', () => {
158
+ runner.createEvalSet({
159
+ skillName: 'to-delete',
160
+ evals: [{ prompt: 'Test', expectedOutput: 'Expected' }],
161
+ });
162
+ const cases = runner.getEvalCases('to-delete');
163
+ runner.startRun(cases[0].id, 'to-delete', 'with_skill');
164
+ const deleted = runner.deleteEvalSet('to-delete');
165
+ expect(deleted).toBe(1);
166
+ expect(runner.getEvalCases('to-delete')).toHaveLength(0);
167
+ });
168
+ it('should get eval case by ID', () => {
169
+ const cases = runner.createEvalSet({
170
+ skillName: 'by-id-test',
171
+ evals: [{ prompt: 'Specific prompt', expectedOutput: 'Specific output' }],
172
+ });
173
+ const found = runner.getEvalCase(cases[0].id);
174
+ expect(found).not.toBeNull();
175
+ expect(found.prompt).toBe('Specific prompt');
176
+ const notFound = runner.getEvalCase('nonexistent');
177
+ expect(notFound).toBeNull();
178
+ });
179
+ });
180
+ // ─── SkillBenchmark ──────────────────────────────────────────────────────────
181
+ describe('SkillBenchmark', () => {
182
+ let storage;
183
+ let runner;
184
+ let benchmark;
185
+ let testDir;
186
+ beforeEach(() => {
187
+ testDir = join(tmpdir(), `eval-bench-${Date.now()}-${Math.random().toString(36).slice(2)}`);
188
+ storage = new MemoryStorage(testDir);
189
+ runner = new SkillEvalRunner(storage);
190
+ benchmark = new SkillBenchmark(storage);
191
+ });
192
+ afterEach(() => {
193
+ storage.close();
194
+ try {
195
+ rmSync(testDir, { recursive: true, force: true });
196
+ }
197
+ catch { /* ignore */ }
198
+ });
199
+ function setupSkillWithRuns(skillName, wsPass, blPass) {
200
+ const cases = runner.createEvalSet({
201
+ skillName,
202
+ evals: [
203
+ { prompt: 'Eval A', expectedOutput: 'Expected A', assertions: [{ description: 'Check A', type: 'contains', value: 'result' }] },
204
+ { prompt: 'Eval B', expectedOutput: 'Expected B', assertions: [{ description: 'Check B', type: 'contains', value: 'data' }] },
205
+ ],
206
+ });
207
+ for (const evalCase of cases) {
208
+ const wsId = runner.startRun(evalCase.id, skillName, 'with_skill');
209
+ const wsOutput = wsPass ? 'Here is the result with data' : 'Incomplete output';
210
+ const wsGrades = runner.gradeOutput(wsOutput, evalCase.assertions);
211
+ runner.completeRun(wsId, wsOutput, wsGrades, 1200, 4500);
212
+ const blId = runner.startRun(evalCase.id, skillName, 'baseline');
213
+ const blOutput = blPass ? 'Here is the result with data' : 'No useful output';
214
+ const blGrades = runner.gradeOutput(blOutput, evalCase.assertions);
215
+ runner.completeRun(blId, blOutput, blGrades, 1500, 5000);
216
+ }
217
+ }
218
+ it('should aggregate benchmark results', () => {
219
+ setupSkillWithRuns('bench-test', true, false);
220
+ const result = benchmark.aggregate('bench-test');
221
+ expect(result.skillName).toBe('bench-test');
222
+ expect(result.iteration).toBe(1);
223
+ expect(result.summary.totalEvals).toBe(2);
224
+ expect(result.summary.withSkill.passRate).toBe(1.0);
225
+ expect(result.summary.baseline.passRate).toBe(0);
226
+ expect(result.summary.delta.passRateDelta).toBe(1.0);
227
+ expect(result.evalBreakdowns).toHaveLength(2);
228
+ });
229
+ it('should track benchmark history across iterations', () => {
230
+ setupSkillWithRuns('history-test', true, false);
231
+ benchmark.aggregate('history-test');
232
+ // Second iteration with improved baseline
233
+ setupSkillWithRuns('history-test', true, true);
234
+ benchmark.aggregate('history-test');
235
+ const history = benchmark.getHistory('history-test');
236
+ expect(history).toHaveLength(2);
237
+ expect(history[0].iteration).toBe(1);
238
+ expect(history[1].iteration).toBe(2);
239
+ });
240
// getLatest() returns the stored benchmark for a known skill and null for an
// unknown one.
it('should get latest benchmark', () => {
    const skill = 'latest-test';
    setupSkillWithRuns(skill, true, false);
    benchmark.aggregate(skill);

    const found = benchmark.getLatest(skill);
    expect(found).not.toBeNull();
    expect(found.iteration).toBe(1);

    const missing = benchmark.getLatest('nonexistent');
    expect(missing).toBeNull();
});
249
// After two aggregated iterations, compare(skill, 1, 2) should populate both
// sides of the comparison as well as the improvement field.
it('should compare two iterations', () => {
    const skill = 'compare-test';

    [false, true].forEach((baselineFlag) => {
        setupSkillWithRuns(skill, true, baselineFlag);
        benchmark.aggregate(skill);
    });

    const { iterationA, iterationB, improvement } = benchmark.compare(skill, 1, 2);
    expect(iterationA).not.toBeNull();
    expect(iterationB).not.toBeNull();
    expect(improvement).not.toBeNull();
});
259
// Comparing iterations of a skill that was never benchmarked yields null for
// every field rather than throwing.
it('should handle compare with missing iteration', () => {
    const outcome = benchmark.compare('missing', 1, 2);

    for (const field of ['iterationA', 'iterationB', 'improvement']) {
        expect(outcome[field]).toBeNull();
    }
});
265
// formatReport() should render a report containing the title and the expected
// section labels.
it('should format benchmark report as markdown', () => {
    const skill = 'report-test';
    setupSkillWithRuns(skill, true, false);

    const markdown = benchmark.formatReport(benchmark.aggregate(skill));

    const expectedFragments = [
        '# Benchmark: report-test',
        'Pass Rate',
        'Mean Duration',
        'Per-Eval Breakdown',
    ];
    for (const fragment of expectedFragments) {
        expect(markdown).toContain(fragment);
    }
});
274
// With three runs whose duration and token figures differ, the with_skill
// summary must report a strictly positive standard deviation for both.
it('should compute stddev for duration and tokens', () => {
    const skill = 'stddev-test';
    const cases = runner.createEvalSet({
        skillName: skill,
        evals: ['A', 'B', 'C'].map((letter) => ({ prompt: letter, expectedOutput: letter })),
    });

    // Varying durations and tokens
    const durations = [1000, 2000, 3000];
    const tokens = [4000, 5000, 6000];

    cases.forEach(({ id }, idx) => {
        const withSkillRun = runner.startRun(id, skill, 'with_skill');
        runner.completeRun(withSkillRun, 'output', [], durations[idx], tokens[idx]);

        // Baseline gets the same figures shifted by a constant 500.
        const baselineRun = runner.startRun(id, skill, 'baseline');
        runner.completeRun(baselineRun, 'output', [], durations[idx] + 500, tokens[idx] + 500);
    });

    const { withSkill } = benchmark.aggregate(skill).summary;
    expect(withSkill.stddevDurationMs).toBeGreaterThan(0);
    expect(withSkill.stddevTokens).toBeGreaterThan(0);
});
296
+ });
297
+ // ─── SkillClassifier ─────────────────────────────────────────────────────────
298
/**
 * SkillClassifier suite.
 *
 * Every test gets a fresh MemoryStorage rooted in its own temp directory, with
 * a SkillEvalRunner, SkillBenchmark and SkillClassifier all sharing that
 * storage instance.
 */
describe('SkillClassifier', () => {
    let storage;
    let runner;
    let benchmarkObj;
    let classifier;
    let testDir;

    beforeEach(() => {
        // Timestamp plus random suffix keeps the directory name distinct per test.
        testDir = join(tmpdir(), `eval-class-${Date.now()}-${Math.random().toString(36).slice(2)}`);
        storage = new MemoryStorage(testDir);
        runner = new SkillEvalRunner(storage);
        benchmarkObj = new SkillBenchmark(storage);
        classifier = new SkillClassifier(storage);
    });

    afterEach(() => {
        try {
            storage.close();
        }
        finally {
            // Remove the temp directory even when close() throws, so a failing
            // teardown does not leave directories behind. The close() error
            // still propagates after this block runs.
            try {
                rmSync(testDir, { recursive: true, force: true });
            }
            catch { /* ignore: directory removal is best-effort */ }
        }
    });

    /**
     * Records one aggregated benchmark iteration for `skillName`.
     *
     * Creates 10 eval cases, each with a single `contains` assertion on the
     * value 'pass'. For each variant the first `round(rate * 10)` cases get the
     * output 'pass' and the remainder 'fail'. Runs are completed with fixed
     * figures (1000/5000 for with_skill, 1200/5500 for baseline — presumably
     * duration in ms and token count), then the benchmark is aggregated.
     *
     * @param {string} skillName - Skill the eval set and runs are recorded under.
     * @param {number} wsPassRate - Fraction (0..1) of with_skill runs that output 'pass'.
     * @param {number} blPassRate - Fraction (0..1) of baseline runs that output 'pass'.
     */
    function createBenchmark(skillName, wsPassRate, blPassRate) {
        const totalEvals = 10;
        const wsPassCount = Math.round(wsPassRate * totalEvals);
        const blPassCount = Math.round(blPassRate * totalEvals);
        const evals = Array.from({ length: totalEvals }, (_, i) => ({
            prompt: `Eval ${i}`,
            expectedOutput: `Expected ${i}`,
            assertions: [{ description: 'Check', type: 'contains', value: 'pass' }],
        }));
        const cases = runner.createEvalSet({ skillName, evals });
        for (let i = 0; i < cases.length; i++) {
            const wsId = runner.startRun(cases[i].id, skillName, 'with_skill');
            const wsOutput = i < wsPassCount ? 'pass' : 'fail';
            const wsGrades = runner.gradeOutput(wsOutput, cases[i].assertions);
            runner.completeRun(wsId, wsOutput, wsGrades, 1000, 5000);
            const blId = runner.startRun(cases[i].id, skillName, 'baseline');
            const blOutput = i < blPassCount ? 'pass' : 'fail';
            const blGrades = runner.gradeOutput(blOutput, cases[i].assertions);
            runner.completeRun(blId, blOutput, blGrades, 1200, 5500);
        }
        benchmarkObj.aggregate(skillName);
    }

    it('should classify as unknown when no benchmarks exist', () => {
        const result = classifier.classify('nonexistent');
        expect(result.category).toBe('unknown');
        expect(result.confidence).toBe(0);
        expect(result.trend).toBe('insufficient_data');
    });

    it('should classify as capability_uplift when baseline is high', () => {
        createBenchmark('cap-uplift', 0.9, 0.8);
        const result = classifier.classify('cap-uplift');
        expect(result.category).toBe('capability_uplift');
        expect(result.baselinePassRate).toBeGreaterThanOrEqual(0.7);
    });

    it('should classify as encoded_preference when baseline is low and gap is large', () => {
        createBenchmark('enc-pref', 0.9, 0.1);
        const result = classifier.classify('enc-pref');
        expect(result.category).toBe('encoded_preference');
        expect(result.baselinePassRate).toBeLessThanOrEqual(0.3);
        expect(result.withSkillPassRate).toBeGreaterThan(result.baselinePassRate);
    });

    // NOTE(review): despite the title, only `trend` is asserted here; the
    // resulting category is not checked.
    it('should detect converging trend as capability_uplift', () => {
        // First benchmark: large gap
        createBenchmark('converge', 0.9, 0.2);
        // Second benchmark: gap shrinks
        createBenchmark('converge', 0.9, 0.7);
        const result = classifier.classify('converge');
        expect(result.trend).toBe('converging');
    });

    // NOTE(review): despite the title, only `trend` is asserted here; the
    // resulting category is not checked.
    it('should detect stable trend as encoded_preference', () => {
        // Both benchmarks: consistent gap
        createBenchmark('stable', 0.9, 0.1);
        createBenchmark('stable', 0.9, 0.1);
        const result = classifier.classify('stable');
        expect(result.trend).toBe('stable');
    });

    it('should classify from explicit rates', () => {
        const result = classifier.classifyFromRates('test-skill', 0.95, 0.1);
        expect(result.category).toBe('encoded_preference');
        expect(result.skillName).toBe('test-skill');
    });

    it('should detect becoming obsolete', () => {
        // Math.round(0.85 * 10) is 9, so the baseline passes 9 of 10 cases here.
        createBenchmark('obsolete', 0.9, 0.85);
        const result = classifier.isBecomingObsolete('obsolete');
        expect(result.obsolete).toBe(true);
        expect(result.reason).toContain('well without the skill');
    });

    it('should not flag non-obsolete skills', () => {
        createBenchmark('healthy', 0.9, 0.1);
        const result = classifier.isBecomingObsolete('healthy');
        expect(result.obsolete).toBe(false);
    });
});
391
+ // ─── DescriptionOptimizer ────────────────────────────────────────────────────
392
/**
 * DescriptionOptimizer suite.
 *
 * Every test gets a fresh MemoryStorage rooted in its own temp directory and a
 * DescriptionOptimizer built on top of it.
 */
describe('DescriptionOptimizer', () => {
    let storage;
    let optimizer;
    let testDir;

    beforeEach(() => {
        // Timestamp plus random suffix keeps the directory name distinct per test.
        testDir = join(tmpdir(), `eval-opt-${Date.now()}-${Math.random().toString(36).slice(2)}`);
        storage = new MemoryStorage(testDir);
        optimizer = new DescriptionOptimizer(storage);
    });

    afterEach(() => {
        try {
            storage.close();
        }
        finally {
            // Remove the temp directory even when close() throws, so a failing
            // teardown does not leave directories behind. The close() error
            // still propagates after this block runs.
            try {
                rmSync(testDir, { recursive: true, force: true });
            }
            catch { /* ignore: directory removal is best-effort */ }
        }
    });

    it('should split eval set into train/test with stratification', () => {
        const queries = [
            { query: 'analyze this csv file', shouldTrigger: true },
            { query: 'parse my spreadsheet data', shouldTrigger: true },
            { query: 'generate a chart from data', shouldTrigger: true },
            { query: 'create csv summary', shouldTrigger: true },
            { query: 'write me an email', shouldTrigger: false },
            { query: 'fix this bug in the auth module', shouldTrigger: false },
            { query: 'deploy the application', shouldTrigger: false },
            { query: 'review my pull request', shouldTrigger: false },
        ];
        const { train, test } = optimizer.splitEvalSet(queries);
        // Both sets should have queries
        expect(train.length).toBeGreaterThan(0);
        expect(test.length).toBeGreaterThan(0);
        // Combined should cover all queries
        expect(train.length + test.length).toBe(queries.length);
        // Both sets should have both types
        expect(train.some(q => q.shouldTrigger)).toBe(true);
        expect(train.some(q => !q.shouldTrigger)).toBe(true);
        expect(test.some(q => q.shouldTrigger)).toBe(true);
        expect(test.some(q => !q.shouldTrigger)).toBe(true);
    });

    // Only the result count and the echoed `shouldTrigger` flags are checked;
    // whether each query actually triggered is not asserted here.
    it('should evaluate description against trigger queries', () => {
        const description = 'Analyze CSV files, parse spreadsheet data, generate statistical summaries';
        const queries = [
            { query: 'analyze this csv file and show me stats', shouldTrigger: true },
            { query: 'write a blog post about cooking', shouldTrigger: false },
        ];
        const results = optimizer.evaluateDescription(description, queries);
        expect(results).toHaveLength(2);
        expect(results[0].shouldTrigger).toBe(true);
        expect(results[1].shouldTrigger).toBe(false);
    });

    it('should score results correctly', () => {
        const allCorrect = [
            { query: 'q1', shouldTrigger: true, didTrigger: true, triggerRate: 0.5, correct: true },
            { query: 'q2', shouldTrigger: false, didTrigger: false, triggerRate: 0.0, correct: true },
        ];
        expect(optimizer.scoreResults(allCorrect)).toBe(1.0);
        const halfCorrect = [
            { query: 'q1', shouldTrigger: true, didTrigger: true, triggerRate: 0.5, correct: true },
            { query: 'q2', shouldTrigger: false, didTrigger: true, triggerRate: 0.3, correct: false },
        ];
        expect(optimizer.scoreResults(halfCorrect)).toBe(0.5);
        // An empty result list scores 0 rather than NaN.
        expect(optimizer.scoreResults([])).toBe(0);
    });

    it('should suggest improvements for false negatives', () => {
        const description = 'Process data files';
        const failedResults = [
            { query: 'analyze csv spreadsheet', shouldTrigger: true, didTrigger: false, triggerRate: 0.05, correct: false },
        ];
        const improved = optimizer.suggestImprovement(description, failedResults);
        expect(improved).not.toBe(description);
        expect(improved.length).toBeGreaterThan(description.length);
    });

    it('should suggest improvements for false positives', () => {
        const description = 'Analyze data and generate reports from spreadsheets';
        const failedResults = [
            { query: 'generate random passwords for security testing', shouldTrigger: false, didTrigger: true, triggerRate: 0.2, correct: false },
        ];
        const improved = optimizer.suggestImprovement(description, failedResults);
        expect(improved).not.toBe(description);
        expect(improved).toContain('Does NOT');
    });

    it('should return original description when no failures', () => {
        const description = 'Perfect description';
        const improved = optimizer.suggestImprovement(description, []);
        expect(improved).toBe(description);
    });

    it('should run full optimization loop', () => {
        const queries = [
            { query: 'analyze this csv file and create a statistical report', shouldTrigger: true },
            { query: 'parse my data spreadsheet and find patterns', shouldTrigger: true },
            { query: 'summarize the csv columns with averages', shouldTrigger: true },
            { query: 'generate csv from database export', shouldTrigger: true },
            { query: 'help me write a novel', shouldTrigger: false },
            { query: 'fix the authentication bug', shouldTrigger: false },
            { query: 'deploy to production servers', shouldTrigger: false },
            { query: 'review this pull request code', shouldTrigger: false },
        ];
        // The last argument (3) caps the number of candidates, per the bound asserted below.
        const result = optimizer.optimize('csv-analyzer', 'Analyze CSV files', queries, 3);
        expect(result.skillName).toBe('csv-analyzer');
        expect(result.originalDescription).toBe('Analyze CSV files');
        expect(result.candidates.length).toBeGreaterThan(0);
        expect(result.candidates.length).toBeLessThanOrEqual(3);
        // Best description should be selected by test score
        expect(result.bestDescription).toBeTruthy();
    });

    it('should persist and retrieve optimization history', () => {
        const queries = [
            { query: 'analyze csv data', shouldTrigger: true },
            { query: 'write a poem', shouldTrigger: false },
            { query: 'parse spreadsheet', shouldTrigger: true },
            { query: 'cook dinner recipe', shouldTrigger: false },
        ];
        optimizer.optimize('persist-test', 'Initial description', queries, 2);
        const history = optimizer.getHistory('persist-test');
        expect(history).toHaveLength(1);
        const latest = optimizer.getLatest('persist-test');
        expect(latest).not.toBeNull();
        expect(latest.skillName).toBe('persist-test');
    });

    it('should evaluate candidate on both train and test sets', () => {
        const train = [
            { query: 'analyze csv file', shouldTrigger: true },
            { query: 'write poetry', shouldTrigger: false },
        ];
        const test = [
            { query: 'parse data spreadsheet', shouldTrigger: true },
            { query: 'fix security bug', shouldTrigger: false },
        ];
        const candidate = optimizer.evaluateCandidate('Analyze CSV data files', train, test, 1);
        expect(candidate.iteration).toBe(1);
        // Scores are only range-checked; exact values are not pinned by this test.
        expect(candidate.trainScore).toBeGreaterThanOrEqual(0);
        expect(candidate.trainScore).toBeLessThanOrEqual(1);
        expect(candidate.testScore).toBeGreaterThanOrEqual(0);
        expect(candidate.testScore).toBeLessThanOrEqual(1);
        expect(candidate.results).toHaveLength(4); // train + test
    });

    // NOTE(review): the assertion below only bounds the candidate count by the
    // iteration cap (5); it holds whether or not the loop actually stops early.
    it('should stop optimization early on perfect score', () => {
        // Simple case where description already matches perfectly
        const queries = [
            { query: 'csv analysis report with statistics', shouldTrigger: true },
            { query: 'completely unrelated cooking recipe topic', shouldTrigger: false },
        ];
        const result = optimizer.optimize('perfect-test', 'CSV analysis and statistics reporting tool', queries, 5);
        // Should stop before max iterations if already perfect
        expect(result.candidates.length).toBeLessThanOrEqual(5);
    });
});
539
+ //# sourceMappingURL=eval.test.js.map