@skillrecordings/cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. package/bin/skill.mjs +27 -0
  2. package/dist/chunk-2NCCVTEE.js +22342 -0
  3. package/dist/chunk-2NCCVTEE.js.map +1 -0
  4. package/dist/chunk-3E3GYSZR.js +7071 -0
  5. package/dist/chunk-3E3GYSZR.js.map +1 -0
  6. package/dist/chunk-F4EM72IH.js +86 -0
  7. package/dist/chunk-F4EM72IH.js.map +1 -0
  8. package/dist/chunk-FGP7KUQW.js +432 -0
  9. package/dist/chunk-FGP7KUQW.js.map +1 -0
  10. package/dist/chunk-H3D6VCME.js +55 -0
  11. package/dist/chunk-H3D6VCME.js.map +1 -0
  12. package/dist/chunk-HK3PEWFD.js +208 -0
  13. package/dist/chunk-HK3PEWFD.js.map +1 -0
  14. package/dist/chunk-KEV3QKXP.js +4495 -0
  15. package/dist/chunk-KEV3QKXP.js.map +1 -0
  16. package/dist/chunk-MG37YDAK.js +882 -0
  17. package/dist/chunk-MG37YDAK.js.map +1 -0
  18. package/dist/chunk-MLNDSBZ4.js +482 -0
  19. package/dist/chunk-MLNDSBZ4.js.map +1 -0
  20. package/dist/chunk-N2WIV2JV.js +22 -0
  21. package/dist/chunk-N2WIV2JV.js.map +1 -0
  22. package/dist/chunk-PWWRCN5W.js +2067 -0
  23. package/dist/chunk-PWWRCN5W.js.map +1 -0
  24. package/dist/chunk-SKHBM3XP.js +7746 -0
  25. package/dist/chunk-SKHBM3XP.js.map +1 -0
  26. package/dist/chunk-WFANXVQG.js +64 -0
  27. package/dist/chunk-WFANXVQG.js.map +1 -0
  28. package/dist/chunk-WYKL32C3.js +275 -0
  29. package/dist/chunk-WYKL32C3.js.map +1 -0
  30. package/dist/chunk-ZNF7XD2S.js +134 -0
  31. package/dist/chunk-ZNF7XD2S.js.map +1 -0
  32. package/dist/config-AUAIYDSI.js +20 -0
  33. package/dist/config-AUAIYDSI.js.map +1 -0
  34. package/dist/fileFromPath-XN7LXIBI.js +134 -0
  35. package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
  36. package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
  37. package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
  38. package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
  39. package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
  40. package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
  41. package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
  42. package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
  43. package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
  44. package/dist/getMachineId-win-IIF36LEJ.js +44 -0
  45. package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
  46. package/dist/index.js +112703 -0
  47. package/dist/index.js.map +1 -0
  48. package/dist/lib-R6DEEJCP.js +7623 -0
  49. package/dist/lib-R6DEEJCP.js.map +1 -0
  50. package/dist/pipeline-IAVVAKTU.js +120 -0
  51. package/dist/pipeline-IAVVAKTU.js.map +1 -0
  52. package/dist/query-NTP5NVXN.js +25 -0
  53. package/dist/query-NTP5NVXN.js.map +1 -0
  54. package/dist/routing-BAEPFB7V.js +390 -0
  55. package/dist/routing-BAEPFB7V.js.map +1 -0
  56. package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
  57. package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
  58. package/dist/stripe-payment-history-SJPKA63N.js +67 -0
  59. package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
  60. package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
  61. package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
  62. package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
  63. package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
  64. package/dist/support-memory-WSG7SDKG.js +10 -0
  65. package/dist/support-memory-WSG7SDKG.js.map +1 -0
  66. package/package.json +10 -7
  67. package/.env.encrypted +0 -0
  68. package/CHANGELOG.md +0 -35
  69. package/data/tt-archive-dataset.json +0 -1
  70. package/data/validate-test-dataset.json +0 -97
  71. package/docs/CLI-AUTH.md +0 -504
  72. package/preload.ts +0 -18
  73. package/src/__tests__/init.test.ts +0 -74
  74. package/src/alignment-test.ts +0 -64
  75. package/src/check-apps.ts +0 -16
  76. package/src/commands/auth/decrypt.ts +0 -123
  77. package/src/commands/auth/encrypt.ts +0 -81
  78. package/src/commands/auth/index.ts +0 -50
  79. package/src/commands/auth/keygen.ts +0 -41
  80. package/src/commands/auth/status.ts +0 -164
  81. package/src/commands/axiom/forensic.ts +0 -868
  82. package/src/commands/axiom/index.ts +0 -697
  83. package/src/commands/build-dataset.ts +0 -311
  84. package/src/commands/db-status.ts +0 -47
  85. package/src/commands/deploys.ts +0 -219
  86. package/src/commands/eval-local/compare.ts +0 -171
  87. package/src/commands/eval-local/health.ts +0 -212
  88. package/src/commands/eval-local/index.ts +0 -76
  89. package/src/commands/eval-local/real-tools.ts +0 -416
  90. package/src/commands/eval-local/run.ts +0 -1168
  91. package/src/commands/eval-local/score-production.ts +0 -256
  92. package/src/commands/eval-local/seed.ts +0 -276
  93. package/src/commands/eval-pipeline/index.ts +0 -53
  94. package/src/commands/eval-pipeline/real-tools.ts +0 -492
  95. package/src/commands/eval-pipeline/run.ts +0 -1316
  96. package/src/commands/eval-pipeline/seed.ts +0 -395
  97. package/src/commands/eval-prompt.ts +0 -496
  98. package/src/commands/eval.test.ts +0 -253
  99. package/src/commands/eval.ts +0 -108
  100. package/src/commands/faq-classify.ts +0 -460
  101. package/src/commands/faq-cluster.ts +0 -135
  102. package/src/commands/faq-extract.ts +0 -249
  103. package/src/commands/faq-mine.ts +0 -432
  104. package/src/commands/faq-review.ts +0 -426
  105. package/src/commands/front/index.ts +0 -351
  106. package/src/commands/front/pull-conversations.ts +0 -275
  107. package/src/commands/front/tags.ts +0 -825
  108. package/src/commands/front-cache.ts +0 -1277
  109. package/src/commands/front-stats.ts +0 -75
  110. package/src/commands/health.test.ts +0 -82
  111. package/src/commands/health.ts +0 -362
  112. package/src/commands/init.test.ts +0 -89
  113. package/src/commands/init.ts +0 -106
  114. package/src/commands/inngest/client.ts +0 -294
  115. package/src/commands/inngest/events.ts +0 -296
  116. package/src/commands/inngest/investigate.ts +0 -382
  117. package/src/commands/inngest/runs.ts +0 -149
  118. package/src/commands/inngest/signal.ts +0 -143
  119. package/src/commands/kb-sync.ts +0 -498
  120. package/src/commands/memory/find.ts +0 -135
  121. package/src/commands/memory/get.ts +0 -87
  122. package/src/commands/memory/index.ts +0 -97
  123. package/src/commands/memory/stats.ts +0 -163
  124. package/src/commands/memory/store.ts +0 -49
  125. package/src/commands/memory/vote.ts +0 -159
  126. package/src/commands/pipeline.ts +0 -127
  127. package/src/commands/responses.ts +0 -856
  128. package/src/commands/tools.ts +0 -293
  129. package/src/commands/wizard.ts +0 -319
  130. package/src/index.ts +0 -172
  131. package/src/lib/crypto.ts +0 -56
  132. package/src/lib/env-loader.ts +0 -206
  133. package/src/lib/onepassword.ts +0 -137
  134. package/src/test-agent-local.ts +0 -115
  135. package/tsconfig.json +0 -11
  136. package/vitest.config.ts +0 -10
@@ -1,496 +0,0 @@
1
- /**
2
- * Prompt Evaluation Harness
3
- *
4
- * Tests prompt changes against real trigger messages.
5
- * Runs the agent with mocked tools, scores output quality.
6
- *
7
- * Usage:
8
- * skill eval-prompt # Run with current prompt
9
- * skill eval-prompt run --prompt /path/to/new.md # Test a different prompt
10
- * skill eval-prompt compare --candidate /path/to/new.md # Side-by-side comparison
11
- */
12
-
13
- import { readFileSync, writeFileSync, existsSync } from 'fs'
14
- import { join } from 'path'
15
- import type { Command } from 'commander'
16
- import { generateText, stepCountIs, tool } from 'ai'
17
- import { z } from 'zod'
18
-
19
- // Import the current production prompt
20
- import { SUPPORT_AGENT_PROMPT } from '@skillrecordings/core/agent'
21
-
22
- // ============================================================================
23
- // Quality Scorers (extracted from response-quality.eval.ts)
24
- // ============================================================================
25
-
26
/**
 * Phrases that expose internal routing, tooling, or configuration state.
 * Any match is reported under `ScoreResult.leaks`.
 */
const leakPatterns: readonly RegExp[] = [
  /no instructor (configured|routing|assigned|set up)/i,
  /can't route this/i,
  /unable to route/i,
  /no (instructor|channel|inbox) (is )?configured/i,
  /system (doesn't|does not|cannot|can't)/i,
  /not configured for this app/i,
  /routing (not )?(set up|configured)/i,
  /tool (failed|error|returned)/i,
  /API (error|failed|token)/i,
  /forwarding (to|this)/i,
  /I'll note that this/i,
  /You'll want to reach out through/i,
  /should be routed/i,
  /should go to/i,
  /falls outside/i,
]

/**
 * Phrases where the response talks about the message or about whether to
 * reply, instead of replying. Any match is reported under `ScoreResult.meta`.
 */
const metaPatterns: readonly RegExp[] = [
  // Anchored with `^` and no `m` flag: only matches at the very start of the text.
  /^This (is|appears to be|seems|looks like) (a |an )?(clearly )?/i,
  /I (won't|will not|shouldn't|should not) (respond|draft|reply)/i,
  /I don't need to respond/i,
  /this (should|needs to) (go to|be forwarded|be routed)/i,
  /per my guidelines/i,
  /outside (the scope|my scope|customer support)/i,
  /not a (support request|customer service issue)/i,
  /is clearly (not|meant|personal|business)/i,
  /This (falls|is) outside/i,
]

/**
 * Stock phrases that are not allowed in a draft.
 * Any match is reported under `ScoreResult.banned`.
 */
const bannedPatterns: readonly RegExp[] = [
  // Anchored with `^` and no `m` flag: only matches at the very start of the text.
  /^Great!/i,
  /I'd recommend/i,
  /I would recommend/i,
  /I'd suggest/i,
  /I would suggest/i,
  /Is there a specific area you're curious about/i,
  /Would you like help with/i,
  /Let me know if you have any other questions/i,
  /I hope this helps/i,
  /Happy to help/i,
  /I understand/i,
  /I hear you/i,
  /I apologize for any inconvenience/i,
  /Thanks (so much )?for (reaching out|sharing)/i,
  /—/, // em dash
]

/** Outcome of checking one response against the three pattern tables. */
interface ScoreResult {
  /** Matched text for each leak pattern that hit. */
  leaks: string[]
  /** Matched text for each meta-commentary pattern that hit. */
  meta: string[]
  /** Matched text for each banned-phrase pattern that hit. */
  banned: string[]
  /** True only when all three lists are empty. */
  passed: boolean
}

/**
 * Returns the first matched substring for every pattern that matches `text`,
 * in pattern order. Patterns are non-global, so each contributes at most one entry.
 */
function collectMatches(text: string, patterns: readonly RegExp[]): string[] {
  const hits: string[] = []
  for (const pattern of patterns) {
    const match = text.match(pattern)
    if (match) hits.push(match[0])
  }
  return hits
}

/**
 * Scores a response against the leak, meta-commentary, and banned-phrase tables.
 *
 * @param text - Response text to check.
 * @returns The matched substrings per category, plus `passed` when nothing matched.
 */
function scoreResponse(text: string): ScoreResult {
  const leaks = collectMatches(text, leakPatterns)
  const meta = collectMatches(text, metaPatterns)
  const banned = collectMatches(text, bannedPatterns)

  return {
    leaks,
    meta,
    banned,
    passed: leaks.length === 0 && meta.length === 0 && banned.length === 0,
  }
}
108
-
109
- // ============================================================================
110
- // Mock Tools (minimal implementations for eval)
111
- // ============================================================================
112
-
113
/**
 * Stand-in tool set handed to the model during evals. Every tool returns a
 * fixed payload or echoes its input; none of the bodies perform I/O.
 */
const mockTools = {
  // Always reports one found user with a single purchase.
  lookupUser: tool({
    description: 'Look up user by email',
    inputSchema: z.object({ email: z.string(), appId: z.string() }),
    execute: async () => {
      return {
        found: true,
        user: { id: 'mock-user', email: '[EMAIL]', name: 'Customer' },
        purchases: [{ id: 'purch-1', product: 'Total TypeScript', date: '2025-01-01' }],
      }
    },
  }),

  // Always returns empty result lists.
  searchKnowledge: tool({
    description: 'Search knowledge base',
    inputSchema: z.object({
      query: z.string(),
      appId: z.string(),
    }),
    execute: async () => {
      return { similarTickets: [], knowledge: [], goodResponses: [] }
    },
  }),

  // Echoes the drafted body; the eval runner reads the draft from this call's input.
  draftResponse: tool({
    description: 'Draft a response',
    inputSchema: z.object({
      body: z.string(),
    }),
    execute: async (args) => {
      return { drafted: true, body: args.body }
    },
  }),

  // Echoes the escalation request.
  escalateToHuman: tool({
    description: 'Escalate to human',
    inputSchema: z.object({ reason: z.string(), urgency: z.enum(['low', 'medium', 'high']) }),
    execute: async (args) => {
      return { escalated: true, reason: args.reason, urgency: args.urgency }
    },
  }),

  // Echoes the assignment request with a fixed pending status.
  assignToInstructor: tool({
    description: 'Assign to instructor',
    inputSchema: z.object({ conversationId: z.string(), reason: z.string() }),
    execute: async (args) => {
      return {
        status: 'pending_approval',
        conversationId: args.conversationId,
        reason: args.reason,
        message: 'Instructor assignment submitted for approval',
      }
    },
  }),

  // Always returns no memories.
  memory_search: tool({
    description: 'Search memory',
    inputSchema: z.object({
      query: z.string(),
    }),
    execute: async () => {
      return { results: [], total: 0 }
    },
  }),

  // Always returns no content hits.
  searchProductContent: tool({
    description: 'Search product content',
    inputSchema: z.object({
      query: z.string(),
    }),
    execute: async () => {
      return { results: [] }
    },
  }),
}
178
-
179
- // ============================================================================
180
- // Dataset Types
181
- // ============================================================================
182
-
183
/** One entry of the eval dataset JSON file. */
interface DatasetSample {
  id: string
  app: string
  conversationId: string
  /** Inbound message; `subject` and `body` are combined into the model input. */
  triggerMessage: { subject: string; body: string; timestamp: number }
  /** Recorded reply stored alongside the trigger; note `timestamp` is a string here. */
  agentResponse: { text: string; category: string; timestamp: string }
}
198
-
199
- // ============================================================================
200
- // Eval Runner
201
- // ============================================================================
202
-
203
/** Result of running the agent once against a single dataset sample. */
interface EvalResult {
  /** First 8 characters of the sample id. */
  id: string
  /** First 100 characters of the text sent to the model. */
  input: string
  /** Drafted body, `''` when no draft was produced, or `ERROR: …` when the run threw. */
  output: string
  /** Pattern-check outcome for `output`. */
  score: ScoreResult
  /** Wall-clock time for the run, in milliseconds. */
  durationMs: number
  /** Names of the tools the model called, in call order. */
  toolsCalled: string[]
  /** True when no `draftResponse` call was made, including when the run threw. */
  noDraft: boolean
}
212
-
213
/**
 * Runs the agent once on a single sample with the mock tools and scores the
 * body passed to `draftResponse`.
 *
 * Never rejects because of a model/tool failure: any error thrown by the run
 * is captured in `output` as `ERROR: …` with `noDraft: true` and a failing score.
 *
 * @param prompt - System prompt text under evaluation.
 * @param sample - Dataset entry supplying the trigger message.
 * @param model - Model identifier forwarded to `generateText`.
 * @returns The scored result for this sample.
 */
async function runSingleEval(
  prompt: string,
  sample: DatasetSample,
  model: string
): Promise<EvalResult> {
  const startTime = Date.now()
  const input = `Subject: ${sample.triggerMessage.subject}\n\n${sample.triggerMessage.body}`
  const id = sample.id.slice(0, 8)
  const inputPreview = input.slice(0, 100)

  try {
    const result = await generateText({
      model,
      // NOTE(review): the app is hard-coded although every sample carries `app` — confirm intended.
      system: prompt + '\n\nApp: total-typescript',
      messages: [{ role: 'user', content: input }],
      tools: mockTools,
      stopWhen: stepCountIs(10),
    })

    // Flatten once; both the tool-name list and the draft lookup read from it.
    const toolCalls = result.steps.flatMap(step => step.toolCalls || [])
    const toolsCalled = toolCalls.map(call => call.toolName)

    // The first draftResponse call carries the text that would be sent to the customer.
    const draftCall = toolCalls.find(call => call.toolName === 'draftResponse')
    const output = draftCall ? (draftCall.input as { body: string }).body : ''

    return {
      id,
      input: inputPreview,
      output,
      score: scoreResponse(output),
      durationMs: Date.now() - startTime,
      toolsCalled,
      noDraft: !draftCall,
    }
  } catch (error) {
    // Keep the cause visible even when something other than an Error was thrown.
    const reason = error instanceof Error ? error.message : String(error)
    return {
      id,
      input: inputPreview,
      output: `ERROR: ${reason}`,
      score: { leaks: [], meta: [], banned: [], passed: false },
      durationMs: Date.now() - startTime,
      toolsCalled: [],
      noDraft: true,
    }
  }
}
262
-
263
/**
 * Runs the eval over the first `limit` dataset samples, prints a summary, and
 * exits the process: code 0 when at least 80% of scored drafts passed, else 1.
 *
 * Samples that produced no draft are reported separately and excluded from the
 * pass rate. When nothing was scored the pass rate is treated as 0 (exit 1).
 *
 * With `json: true`, stdout carries only the JSON document; progress and the
 * human-readable summary go to stderr so the output can be piped to a parser.
 *
 * @param options.prompt - Path to a prompt file; the production prompt is used when omitted.
 * @param options.dataset - Path to the dataset JSON (default `data/eval-dataset.json`).
 * @param options.limit - Maximum number of samples (default 10); must be a positive integer.
 * @param options.model - Model identifier (default `anthropic/claude-haiku-4-5`).
 * @param options.output - When set, per-sample results are written to this path as JSON.
 * @param options.json - Emit the JSON report instead of the failure listing.
 */
async function runEval(options: {
  prompt?: string
  dataset?: string
  limit?: number
  model?: string
  output?: string
  json?: boolean
}): Promise<void> {
  const {
    prompt: promptPath,
    dataset: datasetPath = 'data/eval-dataset.json',
    limit = 10,
    model = 'anthropic/claude-haiku-4-5', // Fast + cheap for evals
    output: outputPath,
    json = false,
  } = options

  // Human-readable output must not interleave with the JSON document on stdout.
  const log = (message: string): void => {
    if (json) console.error(message)
    else console.log(message)
  }
  const progressStream = json ? process.stderr : process.stdout

  // A NaN limit (e.g. `--limit abc`) would otherwise silently yield an empty run.
  if (!Number.isInteger(limit) || limit < 1) {
    console.error(`Invalid limit: ${limit} (expected a positive integer)`)
    process.exit(1)
  }

  // Load prompt
  let prompt = SUPPORT_AGENT_PROMPT
  if (promptPath) {
    if (!existsSync(promptPath)) {
      console.error(`Prompt file not found: ${promptPath}`)
      process.exit(1)
    }
    prompt = readFileSync(promptPath, 'utf-8')
    log(`Using prompt from: ${promptPath}`)
  } else {
    log('Using production prompt')
  }

  // Load dataset
  if (!existsSync(datasetPath)) {
    console.error(`Dataset not found: ${datasetPath}`)
    process.exit(1)
  }
  let parsed: unknown
  try {
    parsed = JSON.parse(readFileSync(datasetPath, 'utf-8'))
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.error(`Dataset is not valid JSON: ${datasetPath} (${reason})`)
    process.exit(1)
  }
  if (!Array.isArray(parsed)) {
    console.error(`Dataset must be a JSON array: ${datasetPath}`)
    process.exit(1)
  }
  const dataset: DatasetSample[] = parsed
  const samples = dataset.slice(0, limit)

  log(`\n🧪 Running eval on ${samples.length} samples (model: ${model})\n`)

  const results: EvalResult[] = []
  let passed = 0
  let failed = 0
  let noDraft = 0

  for (let i = 0; i < samples.length; i++) {
    progressStream.write(`\r Processing ${i + 1}/${samples.length}...`)
    const sample = samples[i]
    if (!sample) continue
    const result = await runSingleEval(prompt, sample, model)
    results.push(result)

    if (result.noDraft) {
      noDraft++
    } else if (result.score.passed) {
      passed++
    } else {
      failed++
    }
  }

  log('\n')

  // Summary (guard the percentage so an empty dataset prints 0.0% rather than NaN%)
  const passedPct = samples.length === 0 ? '0.0' : ((passed / samples.length) * 100).toFixed(1)
  log('📊 Results:')
  log(` ✅ Passed: ${passed}/${samples.length} (${passedPct}%)`)
  log(` ❌ Failed: ${failed}/${samples.length}`)
  log(` 🚫 No draft: ${noDraft}/${samples.length}`)

  // Issue breakdown
  const allLeaks = results.flatMap(r => r.score.leaks)
  const allMeta = results.flatMap(r => r.score.meta)
  const allBanned = results.flatMap(r => r.score.banned)

  log('\n📋 Issue breakdown:')
  log(` 🚨 Internal leaks: ${allLeaks.length}`)
  log(` 💬 Meta-commentary: ${allMeta.length}`)
  log(` 🚫 Banned phrases: ${allBanned.length}`)

  // Show failures (at most 10), skipped in JSON mode
  const failures = results.filter(r => !r.noDraft && !r.score.passed)
  if (failures.length > 0 && !json) {
    console.log('\n--- FAILURES ---\n')
    for (const f of failures.slice(0, 10)) {
      const issues = [
        ...f.score.leaks.map(l => `LEAK: "${l}"`),
        ...f.score.meta.map(m => `META: "${m}"`),
        ...f.score.banned.map(b => `BANNED: "${b}"`),
      ]
      console.log(`[${f.id}] ${issues.join(', ')}`)
      console.log(` Output: ${f.output.slice(0, 150)}...\n`)
    }
  }

  // JSON output (the only thing written to stdout in JSON mode)
  if (json) {
    console.log(JSON.stringify({
      summary: { total: samples.length, passed, failed, noDraft },
      issues: { leaks: allLeaks.length, meta: allMeta.length, banned: allBanned.length },
      results,
    }, null, 2))
  }

  // Save results
  if (outputPath) {
    writeFileSync(outputPath, JSON.stringify(results, null, 2))
    log(`\nSaved results to ${outputPath}`)
  }

  // Exit code based on pass rate over scored drafts; nothing scored counts as 0, not NaN.
  const scored = passed + failed
  const passRate = scored === 0 ? 0 : passed / scored
  process.exit(passRate >= 0.8 ? 0 : 1)
}
376
-
377
/**
 * Runs a baseline and a candidate prompt over the same samples, prints pass
 * rates and per-sample movement, and exits the process: code 1 when the
 * candidate passes fewer samples than the baseline, otherwise 0.
 *
 * @param options.baseline - Path to the baseline prompt; the production prompt is used when omitted.
 * @param options.candidate - Path to the candidate prompt file.
 * @param options.dataset - Path to the dataset JSON (default `data/eval-dataset.json`).
 * @param options.limit - Maximum number of samples (default 10); must be a positive integer.
 * @param options.model - Model identifier (default `anthropic/claude-haiku-4-5`).
 */
async function comparePrompts(options: {
  baseline?: string
  candidate: string
  dataset?: string
  limit?: number
  model?: string
}): Promise<void> {
  const {
    baseline,
    candidate,
    dataset: datasetPath = 'data/eval-dataset.json',
    limit = 10,
    model = 'anthropic/claude-haiku-4-5',
  } = options

  // Report a missing file the same way runEval does instead of surfacing a raw ENOENT.
  const readOrExit = (path: string, label: string): string => {
    if (!existsSync(path)) {
      console.error(`${label} not found: ${path}`)
      process.exit(1)
    }
    return readFileSync(path, 'utf-8')
  }
  const issueCount = (score: ScoreResult): number =>
    score.leaks.length + score.meta.length + score.banned.length
  const percent = (count: number, total: number): string =>
    total === 0 ? '0.0' : ((count / total) * 100).toFixed(1)

  // A NaN limit (e.g. `--limit abc`) would otherwise silently yield an empty comparison.
  if (!Number.isInteger(limit) || limit < 1) {
    console.error(`Invalid limit: ${limit} (expected a positive integer)`)
    process.exit(1)
  }

  // Load prompts
  const baselinePrompt = baseline
    ? readOrExit(baseline, 'Prompt file')
    : SUPPORT_AGENT_PROMPT
  const candidatePrompt = readOrExit(candidate, 'Prompt file')

  // Load dataset
  let parsed: unknown
  try {
    parsed = JSON.parse(readOrExit(datasetPath, 'Dataset'))
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.error(`Dataset is not valid JSON: ${datasetPath} (${reason})`)
    process.exit(1)
  }
  if (!Array.isArray(parsed)) {
    console.error(`Dataset must be a JSON array: ${datasetPath}`)
    process.exit(1)
  }
  const dataset: DatasetSample[] = parsed
  const samples = dataset.slice(0, limit)

  console.log(`\n🔬 Comparing prompts on ${samples.length} samples\n`)
  console.log(` Baseline: ${baseline || 'production'}`)
  console.log(` Candidate: ${candidate}`)
  console.log('')

  let baselinePassed = 0
  let candidatePassed = 0
  const comparisons: Array<{
    id: string
    baselineScore: ScoreResult
    candidateScore: ScoreResult
    improved: boolean
    regressed: boolean
  }> = []

  for (let i = 0; i < samples.length; i++) {
    const sample = samples[i]
    if (!sample) continue
    process.stdout.write(`\r Processing ${i + 1}/${samples.length}...`)

    // The two runs are independent, so evaluate both prompts for this sample concurrently.
    const [baselineResult, candidateResult] = await Promise.all([
      runSingleEval(baselinePrompt, sample, model),
      runSingleEval(candidatePrompt, sample, model),
    ])

    if (!baselineResult.noDraft && baselineResult.score.passed) baselinePassed++
    if (!candidateResult.noDraft && candidateResult.score.passed) candidatePassed++

    // NOTE(review): a run with no draft (or one that errored) has zero issues, so it can
    // count as "improved" over a baseline draft that had issues — confirm that is intended.
    const baselineIssues = issueCount(baselineResult.score)
    const candidateIssues = issueCount(candidateResult.score)

    comparisons.push({
      id: sample.id.slice(0, 8),
      baselineScore: baselineResult.score,
      candidateScore: candidateResult.score,
      improved: candidateIssues < baselineIssues,
      regressed: candidateIssues > baselineIssues,
    })
  }

  console.log('\n\n📊 Comparison Results:\n')
  console.log(` Baseline pass rate: ${baselinePassed}/${samples.length} (${percent(baselinePassed, samples.length)}%)`)
  console.log(` Candidate pass rate: ${candidatePassed}/${samples.length} (${percent(candidatePassed, samples.length)}%)`)

  const improved = comparisons.filter(c => c.improved).length
  const regressed = comparisons.filter(c => c.regressed).length
  const same = comparisons.length - improved - regressed

  console.log(`\n ⬆️ Improved: ${improved}`)
  console.log(` ⬇️ Regressed: ${regressed}`)
  console.log(` ➡️ Same: ${same}`)

  // The verdict is driven by pass counts only; a tie exits 0.
  if (candidatePassed > baselinePassed) {
    console.log('\n✅ Candidate is BETTER')
    process.exit(0)
  } else if (candidatePassed < baselinePassed) {
    console.log('\n❌ Candidate is WORSE')
    process.exit(1)
  } else {
    console.log('\n➡️ No significant difference')
    process.exit(0)
  }
}
463
-
464
- // ============================================================================
465
- // CLI Registration
466
- // ============================================================================
467
-
468
/**
 * Registers the `eval-prompt` command group on the given program:
 * `eval-prompt run`, `eval-prompt compare`, and a bare `eval-prompt` that
 * runs the eval with default settings.
 *
 * @param program - Commander program (or parent command) to attach to.
 */
export function registerEvalPromptCommands(program: Command): void {
  // Commander calls option parsers as (value, previousValue). Passing `parseInt`
  // directly would use previousValue as the radix, so parse explicitly in base 10
  // and reject anything that is not a positive integer.
  const parseLimit = (value: string): number => {
    const parsed = Number.parseInt(value, 10)
    if (!Number.isInteger(parsed) || parsed < 1) {
      console.error(`Invalid --limit value: ${value} (expected a positive integer)`)
      process.exit(1)
    }
    return parsed
  }

  const evalPrompt = program
    .command('eval-prompt')
    .description('Evaluate prompt quality against real trigger messages')

  evalPrompt
    .command('run')
    .description('Run eval with a prompt')
    .option('-p, --prompt <file>', 'Path to prompt file (default: production)')
    .option('-d, --dataset <file>', 'Path to dataset (default: data/eval-dataset.json)')
    .option('-l, --limit <n>', 'Max samples to eval', parseLimit)
    .option('-m, --model <model>', 'Model to use (default: claude-haiku-4-5)')
    .option('-o, --output <file>', 'Save results to file')
    .option('--json', 'JSON output')
    .action(runEval)

  evalPrompt
    .command('compare')
    .description('Compare two prompts side-by-side')
    .requiredOption('-c, --candidate <file>', 'Candidate prompt file')
    .option('-b, --baseline <file>', 'Baseline prompt (default: production)')
    .option('-d, --dataset <file>', 'Path to dataset')
    .option('-l, --limit <n>', 'Max samples', parseLimit)
    .option('-m, --model <model>', 'Model to use')
    .action(comparePrompts)

  // Default action runs eval. No options are declared on the parent command,
  // so flags such as --prompt are only available via `eval-prompt run`.
  evalPrompt.action(runEval)
}