@skillrecordings/cli 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/bin/skill.mjs +21 -0
  2. package/dist/chunk-2NCCVTEE.js +22342 -0
  3. package/dist/chunk-2NCCVTEE.js.map +1 -0
  4. package/dist/chunk-3E3GYSZR.js +7071 -0
  5. package/dist/chunk-3E3GYSZR.js.map +1 -0
  6. package/dist/chunk-F4EM72IH.js +86 -0
  7. package/dist/chunk-F4EM72IH.js.map +1 -0
  8. package/dist/chunk-FGP7KUQW.js +432 -0
  9. package/dist/chunk-FGP7KUQW.js.map +1 -0
  10. package/dist/chunk-H3D6VCME.js +55 -0
  11. package/dist/chunk-H3D6VCME.js.map +1 -0
  12. package/dist/chunk-HK3PEWFD.js +208 -0
  13. package/dist/chunk-HK3PEWFD.js.map +1 -0
  14. package/dist/chunk-KEV3QKXP.js +4495 -0
  15. package/dist/chunk-KEV3QKXP.js.map +1 -0
  16. package/dist/chunk-MG37YDAK.js +882 -0
  17. package/dist/chunk-MG37YDAK.js.map +1 -0
  18. package/dist/chunk-MLNDSBZ4.js +482 -0
  19. package/dist/chunk-MLNDSBZ4.js.map +1 -0
  20. package/dist/chunk-N2WIV2JV.js +22 -0
  21. package/dist/chunk-N2WIV2JV.js.map +1 -0
  22. package/dist/chunk-PWWRCN5W.js +2067 -0
  23. package/dist/chunk-PWWRCN5W.js.map +1 -0
  24. package/dist/chunk-SKHBM3XP.js +7746 -0
  25. package/dist/chunk-SKHBM3XP.js.map +1 -0
  26. package/dist/chunk-WFANXVQG.js +64 -0
  27. package/dist/chunk-WFANXVQG.js.map +1 -0
  28. package/dist/chunk-WYKL32C3.js +275 -0
  29. package/dist/chunk-WYKL32C3.js.map +1 -0
  30. package/dist/chunk-ZNF7XD2S.js +134 -0
  31. package/dist/chunk-ZNF7XD2S.js.map +1 -0
  32. package/dist/config-AUAIYDSI.js +20 -0
  33. package/dist/config-AUAIYDSI.js.map +1 -0
  34. package/dist/fileFromPath-XN7LXIBI.js +134 -0
  35. package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
  36. package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
  37. package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
  38. package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
  39. package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
  40. package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
  41. package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
  42. package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
  43. package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
  44. package/dist/getMachineId-win-IIF36LEJ.js +44 -0
  45. package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
  46. package/dist/index.js +112703 -0
  47. package/dist/index.js.map +1 -0
  48. package/dist/lib-R6DEEJCP.js +7623 -0
  49. package/dist/lib-R6DEEJCP.js.map +1 -0
  50. package/dist/pipeline-IAVVAKTU.js +120 -0
  51. package/dist/pipeline-IAVVAKTU.js.map +1 -0
  52. package/dist/query-NTP5NVXN.js +25 -0
  53. package/dist/query-NTP5NVXN.js.map +1 -0
  54. package/dist/routing-BAEPFB7V.js +390 -0
  55. package/dist/routing-BAEPFB7V.js.map +1 -0
  56. package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
  57. package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
  58. package/dist/stripe-payment-history-SJPKA63N.js +67 -0
  59. package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
  60. package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
  61. package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
  62. package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
  63. package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
  64. package/dist/support-memory-WSG7SDKG.js +10 -0
  65. package/dist/support-memory-WSG7SDKG.js.map +1 -0
  66. package/package.json +10 -7
  67. package/.env.encrypted +0 -0
  68. package/CHANGELOG.md +0 -35
  69. package/data/tt-archive-dataset.json +0 -1
  70. package/data/validate-test-dataset.json +0 -97
  71. package/docs/CLI-AUTH.md +0 -504
  72. package/preload.ts +0 -18
  73. package/src/__tests__/init.test.ts +0 -74
  74. package/src/alignment-test.ts +0 -64
  75. package/src/check-apps.ts +0 -16
  76. package/src/commands/auth/decrypt.ts +0 -123
  77. package/src/commands/auth/encrypt.ts +0 -81
  78. package/src/commands/auth/index.ts +0 -50
  79. package/src/commands/auth/keygen.ts +0 -41
  80. package/src/commands/auth/status.ts +0 -164
  81. package/src/commands/axiom/forensic.ts +0 -868
  82. package/src/commands/axiom/index.ts +0 -697
  83. package/src/commands/build-dataset.ts +0 -311
  84. package/src/commands/db-status.ts +0 -47
  85. package/src/commands/deploys.ts +0 -219
  86. package/src/commands/eval-local/compare.ts +0 -171
  87. package/src/commands/eval-local/health.ts +0 -212
  88. package/src/commands/eval-local/index.ts +0 -76
  89. package/src/commands/eval-local/real-tools.ts +0 -416
  90. package/src/commands/eval-local/run.ts +0 -1168
  91. package/src/commands/eval-local/score-production.ts +0 -256
  92. package/src/commands/eval-local/seed.ts +0 -276
  93. package/src/commands/eval-pipeline/index.ts +0 -53
  94. package/src/commands/eval-pipeline/real-tools.ts +0 -492
  95. package/src/commands/eval-pipeline/run.ts +0 -1316
  96. package/src/commands/eval-pipeline/seed.ts +0 -395
  97. package/src/commands/eval-prompt.ts +0 -496
  98. package/src/commands/eval.test.ts +0 -253
  99. package/src/commands/eval.ts +0 -108
  100. package/src/commands/faq-classify.ts +0 -460
  101. package/src/commands/faq-cluster.ts +0 -135
  102. package/src/commands/faq-extract.ts +0 -249
  103. package/src/commands/faq-mine.ts +0 -432
  104. package/src/commands/faq-review.ts +0 -426
  105. package/src/commands/front/index.ts +0 -351
  106. package/src/commands/front/pull-conversations.ts +0 -275
  107. package/src/commands/front/tags.ts +0 -825
  108. package/src/commands/front-cache.ts +0 -1277
  109. package/src/commands/front-stats.ts +0 -75
  110. package/src/commands/health.test.ts +0 -82
  111. package/src/commands/health.ts +0 -362
  112. package/src/commands/init.test.ts +0 -89
  113. package/src/commands/init.ts +0 -106
  114. package/src/commands/inngest/client.ts +0 -294
  115. package/src/commands/inngest/events.ts +0 -296
  116. package/src/commands/inngest/investigate.ts +0 -382
  117. package/src/commands/inngest/runs.ts +0 -149
  118. package/src/commands/inngest/signal.ts +0 -143
  119. package/src/commands/kb-sync.ts +0 -498
  120. package/src/commands/memory/find.ts +0 -135
  121. package/src/commands/memory/get.ts +0 -87
  122. package/src/commands/memory/index.ts +0 -97
  123. package/src/commands/memory/stats.ts +0 -163
  124. package/src/commands/memory/store.ts +0 -49
  125. package/src/commands/memory/vote.ts +0 -159
  126. package/src/commands/pipeline.ts +0 -127
  127. package/src/commands/responses.ts +0 -856
  128. package/src/commands/tools.ts +0 -293
  129. package/src/commands/wizard.ts +0 -319
  130. package/src/index.ts +0 -172
  131. package/src/lib/crypto.ts +0 -56
  132. package/src/lib/env-loader.ts +0 -206
  133. package/src/lib/onepassword.ts +0 -137
  134. package/src/test-agent-local.ts +0 -115
  135. package/tsconfig.json +0 -11
  136. package/vitest.config.ts +0 -10
@@ -1,1316 +0,0 @@
1
- /**
2
- * Pipeline step evaluation runner
3
- *
4
- * Runs actual pipeline steps against labeled scenarios and measures accuracy.
5
- */
6
-
7
- import { createHash } from 'crypto'
8
- import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'fs'
9
- import { dirname, join } from 'path'
10
- import {
11
- type ClassifyInput,
12
- type ClassifyOutput,
13
- type GatherOutput,
14
- type MessageCategory,
15
- type RouteAction,
16
- type RouteOutput,
17
- type ValidateOutput,
18
- type ValidationIssueType,
19
- classify,
20
- route,
21
- validate,
22
- } from '@skillrecordings/core/pipeline'
23
- import { readFile, writeFile } from 'fs/promises'
24
- import { glob } from 'glob'
25
- import {
26
- cleanupRealTools,
27
- createRealTools,
28
- initRealTools,
29
- isRealToolsAvailable,
30
- } from './real-tools'
31
-
32
- // ============================================================================
33
- // Concurrency helpers
34
- // ============================================================================
35
-
36
/**
 * Run `fn` over `items` in sequential batches of at most `concurrency` items.
 *
 * Items inside one batch run concurrently via `Promise.all`; the next batch
 * starts only once every item of the current batch has resolved. Results are
 * returned in the same order as `items`.
 *
 * @param items - Inputs to process.
 * @param fn - Async worker; receives the item and its index within `items`.
 * @param concurrency - Maximum batch size. NaN and values below 1 fall back to
 *   1 and fractions are floored, so the loop always advances and no item is
 *   skipped. `Infinity` processes everything in a single batch.
 * @returns The worker results, in input order.
 * @throws Whatever `fn` rejects with; later batches are then not started.
 */
async function runBatch<T, R>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>,
  concurrency: number
): Promise<R[]> {
  // A step of 0 or less would never terminate, and NaN would end the loop
  // after one empty batch, silently dropping every item.
  const batchSize =
    Number.isNaN(concurrency) || concurrency < 1 ? 1 : Math.floor(concurrency)

  const results: R[] = []
  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize)
    const batchResults = await Promise.all(
      batch.map((item, offset) => fn(item, start + offset))
    )
    results.push(...batchResults)
  }
  return results
}
54
-
55
/**
 * Run `fn` over `items` in sequential batches, optionally stopping early.
 *
 * Items inside one batch run concurrently via `Promise.all`. When `failFast`
 * is set and any result of a batch has `passed === false`, no further batch is
 * started. The batch that contained the failure is still fully included in
 * the returned results.
 *
 * @param items - Inputs to process.
 * @param fn - Async worker; receives the item and its index within `items`.
 * @param concurrency - Maximum batch size. NaN and values below 1 fall back to
 *   1 and fractions are floored, so the loop always advances.
 * @param failFast - Stop after the first batch that contains a failed result.
 * @returns `results` in input order and `aborted`, which is true when a
 *   failing batch triggered the fail-fast stop (even if it was the last batch).
 * @throws Whatever `fn` rejects with.
 */
async function runBatchWithFailFast<T, R extends { passed: boolean }>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>,
  concurrency: number,
  failFast: boolean
): Promise<{ results: R[]; aborted: boolean }> {
  // A step of 0 or less would never terminate, and NaN would end the loop
  // after one empty batch, silently dropping every item.
  const batchSize =
    Number.isNaN(concurrency) || concurrency < 1 ? 1 : Math.floor(concurrency)

  const results: R[] = []
  let aborted = false

  for (let start = 0; start < items.length && !aborted; start += batchSize) {
    const batch = items.slice(start, start + batchSize)
    const batchResults = await Promise.all(
      batch.map((item, offset) => fn(item, start + offset))
    )
    results.push(...batchResults)

    if (failFast && batchResults.some((r) => !r.passed)) {
      aborted = true
    }
  }

  return { results, aborted }
}
81
-
82
- // ============================================================================
83
- // Classify cache helpers
84
- // ============================================================================
85
-
86
- const CACHE_DIR = '.eval-cache' // Directory holding cached classify results; a relative path, so it resolves against the current working directory
87
-
88
/**
 * Build the cache entry name for one scenario's classify result.
 *
 * The key combines the scenario id with the first 8 characters of the
 * classify source hash, so a different hash yields a different key.
 *
 * @param scenarioId - Scenario identifier.
 * @param classifySourceHash - Hash identifying the classify implementation.
 * @returns Key of the form `classify-<scenarioId>-<hash prefix>`.
 */
function getCacheKey(scenarioId: string, classifySourceHash: string): string {
  const hashPrefix = classifySourceHash.slice(0, 8)
  const scenarioPart = `classify-${scenarioId}`
  return `${scenarioPart}-${hashPrefix}`
}
91
-
92
/**
 * Compute the hash used to invalidate cached classify results.
 *
 * Preferred source is the content of `classify.ts` in the core package, looked
 * up relative to the current working directory (monorepo root or a sibling
 * package). When that file cannot be found or read, the hash is derived from
 * the current one-hour time bucket instead, so cached entries are reused for
 * at most about one hour.
 *
 * @returns MD5 hex digest (32 characters). MD5 is used only as a cheap
 *   content fingerprint, not for security.
 */
function getClassifySourceHash(): string {
  try {
    const candidatePaths = [
      join(process.cwd(), 'packages/core/src/pipeline/classify.ts'),
      join(process.cwd(), '../core/src/pipeline/classify.ts'),
    ]
    for (const path of candidatePaths) {
      if (existsSync(path)) {
        const content = readFileSync(path, 'utf-8')
        return createHash('md5').update(content).digest('hex')
      }
    }
  } catch {
    // Source could not be read: fall through to time-based invalidation.
  }

  // Time-based fallback. The bucket is one hour wide, matching the documented
  // intent; the previous divisor (300000 ms) rotated the hash every 5 minutes.
  const FALLBACK_WINDOW_MS = 60 * 60 * 1000
  const timeBucket = Math.floor(Date.now() / FALLBACK_WINDOW_MS)
  return createHash('md5').update(timeBucket.toString()).digest('hex')
}
114
-
115
/**
 * Read a cached classify result from disk.
 *
 * @param cacheKey - Key naming the entry; the file is `<CACHE_DIR>/<key>.json`.
 * @returns The parsed JSON content, or `null` when the entry does not exist
 *   or cannot be read or parsed. The parsed value is not validated.
 */
function loadCachedClassify(cacheKey: string): ClassifyOutput | null {
  const entryFile = join(CACHE_DIR, `${cacheKey}.json`)
  try {
    if (!existsSync(entryFile)) {
      return null
    }
    const raw = readFileSync(entryFile, 'utf-8')
    return JSON.parse(raw)
  } catch {
    // Unreadable or corrupt entry is treated as a cache miss
    return null
  }
}
126
-
127
/**
 * Persist a classify result to the on-disk cache (best effort).
 *
 * Creates the cache directory when it is missing. Any failure while creating
 * the directory, serializing, or writing is deliberately ignored: the cache is
 * an optimization only.
 *
 * @param cacheKey - Key naming the entry; the file is `<CACHE_DIR>/<key>.json`.
 * @param result - Classify output to store as JSON.
 */
function saveCachedClassify(cacheKey: string, result: ClassifyOutput): void {
  try {
    const dirMissing = !existsSync(CACHE_DIR)
    if (dirMissing) {
      mkdirSync(CACHE_DIR, { recursive: true })
    }
    const serialized = JSON.stringify(result)
    writeFileSync(join(CACHE_DIR, `${cacheKey}.json`), serialized)
  } catch {
    // Cache writes are optional; failures are ignored on purpose
  }
}
138
-
139
/**
 * Delete the whole classify cache directory, if present (best effort).
 *
 * Errors while checking or removing the directory are ignored.
 */
function clearClassifyCache(): void {
  try {
    if (!existsSync(CACHE_DIR)) {
      return
    }
    rmSync(CACHE_DIR, { recursive: true, force: true })
  } catch {
    // Nothing useful to do if the cache cannot be removed
  }
}
148
-
149
- // ============================================================================
150
- // Types
151
- // ============================================================================
152
-
153
/**
 * Pipeline stage an eval run can target.
 *
 * `run` dispatches 'classify', 'route', 'gather', 'validate' and 'e2e' to
 * their evaluators; 'draft' is accepted as a name but reported as not yet
 * implemented.
 */
type PipelineStep = 'classify' | 'route' | 'gather' | 'draft' | 'validate' | 'e2e'
160
-
161
/** Options accepted by `run`. */
interface RunOptions {
  /** Pipeline stage to evaluate. */
  step: PipelineStep
  /** Glob matching scenario JSON files (used when `dataset` is not given). */
  scenarios?: string
  /** Path to a JSON dataset file; takes precedence over `scenarios`. */
  dataset?: string
  /** Maximum number of scenarios to run (applied after the quick filter). */
  limit?: number
  /** Print details for failing scenarios instead of a progress counter. */
  verbose?: boolean
  /** Print `{ metrics, results }` as JSON and suppress informational banners. */
  json?: boolean
  /** Model name forwarded to `classify`. */
  model?: string
  /** Forwarded to `classify` as `forceLLM`. */
  forceLlm?: boolean
  /** Connect to the Docker-hosted services (MySQL, Qdrant) for real tool calls. */
  realTools?: boolean
  /** Batch size for concurrent scenario processing; `run` defaults it to 10. */
  parallel?: number
  /** Reuse classify results cached on disk. */
  cacheClassify?: boolean
  /** Delete the classify cache before running. */
  clearCache?: boolean
  /** Stop after the first batch that contains a failing scenario. */
  failFast?: boolean
  /** Run only the smoke-test subset of scenarios. */
  quick?: boolean
}
177
-
178
/** One labeled eval scenario, as loaded from a scenario file or a dataset. */
interface Scenario {
  id: string
  name?: string
  /** Incoming message; evaluators read `trigger` first, then `triggerMessage`. */
  trigger?: { subject: string; body: string }
  triggerMessage?: { subject: string; body: string }
  appId?: string

  // Expected values for evals
  expectedCategory?: MessageCategory
  expectedAction?: RouteAction
  expectedBehavior?: string
  /** Fallback for backwards compat. */
  category?: string

  // Validate eval fields
  /** Pre-provided draft to validate. */
  draft?: string
  assertions?: {
    noFabrication?: boolean
    noMetaCommentary?: boolean
    noInternalLeak?: boolean
    noBannedPhrases?: boolean
    mustNotContain?: string[]
  }

  /** Context for validation (optional); `customer` is also used as the customer email by the gather eval. */
  context?: {
    customer?: string
    conversation?: unknown
  }
}
204
-
205
/** Outcome of evaluating a single scenario for one pipeline step. */
interface StepResult {
  scenarioId: string
  /** Whether `actual` matched `expected` (or the step's pass criterion held). */
  passed: boolean
  expected: string
  /** Observed value; evaluators use an `ERROR: ...` string when the step threw. */
  actual: string
  /** Classifier confidence, when the step reports one. */
  confidence?: number
  /** Wall-clock time spent on the scenario, in milliseconds. */
  durationMs: number
  reasoning?: string
}
214
-
215
/** Aggregate metrics for one eval run. */
interface EvalMetrics {
  total: number
  passed: number
  failed: number
  accuracy: number
  durationMs: number
  /**
   * Per-category/action breakdown, keyed by category or action name.
   * NOTE(review): tp/fp/fn presumably mean true positive / false positive /
   * false negative counts; confirm against the metrics computation.
   */
  breakdown: Record<
    string,
    { tp: number; fp: number; fn: number; precision: number; recall: number }
  >

  // Special metrics
  /** For route: incorrectly silenced. */
  falseSilenceRate?: number
  /** For route: incorrectly responded. */
  falseRespondRate?: number
}
230
-
231
- // ============================================================================
232
- // Main runner
233
- // ============================================================================
234
-
235
/**
 * Run a pipeline step eval: load scenarios, execute the selected evaluator,
 * then print metrics.
 *
 * Flow: optionally clear the classify cache, load scenarios, apply the quick
 * filter and the limit, validate the requested step, optionally connect to the
 * Docker-hosted services, run the evaluator, clean up connections, and print
 * either JSON (`{ metrics, results }`) or a formatted report.
 *
 * The process exits with code 1 when the step is unsupported or when
 * `realTools` is requested but neither MySQL nor Qdrant could be reached.
 *
 * @param options - See `RunOptions`; `parallel` defaults to 10.
 */
export async function run(options: RunOptions): Promise<void> {
  const {
    step,
    scenarios: scenarioGlob,
    dataset,
    limit,
    verbose,
    json,
    model,
    forceLlm,
    realTools,
    parallel = 10,
    cacheClassify,
    clearCache,
    failFast,
    quick,
  } = options

  // Clear cache if requested
  if (clearCache) {
    clearClassifyCache()
    if (!json) {
      console.log('🗑️ Cleared classify cache\n')
    }
  }

  // Load scenarios
  let scenarios = await loadScenarios(scenarioGlob, dataset)

  // Apply quick filter (smoke test subset)
  if (quick) {
    scenarios = filterQuickScenarios(scenarios)
    if (!json) {
      console.log(`⚡ Quick mode: filtered to ${scenarios.length} scenarios\n`)
    }
  }

  if (limit && limit < scenarios.length) {
    scenarios = scenarios.slice(0, limit)
  }

  if (!json) {
    const parallelInfo = parallel > 1 ? ` (parallel: ${parallel})` : ''
    const flags = [
      cacheClassify ? 'cache' : null,
      failFast ? 'fail-fast' : null,
    ]
      .filter(Boolean)
      .join(', ')
    const flagsInfo = flags ? ` [${flags}]` : ''
    console.log(
      `\n🧪 Running ${step} eval on ${scenarios.length} scenarios${parallelInfo}${flagsInfo}\n`
    )
  }

  // Reject unsupported steps BEFORE any service connection is opened.
  // process.exit() terminates synchronously, so exiting from inside the
  // try block below would skip the `finally` cleanup of real-tool connections.
  if (step === 'draft') {
    console.error(
      `Step "${step}" not yet implemented. Use e2e for full pipeline.`
    )
    process.exit(1)
  }
  const runnableSteps = ['classify', 'route', 'gather', 'validate', 'e2e']
  if (!runnableSteps.includes(step)) {
    console.error(`Unknown step: ${step}`)
    process.exit(1)
  }

  // Initialize real tools if requested
  if (realTools) {
    if (!json) {
      console.log('🔌 Connecting to Docker services...')
    }
    const status = await initRealTools(undefined, verbose && !json)

    if (!status.mysql && !status.qdrant) {
      console.error('❌ Failed to connect to any Docker services')
      console.error(' Make sure MySQL (3306) and Qdrant (6333) are running')
      process.exit(1)
    }

    if (!json) {
      console.log('')
    }
  }

  const startTime = Date.now()
  let results: StepResult[] = []

  try {
    const evalOptions = {
      verbose,
      model,
      forceLlm,
      realTools,
      parallel,
      cacheClassify,
      failFast,
    }

    switch (step) {
      case 'classify':
        results = await runClassifyEval(scenarios, evalOptions)
        break
      case 'route':
        results = await runRouteEval(scenarios, evalOptions)
        break
      case 'gather':
        results = await runGatherEval(scenarios, evalOptions)
        break
      case 'validate':
        results = await runValidateEval(scenarios, evalOptions)
        break
      case 'e2e':
        results = await runE2EEval(scenarios, evalOptions)
        break
    }
  } finally {
    // Clean up real tools connections, also when an evaluator throws
    if (realTools) {
      await cleanupRealTools()
    }
  }

  const totalDuration = Date.now() - startTime
  const metrics = computeMetrics(results, step, totalDuration)

  if (json) {
    console.log(JSON.stringify({ metrics, results }, null, 2))
  } else {
    printMetrics(step, metrics, verbose ? results : undefined)
  }
}
363
-
364
- // ============================================================================
365
- // Scenario loading
366
- // ============================================================================
367
-
368
/**
 * Load eval scenarios from a dataset file or from scenario files.
 *
 * - `datasetPath` (checked first): a JSON file containing an array of items.
 *   Each item is normalized to a `Scenario`; a missing expected category or
 *   action is inferred from the item. Validate-eval fields (`draft`,
 *   `assertions`, `context`) and the `smoke` flag read by the quick filter
 *   are passed through so dataset-based runs can use them.
 * - `scenarioGlob`: every matching file is parsed as JSON and returned as-is.
 *
 * Exits the process with code 1 when neither source is given or when the glob
 * matches no files.
 *
 * @param scenarioGlob - Glob pattern for scenario JSON files.
 * @param datasetPath - Path to a JSON dataset file.
 * @returns The loaded scenarios.
 * @throws If a file cannot be read or parsed, or the dataset is not an array.
 */
async function loadScenarios(
  scenarioGlob?: string,
  datasetPath?: string
): Promise<Scenario[]> {
  if (datasetPath) {
    const content = await readFile(datasetPath, 'utf-8')
    const data = JSON.parse(content)

    if (!Array.isArray(data)) {
      throw new Error(
        `Dataset ${datasetPath} must contain a JSON array of scenarios`
      )
    }

    // Handle comprehensive-dataset.json format
    return data.map((item: any) => ({
      id: item.id || item.conversationId,
      name: item.triggerMessage?.subject || item.name,
      trigger: item.trigger,
      triggerMessage: item.triggerMessage,
      appId: item.appId || item.app,
      expectedCategory: item.expectedCategory || inferCategory(item),
      expectedAction: item.expectedAction || inferAction(item),
      expectedBehavior: item.expectedBehavior,
      category: item.category,
      // Without these, validate evals and the smoke filter never see the
      // fields they depend on when scenarios come from a dataset.
      draft: item.draft,
      assertions: item.assertions,
      context: item.context,
      smoke: item.smoke,
    }))
  }

  if (scenarioGlob) {
    const files = await glob(scenarioGlob)
    if (files.length === 0) {
      console.error(`No scenario files found matching: ${scenarioGlob}`)
      process.exit(1)
    }

    return Promise.all(
      files.map(async (file) => {
        const content = await readFile(file, 'utf-8')
        return JSON.parse(content)
      })
    )
  }

  console.error('Must provide --scenarios or --dataset')
  process.exit(1)
}
408
-
409
/**
 * Infer the expected category for a dataset item that has none set.
 *
 * When the item carries `agentResponse.category`, only the fixed mapping below
 * is consulted (an unmapped value yields `undefined`; the message text is not
 * inspected). Otherwise the lower-cased subject and body of `triggerMessage`
 * are matched against keyword patterns, and the first matching rule wins.
 *
 * @param item - Raw dataset item.
 * @returns The inferred category, or `undefined` when nothing matches.
 */
function inferCategory(item: any): MessageCategory | undefined {
  const agentCategory = item.agentResponse?.category
  if (agentCategory) {
    const agentCategoryMap: Record<string, MessageCategory> = {
      'tool-assisted': 'support_access',
      auto: 'system',
      spam: 'spam',
    }
    return agentCategoryMap[agentCategory]
  }

  const subject = item.triggerMessage?.subject || ''
  const body = item.triggerMessage?.body || ''
  const text = `${subject} ${body}`.toLowerCase()

  // Ordered rules: earlier entries take priority over later ones
  const contentRules: Array<[RegExp, MessageCategory]> = [
    [/refund|money back/i, 'support_refund'],
    [/can't access|lost access|no access|restore access/i, 'support_access'],
    [/transfer|different email|wrong email/i, 'support_transfer'],
    [/invoice|receipt/i, 'support_billing'],
    [/partnership|sponsor|backlink|outreach|seo/i, 'spam'],
    [/auto-reply|out of office|mailer-daemon/i, 'system'],
    [/thank|love|amazing|big fan/i, 'fan_mail'],
  ]

  for (const [pattern, category] of contentRules) {
    if (pattern.test(text)) {
      return category
    }
  }
  return undefined
}
439
-
440
/**
 * Select the smoke-test subset used by quick mode.
 *
 * If any scenario is flagged `smoke: true`, exactly those are returned.
 * Otherwise scenarios are grouped by `expectedCategory`, `category`,
 * `expectedAction` or `'other'` (first non-empty wins) and the first two of
 * each group are returned, groups ordered by first appearance.
 *
 * @param scenarios - All loaded scenarios.
 * @returns The reduced scenario list.
 */
function filterQuickScenarios(scenarios: Scenario[]): Scenario[] {
  const flagged = scenarios.filter((s: any) => s.smoke === true)
  if (flagged.length > 0) {
    return flagged
  }

  const PER_GROUP = 2
  const groups = new Map<string, Scenario[]>()
  for (const scenario of scenarios) {
    const groupKey =
      scenario.expectedCategory ||
      scenario.category ||
      scenario.expectedAction ||
      'other'
    let members = groups.get(groupKey)
    if (!members) {
      members = []
      groups.set(groupKey, members)
    }
    // Keeping only the first two per group is the same as slicing afterwards
    if (members.length < PER_GROUP) {
      members.push(scenario)
    }
  }

  return [...groups.values()].flat()
}
472
-
473
/**
 * Infer the expected route action for a dataset item that has none set.
 *
 * The lower-cased `expectedBehavior` text is scanned for keywords; rules are
 * checked in order and the first match wins. When no keyword matches, an item
 * that has `agentResponse.text` is assumed to expect a response.
 *
 * @param item - Raw dataset item.
 * @returns The inferred action, or `undefined` when nothing can be inferred.
 */
function inferAction(item: any): RouteAction | undefined {
  const behavior: string = item.expectedBehavior?.toLowerCase() || ''

  // Ordered rules: earlier entries take priority over later ones
  const keywordRules: Array<[string[], RouteAction]> = [
    [['silent', 'ignore'], 'silence'],
    [['escalate', 'human'], 'escalate_human'],
    [['instructor'], 'escalate_instructor'],
    [['respond', 'draft'], 'respond'],
  ]
  const matched = keywordRules.find(([keywords]) =>
    keywords.some((keyword) => behavior.includes(keyword))
  )
  if (matched) {
    return matched[1]
  }

  // If agent responded, it was probably meant to respond
  return item.agentResponse?.text ? 'respond' : undefined
}
492
-
493
- // ============================================================================
494
- // Step evaluators
495
- // ============================================================================
496
-
497
/** Options passed from `run` to the individual step evaluators. */
interface EvalOptions {
  /** Print details for failing scenarios instead of a progress counter. */
  verbose?: boolean
  /** Model name forwarded to `classify`. */
  model?: string
  /** Forwarded to `classify` as `forceLLM`. */
  forceLlm?: boolean
  /** Use real tool calls where the evaluator supports them. */
  realTools?: boolean
  /** Batch size for concurrent processing; evaluators fall back to 1. */
  parallel?: number
  /** Reuse classify results cached on disk. */
  cacheClassify?: boolean
  /** Stop after the first batch that contains a failing scenario. */
  failFast?: boolean
}
506
-
507
/**
 * Evaluate the classify step: classify each scenario's trigger message and
 * compare the resulting category with `expectedCategory` ('unknown' when the
 * scenario has none).
 *
 * Scenarios run in batches of `options.parallel` (default 1). With
 * `cacheClassify`, classify results are read from and written to the on-disk
 * cache. A scenario without a trigger, or one whose classification throws, is
 * reported as failed with an `ERROR: ...` actual value.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Evaluator options.
 * @returns One result per processed scenario (fewer when fail-fast stops early).
 */
async function runClassifyEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const batchSize = options.parallel || 1
  const sourceHash = options.cacheClassify ? getClassifySourceHash() : ''
  let finished = 0

  // Classify one message, going through the on-disk cache when enabled
  const classifyMessage = async (
    scenarioId: string,
    input: ClassifyInput
  ): Promise<ClassifyOutput> => {
    const callClassifier = () =>
      classify(input, {
        forceLLM: options.forceLlm,
        model: options.model,
      })

    if (!options.cacheClassify) {
      return callClassifier()
    }

    const key = getCacheKey(scenarioId, sourceHash)
    const hit = loadCachedClassify(key)
    if (hit) {
      return hit
    }
    const fresh = await callClassifier()
    saveCachedClassify(key, fresh)
    return fresh
  }

  const evaluateScenario = async (scenario: Scenario): Promise<StepResult> => {
    const expected = scenario.expectedCategory || 'unknown'
    const message = scenario.trigger || scenario.triggerMessage

    if (!message) {
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const input: ClassifyInput = {
      subject: message.subject,
      body: message.body,
      appId: scenario.appId,
    }

    const startedAt = Date.now()
    try {
      const output = await classifyMessage(scenario.id, input)
      const passed = output.category === expected

      finished++
      if (options.verbose) {
        if (!passed) {
          const percent = (output.confidence * 100).toFixed(0)
          console.log(`\n❌ ${scenario.id}`)
          console.log(` Expected: ${expected}`)
          console.log(` Actual: ${output.category} (${percent}%)`)
          console.log(` Subject: ${message.subject.slice(0, 60)}...`)
        }
      } else {
        process.stdout.write(
          `\r Processing ${finished}/${scenarios.length}...`
        )
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: output.category,
        confidence: output.confidence,
        durationMs: Date.now() - startedAt,
        reasoning: output.reasoning,
      }
    } catch (error) {
      finished++
      const detail = error instanceof Error ? error.message : 'Unknown'
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: `ERROR: ${detail}`,
        durationMs: Date.now() - startedAt,
      }
    }
  }

  const outcome = await runBatchWithFailFast(
    scenarios,
    (scenario) => evaluateScenario(scenario),
    batchSize,
    options.failFast || false
  )

  // Terminate the in-place progress line
  if (!options.verbose) console.log('')
  if (outcome.aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return outcome.results
}
610
-
611
/**
 * Evaluate the route step: classify each scenario's trigger message, route
 * the classification, and compare the chosen action with `expectedAction`
 * ('respond' when the scenario has none).
 *
 * Scenarios run in batches of `options.parallel` (default 1). With
 * `cacheClassify`, classify results are read from and written to the on-disk
 * cache. Routing uses a fixed app config (instructor configured, auto-send
 * disabled). A scenario without a trigger, or one that throws, is reported as
 * failed with an `ERROR: ...` actual value.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Evaluator options.
 * @returns One result per processed scenario (fewer when fail-fast stops early).
 */
async function runRouteEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const batchSize = options.parallel || 1
  const sourceHash = options.cacheClassify ? getClassifySourceHash() : ''
  let finished = 0

  // Classify one message, going through the on-disk cache when enabled
  const classifyMessage = async (
    scenarioId: string,
    input: ClassifyInput
  ): Promise<ClassifyOutput> => {
    const callClassifier = () =>
      classify(input, {
        forceLLM: options.forceLlm,
        model: options.model,
      })

    if (!options.cacheClassify) {
      return callClassifier()
    }

    const key = getCacheKey(scenarioId, sourceHash)
    const hit = loadCachedClassify(key)
    if (hit) {
      return hit
    }
    const fresh = await callClassifier()
    saveCachedClassify(key, fresh)
    return fresh
  }

  const evaluateScenario = async (scenario: Scenario): Promise<StepResult> => {
    const message = scenario.trigger || scenario.triggerMessage
    if (!message) {
      // Note: the missing-trigger path labels a missing expectation 'unknown',
      // unlike the 'respond' default used once routing actually runs.
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: scenario.expectedAction || 'unknown',
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const expected = scenario.expectedAction || 'respond'
    const input: ClassifyInput = {
      subject: message.subject,
      body: message.body,
      appId: scenario.appId,
    }

    const startedAt = Date.now()
    try {
      // First classify (with cache support), then route
      const classification = await classifyMessage(scenario.id, input)

      const decision = route({
        message: input,
        classification,
        appConfig: {
          appId: scenario.appId || 'eval',
          instructorConfigured: true,
          autoSendEnabled: false,
        },
      })
      const passed = decision.action === expected

      finished++
      if (options.verbose) {
        if (!passed) {
          console.log(`\n❌ ${scenario.id}`)
          console.log(` Expected: ${expected}`)
          console.log(` Actual: ${decision.action}`)
          console.log(` Category: ${classification.category}`)
          console.log(` Reason: ${decision.reason}`)
        }
      } else {
        process.stdout.write(
          `\r Processing ${finished}/${scenarios.length}...`
        )
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: decision.action,
        durationMs: Date.now() - startedAt,
        reasoning: decision.reason,
      }
    } catch (error) {
      finished++
      const detail = error instanceof Error ? error.message : 'Unknown'
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: `ERROR: ${detail}`,
        durationMs: Date.now() - startedAt,
      }
    }
  }

  const outcome = await runBatchWithFailFast(
    scenarios,
    (scenario) => evaluateScenario(scenario),
    batchSize,
    options.failFast || false
  )

  // Terminate the in-place progress line
  if (!options.verbose) console.log('')
  if (outcome.aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return outcome.results
}
722
-
723
/**
 * Evaluate the gather step by calling the real lookup tools per scenario.
 *
 * Without `options.realTools` (or when the real tools are not available) no
 * tool is called: every scenario is reported as passed with a note that the
 * eval needs `--real-tools`. Otherwise each scenario runs `lookupUser` and
 * `searchKnowledge` (when the tool exposes `execute`) and passes if a user was
 * found or at least one knowledge item / similar ticket came back.
 *
 * Scenarios run in batches of `options.parallel` (default 1). A scenario
 * without a trigger, or one whose tool call throws, is reported as failed.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Evaluator options.
 * @returns One result per processed scenario (fewer when fail-fast stops early).
 */
async function runGatherEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  // Check if real tools are available; otherwise fall back to mock behavior
  if (!(options.realTools && isRealToolsAvailable())) {
    const mocked = scenarios.map(({ id }) => ({
      scenarioId: id,
      passed: true,
      expected: 'context_complete',
      actual: 'context_complete',
      durationMs: 0,
      reasoning: 'Gather eval requires --real-tools flag with Docker services',
    }))

    if (!options.verbose) {
      console.log(` Processing ${scenarios.length}/${scenarios.length}...`)
    }
    console.log(
      '\n⚠️ Gather eval: Use --real-tools with Docker services for actual tool calls\n'
    )
    return mocked
  }

  const batchSize = options.parallel || 1
  let finished = 0

  // Fresh call context for every tool invocation
  const newCallContext = () => ({ toolCallId: 'test', messages: [] })

  const evaluateScenario = async (scenario: Scenario): Promise<StepResult> => {
    const message = scenario.trigger || scenario.triggerMessage
    if (!message) {
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'context_complete',
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const startedAt = Date.now()
    try {
      // Create real tools for this scenario
      const tools = createRealTools({
        appId: scenario.appId,
        customerEmail: scenario.context?.customer as string,
      })

      // Execute key tools to gather context
      const summary: string[] = []
      let userFound = false
      let knowledgeCount = 0

      // Try lookupUser
      const lookupUser = tools.lookupUser.execute
      if (lookupUser) {
        const userResult = await lookupUser(
          {
            email: (scenario.context?.customer as string) || '[EMAIL]',
            appId: scenario.appId || 'eval',
          },
          newCallContext()
        )
        userFound = Boolean((userResult as any).found)
        summary.push(`user:${userFound ? 'found' : 'not_found'}`)
      }

      // Try searchKnowledge
      const searchKnowledge = tools.searchKnowledge.execute
      if (searchKnowledge) {
        const knowledgeResult = await searchKnowledge(
          {
            query: message.subject || message.body,
            appId: scenario.appId || 'eval',
          },
          newCallContext()
        )
        const articleCount = (knowledgeResult as any).knowledge?.length || 0
        const ticketCount = (knowledgeResult as any).similarTickets?.length || 0
        knowledgeCount = articleCount + ticketCount
        summary.push(`knowledge:${knowledgeCount}`)
      }

      // Evaluate: pass if we got some context
      const hasContext = userFound || knowledgeCount > 0

      finished++
      if (options.verbose) {
        if (!hasContext) {
          console.log(`\n⚠️ ${scenario.id}`)
          console.log(` Context: ${summary.join(', ')}`)
        }
      } else {
        process.stdout.write(
          `\r Processing ${finished}/${scenarios.length}...`
        )
      }

      return {
        scenarioId: scenario.id,
        passed: hasContext,
        expected: 'context_complete',
        actual: hasContext ? 'context_complete' : 'context_incomplete',
        durationMs: Date.now() - startedAt,
        reasoning: summary.join(', '),
      }
    } catch (error) {
      finished++
      const detail = error instanceof Error ? error.message : 'Unknown'
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'context_complete',
        actual: `ERROR: ${detail}`,
        durationMs: Date.now() - startedAt,
      }
    }
  }

  const outcome = await runBatchWithFailFast(
    scenarios,
    (scenario) => evaluateScenario(scenario),
    batchSize,
    options.failFast || false
  )

  // Terminate the in-place progress line
  if (!options.verbose) console.log('')
  if (outcome.aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return outcome.results
}
858
-
859
/**
 * Run validate eval against scenarios with drafts.
 *
 * Scenarios can include:
 * - `draft`: A pre-provided draft to validate
 * - `assertions`: Expected validation outcomes (noFabrication, noMetaCommentary, etc.)
 *
 * Scenarios with neither field are dropped before the run. A scenario that has
 * assertions but no draft is reported as a passing `skipped` result.
 * All validation checks are deterministic (no LLM calls).
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Reads `parallel` (concurrency, defaults to 1), `verbose`
 *   (print per-failure detail instead of a progress line) and `failFast`.
 * @returns The results collected by `runBatchWithFailFast`, or an empty array
 *   when no scenario carries a draft or assertions.
 */
async function runValidateEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const concurrency = options.parallel || 1
  let completed = 0

  // Filter to scenarios with drafts or assertions
  const validScenarios = scenarios.filter((s) => s.draft || s.assertions)

  if (validScenarios.length === 0) {
    console.log('\n⚠️ No scenarios with draft or assertions found.')
    console.log(' For validate eval, scenarios need either:')
    console.log(' - "draft": "text to validate"')
    console.log(' - "assertions": { "noFabrication": true, ... }\n')
    return []
  }

  // Count one finished scenario and, outside verbose mode, redraw the progress
  // line. Every exit path of processScenario goes through here so the counter
  // can reach validScenarios.length even when scenarios are skipped or throw.
  const markCompleted = (): void => {
    completed++
    if (!options.verbose) {
      process.stdout.write(
        `\r Processing ${completed}/${validScenarios.length}...`
      )
    }
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    // If scenario has no draft but has assertions, it's for checking generated drafts
    // For now, skip those (they'd need full pipeline)
    const draft = scenario.draft
    if (!draft) {
      markCompleted()
      return {
        scenarioId: scenario.id,
        passed: true, // Can't evaluate without draft
        expected: 'needs_draft',
        actual: 'skipped',
        durationMs: 0,
        reasoning: 'No draft provided - use e2e eval with assertions',
      }
    }

    const startTime = Date.now()

    // Create minimal context for validation
    // Fabrication check needs knowledge array to be empty to trigger
    const hasKnowledge = scenario.context?.customer === 'recent-purchase'
    const mockContext: GatherOutput = {
      user: hasKnowledge ? { id: 'test', email: '[EMAIL]' } : null,
      purchases: hasKnowledge
        ? [
            {
              id: 'p1',
              productId: 'prod1',
              productName: 'Test Product',
              purchasedAt: new Date().toISOString(),
              status: 'active',
            },
          ]
        : [],
      knowledge: hasKnowledge
        ? [
            {
              id: 'k1',
              type: 'faq',
              content: 'test knowledge',
              relevance: 0.9,
            },
          ]
        : [],
      history: [],
      priorMemory: [],
      priorConversations: [],
      gatherErrors: [],
    }

    try {
      const result = await validate({
        draft,
        context: mockContext,
        strictMode: false,
      })

      // Map issue types to assertion names (unused but kept for documentation).
      // It is never read at runtime; the Record<ValidationIssueType, ...>
      // annotation requires an entry for every issue type.
      const _issueTypeToAssertion: Record<
        ValidationIssueType,
        keyof NonNullable<Scenario['assertions']>
      > = {
        fabrication: 'noFabrication',
        meta_commentary: 'noMetaCommentary',
        internal_leak: 'noInternalLeak',
        banned_phrase: 'noBannedPhrases',
        too_short: 'noBannedPhrases', // No specific assertion
        too_long: 'noBannedPhrases', // No specific assertion
        bad_tone: 'noBannedPhrases', // No specific assertion
        repeated_mistake: 'noBannedPhrases', // No specific assertion
        relevance: 'noBannedPhrases', // No specific assertion for relevance
        ground_truth_mismatch: 'noBannedPhrases', // No specific assertion
        audience_inappropriate: 'noBannedPhrases', // No specific assertion
        tool_failure: 'noBannedPhrases', // No specific assertion
      }

      // Check if assertions match
      const assertions = scenario.assertions || {}
      const failedAssertions: string[] = []
      const foundIssueTypes = new Set(
        result.issues.map((i: { type: string }) => i.type)
      )

      // Check negative assertions (noX = expect no issues of type X)
      if (assertions.noFabrication && foundIssueTypes.has('fabrication')) {
        failedAssertions.push('noFabrication: found fabrication')
      }
      if (
        assertions.noMetaCommentary &&
        foundIssueTypes.has('meta_commentary')
      ) {
        failedAssertions.push('noMetaCommentary: found meta_commentary')
      }
      if (assertions.noInternalLeak && foundIssueTypes.has('internal_leak')) {
        failedAssertions.push('noInternalLeak: found internal_leak')
      }
      if (assertions.noBannedPhrases && foundIssueTypes.has('banned_phrase')) {
        failedAssertions.push('noBannedPhrases: found banned_phrase')
      }

      // Check mustNotContain patterns (case-insensitive substring match)
      if (assertions.mustNotContain) {
        const loweredDraft = draft.toLowerCase()
        for (const pattern of assertions.mustNotContain) {
          if (loweredDraft.includes(pattern.toLowerCase())) {
            failedAssertions.push(`mustNotContain: found "${pattern}"`)
          }
        }
      }

      const passed = failedAssertions.length === 0
      const issuesSummary = result.issues
        .map(
          (i: { type: string; match?: string }) =>
            `${i.type}:${i.match || 'none'}`
        )
        .join(', ')

      markCompleted()

      if (options.verbose && !passed) {
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Failed assertions: ${failedAssertions.join(', ')}`)
        console.log(` Issues found: ${issuesSummary || 'none'}`)
        console.log(` Draft preview: ${draft.slice(0, 80)}...`)
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected: 'valid',
        actual: passed ? 'valid' : `invalid: ${failedAssertions.join('; ')}`,
        durationMs: Date.now() - startTime,
        reasoning: issuesSummary || 'no issues found',
      }
    } catch (error) {
      // A thrown validation is recorded as a failed result rather than
      // aborting the whole batch.
      markCompleted()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'valid',
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    validScenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the in-place progress line written with "\r"
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
1049
-
1050
/**
 * Run the end-to-end eval: feed each scenario's trigger message through
 * `runPipeline` in dry-run mode and compare the resulting action with the
 * scenario's `expectedAction` (defaults to 'respond').
 *
 * A scenario without `trigger`/`triggerMessage` is reported as a failure, and
 * an exception thrown by the pipeline is recorded as a failed result whose
 * `actual` starts with "ERROR:".
 *
 * @param scenarios - Scenarios to run.
 * @param options - Reads `parallel` (concurrency, defaults to 1), `verbose`,
 *   `failFast`, `realTools` and `model` (used for both classify and draft).
 * @returns The results collected by `runBatchWithFailFast`.
 */
async function runE2EEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const { runPipeline } = await import('@skillrecordings/core/pipeline')
  const concurrency = options.parallel || 1
  let completed = 0

  // Note: Real tools are available when --real-tools is passed
  // They're initialized globally and accessible to the pipeline's gather step
  if (options.realTools && options.verbose) {
    const available = isRealToolsAvailable()
    console.log(` Real tools: ${available ? 'connected' : 'not available'}\n`)
  }

  // Count one finished scenario and, outside verbose mode, redraw the progress
  // line. Every exit path of processScenario goes through here so the counter
  // can reach scenarios.length even when a scenario has no trigger or throws.
  const markCompleted = (): void => {
    completed++
    if (!options.verbose) {
      process.stdout.write(
        `\r Processing ${completed}/${scenarios.length}...`
      )
    }
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    const trigger = scenario.trigger || scenario.triggerMessage
    if (!trigger) {
      markCompleted()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'respond',
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const startTime = Date.now()
    try {
      // Note: Real tools are initialized globally via initRealTools()
      // The pipeline will use them via the gather step's tool providers
      // when --real-tools is enabled and services are available

      const pipelineResult = await runPipeline(
        {
          message: {
            subject: trigger.subject,
            body: trigger.body,
            appId: scenario.appId,
          },
          appConfig: {
            appId: scenario.appId || 'eval',
            instructorConfigured: true,
            autoSendEnabled: false,
          },
          dryRun: true,
        },
        {
          classifyModel: options.model,
          draftModel: options.model,
        }
      )

      // For e2e, check if action matches expected
      const expected = scenario.expectedAction || 'respond'
      const passed = pipelineResult.action === expected

      markCompleted()

      if (options.verbose && !passed) {
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Expected: ${expected}`)
        console.log(` Actual: ${pipelineResult.action}`)
        console.log(
          ` Steps: ${pipelineResult.steps.map((s) => s.step).join(' → ')}`
        )
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: pipelineResult.action,
        durationMs: Date.now() - startTime,
        reasoning: pipelineResult.steps
          .map((s) => `${s.step}:${s.success}`)
          .join(', '),
      }
    } catch (error) {
      // A thrown pipeline run is recorded as a failed result rather than
      // aborting the whole batch.
      markCompleted()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: scenario.expectedAction || 'respond',
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    scenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the in-place progress line written with "\r"
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
1158
-
1159
- // ============================================================================
1160
- // Metrics computation
1161
- // ============================================================================
1162
-
1163
/**
 * Summarise a list of step results.
 *
 * Produces overall pass/fail counts and accuracy, plus a per-label breakdown
 * (true positives, false positives, false negatives, precision, recall) where
 * a label is any `expected` or `actual` value that does not start with
 * "ERROR". For the 'route' step it also reports how often 'respond' was
 * expected but 'silence' was returned, and the reverse.
 *
 * @param results - Per-scenario outcomes.
 * @param step - Pipeline step the results belong to.
 * @param totalDurationMs - Duration stored unchanged as `durationMs`.
 * @returns The aggregated metrics.
 */
function computeMetrics(
  results: StepResult[],
  step: PipelineStep,
  totalDurationMs: number
): EvalMetrics {
  const total = results.length

  let passed = 0
  for (const result of results) {
    if (result.passed) passed++
  }

  // Tally per label in one pass. A Map keeps labels in first-seen order
  // (expected before actual for each result).
  const tallies = new Map<string, { tp: number; fp: number; fn: number }>()
  const tallyFor = (label: string): { tp: number; fp: number; fn: number } => {
    let tally = tallies.get(label)
    if (tally === undefined) {
      tally = { tp: 0, fp: 0, fn: 0 }
      tallies.set(label, tally)
    }
    return tally
  }

  for (const { expected, actual } of results) {
    const expectedTally = tallyFor(expected)
    const actualTally = tallyFor(actual)
    if (expected === actual) {
      actualTally.tp++
    } else {
      // Wrong prediction: a false positive for the label that was produced
      // and a false negative for the label that was wanted.
      actualTally.fp++
      expectedTally.fn++
    }
  }

  // Build breakdown by label, leaving error pseudo-labels out
  const breakdown: Record<
    string,
    { tp: number; fp: number; fn: number; precision: number; recall: number }
  > = {}

  for (const [label, { tp, fp, fn }] of tallies) {
    if (label.startsWith('ERROR')) continue

    const predicted = tp + fp
    const wanted = tp + fn
    breakdown[label] = {
      tp,
      fp,
      fn,
      precision: predicted > 0 ? tp / predicted : 0,
      recall: wanted > 0 ? tp / wanted : 0,
    }
  }

  const metrics: EvalMetrics = {
    total,
    passed,
    failed: total - passed,
    accuracy: total > 0 ? passed / total : 0,
    durationMs: totalDurationMs,
    breakdown,
  }

  // Special metrics for route step
  if (step === 'route') {
    let shouldRespond = 0
    let falseSilence = 0
    let shouldSilence = 0
    let falseRespond = 0

    for (const { expected, actual } of results) {
      if (expected === 'respond') {
        shouldRespond++
        // False silence: expected respond but got silence
        if (actual === 'silence') falseSilence++
      } else if (expected === 'silence') {
        shouldSilence++
        // False respond: expected silence but got respond
        if (actual === 'respond') falseRespond++
      }
    }

    metrics.falseSilenceRate =
      shouldRespond > 0 ? falseSilence / shouldRespond : 0
    metrics.falseRespondRate =
      shouldSilence > 0 ? falseRespond / shouldSilence : 0
  }

  return metrics
}
1234
-
1235
- // ============================================================================
1236
- // Output
1237
- // ============================================================================
1238
-
1239
/**
 * Print a human-readable summary of eval metrics to the console.
 *
 * Output covers totals and accuracy, routing error rates (route step only,
 * when present on `metrics`), a per-label breakdown when there are between
 * 2 and 20 labels, the average latency, and, when `results` is provided,
 * up to the first 10 failing scenarios.
 *
 * @param step - Pipeline step the metrics belong to; selects the heading.
 * @param metrics - Aggregated metrics to print.
 * @param results - Optional per-scenario results used to list failures.
 */
function printMetrics(
  step: PipelineStep,
  metrics: EvalMetrics,
  results?: StepResult[]
): void {
  const stepEmoji: Record<PipelineStep, string> = {
    classify: '🏷️',
    route: '🚦',
    gather: '📦',
    draft: '✍️',
    validate: '✅',
    e2e: '🔄',
  }

  console.log(`${stepEmoji[step]} ${step.toUpperCase()} Eval Results\n`)
  console.log(`Total: ${metrics.total}`)
  console.log(
    ` ✅ Passed: ${metrics.passed} (${(metrics.accuracy * 100).toFixed(1)}%)`
  )
  console.log(` ❌ Failed: ${metrics.failed}`)

  if (step === 'route' && metrics.falseSilenceRate !== undefined) {
    console.log(`\nRouting Errors:`)
    console.log(
      ` False silence rate: ${(metrics.falseSilenceRate * 100).toFixed(1)}%`
    )
    // Fall back to 0 instead of asserting non-null, so a missing rate can
    // never print as "NaN%".
    console.log(
      ` False respond rate: ${((metrics.falseRespondRate ?? 0) * 100).toFixed(1)}%`
    )
  }

  // Show breakdown if there are multiple labels
  const labelCount = Object.keys(metrics.breakdown).length
  if (labelCount > 1 && labelCount <= 20) {
    console.log(
      `\nBreakdown by ${step === 'classify' ? 'category' : 'action'}:`
    )

    // Largest expected-count (tp + fn) first
    const sorted = Object.entries(metrics.breakdown)
      .filter(([label]) => !label.startsWith('ERROR'))
      .sort((a, b) => b[1].tp + b[1].fn - (a[1].tp + a[1].fn))

    for (const [label, stats] of sorted) {
      const total = stats.tp + stats.fn
      // Labels that were only ever predicted, never expected, are not listed
      if (total === 0) continue

      const precisionStr = (stats.precision * 100).toFixed(0)
      const recallStr = (stats.recall * 100).toFixed(0)
      console.log(
        ` ${label}: ${stats.tp}/${total} (P=${precisionStr}% R=${recallStr}%)`
      )
    }
  }

  // Latency: with no results the division would be 0/0 and print "NaNms",
  // so report 0 instead.
  const avgLatency = metrics.total > 0 ? metrics.durationMs / metrics.total : 0
  console.log(`\nLatency: ${avgLatency.toFixed(0)}ms avg`)

  // Show individual failures when results were passed in
  if (results) {
    const failures = results.filter((r) => !r.passed)
    if (failures.length > 0) {
      console.log(`\n--- FAILURES (${failures.length}) ---\n`)
      for (const f of failures.slice(0, 10)) {
        console.log(`❌ ${f.scenarioId}`)
        console.log(` Expected: ${f.expected}`)
        console.log(` Actual: ${f.actual}`)
        if (f.reasoning) {
          console.log(` Reason: ${f.reasoning.slice(0, 80)}...`)
        }
        console.log('')
      }
      if (failures.length > 10) {
        console.log(` ... and ${failures.length - 10} more`)
      }
    }
  }
}