@skillrecordings/cli 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skill.mjs +27 -0
- package/dist/chunk-2NCCVTEE.js +22342 -0
- package/dist/chunk-2NCCVTEE.js.map +1 -0
- package/dist/chunk-3E3GYSZR.js +7071 -0
- package/dist/chunk-3E3GYSZR.js.map +1 -0
- package/dist/chunk-F4EM72IH.js +86 -0
- package/dist/chunk-F4EM72IH.js.map +1 -0
- package/dist/chunk-FGP7KUQW.js +432 -0
- package/dist/chunk-FGP7KUQW.js.map +1 -0
- package/dist/chunk-H3D6VCME.js +55 -0
- package/dist/chunk-H3D6VCME.js.map +1 -0
- package/dist/chunk-HK3PEWFD.js +208 -0
- package/dist/chunk-HK3PEWFD.js.map +1 -0
- package/dist/chunk-KEV3QKXP.js +4495 -0
- package/dist/chunk-KEV3QKXP.js.map +1 -0
- package/dist/chunk-MG37YDAK.js +882 -0
- package/dist/chunk-MG37YDAK.js.map +1 -0
- package/dist/chunk-MLNDSBZ4.js +482 -0
- package/dist/chunk-MLNDSBZ4.js.map +1 -0
- package/dist/chunk-N2WIV2JV.js +22 -0
- package/dist/chunk-N2WIV2JV.js.map +1 -0
- package/dist/chunk-PWWRCN5W.js +2067 -0
- package/dist/chunk-PWWRCN5W.js.map +1 -0
- package/dist/chunk-SKHBM3XP.js +7746 -0
- package/dist/chunk-SKHBM3XP.js.map +1 -0
- package/dist/chunk-WFANXVQG.js +64 -0
- package/dist/chunk-WFANXVQG.js.map +1 -0
- package/dist/chunk-WYKL32C3.js +275 -0
- package/dist/chunk-WYKL32C3.js.map +1 -0
- package/dist/chunk-ZNF7XD2S.js +134 -0
- package/dist/chunk-ZNF7XD2S.js.map +1 -0
- package/dist/config-AUAIYDSI.js +20 -0
- package/dist/config-AUAIYDSI.js.map +1 -0
- package/dist/fileFromPath-XN7LXIBI.js +134 -0
- package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
- package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
- package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
- package/dist/getMachineId-win-IIF36LEJ.js +44 -0
- package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
- package/dist/index.js +112703 -0
- package/dist/index.js.map +1 -0
- package/dist/lib-R6DEEJCP.js +7623 -0
- package/dist/lib-R6DEEJCP.js.map +1 -0
- package/dist/pipeline-IAVVAKTU.js +120 -0
- package/dist/pipeline-IAVVAKTU.js.map +1 -0
- package/dist/query-NTP5NVXN.js +25 -0
- package/dist/query-NTP5NVXN.js.map +1 -0
- package/dist/routing-BAEPFB7V.js +390 -0
- package/dist/routing-BAEPFB7V.js.map +1 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
- package/dist/stripe-payment-history-SJPKA63N.js +67 -0
- package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
- package/dist/support-memory-WSG7SDKG.js +10 -0
- package/dist/support-memory-WSG7SDKG.js.map +1 -0
- package/package.json +10 -7
- package/.env.encrypted +0 -0
- package/CHANGELOG.md +0 -35
- package/data/tt-archive-dataset.json +0 -1
- package/data/validate-test-dataset.json +0 -97
- package/docs/CLI-AUTH.md +0 -504
- package/preload.ts +0 -18
- package/src/__tests__/init.test.ts +0 -74
- package/src/alignment-test.ts +0 -64
- package/src/check-apps.ts +0 -16
- package/src/commands/auth/decrypt.ts +0 -123
- package/src/commands/auth/encrypt.ts +0 -81
- package/src/commands/auth/index.ts +0 -50
- package/src/commands/auth/keygen.ts +0 -41
- package/src/commands/auth/status.ts +0 -164
- package/src/commands/axiom/forensic.ts +0 -868
- package/src/commands/axiom/index.ts +0 -697
- package/src/commands/build-dataset.ts +0 -311
- package/src/commands/db-status.ts +0 -47
- package/src/commands/deploys.ts +0 -219
- package/src/commands/eval-local/compare.ts +0 -171
- package/src/commands/eval-local/health.ts +0 -212
- package/src/commands/eval-local/index.ts +0 -76
- package/src/commands/eval-local/real-tools.ts +0 -416
- package/src/commands/eval-local/run.ts +0 -1168
- package/src/commands/eval-local/score-production.ts +0 -256
- package/src/commands/eval-local/seed.ts +0 -276
- package/src/commands/eval-pipeline/index.ts +0 -53
- package/src/commands/eval-pipeline/real-tools.ts +0 -492
- package/src/commands/eval-pipeline/run.ts +0 -1316
- package/src/commands/eval-pipeline/seed.ts +0 -395
- package/src/commands/eval-prompt.ts +0 -496
- package/src/commands/eval.test.ts +0 -253
- package/src/commands/eval.ts +0 -108
- package/src/commands/faq-classify.ts +0 -460
- package/src/commands/faq-cluster.ts +0 -135
- package/src/commands/faq-extract.ts +0 -249
- package/src/commands/faq-mine.ts +0 -432
- package/src/commands/faq-review.ts +0 -426
- package/src/commands/front/index.ts +0 -351
- package/src/commands/front/pull-conversations.ts +0 -275
- package/src/commands/front/tags.ts +0 -825
- package/src/commands/front-cache.ts +0 -1277
- package/src/commands/front-stats.ts +0 -75
- package/src/commands/health.test.ts +0 -82
- package/src/commands/health.ts +0 -362
- package/src/commands/init.test.ts +0 -89
- package/src/commands/init.ts +0 -106
- package/src/commands/inngest/client.ts +0 -294
- package/src/commands/inngest/events.ts +0 -296
- package/src/commands/inngest/investigate.ts +0 -382
- package/src/commands/inngest/runs.ts +0 -149
- package/src/commands/inngest/signal.ts +0 -143
- package/src/commands/kb-sync.ts +0 -498
- package/src/commands/memory/find.ts +0 -135
- package/src/commands/memory/get.ts +0 -87
- package/src/commands/memory/index.ts +0 -97
- package/src/commands/memory/stats.ts +0 -163
- package/src/commands/memory/store.ts +0 -49
- package/src/commands/memory/vote.ts +0 -159
- package/src/commands/pipeline.ts +0 -127
- package/src/commands/responses.ts +0 -856
- package/src/commands/tools.ts +0 -293
- package/src/commands/wizard.ts +0 -319
- package/src/index.ts +0 -172
- package/src/lib/crypto.ts +0 -56
- package/src/lib/env-loader.ts +0 -206
- package/src/lib/onepassword.ts +0 -137
- package/src/test-agent-local.ts +0 -115
- package/tsconfig.json +0 -11
- package/vitest.config.ts +0 -10
|
@@ -1,1316 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step evaluation runner
|
|
3
|
-
*
|
|
4
|
-
* Runs actual pipeline steps against labeled scenarios and measures accuracy.
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { createHash } from 'crypto'
|
|
8
|
-
import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'fs'
|
|
9
|
-
import { dirname, join } from 'path'
|
|
10
|
-
import {
|
|
11
|
-
type ClassifyInput,
|
|
12
|
-
type ClassifyOutput,
|
|
13
|
-
type GatherOutput,
|
|
14
|
-
type MessageCategory,
|
|
15
|
-
type RouteAction,
|
|
16
|
-
type RouteOutput,
|
|
17
|
-
type ValidateOutput,
|
|
18
|
-
type ValidationIssueType,
|
|
19
|
-
classify,
|
|
20
|
-
route,
|
|
21
|
-
validate,
|
|
22
|
-
} from '@skillrecordings/core/pipeline'
|
|
23
|
-
import { readFile, writeFile } from 'fs/promises'
|
|
24
|
-
import { glob } from 'glob'
|
|
25
|
-
import {
|
|
26
|
-
cleanupRealTools,
|
|
27
|
-
createRealTools,
|
|
28
|
-
initRealTools,
|
|
29
|
-
isRealToolsAvailable,
|
|
30
|
-
} from './real-tools'
|
|
31
|
-
|
|
32
|
-
// ============================================================================
|
|
33
|
-
// Concurrency helpers
|
|
34
|
-
// ============================================================================
|
|
35
|
-
|
|
36
|
-
/**
 * Run `fn` over `items` in sequential batches; the items inside one batch
 * run concurrently via Promise.all.
 *
 * @param items - Items to process, in order.
 * @param fn - Async worker; receives the item and its index within `items`.
 * @param concurrency - Maximum number of items per batch. Anything that is
 *   not a number >= 1 (0, negatives, NaN) is treated as 1, and fractional
 *   values are floored: a step of 0 or less would never advance the loop,
 *   and a fractional step would hand fractional indices to `fn`.
 * @returns The results of `fn`, in the same order as `items`.
 * @throws Rejects with the first rejection raised by `fn` in a batch.
 */
async function runBatch<T, R>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>,
  concurrency: number
): Promise<R[]> {
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1
  const results: R[] = []

  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize)
    const batchResults = await Promise.all(
      batch.map((item, offset) => fn(item, start + offset))
    )
    results.push(...batchResults)
  }

  return results
}
|
|
54
|
-
|
|
55
|
-
/**
 * Run `fn` over `items` in sequential batches, optionally stopping after the
 * first batch that contains a failed result.
 *
 * A batch is always awaited in full before the fail-fast check, so results of
 * the failing batch are included in the output; only later batches are
 * skipped.
 *
 * @param items - Items to process, in order.
 * @param fn - Async worker; receives the item and its index within `items`.
 *   Its result must expose a boolean `passed`.
 * @param concurrency - Maximum number of items per batch. Anything that is
 *   not a number >= 1 (0, negatives, NaN) is treated as 1, and fractional
 *   values are floored, so the loop always advances.
 * @param failFast - When true, stop scheduling batches once a batch contains
 *   a result with `passed === false`.
 * @returns `results` for every item that ran (in input order) and `aborted`,
 *   which is true when a failing batch triggered the fail-fast stop.
 * @throws Rejects with the first rejection raised by `fn` in a batch.
 */
async function runBatchWithFailFast<T, R extends { passed: boolean }>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>,
  concurrency: number,
  failFast: boolean
): Promise<{ results: R[]; aborted: boolean }> {
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1
  const results: R[] = []
  let aborted = false

  for (let start = 0; start < items.length && !aborted; start += batchSize) {
    const batch = items.slice(start, start + batchSize)
    const batchResults = await Promise.all(
      batch.map((item, offset) => fn(item, start + offset))
    )
    results.push(...batchResults)

    if (failFast && batchResults.some((r) => !r.passed)) {
      aborted = true
    }
  }

  return { results, aborted }
}
|
|
81
|
-
|
|
82
|
-
// ============================================================================
|
|
83
|
-
// Classify cache helpers
|
|
84
|
-
// ============================================================================
|
|
85
|
-
|
|
86
|
-
// Directory (relative to the working directory) holding cached classify results.
const CACHE_DIR = '.eval-cache'

/**
 * Build the cache entry name for one scenario's classify result.
 *
 * @param scenarioId - Identifier of the scenario being classified.
 * @param classifySourceHash - Hash versioning the classifier; only its first
 *   8 characters are used in the key.
 * @returns A key of the form `classify-<scenarioId>-<hash prefix>`.
 */
function getCacheKey(scenarioId: string, classifySourceHash: string): string {
  const hashPrefix = classifySourceHash.slice(0, 8)
  return `classify-${scenarioId}-${hashPrefix}`
}
|
|
91
|
-
|
|
92
|
-
/**
 * Compute the hash that versions classify cache entries.
 *
 * Looks for the core package's `classify.ts` at two locations relative to the
 * current working directory and returns the MD5 hex digest of the first one
 * that can be read, so cached results are invalidated when that source
 * changes. Each candidate is tried independently: a candidate that exists but
 * cannot be read does not stop the search.
 *
 * When no candidate is readable, the hash is derived from the current
 * 5-minute time bucket, so cache entries written in that mode stop matching
 * once the bucket rolls over.
 *
 * @returns A 32-character MD5 hex digest.
 */
function getClassifySourceHash(): string {
  // Width of the fallback time bucket: 300000 ms = 5 minutes.
  const FALLBACK_BUCKET_MS = 300000

  const candidatePaths = [
    join(process.cwd(), 'packages/core/src/pipeline/classify.ts'),
    join(process.cwd(), '../core/src/pipeline/classify.ts'),
  ]

  for (const candidate of candidatePaths) {
    try {
      if (existsSync(candidate)) {
        const content = readFileSync(candidate, 'utf-8')
        return createHash('md5').update(content).digest('hex')
      }
    } catch {
      // Candidate exists but is unreadable (e.g. a directory): try the next one.
    }
  }

  // No readable source: fall back to time-based invalidation.
  return createHash('md5')
    .update(Math.floor(Date.now() / FALLBACK_BUCKET_MS).toString())
    .digest('hex')
}
|
|
114
|
-
|
|
115
|
-
/**
 * Read a cached classify result from `<CACHE_DIR>/<cacheKey>.json`.
 *
 * @param cacheKey - Entry name, as produced by `getCacheKey`.
 * @returns The parsed JSON content of the entry, or `null` when the file does
 *   not exist or cannot be read or parsed. The parsed value is not validated.
 */
function loadCachedClassify(cacheKey: string): ClassifyOutput | null {
  const entryPath = join(CACHE_DIR, `${cacheKey}.json`)
  try {
    if (!existsSync(entryPath)) {
      return null
    }
    const raw = readFileSync(entryPath, 'utf-8')
    return JSON.parse(raw)
  } catch {
    // Unreadable or corrupt entry is treated as a cache miss.
    return null
  }
}
|
|
126
|
-
|
|
127
|
-
/**
 * Persist a classify result as JSON at `<CACHE_DIR>/<cacheKey>.json`,
 * creating the cache directory first when it is missing.
 *
 * Caching is best-effort: any failure while creating the directory,
 * serializing or writing is ignored.
 *
 * @param cacheKey - Entry name, as produced by `getCacheKey`.
 * @param result - Classify output to store.
 */
function saveCachedClassify(cacheKey: string, result: ClassifyOutput): void {
  try {
    const cacheDirMissing = !existsSync(CACHE_DIR)
    if (cacheDirMissing) {
      mkdirSync(CACHE_DIR, { recursive: true })
    }
    writeFileSync(join(CACHE_DIR, `${cacheKey}.json`), JSON.stringify(result))
  } catch {
    // A failed cache write must never fail the eval run.
  }
}
|
|
138
|
-
|
|
139
|
-
/**
 * Delete the classify cache directory and everything inside it.
 *
 * Does nothing when the directory is absent; removal errors are ignored.
 */
function clearClassifyCache(): void {
  try {
    if (!existsSync(CACHE_DIR)) {
      return
    }
    rmSync(CACHE_DIR, { recursive: true, force: true })
  } catch {
    // Best-effort cleanup: a leftover cache is not fatal.
  }
}
|
|
148
|
-
|
|
149
|
-
// ============================================================================
|
|
150
|
-
// Types
|
|
151
|
-
// ============================================================================
|
|
152
|
-
|
|
153
|
-
/**
 * Names of the pipeline steps accepted by the eval runner. `e2e` selects the
 * end-to-end evaluator; `draft` is accepted as a name but `run` reports it as
 * not yet implemented.
 */
type PipelineStep = 'classify' | 'route' | 'gather' | 'draft' | 'validate' | 'e2e'
|
|
160
|
-
|
|
161
|
-
/** Options accepted by `run`. */
interface RunOptions {
  /** Pipeline step to evaluate. */
  step: PipelineStep
  /** Glob of scenario JSON files; used when `dataset` is not given. */
  scenarios?: string
  /** Path to a JSON dataset file; takes precedence over `scenarios`. */
  dataset?: string
  /** Keep only the first `limit` scenarios. */
  limit?: number
  /** Print per-scenario failure details instead of the progress line. */
  verbose?: boolean
  /** Print metrics and results as JSON instead of the formatted report. */
  json?: boolean
  /** Model name forwarded to the classifier. */
  model?: string
  /** Forwarded to the classifier as `forceLLM`. */
  forceLlm?: boolean
  /** Connect to the Docker-backed services before running. */
  realTools?: boolean
  /** Batch size for concurrent scenario processing; `run` defaults it to 10. */
  parallel?: number
  /** Reuse classify results cached on disk. */
  cacheClassify?: boolean
  /** Delete the classify cache before running. */
  clearCache?: boolean
  /** Stop after the first batch that contains a failure. */
  failFast?: boolean
  /** Run only the smoke-test subset of scenarios. */
  quick?: boolean
}
|
|
177
|
-
|
|
178
|
-
/**
 * One labeled eval scenario.
 *
 * The inbound message may be supplied as either `trigger` or
 * `triggerMessage`; evaluators read `trigger` first and fall back to
 * `triggerMessage`.
 */
interface Scenario {
  id: string
  name?: string
  trigger?: { subject: string; body: string }
  triggerMessage?: { subject: string; body: string }
  appId?: string
  /** Marks the scenario as part of the quick (smoke-test) subset. */
  smoke?: boolean
  // Expected values for evals
  expectedCategory?: MessageCategory
  expectedAction?: RouteAction
  expectedBehavior?: string
  /** Fallback grouping label, kept for backwards compatibility. */
  category?: string
  // Validate eval fields
  /** Pre-provided draft to validate. */
  draft?: string
  assertions?: {
    noFabrication?: boolean
    noMetaCommentary?: boolean
    noInternalLeak?: boolean
    noBannedPhrases?: boolean
    mustNotContain?: string[]
  }
  /** Optional context for validation. */
  context?: {
    /** Passed to the real tools as the customer email by the gather eval. */
    customer?: string
    conversation?: unknown
  }
}
|
|
204
|
-
|
|
205
|
-
/** Outcome of evaluating one scenario for one pipeline step. */
interface StepResult {
  /** Id of the evaluated scenario. */
  scenarioId: string
  /** Whether `actual` matched `expected`. */
  passed: boolean
  /** Expected label for the step (category, action, ...). */
  expected: string
  /** Label produced by the step, or an `ERROR: ...` message on failure. */
  actual: string
  /** Classifier confidence, when the step reports one. */
  confidence?: number
  /** Wall-clock time spent on the scenario, in milliseconds. */
  durationMs: number
  /** Explanation reported by the step, when available. */
  reasoning?: string
}
|
|
214
|
-
|
|
215
|
-
/** Aggregate metrics for one eval run. */
interface EvalMetrics {
  total: number
  passed: number
  failed: number
  accuracy: number
  durationMs: number
  /** Per-category/action breakdown, keyed by label. */
  breakdown: Record<
    string,
    { tp: number; fp: number; fn: number; precision: number; recall: number }
  >
  // Special metrics
  /** For route: incorrectly silenced. */
  falseSilenceRate?: number
  /** For route: incorrectly responded. */
  falseRespondRate?: number
}
|
|
230
|
-
|
|
231
|
-
// ============================================================================
|
|
232
|
-
// Main runner
|
|
233
|
-
// ============================================================================
|
|
234
|
-
|
|
235
|
-
/**
 * Entry point of the pipeline step eval command.
 *
 * Optionally clears the classify cache, loads and filters the scenarios,
 * optionally connects to the Docker-backed real tools, runs the evaluator for
 * the requested step, and prints the metrics (formatted, or as JSON when
 * `json` is set).
 *
 * Steps that cannot run (`draft`, or an unknown name) are rejected before any
 * real-tool connection is opened: `process.exit()` terminates the process
 * immediately, so exiting from inside the `try` block would skip the
 * connection cleanup in `finally`.
 *
 * @param options - See `RunOptions`; `parallel` defaults to 10.
 * @returns Resolves once the report has been printed.
 *
 * Exits the process with code 1 when the step is unsupported or when
 * `realTools` is set and neither MySQL nor Qdrant is reachable.
 */
export async function run(options: RunOptions): Promise<void> {
  const {
    step,
    scenarios: scenarioGlob,
    dataset,
    limit,
    verbose,
    json,
    model,
    forceLlm,
    realTools,
    parallel = 10,
    cacheClassify,
    clearCache,
    failFast,
    quick,
  } = options

  // Clear cache if requested
  if (clearCache) {
    clearClassifyCache()
    if (!json) {
      console.log('🗑️ Cleared classify cache\n')
    }
  }

  // Load scenarios
  let scenarios = await loadScenarios(scenarioGlob, dataset)

  // Apply quick filter (smoke test subset)
  if (quick) {
    scenarios = filterQuickScenarios(scenarios)
    if (!json) {
      console.log(`⚡ Quick mode: filtered to ${scenarios.length} scenarios\n`)
    }
  }

  if (limit && limit < scenarios.length) {
    scenarios = scenarios.slice(0, limit)
  }

  if (!json) {
    const parallelInfo = parallel > 1 ? ` (parallel: ${parallel})` : ''
    const flags = [
      cacheClassify ? 'cache' : null,
      failFast ? 'fail-fast' : null,
    ]
      .filter(Boolean)
      .join(', ')
    const flagsInfo = flags ? ` [${flags}]` : ''
    console.log(
      `\n🧪 Running ${step} eval on ${scenarios.length} scenarios${parallelInfo}${flagsInfo}\n`
    )
  }

  // Reject unsupported steps while no service connection is open yet.
  if (step === 'draft') {
    console.error(
      `Step "${step}" not yet implemented. Use e2e for full pipeline.`
    )
    process.exit(1)
  }
  const runnableSteps: PipelineStep[] = [
    'classify',
    'route',
    'gather',
    'validate',
    'e2e',
  ]
  if (!runnableSteps.includes(step)) {
    console.error(`Unknown step: ${step}`)
    process.exit(1)
  }

  // Initialize real tools if requested
  if (realTools) {
    if (!json) {
      console.log('🔌 Connecting to Docker services...')
    }
    const status = await initRealTools(undefined, verbose && !json)

    if (!status.mysql && !status.qdrant) {
      console.error('❌ Failed to connect to any Docker services')
      console.error(' Make sure MySQL (3306) and Qdrant (6333) are running')
      process.exit(1)
    }

    if (!json) {
      console.log('')
    }
  }

  const startTime = Date.now()
  let results: StepResult[] = []

  try {
    const evalOptions = {
      verbose,
      model,
      forceLlm,
      realTools,
      parallel,
      cacheClassify,
      failFast,
    }

    switch (step) {
      case 'classify':
        results = await runClassifyEval(scenarios, evalOptions)
        break
      case 'route':
        results = await runRouteEval(scenarios, evalOptions)
        break
      case 'gather':
        results = await runGatherEval(scenarios, evalOptions)
        break
      case 'validate':
        results = await runValidateEval(scenarios, evalOptions)
        break
      case 'e2e':
        results = await runE2EEval(scenarios, evalOptions)
        break
    }
  } finally {
    // Clean up real tools connections, also when an evaluator throws
    if (realTools) {
      await cleanupRealTools()
    }
  }

  const totalDuration = Date.now() - startTime
  const metrics = computeMetrics(results, step, totalDuration)

  if (json) {
    console.log(JSON.stringify({ metrics, results }, null, 2))
  } else {
    printMetrics(step, metrics, verbose ? results : undefined)
  }
}
|
|
363
|
-
|
|
364
|
-
// ============================================================================
|
|
365
|
-
// Scenario loading
|
|
366
|
-
// ============================================================================
|
|
367
|
-
|
|
368
|
-
/**
 * Load eval scenarios from a dataset file or from scenario files.
 *
 * With `datasetPath`, the file must contain a JSON array; every item is
 * normalized into a `Scenario`, filling `expectedCategory` / `expectedAction`
 * through `inferCategory` / `inferAction` when the item does not set them.
 * The `smoke`, `draft`, `assertions` and `context` fields are carried over
 * too, so quick mode and pre-provided drafts behave the same for datasets as
 * for scenario files.
 *
 * With `scenarioGlob` (and no dataset), every matching file is parsed as JSON
 * and returned as-is, one scenario per file.
 *
 * @param scenarioGlob - Glob pattern of scenario JSON files.
 * @param datasetPath - Path of a JSON dataset; takes precedence over the glob.
 * @returns The loaded scenarios.
 * @throws TypeError when the dataset is not a JSON array; also rejects on
 *   file read or JSON parse errors.
 *
 * Exits the process with code 1 when the glob matches nothing or when neither
 * argument is provided.
 */
async function loadScenarios(
  scenarioGlob?: string,
  datasetPath?: string
): Promise<Scenario[]> {
  if (datasetPath) {
    const content = await readFile(datasetPath, 'utf-8')
    const data = JSON.parse(content)

    if (!Array.isArray(data)) {
      throw new TypeError(
        `Dataset ${datasetPath} must contain a JSON array of scenarios`
      )
    }

    // Handle comprehensive-dataset.json format
    return data.map((item: any) => ({
      id: item.id || item.conversationId,
      name: item.triggerMessage?.subject || item.name,
      trigger: item.trigger,
      triggerMessage: item.triggerMessage,
      appId: item.appId || item.app,
      smoke: item.smoke,
      expectedCategory: item.expectedCategory || inferCategory(item),
      expectedAction: item.expectedAction || inferAction(item),
      expectedBehavior: item.expectedBehavior,
      category: item.category,
      draft: item.draft,
      assertions: item.assertions,
      context: item.context,
    }))
  }

  if (scenarioGlob) {
    const files = await glob(scenarioGlob)
    if (files.length === 0) {
      console.error(`No scenario files found matching: ${scenarioGlob}`)
      process.exit(1)
    }

    return Promise.all(
      files.map(async (file) => {
        const content = await readFile(file, 'utf-8')
        return JSON.parse(content)
      })
    )
  }

  console.error('Must provide --scenarios or --dataset')
  process.exit(1)
}
|
|
408
|
-
|
|
409
|
-
/**
 * Infer the expected category of a dataset item that does not set one.
 *
 * When the item carries `agentResponse.category`, only the fixed mapping
 * below is consulted and an unmapped value yields `undefined` (no content
 * inference is attempted). Otherwise the lowercased subject and body of
 * `triggerMessage` are matched against keyword rules; the first matching
 * rule wins, so rule order encodes priority.
 *
 * @param item - Raw dataset item.
 * @returns The inferred category, or `undefined` when nothing matches.
 */
function inferCategory(item: any): MessageCategory | undefined {
  // If agentResponse has category, map it. A Map is used so that inherited
  // object members such as "constructor" can never be returned as a category.
  if (item.agentResponse?.category) {
    const agentCategoryMap = new Map<string, MessageCategory>([
      ['tool-assisted', 'support_access'],
      ['auto', 'system'],
      ['spam', 'spam'],
    ])
    return agentCategoryMap.get(item.agentResponse.category)
  }

  // Infer from message content
  const text =
    `${item.triggerMessage?.subject || ''} ${item.triggerMessage?.body || ''}`.toLowerCase()

  // The text is already lowercased, so the patterns need no `i` flag.
  const contentRules: Array<[RegExp, MessageCategory]> = [
    [/refund|money back/, 'support_refund'],
    // Accept both the ASCII and the typographic apostrophe in "can't".
    [/can['’]t access|lost access|no access|restore access/, 'support_access'],
    [/transfer|different email|wrong email/, 'support_transfer'],
    [/invoice|receipt/, 'support_billing'],
    [/partnership|sponsor|backlink|outreach|seo/, 'spam'],
    [/auto-reply|out of office|mailer-daemon/, 'system'],
    [/thank|love|amazing|big fan/, 'fan_mail'],
  ]

  for (const [pattern, category] of contentRules) {
    if (pattern.test(text)) {
      return category
    }
  }

  return undefined
}
|
|
439
|
-
|
|
440
|
-
/**
 * Select the smoke-test subset used by quick mode.
 *
 * Scenarios flagged with `smoke: true` are returned when at least one exists.
 * Otherwise scenarios are grouped by `expectedCategory`, then `category`,
 * then `expectedAction` (or `'other'`), and the first two of every group are
 * returned, with groups ordered by first appearance.
 *
 * @param scenarios - All loaded scenarios.
 * @returns The reduced scenario list.
 */
function filterQuickScenarios(scenarios: Scenario[]): Scenario[] {
  const flagged = scenarios.filter((s: any) => s.smoke === true)
  if (flagged.length > 0) {
    return flagged
  }

  // No smoke flags: sample per group instead.
  const groups = new Map<string, Scenario[]>()
  scenarios.forEach((scenario) => {
    const groupKey =
      scenario.expectedCategory ||
      scenario.category ||
      scenario.expectedAction ||
      'other'
    const bucket = groups.get(groupKey)
    if (bucket) {
      bucket.push(scenario)
    } else {
      groups.set(groupKey, [scenario])
    }
  })

  const picked: Scenario[] = []
  groups.forEach((bucket) => {
    picked.push(...bucket.slice(0, 2))
  })
  return picked
}
|
|
472
|
-
|
|
473
|
-
/**
 * Infer the expected route action of a dataset item from its
 * `expectedBehavior` text (case-insensitive keyword match).
 *
 * Checks run from most to least specific: the instructor check comes before
 * the generic escalate/human check, because a behavior such as
 * "escalate to instructor" also contains "escalate" and would otherwise
 * always resolve to `escalate_human`.
 *
 * @param item - Raw dataset item.
 * @returns The inferred action, `'respond'` when the item has an agent
 *   response text, or `undefined` when nothing matches.
 */
function inferAction(item: any): RouteAction | undefined {
  const behavior = item.expectedBehavior?.toLowerCase() || ''

  if (behavior.includes('silent') || behavior.includes('ignore'))
    return 'silence'
  if (behavior.includes('instructor')) return 'escalate_instructor'
  if (behavior.includes('escalate') || behavior.includes('human'))
    return 'escalate_human'
  if (behavior.includes('respond') || behavior.includes('draft'))
    return 'respond'

  // If agent responded, it was probably meant to respond
  if (item.agentResponse?.text) return 'respond'

  return undefined
}
|
|
492
|
-
|
|
493
|
-
// ============================================================================
|
|
494
|
-
// Step evaluators
|
|
495
|
-
// ============================================================================
|
|
496
|
-
|
|
497
|
-
/** Options shared by the individual step evaluators. */
interface EvalOptions {
  /** Print per-scenario failure details instead of the progress line. */
  verbose?: boolean
  /** Model name forwarded to the classifier. */
  model?: string
  /** Forwarded to the classifier as `forceLLM`. */
  forceLlm?: boolean
  /** Whether the real, Docker-backed tools were requested. */
  realTools?: boolean
  /** Batch size for concurrent scenario processing; evaluators fall back to 1. */
  parallel?: number
  /** Reuse classify results cached on disk. */
  cacheClassify?: boolean
  /** Stop after the first batch that contains a failure. */
  failFast?: boolean
}
|
|
506
|
-
|
|
507
|
-
/**
 * Evaluate the classify step: classify every scenario's trigger message and
 * compare the resulting category with `expectedCategory` (`'unknown'` when
 * the scenario has none).
 *
 * Scenarios run in batches of `options.parallel` (default 1). With
 * `options.cacheClassify`, classifier output is read from and written to the
 * on-disk classify cache. Classifier errors are reported as failed results
 * with an `ERROR: ...` actual value instead of being thrown.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Evaluator options.
 * @returns One result per processed scenario; fewer than `scenarios.length`
 *   when `options.failFast` stopped the run early.
 */
async function runClassifyEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const concurrency = options.parallel || 1
  const classifyHash = options.cacheClassify ? getClassifySourceHash() : ''
  let completed = 0

  // Classify one input, going through the disk cache when it is enabled.
  const classifyWithCache = async (
    scenarioId: string,
    input: ClassifyInput
  ): Promise<ClassifyOutput> => {
    const classifyFresh = () =>
      classify(input, {
        forceLLM: options.forceLlm,
        model: options.model,
      })

    if (!options.cacheClassify) {
      return classifyFresh()
    }

    const cacheKey = getCacheKey(scenarioId, classifyHash)
    const cached = loadCachedClassify(cacheKey)
    if (cached) {
      return cached
    }
    const fresh = await classifyFresh()
    saveCachedClassify(cacheKey, fresh)
    return fresh
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    const trigger = scenario.trigger || scenario.triggerMessage
    if (!trigger) {
      // Count skipped scenarios too, so the progress counter can reach the total.
      completed++
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: scenario.expectedCategory || 'unknown',
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const input: ClassifyInput = {
      subject: trigger.subject,
      body: trigger.body,
      appId: scenario.appId,
    }

    const startTime = Date.now()
    try {
      const result = await classifyWithCache(scenario.id, input)

      const expected = scenario.expectedCategory || 'unknown'
      const passed = result.category === expected

      completed++
      if (!options.verbose) {
        process.stdout.write(
          `\r Processing ${completed}/${scenarios.length}...`
        )
      }

      if (options.verbose && !passed) {
        // Scenario JSON is not validated: a missing subject must not turn a
        // finished classification into an ERROR result just because of logging.
        const subjectPreview = String(trigger.subject ?? '').slice(0, 60)
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Expected: ${expected}`)
        console.log(
          ` Actual: ${result.category} (${(result.confidence * 100).toFixed(0)}%)`
        )
        console.log(` Subject: ${subjectPreview}...`)
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: result.category,
        confidence: result.confidence,
        durationMs: Date.now() - startTime,
        reasoning: result.reasoning,
      }
    } catch (error) {
      completed++
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: scenario.expectedCategory || 'unknown',
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    scenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the in-place progress line.
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
|
|
610
|
-
|
|
611
|
-
/**
 * Evaluate the route step: classify every scenario's trigger message, route
 * it, and compare the resulting action with `expectedAction` (`'respond'`
 * when the scenario has none, on every code path).
 *
 * Scenarios run in batches of `options.parallel` (default 1). With
 * `options.cacheClassify`, classifier output is read from and written to the
 * on-disk classify cache. Routing uses a fixed app config with
 * `instructorConfigured: true` and `autoSendEnabled: false`. Errors are
 * reported as failed results with an `ERROR: ...` actual value instead of
 * being thrown.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Evaluator options.
 * @returns One result per processed scenario; fewer than `scenarios.length`
 *   when `options.failFast` stopped the run early.
 */
async function runRouteEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const concurrency = options.parallel || 1
  const classifyHash = options.cacheClassify ? getClassifySourceHash() : ''
  let completed = 0

  // Classify one input, going through the disk cache when it is enabled.
  const classifyWithCache = async (
    scenarioId: string,
    input: ClassifyInput
  ): Promise<ClassifyOutput> => {
    const classifyFresh = () =>
      classify(input, {
        forceLLM: options.forceLlm,
        model: options.model,
      })

    if (!options.cacheClassify) {
      return classifyFresh()
    }

    const cacheKey = getCacheKey(scenarioId, classifyHash)
    const cached = loadCachedClassify(cacheKey)
    if (cached) {
      return cached
    }
    const fresh = await classifyFresh()
    saveCachedClassify(cacheKey, fresh)
    return fresh
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    // Same default on the no-trigger, success and error paths.
    const expected = scenario.expectedAction || 'respond'

    const trigger = scenario.trigger || scenario.triggerMessage
    if (!trigger) {
      // Count skipped scenarios too, so the progress counter can reach the total.
      completed++
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const input: ClassifyInput = {
      subject: trigger.subject,
      body: trigger.body,
      appId: scenario.appId,
    }

    const startTime = Date.now()
    try {
      // First classify (with cache support), then route
      const classification = await classifyWithCache(scenario.id, input)

      const routeResult = route({
        message: input,
        classification,
        appConfig: {
          appId: scenario.appId || 'eval',
          instructorConfigured: true,
          autoSendEnabled: false,
        },
      })

      const passed = routeResult.action === expected

      completed++
      if (!options.verbose) {
        process.stdout.write(
          `\r Processing ${completed}/${scenarios.length}...`
        )
      }

      if (options.verbose && !passed) {
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Expected: ${expected}`)
        console.log(` Actual: ${routeResult.action}`)
        console.log(` Category: ${classification.category}`)
        console.log(` Reason: ${routeResult.reason}`)
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: routeResult.action,
        durationMs: Date.now() - startTime,
        reasoning: routeResult.reason,
      }
    } catch (error) {
      completed++
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    scenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the in-place progress line.
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
|
|
722
|
-
|
|
723
|
-
/**
 * Run the gather eval over `scenarios`.
 *
 * When `options.realTools` is not set, or `isRealToolsAvailable()` reports
 * false, no tools are called: every scenario is returned as a passing
 * `context_complete` placeholder and a warning is printed.
 *
 * Otherwise each scenario calls the `lookupUser` and `searchKnowledge` tools
 * built by `createRealTools` and passes when the user was found or at least
 * one knowledge item / similar ticket came back.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Reads `parallel`, `realTools`, `verbose` and `failFast`.
 * @returns One result per processed scenario, as collected by
 *   `runBatchWithFailFast`.
 */
async function runGatherEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const concurrency = options.parallel || 1
  let completed = 0

  // Real tool calls are opt-in and additionally require the availability check.
  const useRealTools = options.realTools && isRealToolsAvailable()

  if (!useRealTools) {
    // Fallback to mock behavior: nothing is executed, everything "passes".
    const results = scenarios.map((scenario) => ({
      scenarioId: scenario.id,
      passed: true,
      expected: 'context_complete',
      actual: 'context_complete',
      durationMs: 0,
      reasoning: 'Gather eval requires --real-tools flag with Docker services',
    }))

    if (!options.verbose) {
      console.log(` Processing ${scenarios.length}/${scenarios.length}...`)
    }
    console.log(
      '\n⚠️ Gather eval: Use --real-tools with Docker services for actual tool calls\n'
    )
    return results
  }

  // Every way a scenario can finish (no trigger, success, error) goes through
  // here so the non-verbose progress line always ends at N/N.
  const markScenarioDone = (): void => {
    completed++
    if (!options.verbose) {
      process.stdout.write(
        `\r Processing ${completed}/${scenarios.length}...`
      )
    }
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    const trigger = scenario.trigger || scenario.triggerMessage
    if (!trigger) {
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'context_complete',
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const startTime = Date.now()
    try {
      // Create real tools for this scenario
      const tools = createRealTools({
        appId: scenario.appId,
        customerEmail: scenario.context?.customer as string,
      })

      // Short per-tool summaries; joined into `reasoning` on the result.
      const toolResults: string[] = []
      let userFound = false
      let knowledgeCount = 0

      // Try lookupUser (skipped when the tool exposes no execute function)
      const lookupUserExec = tools.lookupUser.execute
      if (lookupUserExec) {
        const userResult = await lookupUserExec(
          {
            email: (scenario.context?.customer as string) || '[EMAIL]',
            appId: scenario.appId || 'eval',
          },
          { toolCallId: 'test', messages: [] }
        )
        userFound = !!(userResult as any).found
        toolResults.push(`user:${userFound ? 'found' : 'not_found'}`)
      }

      // Try searchKnowledge; the subject is the query, falling back to the body
      const searchKnowledgeExec = tools.searchKnowledge.execute
      if (searchKnowledgeExec) {
        const knowledgeResult = await searchKnowledgeExec(
          {
            query: trigger.subject || trigger.body,
            appId: scenario.appId || 'eval',
          },
          { toolCallId: 'test', messages: [] }
        )
        // Knowledge entries and similar tickets both count as gathered context.
        knowledgeCount =
          ((knowledgeResult as any).knowledge?.length || 0) +
          ((knowledgeResult as any).similarTickets?.length || 0)
        toolResults.push(`knowledge:${knowledgeCount}`)
      }

      // Evaluate: pass if we got some context
      const hasContext = userFound || knowledgeCount > 0
      const expected = 'context_complete'
      const actual = hasContext ? 'context_complete' : 'context_incomplete'

      markScenarioDone()

      if (options.verbose && !hasContext) {
        console.log(`\n⚠️ ${scenario.id}`)
        console.log(` Context: ${toolResults.join(', ')}`)
      }

      return {
        scenarioId: scenario.id,
        passed: hasContext,
        expected,
        actual,
        durationMs: Date.now() - startTime,
        reasoning: toolResults.join(', '),
      }
    } catch (error) {
      // A throwing tool fails the scenario but does not abort the batch here.
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'context_complete',
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    scenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the `\r`-rewritten progress line before any further output.
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
|
|
858
|
-
|
|
859
|
-
/**
 * Run validate eval against scenarios with drafts.
 *
 * Scenarios can include:
 * - `draft`: A pre-provided draft to validate
 * - `assertions`: Expected validation outcomes (noFabrication, noMetaCommentary, etc.)
 *
 * Scenarios with neither field are filtered out; if none remain, a hint is
 * printed and an empty array is returned. Scenarios that have assertions but
 * no draft are kept and reported as passed with `actual: 'skipped'`, because
 * there is nothing to validate without a draft.
 *
 * A scenario fails when an enabled `noX` assertion coincides with an issue of
 * the matching type returned by `validate`, or when the draft contains
 * (case-insensitively) any `mustNotContain` pattern.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Reads `parallel`, `verbose` and `failFast`.
 * @returns One result per processed scenario, as collected by
 *   `runBatchWithFailFast`.
 */
async function runValidateEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const concurrency = options.parallel || 1
  let completed = 0

  // Filter to scenarios with drafts or assertions
  const validScenarios = scenarios.filter((s) => s.draft || s.assertions)

  if (validScenarios.length === 0) {
    console.log('\n⚠️ No scenarios with draft or assertions found.')
    console.log(' For validate eval, scenarios need either:')
    console.log(' - "draft": "text to validate"')
    console.log(' - "assertions": { "noFabrication": true, ... }\n')
    return []
  }

  // Map issue types to assertion names (unused at runtime, kept for
  // documentation). Built once here rather than once per scenario; the Record
  // type still forces an entry for every ValidationIssueType.
  const _issueTypeToAssertion: Record<
    ValidationIssueType,
    keyof NonNullable<Scenario['assertions']>
  > = {
    fabrication: 'noFabrication',
    meta_commentary: 'noMetaCommentary',
    internal_leak: 'noInternalLeak',
    banned_phrase: 'noBannedPhrases',
    too_short: 'noBannedPhrases', // No specific assertion
    too_long: 'noBannedPhrases', // No specific assertion
    bad_tone: 'noBannedPhrases', // No specific assertion
    repeated_mistake: 'noBannedPhrases', // No specific assertion
    relevance: 'noBannedPhrases', // No specific assertion for relevance
    ground_truth_mismatch: 'noBannedPhrases', // No specific assertion
    audience_inappropriate: 'noBannedPhrases', // No specific assertion
    tool_failure: 'noBannedPhrases', // No specific assertion
  }

  // Every way a scenario can finish (skipped, validated, error) goes through
  // here so the non-verbose progress line always ends at N/N.
  const markScenarioDone = (): void => {
    completed++
    if (!options.verbose) {
      process.stdout.write(
        `\r Processing ${completed}/${validScenarios.length}...`
      )
    }
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    // If scenario has no draft but has assertions, it's for checking generated drafts
    // For now, skip those (they'd need full pipeline)
    const draft = scenario.draft
    if (!draft) {
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: true, // Can't evaluate without draft
        expected: 'needs_draft',
        actual: 'skipped',
        durationMs: 0,
        reasoning: 'No draft provided - use e2e eval with assertions',
      }
    }

    const startTime = Date.now()

    // Create minimal context for validation
    // Fabrication check needs knowledge array to be empty to trigger
    const hasKnowledge = scenario.context?.customer === 'recent-purchase'
    const mockContext: GatherOutput = {
      user: hasKnowledge ? { id: 'test', email: '[EMAIL]' } : null,
      purchases: hasKnowledge
        ? [
            {
              id: 'p1',
              productId: 'prod1',
              productName: 'Test Product',
              purchasedAt: new Date().toISOString(),
              status: 'active',
            },
          ]
        : [],
      knowledge: hasKnowledge
        ? [
            {
              id: 'k1',
              type: 'faq',
              content: 'test knowledge',
              relevance: 0.9,
            },
          ]
        : [],
      history: [],
      priorMemory: [],
      priorConversations: [],
      gatherErrors: [],
    }

    try {
      const result = await validate({
        draft,
        context: mockContext,
        strictMode: false,
      })

      // Check if assertions match
      const assertions = scenario.assertions || {}
      const failedAssertions: string[] = []
      const foundIssueTypes = new Set(
        result.issues.map((i: { type: string }) => i.type)
      )

      // Check negative assertions (noX = expect no issues of type X)
      const expectNoIssue = (
        enabled: unknown,
        assertionName: string,
        issueType: string
      ): void => {
        if (enabled && foundIssueTypes.has(issueType)) {
          failedAssertions.push(`${assertionName}: found ${issueType}`)
        }
      }
      expectNoIssue(assertions.noFabrication, 'noFabrication', 'fabrication')
      expectNoIssue(
        assertions.noMetaCommentary,
        'noMetaCommentary',
        'meta_commentary'
      )
      expectNoIssue(assertions.noInternalLeak, 'noInternalLeak', 'internal_leak')
      expectNoIssue(
        assertions.noBannedPhrases,
        'noBannedPhrases',
        'banned_phrase'
      )

      // Check mustNotContain patterns (case-insensitive substring match)
      if (assertions.mustNotContain) {
        const loweredDraft = draft.toLowerCase()
        for (const pattern of assertions.mustNotContain) {
          if (loweredDraft.includes(pattern.toLowerCase())) {
            failedAssertions.push(`mustNotContain: found "${pattern}"`)
          }
        }
      }

      const passed = failedAssertions.length === 0
      const issuesSummary = result.issues
        .map(
          (i: { type: string; match?: string }) =>
            `${i.type}:${i.match || 'none'}`
        )
        .join(', ')

      markScenarioDone()

      if (options.verbose && !passed) {
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Failed assertions: ${failedAssertions.join(', ')}`)
        console.log(` Issues found: ${issuesSummary || 'none'}`)
        console.log(` Draft preview: ${draft.slice(0, 80)}...`)
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected: 'valid',
        actual: passed ? 'valid' : `invalid: ${failedAssertions.join('; ')}`,
        durationMs: Date.now() - startTime,
        reasoning: issuesSummary || 'no issues found',
      }
    } catch (error) {
      // A throwing validator fails the scenario but does not abort the batch here.
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected: 'valid',
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    validScenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the `\r`-rewritten progress line before any further output.
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
|
|
1049
|
-
|
|
1050
|
-
/**
 * Run the end-to-end eval: each scenario's trigger is sent through
 * `runPipeline` (dry run, auto-send disabled) and the resulting action is
 * compared with `scenario.expectedAction`, defaulting to `'respond'`.
 *
 * @param scenarios - Scenarios to evaluate.
 * @param options - Reads `parallel`, `realTools`, `verbose`, `failFast` and
 *   `model` (used for both the classify and draft models).
 * @returns One result per processed scenario, as collected by
 *   `runBatchWithFailFast`.
 */
async function runE2EEval(
  scenarios: Scenario[],
  options: EvalOptions
): Promise<StepResult[]> {
  const { runPipeline } = await import('@skillrecordings/core/pipeline')
  const concurrency = options.parallel || 1
  let completed = 0

  // Note: Real tools are available when --real-tools is passed
  // They're initialized globally and accessible to the pipeline's gather step
  if (options.realTools && options.verbose) {
    const available = isRealToolsAvailable()
    console.log(` Real tools: ${available ? 'connected' : 'not available'}\n`)
  }

  // Every way a scenario can finish (no trigger, success, error) goes through
  // here so the non-verbose progress line always ends at N/N.
  const markScenarioDone = (): void => {
    completed++
    if (!options.verbose) {
      process.stdout.write(
        `\r Processing ${completed}/${scenarios.length}...`
      )
    }
  }

  const processScenario = async (scenario: Scenario): Promise<StepResult> => {
    // Resolved once so every exit path reports the same expectation.
    const expected = scenario.expectedAction || 'respond'

    const trigger = scenario.trigger || scenario.triggerMessage
    if (!trigger) {
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: 'ERROR: no trigger',
        durationMs: 0,
      }
    }

    const startTime = Date.now()
    try {
      // Note: Real tools are initialized globally via initRealTools()
      // The pipeline will use them via the gather step's tool providers
      // when --real-tools is enabled and services are available

      const pipelineResult = await runPipeline(
        {
          message: {
            subject: trigger.subject,
            body: trigger.body,
            appId: scenario.appId,
          },
          appConfig: {
            appId: scenario.appId || 'eval',
            instructorConfigured: true,
            autoSendEnabled: false,
          },
          dryRun: true,
        },
        {
          classifyModel: options.model,
          draftModel: options.model,
        }
      )

      // For e2e, check if action matches expected
      const passed = pipelineResult.action === expected

      markScenarioDone()

      if (options.verbose && !passed) {
        console.log(`\n❌ ${scenario.id}`)
        console.log(` Expected: ${expected}`)
        console.log(` Actual: ${pipelineResult.action}`)
        console.log(
          ` Steps: ${pipelineResult.steps.map((s) => s.step).join(' → ')}`
        )
      }

      return {
        scenarioId: scenario.id,
        passed,
        expected,
        actual: pipelineResult.action,
        durationMs: Date.now() - startTime,
        // Per-step success flags, e.g. "stepName:true, otherStep:false".
        reasoning: pipelineResult.steps
          .map((s) => `${s.step}:${s.success}`)
          .join(', '),
      }
    } catch (error) {
      // A throwing pipeline fails the scenario but does not abort the batch here.
      markScenarioDone()
      return {
        scenarioId: scenario.id,
        passed: false,
        expected,
        actual: `ERROR: ${error instanceof Error ? error.message : 'Unknown'}`,
        durationMs: Date.now() - startTime,
      }
    }
  }

  const { results, aborted } = await runBatchWithFailFast(
    scenarios,
    (scenario) => processScenario(scenario),
    concurrency,
    options.failFast || false
  )

  // Terminate the `\r`-rewritten progress line before any further output.
  if (!options.verbose) console.log('')
  if (aborted) {
    console.log('⚠️ Stopped early due to --fail-fast\n')
  }
  return results
}
|
|
1158
|
-
|
|
1159
|
-
// ============================================================================
|
|
1160
|
-
// Metrics computation
|
|
1161
|
-
// ============================================================================
|
|
1162
|
-
|
|
1163
|
-
/**
 * Summarise step results into aggregate eval metrics.
 *
 * The breakdown holds, per label seen in `expected` or `actual` (labels
 * starting with "ERROR" are left out), the true-positive, false-positive and
 * false-negative counts plus the precision and recall derived from them.
 * For the `route` step the false-silence and false-respond rates are added.
 *
 * @param results - Per-scenario outcomes.
 * @param step - Pipeline step the results belong to.
 * @param totalDurationMs - Duration stored unchanged as `durationMs`.
 * @returns The computed metrics object.
 */
function computeMetrics(
  results: StepResult[],
  step: PipelineStep,
  totalDurationMs: number
): EvalMetrics {
  const total = results.length
  let passed = 0
  for (const outcome of results) {
    if (outcome.passed) passed += 1
  }

  // Single pass over the results; the Map keeps labels in first-seen order
  // (expected before actual within each result).
  const tallies = new Map<string, { tp: number; fp: number; fn: number }>()
  const tallyFor = (
    label: string
  ): { tp: number; fp: number; fn: number } | undefined => {
    if (label.startsWith('ERROR')) return undefined
    let entry = tallies.get(label)
    if (entry === undefined) {
      entry = { tp: 0, fp: 0, fn: 0 }
      tallies.set(label, entry)
    }
    return entry
  }

  for (const outcome of results) {
    const wanted = tallyFor(outcome.expected)
    const got = tallyFor(outcome.actual)
    if (outcome.actual === outcome.expected) {
      // Same label on both sides: one true positive for it.
      if (wanted) wanted.tp += 1
    } else {
      // Mismatch: the expected label was missed, the actual one was spurious.
      if (got) got.fp += 1
      if (wanted) wanted.fn += 1
    }
  }

  const breakdown: Record<
    string,
    { tp: number; fp: number; fn: number; precision: number; recall: number }
  > = {}
  for (const [label, { tp, fp, fn }] of tallies) {
    const predicted = tp + fp
    const relevant = tp + fn
    breakdown[label] = {
      tp,
      fp,
      fn,
      precision: predicted > 0 ? tp / predicted : 0,
      recall: relevant > 0 ? tp / relevant : 0,
    }
  }

  const metrics: EvalMetrics = {
    total,
    passed,
    failed: total - passed,
    accuracy: total > 0 ? passed / total : 0,
    durationMs: totalDurationMs,
    breakdown,
  }

  if (step !== 'route') {
    return metrics
  }

  // Route-only rates: how often a wanted reply was silenced, and vice versa.
  let respondWanted = 0
  let silencedInstead = 0
  let silenceWanted = 0
  let respondedInstead = 0
  for (const outcome of results) {
    if (outcome.expected === 'respond') {
      respondWanted += 1
      if (outcome.actual === 'silence') silencedInstead += 1
    } else if (outcome.expected === 'silence') {
      silenceWanted += 1
      if (outcome.actual === 'respond') respondedInstead += 1
    }
  }

  metrics.falseSilenceRate =
    respondWanted > 0 ? silencedInstead / respondWanted : 0
  metrics.falseRespondRate =
    silenceWanted > 0 ? respondedInstead / silenceWanted : 0

  return metrics
}
|
|
1234
|
-
|
|
1235
|
-
// ============================================================================
|
|
1236
|
-
// Output
|
|
1237
|
-
// ============================================================================
|
|
1238
|
-
|
|
1239
|
-
/**
 * Print a human-readable summary of eval metrics to the console.
 *
 * Output, in order: header with totals, routing error rates (route step
 * only), a per-label breakdown (only when there are 2 to 20 labels), average
 * latency, and the first ten failures when `results` is supplied.
 *
 * @param step - Pipeline step the metrics belong to; selects the header emoji
 *   and the breakdown wording.
 * @param metrics - Metrics to print.
 * @param results - Optional per-scenario results; when given, failing ones
 *   are listed individually.
 */
function printMetrics(
  step: PipelineStep,
  metrics: EvalMetrics,
  results?: StepResult[]
): void {
  const stepEmoji: Record<PipelineStep, string> = {
    classify: '🏷️',
    route: '🚦',
    gather: '📦',
    draft: '✍️',
    validate: '✅',
    e2e: '🔄',
  }

  console.log(`${stepEmoji[step]} ${step.toUpperCase()} Eval Results\n`)
  console.log(`Total: ${metrics.total}`)
  console.log(
    ` ✅ Passed: ${metrics.passed} (${(metrics.accuracy * 100).toFixed(1)}%)`
  )
  console.log(` ❌ Failed: ${metrics.failed}`)

  if (step === 'route' && metrics.falseSilenceRate !== undefined) {
    console.log(`\nRouting Errors:`)
    console.log(
      ` False silence rate: ${(metrics.falseSilenceRate * 100).toFixed(1)}%`
    )
    // Default to 0 so a metrics object carrying only one of the two rates
    // cannot print "NaN%".
    console.log(
      ` False respond rate: ${((metrics.falseRespondRate ?? 0) * 100).toFixed(1)}%`
    )
  }

  // Show breakdown if there are multiple labels
  const labelCount = Object.keys(metrics.breakdown).length
  if (labelCount > 1 && labelCount <= 20) {
    console.log(
      `\nBreakdown by ${step === 'classify' ? 'category' : 'action'}:`
    )

    // Most frequently expected labels first (tp + fn = times the label was expected).
    const sorted = Object.entries(metrics.breakdown)
      .filter(([label]) => !label.startsWith('ERROR'))
      .sort((a, b) => b[1].tp + b[1].fn - (a[1].tp + a[1].fn))

    for (const [label, stats] of sorted) {
      const total = stats.tp + stats.fn
      // Labels that were only ever predicted, never expected, are not listed.
      if (total === 0) continue

      const precisionStr = (stats.precision * 100).toFixed(0)
      const recallStr = (stats.recall * 100).toFixed(0)
      console.log(
        ` ${label}: ${stats.tp}/${total} (P=${precisionStr}% R=${recallStr}%)`
      )
    }
  }

  // Latency. With no results the division would yield NaN or Infinity, so
  // report 0 instead.
  const avgLatency = metrics.total > 0 ? metrics.durationMs / metrics.total : 0
  console.log(`\nLatency: ${avgLatency.toFixed(0)}ms avg`)

  // Show individual failures when the caller passed the results along
  if (results) {
    const failures = results.filter((r) => !r.passed)
    if (failures.length > 0) {
      console.log(`\n--- FAILURES (${failures.length}) ---\n`)
      for (const f of failures.slice(0, 10)) {
        console.log(`❌ ${f.scenarioId}`)
        console.log(` Expected: ${f.expected}`)
        console.log(` Actual: ${f.actual}`)
        if (f.reasoning) {
          console.log(` Reason: ${f.reasoning.slice(0, 80)}...`)
        }
        console.log('')
      }
      if (failures.length > 10) {
        console.log(` ... and ${failures.length - 10} more`)
      }
    }
  }
}
|