@skillrecordings/cli 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skill.mjs +21 -0
- package/dist/chunk-2NCCVTEE.js +22342 -0
- package/dist/chunk-2NCCVTEE.js.map +1 -0
- package/dist/chunk-3E3GYSZR.js +7071 -0
- package/dist/chunk-3E3GYSZR.js.map +1 -0
- package/dist/chunk-F4EM72IH.js +86 -0
- package/dist/chunk-F4EM72IH.js.map +1 -0
- package/dist/chunk-FGP7KUQW.js +432 -0
- package/dist/chunk-FGP7KUQW.js.map +1 -0
- package/dist/chunk-H3D6VCME.js +55 -0
- package/dist/chunk-H3D6VCME.js.map +1 -0
- package/dist/chunk-HK3PEWFD.js +208 -0
- package/dist/chunk-HK3PEWFD.js.map +1 -0
- package/dist/chunk-KEV3QKXP.js +4495 -0
- package/dist/chunk-KEV3QKXP.js.map +1 -0
- package/dist/chunk-MG37YDAK.js +882 -0
- package/dist/chunk-MG37YDAK.js.map +1 -0
- package/dist/chunk-MLNDSBZ4.js +482 -0
- package/dist/chunk-MLNDSBZ4.js.map +1 -0
- package/dist/chunk-N2WIV2JV.js +22 -0
- package/dist/chunk-N2WIV2JV.js.map +1 -0
- package/dist/chunk-PWWRCN5W.js +2067 -0
- package/dist/chunk-PWWRCN5W.js.map +1 -0
- package/dist/chunk-SKHBM3XP.js +7746 -0
- package/dist/chunk-SKHBM3XP.js.map +1 -0
- package/dist/chunk-WFANXVQG.js +64 -0
- package/dist/chunk-WFANXVQG.js.map +1 -0
- package/dist/chunk-WYKL32C3.js +275 -0
- package/dist/chunk-WYKL32C3.js.map +1 -0
- package/dist/chunk-ZNF7XD2S.js +134 -0
- package/dist/chunk-ZNF7XD2S.js.map +1 -0
- package/dist/config-AUAIYDSI.js +20 -0
- package/dist/config-AUAIYDSI.js.map +1 -0
- package/dist/fileFromPath-XN7LXIBI.js +134 -0
- package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
- package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
- package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
- package/dist/getMachineId-win-IIF36LEJ.js +44 -0
- package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
- package/dist/index.js +112703 -0
- package/dist/index.js.map +1 -0
- package/dist/lib-R6DEEJCP.js +7623 -0
- package/dist/lib-R6DEEJCP.js.map +1 -0
- package/dist/pipeline-IAVVAKTU.js +120 -0
- package/dist/pipeline-IAVVAKTU.js.map +1 -0
- package/dist/query-NTP5NVXN.js +25 -0
- package/dist/query-NTP5NVXN.js.map +1 -0
- package/dist/routing-BAEPFB7V.js +390 -0
- package/dist/routing-BAEPFB7V.js.map +1 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
- package/dist/stripe-payment-history-SJPKA63N.js +67 -0
- package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
- package/dist/support-memory-WSG7SDKG.js +10 -0
- package/dist/support-memory-WSG7SDKG.js.map +1 -0
- package/package.json +10 -7
- package/.env.encrypted +0 -0
- package/CHANGELOG.md +0 -35
- package/data/tt-archive-dataset.json +0 -1
- package/data/validate-test-dataset.json +0 -97
- package/docs/CLI-AUTH.md +0 -504
- package/preload.ts +0 -18
- package/src/__tests__/init.test.ts +0 -74
- package/src/alignment-test.ts +0 -64
- package/src/check-apps.ts +0 -16
- package/src/commands/auth/decrypt.ts +0 -123
- package/src/commands/auth/encrypt.ts +0 -81
- package/src/commands/auth/index.ts +0 -50
- package/src/commands/auth/keygen.ts +0 -41
- package/src/commands/auth/status.ts +0 -164
- package/src/commands/axiom/forensic.ts +0 -868
- package/src/commands/axiom/index.ts +0 -697
- package/src/commands/build-dataset.ts +0 -311
- package/src/commands/db-status.ts +0 -47
- package/src/commands/deploys.ts +0 -219
- package/src/commands/eval-local/compare.ts +0 -171
- package/src/commands/eval-local/health.ts +0 -212
- package/src/commands/eval-local/index.ts +0 -76
- package/src/commands/eval-local/real-tools.ts +0 -416
- package/src/commands/eval-local/run.ts +0 -1168
- package/src/commands/eval-local/score-production.ts +0 -256
- package/src/commands/eval-local/seed.ts +0 -276
- package/src/commands/eval-pipeline/index.ts +0 -53
- package/src/commands/eval-pipeline/real-tools.ts +0 -492
- package/src/commands/eval-pipeline/run.ts +0 -1316
- package/src/commands/eval-pipeline/seed.ts +0 -395
- package/src/commands/eval-prompt.ts +0 -496
- package/src/commands/eval.test.ts +0 -253
- package/src/commands/eval.ts +0 -108
- package/src/commands/faq-classify.ts +0 -460
- package/src/commands/faq-cluster.ts +0 -135
- package/src/commands/faq-extract.ts +0 -249
- package/src/commands/faq-mine.ts +0 -432
- package/src/commands/faq-review.ts +0 -426
- package/src/commands/front/index.ts +0 -351
- package/src/commands/front/pull-conversations.ts +0 -275
- package/src/commands/front/tags.ts +0 -825
- package/src/commands/front-cache.ts +0 -1277
- package/src/commands/front-stats.ts +0 -75
- package/src/commands/health.test.ts +0 -82
- package/src/commands/health.ts +0 -362
- package/src/commands/init.test.ts +0 -89
- package/src/commands/init.ts +0 -106
- package/src/commands/inngest/client.ts +0 -294
- package/src/commands/inngest/events.ts +0 -296
- package/src/commands/inngest/investigate.ts +0 -382
- package/src/commands/inngest/runs.ts +0 -149
- package/src/commands/inngest/signal.ts +0 -143
- package/src/commands/kb-sync.ts +0 -498
- package/src/commands/memory/find.ts +0 -135
- package/src/commands/memory/get.ts +0 -87
- package/src/commands/memory/index.ts +0 -97
- package/src/commands/memory/stats.ts +0 -163
- package/src/commands/memory/store.ts +0 -49
- package/src/commands/memory/vote.ts +0 -159
- package/src/commands/pipeline.ts +0 -127
- package/src/commands/responses.ts +0 -856
- package/src/commands/tools.ts +0 -293
- package/src/commands/wizard.ts +0 -319
- package/src/index.ts +0 -172
- package/src/lib/crypto.ts +0 -56
- package/src/lib/env-loader.ts +0 -206
- package/src/lib/onepassword.ts +0 -137
- package/src/test-agent-local.ts +0 -115
- package/tsconfig.json +0 -11
- package/vitest.config.ts +0 -10
|
@@ -1,1168 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Run eval suite against local environment
|
|
3
|
-
*
|
|
4
|
-
* Scenario-aware mocks that analyze trigger messages to return
|
|
5
|
-
* contextually appropriate data. No more static canned responses.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { SUPPORT_AGENT_PROMPT } from '@skillrecordings/core/agent'
|
|
9
|
-
import {
|
|
10
|
-
BannedPhrases,
|
|
11
|
-
Helpfulness,
|
|
12
|
-
InternalStateLeakage,
|
|
13
|
-
MetaCommentary,
|
|
14
|
-
ProductFabrication,
|
|
15
|
-
} from '@skillrecordings/core/evals/scorers'
|
|
16
|
-
import { generateText, stepCountIs, tool } from 'ai'
|
|
17
|
-
import { readFile, writeFile } from 'fs/promises'
|
|
18
|
-
import { glob } from 'glob'
|
|
19
|
-
import { z } from 'zod'
|
|
20
|
-
import { cleanupRealTools, createRealTools, initRealTools } from './real-tools'
|
|
21
|
-
|
|
22
|
-
interface RunOptions {
|
|
23
|
-
scenarios?: string
|
|
24
|
-
dataset?: string
|
|
25
|
-
output?: string
|
|
26
|
-
baseline?: string
|
|
27
|
-
failThreshold?: number
|
|
28
|
-
verbose?: boolean
|
|
29
|
-
json?: boolean
|
|
30
|
-
prompt?: string
|
|
31
|
-
model?: string
|
|
32
|
-
limit?: number
|
|
33
|
-
realTools?: boolean // Use real Docker services instead of mocks
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
interface Scenario {
|
|
37
|
-
id: string
|
|
38
|
-
name?: string
|
|
39
|
-
subject?: string
|
|
40
|
-
appId?: string
|
|
41
|
-
trigger?: {
|
|
42
|
-
subject: string
|
|
43
|
-
body: string
|
|
44
|
-
}
|
|
45
|
-
triggerMessage?: {
|
|
46
|
-
subject: string
|
|
47
|
-
body: string
|
|
48
|
-
}
|
|
49
|
-
expectedBehavior?: string
|
|
50
|
-
category?: string
|
|
51
|
-
// Additional context from dataset
|
|
52
|
-
agentResponse?: {
|
|
53
|
-
text: string
|
|
54
|
-
category: string
|
|
55
|
-
}
|
|
56
|
-
conversationHistory?: Array<{
|
|
57
|
-
direction: 'in' | 'out'
|
|
58
|
-
body: string
|
|
59
|
-
timestamp: number
|
|
60
|
-
}>
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
interface ScenarioResult {
|
|
64
|
-
id: string
|
|
65
|
-
name: string
|
|
66
|
-
passed: boolean
|
|
67
|
-
durationMs: number
|
|
68
|
-
output: string
|
|
69
|
-
toolCalls: string[]
|
|
70
|
-
noDraft: boolean
|
|
71
|
-
scores: {
|
|
72
|
-
internalLeaks: { passed: boolean; matches: string[] }
|
|
73
|
-
metaCommentary: { passed: boolean; matches: string[] }
|
|
74
|
-
bannedPhrases: { passed: boolean; matches: string[] }
|
|
75
|
-
fabrication: { passed: boolean; matches: string[] }
|
|
76
|
-
helpfulness: { score: number }
|
|
77
|
-
}
|
|
78
|
-
category: string
|
|
79
|
-
failureReasons: string[]
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
interface RunSummary {
|
|
83
|
-
total: number
|
|
84
|
-
passed: number
|
|
85
|
-
failed: number
|
|
86
|
-
noDraft: number
|
|
87
|
-
passRate: number
|
|
88
|
-
durationMs: number
|
|
89
|
-
byCategory: Record<
|
|
90
|
-
string,
|
|
91
|
-
{ passed: number; failed: number; noDraft: number }
|
|
92
|
-
>
|
|
93
|
-
failures: {
|
|
94
|
-
internalLeaks: number
|
|
95
|
-
metaCommentary: number
|
|
96
|
-
bannedPhrases: number
|
|
97
|
-
fabrication: number
|
|
98
|
-
}
|
|
99
|
-
latency: {
|
|
100
|
-
p50: number
|
|
101
|
-
p95: number
|
|
102
|
-
p99: number
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
/**
|
|
107
|
-
* Scenario classifier - analyzes message content to determine
|
|
108
|
-
* what type of support request this is
|
|
109
|
-
*/
|
|
110
|
-
type ScenarioType =
|
|
111
|
-
| 'access_issue' // Can't access, lost access, login problems
|
|
112
|
-
| 'refund_request' // Wants money back
|
|
113
|
-
| 'transfer_request' // Move purchase to different email
|
|
114
|
-
| 'technical_help' // How do I use X, code questions
|
|
115
|
-
| 'product_inquiry' // What's included, pricing, availability
|
|
116
|
-
| 'zoom_link' // Missing workshop/event access
|
|
117
|
-
| 'invoice_request' // Need invoice, receipt
|
|
118
|
-
| 'fan_mail' // Personal message to instructor
|
|
119
|
-
| 'spam' // Vendor outreach, not real support
|
|
120
|
-
| 'general' // Catch-all
|
|
121
|
-
|
|
122
|
-
function classifyScenario(subject: string, body: string): ScenarioType {
|
|
123
|
-
// Normalize text - remove newlines, extra spaces
|
|
124
|
-
const text = `${subject} ${body}`.toLowerCase().replace(/\s+/g, ' ')
|
|
125
|
-
|
|
126
|
-
// Access issues
|
|
127
|
-
if (
|
|
128
|
-
text.includes("don't have access") ||
|
|
129
|
-
text.includes("can't access") ||
|
|
130
|
-
text.includes('lost access') ||
|
|
131
|
-
text.includes('no access') ||
|
|
132
|
-
text.includes("can't log in") ||
|
|
133
|
-
text.includes('cannot login') ||
|
|
134
|
-
text.includes('restore access') ||
|
|
135
|
-
text.includes('logging in with github') ||
|
|
136
|
-
text.includes('login with github') ||
|
|
137
|
-
text.includes('logged in with github') ||
|
|
138
|
-
text.includes('different email') ||
|
|
139
|
-
text.includes('restore the access')
|
|
140
|
-
) {
|
|
141
|
-
return 'access_issue'
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
// Refund requests
|
|
145
|
-
if (
|
|
146
|
-
text.includes('refund') ||
|
|
147
|
-
text.includes('money back') ||
|
|
148
|
-
(text.includes('cancel') && text.includes('purchase')) ||
|
|
149
|
-
text.includes('charge back') ||
|
|
150
|
-
text.includes("didn't mean to buy")
|
|
151
|
-
) {
|
|
152
|
-
return 'refund_request'
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
// Transfer requests
|
|
156
|
-
if (
|
|
157
|
-
text.includes('transfer') ||
|
|
158
|
-
(text.includes('move') && text.includes('email')) ||
|
|
159
|
-
text.includes('change email') ||
|
|
160
|
-
text.includes('wrong email')
|
|
161
|
-
) {
|
|
162
|
-
return 'transfer_request'
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
// Zoom/workshop access
|
|
166
|
-
if (
|
|
167
|
-
text.includes('zoom') ||
|
|
168
|
-
(text.includes('workshop') &&
|
|
169
|
-
(text.includes('link') || text.includes('access'))) ||
|
|
170
|
-
text.includes('calendar invite') ||
|
|
171
|
-
text.includes('live event')
|
|
172
|
-
) {
|
|
173
|
-
return 'zoom_link'
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
// Invoice/receipt
|
|
177
|
-
if (
|
|
178
|
-
text.includes('invoice') ||
|
|
179
|
-
text.includes('receipt') ||
|
|
180
|
-
(text.includes('tax') && text.includes('document'))
|
|
181
|
-
) {
|
|
182
|
-
return 'invoice_request'
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
// Product inquiry
|
|
186
|
-
if (
|
|
187
|
-
text.includes('sold out') ||
|
|
188
|
-
(text.includes('buy') && text.includes('button')) ||
|
|
189
|
-
text.includes('discount') ||
|
|
190
|
-
text.includes('pricing') ||
|
|
191
|
-
text.includes("what's included") ||
|
|
192
|
-
text.includes("what's the difference")
|
|
193
|
-
) {
|
|
194
|
-
return 'product_inquiry'
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
// Technical help
|
|
198
|
-
if (
|
|
199
|
-
text.includes('how do i') ||
|
|
200
|
-
text.includes('how to') ||
|
|
201
|
-
text.includes('error') ||
|
|
202
|
-
text.includes('not working') ||
|
|
203
|
-
(text.includes('typescript') && text.includes('help')) ||
|
|
204
|
-
text.includes('code') ||
|
|
205
|
-
text.includes('tutorial')
|
|
206
|
-
) {
|
|
207
|
-
return 'technical_help'
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
// Fan mail / personal
|
|
211
|
-
if (
|
|
212
|
-
(text.includes('thank you') && text.includes('course')) ||
|
|
213
|
-
text.includes('changed my career') ||
|
|
214
|
-
text.includes('love your') ||
|
|
215
|
-
text.includes('big fan') ||
|
|
216
|
-
text.includes('appreciate')
|
|
217
|
-
) {
|
|
218
|
-
return 'fan_mail'
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
// Spam/vendor
|
|
222
|
-
if (
|
|
223
|
-
text.includes('partnership') ||
|
|
224
|
-
text.includes('sponsor') ||
|
|
225
|
-
text.includes('backlink') ||
|
|
226
|
-
text.includes('seo') ||
|
|
227
|
-
text.includes('guest post')
|
|
228
|
-
) {
|
|
229
|
-
return 'spam'
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
return 'general'
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
/**
|
|
236
|
-
* Create scenario-aware mock tools
|
|
237
|
-
*
|
|
238
|
-
* Each scenario type gets appropriate mock responses that
|
|
239
|
-
* trigger realistic agent behavior
|
|
240
|
-
*/
|
|
241
|
-
function createMockTools(scenarioType: ScenarioType, scenario: Scenario) {
|
|
242
|
-
const trigger = scenario.trigger ||
|
|
243
|
-
scenario.triggerMessage || { subject: '', body: '' }
|
|
244
|
-
|
|
245
|
-
// Extract email from trigger if present
|
|
246
|
-
const emailMatch = trigger.body.match(
|
|
247
|
-
/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)/
|
|
248
|
-
)
|
|
249
|
-
const customerEmail = emailMatch?.[1] || '[EMAIL]'
|
|
250
|
-
|
|
251
|
-
return {
|
|
252
|
-
lookupUser: tool({
|
|
253
|
-
description: 'Look up user by email',
|
|
254
|
-
inputSchema: z.object({
|
|
255
|
-
email: z.string(),
|
|
256
|
-
appId: z.string(),
|
|
257
|
-
}),
|
|
258
|
-
execute: async ({ email }) => {
|
|
259
|
-
// Scenario-aware responses
|
|
260
|
-
switch (scenarioType) {
|
|
261
|
-
case 'access_issue':
|
|
262
|
-
// User found but no purchase - classic "different email" scenario
|
|
263
|
-
if (
|
|
264
|
-
trigger.body.toLowerCase().includes('different email') ||
|
|
265
|
-
trigger.body.toLowerCase().includes('github')
|
|
266
|
-
) {
|
|
267
|
-
return {
|
|
268
|
-
found: true,
|
|
269
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
270
|
-
purchases: [], // No purchases - that's the problem!
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
// Otherwise user might have purchase but access issue
|
|
274
|
-
return {
|
|
275
|
-
found: true,
|
|
276
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
277
|
-
purchases: [
|
|
278
|
-
{
|
|
279
|
-
id: 'purch_1',
|
|
280
|
-
product:
|
|
281
|
-
scenario.appId === 'ai-hero'
|
|
282
|
-
? 'AI Hero Workshop'
|
|
283
|
-
: 'Total TypeScript',
|
|
284
|
-
date: '2025-12-15',
|
|
285
|
-
status: 'active',
|
|
286
|
-
},
|
|
287
|
-
],
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
case 'refund_request':
|
|
291
|
-
// User with recent purchase
|
|
292
|
-
return {
|
|
293
|
-
found: true,
|
|
294
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
295
|
-
purchases: [
|
|
296
|
-
{
|
|
297
|
-
id: 'purch_refund_1',
|
|
298
|
-
product:
|
|
299
|
-
scenario.appId === 'ai-hero'
|
|
300
|
-
? 'AI Hero Workshop'
|
|
301
|
-
: 'Total TypeScript Pro',
|
|
302
|
-
date: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000)
|
|
303
|
-
.toISOString()
|
|
304
|
-
.split('T')[0], // 7 days ago
|
|
305
|
-
status: 'active',
|
|
306
|
-
amount: 249,
|
|
307
|
-
},
|
|
308
|
-
],
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
case 'transfer_request':
|
|
312
|
-
return {
|
|
313
|
-
found: true,
|
|
314
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
315
|
-
purchases: [
|
|
316
|
-
{
|
|
317
|
-
id: 'purch_transfer_1',
|
|
318
|
-
product: 'Total TypeScript',
|
|
319
|
-
date: '2025-01-10',
|
|
320
|
-
status: 'active',
|
|
321
|
-
},
|
|
322
|
-
],
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
case 'zoom_link':
|
|
326
|
-
case 'product_inquiry':
|
|
327
|
-
case 'technical_help':
|
|
328
|
-
case 'invoice_request':
|
|
329
|
-
// Found with purchase
|
|
330
|
-
return {
|
|
331
|
-
found: true,
|
|
332
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
333
|
-
purchases: [
|
|
334
|
-
{
|
|
335
|
-
id: 'purch_1',
|
|
336
|
-
product:
|
|
337
|
-
scenario.appId === 'ai-hero'
|
|
338
|
-
? 'Ralph Workshop Ticket'
|
|
339
|
-
: 'Total TypeScript',
|
|
340
|
-
date: '2025-01-15',
|
|
341
|
-
status: 'active',
|
|
342
|
-
},
|
|
343
|
-
],
|
|
344
|
-
}
|
|
345
|
-
|
|
346
|
-
case 'fan_mail':
|
|
347
|
-
case 'spam':
|
|
348
|
-
// Might not even need to look up
|
|
349
|
-
return {
|
|
350
|
-
found: false,
|
|
351
|
-
user: null,
|
|
352
|
-
purchases: [],
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
default:
|
|
356
|
-
return {
|
|
357
|
-
found: true,
|
|
358
|
-
user: { id: 'user_123', email, name: 'Customer' },
|
|
359
|
-
purchases: [
|
|
360
|
-
{
|
|
361
|
-
id: 'purch_1',
|
|
362
|
-
product: 'Total TypeScript',
|
|
363
|
-
date: '2025-01-01',
|
|
364
|
-
status: 'active',
|
|
365
|
-
},
|
|
366
|
-
],
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
},
|
|
370
|
-
}),
|
|
371
|
-
|
|
372
|
-
searchKnowledge: tool({
|
|
373
|
-
description: 'Search knowledge base',
|
|
374
|
-
inputSchema: z.object({ query: z.string(), appId: z.string() }),
|
|
375
|
-
execute: async ({ query }) => {
|
|
376
|
-
// Return relevant knowledge for technical questions
|
|
377
|
-
if (scenarioType === 'technical_help') {
|
|
378
|
-
return {
|
|
379
|
-
similarTickets: [
|
|
380
|
-
{
|
|
381
|
-
data: 'Similar question answered: Check the TypeScript handbook section on generics.',
|
|
382
|
-
score: 0.85,
|
|
383
|
-
},
|
|
384
|
-
],
|
|
385
|
-
knowledge: [
|
|
386
|
-
{
|
|
387
|
-
data: 'For TypeScript fundamentals, start with the Beginner TypeScript tutorial.',
|
|
388
|
-
score: 0.9,
|
|
389
|
-
},
|
|
390
|
-
],
|
|
391
|
-
goodResponses: [
|
|
392
|
-
{
|
|
393
|
-
data: 'Example response: "For that specific error, try narrowing the type first..."',
|
|
394
|
-
score: 0.8,
|
|
395
|
-
},
|
|
396
|
-
],
|
|
397
|
-
}
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
// Minimal/empty for other scenarios to avoid fabrication
|
|
401
|
-
return {
|
|
402
|
-
similarTickets: [],
|
|
403
|
-
knowledge: [],
|
|
404
|
-
goodResponses: [],
|
|
405
|
-
}
|
|
406
|
-
},
|
|
407
|
-
}),
|
|
408
|
-
|
|
409
|
-
searchProductContent: tool({
|
|
410
|
-
description: 'Search product content',
|
|
411
|
-
inputSchema: z.object({ query: z.string() }),
|
|
412
|
-
execute: async ({ query }) => {
|
|
413
|
-
if (scenarioType === 'technical_help') {
|
|
414
|
-
return {
|
|
415
|
-
results: [
|
|
416
|
-
{
|
|
417
|
-
title: 'Beginner TypeScript Tutorial',
|
|
418
|
-
type: 'course',
|
|
419
|
-
url: 'https://totaltypescript.com/tutorials/beginners-typescript',
|
|
420
|
-
},
|
|
421
|
-
],
|
|
422
|
-
}
|
|
423
|
-
}
|
|
424
|
-
return { results: [] }
|
|
425
|
-
},
|
|
426
|
-
}),
|
|
427
|
-
|
|
428
|
-
draftResponse: tool({
|
|
429
|
-
description: 'Draft a response to send to customer',
|
|
430
|
-
inputSchema: z.object({ body: z.string() }),
|
|
431
|
-
execute: async ({ body }) => ({ drafted: true, body }),
|
|
432
|
-
}),
|
|
433
|
-
|
|
434
|
-
escalateToHuman: tool({
|
|
435
|
-
description: 'Escalate to human support',
|
|
436
|
-
inputSchema: z.object({
|
|
437
|
-
reason: z.string(),
|
|
438
|
-
urgency: z.enum(['low', 'medium', 'high']),
|
|
439
|
-
}),
|
|
440
|
-
execute: async ({ reason, urgency }) => ({
|
|
441
|
-
escalated: true,
|
|
442
|
-
reason,
|
|
443
|
-
urgency,
|
|
444
|
-
}),
|
|
445
|
-
}),
|
|
446
|
-
|
|
447
|
-
assignToInstructor: tool({
|
|
448
|
-
description:
|
|
449
|
-
'Assign conversation to instructor for personal correspondence',
|
|
450
|
-
inputSchema: z.object({
|
|
451
|
-
conversationId: z.string(),
|
|
452
|
-
reason: z.string(),
|
|
453
|
-
}),
|
|
454
|
-
execute: async ({ conversationId, reason }) => ({
|
|
455
|
-
status: 'pending_approval',
|
|
456
|
-
conversationId,
|
|
457
|
-
reason,
|
|
458
|
-
message: 'Instructor assignment submitted for approval',
|
|
459
|
-
}),
|
|
460
|
-
}),
|
|
461
|
-
|
|
462
|
-
memory_search: tool({
|
|
463
|
-
description: 'Search semantic memory',
|
|
464
|
-
inputSchema: z.object({ query: z.string() }),
|
|
465
|
-
execute: async () => ({ results: [], total: 0 }),
|
|
466
|
-
}),
|
|
467
|
-
|
|
468
|
-
memory_store: tool({
|
|
469
|
-
description: 'Store learning in memory',
|
|
470
|
-
inputSchema: z.object({
|
|
471
|
-
content: z.string(),
|
|
472
|
-
tags: z.array(z.string()).optional(),
|
|
473
|
-
}),
|
|
474
|
-
execute: async () => ({ stored: true, id: 'mem_mock_1' }),
|
|
475
|
-
}),
|
|
476
|
-
|
|
477
|
-
memory_vote: tool({
|
|
478
|
-
description: 'Vote on memory usefulness',
|
|
479
|
-
inputSchema: z.object({
|
|
480
|
-
memoryId: z.string(),
|
|
481
|
-
vote: z.enum(['up', 'down']),
|
|
482
|
-
}),
|
|
483
|
-
execute: async () => ({ success: true }),
|
|
484
|
-
}),
|
|
485
|
-
|
|
486
|
-
memory_cite: tool({
|
|
487
|
-
description: 'Cite a memory as used',
|
|
488
|
-
inputSchema: z.object({ memoryId: z.string() }),
|
|
489
|
-
execute: async () => ({ cited: true }),
|
|
490
|
-
}),
|
|
491
|
-
|
|
492
|
-
processRefund: tool({
|
|
493
|
-
description: 'Process a refund',
|
|
494
|
-
inputSchema: z.object({
|
|
495
|
-
purchaseId: z.string(),
|
|
496
|
-
appId: z.string(),
|
|
497
|
-
reason: z.string(),
|
|
498
|
-
}),
|
|
499
|
-
execute: async ({ purchaseId, reason }) => ({
|
|
500
|
-
status: 'pending_approval',
|
|
501
|
-
purchaseId,
|
|
502
|
-
reason,
|
|
503
|
-
message: 'Refund submitted for approval',
|
|
504
|
-
}),
|
|
505
|
-
}),
|
|
506
|
-
|
|
507
|
-
transferPurchase: tool({
|
|
508
|
-
description: 'Transfer purchase to another email',
|
|
509
|
-
inputSchema: z.object({
|
|
510
|
-
purchaseId: z.string(),
|
|
511
|
-
appId: z.string(),
|
|
512
|
-
fromUserId: z.string(),
|
|
513
|
-
toEmail: z.string(),
|
|
514
|
-
reason: z.string(),
|
|
515
|
-
}),
|
|
516
|
-
execute: async () => ({
|
|
517
|
-
status: 'pending_approval',
|
|
518
|
-
message: 'Transfer submitted for approval',
|
|
519
|
-
}),
|
|
520
|
-
}),
|
|
521
|
-
|
|
522
|
-
check_product_availability: tool({
|
|
523
|
-
description: 'Check if product is available or sold out',
|
|
524
|
-
inputSchema: z.object({
|
|
525
|
-
productId: z.string().optional(),
|
|
526
|
-
appId: z.string(),
|
|
527
|
-
}),
|
|
528
|
-
execute: async () => {
|
|
529
|
-
// Default: available
|
|
530
|
-
if (scenarioType === 'product_inquiry') {
|
|
531
|
-
return {
|
|
532
|
-
soldOut: false,
|
|
533
|
-
quantityRemaining: 12,
|
|
534
|
-
quantityAvailable: 50,
|
|
535
|
-
enrollmentOpen: true,
|
|
536
|
-
}
|
|
537
|
-
}
|
|
538
|
-
return {
|
|
539
|
-
soldOut: false,
|
|
540
|
-
quantityRemaining: -1, // unlimited
|
|
541
|
-
enrollmentOpen: true,
|
|
542
|
-
}
|
|
543
|
-
},
|
|
544
|
-
}),
|
|
545
|
-
|
|
546
|
-
getPaymentHistory: tool({
|
|
547
|
-
description: 'Get payment history from Stripe',
|
|
548
|
-
inputSchema: z.object({
|
|
549
|
-
customerEmail: z.string(),
|
|
550
|
-
limit: z.number().optional(),
|
|
551
|
-
}),
|
|
552
|
-
execute: async () => ({
|
|
553
|
-
charges: [
|
|
554
|
-
{
|
|
555
|
-
id: 'ch_mock_1',
|
|
556
|
-
amount: 24900,
|
|
557
|
-
status: 'succeeded',
|
|
558
|
-
created: Date.now() - 7 * 24 * 60 * 60 * 1000,
|
|
559
|
-
},
|
|
560
|
-
],
|
|
561
|
-
}),
|
|
562
|
-
}),
|
|
563
|
-
|
|
564
|
-
getSubscriptionStatus: tool({
|
|
565
|
-
description: 'Get subscription status',
|
|
566
|
-
inputSchema: z.object({
|
|
567
|
-
customerId: z.string(),
|
|
568
|
-
stripeAccountId: z.string(),
|
|
569
|
-
}),
|
|
570
|
-
execute: async () => ({
|
|
571
|
-
subscription: null, // Most products aren't subscriptions
|
|
572
|
-
}),
|
|
573
|
-
}),
|
|
574
|
-
|
|
575
|
-
lookupCharge: tool({
|
|
576
|
-
description: 'Look up specific charge',
|
|
577
|
-
inputSchema: z.object({ chargeId: z.string() }),
|
|
578
|
-
execute: async ({ chargeId }) => ({
|
|
579
|
-
charge: {
|
|
580
|
-
id: chargeId,
|
|
581
|
-
amount: 24900,
|
|
582
|
-
status: 'succeeded',
|
|
583
|
-
refunded: false,
|
|
584
|
-
},
|
|
585
|
-
}),
|
|
586
|
-
}),
|
|
587
|
-
|
|
588
|
-
verifyRefund: tool({
|
|
589
|
-
description: 'Verify refund status',
|
|
590
|
-
inputSchema: z.object({ refundId: z.string() }),
|
|
591
|
-
execute: async ({ refundId }) => ({
|
|
592
|
-
refund: {
|
|
593
|
-
id: refundId,
|
|
594
|
-
status: 'succeeded',
|
|
595
|
-
amount: 24900,
|
|
596
|
-
},
|
|
597
|
-
}),
|
|
598
|
-
}),
|
|
599
|
-
}
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
export async function run(options: RunOptions): Promise<void> {
|
|
603
|
-
const {
|
|
604
|
-
scenarios: scenarioGlob,
|
|
605
|
-
dataset: datasetPath,
|
|
606
|
-
output,
|
|
607
|
-
baseline,
|
|
608
|
-
failThreshold = 0.8,
|
|
609
|
-
verbose = false,
|
|
610
|
-
json = false,
|
|
611
|
-
prompt: promptPath,
|
|
612
|
-
model = 'anthropic/claude-haiku-4-5',
|
|
613
|
-
limit,
|
|
614
|
-
realTools = false,
|
|
615
|
-
} = options
|
|
616
|
-
|
|
617
|
-
// Initialize real tools if flag is set
|
|
618
|
-
if (realTools) {
|
|
619
|
-
if (!json) console.log('🔧 Using REAL tools (Docker services)...')
|
|
620
|
-
try {
|
|
621
|
-
await initRealTools()
|
|
622
|
-
if (!json) console.log('✅ Connected to MySQL and Qdrant')
|
|
623
|
-
} catch (error) {
|
|
624
|
-
console.error('❌ Failed to connect to Docker services:', error)
|
|
625
|
-
console.error(
|
|
626
|
-
' Make sure services are running: docker compose -f docker/eval.yml up -d'
|
|
627
|
-
)
|
|
628
|
-
process.exit(1)
|
|
629
|
-
}
|
|
630
|
-
}
|
|
631
|
-
|
|
632
|
-
// Load prompt
|
|
633
|
-
let systemPrompt = SUPPORT_AGENT_PROMPT
|
|
634
|
-
if (promptPath) {
|
|
635
|
-
systemPrompt = await readFile(promptPath, 'utf-8')
|
|
636
|
-
if (!json) console.log(`Using prompt from: ${promptPath}`)
|
|
637
|
-
} else {
|
|
638
|
-
if (!json) console.log('Using production prompt')
|
|
639
|
-
}
|
|
640
|
-
|
|
641
|
-
// Load scenarios from either scenarios glob or dataset file
|
|
642
|
-
let scenarios: Scenario[] = []
|
|
643
|
-
|
|
644
|
-
if (datasetPath) {
|
|
645
|
-
// Load from dataset file (comprehensive-dataset.json format)
|
|
646
|
-
const datasetContent = await readFile(datasetPath, 'utf-8')
|
|
647
|
-
const dataset = JSON.parse(datasetContent)
|
|
648
|
-
scenarios = dataset.map((item: any) => {
|
|
649
|
-
const trigger = item.triggerMessage || {
|
|
650
|
-
subject: item.subject || '',
|
|
651
|
-
body: '',
|
|
652
|
-
}
|
|
653
|
-
const fullText = `${trigger.subject} ${trigger.body}`.toLowerCase()
|
|
654
|
-
|
|
655
|
-
// Detect app from content
|
|
656
|
-
let detectedApp = 'total-typescript'
|
|
657
|
-
if (
|
|
658
|
-
fullText.includes('ai hero') ||
|
|
659
|
-
fullText.includes('aihero.dev') ||
|
|
660
|
-
fullText.includes('ai-hero') ||
|
|
661
|
-
fullText.includes('ralph') ||
|
|
662
|
-
fullText.includes('autonomous software engineers')
|
|
663
|
-
) {
|
|
664
|
-
detectedApp = 'ai-hero'
|
|
665
|
-
}
|
|
666
|
-
|
|
667
|
-
return {
|
|
668
|
-
id: item.id || item.conversationId,
|
|
669
|
-
name: trigger.subject || 'Unknown',
|
|
670
|
-
trigger,
|
|
671
|
-
triggerMessage: item.triggerMessage,
|
|
672
|
-
category: item.category || 'general',
|
|
673
|
-
appId: item.app !== 'unknown' ? item.app : detectedApp,
|
|
674
|
-
agentResponse: item.agentResponse,
|
|
675
|
-
conversationHistory: item.conversationHistory,
|
|
676
|
-
}
|
|
677
|
-
})
|
|
678
|
-
} else {
|
|
679
|
-
// Load from scenario files
|
|
680
|
-
const glob_ = scenarioGlob || 'fixtures/scenarios/**/*.json'
|
|
681
|
-
const scenarioFiles = await glob(glob_)
|
|
682
|
-
|
|
683
|
-
if (scenarioFiles.length === 0) {
|
|
684
|
-
console.error('No scenarios found. Use --scenarios or --dataset')
|
|
685
|
-
process.exit(1)
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
scenarios = await Promise.all(
|
|
689
|
-
scenarioFiles.map(async (file) => {
|
|
690
|
-
const content = await readFile(file, 'utf-8')
|
|
691
|
-
return JSON.parse(content)
|
|
692
|
-
})
|
|
693
|
-
)
|
|
694
|
-
}
|
|
695
|
-
|
|
696
|
-
// Apply limit
|
|
697
|
-
if (limit && limit < scenarios.length) {
|
|
698
|
-
scenarios = scenarios.slice(0, limit)
|
|
699
|
-
}
|
|
700
|
-
|
|
701
|
-
if (!json) {
|
|
702
|
-
console.log(
|
|
703
|
-
`\n🧪 Running ${scenarios.length} scenarios (model: ${model})\n`
|
|
704
|
-
)
|
|
705
|
-
}
|
|
706
|
-
|
|
707
|
-
const startTime = Date.now()
|
|
708
|
-
const results: ScenarioResult[] = []
|
|
709
|
-
|
|
710
|
-
for (let i = 0; i < scenarios.length; i++) {
|
|
711
|
-
if (!json) {
|
|
712
|
-
process.stdout.write(`\r Processing ${i + 1}/${scenarios.length}...`)
|
|
713
|
-
}
|
|
714
|
-
|
|
715
|
-
const scenario = scenarios[i]
|
|
716
|
-
if (!scenario) continue
|
|
717
|
-
const result = await runScenario(
|
|
718
|
-
scenario,
|
|
719
|
-
systemPrompt,
|
|
720
|
-
model,
|
|
721
|
-
verbose,
|
|
722
|
-
realTools
|
|
723
|
-
)
|
|
724
|
-
results.push(result)
|
|
725
|
-
}
|
|
726
|
-
|
|
727
|
-
// Cleanup real tools if used
|
|
728
|
-
if (realTools) {
|
|
729
|
-
await cleanupRealTools()
|
|
730
|
-
}
|
|
731
|
-
|
|
732
|
-
if (!json) {
|
|
733
|
-
console.log('\n')
|
|
734
|
-
}
|
|
735
|
-
|
|
736
|
-
const totalDuration = Date.now() - startTime
|
|
737
|
-
const summary = aggregateResults(results, totalDuration)
|
|
738
|
-
|
|
739
|
-
// Compare to baseline if provided
|
|
740
|
-
if (baseline) {
|
|
741
|
-
try {
|
|
742
|
-
const baselineContent = await readFile(baseline, 'utf-8')
|
|
743
|
-
const baselineData = JSON.parse(baselineContent)
|
|
744
|
-
printComparison(summary, baselineData.summary || baselineData)
|
|
745
|
-
} catch (e) {
|
|
746
|
-
console.error('Could not load baseline:', e)
|
|
747
|
-
}
|
|
748
|
-
}
|
|
749
|
-
|
|
750
|
-
// Save results if output specified
|
|
751
|
-
if (output) {
|
|
752
|
-
await writeFile(output, JSON.stringify({ summary, results }, null, 2))
|
|
753
|
-
if (!json) {
|
|
754
|
-
console.log(`Results saved to ${output}`)
|
|
755
|
-
}
|
|
756
|
-
}
|
|
757
|
-
|
|
758
|
-
if (json) {
|
|
759
|
-
console.log(JSON.stringify({ summary, results }, null, 2))
|
|
760
|
-
} else {
|
|
761
|
-
printSummary(summary, failThreshold)
|
|
762
|
-
|
|
763
|
-
// Show failures if verbose
|
|
764
|
-
if (verbose) {
|
|
765
|
-
const failures = results.filter((r) => !r.passed && !r.noDraft)
|
|
766
|
-
if (failures.length > 0) {
|
|
767
|
-
console.log('\n--- FAILURES ---\n')
|
|
768
|
-
for (const f of failures.slice(0, 10)) {
|
|
769
|
-
console.log(`โ ${f.name}`)
|
|
770
|
-
for (const reason of f.failureReasons) {
|
|
771
|
-
console.log(` โโ ${reason}`)
|
|
772
|
-
}
|
|
773
|
-
if (f.output) {
|
|
774
|
-
console.log(` Output: ${f.output.slice(0, 150)}...`)
|
|
775
|
-
}
|
|
776
|
-
console.log('')
|
|
777
|
-
}
|
|
778
|
-
}
|
|
779
|
-
}
|
|
780
|
-
}
|
|
781
|
-
|
|
782
|
-
// Exit with error if below threshold
|
|
783
|
-
const effectivePassRate =
|
|
784
|
-
summary.passed / (summary.passed + summary.failed) || 0
|
|
785
|
-
if (effectivePassRate < failThreshold && summary.failed > 0) {
|
|
786
|
-
process.exit(1)
|
|
787
|
-
}
|
|
788
|
-
}
|
|
789
|
-
|
|
790
|
-
/**
 * Runs a single eval scenario against the model and grades the outcome.
 *
 * The scenario's trigger message is sent to `generateText` together with the
 * supplied system prompt (extended with tool-usage rules) and either real or
 * mock tools. The customer-facing output is taken from the first
 * `draftResponse` tool call; when there is none the run is treated as silent
 * (`noDraft`). The output is then passed through the quality scorers and
 * compared against `scenario.expectedBehavior`.
 *
 * @param scenario - Scenario definition (trigger message, expected behavior, app id).
 * @param systemPrompt - Base system prompt; eval-specific rules are appended to it.
 * @param model - Model identifier forwarded to `generateText`.
 * @param verbose - When true, logs classification, a per-step trace and the draft/silent outcome.
 * @param useRealTools - When true, tools come from `createRealTools` instead of `createMockTools`.
 * @returns The graded result. A run whose model call threw is always reported
 *          with `passed: false` and the error message in `failureReasons`.
 */
async function runScenario(
  scenario: Scenario,
  systemPrompt: string,
  model: string,
  verbose?: boolean,
  useRealTools?: boolean
): Promise<ScenarioResult> {
  const startTime = Date.now()
  const failureReasons: string[] = []

  // Build input message
  const trigger = scenario.trigger ||
    scenario.triggerMessage || { subject: '', body: '' }
  const input = `Subject: ${trigger.subject}\n\n${trigger.body}`
  const name = scenario.name || trigger.subject || scenario.id

  // Classify scenario and create appropriate tools (mock or real)
  const scenarioType = classifyScenario(trigger.subject, trigger.body)
  if (verbose) {
    console.log(
      `[CLASSIFY] "${trigger.subject.slice(0, 50)}..." → ${scenarioType}`
    )
    if (useRealTools) {
      console.log(`[TOOLS] Using REAL Docker services`)
    }
  }

  // Use real tools if flag is set, otherwise use mocks
  const tools = useRealTools
    ? createRealTools({
        appId: scenario.appId,
        // First email-looking token in the message body, if any
        customerEmail: trigger.body.match(
          /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)/
        )?.[1],
      })
    : createMockTools(scenarioType, scenario)

  // Use scenarioType as category for better tracking
  const category = scenarioType

  let output = ''
  let toolCalls: string[] = []
  let noDraft = false
  // Set when the model call (or result handling) throws; forces a failed result.
  let errored = false

  try {
    // Add explicit tool requirement - the LLM must use draftResponse, not text output
    const evalSystemPrompt =
      systemPrompt +
      `

## CRITICAL: Tool Usage Requirements
1. You MUST use draftResponse to send ANY reply to the customer
2. NEVER output text responses directly - you are in a tool-use only mode
3. Your only outputs should be tool calls. No explanatory text.
4. If you want to respond to the customer, call draftResponse with the response body
5. If you decide not to respond, make no tool calls at all

Think step by step:
1. Analyze the customer message
2. Call relevant tools (lookupUser, searchKnowledge, etc.)
3. Based on tool results, either:
   - Call draftResponse with your reply, OR
   - Make no response (for spam, vendor emails, already handled, etc.)

App: ${scenario.appId || 'total-typescript'}`

    const result = await generateText({
      model,
      system: evalSystemPrompt,
      messages: [{ role: 'user', content: input }],
      tools,
      stopWhen: stepCountIs(10), // Match production - use stopWhen for multi-step
    })

    // Flatten tool calls across all steps once; reused for names and draft lookup
    const allCalls = result.steps.flatMap((s) => s.toolCalls || [])
    toolCalls = allCalls.map((tc) => tc.toolName)

    // Debug all steps when verbose
    if (verbose) {
      console.log(
        `\n[TRACE] ${name} (${result.steps.length} steps, reason: ${result.finishReason})`
      )
      for (let i = 0; i < result.steps.length; i++) {
        const step = result.steps[i]
        if (!step) continue
        const calls = (step.toolCalls || [])
          .map((tc) => `${tc.toolName}`)
          .join(', ')
        console.log(
          ` Step ${i + 1}: ${calls || 'no tool calls'} [reason: ${step.finishReason}]`
        )
        for (const tr of step.toolResults || []) {
          // JSON.stringify yields undefined for undefined input; keep the trace from throwing
          const preview = (JSON.stringify(tr.output) ?? 'undefined').slice(
            0,
            300
          )
          console.log(` → ${preview}`)
        }
        if (step.text) {
          console.log(` text: ${step.text.slice(0, 100)}...`)
        }
      }
    }

    // Find draftResponse output - this is the only way to send to customers
    // Text output without draftResponse is internal reasoning (not sent)
    const draftCall = allCalls.find((tc) => tc.toolName === 'draftResponse')

    if (draftCall) {
      // Explicit draft call - this is a customer response
      const body = (draftCall.input as { body?: unknown }).body
      output = typeof body === 'string' ? body : ''
      if (verbose) {
        console.log(` ✅ DRAFTED: ${output.slice(0, 100)}...`)
      }
    } else {
      // No draftResponse = correctly silent (even if there's reasoning text)
      noDraft = true
      if (verbose) {
        if (result.text && result.text.trim().length > 0) {
          console.log(` 🚫 SILENT (reasoning): ${result.text.slice(0, 80)}...`)
        } else {
          console.log(` 🚫 SILENT (no output)`)
        }
      }
    }
  } catch (error) {
    errored = true
    output = `ERROR: ${error instanceof Error ? error.message : 'Unknown error'}`
    failureReasons.push(output)
  }

  const durationMs = Date.now() - startTime

  // Run quality scorers on output
  const leakResult = InternalStateLeakage({ output })
  const metaResult = MetaCommentary({ output })
  const bannedResult = BannedPhrases({ output })
  const fabResult = ProductFabrication({ output })
  const helpResult = Helpfulness({ output })

  const scores = {
    internalLeaks: {
      passed: leakResult.score === 1,
      matches: leakResult.metadata?.foundLeaks || [],
    },
    metaCommentary: {
      passed: metaResult.score === 1,
      matches: metaResult.metadata?.foundMeta || [],
    },
    bannedPhrases: {
      passed: bannedResult.score === 1,
      matches: bannedResult.metadata?.foundBanned || [],
    },
    fabrication: {
      passed: fabResult.score === 1,
      matches: fabResult.metadata?.foundFabrication || [],
    },
    helpfulness: {
      score: helpResult.score,
    },
  }

  // Build failure reasons
  if (!scores.internalLeaks.passed) {
    failureReasons.push(
      `Internal leak: ${scores.internalLeaks.matches.join(', ')}`
    )
  }
  if (!scores.metaCommentary.passed) {
    failureReasons.push(
      `Meta commentary: ${scores.metaCommentary.matches.join(', ')}`
    )
  }
  if (!scores.bannedPhrases.passed) {
    failureReasons.push(
      `Banned phrase: ${scores.bannedPhrases.matches.join(', ')}`
    )
  }
  if (!scores.fabrication.passed) {
    failureReasons.push(`Fabrication: ${scores.fabrication.matches.join(', ')}`)
  }

  // Determine pass/fail based on expectedBehavior
  // Check if agent behavior matches what the scenario expects
  const expectedBehavior = scenario.expectedBehavior?.toLowerCase() || ''

  // Expected to draft a response?
  const shouldDraft =
    expectedBehavior.includes('draft') ||
    expectedBehavior.includes('respond') ||
    expectedBehavior.includes('help') ||
    expectedBehavior.includes('ask_for_details')

  // Expected to stay silent?
  const shouldBeSilent =
    expectedBehavior.includes('silent') ||
    expectedBehavior.includes('ignore') ||
    expectedBehavior.includes('no_response')

  // Expected to escalate?
  const shouldEscalate =
    expectedBehavior.includes('escalate') ||
    expectedBehavior.includes('human') ||
    expectedBehavior.includes('approval')

  // Check for escalation in tool calls
  const didEscalate =
    toolCalls.includes('escalateToHuman') ||
    toolCalls.includes('assignToInstructor')

  let passed = true

  // If expected draft but got silence → FAIL
  if (shouldDraft && noDraft) {
    passed = false
    failureReasons.push('Expected draft response but agent stayed silent')
  }

  // If expected silence but got draft → FAIL
  if (shouldBeSilent && !noDraft) {
    passed = false
    failureReasons.push('Expected silence but agent drafted a response')
  }

  // If expected escalate but didn't → FAIL
  if (shouldEscalate && !didEscalate) {
    passed = false
    failureReasons.push('Expected escalation but agent did not escalate')
  }

  // If drafted, also check quality
  if (!noDraft) {
    if (!scores.internalLeaks.passed) passed = false
    if (!scores.metaCommentary.passed) passed = false
    if (!scores.bannedPhrases.passed) passed = false
    if (!scores.fabrication.passed) passed = false
  }

  // If no expectedBehavior specified, fall back to old logic
  if (!expectedBehavior) {
    passed =
      noDraft ||
      (scores.internalLeaks.passed &&
        scores.metaCommentary.passed &&
        scores.bannedPhrases.passed &&
        scores.fabrication.passed)
  }

  // A run that threw never produced a gradable answer; it must not count as a pass
  // just because the error text happens to satisfy the scorers.
  if (errored) {
    passed = false
  }

  return {
    id: scenario.id,
    name,
    passed,
    durationMs,
    output,
    toolCalls,
    noDraft,
    scores,
    category,
    failureReasons,
  }
}
|
|
1052
|
-
|
|
1053
|
-
/**
 * Folds per-scenario results into a run summary.
 *
 * Counting rules, applied identically to the totals and to each category:
 * - `failed`  — every result with `passed === false` (drafted or silent)
 * - `noDraft` — results that passed without drafting a response
 * - per-category `passed` — results that passed with a draft
 *
 * The top-level `passed` counts every passing result, including silent ones.
 * Quality-failure counts only consider results that produced a draft.
 * Latency percentiles are read from the sorted durations at index
 * `floor(count * p)`, falling back to 0 when there are no results.
 *
 * @param results - Graded scenario results.
 * @param totalDurationMs - Wall-clock duration of the whole run, copied to `durationMs`.
 * @returns The aggregated summary.
 */
function aggregateResults(
  results: ScenarioResult[],
  totalDurationMs: number
): RunSummary {
  const passed = results.filter((r) => r.passed).length
  const noDraft = results.filter((r) => r.noDraft && r.passed).length // Only count as noDraft if also passed
  const failed = results.length - passed // Failed is anything that didn't pass

  // Group by category, using the same rules as the top-level counters
  const byCategory: Record<
    string,
    { passed: number; failed: number; noDraft: number }
  > = {}
  for (const result of results) {
    const category = result.category || 'general'
    let bucket = byCategory[category]
    if (!bucket) {
      bucket = { passed: 0, failed: 0, noDraft: 0 }
      byCategory[category] = bucket
    }
    if (!result.passed) {
      // A silent-but-wrong result is a failure, not a "no draft" success
      bucket.failed++
    } else if (result.noDraft) {
      bucket.noDraft++
    } else {
      bucket.passed++
    }
  }

  // Count failure types (only for non-noDraft results)
  const withDrafts = results.filter((r) => !r.noDraft)
  const failures = {
    internalLeaks: withDrafts.filter((r) => !r.scores.internalLeaks.passed)
      .length,
    metaCommentary: withDrafts.filter((r) => !r.scores.metaCommentary.passed)
      .length,
    bannedPhrases: withDrafts.filter((r) => !r.scores.bannedPhrases.passed)
      .length,
    fabrication: withDrafts.filter((r) => !r.scores.fabrication.passed).length,
  }

  // Calculate latency percentiles (map() already yields a fresh array, so sorting is safe)
  const durations = results.map((r) => r.durationMs).sort((a, b) => a - b)
  const percentile = (p: number): number =>
    durations[Math.floor(durations.length * p)] ?? 0
  const latency = {
    p50: percentile(0.5),
    p95: percentile(0.95),
    p99: percentile(0.99),
  }

  return {
    total: results.length,
    passed,
    failed,
    noDraft,
    passRate: results.length > 0 ? passed / results.length : 0,
    durationMs: totalDurationMs,
    byCategory,
    failures,
    latency,
  }
}
|
|
1112
|
-
|
|
1113
|
-
/**
 * Prints the human-readable run summary to stdout: totals, quality breakdown
 * (only when something failed), per-category counts, latency percentiles and
 * the final draft-quality line compared against the threshold.
 *
 * @param summary - Aggregated run summary.
 * @param threshold - Minimum acceptable pass rate as a fraction (0..1).
 */
function printSummary(summary: RunSummary, threshold: number): void {
  console.log('🧪 Eval Results\n')
  console.log(`Scenarios: ${summary.total} total`)
  console.log(
    ` ✅ Passed: ${summary.passed} (${(summary.passRate * 100).toFixed(1)}%)`
  )
  console.log(` ❌ Failed: ${summary.failed}`)
  console.log(` 🚫 No draft: ${summary.noDraft}`)

  if (summary.failed > 0) {
    console.log('\nQuality Breakdown (drafts with issues):')
    if (summary.failures.internalLeaks > 0) {
      console.log(` 🚨 Internal leaks: ${summary.failures.internalLeaks}`)
    }
    if (summary.failures.metaCommentary > 0) {
      console.log(` 💬 Meta-commentary: ${summary.failures.metaCommentary}`)
    }
    if (summary.failures.bannedPhrases > 0) {
      console.log(` 🚫 Banned phrases: ${summary.failures.bannedPhrases}`)
    }
    if (summary.failures.fabrication > 0) {
      console.log(` 🎭 Fabrication: ${summary.failures.fabrication}`)
    }
  }

  console.log('\nBy Category:')
  for (const [cat, stats] of Object.entries(summary.byCategory)) {
    const total = stats.passed + stats.failed + stats.noDraft
    console.log(
      ` ${cat}: ${stats.passed}✅ ${stats.failed}❌ ${stats.noDraft}🚫 (${total} total)`
    )
  }

  console.log('\nLatency:')
  console.log(` p50: ${summary.latency.p50}ms`)
  console.log(` p95: ${summary.latency.p95}ms`)
  console.log(` p99: ${summary.latency.p99}ms`)

  // With nothing graded there is nothing to fail, so report 100%.
  // A real 0 (all graded scenarios failed) must stay 0 — `ratio || 1` would
  // have turned it into a perfect score.
  const graded = summary.passed + summary.failed
  const effectivePassRate = graded > 0 ? summary.passed / graded : 1
  const passIcon = effectivePassRate >= threshold ? '✅' : '❌'
  console.log(
    `\nDraft quality: ${(effectivePassRate * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%) ${passIcon}`
  )
}
|
|
1158
|
-
|
|
1159
|
-
/**
 * Prints how the current run's pass rate compares to a baseline run.
 *
 * Output is a header followed by one line of the form
 * `Pass rate: <baseline>% → <current>% <signed delta>% <icon>`, where the icon
 * points up when the delta is zero or positive and down otherwise.
 *
 * @param current - Summary of the run that just finished.
 * @param baseline - Summary of the run being compared against.
 */
function printComparison(current: RunSummary, baseline: RunSummary): void {
  console.log('\n🔬 Comparison to Baseline\n')

  const passRateDelta = current.passRate - baseline.passRate
  const passRateIcon = passRateDelta >= 0 ? '⬆️' : '⬇️'
  // toFixed already prints the minus sign; only positive deltas need an explicit '+'
  const sign = passRateDelta > 0 ? '+' : ''

  console.log(
    `Pass rate: ${(baseline.passRate * 100).toFixed(1)}% → ${(current.passRate * 100).toFixed(1)}% ${sign}${(passRateDelta * 100).toFixed(1)}% ${passRateIcon}`
  )
}
|