@skillrecordings/cli 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/bin/skill.mjs +21 -0
  2. package/dist/chunk-2NCCVTEE.js +22342 -0
  3. package/dist/chunk-2NCCVTEE.js.map +1 -0
  4. package/dist/chunk-3E3GYSZR.js +7071 -0
  5. package/dist/chunk-3E3GYSZR.js.map +1 -0
  6. package/dist/chunk-F4EM72IH.js +86 -0
  7. package/dist/chunk-F4EM72IH.js.map +1 -0
  8. package/dist/chunk-FGP7KUQW.js +432 -0
  9. package/dist/chunk-FGP7KUQW.js.map +1 -0
  10. package/dist/chunk-H3D6VCME.js +55 -0
  11. package/dist/chunk-H3D6VCME.js.map +1 -0
  12. package/dist/chunk-HK3PEWFD.js +208 -0
  13. package/dist/chunk-HK3PEWFD.js.map +1 -0
  14. package/dist/chunk-KEV3QKXP.js +4495 -0
  15. package/dist/chunk-KEV3QKXP.js.map +1 -0
  16. package/dist/chunk-MG37YDAK.js +882 -0
  17. package/dist/chunk-MG37YDAK.js.map +1 -0
  18. package/dist/chunk-MLNDSBZ4.js +482 -0
  19. package/dist/chunk-MLNDSBZ4.js.map +1 -0
  20. package/dist/chunk-N2WIV2JV.js +22 -0
  21. package/dist/chunk-N2WIV2JV.js.map +1 -0
  22. package/dist/chunk-PWWRCN5W.js +2067 -0
  23. package/dist/chunk-PWWRCN5W.js.map +1 -0
  24. package/dist/chunk-SKHBM3XP.js +7746 -0
  25. package/dist/chunk-SKHBM3XP.js.map +1 -0
  26. package/dist/chunk-WFANXVQG.js +64 -0
  27. package/dist/chunk-WFANXVQG.js.map +1 -0
  28. package/dist/chunk-WYKL32C3.js +275 -0
  29. package/dist/chunk-WYKL32C3.js.map +1 -0
  30. package/dist/chunk-ZNF7XD2S.js +134 -0
  31. package/dist/chunk-ZNF7XD2S.js.map +1 -0
  32. package/dist/config-AUAIYDSI.js +20 -0
  33. package/dist/config-AUAIYDSI.js.map +1 -0
  34. package/dist/fileFromPath-XN7LXIBI.js +134 -0
  35. package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
  36. package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
  37. package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
  38. package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
  39. package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
  40. package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
  41. package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
  42. package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
  43. package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
  44. package/dist/getMachineId-win-IIF36LEJ.js +44 -0
  45. package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
  46. package/dist/index.js +112703 -0
  47. package/dist/index.js.map +1 -0
  48. package/dist/lib-R6DEEJCP.js +7623 -0
  49. package/dist/lib-R6DEEJCP.js.map +1 -0
  50. package/dist/pipeline-IAVVAKTU.js +120 -0
  51. package/dist/pipeline-IAVVAKTU.js.map +1 -0
  52. package/dist/query-NTP5NVXN.js +25 -0
  53. package/dist/query-NTP5NVXN.js.map +1 -0
  54. package/dist/routing-BAEPFB7V.js +390 -0
  55. package/dist/routing-BAEPFB7V.js.map +1 -0
  56. package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
  57. package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
  58. package/dist/stripe-payment-history-SJPKA63N.js +67 -0
  59. package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
  60. package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
  61. package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
  62. package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
  63. package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
  64. package/dist/support-memory-WSG7SDKG.js +10 -0
  65. package/dist/support-memory-WSG7SDKG.js.map +1 -0
  66. package/package.json +10 -7
  67. package/.env.encrypted +0 -0
  68. package/CHANGELOG.md +0 -35
  69. package/data/tt-archive-dataset.json +0 -1
  70. package/data/validate-test-dataset.json +0 -97
  71. package/docs/CLI-AUTH.md +0 -504
  72. package/preload.ts +0 -18
  73. package/src/__tests__/init.test.ts +0 -74
  74. package/src/alignment-test.ts +0 -64
  75. package/src/check-apps.ts +0 -16
  76. package/src/commands/auth/decrypt.ts +0 -123
  77. package/src/commands/auth/encrypt.ts +0 -81
  78. package/src/commands/auth/index.ts +0 -50
  79. package/src/commands/auth/keygen.ts +0 -41
  80. package/src/commands/auth/status.ts +0 -164
  81. package/src/commands/axiom/forensic.ts +0 -868
  82. package/src/commands/axiom/index.ts +0 -697
  83. package/src/commands/build-dataset.ts +0 -311
  84. package/src/commands/db-status.ts +0 -47
  85. package/src/commands/deploys.ts +0 -219
  86. package/src/commands/eval-local/compare.ts +0 -171
  87. package/src/commands/eval-local/health.ts +0 -212
  88. package/src/commands/eval-local/index.ts +0 -76
  89. package/src/commands/eval-local/real-tools.ts +0 -416
  90. package/src/commands/eval-local/run.ts +0 -1168
  91. package/src/commands/eval-local/score-production.ts +0 -256
  92. package/src/commands/eval-local/seed.ts +0 -276
  93. package/src/commands/eval-pipeline/index.ts +0 -53
  94. package/src/commands/eval-pipeline/real-tools.ts +0 -492
  95. package/src/commands/eval-pipeline/run.ts +0 -1316
  96. package/src/commands/eval-pipeline/seed.ts +0 -395
  97. package/src/commands/eval-prompt.ts +0 -496
  98. package/src/commands/eval.test.ts +0 -253
  99. package/src/commands/eval.ts +0 -108
  100. package/src/commands/faq-classify.ts +0 -460
  101. package/src/commands/faq-cluster.ts +0 -135
  102. package/src/commands/faq-extract.ts +0 -249
  103. package/src/commands/faq-mine.ts +0 -432
  104. package/src/commands/faq-review.ts +0 -426
  105. package/src/commands/front/index.ts +0 -351
  106. package/src/commands/front/pull-conversations.ts +0 -275
  107. package/src/commands/front/tags.ts +0 -825
  108. package/src/commands/front-cache.ts +0 -1277
  109. package/src/commands/front-stats.ts +0 -75
  110. package/src/commands/health.test.ts +0 -82
  111. package/src/commands/health.ts +0 -362
  112. package/src/commands/init.test.ts +0 -89
  113. package/src/commands/init.ts +0 -106
  114. package/src/commands/inngest/client.ts +0 -294
  115. package/src/commands/inngest/events.ts +0 -296
  116. package/src/commands/inngest/investigate.ts +0 -382
  117. package/src/commands/inngest/runs.ts +0 -149
  118. package/src/commands/inngest/signal.ts +0 -143
  119. package/src/commands/kb-sync.ts +0 -498
  120. package/src/commands/memory/find.ts +0 -135
  121. package/src/commands/memory/get.ts +0 -87
  122. package/src/commands/memory/index.ts +0 -97
  123. package/src/commands/memory/stats.ts +0 -163
  124. package/src/commands/memory/store.ts +0 -49
  125. package/src/commands/memory/vote.ts +0 -159
  126. package/src/commands/pipeline.ts +0 -127
  127. package/src/commands/responses.ts +0 -856
  128. package/src/commands/tools.ts +0 -293
  129. package/src/commands/wizard.ts +0 -319
  130. package/src/index.ts +0 -172
  131. package/src/lib/crypto.ts +0 -56
  132. package/src/lib/env-loader.ts +0 -206
  133. package/src/lib/onepassword.ts +0 -137
  134. package/src/test-agent-local.ts +0 -115
  135. package/tsconfig.json +0 -11
  136. package/vitest.config.ts +0 -10
@@ -1,1168 +0,0 @@
1
- /**
2
- * Run eval suite against local environment
3
- *
4
- * Scenario-aware mocks that analyze trigger messages to return
5
- * contextually appropriate data. No more static canned responses.
6
- */
7
-
8
- import { SUPPORT_AGENT_PROMPT } from '@skillrecordings/core/agent'
9
- import {
10
- BannedPhrases,
11
- Helpfulness,
12
- InternalStateLeakage,
13
- MetaCommentary,
14
- ProductFabrication,
15
- } from '@skillrecordings/core/evals/scorers'
16
- import { generateText, stepCountIs, tool } from 'ai'
17
- import { readFile, writeFile } from 'fs/promises'
18
- import { glob } from 'glob'
19
- import { z } from 'zod'
20
- import { cleanupRealTools, createRealTools, initRealTools } from './real-tools'
21
-
22
/**
 * Options accepted by the local eval runner (`run`).
 *
 * Every field is optional; `run` supplies the defaults noted below.
 */
interface RunOptions {
  /**
   * Glob of scenario JSON files. Only consulted when `dataset` is not set;
   * `run` falls back to `fixtures/scenarios/**\/*.json` when omitted.
   */
  scenarios?: string
  /** Path to a JSON dataset file. Takes precedence over `scenarios`. */
  dataset?: string
  /** Path the `{ summary, results }` JSON is written to, if given. */
  output?: string
  /** Path to a previous results JSON file to compare the summary against. */
  baseline?: string
  /** Minimum pass rate before the process exits non-zero (default 0.8). */
  failThreshold?: number
  /** Print per-scenario traces and failure details. */
  verbose?: boolean
  /** Print the results as JSON instead of the human-readable summary. */
  json?: boolean
  /** Path to a file whose contents replace the production system prompt. */
  prompt?: string
  /** Model identifier (default `anthropic/claude-haiku-4-5`). */
  model?: string
  /** Maximum number of scenarios to run. */
  limit?: number
  /** Use real Docker services instead of mocks. */
  realTools?: boolean
}
35
-
36
/**
 * A single eval case: the customer message that triggers the agent plus
 * optional context carried over from the dataset it was loaded from.
 */
interface Scenario {
  /** Identifier of the scenario. */
  id: string
  /** Display name; the runner falls back to the trigger subject, then `id`. */
  name?: string
  subject?: string
  /** Which app the message belongs to (e.g. `ai-hero`). */
  appId?: string
  /** Customer message; preferred over `triggerMessage` when both exist. */
  trigger?: { subject: string; body: string }
  /** Same shape as `trigger`; used when `trigger` is absent. */
  triggerMessage?: { subject: string; body: string }
  expectedBehavior?: string
  category?: string
  // Additional context from dataset
  agentResponse?: { text: string; category: string }
  // NOTE(review): `direction` presumably means inbound/outbound and
  // `timestamp` is presumably an epoch value - confirm against the dataset.
  conversationHistory?: {
    direction: 'in' | 'out'
    body: string
    timestamp: number
  }[]
}
62
-
63
/**
 * Outcome of running one scenario through the agent and the quality scorers.
 */
interface ScenarioResult {
  id: string
  name: string
  passed: boolean
  /** Wall-clock time spent on the scenario. */
  durationMs: number
  /** Drafted reply body, or an `ERROR: ...` string when generation threw. */
  output: string
  /** Names of the tools the agent invoked, in call order. */
  toolCalls: string[]
  /** True when the agent never called `draftResponse`. */
  noDraft: boolean
  /** Per-scorer verdicts for `output`. */
  scores: {
    internalLeaks: {
      passed: boolean
      matches: string[]
    }
    metaCommentary: {
      passed: boolean
      matches: string[]
    }
    bannedPhrases: {
      passed: boolean
      matches: string[]
    }
    fabrication: {
      passed: boolean
      matches: string[]
    }
    helpfulness: {
      score: number
    }
  }
  category: string
  /** Human-readable explanations of why the scenario did not pass. */
  failureReasons: string[]
}
81
-
82
/**
 * Aggregate statistics for a whole eval run.
 */
interface RunSummary {
  total: number
  passed: number
  failed: number
  /** Count of scenarios where no draft was produced. */
  noDraft: number
  passRate: number
  durationMs: number
  /** Pass / fail / no-draft counts keyed by category name. */
  byCategory: Record<string, { passed: number; failed: number; noDraft: number }>
  /** Failure counts per quality scorer. */
  failures: {
    internalLeaks: number
    metaCommentary: number
    bannedPhrases: number
    fabrication: number
  }
  // NOTE(review): presumably percentiles of per-scenario durationMs - confirm
  // against the aggregation code.
  latency: { p50: number; p95: number; p99: number }
}
105
-
106
/**
 * Scenario classifier - analyzes message content to determine
 * what type of support request this is
 */
type ScenarioType =
  | 'access_issue' // Can't access, lost access, login problems
  | 'refund_request' // Wants money back
  | 'transfer_request' // Move purchase to different email
  | 'technical_help' // How do I use X, code questions
  | 'product_inquiry' // What's included, pricing, availability
  | 'zoom_link' // Missing workshop/event access
  | 'invoice_request' // Need invoice, receipt
  | 'fan_mail' // Personal message to instructor
  | 'spam' // Vendor outreach, not real support
  | 'general' // Catch-all

/**
 * Classify a support message by keyword matching on its subject and body.
 *
 * The categories are checked in a fixed order and the first match wins, so a
 * message that mentions both "different email" and "transfer" is reported as
 * an access issue. Matching is case-insensitive and whitespace-insensitive.
 *
 * @param subject - Subject line of the message.
 * @param body - Body of the message.
 * @returns The first matching scenario type, or `'general'` when nothing matches.
 */
function classifyScenario(subject: string, body: string): ScenarioType {
  // Normalize text: lowercase, fold typographic apostrophes (mail clients
  // routinely turn "can't" into "can’t") and collapse newlines / extra spaces.
  const text = `${subject} ${body}`
    .toLowerCase()
    .replace(/[\u2018\u2019\u02bc]/g, "'")
    .replace(/\s+/g, ' ')

  const has = (phrase: string): boolean => text.includes(phrase)
  const hasAny = (...phrases: string[]): boolean =>
    phrases.some((phrase) => has(phrase))

  // Access issues
  if (
    hasAny(
      "don't have access",
      "can't access",
      'lost access',
      'no access',
      "can't log in",
      'cannot login',
      'restore access',
      'logging in with github',
      'login with github',
      'logged in with github',
      'different email',
      'restore the access'
    )
  ) {
    return 'access_issue'
  }

  // Refund requests
  if (
    hasAny('refund', 'money back', 'charge back', "didn't mean to buy") ||
    (has('cancel') && has('purchase'))
  ) {
    return 'refund_request'
  }

  // Transfer requests
  if (
    hasAny('transfer', 'change email', 'wrong email') ||
    (has('move') && has('email'))
  ) {
    return 'transfer_request'
  }

  // Zoom/workshop access
  if (
    hasAny('zoom', 'calendar invite', 'live event') ||
    (has('workshop') && hasAny('link', 'access'))
  ) {
    return 'zoom_link'
  }

  // Invoice/receipt
  if (hasAny('invoice', 'receipt') || (has('tax') && has('document'))) {
    return 'invoice_request'
  }

  // Product inquiry
  if (
    hasAny(
      'sold out',
      'discount',
      'pricing',
      "what's included",
      "what's the difference"
    ) ||
    (has('buy') && has('button'))
  ) {
    return 'product_inquiry'
  }

  // Technical help
  if (
    hasAny('how do i', 'how to', 'error', 'not working', 'code', 'tutorial') ||
    (has('typescript') && has('help'))
  ) {
    return 'technical_help'
  }

  // Fan mail / personal
  if (
    hasAny('changed my career', 'love your', 'big fan', 'appreciate') ||
    (has('thank you') && has('course'))
  ) {
    return 'fan_mail'
  }

  // Spam/vendor. "seo" is matched as a whole word so that unrelated words
  // containing those letters (e.g. "seoul") are not flagged.
  if (
    hasAny('partnership', 'sponsor', 'backlink', 'guest post') ||
    /\bseo\b/.test(text)
  ) {
    return 'spam'
  }

  return 'general'
}
234
-
235
/**
 * Create scenario-aware mock tools
 *
 * Each scenario type gets appropriate mock responses that
 * trigger realistic agent behavior. No tool touches a real service: every
 * `execute` returns canned data chosen from `scenarioType`, the scenario's
 * `appId` and (for access issues) the wording of the trigger body.
 *
 * @param scenarioType - Classification of the trigger message.
 * @param scenario - Scenario being run; `trigger` is preferred over
 *   `triggerMessage`, and an empty message is used when both are missing.
 * @returns The tool set handed to the model for this scenario.
 */
function createMockTools(scenarioType: ScenarioType, scenario: Scenario) {
  const trigger = scenario.trigger ||
    scenario.triggerMessage || { subject: '', body: '' }

  // Computed once; the lookupUser mock branches on the wording of the body.
  const bodyText = trigger.body.toLowerCase()
  const isAiHero = scenario.appId === 'ai-hero'

  // Every "found" lookup reports the same placeholder customer record.
  const customer = (email: string) => ({
    id: 'user_123',
    email,
    name: 'Customer',
  })

  return {
    lookupUser: tool({
      description: 'Look up user by email',
      inputSchema: z.object({
        email: z.string(),
        appId: z.string(),
      }),
      execute: async ({ email }) => {
        // Scenario-aware responses
        switch (scenarioType) {
          case 'access_issue':
            // User found but no purchase - classic "different email" scenario
            if (
              bodyText.includes('different email') ||
              bodyText.includes('github')
            ) {
              return {
                found: true,
                user: customer(email),
                purchases: [], // No purchases - that's the problem!
              }
            }
            // Otherwise user might have purchase but access issue
            return {
              found: true,
              user: customer(email),
              purchases: [
                {
                  id: 'purch_1',
                  product: isAiHero ? 'AI Hero Workshop' : 'Total TypeScript',
                  date: '2025-12-15',
                  status: 'active',
                },
              ],
            }

          case 'refund_request':
            // User with recent purchase
            return {
              found: true,
              user: customer(email),
              purchases: [
                {
                  id: 'purch_refund_1',
                  product: isAiHero
                    ? 'AI Hero Workshop'
                    : 'Total TypeScript Pro',
                  date: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000)
                    .toISOString()
                    .split('T')[0], // 7 days ago
                  status: 'active',
                  amount: 249,
                },
              ],
            }

          case 'transfer_request':
            return {
              found: true,
              user: customer(email),
              purchases: [
                {
                  id: 'purch_transfer_1',
                  product: 'Total TypeScript',
                  date: '2025-01-10',
                  status: 'active',
                },
              ],
            }

          case 'zoom_link':
          case 'product_inquiry':
          case 'technical_help':
          case 'invoice_request':
            // Found with purchase
            return {
              found: true,
              user: customer(email),
              purchases: [
                {
                  id: 'purch_1',
                  product: isAiHero
                    ? 'Ralph Workshop Ticket'
                    : 'Total TypeScript',
                  date: '2025-01-15',
                  status: 'active',
                },
              ],
            }

          case 'fan_mail':
          case 'spam':
            // Might not even need to look up
            return {
              found: false,
              user: null,
              purchases: [],
            }

          default:
            return {
              found: true,
              user: customer(email),
              purchases: [
                {
                  id: 'purch_1',
                  product: 'Total TypeScript',
                  date: '2025-01-01',
                  status: 'active',
                },
              ],
            }
        }
      },
    }),

    searchKnowledge: tool({
      description: 'Search knowledge base',
      inputSchema: z.object({ query: z.string(), appId: z.string() }),
      // The query is ignored; results depend only on the scenario type.
      execute: async () => {
        // Return relevant knowledge for technical questions
        if (scenarioType === 'technical_help') {
          return {
            similarTickets: [
              {
                data: 'Similar question answered: Check the TypeScript handbook section on generics.',
                score: 0.85,
              },
            ],
            knowledge: [
              {
                data: 'For TypeScript fundamentals, start with the Beginner TypeScript tutorial.',
                score: 0.9,
              },
            ],
            goodResponses: [
              {
                data: 'Example response: "For that specific error, try narrowing the type first..."',
                score: 0.8,
              },
            ],
          }
        }

        // Minimal/empty for other scenarios to avoid fabrication
        return {
          similarTickets: [],
          knowledge: [],
          goodResponses: [],
        }
      },
    }),

    searchProductContent: tool({
      description: 'Search product content',
      inputSchema: z.object({ query: z.string() }),
      // The query is ignored; results depend only on the scenario type.
      execute: async () => {
        if (scenarioType === 'technical_help') {
          return {
            results: [
              {
                title: 'Beginner TypeScript Tutorial',
                type: 'course',
                url: 'https://totaltypescript.com/tutorials/beginners-typescript',
              },
            ],
          }
        }
        return { results: [] }
      },
    }),

    draftResponse: tool({
      description: 'Draft a response to send to customer',
      inputSchema: z.object({ body: z.string() }),
      execute: async ({ body }) => ({ drafted: true, body }),
    }),

    escalateToHuman: tool({
      description: 'Escalate to human support',
      inputSchema: z.object({
        reason: z.string(),
        urgency: z.enum(['low', 'medium', 'high']),
      }),
      execute: async ({ reason, urgency }) => ({
        escalated: true,
        reason,
        urgency,
      }),
    }),

    assignToInstructor: tool({
      description:
        'Assign conversation to instructor for personal correspondence',
      inputSchema: z.object({
        conversationId: z.string(),
        reason: z.string(),
      }),
      execute: async ({ conversationId, reason }) => ({
        status: 'pending_approval',
        conversationId,
        reason,
        message: 'Instructor assignment submitted for approval',
      }),
    }),

    memory_search: tool({
      description: 'Search semantic memory',
      inputSchema: z.object({ query: z.string() }),
      execute: async () => ({ results: [], total: 0 }),
    }),

    memory_store: tool({
      description: 'Store learning in memory',
      inputSchema: z.object({
        content: z.string(),
        tags: z.array(z.string()).optional(),
      }),
      execute: async () => ({ stored: true, id: 'mem_mock_1' }),
    }),

    memory_vote: tool({
      description: 'Vote on memory usefulness',
      inputSchema: z.object({
        memoryId: z.string(),
        vote: z.enum(['up', 'down']),
      }),
      execute: async () => ({ success: true }),
    }),

    memory_cite: tool({
      description: 'Cite a memory as used',
      inputSchema: z.object({ memoryId: z.string() }),
      execute: async () => ({ cited: true }),
    }),

    processRefund: tool({
      description: 'Process a refund',
      inputSchema: z.object({
        purchaseId: z.string(),
        appId: z.string(),
        reason: z.string(),
      }),
      execute: async ({ purchaseId, reason }) => ({
        status: 'pending_approval',
        purchaseId,
        reason,
        message: 'Refund submitted for approval',
      }),
    }),

    transferPurchase: tool({
      description: 'Transfer purchase to another email',
      inputSchema: z.object({
        purchaseId: z.string(),
        appId: z.string(),
        fromUserId: z.string(),
        toEmail: z.string(),
        reason: z.string(),
      }),
      execute: async () => ({
        status: 'pending_approval',
        message: 'Transfer submitted for approval',
      }),
    }),

    check_product_availability: tool({
      description: 'Check if product is available or sold out',
      inputSchema: z.object({
        productId: z.string().optional(),
        appId: z.string(),
      }),
      execute: async () => {
        // Product inquiries see a limited-seat product that is still open
        if (scenarioType === 'product_inquiry') {
          return {
            soldOut: false,
            quantityRemaining: 12,
            quantityAvailable: 50,
            enrollmentOpen: true,
          }
        }
        // Default: available
        return {
          soldOut: false,
          quantityRemaining: -1, // unlimited
          enrollmentOpen: true,
        }
      },
    }),

    getPaymentHistory: tool({
      description: 'Get payment history from Stripe',
      inputSchema: z.object({
        customerEmail: z.string(),
        limit: z.number().optional(),
      }),
      execute: async () => ({
        charges: [
          {
            id: 'ch_mock_1',
            amount: 24900,
            status: 'succeeded',
            created: Date.now() - 7 * 24 * 60 * 60 * 1000,
          },
        ],
      }),
    }),

    getSubscriptionStatus: tool({
      description: 'Get subscription status',
      inputSchema: z.object({
        customerId: z.string(),
        stripeAccountId: z.string(),
      }),
      execute: async () => ({
        subscription: null, // Most products aren't subscriptions
      }),
    }),

    lookupCharge: tool({
      description: 'Look up specific charge',
      inputSchema: z.object({ chargeId: z.string() }),
      execute: async ({ chargeId }) => ({
        charge: {
          id: chargeId,
          amount: 24900,
          status: 'succeeded',
          refunded: false,
        },
      }),
    }),

    verifyRefund: tool({
      description: 'Verify refund status',
      inputSchema: z.object({ refundId: z.string() }),
      execute: async ({ refundId }) => ({
        refund: {
          id: refundId,
          status: 'succeeded',
          amount: 24900,
        },
      }),
    }),
  }
}
601
-
602
/**
 * Run the eval suite against the local environment.
 *
 * Steps: optionally connect to the real Docker-backed tools, load the system
 * prompt, load scenarios (from a dataset file or a glob of scenario files),
 * run every scenario sequentially, aggregate the results, then optionally
 * compare against a baseline, write the results to disk and print them.
 *
 * The process exits with code 1 when the Docker services cannot be reached,
 * when the dataset is not a JSON array, when no scenario files match, or when
 * the pass rate over drafted scenarios is below `failThreshold` and at least
 * one scenario failed.
 *
 * @param options - Runner options; see `RunOptions` for defaults.
 */
export async function run(options: RunOptions): Promise<void> {
  const {
    scenarios: scenarioGlob,
    dataset: datasetPath,
    output,
    baseline,
    failThreshold = 0.8,
    verbose = false,
    json = false,
    prompt: promptPath,
    model = 'anthropic/claude-haiku-4-5',
    limit,
    realTools = false,
  } = options

  // Initialize real tools if flag is set
  if (realTools) {
    if (!json) console.log('🔧 Using REAL tools (Docker services)...')
    try {
      await initRealTools()
      if (!json) console.log('✅ Connected to MySQL and Qdrant')
    } catch (error) {
      console.error('❌ Failed to connect to Docker services:', error)
      console.error(
        ' Make sure services are running: docker compose -f docker/eval.yml up -d'
      )
      process.exit(1)
    }
  }

  // Load prompt
  let systemPrompt = SUPPORT_AGENT_PROMPT
  if (promptPath) {
    systemPrompt = await readFile(promptPath, 'utf-8')
    if (!json) console.log(`Using prompt from: ${promptPath}`)
  } else {
    if (!json) console.log('Using production prompt')
  }

  // Load scenarios from either scenarios glob or dataset file
  let scenarios: Scenario[] = []

  if (datasetPath) {
    // Load from dataset file (comprehensive-dataset.json format)
    const datasetContent = await readFile(datasetPath, 'utf-8')
    const dataset: unknown = JSON.parse(datasetContent)
    if (!Array.isArray(dataset)) {
      console.error(`Dataset must be a JSON array: ${datasetPath}`)
      process.exit(1)
    }
    scenarios = dataset.map((item: any) => {
      const trigger = item.triggerMessage || {
        subject: item.subject || '',
        body: '',
      }
      const fullText = `${trigger.subject} ${trigger.body}`.toLowerCase()

      // Detect app from content
      let detectedApp = 'total-typescript'
      if (
        fullText.includes('ai hero') ||
        fullText.includes('aihero.dev') ||
        fullText.includes('ai-hero') ||
        fullText.includes('ralph') ||
        fullText.includes('autonomous software engineers')
      ) {
        detectedApp = 'ai-hero'
      }

      return {
        id: item.id || item.conversationId,
        name: trigger.subject || 'Unknown',
        trigger,
        triggerMessage: item.triggerMessage,
        category: item.category || 'general',
        // Fall back to the detected app when `app` is missing as well as when
        // it is the literal 'unknown'; a bare `!== 'unknown'` check would let
        // an absent value through as undefined.
        appId: item.app && item.app !== 'unknown' ? item.app : detectedApp,
        agentResponse: item.agentResponse,
        conversationHistory: item.conversationHistory,
      }
    })
  } else {
    // Load from scenario files
    const glob_ = scenarioGlob || 'fixtures/scenarios/**/*.json'
    const scenarioFiles = await glob(glob_)

    if (scenarioFiles.length === 0) {
      console.error('No scenarios found. Use --scenarios or --dataset')
      process.exit(1)
    }

    scenarios = await Promise.all(
      scenarioFiles.map(async (file) => {
        const content = await readFile(file, 'utf-8')
        return JSON.parse(content)
      })
    )
  }

  // Apply limit
  if (limit && limit < scenarios.length) {
    scenarios = scenarios.slice(0, limit)
  }

  if (!json) {
    console.log(
      `\n🧪 Running ${scenarios.length} scenarios (model: ${model})\n`
    )
  }

  const startTime = Date.now()
  const results: ScenarioResult[] = []

  try {
    // Scenarios run one at a time so the progress line stays meaningful.
    for (let i = 0; i < scenarios.length; i++) {
      if (!json) {
        process.stdout.write(`\r Processing ${i + 1}/${scenarios.length}...`)
      }

      const scenario = scenarios[i]
      if (!scenario) continue
      const result = await runScenario(
        scenario,
        systemPrompt,
        model,
        verbose,
        realTools
      )
      results.push(result)
    }
  } finally {
    // Cleanup real tools if used - also when a scenario throws, so the
    // connections opened by initRealTools are not left dangling.
    if (realTools) {
      await cleanupRealTools()
    }
  }

  if (!json) {
    console.log('\n')
  }

  const totalDuration = Date.now() - startTime
  const summary = aggregateResults(results, totalDuration)

  // Compare to baseline if provided
  if (baseline) {
    try {
      const baselineContent = await readFile(baseline, 'utf-8')
      const baselineData = JSON.parse(baselineContent)
      // Accept either a full results file ({ summary, results }) or a bare summary
      printComparison(summary, baselineData.summary || baselineData)
    } catch (e) {
      // A missing or malformed baseline is reported but does not abort the run
      console.error('Could not load baseline:', e)
    }
  }

  // Save results if output specified
  if (output) {
    await writeFile(output, JSON.stringify({ summary, results }, null, 2))
    if (!json) {
      console.log(`Results saved to ${output}`)
    }
  }

  if (json) {
    console.log(JSON.stringify({ summary, results }, null, 2))
  } else {
    printSummary(summary, failThreshold)

    // Show failures if verbose
    if (verbose) {
      const failures = results.filter((r) => !r.passed && !r.noDraft)
      if (failures.length > 0) {
        console.log('\n--- FAILURES ---\n')
        // Only the first 10 failures are listed to keep the output readable
        for (const f of failures.slice(0, 10)) {
          console.log(`❌ ${f.name}`)
          for (const reason of f.failureReasons) {
            console.log(` └─ ${reason}`)
          }
          if (f.output) {
            console.log(` Output: ${f.output.slice(0, 150)}...`)
          }
          console.log('')
        }
      }
    }
  }

  // Exit with error if below threshold. The rate ignores no-draft scenarios;
  // `|| 0` turns the NaN from 0/0 (nothing passed or failed) into 0.
  const effectivePassRate =
    summary.passed / (summary.passed + summary.failed) || 0
  if (effectivePassRate < failThreshold && summary.failed > 0) {
    process.exit(1)
  }
}
789
-
790
- async function runScenario(
791
- scenario: Scenario,
792
- systemPrompt: string,
793
- model: string,
794
- verbose?: boolean,
795
- useRealTools?: boolean
796
- ): Promise<ScenarioResult> {
797
- const startTime = Date.now()
798
- const failureReasons: string[] = []
799
-
800
- // Build input message
801
- const trigger = scenario.trigger ||
802
- scenario.triggerMessage || { subject: '', body: '' }
803
- const input = `Subject: ${trigger.subject}\n\n${trigger.body}`
804
- const name = scenario.name || trigger.subject || scenario.id
805
-
806
- // Classify scenario and create appropriate tools (mock or real)
807
- const scenarioType = classifyScenario(trigger.subject, trigger.body)
808
- if (verbose) {
809
- console.log(
810
- `[CLASSIFY] "${trigger.subject.slice(0, 50)}..." โ†’ ${scenarioType}`
811
- )
812
- if (useRealTools) {
813
- console.log(`[TOOLS] Using REAL Docker services`)
814
- }
815
- }
816
-
817
- // Use real tools if flag is set, otherwise use mocks
818
- const tools = useRealTools
819
- ? createRealTools({
820
- appId: scenario.appId,
821
- customerEmail: trigger.body.match(
822
- /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)/
823
- )?.[1],
824
- })
825
- : createMockTools(scenarioType, scenario)
826
-
827
- // Use scenarioType as category for better tracking
828
- const category = scenarioType
829
-
830
- let output = ''
831
- let toolCalls: string[] = []
832
- let noDraft = false
833
-
834
- try {
835
- // Add explicit tool requirement - the LLM must use draftResponse, not text output
836
- const evalSystemPrompt =
837
- systemPrompt +
838
- `
839
-
840
- ## CRITICAL: Tool Usage Requirements
841
- 1. You MUST use draftResponse to send ANY reply to the customer
842
- 2. NEVER output text responses directly - you are in a tool-use only mode
843
- 3. Your only outputs should be tool calls. No explanatory text.
844
- 4. If you want to respond to the customer, call draftResponse with the response body
845
- 5. If you decide not to respond, make no tool calls at all
846
-
847
- Think step by step:
848
- 1. Analyze the customer message
849
- 2. Call relevant tools (lookupUser, searchKnowledge, etc.)
850
- 3. Based on tool results, either:
851
- - Call draftResponse with your reply, OR
852
- - Make no response (for spam, vendor emails, already handled, etc.)
853
-
854
- App: ${scenario.appId || 'total-typescript'}`
855
-
856
- const result = await generateText({
857
- model,
858
- system: evalSystemPrompt,
859
- messages: [{ role: 'user', content: input }],
860
- tools,
861
- stopWhen: stepCountIs(10), // Match production - use stopWhen for multi-step
862
- })
863
-
864
- // Extract tool calls
865
- toolCalls = result.steps
866
- .flatMap((s) => s.toolCalls || [])
867
- .map((tc) => tc.toolName)
868
-
869
- // Debug all steps when verbose
870
- if (verbose) {
871
- console.log(
872
- `\n[TRACE] ${name} (${result.steps.length} steps, reason: ${result.finishReason})`
873
- )
874
- for (let i = 0; i < result.steps.length; i++) {
875
- const step = result.steps[i]
876
- if (!step) continue
877
- const calls = (step.toolCalls || [])
878
- .map((tc) => `${tc.toolName}`)
879
- .join(', ')
880
- console.log(
881
- ` Step ${i + 1}: ${calls || 'no tool calls'} [reason: ${step.finishReason}]`
882
- )
883
- for (const tr of step.toolResults || []) {
884
- const preview = JSON.stringify(tr.output).slice(0, 300)
885
- console.log(` โ†’ ${preview}`)
886
- }
887
- if (step.text) {
888
- console.log(` text: ${step.text.slice(0, 100)}...`)
889
- }
890
- }
891
- }
892
-
893
- // Find draftResponse output - this is the only way to send to customers
894
- // Text output without draftResponse is internal reasoning (not sent)
895
- const draftCall = result.steps
896
- .flatMap((s) => s.toolCalls || [])
897
- .find((tc) => tc.toolName === 'draftResponse')
898
-
899
- if (draftCall) {
900
- // Explicit draft call - this is a customer response
901
- output = (draftCall.input as { body: string }).body
902
- if (verbose) {
903
- console.log(` โœ… DRAFTED: ${output.slice(0, 100)}...`)
904
- }
905
- } else {
906
- // No draftResponse = correctly silent (even if there's reasoning text)
907
- noDraft = true
908
- if (verbose) {
909
- if (result.text && result.text.trim().length > 0) {
910
- console.log(` ๐Ÿšซ SILENT (reasoning): ${result.text.slice(0, 80)}...`)
911
- } else {
912
- console.log(` ๐Ÿšซ SILENT (no output)`)
913
- }
914
- }
915
- }
916
- } catch (error) {
917
- output = `ERROR: ${error instanceof Error ? error.message : 'Unknown error'}`
918
- failureReasons.push(output)
919
- }
920
-
921
- const durationMs = Date.now() - startTime
922
-
923
- // Run quality scorers on output
924
- const leakResult = InternalStateLeakage({ output })
925
- const metaResult = MetaCommentary({ output })
926
- const bannedResult = BannedPhrases({ output })
927
- const fabResult = ProductFabrication({ output })
928
- const helpResult = Helpfulness({ output })
929
-
930
- const scores = {
931
- internalLeaks: {
932
- passed: leakResult.score === 1,
933
- matches: leakResult.metadata?.foundLeaks || [],
934
- },
935
- metaCommentary: {
936
- passed: metaResult.score === 1,
937
- matches: metaResult.metadata?.foundMeta || [],
938
- },
939
- bannedPhrases: {
940
- passed: bannedResult.score === 1,
941
- matches: bannedResult.metadata?.foundBanned || [],
942
- },
943
- fabrication: {
944
- passed: fabResult.score === 1,
945
- matches: fabResult.metadata?.foundFabrication || [],
946
- },
947
- helpfulness: {
948
- score: helpResult.score,
949
- },
950
- }
951
-
952
- // Build failure reasons
953
- if (!scores.internalLeaks.passed) {
954
- failureReasons.push(
955
- `Internal leak: ${scores.internalLeaks.matches.join(', ')}`
956
- )
957
- }
958
- if (!scores.metaCommentary.passed) {
959
- failureReasons.push(
960
- `Meta commentary: ${scores.metaCommentary.matches.join(', ')}`
961
- )
962
- }
963
- if (!scores.bannedPhrases.passed) {
964
- failureReasons.push(
965
- `Banned phrase: ${scores.bannedPhrases.matches.join(', ')}`
966
- )
967
- }
968
- if (!scores.fabrication.passed) {
969
- failureReasons.push(`Fabrication: ${scores.fabrication.matches.join(', ')}`)
970
- }
971
-
972
- // Determine pass/fail based on expectedBehavior
973
- // Check if agent behavior matches what the scenario expects
974
- const expectedBehavior = scenario.expectedBehavior?.toLowerCase() || ''
975
-
976
- // Expected to draft a response?
977
- const shouldDraft =
978
- expectedBehavior.includes('draft') ||
979
- expectedBehavior.includes('respond') ||
980
- expectedBehavior.includes('help') ||
981
- expectedBehavior.includes('ask_for_details')
982
-
983
- // Expected to stay silent?
984
- const shouldBeSilent =
985
- expectedBehavior.includes('silent') ||
986
- expectedBehavior.includes('ignore') ||
987
- expectedBehavior.includes('no_response')
988
-
989
- // Expected to escalate?
990
- const shouldEscalate =
991
- expectedBehavior.includes('escalate') ||
992
- expectedBehavior.includes('human') ||
993
- expectedBehavior.includes('approval')
994
-
995
- // Check for escalation in tool calls
996
- const didEscalate =
997
- toolCalls.includes('escalateToHuman') ||
998
- toolCalls.includes('assignToInstructor')
999
-
1000
- let passed = true
1001
-
1002
- // If expected draft but got silence โ†’ FAIL
1003
- if (shouldDraft && noDraft) {
1004
- passed = false
1005
- failureReasons.push('Expected draft response but agent stayed silent')
1006
- }
1007
-
1008
- // If expected silence but got draft โ†’ check draft quality
1009
- if (shouldBeSilent && !noDraft) {
1010
- // Draft when should be silent is a failure
1011
- passed = false
1012
- failureReasons.push('Expected silence but agent drafted a response')
1013
- }
1014
-
1015
- // If expected escalate but didn't โ†’ FAIL
1016
- if (shouldEscalate && !didEscalate) {
1017
- passed = false
1018
- failureReasons.push('Expected escalation but agent did not escalate')
1019
- }
1020
-
1021
- // If drafted, also check quality
1022
- if (!noDraft) {
1023
- if (!scores.internalLeaks.passed) passed = false
1024
- if (!scores.metaCommentary.passed) passed = false
1025
- if (!scores.bannedPhrases.passed) passed = false
1026
- if (!scores.fabrication.passed) passed = false
1027
- }
1028
-
1029
- // If no expectedBehavior specified, fall back to old logic
1030
- if (!expectedBehavior) {
1031
- passed =
1032
- noDraft ||
1033
- (scores.internalLeaks.passed &&
1034
- scores.metaCommentary.passed &&
1035
- scores.bannedPhrases.passed &&
1036
- scores.fabrication.passed)
1037
- }
1038
-
1039
- return {
1040
- id: scenario.id,
1041
- name,
1042
- passed,
1043
- durationMs,
1044
- output,
1045
- toolCalls,
1046
- noDraft,
1047
- scores,
1048
- category,
1049
- failureReasons,
1050
- }
1051
- }
1052
-
1053
/**
 * Rolls per-scenario results up into a single run summary.
 *
 * Run-level counters:
 * - `passed` / `failed` partition all results by `result.passed`.
 * - `noDraft` counts results that produced no draft AND passed, i.e. it is a
 *   subset of `passed`.
 *
 * Per-category counters are mutually exclusive so they add up to the number
 * of results in that category: a result is `failed` when it did not pass,
 * otherwise `noDraft` when it produced no draft, otherwise `passed`.
 *
 * @param results - Individual scenario outcomes; may be empty.
 * @param totalDurationMs - Duration reported as `durationMs` in the summary.
 * @returns Totals, pass rate, per-category breakdown, quality-failure counts
 *   (drafted results only) and latency percentiles.
 */
function aggregateResults(
  results: ScenarioResult[],
  totalDurationMs: number
): RunSummary {
  const passed = results.filter((r) => r.passed).length
  // Only count as noDraft if the result also passed.
  const noDraft = results.filter((r) => r.noDraft && r.passed).length
  // Failed is anything that didn't pass.
  const failed = results.filter((r) => !r.passed).length

  // Group by category. A Map is used while counting so category names that
  // collide with Object.prototype members (e.g. "constructor") work.
  const categoryStats = new Map<
    string,
    { passed: number; failed: number; noDraft: number }
  >()
  for (const result of results) {
    const category = result.category || 'general'
    let stats = categoryStats.get(category)
    if (!stats) {
      stats = { passed: 0, failed: 0, noDraft: 0 }
      categoryStats.set(category, stats)
    }
    // Check failure first: a result that stayed silent but did not pass must
    // show up as failed, matching the run-level noDraft definition above.
    if (!result.passed) {
      stats.failed++
    } else if (result.noDraft) {
      stats.noDraft++
    } else {
      stats.passed++
    }
  }
  const byCategory: Record<
    string,
    { passed: number; failed: number; noDraft: number }
  > = Object.fromEntries(categoryStats)

  // Count quality failure types (only for results that produced a draft).
  const withDrafts = results.filter((r) => !r.noDraft)
  const failures = {
    internalLeaks: withDrafts.filter((r) => !r.scores.internalLeaks.passed)
      .length,
    metaCommentary: withDrafts.filter((r) => !r.scores.metaCommentary.passed)
      .length,
    bannedPhrases: withDrafts.filter((r) => !r.scores.bannedPhrases.passed)
      .length,
    fabrication: withDrafts.filter((r) => !r.scores.fabrication.passed).length,
  }

  // Latency percentiles: index floor(n * p) into the ascending durations.
  // `map` yields a fresh array, so sorting does not mutate `results`.
  const durations = results.map((r) => r.durationMs).sort((a, b) => a - b)
  const percentile = (p: number): number =>
    durations[Math.floor(durations.length * p)] ?? 0
  const latency = {
    p50: percentile(0.5),
    p95: percentile(0.95),
    p99: percentile(0.99),
  }

  return {
    total: results.length,
    passed,
    failed,
    noDraft,
    passRate: results.length > 0 ? passed / results.length : 0,
    durationMs: totalDurationMs,
    byCategory,
    failures,
    latency,
  }
}
1112
-
1113
/**
 * Prints a human-readable eval report to stdout: totals, quality breakdown
 * (only when something failed), per-category counts, latency percentiles and
 * the final draft-quality verdict against `threshold`.
 *
 * @param summary - Aggregated run results to report.
 * @param threshold - Minimum acceptable pass rate as a fraction (0..1).
 */
function printSummary(summary: RunSummary, threshold: number): void {
  console.log('🧪 Eval Results\n')
  console.log(`Scenarios: ${summary.total} total`)
  console.log(
    ` ✅ Passed: ${summary.passed} (${(summary.passRate * 100).toFixed(1)}%)`
  )
  console.log(` ❌ Failed: ${summary.failed}`)
  console.log(` 🚫 No draft: ${summary.noDraft}`)

  if (summary.failed > 0) {
    console.log('\nQuality Breakdown (drafts with issues):')
    if (summary.failures.internalLeaks > 0) {
      console.log(` 🚨 Internal leaks: ${summary.failures.internalLeaks}`)
    }
    if (summary.failures.metaCommentary > 0) {
      console.log(` 💬 Meta-commentary: ${summary.failures.metaCommentary}`)
    }
    if (summary.failures.bannedPhrases > 0) {
      console.log(` 🚫 Banned phrases: ${summary.failures.bannedPhrases}`)
    }
    if (summary.failures.fabrication > 0) {
      console.log(` 🎭 Fabrication: ${summary.failures.fabrication}`)
    }
  }

  console.log('\nBy Category:')
  for (const [cat, stats] of Object.entries(summary.byCategory)) {
    const total = stats.passed + stats.failed + stats.noDraft
    console.log(
      ` ${cat}: ${stats.passed}✅ ${stats.failed}❌ ${stats.noDraft}🚫 (${total} total)`
    )
  }

  console.log('\nLatency:')
  console.log(` p50: ${summary.latency.p50}ms`)
  console.log(` p95: ${summary.latency.p95}ms`)
  console.log(` p99: ${summary.latency.p99}ms`)

  // Default to 100% only when nothing was scored. A `|| 1` fallback would
  // also rewrite a genuine 0% (every scenario failed) into 100%.
  const scored = summary.passed + summary.failed
  const effectivePassRate = scored > 0 ? summary.passed / scored : 1
  const passIcon = effectivePassRate >= threshold ? '✅' : '❌'
  console.log(
    `\nDraft quality: ${(effectivePassRate * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%) ${passIcon}`
  )
}
1158
-
1159
/**
 * Prints how the current run's pass rate compares to a baseline run.
 *
 * The output line shows baseline rate, current rate, the signed delta in
 * percentage points, and an up arrow when the delta is >= 0 (down otherwise).
 *
 * @param current - Summary of the run being evaluated.
 * @param baseline - Summary of the run to compare against.
 */
function printComparison(current: RunSummary, baseline: RunSummary): void {
  console.log('\n🔬 Comparison to Baseline\n')

  const passRateDelta = current.passRate - baseline.passRate
  const passRateIcon = passRateDelta >= 0 ? '⬆️' : '⬇️'
  // Negative numbers already carry "-"; only positive deltas need a sign.
  const sign = passRateDelta > 0 ? '+' : ''

  console.log(
    `Pass rate: ${(baseline.passRate * 100).toFixed(1)}% → ${(current.passRate * 100).toFixed(1)}% ${sign}${(passRateDelta * 100).toFixed(1)}% ${passRateIcon}`
  )
}