@skillrecordings/cli 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skill.mjs +27 -0
- package/dist/chunk-2NCCVTEE.js +22342 -0
- package/dist/chunk-2NCCVTEE.js.map +1 -0
- package/dist/chunk-3E3GYSZR.js +7071 -0
- package/dist/chunk-3E3GYSZR.js.map +1 -0
- package/dist/chunk-F4EM72IH.js +86 -0
- package/dist/chunk-F4EM72IH.js.map +1 -0
- package/dist/chunk-FGP7KUQW.js +432 -0
- package/dist/chunk-FGP7KUQW.js.map +1 -0
- package/dist/chunk-H3D6VCME.js +55 -0
- package/dist/chunk-H3D6VCME.js.map +1 -0
- package/dist/chunk-HK3PEWFD.js +208 -0
- package/dist/chunk-HK3PEWFD.js.map +1 -0
- package/dist/chunk-KEV3QKXP.js +4495 -0
- package/dist/chunk-KEV3QKXP.js.map +1 -0
- package/dist/chunk-MG37YDAK.js +882 -0
- package/dist/chunk-MG37YDAK.js.map +1 -0
- package/dist/chunk-MLNDSBZ4.js +482 -0
- package/dist/chunk-MLNDSBZ4.js.map +1 -0
- package/dist/chunk-N2WIV2JV.js +22 -0
- package/dist/chunk-N2WIV2JV.js.map +1 -0
- package/dist/chunk-PWWRCN5W.js +2067 -0
- package/dist/chunk-PWWRCN5W.js.map +1 -0
- package/dist/chunk-SKHBM3XP.js +7746 -0
- package/dist/chunk-SKHBM3XP.js.map +1 -0
- package/dist/chunk-WFANXVQG.js +64 -0
- package/dist/chunk-WFANXVQG.js.map +1 -0
- package/dist/chunk-WYKL32C3.js +275 -0
- package/dist/chunk-WYKL32C3.js.map +1 -0
- package/dist/chunk-ZNF7XD2S.js +134 -0
- package/dist/chunk-ZNF7XD2S.js.map +1 -0
- package/dist/config-AUAIYDSI.js +20 -0
- package/dist/config-AUAIYDSI.js.map +1 -0
- package/dist/fileFromPath-XN7LXIBI.js +134 -0
- package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
- package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
- package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
- package/dist/getMachineId-win-IIF36LEJ.js +44 -0
- package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
- package/dist/index.js +112703 -0
- package/dist/index.js.map +1 -0
- package/dist/lib-R6DEEJCP.js +7623 -0
- package/dist/lib-R6DEEJCP.js.map +1 -0
- package/dist/pipeline-IAVVAKTU.js +120 -0
- package/dist/pipeline-IAVVAKTU.js.map +1 -0
- package/dist/query-NTP5NVXN.js +25 -0
- package/dist/query-NTP5NVXN.js.map +1 -0
- package/dist/routing-BAEPFB7V.js +390 -0
- package/dist/routing-BAEPFB7V.js.map +1 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
- package/dist/stripe-payment-history-SJPKA63N.js +67 -0
- package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
- package/dist/support-memory-WSG7SDKG.js +10 -0
- package/dist/support-memory-WSG7SDKG.js.map +1 -0
- package/package.json +10 -7
- package/.env.encrypted +0 -0
- package/CHANGELOG.md +0 -35
- package/data/tt-archive-dataset.json +0 -1
- package/data/validate-test-dataset.json +0 -97
- package/docs/CLI-AUTH.md +0 -504
- package/preload.ts +0 -18
- package/src/__tests__/init.test.ts +0 -74
- package/src/alignment-test.ts +0 -64
- package/src/check-apps.ts +0 -16
- package/src/commands/auth/decrypt.ts +0 -123
- package/src/commands/auth/encrypt.ts +0 -81
- package/src/commands/auth/index.ts +0 -50
- package/src/commands/auth/keygen.ts +0 -41
- package/src/commands/auth/status.ts +0 -164
- package/src/commands/axiom/forensic.ts +0 -868
- package/src/commands/axiom/index.ts +0 -697
- package/src/commands/build-dataset.ts +0 -311
- package/src/commands/db-status.ts +0 -47
- package/src/commands/deploys.ts +0 -219
- package/src/commands/eval-local/compare.ts +0 -171
- package/src/commands/eval-local/health.ts +0 -212
- package/src/commands/eval-local/index.ts +0 -76
- package/src/commands/eval-local/real-tools.ts +0 -416
- package/src/commands/eval-local/run.ts +0 -1168
- package/src/commands/eval-local/score-production.ts +0 -256
- package/src/commands/eval-local/seed.ts +0 -276
- package/src/commands/eval-pipeline/index.ts +0 -53
- package/src/commands/eval-pipeline/real-tools.ts +0 -492
- package/src/commands/eval-pipeline/run.ts +0 -1316
- package/src/commands/eval-pipeline/seed.ts +0 -395
- package/src/commands/eval-prompt.ts +0 -496
- package/src/commands/eval.test.ts +0 -253
- package/src/commands/eval.ts +0 -108
- package/src/commands/faq-classify.ts +0 -460
- package/src/commands/faq-cluster.ts +0 -135
- package/src/commands/faq-extract.ts +0 -249
- package/src/commands/faq-mine.ts +0 -432
- package/src/commands/faq-review.ts +0 -426
- package/src/commands/front/index.ts +0 -351
- package/src/commands/front/pull-conversations.ts +0 -275
- package/src/commands/front/tags.ts +0 -825
- package/src/commands/front-cache.ts +0 -1277
- package/src/commands/front-stats.ts +0 -75
- package/src/commands/health.test.ts +0 -82
- package/src/commands/health.ts +0 -362
- package/src/commands/init.test.ts +0 -89
- package/src/commands/init.ts +0 -106
- package/src/commands/inngest/client.ts +0 -294
- package/src/commands/inngest/events.ts +0 -296
- package/src/commands/inngest/investigate.ts +0 -382
- package/src/commands/inngest/runs.ts +0 -149
- package/src/commands/inngest/signal.ts +0 -143
- package/src/commands/kb-sync.ts +0 -498
- package/src/commands/memory/find.ts +0 -135
- package/src/commands/memory/get.ts +0 -87
- package/src/commands/memory/index.ts +0 -97
- package/src/commands/memory/stats.ts +0 -163
- package/src/commands/memory/store.ts +0 -49
- package/src/commands/memory/vote.ts +0 -159
- package/src/commands/pipeline.ts +0 -127
- package/src/commands/responses.ts +0 -856
- package/src/commands/tools.ts +0 -293
- package/src/commands/wizard.ts +0 -319
- package/src/index.ts +0 -172
- package/src/lib/crypto.ts +0 -56
- package/src/lib/env-loader.ts +0 -206
- package/src/lib/onepassword.ts +0 -137
- package/src/test-agent-local.ts +0 -115
- package/tsconfig.json +0 -11
- package/vitest.config.ts +0 -10
|
@@ -1,496 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Prompt Evaluation Harness
|
|
3
|
-
*
|
|
4
|
-
* Tests prompt changes against real trigger messages.
|
|
5
|
-
* Runs the agent with mocked tools, scores output quality.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* skill eval-prompt # Run with current prompt
|
|
9
|
-
* skill eval-prompt run --prompt /path/to/new.md # Test a different prompt
|
|
10
|
-
* skill eval-prompt compare --candidate /path/to/new.md # Side-by-side comparison
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
import { readFileSync, writeFileSync, existsSync } from 'fs'
|
|
14
|
-
import { join } from 'path'
|
|
15
|
-
import type { Command } from 'commander'
|
|
16
|
-
import { generateText, stepCountIs, tool } from 'ai'
|
|
17
|
-
import { z } from 'zod'
|
|
18
|
-
|
|
19
|
-
// Import the current production prompt
|
|
20
|
-
import { SUPPORT_AGENT_PROMPT } from '@skillrecordings/core/agent'
|
|
21
|
-
|
|
22
|
-
// ============================================================================
|
|
23
|
-
// Quality Scorers (extracted from response-quality.eval.ts)
|
|
24
|
-
// ============================================================================
|
|
25
|
-
|
|
26
|
-
/**
 * Phrases that expose internal routing, tooling, or configuration details.
 * Each hit is reported as an "internal leak".
 */
const leakPatterns: readonly RegExp[] = [
  /no instructor (configured|routing|assigned|set up)/i,
  /can't route this/i,
  /unable to route/i,
  /no (instructor|channel|inbox) (is )?configured/i,
  /system (doesn't|does not|cannot|can't)/i,
  /not configured for this app/i,
  /routing (not )?(set up|configured)/i,
  /tool (failed|error|returned)/i,
  /API (error|failed|token)/i,
  /forwarding (to|this)/i,
  /I'll note that this/i,
  /You'll want to reach out through/i,
  /should be routed/i,
  /should go to/i,
  /falls outside/i,
]

/**
 * Phrases where the agent talks about the message or about its own decision
 * instead of answering. Each hit is reported as "meta-commentary".
 */
const metaPatterns: readonly RegExp[] = [
  /^This (is|appears to be|seems|looks like) (a |an )?(clearly )?/i,
  /I (won't|will not|shouldn't|should not) (respond|draft|reply)/i,
  /I don't need to respond/i,
  /this (should|needs to) (go to|be forwarded|be routed)/i,
  /per my guidelines/i,
  /outside (the scope|my scope|customer support)/i,
  /not a (support request|customer service issue)/i,
  /is clearly (not|meant|personal|business)/i,
  /This (falls|is) outside/i,
]

/** Stock phrases and punctuation that must not appear in a drafted reply. */
const bannedPatterns: readonly RegExp[] = [
  /^Great!/i,
  /I'd recommend/i,
  /I would recommend/i,
  /I'd suggest/i,
  /I would suggest/i,
  /Is there a specific area you're curious about/i,
  /Would you like help with/i,
  /Let me know if you have any other questions/i,
  /I hope this helps/i,
  /Happy to help/i,
  /I understand/i,
  /I hear you/i,
  /I apologize for any inconvenience/i,
  /Thanks (so much )?for (reaching out|sharing)/i,
  /—/, // em dash
]

/** Outcome of scoring one response against the three pattern tables. */
interface ScoreResult {
  /** Matched text for every leak pattern that hit. */
  leaks: string[]
  /** Matched text for every meta-commentary pattern that hit. */
  meta: string[]
  /** Matched text for every banned-phrase pattern that hit. */
  banned: string[]
  /** True only when all three lists are empty. */
  passed: boolean
}

/**
 * Returns the first match of each pattern that occurs in `text`, in table
 * order. A pattern contributes at most one entry.
 */
function collectMatches(text: string, patterns: readonly RegExp[]): string[] {
  const hits: string[] = []
  for (const pattern of patterns) {
    const match = text.match(pattern)
    if (match) hits.push(match[0])
  }
  return hits
}

/**
 * Scores a drafted response against the leak, meta-commentary, and banned
 * phrase tables.
 *
 * The pattern tables are written with straight apostrophes, so typographic
 * apostrophes (U+2018 / U+2019) are folded to `'` before matching; otherwise
 * "I’d recommend" or "can’t route this" would slip through unnoticed. As a
 * consequence, reported matches always contain the straight apostrophe.
 *
 * @param text - The response body to check.
 * @returns The matched phrases per category and an overall pass flag.
 */
function scoreResponse(text: string): ScoreResult {
  const normalized = text.replace(/[\u2018\u2019]/g, "'")

  const leaks = collectMatches(normalized, leakPatterns)
  const meta = collectMatches(normalized, metaPatterns)
  const banned = collectMatches(normalized, bannedPatterns)

  return {
    leaks,
    meta,
    banned,
    passed: leaks.length === 0 && meta.length === 0 && banned.length === 0,
  }
}
|
|
108
|
-
|
|
109
|
-
// ============================================================================
|
|
110
|
-
// Mock Tools (minimal implementations for eval)
|
|
111
|
-
// ============================================================================
|
|
112
|
-
|
|
113
|
-
/**
 * Stand-ins for the agent's tools. Every tool returns a fixed or echoed
 * payload, so an eval run only exercises the prompt and the model.
 */
const mockTools = {
  // Always reports a found user with a single purchase.
  lookupUser: tool({
    description: 'Look up user by email',
    inputSchema: z.object({
      email: z.string(),
      appId: z.string(),
    }),
    execute: async () => {
      return {
        found: true,
        user: { id: 'mock-user', email: '[EMAIL]', name: 'Customer' },
        purchases: [{ id: 'purch-1', product: 'Total TypeScript', date: '2025-01-01' }],
      }
    },
  }),

  // Always returns empty result lists.
  searchKnowledge: tool({
    description: 'Search knowledge base',
    inputSchema: z.object({ query: z.string(), appId: z.string() }),
    execute: async () => {
      return {
        similarTickets: [],
        knowledge: [],
        goodResponses: [],
      }
    },
  }),

  // Echoes the drafted body; the eval runner reads the body from this call's input.
  draftResponse: tool({
    description: 'Draft a response',
    inputSchema: z.object({ body: z.string() }),
    execute: async ({ body }) => {
      return { drafted: true, body }
    },
  }),

  // Echoes the escalation request.
  escalateToHuman: tool({
    description: 'Escalate to human',
    inputSchema: z.object({
      reason: z.string(),
      urgency: z.enum(['low', 'medium', 'high']),
    }),
    execute: async ({ reason, urgency }) => {
      return { escalated: true, reason, urgency }
    },
  }),

  // Echoes the assignment request with a fixed pending status.
  assignToInstructor: tool({
    description: 'Assign to instructor',
    inputSchema: z.object({
      conversationId: z.string(),
      reason: z.string(),
    }),
    execute: async ({ conversationId, reason }) => {
      return {
        status: 'pending_approval',
        conversationId,
        reason,
        message: 'Instructor assignment submitted for approval',
      }
    },
  }),

  // Always returns no memories.
  memory_search: tool({
    description: 'Search memory',
    inputSchema: z.object({ query: z.string() }),
    execute: async () => {
      return { results: [], total: 0 }
    },
  }),

  // Always returns no content hits.
  searchProductContent: tool({
    description: 'Search product content',
    inputSchema: z.object({ query: z.string() }),
    execute: async () => {
      return { results: [] }
    },
  }),
}
|
|
178
|
-
|
|
179
|
-
// ============================================================================
|
|
180
|
-
// Dataset Types
|
|
181
|
-
// ============================================================================
|
|
182
|
-
|
|
183
|
-
/** The inbound message of a dataset sample; subject and body form the eval input. */
interface DatasetTriggerMessage {
  subject: string
  body: string
  timestamp: number
}

/** The response stored alongside a dataset sample. */
interface DatasetAgentResponse {
  text: string
  category: string
  timestamp: string
}

/** One entry of the eval dataset JSON file. */
interface DatasetSample {
  id: string
  app: string
  conversationId: string
  triggerMessage: DatasetTriggerMessage
  agentResponse: DatasetAgentResponse
}
|
|
198
|
-
|
|
199
|
-
// ============================================================================
|
|
200
|
-
// Eval Runner
|
|
201
|
-
// ============================================================================
|
|
202
|
-
|
|
203
|
-
/** Outcome of running one dataset sample through the agent. */
interface EvalResult {
  /** First 8 characters of the sample id. */
  id: string
  /** First 100 characters of the "Subject: …" input sent to the model. */
  input: string
  /** Drafted body, '' when nothing was drafted, or "ERROR: …" when the run threw. */
  output: string
  /** Quality score computed from `output`. */
  score: ScoreResult
  /** Wall-clock time spent on this sample, in milliseconds. */
  durationMs: number
  /** Names of the tools the model called, across all steps. */
  toolsCalled: string[]
  /** True when no draftResponse call was made (also set when the run threw). */
  noDraft: boolean
}
|
|
212
|
-
|
|
213
|
-
/**
 * Runs one dataset sample through the model with the mock tools and scores
 * the body passed to the `draftResponse` tool.
 *
 * Never rejects because of a model or tool failure: any error thrown during
 * generation or scoring is folded into an `EvalResult` whose output starts
 * with "ERROR: " and whose `noDraft` flag is set.
 *
 * NOTE(review): the system prompt always appends "App: total-typescript";
 * `sample.app` is not consulted. Confirm that is intended for multi-app datasets.
 *
 * @param prompt - System prompt under evaluation.
 * @param sample - Dataset entry supplying the trigger message.
 * @param model - Model identifier handed to `generateText`.
 * @returns The scored result for this sample.
 */
async function runSingleEval(
  prompt: string,
  sample: DatasetSample,
  model: string
): Promise<EvalResult> {
  const startedAt = Date.now()
  const { subject, body } = sample.triggerMessage
  const input = `Subject: ${subject}\n\n${body}`

  try {
    const { steps } = await generateText({
      model,
      system: prompt + '\n\nApp: total-typescript',
      messages: [{ role: 'user', content: input }],
      tools: mockTools,
      stopWhen: stepCountIs(10),
    })

    // Flatten tool calls across all steps once; both lookups below use it.
    const toolCalls = steps.flatMap((step) => step.toolCalls || [])
    const draftCall = toolCalls.find((call) => call.toolName === 'draftResponse')
    const toolsCalled = toolCalls.map((call) => call.toolName)

    // The first draftResponse call carries the text to score.
    let output = ''
    if (draftCall) {
      output = (draftCall.input as { body: string }).body
    }

    return {
      id: sample.id.slice(0, 8),
      input: input.slice(0, 100),
      output,
      score: scoreResponse(output),
      durationMs: Date.now() - startedAt,
      toolsCalled,
      noDraft: !draftCall,
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Unknown'
    return {
      id: sample.id.slice(0, 8),
      input: input.slice(0, 100),
      output: `ERROR: ${reason}`,
      score: { leaks: [], meta: [], banned: [], passed: false },
      durationMs: Date.now() - startedAt,
      toolsCalled: [],
      noDraft: true,
    }
  }
}
|
|
262
|
-
|
|
263
|
-
/**
 * Runs the eval dataset through the agent with a single prompt and reports
 * how many drafted responses pass the quality checks.
 *
 * Terminates the process: exit code 0 when at least 80% of the drafted
 * responses pass, 1 otherwise (including when nothing was drafted, when the
 * prompt or dataset cannot be loaded, or when there are no samples to run).
 *
 * With `json` set, stdout carries only the JSON document so it can be piped
 * into other tools; progress and the human-readable summary go to stderr.
 *
 * @param options.prompt - Path to a prompt file; the production prompt is used when omitted.
 * @param options.dataset - Path to the dataset JSON (default `data/eval-dataset.json`).
 * @param options.limit - Maximum number of samples to run (default 10).
 * @param options.model - Model identifier (default `anthropic/claude-haiku-4-5`).
 * @param options.output - When set, the per-sample results are written to this file.
 * @param options.json - Emit a machine-readable JSON report on stdout.
 */
async function runEval(options: {
  prompt?: string
  dataset?: string
  limit?: number
  model?: string
  output?: string
  json?: boolean
}): Promise<void> {
  const {
    prompt: promptPath,
    dataset: datasetPath = 'data/eval-dataset.json',
    limit = 10,
    model = 'anthropic/claude-haiku-4-5', // Fast + cheap for evals
    output: outputPath,
    json = false,
  } = options

  // Keep stdout clean in JSON mode: human-readable text is diverted to stderr.
  const report = (message: string): void => {
    if (json) {
      console.error(message)
    } else {
      console.log(message)
    }
  }
  const progressStream = json ? process.stderr : process.stdout

  // Load prompt
  let prompt = SUPPORT_AGENT_PROMPT
  if (promptPath) {
    if (!existsSync(promptPath)) {
      console.error(`Prompt file not found: ${promptPath}`)
      process.exit(1)
    }
    prompt = readFileSync(promptPath, 'utf-8')
    report(`Using prompt from: ${promptPath}`)
  } else {
    report('Using production prompt')
  }

  // Load dataset
  if (!existsSync(datasetPath)) {
    console.error(`Dataset not found: ${datasetPath}`)
    process.exit(1)
  }
  let parsed: unknown
  try {
    parsed = JSON.parse(readFileSync(datasetPath, 'utf-8'))
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.error(`Failed to read dataset ${datasetPath}: ${reason}`)
    process.exit(1)
  }
  if (!Array.isArray(parsed)) {
    console.error(`Dataset must be a JSON array of samples: ${datasetPath}`)
    process.exit(1)
  }
  const samples: DatasetSample[] = parsed.slice(0, limit)

  // An empty run would otherwise print "NaN%" and divide by zero below.
  if (samples.length === 0) {
    console.error(`No samples to evaluate (dataset: ${datasetPath}, limit: ${limit})`)
    process.exit(1)
  }

  report(`\n🧪 Running eval on ${samples.length} samples (model: ${model})\n`)

  const results: EvalResult[] = []
  let passed = 0
  let failed = 0
  let noDraft = 0

  for (let i = 0; i < samples.length; i++) {
    progressStream.write(`\r Processing ${i + 1}/${samples.length}...`)
    const sample = samples[i]
    if (!sample) continue
    const result = await runSingleEval(prompt, sample, model)
    results.push(result)

    // A sample without a draft is tallied separately and never counts as pass or fail.
    if (result.noDraft) {
      noDraft++
    } else if (result.score.passed) {
      passed++
    } else {
      failed++
    }
  }

  report('\n')

  // Summary
  report('📊 Results:')
  report(` ✅ Passed: ${passed}/${samples.length} (${((passed / samples.length) * 100).toFixed(1)}%)`)
  report(` ❌ Failed: ${failed}/${samples.length}`)
  report(` 🚫 No draft: ${noDraft}/${samples.length}`)

  // Issue breakdown
  const allLeaks = results.flatMap((r) => r.score.leaks)
  const allMeta = results.flatMap((r) => r.score.meta)
  const allBanned = results.flatMap((r) => r.score.banned)

  report('\n📋 Issue breakdown:')
  report(` 🚨 Internal leaks: ${allLeaks.length}`)
  report(` 💬 Meta-commentary: ${allMeta.length}`)
  report(` 🚫 Banned phrases: ${allBanned.length}`)

  // Show failures (human-readable mode only; JSON mode carries them in `results`)
  const failures = results.filter((r) => !r.noDraft && !r.score.passed)
  if (failures.length > 0 && !json) {
    console.log('\n--- FAILURES ---\n')
    for (const f of failures.slice(0, 10)) {
      const issues = [
        ...f.score.leaks.map((l) => `LEAK: "${l}"`),
        ...f.score.meta.map((m) => `META: "${m}"`),
        ...f.score.banned.map((b) => `BANNED: "${b}"`),
      ]
      console.log(`[${f.id}] ${issues.join(', ')}`)
      console.log(` Output: ${f.output.slice(0, 150)}...\n`)
    }
  }

  // JSON output: the only thing written to stdout in JSON mode
  if (json) {
    console.log(JSON.stringify({
      summary: { total: samples.length, passed, failed, noDraft },
      issues: { leaks: allLeaks.length, meta: allMeta.length, banned: allBanned.length },
      results,
    }, null, 2))
  }

  // Save results
  if (outputPath) {
    writeFileSync(outputPath, JSON.stringify(results, null, 2))
    report(`\nSaved results to ${outputPath}`)
  }

  // Exit code based on pass rate over drafted responses; no drafts at all is a failure.
  const scored = passed + failed
  const passRate = scored === 0 ? 0 : passed / scored
  process.exit(passRate >= 0.8 ? 0 : 1)
}
|
|
376
|
-
|
|
377
|
-
/**
 * Runs every sample through both a baseline and a candidate prompt and
 * reports which prompt yields more passing drafts.
 *
 * Terminates the process: exit code 1 when the candidate passes fewer samples
 * than the baseline, or when an input file is missing/invalid or there are no
 * samples to run; exit code 0 otherwise.
 *
 * NOTE(review): "improved"/"regressed" compare raw issue counts. A run that
 * produced no draft (or errored) has zero issues, so it can register as an
 * improvement over a baseline that drafted a flawed reply. Confirm whether
 * such samples should be excluded.
 *
 * @param options.baseline - Path to the baseline prompt; the production prompt is used when omitted.
 * @param options.candidate - Path to the candidate prompt file.
 * @param options.dataset - Path to the dataset JSON (default `data/eval-dataset.json`).
 * @param options.limit - Maximum number of samples to run (default 10).
 * @param options.model - Model identifier (default `anthropic/claude-haiku-4-5`).
 */
async function comparePrompts(options: {
  baseline?: string
  candidate: string
  dataset?: string
  limit?: number
  model?: string
}): Promise<void> {
  const {
    baseline,
    candidate,
    dataset: datasetPath = 'data/eval-dataset.json',
    limit = 10,
    model = 'anthropic/claude-haiku-4-5',
  } = options

  // Fail with a clear message instead of an ENOENT stack trace.
  if (baseline && !existsSync(baseline)) {
    console.error(`Prompt file not found: ${baseline}`)
    process.exit(1)
  }
  if (!existsSync(candidate)) {
    console.error(`Prompt file not found: ${candidate}`)
    process.exit(1)
  }
  if (!existsSync(datasetPath)) {
    console.error(`Dataset not found: ${datasetPath}`)
    process.exit(1)
  }

  // Load prompts
  const baselinePrompt = baseline
    ? readFileSync(baseline, 'utf-8')
    : SUPPORT_AGENT_PROMPT
  const candidatePrompt = readFileSync(candidate, 'utf-8')

  // Load dataset
  let parsed: unknown
  try {
    parsed = JSON.parse(readFileSync(datasetPath, 'utf-8'))
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.error(`Failed to read dataset ${datasetPath}: ${reason}`)
    process.exit(1)
  }
  if (!Array.isArray(parsed)) {
    console.error(`Dataset must be a JSON array of samples: ${datasetPath}`)
    process.exit(1)
  }
  const samples: DatasetSample[] = parsed.slice(0, limit)

  // Without samples the pass rates would print as "NaN%" and the comparison
  // would falsely report "No significant difference" with exit code 0.
  if (samples.length === 0) {
    console.error(`No samples to evaluate (dataset: ${datasetPath}, limit: ${limit})`)
    process.exit(1)
  }

  console.log(`\n🔬 Comparing prompts on ${samples.length} samples\n`)
  console.log(` Baseline: ${baseline || 'production'}`)
  console.log(` Candidate: ${candidate}`)
  console.log('')

  const countIssues = (score: ScoreResult): number =>
    score.leaks.length + score.meta.length + score.banned.length

  let baselinePassed = 0
  let candidatePassed = 0
  const comparisons: Array<{
    id: string
    baselineScore: ScoreResult
    candidateScore: ScoreResult
    improved: boolean
    regressed: boolean
  }> = []

  for (let i = 0; i < samples.length; i++) {
    const sample = samples[i]
    if (!sample) continue
    process.stdout.write(`\r Processing ${i + 1}/${samples.length}...`)

    // Sequential on purpose: same order and load as the original implementation.
    const baselineResult = await runSingleEval(baselinePrompt, sample, model)
    const candidateResult = await runSingleEval(candidatePrompt, sample, model)

    // Only a drafted response that scored clean counts as a pass.
    if (!baselineResult.noDraft && baselineResult.score.passed) baselinePassed++
    if (!candidateResult.noDraft && candidateResult.score.passed) candidatePassed++

    const baselineIssues = countIssues(baselineResult.score)
    const candidateIssues = countIssues(candidateResult.score)

    comparisons.push({
      id: sample.id.slice(0, 8),
      baselineScore: baselineResult.score,
      candidateScore: candidateResult.score,
      improved: candidateIssues < baselineIssues,
      regressed: candidateIssues > baselineIssues,
    })
  }

  console.log('\n\n📊 Comparison Results:\n')
  console.log(` Baseline pass rate: ${baselinePassed}/${samples.length} (${((baselinePassed / samples.length) * 100).toFixed(1)}%)`)
  console.log(` Candidate pass rate: ${candidatePassed}/${samples.length} (${((candidatePassed / samples.length) * 100).toFixed(1)}%)`)

  const improved = comparisons.filter((c) => c.improved).length
  const regressed = comparisons.filter((c) => c.regressed).length
  const same = comparisons.length - improved - regressed

  console.log(`\n ⬆️ Improved: ${improved}`)
  console.log(` ⬇️ Regressed: ${regressed}`)
  console.log(` ➡️ Same: ${same}`)

  // The verdict is decided by pass counts, not by the improved/regressed tallies.
  if (candidatePassed > baselinePassed) {
    console.log('\n✅ Candidate is BETTER')
    process.exit(0)
  } else if (candidatePassed < baselinePassed) {
    console.log('\n❌ Candidate is WORSE')
    process.exit(1)
  } else {
    console.log('\n➡️ No significant difference')
    process.exit(0)
  }
}
|
|
463
|
-
|
|
464
|
-
// ============================================================================
|
|
465
|
-
// CLI Registration
|
|
466
|
-
// ============================================================================
|
|
467
|
-
|
|
468
|
-
/**
 * Registers the `eval-prompt` command group on the given commander program:
 *
 * - `eval-prompt run` evaluates a single prompt.
 * - `eval-prompt compare` evaluates a candidate prompt against a baseline.
 * - `eval-prompt` with no subcommand runs the eval with default settings.
 *
 * @param program - The commander program (or parent command) to extend.
 */
export function registerEvalPromptCommands(program: Command): void {
  // Commander calls option processors as (value, previousValue). Passing
  // `parseInt` directly turns the previous value into the radix, so a repeated
  // flag such as `-l 5 -l 10` would parse "10" in base 5. Pin the radix to 10.
  const parseLimit = (value: string): number => Number.parseInt(value, 10)

  const evalPrompt = program
    .command('eval-prompt')
    .description('Evaluate prompt quality against real trigger messages')

  evalPrompt
    .command('run')
    .description('Run eval with a prompt')
    .option('-p, --prompt <file>', 'Path to prompt file (default: production)')
    .option('-d, --dataset <file>', 'Path to dataset (default: data/eval-dataset.json)')
    .option('-l, --limit <n>', 'Max samples to eval', parseLimit)
    .option('-m, --model <model>', 'Model to use (default: claude-haiku-4-5)')
    .option('-o, --output <file>', 'Save results to file')
    .option('--json', 'JSON output')
    .action(runEval)

  evalPrompt
    .command('compare')
    .description('Compare two prompts side-by-side')
    .requiredOption('-c, --candidate <file>', 'Candidate prompt file')
    .option('-b, --baseline <file>', 'Baseline prompt (default: production)')
    .option('-d, --dataset <file>', 'Path to dataset')
    .option('-l, --limit <n>', 'Max samples', parseLimit)
    .option('-m, --model <model>', 'Model to use')
    .action(comparePrompts)

  // Default action runs eval
  evalPrompt.action(runEval)
}
|