@skillrecordings/cli 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skill.mjs +21 -0
- package/dist/chunk-2NCCVTEE.js +22342 -0
- package/dist/chunk-2NCCVTEE.js.map +1 -0
- package/dist/chunk-3E3GYSZR.js +7071 -0
- package/dist/chunk-3E3GYSZR.js.map +1 -0
- package/dist/chunk-F4EM72IH.js +86 -0
- package/dist/chunk-F4EM72IH.js.map +1 -0
- package/dist/chunk-FGP7KUQW.js +432 -0
- package/dist/chunk-FGP7KUQW.js.map +1 -0
- package/dist/chunk-H3D6VCME.js +55 -0
- package/dist/chunk-H3D6VCME.js.map +1 -0
- package/dist/chunk-HK3PEWFD.js +208 -0
- package/dist/chunk-HK3PEWFD.js.map +1 -0
- package/dist/chunk-KEV3QKXP.js +4495 -0
- package/dist/chunk-KEV3QKXP.js.map +1 -0
- package/dist/chunk-MG37YDAK.js +882 -0
- package/dist/chunk-MG37YDAK.js.map +1 -0
- package/dist/chunk-MLNDSBZ4.js +482 -0
- package/dist/chunk-MLNDSBZ4.js.map +1 -0
- package/dist/chunk-N2WIV2JV.js +22 -0
- package/dist/chunk-N2WIV2JV.js.map +1 -0
- package/dist/chunk-PWWRCN5W.js +2067 -0
- package/dist/chunk-PWWRCN5W.js.map +1 -0
- package/dist/chunk-SKHBM3XP.js +7746 -0
- package/dist/chunk-SKHBM3XP.js.map +1 -0
- package/dist/chunk-WFANXVQG.js +64 -0
- package/dist/chunk-WFANXVQG.js.map +1 -0
- package/dist/chunk-WYKL32C3.js +275 -0
- package/dist/chunk-WYKL32C3.js.map +1 -0
- package/dist/chunk-ZNF7XD2S.js +134 -0
- package/dist/chunk-ZNF7XD2S.js.map +1 -0
- package/dist/config-AUAIYDSI.js +20 -0
- package/dist/config-AUAIYDSI.js.map +1 -0
- package/dist/fileFromPath-XN7LXIBI.js +134 -0
- package/dist/fileFromPath-XN7LXIBI.js.map +1 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js +42 -0
- package/dist/getMachineId-bsd-KW2E7VK3.js.map +1 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js +42 -0
- package/dist/getMachineId-darwin-ROXJUJX5.js.map +1 -0
- package/dist/getMachineId-linux-KVZEHQSU.js +34 -0
- package/dist/getMachineId-linux-KVZEHQSU.js.map +1 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js +25 -0
- package/dist/getMachineId-unsupported-PPRILPPA.js.map +1 -0
- package/dist/getMachineId-win-IIF36LEJ.js +44 -0
- package/dist/getMachineId-win-IIF36LEJ.js.map +1 -0
- package/dist/index.js +112703 -0
- package/dist/index.js.map +1 -0
- package/dist/lib-R6DEEJCP.js +7623 -0
- package/dist/lib-R6DEEJCP.js.map +1 -0
- package/dist/pipeline-IAVVAKTU.js +120 -0
- package/dist/pipeline-IAVVAKTU.js.map +1 -0
- package/dist/query-NTP5NVXN.js +25 -0
- package/dist/query-NTP5NVXN.js.map +1 -0
- package/dist/routing-BAEPFB7V.js +390 -0
- package/dist/routing-BAEPFB7V.js.map +1 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js +56 -0
- package/dist/stripe-lookup-charge-EPRUMZDL.js.map +1 -0
- package/dist/stripe-payment-history-SJPKA63N.js +67 -0
- package/dist/stripe-payment-history-SJPKA63N.js.map +1 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js +58 -0
- package/dist/stripe-subscription-status-L4Z65GB3.js.map +1 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js +54 -0
- package/dist/stripe-verify-refund-FZDKCIUQ.js.map +1 -0
- package/dist/support-memory-WSG7SDKG.js +10 -0
- package/dist/support-memory-WSG7SDKG.js.map +1 -0
- package/package.json +10 -7
- package/.env.encrypted +0 -0
- package/CHANGELOG.md +0 -35
- package/data/tt-archive-dataset.json +0 -1
- package/data/validate-test-dataset.json +0 -97
- package/docs/CLI-AUTH.md +0 -504
- package/preload.ts +0 -18
- package/src/__tests__/init.test.ts +0 -74
- package/src/alignment-test.ts +0 -64
- package/src/check-apps.ts +0 -16
- package/src/commands/auth/decrypt.ts +0 -123
- package/src/commands/auth/encrypt.ts +0 -81
- package/src/commands/auth/index.ts +0 -50
- package/src/commands/auth/keygen.ts +0 -41
- package/src/commands/auth/status.ts +0 -164
- package/src/commands/axiom/forensic.ts +0 -868
- package/src/commands/axiom/index.ts +0 -697
- package/src/commands/build-dataset.ts +0 -311
- package/src/commands/db-status.ts +0 -47
- package/src/commands/deploys.ts +0 -219
- package/src/commands/eval-local/compare.ts +0 -171
- package/src/commands/eval-local/health.ts +0 -212
- package/src/commands/eval-local/index.ts +0 -76
- package/src/commands/eval-local/real-tools.ts +0 -416
- package/src/commands/eval-local/run.ts +0 -1168
- package/src/commands/eval-local/score-production.ts +0 -256
- package/src/commands/eval-local/seed.ts +0 -276
- package/src/commands/eval-pipeline/index.ts +0 -53
- package/src/commands/eval-pipeline/real-tools.ts +0 -492
- package/src/commands/eval-pipeline/run.ts +0 -1316
- package/src/commands/eval-pipeline/seed.ts +0 -395
- package/src/commands/eval-prompt.ts +0 -496
- package/src/commands/eval.test.ts +0 -253
- package/src/commands/eval.ts +0 -108
- package/src/commands/faq-classify.ts +0 -460
- package/src/commands/faq-cluster.ts +0 -135
- package/src/commands/faq-extract.ts +0 -249
- package/src/commands/faq-mine.ts +0 -432
- package/src/commands/faq-review.ts +0 -426
- package/src/commands/front/index.ts +0 -351
- package/src/commands/front/pull-conversations.ts +0 -275
- package/src/commands/front/tags.ts +0 -825
- package/src/commands/front-cache.ts +0 -1277
- package/src/commands/front-stats.ts +0 -75
- package/src/commands/health.test.ts +0 -82
- package/src/commands/health.ts +0 -362
- package/src/commands/init.test.ts +0 -89
- package/src/commands/init.ts +0 -106
- package/src/commands/inngest/client.ts +0 -294
- package/src/commands/inngest/events.ts +0 -296
- package/src/commands/inngest/investigate.ts +0 -382
- package/src/commands/inngest/runs.ts +0 -149
- package/src/commands/inngest/signal.ts +0 -143
- package/src/commands/kb-sync.ts +0 -498
- package/src/commands/memory/find.ts +0 -135
- package/src/commands/memory/get.ts +0 -87
- package/src/commands/memory/index.ts +0 -97
- package/src/commands/memory/stats.ts +0 -163
- package/src/commands/memory/store.ts +0 -49
- package/src/commands/memory/vote.ts +0 -159
- package/src/commands/pipeline.ts +0 -127
- package/src/commands/responses.ts +0 -856
- package/src/commands/tools.ts +0 -293
- package/src/commands/wizard.ts +0 -319
- package/src/index.ts +0 -172
- package/src/lib/crypto.ts +0 -56
- package/src/lib/env-loader.ts +0 -206
- package/src/lib/onepassword.ts +0 -137
- package/src/test-agent-local.ts +0 -115
- package/tsconfig.json +0 -11
- package/vitest.config.ts +0 -10
|
@@ -1,256 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Score REAL production responses from the dataset
|
|
3
|
-
*
|
|
4
|
-
* No mocks, no generation - just score what was actually sent to customers.
|
|
5
|
-
* This gives us the TRUE baseline quality of production.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import {
|
|
9
|
-
BannedPhrases,
|
|
10
|
-
Helpfulness,
|
|
11
|
-
InternalStateLeakage,
|
|
12
|
-
MetaCommentary,
|
|
13
|
-
ProductFabrication,
|
|
14
|
-
} from '@skillrecordings/core/evals/scorers'
|
|
15
|
-
import { readFile, writeFile } from 'fs/promises'
|
|
16
|
-
|
|
17
|
-
/** One conversation record from the production dataset file. */
interface DatasetItem {
  id: string
  app: string
  conversationId: string
  customerEmail: string
  // The inbound customer message that triggered the agent.
  triggerMessage: {
    subject: string
    body: string
    timestamp: number
  }
  // Absent when production never sent a response for this conversation.
  agentResponse?: {
    text: string
    category: string
    timestamp: string
  }
  conversationHistory: Array<{
    direction: 'in' | 'out'
    body: string
    timestamp: number
  }>
}

/** Scoring outcome for a single dataset item. */
interface ScoreResult {
  id: string
  subject: string
  // false when the item had no agent response (counted separately, scored as passing).
  hadResponse: boolean
  productionResponse: string
  scores: {
    internalLeaks: { passed: boolean; matches: string[] }
    metaCommentary: { passed: boolean; matches: string[] }
    bannedPhrases: { passed: boolean; matches: string[] }
    fabrication: { passed: boolean; matches: string[] }
    helpfulness: { score: number }
  }
  passed: boolean
  failureReasons: string[]
}

/** CLI options for the score-production command. */
interface ScoreOptions {
  dataset: string   // path to the dataset JSON file
  output?: string   // optional path to write the full JSON report
  verbose?: boolean // log each failing item as it is scored
  json?: boolean    // emit the report as JSON on stdout
}
|
|
61
|
-
|
|
62
|
-
export async function scoreProduction(options: ScoreOptions): Promise<void> {
|
|
63
|
-
const { dataset: datasetPath, output, verbose, json } = options
|
|
64
|
-
|
|
65
|
-
const datasetContent = await readFile(datasetPath, 'utf-8')
|
|
66
|
-
const dataset: DatasetItem[] = JSON.parse(datasetContent)
|
|
67
|
-
|
|
68
|
-
if (!json) {
|
|
69
|
-
console.log(`\n📊 Scoring ${dataset.length} production responses\n`)
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
const results: ScoreResult[] = []
|
|
73
|
-
let passed = 0
|
|
74
|
-
let failed = 0
|
|
75
|
-
let noResponse = 0
|
|
76
|
-
|
|
77
|
-
const failures = {
|
|
78
|
-
internalLeaks: 0,
|
|
79
|
-
metaCommentary: 0,
|
|
80
|
-
bannedPhrases: 0,
|
|
81
|
-
fabrication: 0,
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
for (const item of dataset) {
|
|
85
|
-
const response = item.agentResponse?.text || ''
|
|
86
|
-
const subject = item.triggerMessage?.subject || 'Unknown'
|
|
87
|
-
|
|
88
|
-
if (!response || response.trim().length === 0) {
|
|
89
|
-
noResponse++
|
|
90
|
-
results.push({
|
|
91
|
-
id: item.id,
|
|
92
|
-
subject,
|
|
93
|
-
hadResponse: false,
|
|
94
|
-
productionResponse: '',
|
|
95
|
-
scores: {
|
|
96
|
-
internalLeaks: { passed: true, matches: [] },
|
|
97
|
-
metaCommentary: { passed: true, matches: [] },
|
|
98
|
-
bannedPhrases: { passed: true, matches: [] },
|
|
99
|
-
fabrication: { passed: true, matches: [] },
|
|
100
|
-
helpfulness: { score: 0 },
|
|
101
|
-
},
|
|
102
|
-
passed: true, // No response = can't fail quality
|
|
103
|
-
failureReasons: [],
|
|
104
|
-
})
|
|
105
|
-
continue
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
// Score the production response
|
|
109
|
-
const leakResult = InternalStateLeakage({ output: response })
|
|
110
|
-
const metaResult = MetaCommentary({ output: response })
|
|
111
|
-
const bannedResult = BannedPhrases({ output: response })
|
|
112
|
-
const fabResult = ProductFabrication({ output: response })
|
|
113
|
-
const helpResult = Helpfulness({ output: response })
|
|
114
|
-
|
|
115
|
-
const scores = {
|
|
116
|
-
internalLeaks: {
|
|
117
|
-
passed: leakResult.score === 1,
|
|
118
|
-
matches: leakResult.metadata?.foundLeaks || [],
|
|
119
|
-
},
|
|
120
|
-
metaCommentary: {
|
|
121
|
-
passed: metaResult.score === 1,
|
|
122
|
-
matches: metaResult.metadata?.foundMeta || [],
|
|
123
|
-
},
|
|
124
|
-
bannedPhrases: {
|
|
125
|
-
passed: bannedResult.score === 1,
|
|
126
|
-
matches: bannedResult.metadata?.foundBanned || [],
|
|
127
|
-
},
|
|
128
|
-
fabrication: {
|
|
129
|
-
passed: fabResult.score === 1,
|
|
130
|
-
matches: fabResult.metadata?.foundFabrication || [],
|
|
131
|
-
},
|
|
132
|
-
helpfulness: {
|
|
133
|
-
score: helpResult.score,
|
|
134
|
-
},
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
const failureReasons: string[] = []
|
|
138
|
-
if (!scores.internalLeaks.passed) {
|
|
139
|
-
failureReasons.push(
|
|
140
|
-
`Internal leak: ${scores.internalLeaks.matches.join(', ')}`
|
|
141
|
-
)
|
|
142
|
-
failures.internalLeaks++
|
|
143
|
-
}
|
|
144
|
-
if (!scores.metaCommentary.passed) {
|
|
145
|
-
failureReasons.push(
|
|
146
|
-
`Meta commentary: ${scores.metaCommentary.matches.join(', ')}`
|
|
147
|
-
)
|
|
148
|
-
failures.metaCommentary++
|
|
149
|
-
}
|
|
150
|
-
if (!scores.bannedPhrases.passed) {
|
|
151
|
-
failureReasons.push(
|
|
152
|
-
`Banned phrase: ${scores.bannedPhrases.matches.join(', ')}`
|
|
153
|
-
)
|
|
154
|
-
failures.bannedPhrases++
|
|
155
|
-
}
|
|
156
|
-
if (!scores.fabrication.passed) {
|
|
157
|
-
failureReasons.push(
|
|
158
|
-
`Fabrication: ${scores.fabrication.matches.join(', ')}`
|
|
159
|
-
)
|
|
160
|
-
failures.fabrication++
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
const itemPassed = failureReasons.length === 0
|
|
164
|
-
if (itemPassed) {
|
|
165
|
-
passed++
|
|
166
|
-
} else {
|
|
167
|
-
failed++
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
results.push({
|
|
171
|
-
id: item.id,
|
|
172
|
-
subject,
|
|
173
|
-
hadResponse: true,
|
|
174
|
-
productionResponse: response,
|
|
175
|
-
scores,
|
|
176
|
-
passed: itemPassed,
|
|
177
|
-
failureReasons,
|
|
178
|
-
})
|
|
179
|
-
|
|
180
|
-
if (verbose && !itemPassed) {
|
|
181
|
-
console.log(`❌ ${subject.slice(0, 60)}...`)
|
|
182
|
-
for (const reason of failureReasons) {
|
|
183
|
-
console.log(` └─ ${reason}`)
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// Summary
|
|
189
|
-
const withResponses = passed + failed
|
|
190
|
-
const passRate = withResponses > 0 ? (passed / withResponses) * 100 : 0
|
|
191
|
-
|
|
192
|
-
if (output) {
|
|
193
|
-
await writeFile(
|
|
194
|
-
output,
|
|
195
|
-
JSON.stringify(
|
|
196
|
-
{
|
|
197
|
-
summary: {
|
|
198
|
-
total: dataset.length,
|
|
199
|
-
withResponses,
|
|
200
|
-
noResponse,
|
|
201
|
-
passed,
|
|
202
|
-
failed,
|
|
203
|
-
passRate,
|
|
204
|
-
failures,
|
|
205
|
-
},
|
|
206
|
-
results,
|
|
207
|
-
},
|
|
208
|
-
null,
|
|
209
|
-
2
|
|
210
|
-
)
|
|
211
|
-
)
|
|
212
|
-
if (!json) console.log(`Results saved to ${output}`)
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
if (json) {
|
|
216
|
-
console.log(
|
|
217
|
-
JSON.stringify(
|
|
218
|
-
{
|
|
219
|
-
summary: {
|
|
220
|
-
total: dataset.length,
|
|
221
|
-
withResponses,
|
|
222
|
-
noResponse,
|
|
223
|
-
passed,
|
|
224
|
-
failed,
|
|
225
|
-
passRate,
|
|
226
|
-
failures,
|
|
227
|
-
},
|
|
228
|
-
results,
|
|
229
|
-
},
|
|
230
|
-
null,
|
|
231
|
-
2
|
|
232
|
-
)
|
|
233
|
-
)
|
|
234
|
-
} else {
|
|
235
|
-
console.log('📊 Production Response Quality\n')
|
|
236
|
-
console.log(`Total conversations: ${dataset.length}`)
|
|
237
|
-
console.log(` With response: ${withResponses}`)
|
|
238
|
-
console.log(` No response: ${noResponse}`)
|
|
239
|
-
console.log('')
|
|
240
|
-
console.log(`Quality (responses only):`)
|
|
241
|
-
console.log(` ✅ Passed: ${passed} (${passRate.toFixed(1)}%)`)
|
|
242
|
-
console.log(` ❌ Failed: ${failed}`)
|
|
243
|
-
|
|
244
|
-
if (failed > 0) {
|
|
245
|
-
console.log('\nFailure breakdown:')
|
|
246
|
-
if (failures.internalLeaks > 0)
|
|
247
|
-
console.log(` 🚨 Internal leaks: ${failures.internalLeaks}`)
|
|
248
|
-
if (failures.metaCommentary > 0)
|
|
249
|
-
console.log(` 💬 Meta-commentary: ${failures.metaCommentary}`)
|
|
250
|
-
if (failures.bannedPhrases > 0)
|
|
251
|
-
console.log(` 🚫 Banned phrases: ${failures.bannedPhrases}`)
|
|
252
|
-
if (failures.fabrication > 0)
|
|
253
|
-
console.log(` 🎭 Fabrication: ${failures.fabrication}`)
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
}
|
|
@@ -1,276 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Seed command for local eval environment
|
|
3
|
-
* Populates MySQL and Qdrant with test fixtures
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import { join } from 'path'
|
|
7
|
-
import { createOllamaClient } from '@skillrecordings/core/adapters/ollama'
|
|
8
|
-
import { createQdrantClient } from '@skillrecordings/core/adapters/qdrant'
|
|
9
|
-
import { readFile, readdir } from 'fs/promises'
|
|
10
|
-
import { glob } from 'glob'
|
|
11
|
-
import matter from 'gray-matter'
|
|
12
|
-
|
|
13
|
-
/** CLI options for the seed command. */
interface SeedOptions {
  clean?: boolean   // truncate existing data (and drop the Qdrant collection) first
  fixtures?: string // path to the fixtures directory; defaults to 'fixtures'
  json?: boolean    // machine-readable JSON output instead of progress logs
}

/** Counts of what was seeded, reported at the end of the run. */
interface SeedResult {
  apps: number       // rows upserted into SUPPORT_apps
  customers: number  // customer fixture files found (used by mocks, not stored in DB)
  knowledge: number  // knowledge documents loaded
  scenarios: number  // scenario JSON files counted
  embeddings: number // documents successfully embedded into Qdrant
}
|
|
26
|
-
|
|
27
|
-
export async function seed(options: SeedOptions): Promise<void> {
|
|
28
|
-
const fixturesPath = options.fixtures || 'fixtures'
|
|
29
|
-
|
|
30
|
-
if (!options.json) {
|
|
31
|
-
console.log('\n🌱 Seeding local eval environment...\n')
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
const result: SeedResult = {
|
|
35
|
-
apps: 0,
|
|
36
|
-
customers: 0,
|
|
37
|
-
knowledge: 0,
|
|
38
|
-
scenarios: 0,
|
|
39
|
-
embeddings: 0,
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
try {
|
|
43
|
-
// Get MySQL connection
|
|
44
|
-
const mysql = await import('mysql2/promise')
|
|
45
|
-
const connection = await mysql.createConnection({
|
|
46
|
-
host: 'localhost',
|
|
47
|
-
port: 3306,
|
|
48
|
-
user: 'eval_user',
|
|
49
|
-
password: 'eval_pass',
|
|
50
|
-
database: 'support_eval',
|
|
51
|
-
})
|
|
52
|
-
|
|
53
|
-
if (options.clean) {
|
|
54
|
-
if (!options.json) console.log('🧹 Cleaning existing data...')
|
|
55
|
-
await cleanDatabase(connection)
|
|
56
|
-
|
|
57
|
-
// Also clean Qdrant
|
|
58
|
-
const qdrant = createQdrantClient()
|
|
59
|
-
await qdrant.deleteCollection()
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
// 1. Seed apps
|
|
63
|
-
if (!options.json) console.log('📦 Seeding apps...')
|
|
64
|
-
const apps = await loadJsonFiles(join(fixturesPath, 'apps'))
|
|
65
|
-
result.apps = await seedApps(connection, apps)
|
|
66
|
-
|
|
67
|
-
// 2. Seed customers (stored as JSON for mock lookups)
|
|
68
|
-
if (!options.json) console.log('👥 Loading customer fixtures...')
|
|
69
|
-
const customers = await loadJsonFiles(join(fixturesPath, 'customers'))
|
|
70
|
-
result.customers = customers.length
|
|
71
|
-
// Customers are used by mock integration client, not stored in DB
|
|
72
|
-
|
|
73
|
-
// 3. Seed knowledge base with embeddings
|
|
74
|
-
if (!options.json) console.log('📚 Seeding knowledge base...')
|
|
75
|
-
const knowledge = await loadKnowledgeFiles(join(fixturesPath, 'knowledge'))
|
|
76
|
-
result.knowledge = knowledge.length
|
|
77
|
-
result.embeddings = await seedKnowledgeBase(knowledge)
|
|
78
|
-
|
|
79
|
-
// 4. Count scenarios
|
|
80
|
-
const scenarioFiles = await glob(join(fixturesPath, 'scenarios/**/*.json'))
|
|
81
|
-
result.scenarios = scenarioFiles.length
|
|
82
|
-
|
|
83
|
-
await connection.end()
|
|
84
|
-
|
|
85
|
-
if (options.json) {
|
|
86
|
-
console.log(JSON.stringify({ success: true, result }, null, 2))
|
|
87
|
-
} else {
|
|
88
|
-
console.log('\n✅ Seeding complete!\n')
|
|
89
|
-
console.log(` Apps: ${result.apps}`)
|
|
90
|
-
console.log(` Customers: ${result.customers}`)
|
|
91
|
-
console.log(` Knowledge: ${result.knowledge} documents`)
|
|
92
|
-
console.log(` Embeddings: ${result.embeddings}`)
|
|
93
|
-
console.log(` Scenarios: ${result.scenarios}\n`)
|
|
94
|
-
}
|
|
95
|
-
} catch (error) {
|
|
96
|
-
if (options.json) {
|
|
97
|
-
console.log(
|
|
98
|
-
JSON.stringify({
|
|
99
|
-
success: false,
|
|
100
|
-
error: error instanceof Error ? error.message : 'Unknown error',
|
|
101
|
-
})
|
|
102
|
-
)
|
|
103
|
-
} else {
|
|
104
|
-
console.error('❌ Seeding failed:', error)
|
|
105
|
-
}
|
|
106
|
-
process.exit(1)
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
async function cleanDatabase(connection: any): Promise<void> {
|
|
111
|
-
// Disable foreign key checks temporarily
|
|
112
|
-
await connection.execute('SET FOREIGN_KEY_CHECKS = 0')
|
|
113
|
-
|
|
114
|
-
const tables = [
|
|
115
|
-
'SUPPORT_trust_scores',
|
|
116
|
-
'SUPPORT_audit_log',
|
|
117
|
-
'SUPPORT_approval_requests',
|
|
118
|
-
'SUPPORT_actions',
|
|
119
|
-
'SUPPORT_conversations',
|
|
120
|
-
'SUPPORT_apps',
|
|
121
|
-
]
|
|
122
|
-
|
|
123
|
-
for (const table of tables) {
|
|
124
|
-
await connection.execute(`TRUNCATE TABLE ${table}`)
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
await connection.execute('SET FOREIGN_KEY_CHECKS = 1')
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
async function loadJsonFiles(dirPath: string): Promise<any[]> {
|
|
131
|
-
try {
|
|
132
|
-
const files = await readdir(dirPath)
|
|
133
|
-
const jsonFiles = files.filter((f) => f.endsWith('.json'))
|
|
134
|
-
|
|
135
|
-
const items = await Promise.all(
|
|
136
|
-
jsonFiles.map(async (file) => {
|
|
137
|
-
const content = await readFile(join(dirPath, file), 'utf-8')
|
|
138
|
-
return JSON.parse(content)
|
|
139
|
-
})
|
|
140
|
-
)
|
|
141
|
-
|
|
142
|
-
return items
|
|
143
|
-
} catch (error) {
|
|
144
|
-
return []
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
/** A knowledge-base markdown document plus its frontmatter metadata. */
interface KnowledgeDoc {
  id: string       // random UUID, generated for Qdrant point-id compatibility
  content: string  // markdown body with frontmatter stripped and trimmed
  type: string     // frontmatter `type`, defaults to 'general'
  app: string      // frontmatter `app`, defaults to 'unknown'
  tags: string[]   // frontmatter `tags`, defaults to []
  filePath: string // source .md path, kept for traceability
}
|
|
156
|
-
|
|
157
|
-
function generateUUID(): string {
|
|
158
|
-
// Simple UUID v4 generation
|
|
159
|
-
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
|
|
160
|
-
const r = (Math.random() * 16) | 0
|
|
161
|
-
const v = c === 'x' ? r : (r & 0x3) | 0x8
|
|
162
|
-
return v.toString(16)
|
|
163
|
-
})
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
async function loadKnowledgeFiles(basePath: string): Promise<KnowledgeDoc[]> {
|
|
167
|
-
const files = await glob(join(basePath, '**/*.md'))
|
|
168
|
-
const docs: KnowledgeDoc[] = []
|
|
169
|
-
|
|
170
|
-
for (const filePath of files) {
|
|
171
|
-
const content = await readFile(filePath, 'utf-8')
|
|
172
|
-
const { data: frontmatter, content: body } = matter(content)
|
|
173
|
-
|
|
174
|
-
// Generate UUID for Qdrant compatibility
|
|
175
|
-
const id = generateUUID()
|
|
176
|
-
|
|
177
|
-
docs.push({
|
|
178
|
-
id,
|
|
179
|
-
content: body.trim(),
|
|
180
|
-
type: frontmatter.type || 'general',
|
|
181
|
-
app: frontmatter.app || 'unknown',
|
|
182
|
-
tags: frontmatter.tags || [],
|
|
183
|
-
filePath,
|
|
184
|
-
})
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
return docs
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
async function seedApps(connection: any, apps: any[]): Promise<number> {
|
|
191
|
-
for (const app of apps) {
|
|
192
|
-
await connection.execute(
|
|
193
|
-
`INSERT INTO SUPPORT_apps (
|
|
194
|
-
id, slug, name, front_inbox_id, instructor_teammate_id,
|
|
195
|
-
stripe_account_id, stripe_connected, integration_base_url,
|
|
196
|
-
webhook_secret, capabilities
|
|
197
|
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
198
|
-
ON DUPLICATE KEY UPDATE
|
|
199
|
-
name = VALUES(name),
|
|
200
|
-
integration_base_url = VALUES(integration_base_url)`,
|
|
201
|
-
[
|
|
202
|
-
app.id,
|
|
203
|
-
app.slug,
|
|
204
|
-
app.name,
|
|
205
|
-
app.front_inbox_id,
|
|
206
|
-
app.instructor_teammate_id || null,
|
|
207
|
-
app.stripe_account_id || null,
|
|
208
|
-
app.stripe_connected || false,
|
|
209
|
-
app.integration_base_url,
|
|
210
|
-
app.webhook_secret,
|
|
211
|
-
JSON.stringify(app.capabilities || []),
|
|
212
|
-
]
|
|
213
|
-
)
|
|
214
|
-
|
|
215
|
-
// Seed default trust scores for this app
|
|
216
|
-
const categories = ['refund', 'access', 'technical', 'general']
|
|
217
|
-
for (const category of categories) {
|
|
218
|
-
const id = `ts_${app.id}_${category}`
|
|
219
|
-
await connection.execute(
|
|
220
|
-
`INSERT INTO SUPPORT_trust_scores (id, app_id, category, trust_score, sample_count)
|
|
221
|
-
VALUES (?, ?, ?, 0.75, 25)
|
|
222
|
-
ON DUPLICATE KEY UPDATE id = id`,
|
|
223
|
-
[id, app.id, category]
|
|
224
|
-
)
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
return apps.length
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
async function seedKnowledgeBase(docs: KnowledgeDoc[]): Promise<number> {
|
|
232
|
-
if (docs.length === 0) return 0
|
|
233
|
-
|
|
234
|
-
const qdrant = createQdrantClient()
|
|
235
|
-
const ollama = createOllamaClient()
|
|
236
|
-
|
|
237
|
-
// Ensure model is available
|
|
238
|
-
await ollama.ensureModel()
|
|
239
|
-
|
|
240
|
-
// Ensure collection exists
|
|
241
|
-
// Use 1024 for mxbai-embed-large, 768 for nomic-embed-text
|
|
242
|
-
const embeddingModel = process.env.EMBEDDING_MODEL || 'mxbai-embed-large'
|
|
243
|
-
const vectorSize = embeddingModel.includes('mxbai') ? 1024 : 768
|
|
244
|
-
await qdrant.ensureCollection(vectorSize)
|
|
245
|
-
|
|
246
|
-
let embeddedCount = 0
|
|
247
|
-
|
|
248
|
-
for (const doc of docs) {
|
|
249
|
-
try {
|
|
250
|
-
// Generate embedding
|
|
251
|
-
const embedding = await ollama.embed(doc.content)
|
|
252
|
-
|
|
253
|
-
// Store in Qdrant
|
|
254
|
-
await qdrant.upsert([
|
|
255
|
-
{
|
|
256
|
-
id: doc.id,
|
|
257
|
-
vector: embedding,
|
|
258
|
-
payload: {
|
|
259
|
-
content: doc.content,
|
|
260
|
-
type: doc.type,
|
|
261
|
-
app: doc.app,
|
|
262
|
-
tags: doc.tags,
|
|
263
|
-
},
|
|
264
|
-
},
|
|
265
|
-
])
|
|
266
|
-
|
|
267
|
-
embeddedCount++
|
|
268
|
-
process.stdout.write(`\r Embedded: ${embeddedCount}/${docs.length}`)
|
|
269
|
-
} catch (error) {
|
|
270
|
-
console.error(`\n Failed to embed ${doc.id}:`, error)
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
console.log('') // New line after progress
|
|
275
|
-
return embeddedCount
|
|
276
|
-
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Eval-pipeline CLI commands
|
|
3
|
-
*
|
|
4
|
-
* Run evals against individual pipeline steps or full e2e.
|
|
5
|
-
* Uses actual pipeline implementations from @skillrecordings/core/pipeline.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import type { Command } from 'commander'
|
|
9
|
-
import { run } from './run'
|
|
10
|
-
import { seed } from './seed'
|
|
11
|
-
|
|
12
|
-
/**
 * Register the `eval-pipeline` command group on the given commander program.
 *
 * Subcommands:
 *   - run:  execute the eval suite against one pipeline step (or full e2e)
 *   - seed: populate MySQL/Qdrant with test fixtures
 */
export function registerEvalPipelineCommands(program: Command): void {
  const evalPipeline = program
    .command('eval-pipeline')
    .description('Evaluate pipeline steps against labeled scenarios')

  // Run subcommand (main functionality)
  evalPipeline
    .command('run')
    .description('Run eval suite against pipeline steps')
    .option(
      '--step <step>',
      'Which step to test: classify | route | gather | draft | validate | e2e',
      'classify' // default step
    )
    .option('--scenarios <glob>', 'Scenario files glob pattern')
    .option('--dataset <file>', 'Dataset JSON file (alternative to scenarios)')
    .option('--limit <n>', 'Max scenarios to run', parseInt)
    .option('--verbose', 'Show individual scenario results')
    .option('--json', 'JSON output for scripting')
    .option(
      '--model <model>',
      'Model for LLM steps',
      'anthropic/claude-haiku-4-5' // default model
    )
    .option('--force-llm', 'Skip fast path, always use LLM (classify step)')
    .option('--real-tools', 'Use real Docker MySQL/Qdrant instead of mocks')
    .option('--parallel <n>', 'Run N scenarios concurrently', parseInt, 10)
    .option('--cache-classify', 'Cache classify results between runs')
    .option('--clear-cache', 'Clear cached classify results before run')
    .option('--fail-fast', 'Stop on first failure')
    .option('--quick', 'Run smoke test subset (~10 scenarios)')
    .action(run)

  // Seed subcommand
  evalPipeline
    .command('seed')
    .description('Seed MySQL and Qdrant with test fixtures')
    .option('--clean', 'Drop and recreate all data')
    .option('--fixtures <path>', 'Path to fixtures directory', 'fixtures')
    .option('--json', 'JSON output for scripting')
    .action(seed)
}
|