@rudderjs/ai 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +484 -7
- package/boost/guidelines.md +62 -2
- package/boost/skills/ai-tools/SKILL.md +14 -5
- package/dist/agent.d.ts +66 -15
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +529 -58
- package/dist/agent.js.map +1 -1
- package/dist/budget/pricing.d.ts +124 -0
- package/dist/budget/pricing.d.ts.map +1 -0
- package/dist/budget/pricing.js +175 -0
- package/dist/budget/pricing.js.map +1 -0
- package/dist/budget/storage.d.ts +104 -0
- package/dist/budget/storage.d.ts.map +1 -0
- package/dist/budget/storage.js +0 -0
- package/dist/budget/storage.js.map +1 -0
- package/dist/budget/with-budget.d.ts +119 -0
- package/dist/budget/with-budget.d.ts.map +1 -0
- package/dist/budget/with-budget.js +175 -0
- package/dist/budget/with-budget.js.map +1 -0
- package/dist/budget-orm/index.d.ts +96 -0
- package/dist/budget-orm/index.d.ts.map +1 -0
- package/dist/budget-orm/index.js +177 -0
- package/dist/budget-orm/index.js.map +1 -0
- package/dist/commands/ai-eval.d.ts +93 -0
- package/dist/commands/ai-eval.d.ts.map +1 -0
- package/dist/commands/ai-eval.js +378 -0
- package/dist/commands/ai-eval.js.map +1 -0
- package/dist/computer-use/actions.d.ts +214 -0
- package/dist/computer-use/actions.d.ts.map +1 -0
- package/dist/computer-use/actions.js +48 -0
- package/dist/computer-use/actions.js.map +1 -0
- package/dist/computer-use/errors.d.ts +57 -0
- package/dist/computer-use/errors.d.ts.map +1 -0
- package/dist/computer-use/errors.js +76 -0
- package/dist/computer-use/errors.js.map +1 -0
- package/dist/computer-use/index.d.ts +53 -0
- package/dist/computer-use/index.d.ts.map +1 -0
- package/dist/computer-use/index.js +51 -0
- package/dist/computer-use/index.js.map +1 -0
- package/dist/computer-use/playwright.d.ts +76 -0
- package/dist/computer-use/playwright.d.ts.map +1 -0
- package/dist/computer-use/playwright.js +270 -0
- package/dist/computer-use/playwright.js.map +1 -0
- package/dist/computer-use/tool.d.ts +154 -0
- package/dist/computer-use/tool.d.ts.map +1 -0
- package/dist/computer-use/tool.js +210 -0
- package/dist/computer-use/tool.js.map +1 -0
- package/dist/eval/fixtures.d.ts +65 -0
- package/dist/eval/fixtures.d.ts.map +1 -0
- package/dist/eval/fixtures.js +110 -0
- package/dist/eval/fixtures.js.map +1 -0
- package/dist/eval/html-reporter.d.ts +25 -0
- package/dist/eval/html-reporter.d.ts.map +1 -0
- package/dist/eval/html-reporter.js +209 -0
- package/dist/eval/html-reporter.js.map +1 -0
- package/dist/eval/index.d.ts +271 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +510 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/json-reporter.d.ts +43 -0
- package/dist/eval/json-reporter.d.ts.map +1 -0
- package/dist/eval/json-reporter.js +40 -0
- package/dist/eval/json-reporter.js.map +1 -0
- package/dist/fake.d.ts +36 -1
- package/dist/fake.d.ts.map +1 -1
- package/dist/fake.js +49 -2
- package/dist/fake.js.map +1 -1
- package/dist/file-search.d.ts +168 -0
- package/dist/file-search.d.ts.map +1 -0
- package/dist/file-search.js +158 -0
- package/dist/file-search.js.map +1 -0
- package/dist/handoff.d.ts +95 -0
- package/dist/handoff.d.ts.map +1 -0
- package/dist/handoff.js +78 -0
- package/dist/handoff.js.map +1 -0
- package/dist/index.d.ts +29 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +22 -2
- package/dist/index.js.map +1 -1
- package/dist/mcp/client-tools.d.ts +39 -0
- package/dist/mcp/client-tools.d.ts.map +1 -0
- package/dist/mcp/client-tools.js +147 -0
- package/dist/mcp/client-tools.js.map +1 -0
- package/dist/mcp/index.d.ts +16 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +15 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/server-from-agent.d.ts +24 -0
- package/dist/mcp/server-from-agent.d.ts.map +1 -0
- package/dist/mcp/server-from-agent.js +113 -0
- package/dist/mcp/server-from-agent.js.map +1 -0
- package/dist/mcp/types.d.ts +64 -0
- package/dist/mcp/types.d.ts.map +1 -0
- package/dist/mcp/types.js +6 -0
- package/dist/mcp/types.js.map +1 -0
- package/dist/memory-embedding/index.d.ts +121 -0
- package/dist/memory-embedding/index.d.ts.map +1 -0
- package/dist/memory-embedding/index.js +229 -0
- package/dist/memory-embedding/index.js.map +1 -0
- package/dist/memory-extract.d.ts +60 -0
- package/dist/memory-extract.d.ts.map +1 -0
- package/dist/memory-extract.js +163 -0
- package/dist/memory-extract.js.map +1 -0
- package/dist/memory-inject.d.ts +39 -0
- package/dist/memory-inject.d.ts.map +1 -0
- package/dist/memory-inject.js +135 -0
- package/dist/memory-inject.js.map +1 -0
- package/dist/memory-orm/index.d.ts +118 -0
- package/dist/memory-orm/index.d.ts.map +1 -0
- package/dist/memory-orm/index.js +187 -0
- package/dist/memory-orm/index.js.map +1 -0
- package/dist/memory.d.ts +55 -0
- package/dist/memory.d.ts.map +1 -0
- package/dist/memory.js +132 -0
- package/dist/memory.js.map +1 -0
- package/dist/observers.d.ts +22 -0
- package/dist/observers.d.ts.map +1 -1
- package/dist/observers.js.map +1 -1
- package/dist/provider-tools.d.ts +15 -1
- package/dist/provider-tools.d.ts.map +1 -1
- package/dist/provider-tools.js +21 -1
- package/dist/provider-tools.js.map +1 -1
- package/dist/providers/anthropic.d.ts +9 -1
- package/dist/providers/anthropic.d.ts.map +1 -1
- package/dist/providers/anthropic.js +66 -11
- package/dist/providers/anthropic.js.map +1 -1
- package/dist/providers/bedrock.d.ts +60 -0
- package/dist/providers/bedrock.d.ts.map +1 -0
- package/dist/providers/bedrock.js +167 -0
- package/dist/providers/bedrock.js.map +1 -0
- package/dist/providers/elevenlabs.d.ts +98 -0
- package/dist/providers/elevenlabs.d.ts.map +1 -0
- package/dist/providers/elevenlabs.js +229 -0
- package/dist/providers/elevenlabs.js.map +1 -0
- package/dist/providers/google.d.ts +83 -1
- package/dist/providers/google.d.ts.map +1 -1
- package/dist/providers/google.js +491 -8
- package/dist/providers/google.js.map +1 -1
- package/dist/providers/openai.d.ts +8 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +215 -5
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/openrouter.d.ts +43 -0
- package/dist/providers/openrouter.d.ts.map +1 -0
- package/dist/providers/openrouter.js +21 -0
- package/dist/providers/openrouter.js.map +1 -0
- package/dist/providers/voyage.d.ts +91 -0
- package/dist/providers/voyage.d.ts.map +1 -0
- package/dist/providers/voyage.js +166 -0
- package/dist/providers/voyage.js.map +1 -0
- package/dist/queue-job.d.ts +69 -4
- package/dist/queue-job.d.ts.map +1 -1
- package/dist/queue-job.js +114 -11
- package/dist/queue-job.js.map +1 -1
- package/dist/registry.d.ts +3 -1
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +10 -0
- package/dist/registry.js.map +1 -1
- package/dist/server/provider.d.ts.map +1 -1
- package/dist/server/provider.js +38 -1
- package/dist/server/provider.js.map +1 -1
- package/dist/similarity-search.d.ts +163 -0
- package/dist/similarity-search.d.ts.map +1 -0
- package/dist/similarity-search.js +147 -0
- package/dist/similarity-search.js.map +1 -0
- package/dist/sub-agent-run-store.d.ts +40 -3
- package/dist/sub-agent-run-store.d.ts.map +1 -1
- package/dist/sub-agent-run-store.js.map +1 -1
- package/dist/tool.d.ts +59 -0
- package/dist/tool.d.ts.map +1 -1
- package/dist/tool.js +45 -4
- package/dist/tool.js.map +1 -1
- package/dist/types.d.ts +285 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/vector-stores/index.d.ts +96 -0
- package/dist/vector-stores/index.d.ts.map +1 -0
- package/dist/vector-stores/index.js +153 -0
- package/dist/vector-stores/index.js.map +1 -0
- package/package.json +43 -4
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@rudderjs/ai/budget-orm` — ORM-backed {@link BudgetStorage} for #A6 Phase 4.
|
|
3
|
+
*
|
|
4
|
+
* Production-grade replacement for `memoryBudgetStorage()` (which is
|
|
5
|
+
* single-process only). Persists per-user spend counters in a
|
|
6
|
+
* `BudgetUsage` table via the registered `@rudderjs/orm` adapter — works
|
|
7
|
+
* across queue workers, web processes, and horizontally-scaled deployments.
|
|
8
|
+
*
|
|
9
|
+
* Wire it into your AI middleware:
|
|
10
|
+
*
|
|
11
|
+
* ```ts
|
|
12
|
+
* import { withBudget } from '@rudderjs/ai'
|
|
13
|
+
* import { ormBudgetStorage } from '@rudderjs/ai/budget-orm'
|
|
14
|
+
*
|
|
15
|
+
* const budgeted = withBudget({
|
|
16
|
+
* user: (ctx) => ctx.context as string,
|
|
17
|
+
* budget: () => ({ daily: 0.50, monthly: 10 }),
|
|
18
|
+
* storage: ormBudgetStorage(),
|
|
19
|
+
* })
|
|
20
|
+
* ```
|
|
21
|
+
*
|
|
22
|
+
* The schema lives at {@link budgetUsagePrismaSchema} — copy it into your
|
|
23
|
+
* Prisma schema (or a new `prisma/schema/<file>.prisma` if you use the
|
|
24
|
+
* multi-file setup). The `@@unique([userId, period, periodKey])`
|
|
25
|
+
* constraint is the one load-bearing index — without it, the
|
|
26
|
+
* find-or-create path can race and produce duplicate rows.
|
|
27
|
+
*
|
|
28
|
+
* # Atomicity caveat
|
|
29
|
+
*
|
|
30
|
+
* `checkAndDebit` does a read-then-conditional-increment. The increment
|
|
31
|
+
* itself is atomic (`UPDATE col = col + n`), but the cap check sits
|
|
32
|
+
* between the read and the write. Under high concurrency for a single
|
|
33
|
+
* user (two or more in-flight budgeted requests at the same time), total spend
|
|
34
|
+
* can briefly exceed `cap` by up to `costUsd × concurrency`. For typical
|
|
35
|
+
* apps this is a non-issue.
|
|
36
|
+
*
|
|
37
|
+
* Strict guarantees require a database transaction with serializable
|
|
38
|
+
* isolation or a Redis-backed counter — both planned as follow-ups. File
|
|
39
|
+
* an issue if you hit this in production.
|
|
40
|
+
*/
|
|
41
|
+
import { Model } from '@rudderjs/orm';
|
|
42
|
+
import { periodKey as buildPeriodKey, } from '../budget/storage.js';
|
|
43
|
+
// ─── ORM Model ────────────────────────────────────────────
|
|
44
|
+
/**
 * Model row backing {@link OrmBudgetStorage}. Exposed so apps that
 * want admin views (e.g. "show me top spenders this month") can use
 * `BudgetUsageRecord.where(...).get()` instead of routing every read
 * through the {@link BudgetStorage} interface.
 *
 * `OrmBudgetStorage` looks rows up by the `(userId, period, periodKey)`
 * triple and keeps the running total for that bucket in `spent`.
 *
 * The `@@unique([userId, period, periodKey])` constraint is required —
 * without it, two concurrent first-writes for the same user/period
 * create duplicate rows and the cap accounting silently drifts.
 */
export class BudgetUsageRecord extends Model {
    // Table / model name handed to the ORM adapter.
    // NOTE(review): presumably maps to the `BudgetUsage` Prisma model — confirm against the adapter's naming rules.
    static table = 'budgetUsage';
    // The four attributes `OrmBudgetStorage` passes to `create()`.
    // NOTE(review): assumed to be the ORM's mass-assignment allow-list — confirm in `@rudderjs/orm`.
    static fillable = ['userId', 'period', 'periodKey', 'spent'];
}
|
|
58
|
+
// ─── BudgetStorage adapter ────────────────────────────────
|
|
59
|
+
/**
 * `BudgetStorage` implementation that persists spend counters in
 * `BudgetUsageRecord` rows through the registered `@rudderjs/orm`
 * adapter. Setup notes and the atomicity caveat are in the module JSDoc.
 */
export class OrmBudgetStorage {
    /**
     * Check the cap for the current period bucket and, when the debit
     * fits, record it.
     *
     * @param opts Reads `userId`, `period`, `cap`, `costUsd` and the
     *   optional `now` / `timezone` used to derive the period key.
     * @returns `{ allowed, spent, cap }` — `spent` is the bucket total
     *   after the call (unchanged when the debit is refused).
     * @throws {Error} When `cap` or `costUsd` is negative or not finite.
     */
    async checkAndDebit(opts) {
        const isValidAmount = (amount) => Number.isFinite(amount) && amount >= 0;
        if (!isValidAmount(opts.cap)) {
            throw new Error(`[RudderJS AI] BudgetStorage: cap must be a non-negative finite number, got ${opts.cap}`);
        }
        if (!isValidAmount(opts.costUsd)) {
            throw new Error(`[RudderJS AI] BudgetStorage: costUsd must be a non-negative finite number, got ${opts.costUsd}`);
        }

        const key = buildPeriodKey(opts.period, opts.now ?? new Date(), opts.timezone);
        // Same three-column lookup is needed before the create and after a failed create.
        const findBucket = () => BudgetUsageRecord
            .where('userId', opts.userId)
            .where('period', opts.period)
            .where('periodKey', key)
            .first();

        const bucket = await findBucket();
        if (bucket) {
            return this._applyIncrementPath(bucket, opts);
        }

        // ─── No row yet — first write for this period ─────────
        if (opts.costUsd === 0) {
            // Pure read of an empty bucket: nothing to persist.
            return { allowed: true, spent: 0, cap: opts.cap };
        }
        if (opts.costUsd > opts.cap) {
            // A single debit above the cap is refused without creating a
            // row, so denied requests leave no trace in storage.
            return { allowed: false, spent: 0, cap: opts.cap };
        }

        try {
            await BudgetUsageRecord.create({
                userId: opts.userId,
                period: opts.period,
                periodKey: key,
                spent: opts.costUsd,
            });
        }
        catch (createError) {
            // Likely a race: another caller inserted the row between the
            // lookup and the create. The error type is intentionally not
            // inspected — a second lookup decides what happened.
            const racedBucket = await findBucket();
            if (!racedBucket) {
                // Still no row, so this was not a unique-constraint race.
                throw createError;
            }
            return this._applyIncrementPath(racedBucket, opts);
        }
        return { allowed: true, spent: opts.costUsd, cap: opts.cap };
    }

    /**
     * Read-then-conditional-increment on a row that already exists.
     *
     * @param row Existing bucket row; `id` and `spent` are read.
     * @param opts Same options object given to `checkAndDebit`.
     * @returns `{ allowed, spent, cap }` for this bucket.
     */
    async _applyIncrementPath(row, opts) {
        const spentSoFar = Number(row.spent ?? 0);
        if (opts.costUsd === 0) {
            // Pure read — report the current total without writing.
            return { allowed: true, spent: spentSoFar, cap: opts.cap };
        }

        // The cap decision is made on the value read above; concurrent
        // writers can slip past it (see the module-level atomicity caveat).
        const projected = spentSoFar + opts.costUsd;
        if (projected > opts.cap) {
            return { allowed: false, spent: spentSoFar, cap: opts.cap };
        }

        const updated = await BudgetUsageRecord.increment(row.id, 'spent', opts.costUsd);
        // Fall back to the locally computed total when the adapter returns no row.
        return { allowed: true, spent: Number(updated?.spent ?? projected), cap: opts.cap };
    }

    /**
     * Delete the bucket row(s) for one user and period.
     *
     * @param userId User whose counter is cleared.
     * @param period Budget period identifier.
     * @param now Moment used to derive the period key; defaults to the current time.
     * @param timezone Optional timezone forwarded to the period-key builder.
     */
    async reset(userId, period, now, timezone) {
        const when = now ?? new Date();
        const key = buildPeriodKey(period, when, timezone);
        const scoped = BudgetUsageRecord
            .where('userId', userId)
            .where('period', period)
            .where('periodKey', key);
        await scoped.deleteAll();
    }
}
|
|
140
|
+
/**
 * Factory counterpart to `memoryBudgetStorage()`: every call builds and
 * returns a new {@link OrmBudgetStorage}. Use it instead of calling the
 * constructor directly so both storages are created the same way.
 *
 * @returns A new `OrmBudgetStorage` instance.
 */
export function ormBudgetStorage() {
    const storage = new OrmBudgetStorage();
    return storage;
}
|
|
148
|
+
// ─── Schema reference ─────────────────────────────────────
|
|
149
|
+
/**
|
|
150
|
+
* Reference Prisma schema for `OrmBudgetStorage`. Copy into your
|
|
151
|
+
* `prisma/schema/<file>.prisma` (or paste alongside an existing model).
|
|
152
|
+
*
|
|
153
|
+
* The `@@unique([userId, period, periodKey])` constraint is required —
|
|
154
|
+
* without it the find-or-create path can race and produce duplicate
|
|
155
|
+
* rows, breaking cap accounting.
|
|
156
|
+
*
|
|
157
|
+
* SQLite stores `Float` as `REAL`; Postgres / MySQL as `DOUBLE
|
|
158
|
+
* PRECISION` / `DOUBLE`. All three give 15+ significant digits — more
|
|
159
|
+
* than enough for sub-cent budget tracking.
|
|
160
|
+
*/
|
|
161
|
+
export const budgetUsagePrismaSchema = `model BudgetUsage {
|
|
162
|
+
id String @id @default(cuid())
|
|
163
|
+
userId String
|
|
164
|
+
/// 'daily' | 'monthly'
|
|
165
|
+
period String
|
|
166
|
+
/// YYYY-MM-DD (daily) or YYYY-MM (monthly), in the configured timezone
|
|
167
|
+
periodKey String
|
|
168
|
+
/// Cumulative USD spend in this period
|
|
169
|
+
spent Float @default(0)
|
|
170
|
+
createdAt DateTime @default(now())
|
|
171
|
+
updatedAt DateTime @updatedAt
|
|
172
|
+
|
|
173
|
+
@@unique([userId, period, periodKey])
|
|
174
|
+
@@index([userId])
|
|
175
|
+
}
|
|
176
|
+
`;
|
|
177
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/budget-orm/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAuCG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAA;AACrC,OAAO,EAKL,SAAS,IAAI,cAAc,GAC5B,MAAM,sBAAsB,CAAA;AAE7B,6DAA6D;AAE7D;;;;;;;;;GASG;AACH,MAAM,OAAO,iBAAkB,SAAQ,KAAK;IAC1C,MAAM,CAAU,KAAK,GAAM,aAAa,CAAA;IACxC,MAAM,CAAU,QAAQ,GAAG,CAAC,QAAQ,EAAE,QAAQ,EAAE,WAAW,EAAE,OAAO,CAAC,CAAA;;AAcvE,6DAA6D;AAE7D;;;GAGG;AACH,MAAM,OAAO,gBAAgB;IAC3B,KAAK,CAAC,aAAa,CAAC,IAAwB;QAC1C,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,8EAA8E,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;QAC3G,CAAC;QACD,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;YACvD,MAAM,IAAI,KAAK,CAAC,kFAAkF,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;QACnH,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,EAAE,CAAA;QAClC,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAA;QAE3D,MAAM,QAAQ,GAAG,MAAM,iBAAiB;aACrC,KAAK,CAAC,QAAQ,EAAK,IAAI,CAAC,MAAM,CAAC;aAC/B,KAAK,CAAC,QAAQ,EAAK,IAAI,CAAC,MAAM,CAAC;aAC/B,KAAK,CAAC,WAAW,EAAE,GAAG,CAAC;aACvB,KAAK,EAAyC,CAAA;QAEjD,yDAAyD;QACzD,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,oDAAoD;YACpD,IAAI,IAAI,CAAC,OAAO,KAAK,CAAC,EAAE,CAAC;gBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;YACnD,CAAC;YACD,iEAAiE;YACjE,oDAAoD;YACpD,IAAI,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBAC5B,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;YACpD,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,iBAAiB,CAAC,MAAM,CAAC;oBAC7B,MAAM,EAAK,IAAI,CAAC,MAAM;oBACtB,MAAM,EAAK,IAAI,CAAC,MAAM;oBACtB,SAAS,EAAE,GAAG;oBACd,KAAK,EAAM,IAAI,CAAC,OAAO;iBACxB,CAAC,CAAA;gBACF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;YAC9D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,iEAAiE;gBACjE,8DAA8D;gBAC9D,kEAAkE;gBAClE,uDAAuD;gBACvD,MAAM,SAAS,GAAG,MAAM,iBAAiB;qBACtC,KAAK,CAAC,QAAQ
,EAAK,IAAI,CAAC,MAAM,CAAC;qBAC/B,KAAK,CAAC,QAAQ,EAAK,IAAI,CAAC,MAAM,CAAC;qBAC/B,KAAK,CAAC,WAAW,EAAE,GAAG,CAAC;qBACvB,KAAK,EAAyC,CAAA;gBACjD,IAAI,CAAC,SAAS;oBAAE,MAAM,CAAC,CAAA,CAAE,2DAA2D;gBACpF,OAAO,IAAI,CAAC,mBAAmB,CAAC,SAAS,EAAE,IAAI,CAAC,CAAA;YAClD,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,mBAAmB,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAA;IACjD,CAAC;IAED,yEAAyE;IACjE,KAAK,CAAC,mBAAmB,CAC/B,GAAuB,EACvB,IAAwB;QAExB,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC,CAAA;QAEtC,aAAa;QACb,IAAI,IAAI,CAAC,OAAO,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;QACzD,CAAC;QAED,kEAAkE;QAClE,6DAA6D;QAC7D,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACtC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;QAC1D,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,iBAAiB,CAAC,SAAS,CAAC,GAAG,CAAC,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,CAAiC,CAAA;QAChH,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,EAAE,KAAK,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,CAAA;QACjE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAA;IAC1D,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAc,EAAE,MAAoB,EAAE,GAAU,EAAE,QAAiB;QAC7E,MAAM,GAAG,GAAG,cAAc,CAAC,MAAM,EAAE,GAAG,IAAI,IAAI,IAAI,EAAE,EAAE,QAAQ,CAAC,CAAA;QAC/D,MAAM,iBAAiB;aACpB,KAAK,CAAC,QAAQ,EAAK,MAAM,CAAC;aAC1B,KAAK,CAAC,QAAQ,EAAK,MAAM,CAAC;aAC1B,KAAK,CAAC,WAAW,EAAE,GAAG,CAAC;aACvB,SAAS,EAAE,CAAA;IAChB,CAAC;CACF;AAED;;;;GAIG;AACH,MAAM,UAAU,gBAAgB;IAC9B,OAAO,IAAI,gBAAgB,EAAE,CAAA;AAC/B,CAAC;AAED,6DAA6D;AAE7D;;;;;;;;;;;GAWG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG;;;;;;;;;;;;;;;CAetC,CAAA"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `pnpm rudder ai:eval` — discover `evals/**\/*.eval.ts` suites,
|
|
3
|
+
* run each, and report. Console reporter by default; `--json` emits
|
|
4
|
+
* a machine-readable envelope to stdout for CI.
|
|
5
|
+
*
|
|
6
|
+
* Registered from the CLI loader (`packages/cli/src/index.ts`)
|
|
7
|
+
* — the AiProvider doesn't own this so it surfaces even when the
|
|
8
|
+
* user app fails to boot, matching the `command:list --json`
|
|
9
|
+
* graceful-degradation pattern from #349.
|
|
10
|
+
*/
|
|
11
|
+
import type { EvalSuite } from '../eval/index.js';
|
|
12
|
+
/**
 * Minimal structural view of the rudder command runner: only the
 * `command(...).description(...)` chain that `registerAiEvalCommand`
 * calls is described here.
 */
type Rudder = {
    /** Register `handler` under `name`; the handler receives the raw argument list. */
    command(name: string, handler: (args: string[]) => void | Promise<void>): {
        /** Attach help text to the command just registered. */
        description(text: string): unknown;
    };
};
|
|
17
|
+
/** CLI flags + positional name filter, as produced by `parseArgs`. */
export interface AiEvalOptions {
    /**
     * Substring filter (case-insensitive) applied to suite names.
     * Suites whose name does not contain it are skipped.
     */
    filter?: string;
    /** Stop on the first failing suite (a suite that fails to load also counts). */
    bail: boolean;
    /** Emit `{ suites: [...] }` JSON to stdout instead of the console report. */
    json: boolean;
    /**
     * Run against the real provider, capture each case's assistant
     * turns to `evals/__fixtures__/<suite>/<case>.json`. Existing
     * fixtures are overwritten — diff in your VCS to see what changed.
     * Default `false`. Mutually exclusive with `replay`.
     */
    record?: boolean;
    /**
     * Swap the runtime with `AiFake.fake()` and feed each case its
     * recorded fixture via `respondWithSequence`. Zero API calls,
     * deterministic regression tests. Cases without a fixture fall
     * through to a normal run with a stderr warning. Default `false`.
     * Mutually exclusive with `record`.
     */
    replay?: boolean;
    /**
     * Path for a self-contained HTML report (#A5 Phase 5). Pasteable
     * into PR comments / Slack threads. Coexists with `--json` (JSON
     * still goes to stdout, HTML goes to disk). A relative path is
     * resolved against the working directory.
     */
    html?: string;
}
|
|
46
|
+
/**
 * Test seam — every external dependency gets an injectable
 * override. The CLI handler defaults each to its real impl.
 */
export interface AiEvalDeps {
    /** Working directory used for discovery and relative paths; defaults to `process.cwd()`. */
    cwd?: string;
    /** Sink for reports and the JSON envelope; defaults to `process.stdout`. */
    stdout?: {
        write(s: string): boolean | void;
    };
    /** Sink for diagnostics and warnings; defaults to `process.stderr`. */
    stderr?: {
        write(s: string): boolean | void;
    };
    /** Override the file walk (test harness returns a virtual list). */
    discover?: (cwd: string, pattern: string) => Promise<string[]>;
    /**
     * Override file → suite loader (test harness uses an in-memory map).
     * Resolving to `null` makes the runner skip the file with a stderr note.
     */
    loadSuite?: (absPath: string) => Promise<EvalSuite | null>;
    /**
     * Override config lookup (test harness skips `@rudderjs/core`).
     * A `null` result falls back to the built-in `evals/**\/*.eval.ts` pattern.
     */
    configPattern?: () => string | null | Promise<string | null>;
    /**
     * Override fixtures directory (defaults to `<cwd>/evals/__fixtures__`).
     * Tests point to a tmpdir to keep round-trips off the source tree.
     */
    fixturesDir?: string;
}
|
|
70
|
+
/**
 * Register the `ai:eval` command on the rudder runner.
 *
 * The registered handler parses the raw arguments, runs the eval CLI and
 * calls `process.exit` with the resulting code when it is non-zero.
 */
export declare function registerAiEvalCommand(rudder: Rudder): void;
|
|
72
|
+
/**
 * Parse the rest-of-line. Recognizes:
 *   - boolean flags: `--bail`, `--json`, `--record`, `--replay`
 *   - value flags  : `--html <path>` or `--html=<path>`
 *   - one positional name filter (anything not consumed above)
 *
 * Unrecognized `--flags` are treated as positionals; only the first
 * positional becomes the filter. Throws an `Error` when a value flag is
 * given without a value.
 */
export declare function parseArgs(args: string[]): AiEvalOptions;
|
|
79
|
+
/**
 * Execute the CLI flow. Returns the process exit code (0 = all pass,
 * 1 = at least one suite failed or could not be loaded, `--record` and
 * `--replay` were combined, or no suites were discovered).
 *
 * Exception: when no suites are discovered and `json` is set, an empty
 * `{ suites: [] }` envelope is written to stdout and the result is 0.
 *
 * The handler is `process.exit`-free so tests can drive it directly.
 */
export declare function runEvalCli(opts: AiEvalOptions, deps?: AiEvalDeps): Promise<number>;
|
|
86
|
+
/**
 * Recursive walk constrained to a `<dir>/**\/*<suffix>` shape.
 * Returns absolute paths sorted lexicographically for stable test
 * output and predictable `--bail` ordering.
 *
 * This is the default used by `runEvalCli` when no `discover` override
 * is injected.
 * NOTE(review): `pattern` is presumably resolved relative to `cwd` — confirm in the implementation.
 */
export declare function discoverSuiteFiles(cwd: string, pattern: string): Promise<string[]>;
|
|
92
|
+
export {};
|
|
93
|
+
//# sourceMappingURL=ai-eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-eval.d.ts","sourceRoot":"","sources":["../../src/commands/ai-eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAMH,OAAO,KAAK,EAAE,SAAS,EAAiC,MAAM,kBAAkB,CAAA;AAUhF,KAAK,MAAM,GAAG;IACZ,OAAO,CACL,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,GAChD;QAAE,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAA;KAAE,CAAA;CAC1C,CAAA;AAED,0CAA0C;AAC1C,MAAM,WAAW,aAAa;IAC5B,kEAAkE;IAClE,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,uCAAuC;IACvC,IAAI,EAAK,OAAO,CAAA;IAChB,+CAA+C;IAC/C,IAAI,EAAK,OAAO,CAAA;IAChB;;;;;OAKG;IACH,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB;;;;;OAKG;IACH,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB;;;;OAIG;IACH,IAAI,CAAC,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,GAAG,CAAC,EAAS,MAAM,CAAA;IACnB,MAAM,CAAC,EAAM;QAAE,KAAK,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAAA;KAAE,CAAA;IACjD,MAAM,CAAC,EAAM;QAAE,KAAK,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAAA;KAAE,CAAA;IACjD,oEAAoE;IACpE,QAAQ,CAAC,EAAI,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE,CAAC,CAAA;IAChE,yEAAyE;IACzE,SAAS,CAAC,EAAG,CAAC,OAAO,EAAE,MAAM,KAAK,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,CAAA;IAC3D,oEAAoE;IACpE,aAAa,CAAC,EAAE,MAAM,MAAM,GAAG,IAAI,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAA;IAC5D;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED,2DAA2D;AAC3D,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI,CAO1D;AAMD;;;;;GAKG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,aAAa,CA8BvD;AAID;;;;;GAKG;AACH,wBAAsB,UAAU,CAAC,IAAI,EAAE,aAAa,EAAE,IAAI,GAAE,UAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAsE5F;AA4JD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAMxF"}
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `pnpm rudder ai:eval` — discover `evals/**\/*.eval.ts` suites,
|
|
3
|
+
* run each, and report. Console reporter by default; `--json` emits
|
|
4
|
+
* a machine-readable envelope to stdout for CI.
|
|
5
|
+
*
|
|
6
|
+
* Registered from the CLI loader (`packages/cli/src/index.ts`)
|
|
7
|
+
* — the AiProvider doesn't own this so it surfaces even when the
|
|
8
|
+
* user app fails to boot, matching the `command:list --json`
|
|
9
|
+
* graceful-degradation pattern from #349.
|
|
10
|
+
*/
|
|
11
|
+
import { readdir } from 'node:fs/promises';
|
|
12
|
+
import path from 'node:path';
|
|
13
|
+
import { pathToFileURL } from 'node:url';
|
|
14
|
+
import { runSuite, reportConsole, evalSuite, stepsFromResponse } from '../eval/index.js';
|
|
15
|
+
import { reportJson } from '../eval/json-reporter.js';
|
|
16
|
+
import { reportHtml } from '../eval/html-reporter.js';
|
|
17
|
+
import { defaultFixturesDir, readFixture, writeFixture } from '../eval/fixtures.js';
|
|
18
|
+
import { AiFake } from '../fake.js';
|
|
19
|
+
/**
 * Attach the `ai:eval` command to the rudder runner.
 *
 * The handler turns the raw argument list into options, runs the eval
 * CLI and only terminates the process when the resulting exit code is
 * non-zero — a clean run returns normally.
 *
 * @param rudder Command runner exposing `command(name, handler).description(text)`.
 */
export function registerAiEvalCommand(rudder) {
    const handleAiEval = async (rawArgs) => {
        const exitCode = await runEvalCli(parseArgs(rawArgs));
        if (exitCode === 0) {
            return;
        }
        process.exit(exitCode);
    };
    const registration = rudder.command('ai:eval', handleAiEval);
    registration.description('Run eval suites — pnpm rudder ai:eval [name-pattern] [--bail] [--json] [--record|--replay] [--html <path>]');
}
|
|
27
|
+
// ─── Args parser ─────────────────────────────────────────
|
|
28
|
+
/** Flags that carry a value, given inline (`--flag=value`) or as the next argument. */
const VALUE_FLAGS = new Set(['--html']);
/**
 * Parse the rest-of-line. Recognizes:
 *   - boolean flags: `--bail`, `--json`, `--record`, `--replay`
 *   - value flags  : `--html <path>` or `--html=<path>`
 *   - one positional name filter (anything not consumed above)
 *
 * Unknown `--flags` are kept as positionals so a typo shows up as the
 * filter instead of vanishing. Only the first positional is used.
 *
 * @param {string[]} args Raw arguments following the command name.
 * @returns {{ bail: boolean, json: boolean, record?: boolean, replay?: boolean, html?: string, filter?: string }}
 * @throws {Error} When a value flag has no value: nothing follows it, the
 *   inline value is empty, or the next argument is itself a `--flag`.
 */
export function parseArgs(args) {
    const positional = [];
    const opts = { bail: false, json: false };
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        if (!a.startsWith('--')) {
            positional.push(a);
            continue;
        }
        // `--flag=value` form
        const eq = a.indexOf('=');
        const name = eq >= 0 ? a.slice(0, eq) : a;
        const inline = eq >= 0 ? a.slice(eq + 1) : undefined;
        if (name === '--bail') {
            opts.bail = true;
            continue;
        }
        if (name === '--json') {
            opts.json = true;
            continue;
        }
        if (name === '--record') {
            opts.record = true;
            continue;
        }
        if (name === '--replay') {
            opts.replay = true;
            continue;
        }
        if (VALUE_FLAGS.has(name)) {
            let value = inline;
            if (inline === undefined) {
                const next = args[i + 1];
                // Never swallow a following `--flag` as the value: `--html --json`
                // is a missing path, not a report named "--json". A path that
                // really starts with `--` can still be passed as `--html=--path`.
                if (next !== undefined && !next.startsWith('--')) {
                    value = next;
                    i++; // consumed the next arg
                }
            }
            if (!value) {
                throw new Error(`[RudderJS AI] ${name} requires a value`);
            }
            if (name === '--html') {
                opts.html = value;
            }
            continue;
        }
        // unknown flag — surface as positional so the user sees the typo
        positional.push(a);
    }
    if (positional[0]) {
        opts.filter = positional[0];
    }
    return opts;
}
|
|
81
|
+
// ─── Runner ──────────────────────────────────────────────
|
|
82
|
+
/**
 * Drive one `ai:eval` invocation and return the process exit code.
 *
 * Exit code is 0 when every executed suite passed. It is 1 when a suite
 * failed or could not be loaded, when `--record` and `--replay` are both
 * set, or when discovery finds no files. In that last case, JSON mode
 * instead writes an empty envelope to stdout and returns 0.
 *
 * Never calls `process.exit`, so tests can invoke it directly.
 *
 * @param opts Parsed CLI options.
 * @param deps Optional overrides for cwd, output streams, discovery,
 *   suite loading, config lookup and the fixtures directory.
 * @returns Promise resolving to the exit code.
 */
export async function runEvalCli(opts, deps = {}) {
    const cwd = deps.cwd ?? process.cwd();
    const stdout = deps.stdout ?? process.stdout;
    const stderr = deps.stderr ?? process.stderr;
    const relativeToCwd = (file) => path.relative(cwd, file);

    if (opts.record && opts.replay) {
        stderr.write('[ai:eval] --record and --replay are mutually exclusive\n');
        return 1;
    }

    const resolvePattern = deps.configPattern ?? loadConfigPattern;
    const pattern = (await Promise.resolve(resolvePattern())) ?? 'evals/**/*.eval.ts';
    const files = await (deps.discover ?? discoverSuiteFiles)(cwd, pattern);
    if (files.length === 0) {
        stderr.write(`[ai:eval] no suites found matching ${pattern}\n`);
        if (!opts.json) {
            return 1;
        }
        return emitJson(stdout, []);
    }

    const loader = deps.loadSuite ?? defaultSuiteLoader;
    const fixturesDir = deps.fixturesDir ?? defaultFixturesDir(cwd);
    const jsonReports = [];
    const fullReports = [];
    let exitCode = 0;

    // In replay mode the global runtime is swapped exactly once and put
    // back in the `finally` below. Each case's fixture is loaded onto this
    // fake by the decorated suite right before that case prompts the agent.
    const fake = opts.replay ? AiFake.fake() : null;
    try {
        for (const file of files) {
            let suite;
            try {
                suite = await loader(file);
            }
            catch (err) {
                stderr.write(`[ai:eval] failed to load ${relativeToCwd(file)}: ${formatError(err)}\n`);
                exitCode = 1;
                if (opts.bail) {
                    break;
                }
                continue;
            }
            if (!suite) {
                stderr.write(`[ai:eval] ${relativeToCwd(file)} has no default eval suite — skipping\n`);
                continue;
            }

            const matchesFilter = !opts.filter
                || suite.name.toLowerCase().includes(opts.filter.toLowerCase());
            if (!matchesFilter) {
                continue;
            }

            const decorated = await decorateForMode(suite, opts, { fixturesDir, stderr, fake });
            const report = await runSuite(decorated);
            fullReports.push(report);
            if (opts.json) {
                jsonReports.push(reportJson(report));
            }
            else {
                reportConsole(report, { log: (line) => stdout.write(`${line}\n`) });
            }

            if (!(report.failed > 0)) {
                continue;
            }
            exitCode = 1;
            if (opts.bail) {
                break;
            }
        }
    }
    finally {
        fake?.restore();
    }

    if (opts.json) {
        emitJson(stdout, jsonReports);
    }
    if (opts.html) {
        await writeHtmlReport(opts.html, fullReports, cwd, stderr);
    }
    return exitCode;
}
|
|
159
|
+
/**
 * Render the collected suite reports to an HTML file.
 *
 * A relative `htmlPath` is resolved against `cwd`; missing parent
 * directories are created. This is best-effort: a failure is reported
 * on `stderr` and never thrown to the caller.
 *
 * @param {string} htmlPath - Destination path (absolute or relative to `cwd`).
 * @param {Array} reports - Suite reports handed to `reportHtml`.
 * @param {string} cwd - Base directory for relative paths and for the logged path.
 * @param {{ write(chunk: string): unknown }} stderr - Sink for status messages.
 * @returns {Promise<void>}
 */
async function writeHtmlReport(htmlPath, reports, cwd, stderr) {
    const fsp = await import('node:fs/promises');
    let target = htmlPath;
    if (!path.isAbsolute(htmlPath)) {
        target = path.resolve(cwd, htmlPath);
    }
    try {
        const parentDir = path.dirname(target);
        await fsp.mkdir(parentDir, { recursive: true });
        const html = reportHtml(reports);
        await fsp.writeFile(target, html);
        stderr.write(`[ai:eval] wrote HTML report → ${path.relative(cwd, target)}\n`);
    }
    catch (err) {
        stderr.write(`[ai:eval] failed to write HTML report ${target}: ${formatError(err)}\n`);
    }
}
|
|
171
|
+
/**
 * Write the suite reports to `stdout` as a pretty-printed JSON document
 * of the shape `{ "suites": [...] }`, followed by a newline.
 *
 * @param {{ write(chunk: string): unknown }} stdout - Output sink.
 * @param {Array} suites - JSON-serializable suite reports.
 * @returns {number} Always `0`, so callers can return it as an exit code.
 */
function emitJson(stdout, suites) {
    const payload = JSON.stringify({ suites }, null, 2);
    stdout.write(`${payload}\n`);
    return 0;
}
|
|
175
|
+
/**
 * Turn any thrown value into a one-line message.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} `err.message` for `Error` instances, otherwise `String(err)`.
 */
function formatError(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
|
|
178
|
+
/**
 * Produce the suite that should actually run for the current CLI mode.
 *
 * - Neither `opts.record` nor `opts.replay`: the suite is returned as-is.
 * - `opts.replay`: every case's agent factory is wrapped so the shared
 *   fake (`ctx.fake`) is primed with that case's recorded steps.
 * - `opts.record`: every case's `assert` is wrapped so the response is
 *   written to a fixture file.
 *
 * The decoration happens per case, so the runner does not need to know
 * about fixtures; the original `agent` / `assert` functions are still
 * invoked by the wrappers.
 *
 * Replay fixtures are read up front, one case at a time, so the
 * (synchronous) wrapped factory has its steps ready when it is called.
 *
 * @param {object} suite - Suite with `name` and `spec` (`agent`, `cases`, optional `timeout`).
 * @param {object} opts - CLI options; only `record` and `replay` are read here.
 * @param {object} ctx - `{ fixturesDir, stderr, fake }`.
 * @returns {Promise<object>} The original suite, or a new one built with `evalSuite`.
 */
async function decorateForMode(suite, opts, ctx) {
    const recording = Boolean(opts.record);
    const replaying = Boolean(opts.replay);
    if (!recording && !replaying) {
        return suite;
    }
    // Unnamed cases get a positional fallback name.
    const labelFor = (testCase, index) => testCase.name ?? `case-${index}`;
    const stepsByCase = new Map();
    if (replaying) {
        for (let index = 0; index < suite.spec.cases.length; index += 1) {
            const label = labelFor(suite.spec.cases[index], index);
            try {
                const fixture = await readFixture(ctx.fixturesDir, suite.name, label);
                if (!fixture) {
                    // NOTE(review): the message says "live provider", but in replay mode
                    // ctx.fake may still be installed — confirm what the case really hits.
                    ctx.stderr.write(`[ai:eval] no fixture for ${suite.name}/${label} — running against live provider\n`);
                    continue;
                }
                stepsByCase.set(label, fixture.steps);
            }
            catch (err) {
                // A broken fixture is reported but does not abort the run.
                ctx.stderr.write(`[ai:eval] fixture load error for ${suite.name}/${label}: ${formatError(err)}\n`);
            }
        }
    }
    const decoratedCases = suite.spec.cases.map((testCase, index) => {
        const label = labelFor(testCase, index);
        const originalFactory = testCase.agent ?? suite.spec.agent;
        let agent = originalFactory;
        if (replaying) {
            agent = wrapReplayFactory(originalFactory, stepsByCase.get(label), ctx.fake);
        }
        let check = testCase.assert;
        if (recording) {
            check = wrapRecordAssert(testCase.assert, suite.name, label, testCase.input, ctx);
        }
        const entry = { input: testCase.input, assert: check, agent };
        // Optional fields are copied only when present, so they are never set to `undefined`.
        if (testCase.name) {
            entry.name = testCase.name;
        }
        if (testCase.timeout !== undefined) {
            entry.timeout = testCase.timeout;
        }
        if (testCase.skip !== undefined) {
            entry.skip = testCase.skip;
        }
        return entry;
    });
    const spec = { agent: suite.spec.agent, cases: decoratedCases };
    if (suite.spec.timeout !== undefined) {
        spec.timeout = suite.spec.timeout;
    }
    return evalSuite(suite.name, spec);
}
|
|
244
|
+
/**
 * Replay path: build an agent factory that primes the shared fake with
 * a case's recorded steps right before delegating to the original
 * factory.
 *
 * When there is no fake or no recorded steps, nothing is primed and the
 * original factory is simply invoked; the agent then runs against
 * whatever responses are currently in effect.
 *
 * @param {Function} base - Original agent factory (called with no arguments).
 * @param {Array|undefined} steps - Recorded steps for this case, if any.
 * @param {object|null} fake - Fake exposing `respondWithSequence(steps)`, or `null`.
 * @returns {Function} A zero-argument factory returning `base()`'s result.
 */
function wrapReplayFactory(base, steps, fake) {
    if (!fake || !steps) {
        return () => base();
    }
    return () => {
        fake.respondWithSequence(steps);
        return base();
    };
}
|
|
259
|
+
/**
|
|
260
|
+
* Record path: after each case's assertion runs, capture the
|
|
261
|
+
* agent response's assistant turns to the fixture file. Wrapping
|
|
262
|
+
* the assert is the cleanest hook — the runner already passes
|
|
263
|
+
* `response` into it, and the wrapped fn still returns the
|
|
264
|
+
* original assertion's result.
|
|
265
|
+
*/
|
|
266
|
+
function wrapRecordAssert(base, suite, caseName, input, ctx) {
|
|
267
|
+
return async (response, mctx) => {
|
|
268
|
+
try {
|
|
269
|
+
const file = await writeFixture(ctx.fixturesDir, suite, caseName, {
|
|
270
|
+
input,
|
|
271
|
+
steps: stepsFromResponse(response),
|
|
272
|
+
});
|
|
273
|
+
ctx.stderr.write(`[ai:eval] recorded ${path.relative(process.cwd(), file)}\n`);
|
|
274
|
+
}
|
|
275
|
+
catch (err) {
|
|
276
|
+
ctx.stderr.write(`[ai:eval] failed to record ${suite}/${caseName}: ${formatError(err)}\n`);
|
|
277
|
+
}
|
|
278
|
+
return base(response, mctx);
|
|
279
|
+
};
|
|
280
|
+
}
|
|
281
|
+
// ─── File discovery ──────────────────────────────────────
|
|
282
|
+
/**
 * Find eval suite files below `cwd` for a `<dir>/**\/*<suffix>`-style
 * pattern.
 *
 * The pattern is reduced to a root directory and a file-name suffix,
 * the root is walked recursively, and the matching absolute paths are
 * returned in default `Array.prototype.sort` order so output and
 * `--bail` ordering are stable between runs.
 *
 * @param {string} cwd - Directory the pattern's root is resolved against.
 * @param {string} pattern - Supported pattern (see `parsePattern`).
 * @returns {Promise<string[]>} Sorted absolute file paths.
 */
export async function discoverSuiteFiles(cwd, pattern) {
    const parsed = parsePattern(pattern);
    const startDir = path.resolve(cwd, parsed.root);
    const matches = [];
    await walk(startDir, parsed.suffix, matches);
    matches.sort();
    return matches;
}
|
|
294
|
+
/**
 * Tiny pattern parser — supports `<dir>/**\/*<suffix>` and bare
 * `*<suffix>` (rooted at the current directory). Anything more
 * elaborate is rejected and left to userland (run a custom script that
 * imports `runSuite`).
 *
 * Examples:
 *   `evals/**\/*.eval.ts`    → root=`evals`, suffix=`.eval.ts`
 *   `tests/agents/**\/*.ts`  → root=`tests/agents`, suffix=`.ts`
 *   `*.eval.ts`              → root=`.`, suffix=`.eval.ts`
 *
 * @param {string} pattern - The eval file pattern.
 * @returns {{ root: string, suffix: string }} Literal root directory and
 *   file-name suffix to match.
 * @throws {Error} When the pattern is not one of the supported shapes,
 *   including a `*` left over in the root or suffix, or a `/` in the suffix.
 */
function parsePattern(pattern) {
    const unsupported = () => new Error(`[RudderJS AI] Unsupported eval pattern "${pattern}". Expected <dir>/**/*<suffix> or *<suffix>.`);
    const doubleStar = pattern.indexOf('**');
    let prefix;
    let postfix;
    if (doubleStar >= 0) {
        // Split around the first `**`, dropping the slash on either side of it.
        prefix = pattern.slice(0, doubleStar).replace(/\/$/, '');
        postfix = pattern.slice(doubleStar + 2).replace(/^\//, '');
    }
    else {
        const lastSlash = pattern.lastIndexOf('/');
        prefix = lastSlash >= 0 ? pattern.slice(0, lastSlash) : '';
        postfix = lastSlash >= 0 ? pattern.slice(lastSlash + 1) : pattern;
    }
    if (!postfix.startsWith('*')) {
        throw unsupported();
    }
    const suffix = postfix.slice(1);
    // The root is used as a literal directory and the suffix is compared
    // against file base names, so a leftover wildcard or a path separator
    // could never match anything. Fail loudly instead of silently finding
    // no suites.
    if (prefix.includes('*') || suffix.includes('*') || suffix.includes('/')) {
        throw unsupported();
    }
    return {
        root: prefix || '.',
        suffix,
    };
}
|
|
326
|
+
/**
 * Recursively collect files under `dir` whose base name ends with
 * `suffix`, appending their full paths to `out`.
 *
 * A missing directory (`ENOENT`) is treated as empty; any other read
 * error is rethrown. Directories named `node_modules` and directories
 * whose name starts with `.` are not descended into.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} suffix - Required file-name ending.
 * @param {string[]} out - Accumulator, mutated in place.
 * @returns {Promise<void>}
 */
async function walk(dir, suffix, out) {
    let children;
    try {
        children = await readdir(dir, { withFileTypes: true });
    }
    catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
        return;
    }
    for (const child of children) {
        const fullPath = path.join(dir, child.name);
        if (!child.isDirectory()) {
            if (child.isFile() && child.name.endsWith(suffix)) {
                out.push(fullPath);
            }
            continue;
        }
        const isIgnored = child.name === 'node_modules' || child.name.startsWith('.');
        if (!isIgnored) {
            await walk(fullPath, suffix, out);
        }
    }
}
|
|
348
|
+
// ─── Suite loader ────────────────────────────────────────
|
|
349
|
+
/**
 * Import a suite module and return the suite it exports.
 *
 * The `default` export is preferred, falling back to a named `suite`
 * export. The value is only accepted when it has a string `name` and a
 * truthy `spec`; otherwise `null` is returned.
 *
 * @param {string} file - Path of the module to import.
 * @returns {Promise<object|null>} The exported suite, or `null` if none qualifies.
 */
async function defaultSuiteLoader(file) {
    const { href } = pathToFileURL(file);
    const mod = await import(href);
    const candidate = mod['default'] ?? mod['suite'];
    const looksLikeSuite = Boolean(candidate)
        && typeof candidate.name === 'string'
        && Boolean(candidate.spec);
    return looksLikeSuite ? candidate : null;
}
|
|
356
|
+
// ─── Config lookup ───────────────────────────────────────
|
|
357
|
+
/**
 * Look up `config('ai').eval.pattern` via `@rudderjs/core`.
 *
 * Resolves to `null` — meaning "use the default pattern" — when the
 * package cannot be imported, does not export a `config` function, the
 * lookup throws, or no pattern is configured. The CLI must keep working
 * in introspective mode (#349), so no failure here is fatal.
 *
 * @returns {Promise<string|null>} The configured pattern, or `null`.
 */
async function loadConfigPattern() {
    try {
        // Imported dynamically so `@rudderjs/core` (an optional peer)
        // never becomes part of the static module graph.
        const rudderCore = await import('@rudderjs/core');
        const hasConfig = typeof rudderCore.config === 'function';
        const pattern = hasConfig ? rudderCore.config('ai')?.eval?.pattern : undefined;
        return pattern ?? null;
    }
    catch {
        // Deliberate best-effort: fall back to the default pattern.
        return null;
    }
}
|
|
378
|
+
//# sourceMappingURL=ai-eval.js.map
|