@pugi/cli 0.1.0-beta.31 → 0.1.0-beta.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/smoke.js +133 -0
- package/dist/core/auth/ensure-authenticated.js +129 -0
- package/dist/core/bash-classifier.js +108 -1
- package/dist/core/codegraph/decision-store.js +248 -0
- package/dist/core/codegraph/detect-repo.js +459 -0
- package/dist/core/codegraph/install.js +134 -0
- package/dist/core/codegraph/offer-hook.js +220 -0
- package/dist/core/diagnostics/probes/status-snapshot.js +50 -4
- package/dist/core/onboarding/ensure-initialized.js +133 -0
- package/dist/core/repl/session.js +370 -9
- package/dist/core/repl/slash-commands.js +68 -5
- package/dist/core/smoke/headless-driver.js +174 -0
- package/dist/core/smoke/orchestrator.js +194 -0
- package/dist/core/smoke/runner.js +238 -0
- package/dist/core/smoke/scenario-parser.js +316 -0
- package/dist/runtime/cli.js +453 -11
- package/dist/runtime/commands/cancel.js +231 -0
- package/dist/runtime/commands/codegraph-status.js +227 -0
- package/dist/runtime/commands/permissions.js +23 -0
- package/dist/runtime/commands/redo-blob-store.js +92 -0
- package/dist/runtime/commands/redo.js +361 -0
- package/dist/runtime/commands/status.js +11 -3
- package/dist/runtime/commands/undo.js +32 -0
- package/dist/runtime/headless-repl.js +195 -0
- package/dist/runtime/version.js +1 -1
- package/dist/tui/permissions-picker.js +78 -0
- package/dist/tui/render.js +35 -0
- package/dist/tui/status-bar.js +1 -1
- package/dist/tui/tool-stream-pane.js +45 -3
- package/package.json +7 -4
- package/test/scenarios/codegen-create-file.scenario.txt +13 -0
- package/test/scenarios/compact-force.scenario.txt +11 -0
- package/test/scenarios/identity.scenario.txt +11 -0
- package/test/scenarios/persona-handoff.scenario.txt +11 -0
- package/test/scenarios/walkback.scenario.txt +12 -0
- package/dist/core/engine/compaction-hook.js +0 -154
- package/dist/core/init/scaffold.js +0 -195
- package/dist/core/memory/dual-write.spec.js +0 -297
- package/dist/core/memory-sync/queue.spec.js +0 -105
- package/dist/core/repl/codebase-survey.js +0 -308
- package/dist/core/repl/init-interview.js +0 -457
- package/dist/core/repl/onboarding-state.js +0 -297
- package/dist/runtime/commands/memory.spec.js +0 -174
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headless subprocess driver for the smoke harness (Phase 1, 2026-05-27).
|
|
3
|
+
*
|
|
4
|
+
* Spawns `pugi --headless`, feeds the scenario's user-input lines one
|
|
5
|
+
* at a time, and collects every JSON envelope written to stdout. The
|
|
6
|
+
* driver is the bridge between the in-process scenario parser and the
|
|
7
|
+
* real CLI binary — running scenarios via subprocess is what lets
|
|
8
|
+
* `pugi smoke` validate the AS-PUBLISHED CLI, not just the in-process
|
|
9
|
+
* code path.
|
|
10
|
+
*
|
|
11
|
+
* Phase 1 invariants:
|
|
12
|
+
*
|
|
13
|
+
* - One scenario = one subprocess. We do NOT reuse processes across
|
|
14
|
+
* scenarios; each run starts in a fresh temp workspace so file
|
|
15
|
+
* assertions are deterministic.
|
|
16
|
+
*
|
|
17
|
+
* - Stdin is fed line-by-line on a timer (default 200ms gap). We
|
|
18
|
+
* could fire all lines at once, but spacing them out gives the
|
|
19
|
+
* engine a chance to emit `persona-turn` envelopes between user
|
|
20
|
+
* turns. Mirrors real-operator typing cadence — feeds a more
|
|
21
|
+
* realistic test signal than a burst write.
|
|
22
|
+
*
|
|
23
|
+
* - Hard timeout per scenario (default 30s). The driver kills the
|
|
24
|
+
* subprocess (SIGTERM) on timeout and hands back whatever envelopes it
|
|
25
|
+
* captured, so the report still shows the row instead of hanging CI.
|
|
26
|
+
*
|
|
27
|
+
* - Stderr is captured but NOT parsed. We forward it to the optional
|
|
28
|
+
* `onStderr` sink so the operator can see what the headless CLI
|
|
29
|
+
* wrote to stderr (banners, MCP load notices) — handy for
|
|
30
|
+
* debugging a failing scenario locally.
|
|
31
|
+
*/
|
|
32
|
+
import { spawn } from 'node:child_process';
|
|
33
|
+
import { mkdtempSync, rmSync } from 'node:fs';
|
|
34
|
+
import { tmpdir } from 'node:os';
|
|
35
|
+
import { resolve } from 'node:path';
|
|
36
|
+
/**
 * Run a parsed scenario through the headless CLI.
 *
 * Creates a fresh temp workspace, spawns `<pugiBin> --headless` inside it,
 * writes every `user-input` step to the child's stdin (one line per step,
 * pausing `lineGapMs` between lines) and collects each stdout line that
 * `parseEnvelopeLine` accepts.
 *
 * The workspace is left on disk after a successful run so the caller can
 * resolve `EXPECT_FILE` paths against it; it is only removed here when the
 * run fails before producing a usable result.
 *
 * @param {object} scenario - Parsed scenario; only `scenario.steps` is read.
 * @param {object} [opts]
 * @param {string} [opts.pugiBin='pugi'] - Executable to spawn.
 * @param {string[]} [opts.extraArgs=[]] - Arguments appended after `--headless`.
 * @param {number} [opts.timeoutMs=30000] - Delay before the child is sent SIGTERM.
 * @param {number} [opts.lineGapMs=200] - Pause after each stdin line.
 * @param {(chunk: string) => void} [opts.onStderr] - Sink for raw stderr chunks.
 * @param {Function} [opts.spawner] - `(bin, args, { cwd, env })` returning a
 *   child-process-like object; injectable so tests can avoid a real spawn.
 * @returns {Promise<{ envelopes: object[], workspaceRoot: string, timedOut: boolean }>}
 *   `timedOut` is true when the timeout fired and the child was signalled.
 * @throws Re-throws whatever the spawner throws, or the first error the
 *   child emits (for example a missing binary), after removing the workspace.
 */
export async function runHeadlessScenario(scenario, opts = {}) {
    const pugiBin = opts.pugiBin ?? 'pugi';
    const extraArgs = opts.extraArgs ?? [];
    const timeoutMs = opts.timeoutMs ?? 30_000;
    const lineGapMs = opts.lineGapMs ?? 200;
    const onStderr = opts.onStderr ?? noopChunk;
    const spawner = opts.spawner ?? defaultSpawner;
    const workspaceRoot = mkdtempSync(resolve(tmpdir(), 'pugi-smoke-'));
    // Failure paths only: nobody will inspect a workspace the child never used.
    const discardWorkspace = () => {
        try {
            rmSync(workspaceRoot, { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
    };
    let child;
    try {
        child = spawner(pugiBin, ['--headless', ...extraArgs], {
            cwd: workspaceRoot,
            env: { ...process.env, PUGI_HEADLESS: '1' },
        });
    }
    catch (error) {
        discardWorkspace();
        throw error;
    }
    // Attach lifecycle listeners before anything else: a failed spawn reports
    // through an `error` event, and an unhandled `error` event would take the
    // whole host process down instead of failing just this scenario.
    let childError = null;
    let childClosed = false;
    const closed = new Promise((resolveClosed) => {
        const settle = () => {
            childClosed = true;
            resolveClosed();
        };
        child.on('error', (error) => {
            if (childError === null)
                childError = error;
            settle();
        });
        child.on('close', settle);
    });
    // Writing to a child that already exited raises EPIPE on stdin. That is
    // expected when a scenario ends early, so it must not become an uncaught
    // exception. (`?.` tolerates minimal test doubles without an emitter API.)
    child.stdin.on?.('error', () => { });
    const envelopes = [];
    let stdoutBuffer = '';
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
        // Everything before the last newline is complete; the remainder is
        // carried over until the next chunk (or the final flush below).
        const pieces = (stdoutBuffer + chunk).split('\n');
        stdoutBuffer = pieces.pop() ?? '';
        for (const piece of pieces) {
            const parsed = parseEnvelopeLine(piece);
            if (parsed)
                envelopes.push(parsed);
        }
    });
    child.stderr.setEncoding('utf8');
    child.stderr.on('data', (chunk) => onStderr(chunk));
    const userInputs = scenario.steps
        .filter((s) => s.kind === 'user-input')
        .map((s) => s.body);
    // Feed inputs serially with a small gap. We don't await the engine's
    // response — the whole envelope stream is evaluated after the process
    // closes, so the gap only gives the engine room to emit between turns.
    // The gap is raced against `closed` so a child that exits early stops the
    // feed immediately instead of leaving us waiting on unref'd timers.
    const feedPromise = (async () => {
        for (const line of userInputs) {
            if (childClosed)
                return;
            child.stdin.write(`${line}\n`);
            await Promise.race([delay(lineGapMs), closed]);
        }
        if (!childClosed)
            child.stdin.end();
    })();
    let timedOut = false;
    const timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill('SIGTERM');
    }, timeoutMs);
    // Don't keep node alive purely for the timeout — the running child
    // already holds the event loop open.
    timeoutHandle.unref?.();
    await closed;
    clearTimeout(timeoutHandle);
    // Flush any tail bytes that did not end with a newline.
    if (stdoutBuffer.length > 0) {
        const parsed = parseEnvelopeLine(stdoutBuffer);
        if (parsed)
            envelopes.push(parsed);
        stdoutBuffer = '';
    }
    await feedPromise.catch(() => {
        /* a write can still fail when the child closes mid-feed — ignore */
    });
    if (childError !== null) {
        discardWorkspace();
        throw childError;
    }
    return { envelopes, workspaceRoot, timedOut };
}
|
|
117
|
+
/**
 * Remove a workspace directory and everything beneath it.
 *
 * Kept separate from `runHeadlessScenario` so a caller can decide when the
 * directory is no longer needed. Failures are swallowed on purpose: a
 * leftover temp directory must never fail a run.
 *
 * @param {string} workspaceRoot - Directory to delete recursively.
 * @returns {void}
 */
export function cleanupWorkspace(workspaceRoot) {
    const removal = { recursive: true, force: true };
    try {
        rmSync(workspaceRoot, removal);
    }
    catch (_ignored) {
        // Best-effort teardown — nothing useful to do if removal fails.
        return;
    }
}
|
|
130
|
+
/**
 * Turn one line of headless stdout into an envelope object.
 *
 * A line qualifies only when, after trimming, it starts with `{`, parses as
 * JSON, and carries string `kind` and `body` fields. Anything else yields
 * `null` so stray banner output is tolerated rather than treated as fatal.
 *
 * @param {string} raw - One stdout line (surrounding whitespace is ignored).
 * @returns {{ kind: string, body: string, ts: number } | null} The envelope,
 *   with `ts` coerced to a finite number (falling back to `Date.now()`).
 */
export function parseEnvelopeLine(raw) {
    const text = raw.trim();
    if (text.length === 0 || !text.startsWith('{'))
        return null;
    let candidate;
    try {
        candidate = JSON.parse(text);
    }
    catch {
        return null;
    }
    const kind = candidate['kind'];
    const body = candidate['body'];
    if (typeof kind !== 'string' || typeof body !== 'string')
        return null;
    // Numeric timestamps pass through; anything else is coerced, and a
    // missing value is stamped with the current time.
    let ts = candidate['ts'];
    if (typeof ts !== 'number')
        ts = Number(ts ?? Date.now());
    if (!Number.isFinite(ts))
        ts = Date.now();
    return { kind, body, ts };
}
|
|
160
|
+
/**
 * Production spawner: start `bin` with `args`, forwarding only `cwd` and
 * `env` from `options` and wiring stdin, stdout and stderr as pipes.
 *
 * @param {string} bin - Executable to run.
 * @param {string[]} args - Command-line arguments.
 * @param {{ cwd: string, env: object }} options - Working directory and environment.
 * @returns {import('node:child_process').ChildProcess}
 */
function defaultSpawner(bin, args, options) {
    const stdio = ['pipe', 'pipe', 'pipe'];
    const spawnOptions = { cwd: options.cwd, env: options.env, stdio };
    return spawn(bin, args, spawnOptions);
}
|
|
167
|
+
/**
 * Resolve (with no value) after roughly `ms` milliseconds.
 *
 * The timer is unref'd where the runtime supports it, so a pending delay
 * does not by itself keep the process alive.
 *
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>}
 */
function delay(ms) {
    let settle;
    const pending = new Promise((done) => {
        settle = done;
    });
    const timer = setTimeout(settle, ms);
    timer.unref?.();
    return pending;
}
|
|
173
|
+
/**
 * Default stderr sink: accepts a chunk and deliberately discards it.
 *
 * @param {string} _chunk - Ignored.
 * @returns {void}
 */
function noopChunk(_chunk) {
    return undefined;
}
|
|
174
|
+
//# sourceMappingURL=headless-driver.js.map
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smoke orchestrator — glues the scenario parser, the headless
|
|
3
|
+
* subprocess driver, and the runner into a single "load → run → report"
|
|
4
|
+
* pipeline. The CLI surface (`pugi smoke`) and the standalone script
|
|
5
|
+
* (`scripts/run-scenarios.ts`) both call into this module so the two
|
|
6
|
+
* entry points share one code path.
|
|
7
|
+
*
|
|
8
|
+
* Phase 1 boundary — `runSmoke` is responsible for:
|
|
9
|
+
*
|
|
10
|
+
* 1. Discovering scenario files under `scenariosDir` (glob match on
|
|
11
|
+
* `*.scenario.txt`).
|
|
12
|
+
* 2. Parsing each file via `parseScenario`. Parse errors are surfaced
|
|
13
|
+
* via the report but do not stop the run — every scenario gets a
|
|
14
|
+
* chance to fail with a clean diagnostic.
|
|
15
|
+
* 3. Driving each scenario through the headless executor (the
|
|
16
|
+
* executor is injected so tests can swap it for a deterministic
|
|
17
|
+
* stub; production wires `runHeadlessScenario` from
|
|
18
|
+
* `headless-driver.ts`).
|
|
19
|
+
* 4. Filtering by `--filter <pattern>` (compiles to fnmatch-lite).
|
|
20
|
+
* 5. Computing pass/fail/summary numbers.
|
|
21
|
+
*
|
|
22
|
+
* The orchestrator is intentionally synchronous (apart from the
|
|
23
|
+
* per-scenario `await`) — running scenarios in parallel is a Phase 2
|
|
24
|
+
* concern. The corpus is small and sequential output is easier to read.
|
|
25
|
+
*/
|
|
26
|
+
import { readdirSync, readFileSync, statSync } from 'node:fs';
|
|
27
|
+
import { resolve } from 'node:path';
|
|
28
|
+
import { parseScenario, } from './scenario-parser.js';
|
|
29
|
+
import { runScenario, } from './runner.js';
|
|
30
|
+
/**
 * Top-level smoke entry: load every scenario under `opts.scenariosDir`,
 * optionally narrow the set with `opts.filter`, push each remaining
 * scenario through `opts.executor`, and grade the captured envelopes with
 * `runScenario`.
 *
 * Scenarios with parse errors, and scenarios whose executor throws, are
 * recorded as failed rows without stopping the run. Scenarios run one at a
 * time, in the order the loader returned them.
 *
 * @param {object} opts
 * @param {string} opts.scenariosDir - Directory handed to `loadScenariosFromDir`.
 * @param {Function} opts.executor - `(scenario) => Promise<{ envelopes, workspaceRoot }>`.
 * @param {string} [opts.filter] - Id filter; empty or missing means "run all".
 * @param {(line: string) => void} [opts.log] - Progress sink (default: discard).
 * @param {() => number} [opts.now] - Clock forwarded to `runScenario`.
 * @returns {Promise<{ total: number, passed: number, failed: number,
 *   skipped: number, results: object[], exitCode: number }>} `exitCode` is 0
 *   only when nothing failed; `skipped` counts scenarios removed by the filter.
 */
export async function runSmoke(opts) {
    const log = opts.log ?? noopLog;
    const now = opts.now ?? Date.now;
    const allScenarios = loadScenariosFromDir(opts.scenariosDir);
    let selected = allScenarios;
    if (opts.filter && opts.filter.length > 0) {
        selected = filterByPattern(allScenarios, opts.filter);
    }
    const results = [];
    const counts = { passed: 0, failed: 0 };
    // Rows that never reach the assertion pass share one zeroed shape.
    const recordAbort = (scenario, status, extra) => {
        results.push({
            id: scenario.id,
            filePath: scenario.filePath,
            status,
            durationMs: 0,
            assertionCount: 0,
            failures: [],
            ...extra,
        });
        counts.failed += 1;
    };
    for (const { scenario, parseErrors } of selected) {
        log(`pugi smoke: running ${scenario.id}`);
        if (parseErrors.length > 0) {
            recordAbort(scenario, 'parse-error', { parseErrors });
            continue;
        }
        let envelopes;
        let workspaceRoot;
        try {
            // Property reads stay inside the `try` so a malformed executor
            // result is reported as an executor error too.
            const out = await opts.executor(scenario);
            envelopes = out.envelopes;
            workspaceRoot = out.workspaceRoot;
        }
        catch (error) {
            const executorError = error instanceof Error ? error.message : String(error);
            recordAbort(scenario, 'executor-error', { executorError });
            continue;
        }
        const verdict = runScenario({ scenario, envelopes, workspaceRoot, now });
        results.push({
            id: verdict.id,
            filePath: scenario.filePath,
            status: verdict.passed ? 'passed' : 'failed',
            durationMs: verdict.durationMs,
            assertionCount: verdict.assertionCount,
            failures: verdict.failures,
        });
        counts[verdict.passed ? 'passed' : 'failed'] += 1;
    }
    return {
        total: selected.length,
        passed: counts.passed,
        failed: counts.failed,
        skipped: allScenarios.length - selected.length,
        results,
        exitCode: counts.failed === 0 ? 0 : 1,
    };
}
|
|
104
|
+
/**
 * Collect the `*.scenario.txt` files directly inside `dir` (no recursion)
 * and parse each one with `parseScenario`.
 *
 * Returns `[]` when the directory cannot be listed. Entries that cannot be
 * stat'ed or are not regular files are ignored, and so are files for which
 * the parser returns no scenario object. Each returned item pairs the
 * parsed scenario with the parse errors collected for it.
 *
 * @param {string} dir - Directory to scan.
 * @returns {{ scenario: object, parseErrors: object[] }[]} Items sorted by
 *   scenario id so output order does not depend on readdir order.
 */
export function loadScenariosFromDir(dir) {
    let entries;
    try {
        entries = readdirSync(dir);
    }
    catch {
        return [];
    }
    const loaded = [];
    const candidates = entries.filter((entry) => entry.endsWith('.scenario.txt'));
    for (const entry of candidates) {
        const filePath = resolve(dir, entry);
        let info;
        try {
            info = statSync(filePath);
        }
        catch {
            // The entry vanished or is unreadable between readdir and stat.
            continue;
        }
        if (info.isFile()) {
            const text = readFileSync(filePath, 'utf8');
            const outcome = parseScenario(filePath, text);
            if (outcome.scenario) {
                loaded.push({ scenario: outcome.scenario, parseErrors: outcome.errors });
            }
        }
    }
    return loaded.sort((left, right) => left.scenario.id.localeCompare(right.scenario.id));
}
|
|
142
|
+
/**
 * Format a smoke report as plain text, one headline per result followed by
 * its detail lines, then a blank line and a summary line.
 *
 * Headlines are `ok N - id (Xms)` for passed rows and `not ok N - id (...)`
 * for failed, parse-error and executor-error rows, where N is the row's
 * 1-based position in `report.results`. Rows with any other status produce
 * no output. The summary mentions skipped scenarios only when there are any.
 *
 * @param {{ results: object[], passed: number, failed: number, skipped: number }} report
 * @returns {string} Lines joined with `\n`.
 */
export function renderReportText(report) {
    const out = [];
    report.results.forEach((row, index) => {
        if (!row)
            return;
        const ordinal = index + 1;
        switch (row.status) {
            case 'passed':
                out.push(`ok ${ordinal} - ${row.id} (${row.durationMs}ms)`);
                break;
            case 'failed':
                out.push(`not ok ${ordinal} - ${row.id} (${row.durationMs}ms)`);
                out.push(...row.failures.map((failure) => ` line ${failure.line}: ${failure.message}`));
                break;
            case 'parse-error': {
                const problems = row.parseErrors ?? [];
                out.push(`not ok ${ordinal} - ${row.id} (parse error)`);
                out.push(...problems.map((problem) => ` line ${problem.line}: ${problem.message}`));
                break;
            }
            case 'executor-error':
                out.push(`not ok ${ordinal} - ${row.id} (executor error)`);
                out.push(` ${row.executorError ?? 'unknown executor failure'}`);
                break;
            default:
                break;
        }
    });
    out.push('');
    let summary = `pugi smoke: ${report.passed} passed, ${report.failed} failed`;
    if (report.skipped > 0) {
        summary += `, ${report.skipped} skipped`;
    }
    out.push(summary);
    return out.join('\n');
}
|
|
185
|
+
/**
 * Keep the loaded scenarios whose id matches `pattern`.
 *
 * Without a `*` the pattern is a plain substring test. With one or more
 * `*`, the pattern must match the WHOLE id, where `*` stands for any run of
 * characters and every other character is taken literally.
 *
 * @param {{ scenario: { id: string } }[]} scenarios - Loaded scenario items.
 * @param {string} pattern - Substring or `*` glob.
 * @returns {object[]} The matching items, in their original order.
 */
function filterByPattern(scenarios, pattern) {
    if (!pattern.includes('*')) {
        return scenarios.filter((s) => s.scenario.id.includes(pattern));
    }
    // Escape each literal segment on its own, then join the segments with
    // `.*`. The escape class covers every RegExp metacharacter, including
    // `?`: left unescaped it acts as a quantifier ("what?*" would also match
    // "whatnow") and a leading `?` makes the RegExp constructor throw.
    const source = pattern
        .split('*')
        .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('.*');
    const re = new RegExp(`^${source}$`);
    return scenarios.filter((s) => re.test(s.scenario.id));
}
|
|
193
|
+
/**
 * Default progress sink: accepts a log line and deliberately discards it.
 *
 * @param {string} _line - Ignored.
 * @returns {void}
 */
function noopLog(_line) {
    return undefined;
}
|
|
194
|
+
//# sourceMappingURL=orchestrator.js.map
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scenario runner for the Pugi MCP test harness (BIG TRACK 10 Phase 1,
|
|
3
|
+
* 2026-05-27).
|
|
4
|
+
*
|
|
5
|
+
* Drives a parsed scenario against a stream of `HeadlessEnvelope`
|
|
6
|
+
* objects (the same shape `pugi --headless` emits on stdout) and a
|
|
7
|
+
* filesystem checker for `EXPECT_FILE` directives. The runner is
|
|
8
|
+
* deliberately decoupled from the subprocess spawn so the spec file
|
|
9
|
+
* can inject deterministic envelope arrays without spawning a real
|
|
10
|
+
* `pugi` binary — that strategy keeps the test cycle under 200ms while
|
|
11
|
+
* still exercising the matching semantics every CI run depends on.
|
|
12
|
+
*
|
|
13
|
+
* Matching semantics (the rules the corpus authors care about):
|
|
14
|
+
*
|
|
15
|
+
* - `EXPECT:` after a `>` user-input line scans envelopes that
|
|
16
|
+
* arrived AFTER that user-input. The cursor resets on each new
|
|
17
|
+
* `>`. If no envelope satisfies the pattern, the assertion fails.
|
|
18
|
+
*
|
|
19
|
+
* - `EXPECT_NOT:` runs the inverse — passes if NO envelope in the
|
|
20
|
+
* post-`>` window satisfies the pattern. A negative assertion that
|
|
21
|
+
* fires on every input line gives the operator a clean signal when
|
|
22
|
+
* a forbidden phrase ("Mira") shows up.
|
|
23
|
+
*
|
|
24
|
+
* - `EXPECT_FILE:` runs once at the END of the scenario, against the
|
|
25
|
+
* final filesystem snapshot. The runner does not race the
|
|
26
|
+
* subprocess — by the time we evaluate file assertions the
|
|
27
|
+
* headless process has exited (or been terminated).
|
|
28
|
+
*
|
|
29
|
+
* Result shape mirrors `node:test` style: top-level pass/fail plus an
|
|
30
|
+
* array of per-assertion records so the CLI can print a grouped
|
|
31
|
+
* summary. Each failure carries the originating line number so the
|
|
32
|
+
* operator can jump straight to the scenario source.
|
|
33
|
+
*/
|
|
34
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
35
|
+
import { resolve } from 'node:path';
|
|
36
|
+
/**
 * Evaluate every assertion in `inputs.scenario` against a captured envelope
 * stream and the files under `inputs.workspaceRoot`.
 *
 * - `expect` steps are grouped under the `user-input` step that precedes
 *   them. The k-th user-input run is checked against the envelopes that
 *   follow the k-th `user-turn` envelope, up to (not including) the next
 *   `user-turn`. When no matching `user-turn` envelope exists, the run is
 *   checked against the whole stream.
 * - `expect` steps that appear before any `user-input` form a synthetic
 *   pre-run that is checked against the whole stream.
 * - `expect-file` steps are checked last: the file must exist and, when
 *   `content` is given, contain it as a substring.
 *
 * The only I/O performed is the filesystem access needed by `expect-file`.
 *
 * @param {object} inputs
 * @param {{ id: string, steps: object[] }} inputs.scenario - Parsed scenario.
 * @param {{ kind: string, body: string }[]} inputs.envelopes - Captured stream.
 * @param {string} inputs.workspaceRoot - Base directory for `expect-file` paths.
 * @param {() => number} [inputs.now] - Clock used for `durationMs`.
 * @returns {{ id: string, passed: boolean, failures: { line: number, message: string }[],
 *   durationMs: number, assertionCount: number }}
 */
export function runScenario(inputs) {
    const now = inputs.now ?? Date.now;
    const startedAt = now();
    const failures = [];
    let assertionCount = 0;
    // Group steps into runs anchored by `>` user-input lines. EXPECT_FILE
    // steps are collected separately and evaluated after all runs.
    const runs = [];
    const fileChecks = [];
    for (const step of inputs.scenario.steps) {
        if (step.kind === 'user-input') {
            runs.push({ userStep: step, expects: [] });
        }
        else if (step.kind === 'expect') {
            // Assertions ahead of the first `>` get a synthetic pre-run so
            // they are still evaluated rather than silently dropped.
            if (runs.length === 0)
                runs.push({ userStep: null, expects: [] });
            runs[runs.length - 1].expects.push(step);
        }
        else if (step.kind === 'expect-file') {
            fileChecks.push(step);
        }
    }
    // Positions of the `user-turn` markers that delimit each run's window.
    const userTurnIndices = [];
    inputs.envelopes.forEach((env, index) => {
        if (env && env.kind === 'user-turn')
            userTurnIndices.push(index);
    });
    // Counts user-input runs only. Indexing by position in `runs` would be
    // off by one whenever the synthetic pre-run occupies slot 0, pairing
    // every run with the NEXT turn's envelopes.
    let userRunOrdinal = 0;
    for (const run of runs) {
        let start = 0;
        let end = inputs.envelopes.length;
        if (run.userStep) {
            const marker = userTurnIndices[userRunOrdinal];
            if (marker !== undefined) {
                start = marker + 1;
                end = userTurnIndices[userRunOrdinal + 1] ?? inputs.envelopes.length;
            }
            userRunOrdinal += 1;
        }
        const window = inputs.envelopes.slice(start, end);
        for (const expectation of run.expects) {
            assertionCount += 1;
            const matched = window.some((env) => matchesEnvelope(env, expectation.pattern));
            if (expectation.polarity === 'positive' && !matched) {
                failures.push({
                    line: expectation.line,
                    message: `EXPECT failed — no envelope matched ${describePattern(expectation.pattern)}`,
                });
            }
            else if (expectation.polarity === 'negative' && matched) {
                failures.push({
                    line: expectation.line,
                    message: `EXPECT_NOT failed — envelope matched ${describePattern(expectation.pattern)}`,
                });
            }
        }
    }
    for (const check of fileChecks) {
        assertionCount += 1;
        const absolute = resolve(inputs.workspaceRoot, check.file);
        if (!existsSync(absolute)) {
            failures.push({
                line: check.line,
                message: `EXPECT_FILE failed — ${check.file} does not exist`,
            });
            continue;
        }
        if (check.content === undefined)
            continue;
        let body;
        try {
            body = readFileSync(absolute, 'utf8');
        }
        catch (error) {
            // The path exists but cannot be read as a file (it is a
            // directory, permissions, ...). Report it as a failed assertion
            // instead of aborting the whole run.
            const reason = error instanceof Error ? error.message : String(error);
            failures.push({
                line: check.line,
                message: `EXPECT_FILE failed — ${check.file} could not be read (${reason})`,
            });
            continue;
        }
        if (!body.includes(check.content)) {
            failures.push({
                line: check.line,
                message: `EXPECT_FILE failed — ${check.file} does not contain "${check.content}"`,
            });
        }
    }
    const durationMs = Math.max(0, now() - startedAt);
    return {
        id: inputs.scenario.id,
        passed: failures.length === 0,
        failures,
        durationMs,
        assertionCount,
    };
}
|
|
143
|
+
/**
 * Filter parsed scenarios by id.
 *
 * A pattern without `*` is a plain substring test, so
 * `pugi smoke --filter identity` works as the operator expects. A pattern
 * containing `*` must match the whole id, with `*` standing for any run of
 * characters and every other character taken literally. An empty or missing
 * pattern returns the input array unchanged.
 *
 * @param {{ id: string }[]} scenarios - Parsed scenarios.
 * @param {string} [pattern] - Substring or `*` glob.
 * @returns {{ id: string }[]} Matching scenarios, in their original order.
 */
export function filterScenarios(scenarios, pattern) {
    if (!pattern || pattern.length === 0)
        return scenarios;
    const matcher = compileFilterPattern(pattern);
    return scenarios.filter((s) => matcher(s.id));
}
/**
 * Compile a filter pattern into an `(id) => boolean` predicate.
 *
 * @param {string} pattern - Non-empty substring or `*` glob.
 * @returns {(id: string) => boolean}
 */
function compileFilterPattern(pattern) {
    if (!pattern.includes('*')) {
        return (id) => id.includes(pattern);
    }
    // Escape each literal segment on its own, then join the segments with
    // `.*`. The escape class covers every RegExp metacharacter, including
    // `?`: left unescaped it acts as a quantifier ("what?*" would also match
    // "whatnow") and a leading `?` makes the RegExp constructor throw.
    const source = pattern
        .split('*')
        .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('.*');
    const re = new RegExp(`^${source}$`);
    return (id) => re.test(id);
}
|
|
169
|
+
/**
 * Test one envelope against one assertion pattern.
 *
 * - `persona-turn-contains`: the envelope is a `persona-turn` whose body
 *   includes at least one of `pattern.substrings`.
 * - `envelope-kind`: the envelope's kind equals `pattern.envelopeKind`.
 * - `tool-call`: the envelope is a `tool-call` whose body parses to a JSON
 *   object; when `pattern.tool` is set its `tool` field must equal it, and
 *   when `pattern.argsSubset` is set every listed key must equal the
 *   stringified value of the same key in its `args` object.
 *
 * Any other pattern kind never matches. Exported so tests can probe the
 * matching rules without building a whole scenario.
 *
 * @param {{ kind: string, body: string }} env - Envelope under test.
 * @param {object} pattern - Parsed assertion pattern.
 * @returns {boolean}
 */
export function matchesEnvelope(env, pattern) {
    switch (pattern.kind) {
        case 'persona-turn-contains':
            return env.kind === 'persona-turn'
                && pattern.substrings.some((needle) => env.body.includes(needle));
        case 'envelope-kind':
            return env.kind === pattern.envelopeKind;
        case 'tool-call':
            break;
        default:
            return false;
    }
    if (env.kind !== 'tool-call')
        return false;
    // A tool-call body that is not valid JSON simply cannot match.
    let payload;
    try {
        payload = JSON.parse(env.body);
    }
    catch {
        return false;
    }
    if (!isRecord(payload))
        return false;
    const toolMismatch = pattern.tool !== undefined && payload['tool'] !== pattern.tool;
    if (toolMismatch)
        return false;
    if (pattern.argsSubset === undefined)
        return true;
    const callArgs = payload['args'];
    if (!isRecord(callArgs))
        return false;
    // Expected values are compared against the stringified actual argument.
    return Object.entries(pattern.argsSubset)
        .every(([key, expected]) => String(callArgs[key]) === expected);
}
|
|
213
|
+
/**
 * True for non-null, non-array values whose `typeof` is `'object'`.
 *
 * @param {unknown} value - Value to classify.
 * @returns {boolean}
 */
function isRecord(value) {
    if (value === null || Array.isArray(value))
        return false;
    return typeof value === 'object';
}
|
|
216
|
+
/**
 * Render an assertion pattern as a short phrase for failure messages.
 *
 * @param {object} pattern - Parsed assertion pattern.
 * @returns {string} A description of the pattern, or `'unknown pattern'`
 *   when the pattern kind is not one of the three handled here.
 */
function describePattern(pattern) {
    switch (pattern.kind) {
        case 'persona-turn-contains': {
            const quoted = pattern.substrings.map((s) => `"${s}"`);
            return `persona-turn containing one of [${quoted.join(', ')}]`;
        }
        case 'tool-call': {
            const toolPart = pattern.tool ? [`tool=${pattern.tool}`] : [];
            const argParts = pattern.argsSubset
                ? Object.entries(pattern.argsSubset).map(([key, value]) => `${key}=${value}`)
                : [];
            // `trim()` drops the trailing space left when there are no parts.
            return `tool-call ${[...toolPart, ...argParts].join(' ')}`.trim();
        }
        case 'envelope-kind':
            return `envelope kind=${pattern.envelopeKind}`;
        default:
            return 'unknown pattern';
    }
}
|
|
238
|
+
//# sourceMappingURL=runner.js.map
|