unbound-cli 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,503 @@
1
+ const readline = require('readline');
2
+ const api = require('../api');
3
+ const output = require('../output');
4
+ const config = require('../config');
5
+
// Maximum prompt length accepted locally. Backend caps at 2000 chars; we cap
// at 1800 to leave headroom. The limit is checked against the prompt AFTER
// the sanitization passes (newline collapse + non-printable strip) have run.
const MAX_PROMPT_LEN = 1800;

// Mirrors TOOL_ACTIONS in src/commands/policy.js. Inlined intentionally to keep
// this module standalone; the enum is short and unlikely to change frequently.
// Frozen (like OUT_OF_SCOPE_KEYWORDS) so the shared list cannot be mutated by
// accident from one of its call sites.
const TOOL_ACTIONS = Object.freeze(['AUDIT', 'BLOCK', 'WARN', 'REQUIRE_SLACK_APPROVAL']);

// Lower-case substrings that, when found in a prompt, trigger a warning that
// the request mentions something tool policies cannot scope by (environment,
// archive state, time window, exception clause, repo/project visibility).
const OUT_OF_SCOPE_KEYWORDS = Object.freeze([
  'staging', 'production', 'prod', 'dev', 'qa', 'testing',
  'archived', 'archive',
  'business hour', 'after hour', 'outside hour', 'weekend',
  'except', 'unless', 'but not',
  'private repo', 'public repo', 'private project', 'public project',
]);
21
+
// Single source of truth for --action override validation. Called early by
// the prompt-create entry points (to fail before the assist POST) and again
// by the merge helpers when the override is applied.
//
// Returns the trimmed, upper-cased action, or undefined when --action was not
// provided (non-string, empty, or whitespace-only). Throws an Error with
// exitCode 2 when the value is not one of TOOL_ACTIONS.
function validateActionOverride(action) {
  const wasProvided = typeof action === 'string' && action.trim().length > 0;
  if (!wasProvided) return undefined;

  const normalized = action.toUpperCase().trim();
  if (TOOL_ACTIONS.includes(normalized)) return normalized;

  const err = new Error(`--action must be one of: ${TOOL_ACTIONS.join(', ')}.`);
  err.exitCode = 2;
  throw err;
}
36
+
// Module-level cache of the privileges payload. Each CLI invocation is a
// fresh Node process, so the cache lives for exactly one command.
let _privilegesCache = null;

// Returns the current user's privileges. The first call fetches them from
// /api/v1/users/privileges/; later calls reuse the cached value. A falsy
// response is not retained, so the next call fetches again. Rejections from
// api.get propagate to the caller untouched.
async function loadPrivileges() {
  if (!_privilegesCache) {
    _privilegesCache = await api.get('/api/v1/users/privileges/');
  }
  return _privilegesCache;
}
46
+
// Local pre-flight for --prompt: sanitizes the text, enforces the length cap,
// and collects out-of-scope keywords found in it.
//
// opts.prompt must be a string. Returns { sanitizedPrompt, warnings } where
// warnings lists every entry of OUT_OF_SCOPE_KEYWORDS that occurs (as a
// case-insensitive substring) in the sanitized prompt.
//
// Throws an Error with exitCode 2 when the prompt is missing / not a string,
// or when the sanitized prompt is longer than MAX_PROMPT_LEN.
function validatePromptPreflight(opts) {
  if (!opts || typeof opts.prompt !== 'string') {
    // exitCode is set like on every other validation failure so the caller's
    // exit-code mapping treats a missing prompt as a usage error.
    const e = new Error('--prompt is required.');
    e.exitCode = 2;
    throw e;
  }

  // Strip non-printable characters BEFORE collapsing newline runs. Doing it
  // the other way round leaves "\r\n\r\n" (or newlines separated by any other
  // stripped character) as an uncollapsed "\n\n" in the result.
  const sanitizedPrompt = opts.prompt
    .replace(/[^\x20-\x7e\n\t]/g, '')
    .replace(/\n{2,}/g, '\n')
    .trim();

  if (sanitizedPrompt.length > MAX_PROMPT_LEN) {
    const e = new Error(`Input is too long (max ${MAX_PROMPT_LEN} characters).`);
    e.exitCode = 2;
    throw e;
  }

  const lower = sanitizedPrompt.toLowerCase();
  const warnings = OUT_OF_SCOPE_KEYWORDS.filter((token) => lower.includes(token));
  return { sanitizedPrompt, warnings };
}
67
+
// Builds the TERMINAL_COMMAND policy request body from the AI-assist
// form_updates and the CLI flags.
//
// formUpdates: AI-supplied fields (name, description, command_family,
//   selected_field, field_value, action); may be null/undefined.
// opts: CLI flags (name, description, action, disabled, customMessage,
//   scopeUserGroupIds); may be null/undefined.
//
// Returns the request body. Throws (exitCode 2) via validateActionOverride
// when --action is not a known action.
function mergeAiAndFlags(formUpdates, opts) {
  const f = formUpdates || {};
  const flags = opts || {};
  const body = {
    policy_type: 'TERMINAL_COMMAND',
  };
  if (f.name) body.name = f.name;
  if (f.description) body.description = f.description;
  if (f.command_family) body.command_family = f.command_family;

  // config carries a single { <selected_field>: <field_value> } pair, or is
  // empty when the AI did not pick a field.
  const field = f.selected_field || '';
  const value = f.field_value || '';
  body.config = field ? { [field]: value } : {};

  body.action = (f.action ? String(f.action).toUpperCase() : 'AUDIT');

  // Flag overrides win over AI-supplied values. --action is validated against
  // TOOL_ACTIONS via validateActionOverride. --name and --description are
  // trimmed before they override; whitespace-only inputs fall back silently to
  // the AI value (non-interactive callers may template these and an empty
  // result is recoverable).
  const flagName = typeof flags.name === 'string' ? flags.name.trim() : '';
  if (flagName) body.name = flagName;
  const flagDesc = typeof flags.description === 'string' ? flags.description.trim() : '';
  if (flagDesc) body.description = flagDesc;
  const normalizedAction = validateActionOverride(flags.action);
  if (normalizedAction) body.action = normalizedAction;

  body.enabled = !flags.disabled;

  // Trimmed exactly like mergeAiAndFlagsMcp does: a whitespace-only
  // --custom-message must not count as a user-facing message (it would
  // otherwise satisfy the BLOCK/WARN "custom message required" check).
  const flagCustom = typeof flags.customMessage === 'string' ? flags.customMessage.trim() : '';
  if (flagCustom) body.custom_message = flagCustom;

  if (Array.isArray(flags.scopeUserGroupIds) && flags.scopeUserGroupIds.length) {
    body.scope_user_group_ids = flags.scopeUserGroupIds;
  }

  return body;
}
105
+
// Colors a policy action for terminal display using output.colors:
// BLOCK is red, WARN and REQUIRE_SLACK_APPROVAL are yellow, AUDIT is dimmed.
// A falsy action renders as '(unspecified)'; any other value is returned
// unchanged.
function colorAction(action) {
  const palette = output.colors;
  if (!action) return '(unspecified)';

  switch (action) {
    case 'BLOCK':
      return palette.red(action);
    case 'WARN':
    case 'REQUIRE_SLACK_APPROVAL':
      return palette.yellow(action);
    case 'AUDIT':
      return palette.dim(action);
    default:
      return action;
  }
}
114
+
// Prints a human-readable preview of a resolved terminal policy body, followed
// by the AI-assist explanation when one is given.
//
// body: the request body produced by mergeAiAndFlags.
// explanation: optional free-text explanation from the assist response.
function renderTerminalPreview(body, explanation) {
  const groupIds = body.scope_user_group_ids;
  let scope = '(org-wide)';
  if (groupIds && groupIds.length) {
    scope = `groups: ${groupIds.join(', ')}`;
  }

  // Only the first config entry is shown: it is the field/pattern pair.
  const cfg = body.config || {};
  const cfgKeys = Object.keys(cfg);
  const firstKey = cfgKeys[0];
  const fieldKey = firstKey || '(unspecified)';
  const pattern = cfgKeys.length ? String(cfg[firstKey]) : '(unspecified)';
  const family = body.command_family || '(unspecified)';

  // Header goes to stdout (NOT output.info, which writes to stderr) so the
  // whole preview — header, keyValue rows, explanation — is captured by a
  // single `tee` / `> file`. Leading blank line gives the header breathing room.
  console.log('');
  console.log(output.colors.cyan('ℹ Resolved policy (from AI assist):'));

  // Optional rows (description, custom message) are spliced in only when set.
  const rows = [
    ['Name', body.name || '(unspecified)'],
    ...(body.description ? [['Description', body.description]] : []),
    ['Type', body.policy_type],
    ['Command family', family],
    ['Field', fieldKey],
    ['Pattern', pattern],
    ['Action', colorAction(body.action)],
    ...(body.custom_message ? [['Custom message', body.custom_message]] : []),
    ['Scope (groups)', scope],
    ['Enabled', String(body.enabled)],
  ];
  output.keyValue(rows);

  if (explanation) {
    console.log('');
    console.log(`AI assist explanation: ${output.colors.dim(explanation)}`);
  }
  console.log('');
}
151
+
// Best-effort host name of the configured backend, for error messages only.
// Never throws: an unreadable base URL yields '(unknown host)', an unparsable
// one is shown as-is.
function resolveBackendHost() {
  let baseUrl;
  try {
    baseUrl = config.getBaseUrl();
  } catch {
    // We are already reporting an error; a broken config must not mask it.
    return '(unknown host)';
  }
  try {
    const { host } = new URL(baseUrl);
    if (host) return host;
  } catch {
    // Not a parsable URL — fall through and show the raw value instead.
  }
  return baseUrl ? String(baseUrl) : '(unknown host)';
}

// Renders the response body of an API error for display. Falls back to the
// error message, then to the bare status code, when the body is missing,
// empty, or cannot be serialized — so the user never sees "undefined".
function describeErrorBody(err) {
  const { body } = err;
  if (typeof body === 'string' && body.length > 0) return body;
  if (body !== undefined && body !== null && typeof body !== 'string') {
    try {
      const json = JSON.stringify(body);
      if (typeof json === 'string') return json;
    } catch {
      // Unserializable body (e.g. circular) — use the generic fallback below.
    }
  }
  return err.message || `HTTP ${err.statusCode}`;
}

// Maps an error thrown by an API call to { message, exitCode }.
//
// ApiError carries statusCode + body. Anything without a statusCode (including
// null/undefined) is treated as a network/timeout failure.
//
// Exit codes: 2 = request rejected (400/422 and other non-2xx),
// 3 = 401/403, 4 = network failure or 5xx.
function routeBackendError(err) {
  if (!err || !err.statusCode) {
    const host = resolveBackendHost();
    return {
      message: `Network error reaching ${host}: ${(err && err.message) || 'unknown'}. Check connectivity. Falling back to flag-based creation is not blocked.`,
      exitCode: 4,
    };
  }
  const code = err.statusCode;
  if (code === 401 || code === 403) {
    return {
      message: 'Authentication failed / not authorized. Tool policies require admin. Run `unbound whoami` to check role.',
      exitCode: 3,
    };
  }
  if (code === 400 || code === 422) {
    return {
      message: `Request validation failed: ${describeErrorBody(err)}.`,
      exitCode: 2,
    };
  }
  if (code >= 500) {
    return {
      message: 'Server error. Try again, or fall back to flag-based creation (see `unbound policy tool --help`).',
      exitCode: 4,
    };
  }
  // Other non-2xx — treat as backend error with whatever body it gave.
  return { message: `Request failed: ${describeErrorBody(err)}.`, exitCode: 2 };
}
188
+
// Turns the `error` field of a success:false assist response into text.
// Strings pass through; missing values become ''; structured payloads are
// rendered via their `message` or as JSON so they can never crash the router.
function describeAssistError(error) {
  if (typeof error === 'string') return error;
  if (!error) return '';
  if (typeof error.message === 'string') return error.message;
  try {
    const json = JSON.stringify(error);
    if (typeof json === 'string') return json;
  } catch {
    // Unserializable payload — fall back to String() below.
  }
  return String(error);
}

// Maps the `error` of a terminal-assist response with success !== true to
// { message, exitCode }. Known failure texts get an actionable hint appended;
// exitCode is always 2.
function routeSuccessFalse(error) {
  const msg = describeAssistError(error);
  const lower = msg.toLowerCase();
  if (lower.includes('input is too long')) {
    // Hint uses the same constant the local pre-flight enforces.
    return { message: `${msg} Try shortening to under ${MAX_PROMPT_LEN} characters.`, exitCode: 2 };
  }
  if (lower.includes('could not determine command family')) {
    return {
      message: `${msg} Try naming the command type explicitly (e.g. \`block git pushes\`, \`audit npm installs\`). Or use \`unbound policy tool families\` to see available families and pass \`--command-family\` directly.`,
      exitCode: 2,
    };
  }
  if (!msg.trim()) {
    // The server gave no reason; never surface an empty error message.
    return {
      message: 'AI assist could not create a policy from this prompt, and the server gave no reason.',
      exitCode: 2,
    };
  }
  return { message: msg, exitCode: 2 };
}
202
+
// Asks the user whether to create the previewed policy. An empty answer
// counts as "yes". Resolves to a boolean.
function confirmCreate() {
  const question = 'Create policy? [Y/n] ';
  return promptYesNo(question, true);
}
206
+
// Asks the user whether to go on despite out-of-scope keywords in the prompt.
// An empty answer counts as "no". Resolves to a boolean.
function confirmContinue() {
  const question = 'Continue anyway? The endpoint will ignore those parts. [y/N] ';
  return promptYesNo(question, false);
}
210
+
// Asks a yes/no question on stdin/stdout and resolves to a boolean.
//
// question: text shown to the user.
// defaultYes: value used when the user submits an empty answer.
//
// "y" / "yes" (any case, surrounding whitespace ignored) resolve true; any
// other non-empty answer resolves false. If the interface closes before an
// answer arrives, the promise resolves false. The promise never rejects.
function promptYesNo(question, defaultYes) {
  return new Promise((resolve) => {
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    let settled = false;

    // First caller wins; closing the interface afterwards re-enters through
    // the 'close' handler, which is then a no-op.
    function settle(value) {
      if (settled) return;
      settled = true;
      resolve(value);
      rl.close();
    }

    rl.on('close', () => settle(false));
    rl.question(question, (answer) => {
      const reply = (answer || '').trim().toLowerCase();
      if (reply === '') {
        settle(defaultYes);
      } else {
        settle(reply === 'y' || reply === 'yes');
      }
    });
  });
}
224
+
// Public entry for AI-assisted terminal policy creation. Runs, in order: login
// check, --action validation, prompt pre-flight (with an interactive
// "continue?" when out-of-scope keywords are found), admin probe, assist
// call, merge of AI output with flags, custom-message check, preview, and the
// final "create?" confirmation.
//
// Resolves to { body, confirmed, explanation }. Interactive prompts and the
// preview are skipped as documented below when opts.yes / opts.json are set.
// Every failure branch throws an Error with err.exitCode set so the caller can
// map it to process.exitCode.
async function runTerminalPromptCreate(opts) {
  // Builds an Error tagged with the exit code for this failure.
  const failWith = (message, exitCode) => {
    const err = new Error(message);
    err.exitCode = exitCode;
    return err;
  };

  if (!config.isLoggedIn()) {
    throw failWith('Not logged in. Run `unbound login` first.', 3);
  }

  // Validate --action up front so an invalid override fails before the assist
  // call is made. Only the throw matters here; mergeAiAndFlags validates again
  // when it applies the override.
  validateActionOverride(opts.action);

  const { sanitizedPrompt, warnings } = validatePromptPreflight(opts);
  warnings.forEach((keyword) => {
    output.warn(`Your prompt mentions \`${keyword}\`. Tool policies cannot scope by environment / project / time / exception clauses. The endpoint will ignore those parts.`);
  });
  if (warnings.length && !opts.yes && !opts.json) {
    const proceed = await confirmContinue();
    if (!proceed) {
      // A user-requested abort is reported with exit code 0.
      throw failWith('Aborted by user (out-of-scope keywords in prompt).', 0);
    }
  }

  let privileges;
  try {
    privileges = await loadPrivileges();
  } catch (err) {
    const { message, exitCode } = routeBackendError(err);
    throw failWith(message, exitCode);
  }
  const isAdmin = Boolean(privileges && privileges.is_admin);
  if (!isAdmin) {
    const flags = privileges || {};
    let role = 'unknown';
    if (flags.is_manager) {
      role = 'Manager';
    } else if (flags.is_member) {
      role = 'Member';
    }
    throw failWith(`Tool policy creation requires admin role; current role: ${role}.`, 3);
  }

  // The assist endpoint is always called with an empty form state.
  const emptyFormState = {
    command_family: '',
    selected_field: '',
    field_value: '',
    action: '',
    name: '',
    description: '',
  };
  let response;
  try {
    response = await api.post('/api/v1/command-policies/assist/', {
      body: {
        user_input: sanitizedPrompt,
        current_form_state: emptyFormState,
      },
    });
  } catch (err) {
    const { message, exitCode } = routeBackendError(err);
    throw failWith(message, exitCode);
  }

  const succeeded = Boolean(response) && response.success === true;
  if (!succeeded) {
    const { message, exitCode } = routeSuccessFalse(response && response.error);
    throw failWith(message, exitCode);
  }

  const body = mergeAiAndFlags(response.form_updates, opts);

  const needsMessage = body.action === 'BLOCK' || body.action === 'WARN';
  if (needsMessage && !body.custom_message) {
    // Attribute the action choice correctly: if the user passed --action, the
    // AI didn't set it, and the message must not claim that it did.
    const actionFromFlag = typeof opts.action === 'string' && opts.action.trim().length > 0;
    let message;
    if (actionFromFlag) {
      message = `--action ${body.action} requires --custom-message. BLOCK and WARN policies need a user-facing message — add --custom-message "<text>" or drop --action ${body.action}.`;
    } else {
      message = `AI assist set --action to ${body.action} but no --custom-message was provided. BLOCK and WARN policies require a user-facing message. Re-run with --custom-message "<text>".`;
    }
    throw failWith(message, 2);
  }

  if (!opts.json) renderTerminalPreview(body, response.explanation);

  // --yes and --json both skip the confirmation and count as confirmed.
  let confirmed = true;
  if (!opts.yes && !opts.json) {
    confirmed = await confirmCreate();
  }
  return { body, confirmed, explanation: response.explanation };
}
323
+
// Turns the `error` field of a success:false MCP-assist response into text.
// Strings pass through; missing values become ''; structured payloads are
// rendered via their `message` or as JSON so they can never crash the router.
function describeMcpAssistError(error) {
  if (typeof error === 'string') return error;
  if (!error) return '';
  if (typeof error.message === 'string') return error.message;
  try {
    const json = JSON.stringify(error);
    if (typeof json === 'string') return json;
  } catch {
    // Unserializable payload — fall back to String() below.
  }
  return String(error);
}

// Maps the `error` of an MCP-assist response with success !== true to
// { message, exitCode }. The "input is too long" failure gets a shortening
// hint appended; exitCode is always 2.
function routeMcpSuccessFalse(error) {
  const msg = describeMcpAssistError(error);
  if (msg.toLowerCase().includes('input is too long')) {
    // Hint uses the same constant the local pre-flight enforces.
    return { message: `${msg} Try shortening to under ${MAX_PROMPT_LEN} characters.`, exitCode: 2 };
  }
  if (!msg.trim()) {
    // The server gave no reason; never surface an empty error message.
    return {
      message: 'AI assist could not create an MCP policy from this prompt, and the server gave no reason.',
      exitCode: 2,
    };
  }
  return { message: msg, exitCode: 2 };
}
331
+
// Builds the MCP_TOOL policy request body from the AI-assist response and the
// CLI flags.
//
// aiResponse: assist response (canonical_group_id, mcp_tools, name,
//   description, action, custom_message); may be null/undefined.
// opts: CLI flags (name, description, action, customMessage, disabled,
//   scopeUserGroupIds).
//
// Flags win over AI values; whitespace-only --name / --description /
// --custom-message are ignored. Throws (exitCode 2) via
// validateActionOverride when --action is not a known action.
function mergeAiAndFlagsMcp(aiResponse, opts) {
  // Trimmed flag text, or '' when the flag is not a string.
  const flagText = (value) => (typeof value === 'string' ? value.trim() : '');

  const ai = aiResponse || {};
  const body = {
    policy_type: 'MCP_TOOL',
    mcp_canonical_group_id: ai.canonical_group_id,
    mcp_tools: Array.isArray(ai.mcp_tools) ? ai.mcp_tools : [],
  };
  // Backend serializer treats `config` as a required object on MCP_TOOL
  // policies; the flag-path always sends `config: {}`. Match it here so the
  // AI-assist path doesn't 422 when the backend tightens this validation.
  body.config = {};

  // AI-supplied values first ...
  if (ai.name) {
    body.name = ai.name;
  }
  if (ai.description) {
    body.description = ai.description;
  }
  body.action = ai.action ? String(ai.action).toUpperCase() : 'AUDIT';
  if (ai.custom_message) {
    body.custom_message = ai.custom_message;
  }

  // ... then the flag overrides, in the same order as the flags are read.
  const nameOverride = flagText(opts.name);
  if (nameOverride) {
    body.name = nameOverride;
  }
  const descriptionOverride = flagText(opts.description);
  if (descriptionOverride) {
    body.description = descriptionOverride;
  }
  const actionOverride = validateActionOverride(opts.action);
  if (actionOverride) {
    body.action = actionOverride;
  }
  const messageOverride = flagText(opts.customMessage);
  if (messageOverride) {
    body.custom_message = messageOverride;
  }

  body.enabled = !opts.disabled;

  const groupIds = opts.scopeUserGroupIds;
  if (Array.isArray(groupIds) && groupIds.length) {
    body.scope_user_group_ids = groupIds;
  }

  return body;
}
365
+
// Prints a human-readable preview of a resolved MCP policy body to stdout,
// followed by the AI-assist explanation when one is given.
//
// body: the request body produced by mergeAiAndFlagsMcp.
// explanation: optional free-text explanation from the assist response.
function renderMcpPreview(body, explanation) {
  const groupIds = body.scope_user_group_ids;
  let scope = '(org-wide)';
  if (groupIds && groupIds.length) {
    scope = `groups: ${groupIds.join(', ')}`;
  }

  let tools = '';
  if (Array.isArray(body.mcp_tools)) {
    tools = body.mcp_tools.join(', ');
  }

  console.log('');
  console.log(output.colors.cyan('ℹ Resolved MCP policy (from AI assist):'));

  // Optional rows (description, custom message) are spliced in only when set.
  const rows = [
    ['Name', body.name || '(unspecified)'],
    ...(body.description ? [['Description', body.description]] : []),
    ['Type', body.policy_type],
    ['Service', `canonical_group_id: ${body.mcp_canonical_group_id}`],
    ['Tools', tools],
    ['Action', colorAction(body.action)],
    ...(body.custom_message ? [['Custom message', body.custom_message]] : []),
    ['Scope (groups)', scope],
    ['Enabled', String(body.enabled)],
  ];
  output.keyValue(rows);

  if (explanation) {
    console.log('');
    console.log(`AI assist explanation: ${output.colors.dim(explanation)}`);
  }
  console.log('');
}
394
+
// Public entry for AI-assisted MCP policy creation. Runs, in order: login
// check, --action validation, prompt pre-flight (with an interactive
// "continue?" when out-of-scope keywords are found), admin probe, MCP assist
// call, tool-match check, merge of AI output with flags, custom-message
// check, preview, and the final "create?" confirmation.
//
// Resolves to { body, confirmed, explanation }. Every failure branch throws an
// Error with err.exitCode set so the caller can map it to process.exitCode.
async function runMcpPromptCreate(opts) {
  // Builds an Error tagged with the exit code for this failure.
  const failWith = (message, exitCode) => {
    const err = new Error(message);
    err.exitCode = exitCode;
    return err;
  };

  if (!config.isLoggedIn()) {
    throw failWith('Not logged in. Run `unbound login` first.', 3);
  }

  // Only the throw matters here: an invalid --action must fail before the
  // assist call. mergeAiAndFlagsMcp validates again when applying it.
  validateActionOverride(opts.action);

  const { sanitizedPrompt, warnings } = validatePromptPreflight(opts);
  warnings.forEach((keyword) => {
    output.warn(`Your prompt mentions \`${keyword}\`. Tool policies cannot scope by environment / project / time / exception clauses. The endpoint will ignore those parts.`);
  });
  if (warnings.length && !opts.yes && !opts.json) {
    const proceed = await confirmContinue();
    if (!proceed) {
      // A user-requested abort is reported with exit code 0.
      throw failWith('Aborted by user (out-of-scope keywords in prompt).', 0);
    }
  }

  let privileges;
  try {
    privileges = await loadPrivileges();
  } catch (err) {
    const { message, exitCode } = routeBackendError(err);
    throw failWith(message, exitCode);
  }
  const isAdmin = Boolean(privileges && privileges.is_admin);
  if (!isAdmin) {
    const flags = privileges || {};
    let role = 'unknown';
    if (flags.is_manager) {
      role = 'Manager';
    } else if (flags.is_member) {
      role = 'Member';
    }
    throw failWith(`Tool policy creation requires admin role; current role: ${role}.`, 3);
  }

  // The assist endpoint is always called with an empty form state.
  const emptyFormState = {
    mcp_canonical_group_id: '',
    mcp_tools: [],
    name: '',
    description: '',
    action: '',
  };
  let response;
  try {
    response = await api.post('/api/v1/command-policies/assist-mcp/', {
      body: {
        user_input: sanitizedPrompt,
        current_form_state: emptyFormState,
      },
    });
  } catch (err) {
    const { message, exitCode } = routeBackendError(err);
    throw failWith(message, exitCode);
  }

  const succeeded = Boolean(response) && response.success === true;
  if (!succeeded) {
    const { message, exitCode } = routeMcpSuccessFalse(response && response.error);
    throw failWith(message, exitCode);
  }

  // A "successful" answer without a service or without tools is unusable.
  const hasTools = Array.isArray(response.mcp_tools) && response.mcp_tools.length > 0;
  if (!response.canonical_group_id || !hasTools) {
    throw failWith('AI assist could not match any tools for your description. Try naming the service and tools more directly, or use `unbound policy tool create-mcp --name "..." --mcp-server <server> (--mcp-tool <tool> | --mcp-action-type <read|write|destructive>) --action <action>`.', 2);
  }

  const body = mergeAiAndFlagsMcp(response, opts);

  const needsMessage = body.action === 'BLOCK' || body.action === 'WARN';
  if (needsMessage && !body.custom_message) {
    // Say who chose the action: the user's --action flag, or the AI.
    const actionFromFlag = typeof opts.action === 'string' && opts.action.trim().length > 0;
    let message;
    if (actionFromFlag) {
      message = `--action ${body.action} requires --custom-message. BLOCK and WARN policies need a user-facing message — add --custom-message "<text>" or drop --action ${body.action}.`;
    } else {
      message = `AI assist set --action to ${body.action} but no --custom-message was provided. BLOCK and WARN policies require a user-facing message. Re-run with --custom-message "<text>".`;
    }
    throw failWith(message, 2);
  }

  if (!opts.json) renderMcpPreview(body, response.explanation);

  // --yes and --json both skip the confirmation and count as confirmed.
  let confirmed = true;
  if (!opts.yes && !opts.json) {
    confirmed = await confirmCreate();
  }
  return { body, confirmed, explanation: response.explanation };
}
491
+
492
+ module.exports = {
493
+ MAX_PROMPT_LEN,
494
+ OUT_OF_SCOPE_KEYWORDS,
495
+ validatePromptPreflight,
496
+ mergeAiAndFlags,
497
+ renderTerminalPreview,
498
+ routeBackendError,
499
+ runTerminalPromptCreate,
500
+ mergeAiAndFlagsMcp,
501
+ renderMcpPreview,
502
+ runMcpPromptCreate,
503
+ };
@@ -0,0 +1,45 @@
1
+ # Eval harness — WEB-4887 Phase 1
2
+
3
+ Twenty prompts that exercise the `unbound policy tool create-terminal --prompt`
4
+ path end-to-end. They are used to measure the ticket's acceptance criterion: "Claude Code
5
+ should always pick our AI assist endpoint over creating policies on its own."
6
+
7
+ This eval is **manual**. CI does NOT run it — running against the live AI-assist
8
+ endpoint burns tokens and produces non-deterministic results.
9
+
10
+ ## Composition
11
+
12
+ | Category | Count | Expected outcome |
13
+ |---|---|---|
14
+ | `single_intent_in_scope` | 6 | success — assist returns `success: true` |
15
+ | `detailed_in_scope` | 4 | success — long but single-intent prompts |
16
+ | `out_of_scope` | 4 | success — CLI warns (when the prompt contains a known out-of-scope keyword) but proceeds; endpoint usually still answers |
17
+ | `oversize` | 3 | `preflight_rejected` — CLI rejects locally before sending |
18
+ | `nonsense` | 3 | `success_false` — endpoint returns `success: false` |
19
+
20
+ ## Fixture shape
21
+
22
+ `policy-prompts.json` is an array of `{id, category, prompt, expected_outcome}`.
23
+ The three `oversize` entries store a marker (`OVERSIZE_FILL_<N>`) and the runner
24
+ expands it to an N-character string at execution time so the JSON stays
25
+ readable in source.
26
+
27
+ ## Manual run
28
+
29
+ ```bash
30
+ # Make sure you are logged in as an admin against staging:
31
+ unbound login --api-key <STAGING_ADMIN_KEY> --backend-url https://staging-backend.getunbound.ai
32
+
33
+ # Then run the skeleton — today it just summarizes the prompt set.
34
+ node test/eval/run-eval.js
35
+ ```
36
+
37
+ The skeleton stops at the "load + summary" boundary. To turn it into a real
38
+ eval, fill in the TODO in `run-eval.js`: for each prompt, either spawn
39
+ `unbound policy tool create-terminal --prompt "<p>" --yes` directly and grade
40
+ the exit code / response, OR (for the pick-rate measurement) ask Claude Code to
41
+ satisfy the natural-language ask and check whether it invoked the `--prompt`
42
+ flag. Aggregate pick-rate (% of prompts where Claude picked our path) and
43
+ success-rate (% of in-scope prompts where the endpoint returned `success: true`).
44
+
45
+ Spec acceptance gate: pick-rate >= 90%, success-rate >= 80% on in-scope prompts.
@@ -0,0 +1,122 @@
1
+ [
2
+ {
3
+ "id": "single-1",
4
+ "category": "single_intent_in_scope",
5
+ "prompt": "block rm -rf",
6
+ "expected_outcome": "success"
7
+ },
8
+ {
9
+ "id": "single-2",
10
+ "category": "single_intent_in_scope",
11
+ "prompt": "audit git pushes to main",
12
+ "expected_outcome": "success"
13
+ },
14
+ {
15
+ "id": "single-3",
16
+ "category": "single_intent_in_scope",
17
+ "prompt": "warn on npm install",
18
+ "expected_outcome": "success"
19
+ },
20
+ {
21
+ "id": "single-4",
22
+ "category": "single_intent_in_scope",
23
+ "prompt": "block curl to external hosts",
24
+ "expected_outcome": "success"
25
+ },
26
+ {
27
+ "id": "single-5",
28
+ "category": "single_intent_in_scope",
29
+ "prompt": "audit docker push commands",
30
+ "expected_outcome": "success"
31
+ },
32
+ {
33
+ "id": "single-6",
34
+ "category": "single_intent_in_scope",
35
+ "prompt": "block force-push on git",
36
+ "expected_outcome": "success"
37
+ },
38
+ {
39
+ "id": "detailed-1",
40
+ "category": "detailed_in_scope",
41
+ "prompt": "Our team has had repeated incidents where engineers accidentally delete large amounts of work using recursive force removes. Please create a policy that blocks any invocation of rm with the recursive and force flags so destructive deletions are prevented at the shell level. The policy should fire whenever the command begins with rm and contains the -rf combination.",
42
+ "expected_outcome": "success"
43
+ },
44
+ {
45
+ "id": "detailed-2",
46
+ "category": "detailed_in_scope",
47
+ "prompt": "We want to track every git push so we can build an internal audit log of who is pushing what and where. Create a policy that audits any git push command regardless of the remote or branch. We do not want to block these — only audit them. The policy should match the command pattern git push and any arguments that follow.",
48
+ "expected_outcome": "success"
49
+ },
50
+ {
51
+ "id": "detailed-3",
52
+ "category": "detailed_in_scope",
53
+ "prompt": "Engineers occasionally try to install npm packages from inside our coding agents and we want a chance to review those installs before they happen. Please warn the user any time npm install is invoked. The custom message we want is: this install must be reviewed by your tech lead first. The action should be WARN, not BLOCK.",
54
+ "expected_outcome": "success"
55
+ },
56
+ {
57
+ "id": "detailed-4",
58
+ "category": "detailed_in_scope",
59
+ "prompt": "We need to stop sudo from being used inside the coding agent shell entirely. Any sudo invocation should be blocked outright with a custom message explaining that elevated privileges are not permitted from the AI tool path. The policy should match any command beginning with sudo, regardless of the subcommand.",
60
+ "expected_outcome": "success"
61
+ },
62
+ {
63
+ "id": "oos-env",
64
+ "category": "out_of_scope",
65
+ "prompt": "block rm -rf in the staging environment",
66
+ "expected_outcome": "success"
67
+ },
68
+ {
69
+ "id": "oos-exception",
70
+ "category": "out_of_scope",
71
+ "prompt": "block all writes except in the docs directory",
72
+ "expected_outcome": "success"
73
+ },
74
+ {
75
+ "id": "oos-time",
76
+ "category": "out_of_scope",
77
+ "prompt": "block git pushes after business hours",
78
+ "expected_outcome": "success"
79
+ },
80
+ {
81
+ "id": "oos-compound",
82
+ "category": "out_of_scope",
83
+ "prompt": "block rm -rf and audit git push",
84
+ "expected_outcome": "success"
85
+ },
86
+ {
87
+ "id": "oversize-1",
88
+ "category": "oversize",
89
+ "prompt": "OVERSIZE_FILL_2200",
90
+ "expected_outcome": "preflight_rejected"
91
+ },
92
+ {
93
+ "id": "oversize-2",
94
+ "category": "oversize",
95
+ "prompt": "OVERSIZE_FILL_2500",
96
+ "expected_outcome": "preflight_rejected"
97
+ },
98
+ {
99
+ "id": "oversize-3",
100
+ "category": "oversize",
101
+ "prompt": "OVERSIZE_FILL_3000",
102
+ "expected_outcome": "preflight_rejected"
103
+ },
104
+ {
105
+ "id": "nonsense-1",
106
+ "category": "nonsense",
107
+ "prompt": "asdfqwer purple potato",
108
+ "expected_outcome": "success_false"
109
+ },
110
+ {
111
+ "id": "nonsense-2",
112
+ "category": "nonsense",
113
+ "prompt": "make a policy for the thing that does the thing",
114
+ "expected_outcome": "success_false"
115
+ },
116
+ {
117
+ "id": "nonsense-3",
118
+ "category": "nonsense",
119
+ "prompt": "lorem ipsum dolor sit amet",
120
+ "expected_outcome": "success_false"
121
+ }
122
+ ]