llm-cli-gateway 1.17.0 → 1.17.2

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -6,7 +6,7 @@ import { existsSync, unlinkSync, writeFileSync } from "fs";
6
6
  import { tmpdir } from "os";
7
7
  import { join, isAbsolute } from "path";
8
8
  import { randomUUID } from "crypto";
9
- import { z } from "zod";
9
+ import { z } from "zod/v3";
10
10
  /** Prefix for gateway-generated session IDs. Enforces provenance structurally. */
11
11
  export const GATEWAY_SESSION_PREFIX = "gw-";
12
12
  /**
@@ -262,57 +262,54 @@ export const GEMINI_APPROVAL_MODES = ["default", "auto_edit", "yolo", "plan"];
262
262
  */
263
263
  export const CODEX_SANDBOX_MODES = ["read-only", "workspace-write", "danger-full-access"];
264
264
  /**
265
- * Codex approval modes (for `--ask-for-approval <mode>`).
265
+ * Deprecated Codex approval modes. Current Codex no longer exposes an
266
+ * `--ask-for-approval` flag; the MCP input is temporarily retained so older
267
+ * callers do not fail schema validation, but it emits no CLI argv.
266
268
  */
267
269
  export const CODEX_ASK_FOR_APPROVAL_MODES = ["untrusted", "on-request", "never"];
268
270
  /**
269
- * Resolve Codex `--sandbox` / `--ask-for-approval` args from the modern
270
- * params + legacy `fullAuto` shorthand.
271
+ * Resolve current Codex sandbox args from the modern params + legacy
272
+ * `fullAuto` shorthand. Current Codex exposes `--sandbox`, but no longer
273
+ * exposes `--ask-for-approval` or `--full-auto`.
271
274
  *
272
275
  * Precedence:
273
- * 1. If `useLegacyFullAutoFlag && fullAuto`, emit `--full-auto` directly
274
- * (escape hatch; deprecated).
275
- * 2. Else explicit `sandboxMode` / `askForApproval` always emit their
276
- * flags. If `fullAuto: true` is set alongside, a warning is attached
277
- * and the explicit values win.
278
- * 3. Else if `fullAuto: true`, expand to
279
- * `--sandbox workspace-write --ask-for-approval never`.
276
+ * 1. Explicit `sandboxMode` emits `--sandbox <mode>`.
277
+ * 2. Else if `fullAuto: true`, expand to `--sandbox workspace-write`.
278
+ * 3. Deprecated `askForApproval` and `useLegacyFullAutoFlag` emit no argv
279
+ * and return warnings for callers to surface/log.
280
280
  * 4. Else emit nothing.
281
281
  */
282
282
  export function resolveCodexSandboxFlags(input) {
283
283
  const { sandboxMode, askForApproval, fullAuto, useLegacyFullAutoFlag } = input;
284
- // deprecated: prefer sandboxMode + askForApproval; will be removed after Mistral GA.
285
- if (useLegacyFullAutoFlag && fullAuto) {
286
- return { args: ["--full-auto"] };
287
- }
288
- const explicit = Boolean(sandboxMode || askForApproval);
289
- if (explicit) {
290
- const args = [];
291
- if (sandboxMode)
292
- args.push("--sandbox", sandboxMode);
293
- if (askForApproval)
294
- args.push("--ask-for-approval", askForApproval);
295
- const warning = fullAuto
296
- ? "fullAuto was set alongside explicit sandboxMode/askForApproval; explicit values win. fullAuto is deprecated."
297
- : undefined;
298
- return { args, warning };
299
- }
300
- if (fullAuto) {
301
- return {
302
- args: ["--sandbox", "workspace-write", "--ask-for-approval", "never"],
303
- };
284
+ const args = [];
285
+ const warnings = [];
286
+ if (useLegacyFullAutoFlag) {
287
+ warnings.push("useLegacyFullAutoFlag is deprecated and ignored because current Codex no longer accepts --full-auto.");
304
288
  }
305
- return { args: [] };
289
+ if (askForApproval) {
290
+ warnings.push("askForApproval is deprecated and ignored because current Codex no longer accepts --ask-for-approval.");
291
+ }
292
+ if (sandboxMode) {
293
+ args.push("--sandbox", sandboxMode);
294
+ if (fullAuto) {
295
+ warnings.push("fullAuto was set alongside explicit sandboxMode; sandboxMode wins. fullAuto is deprecated.");
296
+ }
297
+ }
298
+ else if (fullAuto) {
299
+ args.push("--sandbox", "workspace-write");
300
+ }
301
+ return { args, warning: warnings.length > 0 ? warnings.join(" ") : undefined };
306
302
  }
307
303
  /**
308
304
  * Flags that `codex exec resume` rejects (the original session's policy is
309
305
  * inherited). Callers must drop these when building resume argv.
310
306
  *
311
- * Verified against `codex exec resume --help` (codex-cli 0.133.0):
312
- * `--full-auto`, `--sandbox`, `--ask-for-approval`, `--add-dir`, `-C`, and
313
- * `--search` are rejected. `--output-schema` and `-c key=value` ARE accepted
314
- * on resume and therefore are NOT in this filter (Phase 4 slice α restored
315
- * the previously-silent drop of those two).
307
+ * Verified against `codex exec resume --help` (codex-cli 0.135.0):
308
+ * `--sandbox`, `--add-dir`, `-C`, `--cd`, `--profile`, and `--search` are rejected.
309
+ * Deprecated `--full-auto` / `--ask-for-approval` are kept here defensively so
310
+ * legacy pre-filtered segments are stripped instead of reaching spawn.
311
+ * `--output-schema` and `-c key=value` ARE accepted on resume and therefore are
312
+ * NOT in this filter (Phase 4 slice α restored the previously-silent drop of those two).
316
313
  */
317
314
  export const CODEX_RESUME_FILTERED_FLAGS = new Set([
318
315
  "--full-auto",
@@ -320,6 +317,8 @@ export const CODEX_RESUME_FILTERED_FLAGS = new Set([
320
317
  "--ask-for-approval",
321
318
  "--add-dir",
322
319
  "-C",
320
+ "--cd",
321
+ "--profile",
323
322
  "--search",
324
323
  ]);
325
324
  /**
@@ -331,13 +330,15 @@ const CODEX_RESUME_FILTERED_FLAGS_WITH_VALUE = new Set([
331
330
  "--ask-for-approval",
332
331
  "--add-dir",
333
332
  "-C",
333
+ "--cd",
334
+ "--profile",
334
335
  ]);
335
336
  /**
336
337
  * Strip resume-incompatible flag/value pairs from a Codex argv segment.
337
338
  *
338
339
  * Bare flags (`--full-auto`, `--search`) drop without consuming a value.
339
- * Value-taking flags (`--sandbox`, `--ask-for-approval`, `--add-dir`, `-C`,
340
- * `--output-schema`) drop together with their immediately-following value.
340
+ * Value-taking flags (`--sandbox`, `--ask-for-approval`, `--add-dir`, `-C`, `--cd`,
341
+ * `--profile`) drop together with their immediately-following value.
341
342
  */
342
343
  export function filterCodexResumeFlags(args) {
343
344
  const out = [];
@@ -371,7 +372,7 @@ export const CLAUDE_EFFORT_LEVELS = ["low", "medium", "high", "xhigh", "max"];
371
372
  export const CLAUDE_HIGH_IMPACT_PARAMS_SCHEMA = z
372
373
  .object({
373
374
  agent: z.string().optional(),
374
- agents: z.record(z.record(z.unknown())).optional(),
375
+ agents: z.record(z.string(), z.record(z.string(), z.unknown())).optional(),
375
376
  forkSession: z.boolean().optional(),
376
377
  systemPrompt: z.string().optional(),
377
378
  appendSystemPrompt: z.string().optional(),
@@ -549,7 +550,7 @@ export function findMissingImagePath(images) {
549
550
  * params before they reach `prepareCodexRequest`.
550
551
  */
551
552
  export const CODEX_HIGH_IMPACT_PARAMS_SCHEMA = z.object({
552
- outputSchema: z.union([z.string(), z.record(z.unknown())]).optional(),
553
+ outputSchema: z.union([z.string(), z.record(z.string(), z.unknown())]).optional(),
553
554
  search: z.boolean().optional(),
554
555
  profile: z.string().optional(),
555
556
  configOverrides: CODEX_CONFIG_OVERRIDES_SCHEMA,
@@ -578,8 +579,9 @@ export function prepareCodexHighImpactFlags(input) {
578
579
  args.push("--output-schema", schema.path);
579
580
  cleanup = schema.cleanup;
580
581
  }
582
+ const warnings = [];
581
583
  if (input.search) {
582
- args.push("--search");
584
+ warnings.push("search is deprecated and ignored because current Codex exec no longer accepts --search.");
583
585
  }
584
586
  if (input.profile) {
585
587
  args.push("--profile", input.profile);
@@ -599,7 +601,12 @@ export function prepareCodexHighImpactFlags(input) {
599
601
  if (input.ignoreRules) {
600
602
  args.push("--ignore-rules");
601
603
  }
602
- return { args, cleanup, missingImagePath: null };
604
+ return {
605
+ args,
606
+ cleanup,
607
+ missingImagePath: null,
608
+ warning: warnings.length > 0 ? warnings.join(" ") : undefined,
609
+ };
603
610
  }
604
611
  export function prepareCodexForkRequest(input) {
605
612
  const { prompt, sessionId, forkLast } = input;
@@ -76,7 +76,7 @@ export class FileSessionManager {
76
76
  const data = readFileSync(this.storagePath, "utf-8");
77
77
  this.storage = JSON.parse(data);
78
78
  }
79
- catch (error) {
79
+ catch {
80
80
  // If file is corrupted, start fresh
81
81
  this.storage = { sessions: {}, activeSession: createEmptyActiveSessions() };
82
82
  }
@@ -4,6 +4,15 @@
4
4
  * Each line of stdout is a complete JSON object. This parser extracts the
5
5
  * final result text, cost, usage, and metadata from the stream.
6
6
  */
7
+ function stringOrNull(value) {
8
+ return typeof value === "string" ? value : null;
9
+ }
10
+ function numberOrNull(value) {
11
+ return typeof value === "number" && Number.isFinite(value) ? value : null;
12
+ }
13
+ function numberOrZero(value) {
14
+ return typeof value === "number" && Number.isFinite(value) ? value : 0;
15
+ }
7
16
  /**
8
17
  * Parse completed NDJSON stdout from `claude --output-format stream-json --include-partial-messages`.
9
18
  *
@@ -30,6 +39,9 @@ export function parseStreamJson(stdout) {
30
39
  // Skip malformed lines
31
40
  continue;
32
41
  }
42
+ if (!parsed || typeof parsed !== "object") {
43
+ continue;
44
+ }
33
45
  if (parsed.type === "result") {
34
46
  resultEvent = parsed;
35
47
  }
@@ -44,21 +56,21 @@ export function parseStreamJson(stdout) {
44
56
  if (resultEvent) {
45
57
  const usage = resultEvent.usage
46
58
  ? {
47
- inputTokens: resultEvent.usage.input_tokens ?? 0,
48
- outputTokens: resultEvent.usage.output_tokens ?? 0,
49
- cacheReadInputTokens: resultEvent.usage.cache_read_input_tokens ?? 0,
50
- cacheCreationInputTokens: resultEvent.usage.cache_creation_input_tokens ?? 0,
59
+ inputTokens: numberOrZero(resultEvent.usage.input_tokens),
60
+ outputTokens: numberOrZero(resultEvent.usage.output_tokens),
61
+ cacheReadInputTokens: numberOrZero(resultEvent.usage.cache_read_input_tokens),
62
+ cacheCreationInputTokens: numberOrZero(resultEvent.usage.cache_creation_input_tokens),
51
63
  }
52
64
  : null;
53
65
  return {
54
- text: resultEvent.result ?? "",
55
- costUsd: resultEvent.total_cost_usd ?? null,
66
+ text: typeof resultEvent.result === "string" ? resultEvent.result : "",
67
+ costUsd: numberOrNull(resultEvent.total_cost_usd),
56
68
  usage,
57
- sessionId: resultEvent.session_id ?? systemEvent?.session_id ?? null,
58
- model: systemEvent?.model ?? resultEvent.model ?? null,
59
- durationApiMs: resultEvent.duration_api_ms ?? null,
69
+ sessionId: stringOrNull(resultEvent.session_id) ?? stringOrNull(systemEvent?.session_id),
70
+ model: stringOrNull(systemEvent?.model) ?? stringOrNull(resultEvent.model),
71
+ durationApiMs: numberOrNull(resultEvent.duration_api_ms),
60
72
  isError: resultEvent.is_error === true,
61
- numTurns: resultEvent.num_turns ?? null,
73
+ numTurns: numberOrNull(resultEvent.num_turns),
62
74
  };
63
75
  }
64
76
  // Fallback: extract text from assistant event
@@ -67,7 +79,10 @@ export function parseStreamJson(stdout) {
67
79
  let text = "";
68
80
  if (message?.content && Array.isArray(message.content)) {
69
81
  text = message.content
70
- .filter((block) => block.type === "text")
82
+ .filter((block) => block &&
83
+ typeof block === "object" &&
84
+ block.type === "text" &&
85
+ typeof block.text === "string")
71
86
  .map((block) => block.text)
72
87
  .join("");
73
88
  }
@@ -75,8 +90,8 @@ export function parseStreamJson(stdout) {
75
90
  text,
76
91
  costUsd: null,
77
92
  usage: null,
78
- sessionId: systemEvent?.session_id ?? null,
79
- model: systemEvent?.model ?? message?.model ?? null,
93
+ sessionId: stringOrNull(systemEvent?.session_id),
94
+ model: stringOrNull(systemEvent?.model) ?? stringOrNull(message?.model),
80
95
  durationApiMs: null,
81
96
  isError: false,
82
97
  numTurns: null,
@@ -87,8 +102,8 @@ export function parseStreamJson(stdout) {
87
102
  text: "",
88
103
  costUsd: null,
89
104
  usage: null,
90
- sessionId: systemEvent?.session_id ?? null,
91
- model: systemEvent?.model ?? null,
105
+ sessionId: stringOrNull(systemEvent?.session_id),
106
+ model: stringOrNull(systemEvent?.model),
92
107
  durationApiMs: null,
93
108
  isError: false,
94
109
  numTurns: null,
@@ -93,6 +93,20 @@ export declare function validateUpstreamCliArgs(cli: CliType, args: readonly str
93
93
  export declare function assertUpstreamCliArgs(cli: CliType, args: readonly string[]): void;
94
94
  export declare function validateUpstreamCliEnv(cli: CliType, env: Record<string, string> | undefined): ContractValidationResult;
95
95
  export declare function assertUpstreamCliEnv(cli: CliType, env: Record<string, string> | undefined): void;
96
+ /**
97
+ * Best-effort, advisory-only extraction of long-form flags from raw --help text.
98
+ * Returns a sorted array of unique `--foo-bar` style flags discovered in the output.
99
+ *
100
+ * Heuristics:
101
+ * - Matches common option declaration lines emitted by clap, yargs, commander, custom TUIs, etc.
102
+ * - Lowercases for stable comparison against our contract keys.
103
+ * - Intentionally conservative: ignores obvious noise (URLs, prose in descriptions).
104
+ *
105
+ * This powers the bidirectional drift detector (extra flags the installed binary
106
+ * advertises that our contract does not yet allow). It is NEVER used for argv
107
+ * validation — only for the upstream scanner and `upstream_contracts` probe reports.
108
+ */
109
+ export declare function extractDiscoveredFlags(helpText: string): readonly string[];
96
110
  export interface InstalledCliContractProbe {
97
111
  cli: CliType;
98
112
  executable: string;
@@ -101,6 +115,16 @@ export interface InstalledCliContractProbe {
101
115
  available: boolean;
102
116
  checkedHelpCommands: string[][];
103
117
  missingFlags: string[];
118
+ /** Flags present in the installed binary's --help but absent from the declared contract. */
119
+ extraFlags: readonly string[];
120
+ /** Sorted list of long flags discovered in the help text (for snapshot diffing). */
121
+ discoveredFlags: readonly string[];
122
+ /** Stable hash of the concatenated help output (detects subtle text changes even if flag set is stable). */
123
+ helpHash?: string;
124
+ /** Best-effort version string scraped from the help/version output (if present). */
125
+ versionHint?: string;
126
+ /** ISO timestamp when this probe was performed. */
127
+ probedAt: string;
104
128
  warnings: string[];
105
129
  }
106
130
  export declare function probeInstalledCliContract(cli: CliType, timeoutMs?: number): InstalledCliContractProbe;
@@ -1,4 +1,5 @@
1
1
  import { spawnSync } from "node:child_process";
2
+ import { createHash } from "node:crypto";
2
3
  import { envWithExtendedPath, getExtendedPath, resolveCommandForSpawn } from "./executor.js";
3
4
  const PERMISSION_MODES = [
4
5
  "default",
@@ -255,12 +256,12 @@ export const UPSTREAM_CLI_CONTRACTS = {
255
256
  "workingDir",
256
257
  "addDir",
257
258
  ],
258
- resumeOnlyFlags: ["--last"],
259
+ resumeOnlyFlags: ["--last", "--all"],
259
260
  // Phase 4 slice α (v1.8.0) verified that `codex exec resume` accepts
260
261
  // `--output-schema` and `-c` (codex-cli 0.133.0 `exec resume --help`),
261
- // so they're no longer forbidden. `--search` stays forbidden (resume
262
- // inherits the original session's web-search state).
263
- resumeForbiddenFlags: ["--sandbox", "--ask-for-approval", "--full-auto", "--search"],
262
+ // so they're no longer forbidden. Current resume help does not accept
263
+ // session-profile or working-directory policy flags.
264
+ resumeForbiddenFlags: ["--sandbox", "-C", "--cd", "--add-dir", "--profile"],
264
265
  flags: {
265
266
  "--last": { arity: "none", description: "Resume latest session" },
266
267
  "--model": { arity: "one", description: "Model selector" },
@@ -269,12 +270,6 @@ export const UPSTREAM_CLI_CONTRACTS = {
269
270
  values: ["read-only", "workspace-write", "danger-full-access"],
270
271
  description: "Sandbox policy",
271
272
  },
272
- "--ask-for-approval": {
273
- arity: "one",
274
- values: ["untrusted", "on-request", "never"],
275
- description: "Approval policy",
276
- },
277
- "--full-auto": { arity: "none", description: "Legacy full-auto shortcut" },
278
273
  "--dangerously-bypass-approvals-and-sandbox": {
279
274
  arity: "none",
280
275
  description: "Disable approvals and sandbox",
@@ -282,25 +277,62 @@ export const UPSTREAM_CLI_CONTRACTS = {
282
277
  "--json": { arity: "none", description: "JSONL event stream" },
283
278
  "--skip-git-repo-check": { arity: "none", description: "Allow non-git cwd" },
284
279
  "--output-schema": { arity: "one", description: "Structured output JSON schema path" },
285
- "--search": { arity: "none", description: "Enable web search" },
286
280
  "--profile": { arity: "one", description: "Config profile" },
287
281
  "-c": {
288
282
  arity: "one",
289
283
  pattern: /^[a-zA-Z0-9._]+=([^\r\n]*)$/,
290
284
  description: "Config override key=value",
291
285
  },
286
+ "--config": {
287
+ arity: "one",
288
+ pattern: /^[a-zA-Z0-9._]+=([^\r\n]*)$/,
289
+ description: "Config override key=value",
290
+ },
291
+ "--enable": { arity: "one", description: "Enable a Codex feature flag" },
292
+ "--disable": { arity: "one", description: "Disable a Codex feature flag" },
293
+ "--strict-config": {
294
+ arity: "none",
295
+ description: "Reject unrecognized config.toml fields",
296
+ },
292
297
  "--ephemeral": { arity: "none", description: "Do not persist session" },
293
298
  "-i": { arity: "one", description: "Image path" },
299
+ "--image": { arity: "one", description: "Image path" },
294
300
  "--ignore-user-config": { arity: "none", description: "Ignore user config" },
295
301
  "--ignore-rules": { arity: "none", description: "Ignore rule files" },
296
- // The gateway only ever emits the short form `-C` (codex 0.134.0 accepts
297
- // both `-C` and `--cd` as aliases). The contract registers exactly what
298
- // we emit; if a future code path emits `--cd` instead, the contract
299
- // check will fail loudly — which is the intended catch.
302
+ "--oss": { arity: "none", description: "Use open-source provider" },
303
+ "--local-provider": {
304
+ arity: "one",
305
+ values: ["lmstudio", "ollama"],
306
+ description: "Local open-source provider",
307
+ },
308
+ "--color": {
309
+ arity: "one",
310
+ values: ["always", "never", "auto"],
311
+ description: "Output color mode",
312
+ },
313
+ "--output-last-message": {
314
+ arity: "one",
315
+ description: "Write the final agent message to a file",
316
+ },
317
+ "--dangerously-bypass-hook-trust": {
318
+ arity: "none",
319
+ description: "Run enabled hooks without persisted hook trust",
320
+ },
321
+ "--version": { arity: "none", description: "Print version" },
322
+ "--all": {
323
+ arity: "none",
324
+ description: "Resume picker: show all sessions without cwd filtering",
325
+ },
326
+ // The gateway emits the short form `-C`, and the advisory contract also
327
+ // tracks the long `--cd` alias advertised by current Codex exec help.
300
328
  "-C": {
301
329
  arity: "one",
302
330
  description: "Working root for the session (Phase 4 slice ζ; new sessions only)",
303
331
  },
332
+ "--cd": {
333
+ arity: "one",
334
+ description: "Working root for the session",
335
+ },
304
336
  "--add-dir": {
305
337
  arity: "one",
306
338
  description: "Additional writable workspace directory (Phase 4 slice ζ; repeat once per directory; new sessions only)",
@@ -320,6 +352,18 @@ export const UPSTREAM_CLI_CONTRACTS = {
320
352
  args: ["exec", "--sandbox", "workspace", "hello"],
321
353
  expect: "fail",
322
354
  },
355
+ {
356
+ id: "codex-ask-for-approval-unsupported",
357
+ description: "Current Codex CLI no longer accepts --ask-for-approval",
358
+ args: ["exec", "--ask-for-approval", "never", "hello"],
359
+ expect: "fail",
360
+ },
361
+ {
362
+ id: "codex-full-auto-unsupported",
363
+ description: "Current Codex CLI no longer accepts --full-auto",
364
+ args: ["exec", "--full-auto", "hello"],
365
+ expect: "fail",
366
+ },
323
367
  {
324
368
  // Phase 4 slice α: --output-schema IS accepted on resume per
325
369
  // codex-cli 0.133.0; this fixture pins the new behaviour so future
@@ -336,9 +380,9 @@ export const UPSTREAM_CLI_CONTRACTS = {
336
380
  expect: "pass",
337
381
  },
338
382
  {
339
- id: "codex-resume-search-still-forbidden",
340
- description: "Phase 4 slice α: --search remains forbidden on resume",
341
- args: ["exec", "resume", "--search", "session-id", "hello"],
383
+ id: "codex-search-unsupported",
384
+ description: "Current Codex exec no longer accepts --search",
385
+ args: ["exec", "--search", "hello"],
342
386
  expect: "fail",
343
387
  },
344
388
  {
@@ -361,6 +405,41 @@ export const UPSTREAM_CLI_CONTRACTS = {
361
405
  ],
362
406
  expect: "pass",
363
407
  },
408
+ {
409
+ id: "codex-current-exec-help-surface",
410
+ description: "Current Codex exec advertises additional config, output, provider, and safety flags",
411
+ args: [
412
+ "exec",
413
+ "--config",
414
+ "features.foo=true",
415
+ "--enable",
416
+ "foo",
417
+ "--disable",
418
+ "bar",
419
+ "--strict-config",
420
+ "--image",
421
+ "/tmp/a.png",
422
+ "--oss",
423
+ "--local-provider",
424
+ "ollama",
425
+ "--color",
426
+ "auto",
427
+ "--cd",
428
+ "/tmp/work",
429
+ "--output-last-message",
430
+ "/tmp/out.txt",
431
+ "--dangerously-bypass-hook-trust",
432
+ "--version",
433
+ "hello",
434
+ ],
435
+ expect: "pass",
436
+ },
437
+ {
438
+ id: "codex-current-resume-help-surface",
439
+ description: "Current Codex resume advertises --all for disabling cwd filtering",
440
+ args: ["exec", "resume", "--all", "session-id", "hello"],
441
+ expect: "pass",
442
+ },
364
443
  ],
365
444
  },
366
445
  gemini: {
@@ -554,6 +633,38 @@ export const UPSTREAM_CLI_CONTRACTS = {
554
633
  arity: "one",
555
634
  description: "Permission deny rule (Phase 4 slice θ; repeat once per rule per `grok --help`)",
556
635
  },
636
+ "--agent": { arity: "one", description: "Agent name or definition file path" },
637
+ "--agents": { arity: "one", description: "Inline subagent definitions JSON" },
638
+ "--best-of-n": {
639
+ arity: "one",
640
+ pattern: /^[1-9][0-9]*$/,
641
+ description: "Run the task N ways in parallel and pick the best",
642
+ },
643
+ "--check": { arity: "none", description: "Append a self-verification loop" },
644
+ "--disable-web-search": {
645
+ arity: "none",
646
+ description: "Disable web search and web fetch tools",
647
+ },
648
+ "--experimental-memory": { arity: "none", description: "Enable cross-session memory" },
649
+ "--no-alt-screen": { arity: "none", description: "Run inline without alt screen" },
650
+ "--no-memory": { arity: "none", description: "Disable cross-session memory" },
651
+ "--no-plan": { arity: "none", description: "Disable plan mode" },
652
+ "--no-subagents": { arity: "none", description: "Disable subagent spawning" },
653
+ "--oauth": { arity: "none", description: "Use OAuth during authentication" },
654
+ "--prompt-file": { arity: "one", description: "Single-turn prompt from a file" },
655
+ "--prompt-json": { arity: "one", description: "Single-turn prompt JSON blocks" },
656
+ "--restore-code": {
657
+ arity: "none",
658
+ description: "Check out the original session commit when resuming",
659
+ },
660
+ "--single": { arity: "one", description: "Single-turn prompt" },
661
+ "--todo-gate": { arity: "none", description: "Enable runtime turn-end TodoGate" },
662
+ "--verbatim": { arity: "none", description: "Send prompt exactly as given" },
663
+ "--version": { arity: "none", description: "Print version" },
664
+ "--worktree": {
665
+ arity: "optional",
666
+ description: "Start the session in a new git worktree, optionally named",
667
+ },
557
668
  },
558
669
  env: {},
559
670
  conformanceFixtures: [
@@ -617,6 +728,40 @@ export const UPSTREAM_CLI_CONTRACTS = {
617
728
  args: ["-p", "hello", "--deny", "write", "--deny", "kill"],
618
729
  expect: "pass",
619
730
  },
731
+ {
732
+ id: "grok-current-help-surface",
733
+ description: "Current Grok Build help advertises agent, prompt, memory, web, and worktree flags",
734
+ args: [
735
+ "-p",
736
+ "hello",
737
+ "--agent",
738
+ "reviewer",
739
+ "--agents",
740
+ "{}",
741
+ "--best-of-n",
742
+ "2",
743
+ "--check",
744
+ "--disable-web-search",
745
+ "--experimental-memory",
746
+ "--no-alt-screen",
747
+ "--no-memory",
748
+ "--no-plan",
749
+ "--no-subagents",
750
+ "--oauth",
751
+ "--prompt-file",
752
+ "/tmp/prompt.md",
753
+ "--prompt-json",
754
+ "[]",
755
+ "--restore-code",
756
+ "--single",
757
+ "single prompt",
758
+ "--todo-gate",
759
+ "--verbatim",
760
+ "--version",
761
+ "--worktree",
762
+ ],
763
+ expect: "pass",
764
+ },
620
765
  ],
621
766
  },
622
767
  mistral: {
@@ -948,6 +1093,39 @@ function validateFlagValue(cli, arg, flag, value, index, violations) {
948
1093
  });
949
1094
  }
950
1095
  }
1096
+ /**
1097
+ * Best-effort, advisory-only extraction of long-form flags from raw --help text.
1098
+ * Returns a sorted array of unique `--foo-bar` style flags discovered in the output.
1099
+ *
1100
+ * Heuristics:
1101
+ * - Matches common option declaration lines emitted by clap, yargs, commander, custom TUIs, etc.
1102
+ * - Lowercases for stable comparison against our contract keys.
1103
+ * - Intentionally conservative: ignores obvious noise (URLs, prose in descriptions).
1104
+ *
1105
+ * This powers the bidirectional drift detector (extra flags the installed binary
1106
+ * advertises that our contract does not yet allow). It is NEVER used for argv
1107
+ * validation — only for the upstream scanner and `upstream_contracts` probe reports.
1108
+ */
1109
+ export function extractDiscoveredFlags(helpText) {
1110
+ const discovered = new Set();
1111
+ // Long flags: --foo, --foo-bar, --foo_bar (some CLIs normalize _ to - in display).
1112
+ // Only inspect option declaration lines so prose such as
1113
+ // "(Claude Code: --allowedTools)" does not create false drift.
1114
+ const longRe = /--([a-z0-9][a-z0-9_-]{1,}[a-z0-9]?)/g;
1115
+ for (const line of helpText.split(/\r?\n/)) {
1116
+ const trimmed = line.trimStart();
1117
+ if (!trimmed.startsWith("-"))
1118
+ continue;
1119
+ const declaration = trimmed.split(/\s{2,}/, 1)[0] ?? "";
1120
+ for (const match of declaration.matchAll(longRe)) {
1121
+ const name = `--${match[1].toLowerCase().replace(/_/g, "-")}`;
1122
+ if (name === "--help")
1123
+ continue;
1124
+ discovered.add(name);
1125
+ }
1126
+ }
1127
+ return Array.from(discovered).sort();
1128
+ }
951
1129
  export function probeInstalledCliContract(cli, timeoutMs = 5_000) {
952
1130
  const contract = UPSTREAM_CLI_CONTRACTS[cli];
953
1131
  const outputs = [];
@@ -979,6 +1157,11 @@ export function probeInstalledCliContract(cli, timeoutMs = 5_000) {
979
1157
  available: false,
980
1158
  checkedHelpCommands: contract.helpArgs,
981
1159
  missingFlags: [],
1160
+ extraFlags: [],
1161
+ discoveredFlags: [],
1162
+ helpHash: undefined,
1163
+ versionHint: undefined,
1164
+ probedAt: new Date().toISOString(),
982
1165
  warnings: [result.error.message],
983
1166
  };
984
1167
  }
@@ -989,6 +1172,13 @@ export function probeInstalledCliContract(cli, timeoutMs = 5_000) {
989
1172
  }
990
1173
  const helpText = outputs.join("\n");
991
1174
  const missingFlags = Object.keys(contract.flags).filter(flag => !helpText.includes(flag));
1175
+ const discoveredFlags = extractDiscoveredFlags(helpText);
1176
+ const contractFlagSet = new Set(Object.keys(contract.flags));
1177
+ const extraFlags = discoveredFlags.filter(f => !contractFlagSet.has(f));
1178
+ // Cheap version hint: first line that looks like a version banner
1179
+ const versionMatch = helpText.match(/^\s*(?:[A-Za-z][\w .-]+)?v?\d+\.\d+\S*/m);
1180
+ const versionHint = versionMatch ? versionMatch[0].trim().slice(0, 80) : undefined;
1181
+ const helpHash = createHash("sha256").update(helpText).digest("hex");
992
1182
  return {
993
1183
  cli,
994
1184
  executable: contract.executable,
@@ -997,6 +1187,11 @@ export function probeInstalledCliContract(cli, timeoutMs = 5_000) {
997
1187
  available: true,
998
1188
  checkedHelpCommands: contract.helpArgs,
999
1189
  missingFlags,
1190
+ extraFlags,
1191
+ discoveredFlags,
1192
+ helpHash,
1193
+ versionHint,
1194
+ probedAt: new Date().toISOString(),
1000
1195
  warnings,
1001
1196
  };
1002
1197
  }
@@ -1,4 +1,4 @@
1
- import { z } from "zod";
1
+ import { z } from "zod/v3";
2
2
  import { getAvailableCliInfo } from "./model-registry.js";
3
3
  import { collectValidationJobResult, startJudgeSynthesis, startValidationRun, } from "./validation-orchestrator.js";
4
4
  const providerSchema = z.enum(["claude", "codex", "gemini", "grok", "mistral"]);