@f-o-h/cli 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -10
- package/dist/foh.js +56 -7
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -118,11 +118,11 @@ Use this when testing whether a clean coding agent can start from public docs
|
|
|
118
118
|
and the public npm package without private repo context:
|
|
119
119
|
|
|
120
120
|
```bash
|
|
121
|
-
foh eval external-agent batch \
|
|
122
|
-
--models openai/codex,anthropic/claude,cursor/agent \
|
|
123
|
-
--prompt-version blank-setup.v1 \
|
|
124
|
-
--json
|
|
125
|
-
```
|
|
121
|
+
foh eval external-agent batch \
|
|
122
|
+
--models openai/codex,anthropic/claude,cursor/agent \
|
|
123
|
+
--prompt-version blank-setup.v1 \
|
|
124
|
+
--json
|
|
125
|
+
```
|
|
126
126
|
|
|
127
127
|
Run each returned launch command in a clean agent terminal:
|
|
128
128
|
|
|
@@ -137,11 +137,25 @@ The command writes a versioned prompt, launches an instrumented shell, captures
|
|
|
137
137
|
FOH CLI commands into `commands.ndjson`, and finalizes `run.json` as an
|
|
138
138
|
`external_agent_run.v1` artifact when the shell exits.
|
|
139
139
|
|
|
140
|
-
Run artifacts include `eval_state` so repeated benchmark runs make reuse
|
|
141
|
-
explicit: org, agent, and widget reuse are expected; fresh paid phone-number
|
|
142
|
-
creation is not expected.
|
|
143
|
-
|
|
144
|
-
For guarded programmable-runner planning:
|
|
140
|
+
Run artifacts include `eval_state` so repeated benchmark runs make reuse
|
|
141
|
+
explicit: org, agent, and widget reuse are expected; fresh paid phone-number
|
|
142
|
+
creation is not expected.
|
|
143
|
+
|
|
144
|
+
For a planted knowledge-miss benchmark:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
foh eval external-agent batch \
|
|
148
|
+
--models openai/codex \
|
|
149
|
+
--prompt-version knowledge-miss.v1 \
|
|
150
|
+
--knowledge-question "Does 12 Acacia Avenue allow Saturday viewings?" \
|
|
151
|
+
--expected-answer "Saturday viewings are available by appointment only." \
|
|
152
|
+
--json
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The prompt instructs the clean agent to run `foh knowledge query`, preserve the
|
|
156
|
+
failure packet, and convert it with `foh bug improve`.
|
|
157
|
+
|
|
158
|
+
For guarded programmable-runner planning:
|
|
145
159
|
|
|
146
160
|
```bash
|
|
147
161
|
foh eval external-agent execute \
|
package/dist/foh.js
CHANGED
|
@@ -32801,7 +32801,7 @@ var StdioServerTransport = class {
|
|
|
32801
32801
|
};
|
|
32802
32802
|
|
|
32803
32803
|
// src/lib/cli-version.ts
|
|
32804
|
-
var CLI_VERSION = "0.1.49";
|
|
32804
|
+
var CLI_VERSION = "0.1.51";
|
|
32805
32805
|
|
|
32806
32806
|
// src/commands/mcp-serve.ts
|
|
32807
32807
|
var DEFAULT_TIMEOUT_MS = 12e4;
|
|
@@ -40064,8 +40064,37 @@ function inferShell(raw) {
|
|
|
40064
40064
|
if (process.platform === "win32") return { command: "powershell.exe", args: ["-NoLogo", "-NoProfile"], label: "powershell" };
|
|
40065
40065
|
return { command: process.env.SHELL || "sh", args: [], label: process.env.SHELL || "sh" };
|
|
40066
40066
|
}
|
|
40067
|
-
function writePrompt(runDir, promptVersion) {
|
|
40068
|
-
const prompt = PROMPTS[promptVersion] ?? PROMPTS[DEFAULT_PROMPT_VERSION];
|
|
40067
|
+
function replayPromptContext(replayFile) {
|
|
40068
|
+
const file2 = String(replayFile || "").trim();
|
|
40069
|
+
if (!file2) return "";
|
|
40070
|
+
return [
|
|
40071
|
+
"",
|
|
40072
|
+
"Replay artifact context:",
|
|
40073
|
+
`- Replay file: ${file2}`,
|
|
40074
|
+
`- Start by running: npx --yes @f-o-h/cli@latest agent replay --file ${quoteArg(file2)} --json`,
|
|
40075
|
+
"- If the replay packet exposes trace/test next commands, run the safest read-only command and convert any failure into `foh bug improve` evidence."
|
|
40076
|
+
].join("\n");
|
|
40077
|
+
}
|
|
40078
|
+
function knowledgeMissPromptContext(knowledgeQuestion, expectedAnswer) {
|
|
40079
|
+
const question = String(knowledgeQuestion || "").trim();
|
|
40080
|
+
if (!question) return "";
|
|
40081
|
+
const expected = String(expectedAnswer || "").trim();
|
|
40082
|
+
return [
|
|
40083
|
+
"",
|
|
40084
|
+
"Planted knowledge-miss context:",
|
|
40085
|
+
`- Question to diagnose: ${question}`,
|
|
40086
|
+
...expected ? [`- Expected answer or missing fact: ${expected}`] : [],
|
|
40087
|
+
`- Start by running: npx --yes @f-o-h/cli@latest knowledge query --agent <agent_id> --text ${quoteArg(question)} --explain --json`,
|
|
40088
|
+
"- If the query returns no match or low confidence, write the failure packet and convert it with `foh bug improve --source-type knowledge_miss --from-file <packet.json> --json`.",
|
|
40089
|
+
"- Do not patch around the miss manually; produce the smallest redacted artifact that explains whether the fix belongs to docs, ingestion, retrieval, config, or runtime."
|
|
40090
|
+
].join("\n");
|
|
40091
|
+
}
|
|
40092
|
+
function writePrompt(runDir, promptVersion, context = {}) {
|
|
40093
|
+
const prompt = [
|
|
40094
|
+
PROMPTS[promptVersion] ?? PROMPTS[DEFAULT_PROMPT_VERSION],
|
|
40095
|
+
replayPromptContext(context.replayFile),
|
|
40096
|
+
knowledgeMissPromptContext(context.knowledgeQuestion, context.expectedAnswer)
|
|
40097
|
+
].join("");
|
|
40069
40098
|
const path2 = (0, import_path14.join)(runDir, "prompt.txt");
|
|
40070
40099
|
(0, import_fs16.writeFileSync)(path2, `${prompt}
|
|
40071
40100
|
`, "utf8");
|
|
@@ -40128,6 +40157,11 @@ function buildRunArtifact(input) {
|
|
|
40128
40157
|
commands_run: commands.map((command) => command.command),
|
|
40129
40158
|
docs_pages_used: agentMetadata.docs_pages_used,
|
|
40130
40159
|
eval_state: buildDefaultEvalState(),
|
|
40160
|
+
context: {
|
|
40161
|
+
replay_file: input.session.replay_file ?? null,
|
|
40162
|
+
knowledge_question: input.session.knowledge_question ?? null,
|
|
40163
|
+
expected_answer: input.session.expected_answer ?? null
|
|
40164
|
+
},
|
|
40131
40165
|
artifacts: {
|
|
40132
40166
|
terminal_transcript: null,
|
|
40133
40167
|
command_log: "commands.ndjson",
|
|
@@ -40148,16 +40182,19 @@ function buildRunArtifact(input) {
|
|
|
40148
40182
|
function registerEval(program3) {
|
|
40149
40183
|
const evalCommand = program3.command("eval").description("Run or summarize external-agent evaluation workflows");
|
|
40150
40184
|
const external = evalCommand.command("external-agent").description("Capture clean external coding-agent setup attempts");
|
|
40151
|
-
external.command("batch").description("Create a deterministic multi-model external-agent batch plan").option("--models <list>", "Comma-separated provider/model list", DEFAULT_BATCH_MODELS).option("--prompt-version <version>", "Prompt version", DEFAULT_PROMPT_VERSION).option("--workspace-type <type>", "Workspace type label", "clean-no-repo").option("--agent-shell <name>", "Agent shell label", "vscode-terminal").option("--out-dir <path>", "Batch output directory").option("--json", "Output as JSON").action(async (opts) => {
|
|
40185
|
+
external.command("batch").description("Create a deterministic multi-model external-agent batch plan").option("--models <list>", "Comma-separated provider/model list", DEFAULT_BATCH_MODELS).option("--prompt-version <version>", "Prompt version", DEFAULT_PROMPT_VERSION).option("--replay-file <path>", "Local transcript/replay artifact to seed replay-failure prompts").option("--knowledge-question <text>", "Question to seed knowledge-miss prompts").option("--expected-answer <text>", "Expected answer or missing fact for planted knowledge-miss prompts").option("--workspace-type <type>", "Workspace type label", "clean-no-repo").option("--agent-shell <name>", "Agent shell label", "vscode-terminal").option("--out-dir <path>", "Batch output directory").option("--json", "Output as JSON").action(async (opts) => {
|
|
40152
40186
|
const promptVersion = String(opts.promptVersion || DEFAULT_PROMPT_VERSION);
|
|
40153
40187
|
const batchDir = (0, import_path14.resolve)(String(opts.outDir || defaultBatchDir(promptVersion)));
|
|
40188
|
+
const replayFile = opts.replayFile ? (0, import_path14.resolve)(String(opts.replayFile)) : void 0;
|
|
40189
|
+
const knowledgeQuestion = opts.knowledgeQuestion ? String(opts.knowledgeQuestion) : void 0;
|
|
40190
|
+
const expectedAnswer = opts.expectedAnswer ? String(opts.expectedAnswer) : void 0;
|
|
40154
40191
|
const models = parseModelList(String(opts.models || DEFAULT_BATCH_MODELS));
|
|
40155
40192
|
(0, import_fs16.mkdirSync)(batchDir, { recursive: true });
|
|
40156
40193
|
const runs = models.map((model, index) => {
|
|
40157
40194
|
const runId = `${String(index + 1).padStart(2, "0")}-${safeSlug(model.provider)}-${safeSlug(model.name)}`;
|
|
40158
40195
|
const runDir = (0, import_path14.join)(batchDir, runId);
|
|
40159
40196
|
(0, import_fs16.mkdirSync)(runDir, { recursive: true });
|
|
40160
|
-
const promptPath = writePrompt(runDir, promptVersion);
|
|
40197
|
+
const promptPath = writePrompt(runDir, promptVersion, { replayFile, knowledgeQuestion, expectedAnswer });
|
|
40161
40198
|
const commandArgs = [
|
|
40162
40199
|
"eval",
|
|
40163
40200
|
"external-agent",
|
|
@@ -40175,6 +40212,9 @@ function registerEval(program3) {
|
|
|
40175
40212
|
"--out-dir",
|
|
40176
40213
|
runDir
|
|
40177
40214
|
];
|
|
40215
|
+
if (replayFile) commandArgs.push("--replay-file", replayFile);
|
|
40216
|
+
if (knowledgeQuestion) commandArgs.push("--knowledge-question", knowledgeQuestion);
|
|
40217
|
+
if (expectedAnswer) commandArgs.push("--expected-answer", expectedAnswer);
|
|
40178
40218
|
return {
|
|
40179
40219
|
run_id: runId,
|
|
40180
40220
|
model_provider: model.provider,
|
|
@@ -40191,6 +40231,9 @@ function registerEval(program3) {
|
|
|
40191
40231
|
created_at: (/* @__PURE__ */ new Date()).toISOString(),
|
|
40192
40232
|
batch_dir: batchDir,
|
|
40193
40233
|
prompt_version: promptVersion,
|
|
40234
|
+
replay_file: replayFile ?? null,
|
|
40235
|
+
knowledge_question: knowledgeQuestion ?? null,
|
|
40236
|
+
expected_answer: expectedAnswer ?? null,
|
|
40194
40237
|
workspace_type: String(opts.workspaceType || "clean-no-repo"),
|
|
40195
40238
|
agent_shell: String(opts.agentShell || "vscode-terminal"),
|
|
40196
40239
|
run_count: runs.length,
|
|
@@ -40215,13 +40258,16 @@ function registerEval(program3) {
|
|
|
40215
40258
|
extra: { batch }
|
|
40216
40259
|
}), { json: Boolean(opts.json) });
|
|
40217
40260
|
});
|
|
40218
|
-
external.command("run").description("Launch an instrumented shell and emit external_agent_run.v1 when it exits").option("--model-provider <name>", "Model provider label", "unknown").option("--model-name <name>", "Model name label", "unknown-model").option("--prompt-version <version>", "Prompt version", DEFAULT_PROMPT_VERSION).option("--workspace-type <type>", "Workspace type label", "clean-no-repo").option("--agent-shell <name>", "Agent shell label", "vscode-terminal").option("--out-dir <path>", "Run output directory").option("--status <status>", "Final status when not interactively classified: pass|hold|fail", "hold").option("--reason-code <code>", "Failure/hold reason code", "external_agent_run_needs_review").option("--shell <command>", "Shell command to launch for capture").option("--no-shell", "Do not launch a shell; create/finalize artifacts immediately").option("--json", "Output as JSON").action(async (opts) => {
|
|
40261
|
+
external.command("run").description("Launch an instrumented shell and emit external_agent_run.v1 when it exits").option("--model-provider <name>", "Model provider label", "unknown").option("--model-name <name>", "Model name label", "unknown-model").option("--prompt-version <version>", "Prompt version", DEFAULT_PROMPT_VERSION).option("--replay-file <path>", "Local transcript/replay artifact to seed replay-failure prompts").option("--knowledge-question <text>", "Question to seed knowledge-miss prompts").option("--expected-answer <text>", "Expected answer or missing fact for planted knowledge-miss prompts").option("--workspace-type <type>", "Workspace type label", "clean-no-repo").option("--agent-shell <name>", "Agent shell label", "vscode-terminal").option("--out-dir <path>", "Run output directory").option("--status <status>", "Final status when not interactively classified: pass|hold|fail", "hold").option("--reason-code <code>", "Failure/hold reason code", "external_agent_run_needs_review").option("--shell <command>", "Shell command to launch for capture").option("--no-shell", "Do not launch a shell; create/finalize artifacts immediately").option("--json", "Output as JSON").action(async (opts) => {
|
|
40219
40262
|
const status = normalizeStatus(opts.status);
|
|
40220
40263
|
const promptVersion = String(opts.promptVersion || DEFAULT_PROMPT_VERSION);
|
|
40221
40264
|
const runDir = (0, import_path14.resolve)(String(opts.outDir || defaultRunDir(opts.modelName, promptVersion)));
|
|
40265
|
+
const replayFile = opts.replayFile ? (0, import_path14.resolve)(String(opts.replayFile)) : void 0;
|
|
40266
|
+
const knowledgeQuestion = opts.knowledgeQuestion ? String(opts.knowledgeQuestion) : void 0;
|
|
40267
|
+
const expectedAnswer = opts.expectedAnswer ? String(opts.expectedAnswer) : void 0;
|
|
40222
40268
|
(0, import_fs16.mkdirSync)(runDir, { recursive: true });
|
|
40223
40269
|
const runId = runDir.split(/[\\/]/).filter(Boolean).slice(-1)[0];
|
|
40224
|
-
const promptPath = writePrompt(runDir, promptVersion);
|
|
40270
|
+
const promptPath = writePrompt(runDir, promptVersion, { replayFile, knowledgeQuestion, expectedAnswer });
|
|
40225
40271
|
const shell = inferShell(opts.shell);
|
|
40226
40272
|
const session = {
|
|
40227
40273
|
schema_version: "external_agent_capture_session.v1",
|
|
@@ -40230,6 +40276,9 @@ function registerEval(program3) {
|
|
|
40230
40276
|
model_provider: String(opts.modelProvider || "unknown"),
|
|
40231
40277
|
model_name: String(opts.modelName || "unknown-model"),
|
|
40232
40278
|
prompt_version: promptVersion,
|
|
40279
|
+
replay_file: replayFile ?? null,
|
|
40280
|
+
knowledge_question: knowledgeQuestion ?? null,
|
|
40281
|
+
expected_answer: expectedAnswer ?? null,
|
|
40233
40282
|
workspace_type: String(opts.workspaceType || "clean-no-repo"),
|
|
40234
40283
|
agent_shell: String(opts.agentShell || shell.label),
|
|
40235
40284
|
manual_intervention_count: 0,
|