@oh-my-pi/pi-coding-agent 13.18.0 → 14.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +316 -1
- package/package.json +86 -24
- package/scripts/format-prompts.ts +2 -2
- package/src/autoresearch/apply-contract-to-state.ts +24 -0
- package/src/autoresearch/contract.ts +0 -44
- package/src/autoresearch/dashboard.ts +1 -2
- package/src/autoresearch/git.ts +116 -30
- package/src/autoresearch/helpers.ts +49 -0
- package/src/autoresearch/index.ts +28 -187
- package/src/autoresearch/prompt.md +26 -9
- package/src/autoresearch/state.ts +0 -6
- package/src/autoresearch/tools/init-experiment.ts +202 -117
- package/src/autoresearch/tools/log-experiment.ts +123 -178
- package/src/autoresearch/tools/run-experiment.ts +48 -10
- package/src/autoresearch/types.ts +2 -2
- package/src/capability/index.ts +4 -2
- package/src/cli/file-processor.ts +3 -3
- package/src/cli/grep-cli.ts +8 -8
- package/src/cli/grievances-cli.ts +78 -0
- package/src/cli/read-cli.ts +67 -0
- package/src/cli/setup-cli.ts +4 -4
- package/src/cli/update-cli.ts +3 -3
- package/src/cli.ts +2 -0
- package/src/commands/grep.ts +6 -1
- package/src/commands/grievances.ts +20 -0
- package/src/commands/read.ts +33 -0
- package/src/commit/agentic/agent.ts +5 -8
- package/src/commit/agentic/index.ts +22 -26
- package/src/commit/agentic/tools/analyze-file.ts +3 -3
- package/src/commit/agentic/tools/git-file-diff.ts +3 -6
- package/src/commit/agentic/tools/git-hunk.ts +3 -3
- package/src/commit/agentic/tools/git-overview.ts +6 -9
- package/src/commit/agentic/tools/index.ts +6 -8
- package/src/commit/agentic/tools/propose-commit.ts +4 -7
- package/src/commit/agentic/tools/recent-commits.ts +3 -3
- package/src/commit/agentic/tools/split-commit.ts +4 -4
- package/src/commit/agentic/validation.ts +1 -1
- package/src/commit/analysis/conventional.ts +4 -4
- package/src/commit/analysis/summary.ts +3 -3
- package/src/commit/changelog/generate.ts +4 -4
- package/src/commit/changelog/index.ts +5 -9
- package/src/commit/map-reduce/map-phase.ts +4 -4
- package/src/commit/map-reduce/reduce-phase.ts +4 -4
- package/src/commit/pipeline.ts +13 -16
- package/src/config/keybindings.ts +7 -6
- package/src/config/prompt-templates.ts +44 -226
- package/src/config/resolve-config-value.ts +4 -2
- package/src/config/settings-schema.ts +98 -2
- package/src/config/settings.ts +25 -26
- package/src/dap/client.ts +674 -0
- package/src/dap/config.ts +150 -0
- package/src/dap/defaults.json +211 -0
- package/src/dap/index.ts +4 -0
- package/src/dap/session.ts +1255 -0
- package/src/dap/types.ts +600 -0
- package/src/debug/log-viewer.ts +3 -2
- package/src/discovery/builtin.ts +1 -2
- package/src/discovery/codex.ts +2 -2
- package/src/discovery/github.ts +2 -1
- package/src/discovery/helpers.ts +2 -2
- package/src/discovery/opencode.ts +2 -2
- package/src/edit/diff.ts +818 -0
- package/src/edit/index.ts +309 -0
- package/src/edit/line-hash.ts +67 -0
- package/src/edit/modes/chunk.ts +454 -0
- package/src/{patch → edit/modes}/hashline.ts +741 -361
- package/src/{patch/applicator.ts → edit/modes/patch.ts} +420 -117
- package/src/{patch/fuzzy.ts → edit/modes/replace.ts} +519 -197
- package/src/{patch → edit}/normalize.ts +97 -76
- package/src/{patch/shared.ts → edit/renderer.ts} +181 -108
- package/src/exec/bash-executor.ts +4 -2
- package/src/exec/idle-timeout-watchdog.ts +126 -0
- package/src/exec/non-interactive-env.ts +5 -0
- package/src/extensibility/custom-commands/bundled/ci-green/index.ts +6 -18
- package/src/extensibility/custom-commands/bundled/review/index.ts +45 -43
- package/src/extensibility/custom-commands/loader.ts +1 -2
- package/src/extensibility/custom-tools/loader.ts +34 -11
- package/src/extensibility/custom-tools/types.ts +1 -1
- package/src/extensibility/extensions/loader.ts +9 -4
- package/src/extensibility/extensions/runner.ts +24 -1
- package/src/extensibility/extensions/types.ts +4 -2
- package/src/extensibility/hooks/loader.ts +5 -6
- package/src/extensibility/hooks/types.ts +2 -2
- package/src/extensibility/plugins/doctor.ts +2 -1
- package/src/extensibility/plugins/marketplace/fetcher.ts +2 -57
- package/src/extensibility/plugins/marketplace/source-resolver.ts +4 -4
- package/src/extensibility/slash-commands.ts +3 -7
- package/src/index.ts +3 -1
- package/src/internal-urls/docs-index.generated.ts +11 -11
- package/src/ipy/executor.ts +58 -17
- package/src/ipy/gateway-coordinator.ts +6 -4
- package/src/ipy/kernel.ts +45 -22
- package/src/ipy/runtime.ts +2 -2
- package/src/lsp/client.ts +7 -4
- package/src/lsp/clients/lsp-linter-client.ts +4 -4
- package/src/lsp/config.ts +2 -2
- package/src/lsp/defaults.json +688 -154
- package/src/lsp/index.ts +234 -45
- package/src/lsp/lspmux.ts +2 -2
- package/src/lsp/startup-events.ts +13 -0
- package/src/lsp/types.ts +12 -1
- package/src/lsp/utils.ts +8 -1
- package/src/main.ts +125 -47
- package/src/memories/index.ts +4 -5
- package/src/modes/acp/acp-agent.ts +563 -163
- package/src/modes/acp/acp-event-mapper.ts +9 -1
- package/src/modes/acp/acp-mode.ts +4 -2
- package/src/modes/components/agent-dashboard.ts +3 -4
- package/src/modes/components/diff.ts +6 -7
- package/src/modes/components/footer.ts +9 -29
- package/src/modes/components/hook-editor.ts +3 -3
- package/src/modes/components/hook-selector.ts +6 -1
- package/src/modes/components/read-tool-group.ts +6 -12
- package/src/modes/components/session-observer-overlay.ts +472 -0
- package/src/modes/components/settings-defs.ts +24 -0
- package/src/modes/components/status-line.ts +15 -61
- package/src/modes/components/tool-execution.ts +1 -1
- package/src/modes/components/welcome.ts +1 -1
- package/src/modes/controllers/btw-controller.ts +2 -2
- package/src/modes/controllers/command-controller.ts +4 -2
- package/src/modes/controllers/event-controller.ts +59 -2
- package/src/modes/controllers/extension-ui-controller.ts +1 -0
- package/src/modes/controllers/input-controller.ts +15 -8
- package/src/modes/controllers/selector-controller.ts +26 -0
- package/src/modes/index.ts +20 -2
- package/src/modes/interactive-mode.ts +278 -69
- package/src/modes/rpc/host-tools.ts +186 -0
- package/src/modes/rpc/rpc-client.ts +178 -13
- package/src/modes/rpc/rpc-mode.ts +73 -3
- package/src/modes/rpc/rpc-types.ts +53 -1
- package/src/modes/session-observer-registry.ts +146 -0
- package/src/modes/shared.ts +0 -42
- package/src/modes/theme/theme.ts +80 -8
- package/src/modes/types.ts +4 -2
- package/src/modes/utils/keybinding-matchers.ts +9 -0
- package/src/prompts/system/custom-system-prompt.md +5 -0
- package/src/prompts/system/system-prompt.md +8 -1
- package/src/prompts/tools/chunk-edit.md +219 -0
- package/src/prompts/tools/debug.md +43 -0
- package/src/prompts/tools/grep.md +3 -0
- package/src/prompts/tools/lsp.md +5 -5
- package/src/prompts/tools/read-chunk.md +17 -0
- package/src/prompts/tools/read.md +19 -5
- package/src/sdk.ts +216 -165
- package/src/secrets/index.ts +1 -1
- package/src/secrets/obfuscator.ts +25 -17
- package/src/session/agent-session.ts +381 -286
- package/src/session/agent-storage.ts +12 -12
- package/src/session/compaction/branch-summarization.ts +3 -3
- package/src/session/compaction/compaction.ts +5 -6
- package/src/session/compaction/utils.ts +3 -3
- package/src/session/history-storage.ts +62 -19
- package/src/session/messages.ts +3 -3
- package/src/session/session-dump-format.ts +203 -0
- package/src/session/session-manager.ts +15 -5
- package/src/session/session-storage.ts +4 -2
- package/src/session/streaming-output.ts +1 -1
- package/src/session/tool-choice-queue.ts +213 -0
- package/src/slash-commands/builtin-registry.ts +56 -8
- package/src/ssh/connection-manager.ts +2 -2
- package/src/ssh/sshfs-mount.ts +5 -5
- package/src/stt/downloader.ts +4 -4
- package/src/stt/recorder.ts +4 -4
- package/src/stt/transcriber.ts +2 -2
- package/src/system-prompt.ts +25 -13
- package/src/task/agents.ts +5 -6
- package/src/task/commands.ts +2 -5
- package/src/task/executor.ts +32 -4
- package/src/task/index.ts +91 -82
- package/src/task/template.ts +2 -2
- package/src/task/types.ts +25 -0
- package/src/task/worktree.ts +131 -149
- package/src/tools/ask.ts +2 -3
- package/src/tools/ast-edit.ts +7 -7
- package/src/tools/ast-grep.ts +7 -7
- package/src/tools/auto-generated-guard.ts +36 -41
- package/src/tools/await-tool.ts +2 -2
- package/src/tools/bash.ts +5 -23
- package/src/tools/browser.ts +4 -5
- package/src/tools/calculator.ts +2 -3
- package/src/tools/cancel-job.ts +2 -2
- package/src/tools/checkpoint.ts +3 -3
- package/src/tools/debug.ts +1007 -0
- package/src/tools/exit-plan-mode.ts +3 -3
- package/src/tools/fetch.ts +67 -3
- package/src/tools/find.ts +4 -5
- package/src/tools/fs-cache-invalidation.ts +5 -0
- package/src/tools/gemini-image.ts +13 -5
- package/src/tools/gh.ts +130 -308
- package/src/tools/grep.ts +57 -9
- package/src/tools/index.ts +44 -22
- package/src/tools/inspect-image.ts +4 -4
- package/src/tools/output-meta.ts +1 -1
- package/src/tools/python.ts +19 -6
- package/src/tools/read.ts +211 -146
- package/src/tools/render-mermaid.ts +2 -3
- package/src/tools/render-utils.ts +20 -6
- package/src/tools/renderers.ts +3 -1
- package/src/tools/report-tool-issue.ts +80 -0
- package/src/tools/resolve.ts +70 -39
- package/src/tools/search-tool-bm25.ts +2 -2
- package/src/tools/ssh.ts +2 -2
- package/src/tools/todo-write.ts +2 -2
- package/src/tools/tool-timeouts.ts +1 -0
- package/src/tools/write.ts +5 -6
- package/src/tui/tree-list.ts +3 -1
- package/src/utils/clipboard.ts +80 -0
- package/src/utils/commit-message-generator.ts +2 -3
- package/src/utils/edit-mode.ts +49 -0
- package/src/utils/external-editor.ts +11 -5
- package/src/utils/file-display-mode.ts +6 -5
- package/src/utils/file-mentions.ts +8 -7
- package/src/utils/git.ts +1400 -0
- package/src/utils/image-loading.ts +98 -0
- package/src/utils/title-generator.ts +2 -3
- package/src/utils/tools-manager.ts +6 -6
- package/src/web/scrapers/choosealicense.ts +1 -1
- package/src/web/search/index.ts +3 -3
- package/src/web/search/render.ts +6 -4
- package/src/autoresearch/command-initialize.md +0 -34
- package/src/commit/git/errors.ts +0 -9
- package/src/commit/git/index.ts +0 -210
- package/src/commit/git/operations.ts +0 -54
- package/src/patch/diff.ts +0 -433
- package/src/patch/index.ts +0 -888
- package/src/patch/parser.ts +0 -532
- package/src/patch/types.ts +0 -292
- package/src/prompts/agents/oracle.md +0 -77
- package/src/tools/gh-cli.ts +0 -125
- package/src/tools/pending-action.ts +0 -49
- package/src/utils/child-process.ts +0 -88
- package/src/utils/frontmatter.ts +0 -117
- package/src/utils/image-input.ts +0 -274
- package/src/utils/mime.ts +0 -53
- package/src/utils/prompt-format.ts +0 -170
|
@@ -8,7 +8,11 @@ Autoresearch mode is active.
|
|
|
8
8
|
Primary goal:
|
|
9
9
|
{{goal}}
|
|
10
10
|
{{else}}
|
|
11
|
+
{{#if has_autoresearch_md}}
|
|
11
12
|
Primary goal is documented in `autoresearch.md` for this session.
|
|
13
|
+
{{else}}
|
|
14
|
+
There is no `autoresearch.md` yet. Infer what to optimize from the latest user message and the conversation; after you create `autoresearch.md`, keep it as the durable source of truth for goal and benchmark contract.
|
|
15
|
+
{{/if}}
|
|
12
16
|
{{/if}}
|
|
13
17
|
|
|
14
18
|
Working directory:
|
|
@@ -63,7 +67,7 @@ An unlogged run artifact exists at `{{pending_run_directory}}`.
|
|
|
63
67
|
|
|
64
68
|
- `init_experiment` — initialize or reset the experiment session for the current optimization target.
|
|
65
69
|
- `run_experiment` — run a benchmark or experiment command with timing, output capture, structured metric parsing, and optional backpressure checks.
|
|
66
|
-
- `log_experiment` — record the result, update the dashboard, persist JSONL history, auto-commit kept experiments, and
|
|
70
|
+
- `log_experiment` — record the result, update the dashboard, persist JSONL history, auto-commit kept experiments, and revert only run-modified files for discarded or failed experiments (pre-existing uncommitted changes are preserved).
|
|
67
71
|
|
|
68
72
|
### Operating protocol
|
|
69
73
|
|
|
@@ -83,6 +87,8 @@ An unlogged run artifact exists at `{{pending_run_directory}}`.
|
|
|
83
87
|
- Use the same workload every run unless you intentionally re-initialize with a new segment.
|
|
84
88
|
- Keep the measurement harness, evaluator, and fixed benchmark inputs stable unless you intentionally start a new segment and document the change.
|
|
85
89
|
4. Initialize the loop with `init_experiment` before the first logged run of a segment.
|
|
90
|
+
- Pass `from_autoresearch_md: true` with only `name` to load the benchmark contract from `autoresearch.md` without mirroring every field in the tool call.
|
|
91
|
+
- Use `abandon_unlogged_runs: true` only when you intentionally discard unlogged run artifacts and need a fresh segment (for example after a bad or obsolete benchmark directory).
|
|
86
92
|
5. Run a baseline first.
|
|
87
93
|
- Establish the baseline metric before attempting optimizations.
|
|
88
94
|
- Track secondary metrics only when they matter to correctness, quality, or obvious regressions.
|
|
@@ -90,7 +96,9 @@ An unlogged run artifact exists at `{{pending_run_directory}}`.
|
|
|
90
96
|
- Make one coherent experiment at a time.
|
|
91
97
|
- Run `run_experiment`.
|
|
92
98
|
- Interpret the result honestly.
|
|
93
|
-
- Call `log_experiment` after every run.
|
|
99
|
+
- Call `log_experiment` after every run (it refreshes benchmark/scope fields from `autoresearch.md` before logging so keep validation matches the file on disk).
|
|
100
|
+
- Use `run_experiment` with `force: true` only when you must override the segment benchmark command or skip the direct-`autoresearch.sh` rule.
|
|
101
|
+
- On `log_experiment`, `force: true` relaxes ASI requirements and allows keeping a primary-metric regression; prefer normal logging when possible.
|
|
94
102
|
7. Keep the primary metric as the decision maker.
|
|
95
103
|
- `keep` when the primary metric improves.
|
|
96
104
|
- `discard` when it regresses or stays flat.
|
|
@@ -137,7 +145,11 @@ Suggested structure:
|
|
|
137
145
|
{{#if has_goal}}
|
|
138
146
|
- {{goal}}
|
|
139
147
|
{{else}}
|
|
148
|
+
{{#if has_autoresearch_md}}
|
|
140
149
|
- document the active target here before the first benchmark
|
|
150
|
+
{{else}}
|
|
151
|
+
- (derive from the user's messages, then record here)
|
|
152
|
+
{{/if}}
|
|
141
153
|
{{/if}}
|
|
142
154
|
|
|
143
155
|
## Benchmark
|
|
@@ -194,15 +206,20 @@ Resume from the existing notes:
|
|
|
194
206
|
{{else}}
|
|
195
207
|
### Initial setup
|
|
196
208
|
|
|
197
|
-
`autoresearch.md` does not exist yet.
|
|
209
|
+
`autoresearch.md` does not exist yet. You decide the benchmark contract, harness, and scope from the user's messages and the repository—do not ask the user to re-type benchmark commands or metric names in a separate UI prompt.
|
|
210
|
+
|
|
211
|
+
Before the first benchmark:
|
|
198
212
|
|
|
199
|
-
|
|
213
|
+
- Write `autoresearch.md` with goal, benchmark command (must be a **direct** invocation of `autoresearch.sh`, e.g. `bash autoresearch.sh`), primary metric name and unit, direction (`lower` or `higher`), tradeoff metrics if relevant, files in scope, off limits, and constraints.
|
|
214
|
+
- Add a short preflight section: prerequisites, one-time setup, and the comparability invariant that must stay fixed across runs.
|
|
215
|
+
- Mark ground-truth evaluators, fixed datasets, and other measurement-critical files as off limits or hard constraints when they define the benchmark contract.
|
|
216
|
+
- Write or update `autoresearch.program.md` when you learn durable heuristics, failure patterns, or repo-specific strategy for later resume turns.
|
|
217
|
+
- Create `autoresearch.sh` as the canonical benchmark entrypoint; print the primary metric as `METRIC <name>=<number>` and optional secondary metrics as additional `METRIC` lines.
|
|
218
|
+
- Optionally add `autoresearch.checks.sh` if correctness or quality needs a hard gate.
|
|
219
|
+
- Call `init_experiment` with arguments that match `autoresearch.md` exactly (benchmark command, metric, unit, direction, scope paths, off limits, constraints).
|
|
220
|
+
- Run and log the baseline.
|
|
200
221
|
|
|
201
|
-
|
|
202
|
-
- write `autoresearch.sh`
|
|
203
|
-
- optionally write `autoresearch.checks.sh`
|
|
204
|
-
- run `init_experiment`
|
|
205
|
-
- run and log the baseline
|
|
222
|
+
Until `init_experiment` succeeds, only autoresearch control files (`autoresearch.md`, `autoresearch.sh`, `autoresearch.program.md`, `autoresearch.ideas.md`, `autoresearch.checks.sh`) may be edited; after initialization, respect Files in Scope from the contract.
|
|
206
223
|
|
|
207
224
|
{{/if}}
|
|
208
225
|
{{#if has_checks}}
|
|
@@ -34,7 +34,6 @@ export function createExperimentState(): ExperimentState {
|
|
|
34
34
|
scopePaths: [],
|
|
35
35
|
offLimits: [],
|
|
36
36
|
constraints: [],
|
|
37
|
-
segmentFingerprint: null,
|
|
38
37
|
};
|
|
39
38
|
}
|
|
40
39
|
|
|
@@ -203,8 +202,6 @@ export function reconstructStateFromJsonl(workDir: string): ReconstructedExperim
|
|
|
203
202
|
state.scopePaths = cloneStringArray(configEntry.scopePaths);
|
|
204
203
|
state.offLimits = cloneStringArray(configEntry.offLimits);
|
|
205
204
|
state.constraints = cloneStringArray(configEntry.constraints);
|
|
206
|
-
state.segmentFingerprint =
|
|
207
|
-
typeof configEntry.segmentFingerprint === "string" ? configEntry.segmentFingerprint : null;
|
|
208
205
|
state.secondaryMetrics = hydrateMetricDefs(configEntry.secondaryMetrics);
|
|
209
206
|
continue;
|
|
210
207
|
}
|
|
@@ -322,9 +319,6 @@ function parseConfigEntry(value: unknown): AutoresearchJsonConfigEntry | null {
|
|
|
322
319
|
candidate.constraints.filter((item): item is string => typeof item === "string"),
|
|
323
320
|
);
|
|
324
321
|
}
|
|
325
|
-
if (typeof candidate.segmentFingerprint === "string" && candidate.segmentFingerprint.trim().length > 0) {
|
|
326
|
-
config.segmentFingerprint = candidate.segmentFingerprint;
|
|
327
|
-
}
|
|
328
322
|
return config;
|
|
329
323
|
}
|
|
330
324
|
|
|
@@ -6,15 +6,15 @@ import { Type } from "@sinclair/typebox";
|
|
|
6
6
|
import type { ToolDefinition } from "../../extensibility/extensions";
|
|
7
7
|
import type { Theme } from "../../modes/theme/theme";
|
|
8
8
|
import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
|
|
9
|
+
import { applyAutoresearchContractToExperimentState } from "../apply-contract-to-state";
|
|
9
10
|
import {
|
|
10
|
-
buildAutoresearchSegmentFingerprint,
|
|
11
11
|
contractListsEqual,
|
|
12
12
|
contractPathListsEqual,
|
|
13
13
|
loadAutoresearchScriptSnapshot,
|
|
14
14
|
readAutoresearchContract,
|
|
15
15
|
} from "../contract";
|
|
16
16
|
import {
|
|
17
|
-
|
|
17
|
+
abandonUnloggedAutoresearchRuns,
|
|
18
18
|
isAutoresearchShCommand,
|
|
19
19
|
readMaxExperiments,
|
|
20
20
|
readPendingRunSummary,
|
|
@@ -28,9 +28,29 @@ const initExperimentSchema = Type.Object({
|
|
|
28
28
|
name: Type.String({
|
|
29
29
|
description: "Human-readable experiment name.",
|
|
30
30
|
}),
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
from_autoresearch_md: Type.Optional(
|
|
32
|
+
Type.Boolean({
|
|
33
|
+
description:
|
|
34
|
+
"When true, load benchmark command, metrics, scope, off-limits, and constraints from autoresearch.md instead of passing mirrored fields below.",
|
|
35
|
+
}),
|
|
36
|
+
),
|
|
37
|
+
abandon_unlogged_runs: Type.Optional(
|
|
38
|
+
Type.Boolean({
|
|
39
|
+
description:
|
|
40
|
+
"When true, mark all completed but unlogged run artifacts as abandoned so initialization can proceed without logging them first.",
|
|
41
|
+
}),
|
|
42
|
+
),
|
|
43
|
+
new_segment: Type.Optional(
|
|
44
|
+
Type.Boolean({
|
|
45
|
+
description:
|
|
46
|
+
"When true, force a new segment even when the contract fields have not changed. Without this, re-initialization with matching contract is a no-op.",
|
|
47
|
+
}),
|
|
48
|
+
),
|
|
49
|
+
metric_name: Type.Optional(
|
|
50
|
+
Type.String({
|
|
51
|
+
description: "Primary metric name shown in the dashboard. Required when from_autoresearch_md is false.",
|
|
52
|
+
}),
|
|
53
|
+
),
|
|
34
54
|
metric_unit: Type.Optional(
|
|
35
55
|
Type.String({
|
|
36
56
|
description: "Unit for the primary metric, for example µs, ms, s, kb, or empty.",
|
|
@@ -41,13 +61,17 @@ const initExperimentSchema = Type.Object({
|
|
|
41
61
|
description: "Whether lower or higher values are better. Defaults to lower.",
|
|
42
62
|
}),
|
|
43
63
|
),
|
|
44
|
-
benchmark_command: Type.
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
64
|
+
benchmark_command: Type.Optional(
|
|
65
|
+
Type.String({
|
|
66
|
+
description: "Benchmark command recorded in autoresearch.md. Required when from_autoresearch_md is false.",
|
|
67
|
+
}),
|
|
68
|
+
),
|
|
69
|
+
scope_paths: Type.Optional(
|
|
70
|
+
Type.Array(Type.String(), {
|
|
71
|
+
description: "Files in Scope from autoresearch.md. Required when from_autoresearch_md is false.",
|
|
72
|
+
minItems: 1,
|
|
73
|
+
}),
|
|
74
|
+
),
|
|
51
75
|
off_limits: Type.Optional(
|
|
52
76
|
Type.Array(Type.String(), {
|
|
53
77
|
description: "Off Limits paths from autoresearch.md.",
|
|
@@ -86,25 +110,43 @@ export function createInitExperimentTool(
|
|
|
86
110
|
const state = runtime.state;
|
|
87
111
|
const isReinitializing = state.results.length > 0;
|
|
88
112
|
const workDir = resolveWorkDir(ctx.cwd);
|
|
89
|
-
const
|
|
113
|
+
const loggedRunNumbers = collectLoggedRunNumbers(state.results);
|
|
114
|
+
|
|
115
|
+
let abandonSummary = "";
|
|
116
|
+
if (params.abandon_unlogged_runs === true) {
|
|
117
|
+
const abandoned = await abandonUnloggedAutoresearchRuns(workDir, loggedRunNumbers);
|
|
118
|
+
if (abandoned > 0) {
|
|
119
|
+
abandonSummary =
|
|
120
|
+
abandoned === 1
|
|
121
|
+
? "Abandoned 1 unlogged run artifact.\n"
|
|
122
|
+
: `Abandoned ${abandoned} unlogged run artifacts.\n`;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
const pendingRun = await readPendingRunSummary(workDir, loggedRunNumbers);
|
|
90
127
|
if (pendingRun) {
|
|
128
|
+
const metricInfo = pendingRun.parsedPrimary !== null ? `, metric=${pendingRun.parsedPrimary}` : "";
|
|
129
|
+
const passedInfo = pendingRun.passed ? "passed" : "failed";
|
|
91
130
|
return {
|
|
92
131
|
content: [
|
|
93
132
|
{
|
|
94
133
|
type: "text",
|
|
95
134
|
text:
|
|
96
|
-
|
|
97
|
-
|
|
135
|
+
abandonSummary +
|
|
136
|
+
`Error: run #${pendingRun.runNumber} has not been logged yet.\n` +
|
|
137
|
+
`Pending: command="${pendingRun.command}"${metricInfo}, ${passedInfo}\n` +
|
|
138
|
+
"Call log_experiment before re-initializing, or pass abandon_unlogged_runs=true.",
|
|
98
139
|
},
|
|
99
140
|
],
|
|
100
141
|
};
|
|
101
142
|
}
|
|
143
|
+
|
|
102
144
|
const contractResult = readAutoresearchContract(workDir);
|
|
103
145
|
const scriptSnapshot = loadAutoresearchScriptSnapshot(workDir);
|
|
104
146
|
const errors = [...contractResult.errors, ...scriptSnapshot.errors];
|
|
105
147
|
if (errors.length > 0) {
|
|
106
148
|
return {
|
|
107
|
-
content: [{ type: "text", text:
|
|
149
|
+
content: [{ type: "text", text: `${abandonSummary}Error: ${errors.join(" ")}` }],
|
|
108
150
|
};
|
|
109
151
|
}
|
|
110
152
|
|
|
@@ -117,118 +159,161 @@ export function createInitExperimentTool(
|
|
|
117
159
|
{
|
|
118
160
|
type: "text",
|
|
119
161
|
text:
|
|
162
|
+
abandonSummary +
|
|
120
163
|
"Error: Benchmark.command in autoresearch.md must invoke `autoresearch.sh` directly. " +
|
|
121
164
|
"Move the real workload into `autoresearch.sh` and re-run init_experiment.",
|
|
122
165
|
},
|
|
123
166
|
],
|
|
124
167
|
};
|
|
125
168
|
}
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
169
|
+
|
|
170
|
+
const fromMd = params.from_autoresearch_md === true;
|
|
171
|
+
if (!fromMd) {
|
|
172
|
+
const metricName = params.metric_name?.trim();
|
|
173
|
+
const benchmarkCommand = params.benchmark_command?.trim();
|
|
174
|
+
const scopePaths = params.scope_paths;
|
|
175
|
+
if (!metricName || !benchmarkCommand || !scopePaths || scopePaths.length === 0) {
|
|
176
|
+
return {
|
|
177
|
+
content: [
|
|
178
|
+
{
|
|
179
|
+
type: "text",
|
|
180
|
+
text:
|
|
181
|
+
abandonSummary +
|
|
182
|
+
"Error: when from_autoresearch_md is false or omitted, metric_name, benchmark_command, and scope_paths are required and must match autoresearch.md. " +
|
|
183
|
+
"Alternatively pass from_autoresearch_md=true with only name (plus optional flags).",
|
|
184
|
+
},
|
|
185
|
+
],
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
if (benchmarkContract.command !== benchmarkCommand) {
|
|
189
|
+
return {
|
|
190
|
+
content: [
|
|
191
|
+
{
|
|
192
|
+
type: "text",
|
|
193
|
+
text:
|
|
194
|
+
abandonSummary +
|
|
195
|
+
"Error: benchmark_command does not match autoresearch.md. " +
|
|
196
|
+
`Expected: ${benchmarkContract.command ?? "(missing)"}\nReceived: ${params.benchmark_command}`,
|
|
197
|
+
},
|
|
198
|
+
],
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
if (benchmarkContract.primaryMetric !== metricName) {
|
|
202
|
+
return {
|
|
203
|
+
content: [
|
|
204
|
+
{
|
|
205
|
+
type: "text",
|
|
206
|
+
text:
|
|
207
|
+
abandonSummary +
|
|
208
|
+
"Error: metric_name does not match autoresearch.md. " +
|
|
209
|
+
`Expected: ${benchmarkContract.primaryMetric ?? "(missing)"}\nReceived: ${params.metric_name}`,
|
|
210
|
+
},
|
|
211
|
+
],
|
|
212
|
+
};
|
|
213
|
+
}
|
|
214
|
+
if ((params.metric_unit ?? "") !== expectedMetricUnit) {
|
|
215
|
+
return {
|
|
216
|
+
content: [
|
|
217
|
+
{
|
|
218
|
+
type: "text",
|
|
219
|
+
text:
|
|
220
|
+
abandonSummary +
|
|
221
|
+
"Error: metric_unit does not match autoresearch.md. " +
|
|
222
|
+
`Expected: ${expectedMetricUnit || "(empty)"}\nReceived: ${params.metric_unit ?? "(empty)"}`,
|
|
223
|
+
},
|
|
224
|
+
],
|
|
225
|
+
};
|
|
226
|
+
}
|
|
227
|
+
if ((params.direction ?? "lower") !== expectedDirection) {
|
|
228
|
+
return {
|
|
229
|
+
content: [
|
|
230
|
+
{
|
|
231
|
+
type: "text",
|
|
232
|
+
text:
|
|
233
|
+
abandonSummary +
|
|
234
|
+
"Error: direction does not match autoresearch.md. " +
|
|
235
|
+
`Expected: ${expectedDirection}\nReceived: ${params.direction ?? "lower"}`,
|
|
236
|
+
},
|
|
237
|
+
],
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
if (!contractPathListsEqual(scopePaths, contractResult.contract.scopePaths)) {
|
|
241
|
+
return {
|
|
242
|
+
content: [
|
|
243
|
+
{
|
|
244
|
+
type: "text",
|
|
245
|
+
text:
|
|
246
|
+
abandonSummary +
|
|
247
|
+
"Error: scope_paths do not match autoresearch.md. " +
|
|
248
|
+
`Expected: ${contractResult.contract.scopePaths.join(", ")}`,
|
|
249
|
+
},
|
|
250
|
+
],
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
if (!contractPathListsEqual(params.off_limits ?? [], contractResult.contract.offLimits)) {
|
|
254
|
+
return {
|
|
255
|
+
content: [
|
|
256
|
+
{
|
|
257
|
+
type: "text",
|
|
258
|
+
text:
|
|
259
|
+
abandonSummary +
|
|
260
|
+
"Error: off_limits do not match autoresearch.md. " +
|
|
261
|
+
`Expected: ${contractResult.contract.offLimits.join(", ") || "(empty)"}`,
|
|
262
|
+
},
|
|
263
|
+
],
|
|
264
|
+
};
|
|
265
|
+
}
|
|
266
|
+
if (!contractListsEqual(params.constraints ?? [], contractResult.contract.constraints)) {
|
|
267
|
+
return {
|
|
268
|
+
content: [
|
|
269
|
+
{
|
|
270
|
+
type: "text",
|
|
271
|
+
text:
|
|
272
|
+
abandonSummary +
|
|
273
|
+
"Error: constraints do not match autoresearch.md. " +
|
|
274
|
+
`Expected: ${contractResult.contract.constraints.join(", ") || "(empty)"}`,
|
|
275
|
+
},
|
|
276
|
+
],
|
|
277
|
+
};
|
|
278
|
+
}
|
|
209
279
|
}
|
|
210
280
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
281
|
+
// Check if contract matches current state — if so, re-init is a no-op
|
|
282
|
+
if (isReinitializing && params.new_segment !== true) {
|
|
283
|
+
const contract = contractResult.contract;
|
|
284
|
+
const bm = contract.benchmark;
|
|
285
|
+
const contractMatches =
|
|
286
|
+
(bm.primaryMetric ?? "metric") === state.metricName &&
|
|
287
|
+
bm.metricUnit === state.metricUnit &&
|
|
288
|
+
(bm.direction ?? "lower") === state.bestDirection &&
|
|
289
|
+
(bm.command ?? null) === state.benchmarkCommand &&
|
|
290
|
+
contractPathListsEqual(contract.scopePaths, state.scopePaths) &&
|
|
291
|
+
contractPathListsEqual(contract.offLimits, state.offLimits) &&
|
|
292
|
+
contractListsEqual(contract.constraints, state.constraints);
|
|
293
|
+
if (contractMatches) {
|
|
294
|
+
runtime.autoresearchMode = true;
|
|
295
|
+
runtime.autoResumeArmed = true;
|
|
296
|
+
options.dashboard.updateWidget(ctx, runtime);
|
|
297
|
+
options.dashboard.requestRender();
|
|
298
|
+
return {
|
|
299
|
+
content: [
|
|
300
|
+
{
|
|
301
|
+
type: "text",
|
|
302
|
+
text:
|
|
303
|
+
abandonSummary +
|
|
304
|
+
`Experiment session already initialized with matching contract. Continuing segment ${state.currentSegment}.`,
|
|
305
|
+
},
|
|
306
|
+
],
|
|
307
|
+
details: { state: cloneExperimentState(state) },
|
|
308
|
+
};
|
|
309
|
+
}
|
|
310
|
+
}
|
|
215
311
|
|
|
312
|
+
applyAutoresearchContractToExperimentState(contractResult.contract, state);
|
|
216
313
|
state.name = params.name;
|
|
217
|
-
state.metricName = params.metric_name;
|
|
218
|
-
state.metricUnit = params.metric_unit ?? "";
|
|
219
|
-
state.bestDirection = params.direction ?? "lower";
|
|
220
314
|
state.maxExperiments = readMaxExperiments(ctx.cwd);
|
|
221
315
|
state.bestMetric = null;
|
|
222
316
|
state.confidence = null;
|
|
223
|
-
state.secondaryMetrics = benchmarkContract.secondaryMetrics.map(name => ({
|
|
224
|
-
name,
|
|
225
|
-
unit: inferMetricUnitFromName(name),
|
|
226
|
-
}));
|
|
227
|
-
state.benchmarkCommand = params.benchmark_command.trim();
|
|
228
|
-
state.scopePaths = [...contractResult.contract.scopePaths];
|
|
229
|
-
state.offLimits = [...contractResult.contract.offLimits];
|
|
230
|
-
state.constraints = [...contractResult.contract.constraints];
|
|
231
|
-
state.segmentFingerprint = segmentFingerprint;
|
|
232
317
|
if (isReinitializing) {
|
|
233
318
|
state.currentSegment += 1;
|
|
234
319
|
}
|
|
@@ -245,7 +330,6 @@ export function createInitExperimentTool(
|
|
|
245
330
|
scopePaths: state.scopePaths,
|
|
246
331
|
offLimits: state.offLimits,
|
|
247
332
|
constraints: state.constraints,
|
|
248
|
-
segmentFingerprint,
|
|
249
333
|
});
|
|
250
334
|
|
|
251
335
|
if (isReinitializing) {
|
|
@@ -267,6 +351,7 @@ export function createInitExperimentTool(
|
|
|
267
351
|
options.dashboard.requestRender();
|
|
268
352
|
|
|
269
353
|
const lines = [
|
|
354
|
+
abandonSummary.trimEnd(),
|
|
270
355
|
`Experiment initialized: ${state.name}`,
|
|
271
356
|
`Metric: ${state.metricName} (${state.metricUnit || "unitless"}, ${state.bestDirection} is better)`,
|
|
272
357
|
`Benchmark command: ${state.benchmarkCommand}`,
|
|
@@ -275,7 +360,7 @@ export function createInitExperimentTool(
|
|
|
275
360
|
isReinitializing
|
|
276
361
|
? "Previous results remain in history. This starts a new segment and requires a fresh baseline."
|
|
277
362
|
: "Now run the baseline experiment and log it.",
|
|
278
|
-
];
|
|
363
|
+
].filter(line => line.length > 0);
|
|
279
364
|
if (state.maxExperiments !== null) {
|
|
280
365
|
lines.push(`Max iterations: ${state.maxExperiments}`);
|
|
281
366
|
}
|