typeclaw 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/require-parallel.ts +41 -15
- package/src/agent/live-subagents.ts +0 -1
- package/src/agent/session-origin.ts +10 -0
- package/src/agent/subagent-completion-reminder.ts +4 -1
- package/src/agent/subagents.ts +72 -13
- package/src/agent/system-prompt.ts +5 -5
- package/src/agent/tools/channel-reply.ts +47 -7
- package/src/agent/tools/channel-send.ts +43 -11
- package/src/agent/tools/restart.ts +13 -2
- package/src/agent/tools/runtime-notice.ts +41 -0
- package/src/agent/tools/spawn-subagent.ts +0 -1
- package/src/agent/tools/subagent-output.ts +3 -51
- package/src/bundled-plugins/memory/README.md +11 -11
- package/src/bundled-plugins/memory/dreaming-state.ts +51 -2
- package/src/bundled-plugins/memory/index.ts +77 -26
- package/src/bundled-plugins/memory/memory-retrieval.ts +7 -1
- package/src/bundled-plugins/memory/migration.ts +91 -16
- package/src/bundled-plugins/memory/stream-io.ts +71 -1
- package/src/channels/adapters/kakaotalk-classify.ts +4 -1
- package/src/channels/adapters/kakaotalk.ts +1 -1
- package/src/channels/manager.ts +7 -0
- package/src/channels/router.ts +260 -15
- package/src/channels/schema.ts +1 -1
- package/src/cli/compose.ts +23 -2
- package/src/cli/logs.ts +17 -2
- package/src/compose/logs.ts +8 -4
- package/src/config/config.ts +8 -0
- package/src/container/index.ts +1 -1
- package/src/container/logs.ts +38 -11
- package/src/init/dockerfile.ts +147 -4
- package/src/inspect/live.ts +32 -1
- package/src/inspect/render.ts +32 -0
- package/src/inspect/replay.ts +44 -0
- package/src/inspect/types.ts +26 -0
- package/src/run/index.ts +28 -11
- package/src/server/index.ts +59 -19
- package/src/shared/protocol.ts +30 -0
- package/src/skills/typeclaw-codex-cli/SKILL.md +324 -0
- package/src/skills/typeclaw-codex-cli/references/auth-flow.md +131 -0
- package/src/skills/typeclaw-codex-cli/references/stop-hook.md +92 -0
- package/src/skills/typeclaw-codex-cli/references/tmux-driving.md +239 -0
- package/src/skills/typeclaw-config/SKILL.md +32 -31
- package/src/test-helpers/wait-for.ts +15 -7
- package/typeclaw.schema.json +24 -11
package/package.json
CHANGED
|
@@ -1,29 +1,55 @@
|
|
|
1
|
-
// Preloaded by bunfig.toml `[test] preload`.
|
|
2
|
-
//
|
|
3
|
-
//
|
|
4
|
-
//
|
|
5
|
-
//
|
|
6
|
-
//
|
|
1
|
+
// Preloaded by bunfig.toml `[test] preload`. Two responsibilities:
|
|
2
|
+
// 1. Deny `bun test` without --parallel.
|
|
3
|
+
// 2. Raise the per-test default timeout from Bun's 5000ms.
|
|
4
|
+
//
|
|
5
|
+
// Why deny serial runs: Serial runs are ~3.4x slower (44s → 13s, see commit
|
|
6
|
+
// 1c66d5e), and Bun has no bunfig knob for the flag yet (verified against
|
|
7
|
+
// bunfig.zig in oven-sh/bun main, May 2026). Without this guard, IDE test
|
|
8
|
+
// runners and ad-hoc shells silently fall back to the slow path.
|
|
7
9
|
//
|
|
8
10
|
// Detection: Bun strips CLI flags from `Bun.argv` before invoking the
|
|
9
11
|
// preload, so we can't scrape the flag directly. Instead we look for
|
|
10
12
|
// BUN_TEST_WORKER_ID, which Bun sets in the preload env exactly when
|
|
11
|
-
// `--parallel` is active (the variable carries the worker index for
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
//
|
|
13
|
+
// `--parallel` is active (the variable carries the worker index for the
|
|
14
|
+
// IPC handshake between coordinator and workers). Empirically verified
|
|
15
|
+
// against bun 1.3.14: present under --parallel, absent under serial. If
|
|
16
|
+
// a future Bun version renames this var, the guard fails closed (treats
|
|
17
|
+
// every run as serial → always denies), which is the safe direction.
|
|
18
|
+
//
|
|
19
|
+
// Bypass with TYPECLAW_ALLOW_SERIAL_TESTS=1 when debugging a flaky test
|
|
20
|
+
// where worker contention obscures the failure.
|
|
17
21
|
//
|
|
18
|
-
//
|
|
19
|
-
//
|
|
22
|
+
// Why raise the default timeout: A growing number of tests in this repo
|
|
23
|
+
// either spawn child processes (`bun run typeclaw …` via Bun.spawn from
|
|
24
|
+
// src/cli/index.test.ts, src/cli/role.test.ts, src/cli/status.test.ts,
|
|
25
|
+
// src/init/dockerfile.test.ts agent-browser wrapper, etc.) or boot the
|
|
26
|
+
// in-process agent (`startAgent({ port: 0, … })` from src/run/plugin.test.ts).
|
|
27
|
+
// Both shapes have a happy-path cost well under 1s but a worst-case cost
|
|
28
|
+
// that races Bun's 5000ms ceiling under `--parallel` contention. The
|
|
29
|
+
// repeating failure mode is "this test timed out after 5000ms" appearing
|
|
30
|
+
// on different tests across runs at a rough ~3-15% rate per full-suite
|
|
31
|
+
// invocation — not a real bug, just resource starvation. Raising the
|
|
32
|
+
// default to 30s eliminates the false positives without masking real
|
|
33
|
+
// hangs (a wedged test still fails, just 6x slower than before). The
|
|
34
|
+
// happy path is unaffected because tests complete in their actual
|
|
35
|
+
// runtime, not the timeout budget.
|
|
36
|
+
//
|
|
37
|
+
// 30s was chosen as ~75x the observed happy-path cold-start (~400ms) for
|
|
38
|
+
// the heaviest subprocess tests, matching the in-house convention used in
|
|
39
|
+
// pi-coding-agent's subprocess fixtures and Bun's own integration-test
|
|
40
|
+
// suites (see oven-sh/bun test/cli/install/*.test.ts which set 5-minute
|
|
41
|
+
// timeouts for full installs). Individual tests that genuinely need more
|
|
42
|
+
// can still pass an explicit 3rd arg to `test()` to override locally.
|
|
43
|
+
|
|
44
|
+
import { setDefaultTimeout } from 'bun:test'
|
|
20
45
|
|
|
21
46
|
const isParallelWorker = typeof process.env.BUN_TEST_WORKER_ID === 'string'
|
|
22
47
|
|
|
23
48
|
if (isParallelWorker) {
|
|
24
|
-
|
|
49
|
+
setDefaultTimeout(30_000)
|
|
25
50
|
} else if (process.env.TYPECLAW_ALLOW_SERIAL_TESTS === '1') {
|
|
26
51
|
console.warn('[require-parallel] Running serially — TYPECLAW_ALLOW_SERIAL_TESTS=1 set.')
|
|
52
|
+
setDefaultTimeout(30_000)
|
|
27
53
|
} else {
|
|
28
54
|
console.error('')
|
|
29
55
|
console.error(' ✗ `bun test` without --parallel is denied in this repo.')
|
|
@@ -231,6 +231,16 @@ function renderChannelOrigin(
|
|
|
231
231
|
'the answer — both in the same turn. The ack is not your reply; the answer',
|
|
232
232
|
'is. Once the answer lands, end your turn.',
|
|
233
233
|
'',
|
|
234
|
+
'**Backgrounded work does not end the obligation.** If you spawn a',
|
|
235
|
+
'subagent with `run_in_background: true` to answer the current inbound,',
|
|
236
|
+
"you have promised a reply you have not delivered yet. Don't end the",
|
|
237
|
+
'turn with `NO_REPLY` — the system will not surface the subagent result',
|
|
238
|
+
'on its own. When the subagent-completion `<system-reminder>` arrives,',
|
|
239
|
+
'fetch the result with `subagent_output` and send it via `channel_reply`',
|
|
240
|
+
'in that turn. `NO_REPLY` is only legal on the post-result turn if there',
|
|
241
|
+
'is genuinely nothing user-facing to share (e.g. the result is empty or',
|
|
242
|
+
'identical to something you already replied with this conversation).',
|
|
243
|
+
'',
|
|
234
244
|
'Do not send a second reply just to rephrase, restate, or "confirm in',
|
|
235
245
|
'plain language" something you already said.',
|
|
236
246
|
'',
|
|
@@ -21,7 +21,10 @@ export type CompletionReminderArgs = {
|
|
|
21
21
|
const CHANNEL_REPLY_NUDGE =
|
|
22
22
|
'This reminder is a system message, not a user inbound — but you are in a channel session, ' +
|
|
23
23
|
'so end your turn via `channel_reply` (or `channel_send`) to surface the result. ' +
|
|
24
|
-
'Plain-text output is invisible here. If
|
|
24
|
+
'Plain-text output is invisible here. If you spawned this subagent to answer a user, ' +
|
|
25
|
+
'this is the turn where that promised reply lands — fetch the result via `subagent_output` ' +
|
|
26
|
+
'and send it. `NO_REPLY` is only correct when the result is genuinely empty or duplicates ' +
|
|
27
|
+
'something you already replied with in this conversation.'
|
|
25
28
|
|
|
26
29
|
export function renderSubagentCompletionReminder(args: CompletionReminderArgs): string {
|
|
27
30
|
const durationStr = formatReminderDuration(args.durationMs)
|
package/src/agent/subagents.ts
CHANGED
|
@@ -48,6 +48,20 @@ export type SubagentShared<P = unknown> = {
|
|
|
48
48
|
toolResultBudget?: ToolResultBudget
|
|
49
49
|
visibility?: 'public' | 'internal'
|
|
50
50
|
requiresSpecificPermission?: boolean
|
|
51
|
+
// Wall-clock ceiling on a single spawn, enforced at the orchestration
|
|
52
|
+
// layer (both `dispatchSpawnSubagent` and the stream-driven
|
|
53
|
+
// `SubagentConsumer`). When exceeded, the orchestrator's `await` settles
|
|
54
|
+
// with a timeout error and releases the coalescing key for `inFlightKey`,
|
|
55
|
+
// so the next spawn of the same (name, inFlightKey) can proceed instead
|
|
56
|
+
// of being skip-coalesced. The underlying `invokeSubagent` call may keep
|
|
57
|
+
// running — pi-coding-agent's `session.prompt` does not accept an
|
|
58
|
+
// AbortSignal today, so a half-open LLM stream stays alive until the OS
|
|
59
|
+
// reaps it. The trade-off is honest: cancellation is upstream's job;
|
|
60
|
+
// releasing the coalescing key is ours, and that is what unblocks the
|
|
61
|
+
// user-visible "every subsequent turn skipped while the first spawn
|
|
62
|
+
// hangs" symptom. Omit for no ceiling (legacy behavior; the spawn waits
|
|
63
|
+
// as long as the provider takes).
|
|
64
|
+
timeoutMs?: number
|
|
51
65
|
}
|
|
52
66
|
|
|
53
67
|
export type Subagent<P = unknown> = SubagentShared<P> & {
|
|
@@ -248,6 +262,42 @@ export async function invokeSubagent(name: string, options: InvokeSubagentOption
|
|
|
248
262
|
}
|
|
249
263
|
}
|
|
250
264
|
|
|
265
|
+
// Error raised when a subagent spawn outlives its wall-clock ceiling.
// Carries the subagent name, the coalescing key that was in flight, and the
// ceiling (in milliseconds) so handlers can log or branch on them without
// parsing the message.
export class SubagentTimeoutError extends Error {
  readonly subagentName: string
  readonly coalesceKey: string
  readonly timeoutMs: number
  override readonly name: 'SubagentTimeoutError'

  constructor(subagentName: string, coalesceKey: string, timeoutMs: number) {
    super(`subagent ${subagentName} (key=${coalesceKey}) spawn timed out after ${timeoutMs}ms`)
    this.subagentName = subagentName
    this.coalesceKey = coalesceKey
    this.timeoutMs = timeoutMs
    // Assigned last so own-property order matches the parameter-property form.
    this.name = 'SubagentTimeoutError'
  }
}
|
|
275
|
+
|
|
276
|
+
// Type guard: true exactly when `err` is a `SubagentTimeoutError` instance,
// letting catch blocks read `err.timeoutMs` and friends without a cast.
export function isSubagentTimeoutError(err: unknown): err is SubagentTimeoutError {
  const isTimeout = err instanceof SubagentTimeoutError
  return isTimeout
}
|
|
279
|
+
|
|
280
|
+
// Awaits `work`, rejecting with `SubagentTimeoutError` if it has not settled
// within the ceiling. The timer is cleared as soon as the race settles, so a
// finished spawn never leaves a pending timeout behind. `work` itself is not
// cancelled on timeout; it simply stops being awaited here.
//
// Ceiling semantics for `timeoutMs`:
//   - `undefined` or `Infinity`  → no ceiling; equivalent to `await work`.
//   - finite values above 2^31-1 → clamped to 2^31-1 ms. Node's `setTimeout`
//     coerces larger delays to 1ms, which would otherwise turn "very long
//     ceiling" into "time out almost immediately".
//   - everything else is handed to `setTimeout` unchanged.
//
// The error's `timeoutMs` reports the delay that was actually armed.
export async function awaitWithSubagentTimeout(
  work: Promise<void>,
  subagentName: string,
  coalesceKey: string,
  timeoutMs: number | undefined,
): Promise<void> {
  if (timeoutMs === undefined || timeoutMs === Number.POSITIVE_INFINITY) {
    await work
    return
  }
  const maxTimerDelayMs = 2 ** 31 - 1
  const delayMs = Math.min(timeoutMs, maxTimerDelayMs)
  // The Promise executor runs synchronously, so `timer` is set before the race starts.
  let timer: ReturnType<typeof setTimeout> | undefined
  const timeout = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new SubagentTimeoutError(subagentName, coalesceKey, delayMs)), delayMs)
  })
  try {
    await Promise.race([work, timeout])
  } finally {
    clearTimeout(timer)
  }
}
|
|
300
|
+
|
|
251
301
|
export type SubagentHandle = {
|
|
252
302
|
taskId: string
|
|
253
303
|
sessionId: string | undefined
|
|
@@ -447,20 +497,29 @@ export function createSubagentConsumer({
|
|
|
447
497
|
inFlight.add(key)
|
|
448
498
|
try {
|
|
449
499
|
const spawnedByOrigin = parseSpawnedByOriginJson(target.spawnedByOriginJson, logger, name)
|
|
450
|
-
await
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
500
|
+
await awaitWithSubagentTimeout(
|
|
501
|
+
invokeSubagent(name, {
|
|
502
|
+
registry,
|
|
503
|
+
...(createSessionForSubagent !== undefined ? { createSessionForSubagent } : {}),
|
|
504
|
+
agentDir,
|
|
505
|
+
userPrompt: '',
|
|
506
|
+
payload: msg.payload,
|
|
507
|
+
onProviderError: (message) => logger.error(`[subagent] ${key}: LLM call failed: ${message}`),
|
|
508
|
+
...(target.parentSessionId !== undefined ? { parentSessionId: target.parentSessionId } : {}),
|
|
509
|
+
...(target.spawnedByRole !== undefined ? { spawnedByRole: target.spawnedByRole } : {}),
|
|
510
|
+
...(spawnedByOrigin !== undefined ? { spawnedByOrigin } : {}),
|
|
511
|
+
}),
|
|
512
|
+
name,
|
|
513
|
+
key,
|
|
514
|
+
registry[name]?.timeoutMs,
|
|
515
|
+
)
|
|
461
516
|
} catch (err) {
|
|
462
|
-
|
|
463
|
-
|
|
517
|
+
if (isSubagentTimeoutError(err)) {
|
|
518
|
+
logger.warn(`[subagent] ${key} timed out after ${err.timeoutMs}ms; releasing coalesce key`)
|
|
519
|
+
} else {
|
|
520
|
+
const message = err instanceof Error ? err.message : String(err)
|
|
521
|
+
logger.error(`[subagent] ${key} failed: ${message}`)
|
|
522
|
+
}
|
|
464
523
|
} finally {
|
|
465
524
|
inFlight.delete(key)
|
|
466
525
|
}
|
|
@@ -60,7 +60,7 @@ There are two delegation modes. Pick deliberately.
|
|
|
60
60
|
|
|
61
61
|
**Mode A — Research fan-out** (in service of the current question)
|
|
62
62
|
|
|
63
|
-
When you need information to answer the user and the search is broad, fire 2-5 subagents in parallel with \`run_in_background: true\` covering different angles. End your response after spawning. The system will deliver a \`<system-reminder>\` for each completion;
|
|
63
|
+
When you need information to answer the user and the search is broad, fire 2-5 subagents in parallel with \`run_in_background: true\` covering different angles. End your response after spawning. The system will deliver a \`<system-reminder>\` for each completion; then call \`subagent_output\` once per task_id to fetch the result and answer the user. \`subagent_output\` always returns immediately with a snapshot — it does not block.
|
|
64
64
|
|
|
65
65
|
The bundled \`explorer\` subagent is the right tool for **local** reconnaissance — anything reachable on the agent's filesystem: code, past sessions (\`sessions/*.jsonl\`), memory topic shards and daily memory streams, skills, cron jobs, config, git history, mounts, channels state. It is read-only and runs on a fast/cheap model, so fire liberally. Do NOT ask it to plan, decide, or write code — it finds and reports.
|
|
66
66
|
|
|
@@ -72,13 +72,13 @@ When the user hands you a task that will take minutes (a multi-step browser sess
|
|
|
72
72
|
|
|
73
73
|
In a channel session, the completion \`<system-reminder>\` is NOT a user message — the channel origin's "you MUST call \`channel_reply\` for every user message" rule does not literally apply, but the underlying constraint does: plain-text output is invisible in a channel. Surface the result via \`channel_reply\` (or \`channel_send\`) so the user actually sees it. Failures need surfacing too: when a delegated task didn't complete, the user needs the outcome and whatever partial progress you got. \`NO_REPLY\` is the escape hatch only when the user has already seen the substantive answer — typically because you posted it via \`channel_reply\` in the same turn that spawned the subagent, and the reminder is purely confirming completion of a step the user is already tracking. Otherwise, post the result.
|
|
74
74
|
|
|
75
|
-
Before you run a tool chain that returns bulky intermediate output you won't need again — multiple \`webfetch\` calls, a \`websearch\` round you'll iterate on, a \`bash\` command that scrapes a site or dumps a large response, an \`agent-browser\` session, a \`claude\` (Claude Code) delegation driven through tmux, any "fetch N things and synthesize" loop — delegate it to a subagent. \`scout\` (for research) or \`operator\` (for actions with side effects) runs the noisy work in its own context window and returns a distilled summary; your session carries the *answer*, not the raw material you derived it from. This is about context economy, not latency: even a fast operation belongs in a subagent when the byproducts are large and disposable (three quick news searches across different outlets still dumps three SERPs and three article bodies into your context forever). The exception is exactly one call whose result you'll cite directly — one \`webfetch\` of a known URL, one \`websearch\` query whose top result is the answer. Two of either, or any "across multiple sources" framing, is delegation territory.
|
|
75
|
+
Before you run a tool chain that returns bulky intermediate output you won't need again — multiple \`webfetch\` calls, a \`websearch\` round you'll iterate on, a \`bash\` command that scrapes a site or dumps a large response, an \`agent-browser\` session, a \`claude\` (Claude Code) or \`codex\` (OpenAI Codex CLI) delegation driven through tmux, any "fetch N things and synthesize" loop — delegate it to a subagent. \`scout\` (for research) or \`operator\` (for actions with side effects) runs the noisy work in its own context window and returns a distilled summary; your session carries the *answer*, not the raw material you derived it from. This is about context economy, not latency: even a fast operation belongs in a subagent when the byproducts are large and disposable (three quick news searches across different outlets still dumps three SERPs and three article bodies into your context forever). The exception is exactly one call whose result you'll cite directly — one \`webfetch\` of a known URL, one \`websearch\` query whose top result is the answer. Two of either, or any "across multiple sources" framing, is delegation territory.
|
|
76
76
|
|
|
77
|
-
The bundled \`operator\` subagent is the right tool for this mode. It is write-capable (read, write, edit, bash with side effects) and runs on the default model. Use it for: browser sessions, multi-file refactors, deploys, batch API calls, Claude Code delegations (the tmux driving loop, the multi-turn polling, the worktree teardown — all of it inside operator), anything that involves taking action on behalf of the user over multiple steps. The operator returns a structured final report (outcome, what changed, what was observed); surface it naturally rather than copy-pasting. Operator is gated by a separate permission (\`subagent.spawn.operator\`) so write-capable spawns are restricted to owner-tier and trusted-tier callers — if the gate denies, fall back to doing the work in your own session rather than reporting failure to the user.
|
|
77
|
+
The bundled \`operator\` subagent is the right tool for this mode. It is write-capable (read, write, edit, bash with side effects) and runs on the default model. Use it for: browser sessions, multi-file refactors, deploys, batch API calls, Claude Code or Codex CLI delegations (the tmux driving loop, the multi-turn polling, the worktree teardown — all of it inside operator), anything that involves taking action on behalf of the user over multiple steps. The operator returns a structured final report (outcome, what changed, what was observed); surface it naturally rather than copy-pasting. Operator is gated by a separate permission (\`subagent.spawn.operator\`) so write-capable spawns are restricted to owner-tier and trusted-tier callers — if the gate denies, fall back to doing the work in your own session rather than reporting failure to the user.
|
|
78
78
|
|
|
79
79
|
**Status queries**
|
|
80
80
|
|
|
81
|
-
If the user asks "how's it going?" or "status?" on a running subagent, call \`subagent_output({ task_id
|
|
81
|
+
If the user asks "how's it going?" or "status?" on a running subagent, call \`subagent_output({ task_id })\` and report the \`status_summary\` in your own words. Don't pretend to know the status without checking.
|
|
82
82
|
|
|
83
83
|
**Prompt structure for spawns** (mandatory — the subagent does not see this conversation)
|
|
84
84
|
|
|
@@ -92,7 +92,7 @@ If the user asks "how's it going?" or "status?" on a running subagent, call \`su
|
|
|
92
92
|
|
|
93
93
|
- Don't fire more than 5 subagents in a single turn.
|
|
94
94
|
- Don't spawn for a known answer or single-file lookup — do it yourself.
|
|
95
|
-
- Don't
|
|
95
|
+
- Don't call \`subagent_output\` in a loop waiting for completion; end your response and the reminder will wake you, then fetch the result once.
|
|
96
96
|
- Don't ask a research subagent to make architectural decisions for you — they find and report; you decide.
|
|
97
97
|
- Subagents cannot recursively spawn other subagents.
|
|
98
98
|
|
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
import { Type } from '@mariozechner/pi-ai'
|
|
2
2
|
import { defineTool } from '@mariozechner/pi-coding-agent'
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
containsKimiToolDelimiter,
|
|
6
|
+
isNoReplySignal,
|
|
7
|
+
isUpstreamEmptyResponseSentinel,
|
|
8
|
+
type ChannelRouter,
|
|
9
|
+
} from '@/channels/router'
|
|
5
10
|
import type { AdapterId } from '@/channels/schema'
|
|
6
11
|
|
|
7
12
|
import { type ChannelToolLogger, consoleChannelLogger, formatChannelToolFailure } from './channel-log'
|
|
13
|
+
import { fenceRuntimeNotice } from './runtime-notice'
|
|
8
14
|
|
|
9
15
|
export type ChannelReplyOrigin = {
|
|
10
16
|
adapter: AdapterId
|
|
@@ -98,6 +104,15 @@ export function createChannelReplyTool({
|
|
|
98
104
|
}
|
|
99
105
|
}
|
|
100
106
|
|
|
107
|
+
const kimiLeakError = kimiToolCallLeakError(text)
|
|
108
|
+
if (kimiLeakError) {
|
|
109
|
+
logger.warn(formatChannelToolFailure('channel_reply', kimiLeakError))
|
|
110
|
+
return {
|
|
111
|
+
content: [{ type: 'text' as const, text: `channel_reply denied: ${kimiLeakError}` }],
|
|
112
|
+
details: { ok: false, error: kimiLeakError },
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
101
116
|
const result = await router.send({
|
|
102
117
|
adapter: origin.adapter,
|
|
103
118
|
workspace: origin.workspace,
|
|
@@ -148,14 +163,24 @@ export function createChannelReplyTool({
|
|
|
148
163
|
}),
|
|
149
164
|
)
|
|
150
165
|
: ''
|
|
166
|
+
const body = hint ? `${baseText}${hint}` : baseText
|
|
151
167
|
return {
|
|
152
|
-
content: [{ type: 'text' as const, text:
|
|
168
|
+
content: [{ type: 'text' as const, text: `${TOOL_RESULT_PREFIX}${body}` }],
|
|
153
169
|
details,
|
|
154
170
|
}
|
|
155
171
|
},
|
|
156
172
|
})
|
|
157
173
|
}
|
|
158
174
|
|
|
175
|
+
// Tool results reach the model as USER-role messages (OpenAI / Anthropic
|
|
176
|
+
// tool-API contract — the engine cannot tag them as system). Without this
|
|
177
|
+
// marker a persona-rich model reads its own echo as a fresh user inbound
|
|
178
|
+
// and replies to itself. Observed in production: Kimi K2 on KakaoTalk
|
|
179
|
+
// re-invoked after a successful send saw only the echo as new context
|
|
180
|
+
// and hallucinated a goodbye trigger from it. Mirrored verbatim in
|
|
181
|
+
// channel-send.ts so both tools share one greppable marker.
|
|
182
|
+
export const TOOL_RESULT_PREFIX = '[system: tool result, not a user message] '
|
|
183
|
+
|
|
159
184
|
export const ECHO_MAX_CHARS = 500
|
|
160
185
|
|
|
161
186
|
export function renderEcho(text: string): string {
|
|
@@ -211,12 +236,27 @@ function upstreamEmptyResponseSentinelError(text: string | undefined): string {
|
|
|
211
236
|
)
|
|
212
237
|
}
|
|
213
238
|
|
|
239
|
+
// Returns a denial reason when `text` contains raw Kimi tool-call delimiters
// (as judged by `containsKimiToolDelimiter`), or '' when `text` is undefined
// or clean. A non-empty result means the reply must not be forwarded.
function kimiToolCallLeakError(text: string | undefined): string {
  const leaked = text !== undefined && containsKimiToolDelimiter(text)
  if (!leaked) return ''
  return [
    'refusing to forward raw provider tool-call control tokens; these are chat-template ',
    'delimiters that should have been parsed into a real tool call upstream. ',
    'Re-issue the intended channel reply as plain user-visible text only.',
  ].join('')
}
|
|
248
|
+
|
|
214
249
|
// Mirror of the same hint used by channel_send. Kept identical so the model
|
|
215
|
-
// sees the same yield signal regardless of which tool it picked.
|
|
250
|
+
// sees the same yield signal regardless of which tool it picked. The body
|
|
251
|
+
// is wrapped via `fenceRuntimeNotice` (in `./runtime-notice`) so persona-rich
|
|
252
|
+
// models cannot read the trailing prose as a chat instruction and reply to
|
|
253
|
+
// it in-character. See that helper's comment for the failure mode that
|
|
254
|
+
// motivated the framing.
|
|
216
255
|
// Builds the fenced "you keep talking" hint for the Nth consecutive bot
// message. Returns '' for counts <= 1 so the common single-reply case keeps
// its original tool-result text; count 2 gets a soft nudge, higher counts a
// firm one. The count is rendered as a proper English ordinal ("3rd", "11th",
// "21st") rather than always appending "th".
function consecutiveSendHint(countAfterSend: number): string {
  if (countAfterSend <= 1) return ''
  if (countAfterSend === 2) {
    return fenceRuntimeNotice(
      'this is your 2nd consecutive message in this conversation; continue only if the reply genuinely needs splitting.',
    )
  }
  // English ordinals: 11-13 always take "th"; otherwise the last digit decides.
  const lastTwoDigits = countAfterSend % 100
  const suffix =
    lastTwoDigits >= 11 && lastTwoDigits <= 13 ? 'th' : (['th', 'st', 'nd', 'rd'][countAfterSend % 10] ?? 'th')
  return fenceRuntimeNotice(
    `${countAfterSend}${suffix} consecutive message with no user reply; end your turn now unless the user explicitly asked for a multi-step response.`,
  )
}
|
|
@@ -1,11 +1,17 @@
|
|
|
1
1
|
import { Type } from '@mariozechner/pi-ai'
|
|
2
2
|
import { defineTool } from '@mariozechner/pi-coding-agent'
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
containsKimiToolDelimiter,
|
|
6
|
+
isNoReplySignal,
|
|
7
|
+
isUpstreamEmptyResponseSentinel,
|
|
8
|
+
type ChannelRouter,
|
|
9
|
+
} from '@/channels/router'
|
|
5
10
|
import { ADAPTER_IDS, type AdapterId } from '@/channels/schema'
|
|
6
11
|
|
|
7
12
|
import { type ChannelToolLogger, consoleChannelLogger, formatChannelToolFailure } from './channel-log'
|
|
8
|
-
import { renderOutboundEcho } from './channel-reply'
|
|
13
|
+
import { renderOutboundEcho, TOOL_RESULT_PREFIX } from './channel-reply'
|
|
14
|
+
import { fenceRuntimeNotice } from './runtime-notice'
|
|
9
15
|
|
|
10
16
|
export type ChannelSendOrigin = {
|
|
11
17
|
adapter: AdapterId
|
|
@@ -121,6 +127,15 @@ export function createChannelSendTool({ router, origin, logger = consoleChannelL
|
|
|
121
127
|
}
|
|
122
128
|
}
|
|
123
129
|
|
|
130
|
+
const kimiLeakError = kimiToolCallLeakError(bodyText)
|
|
131
|
+
if (kimiLeakError) {
|
|
132
|
+
logger.warn(formatChannelToolFailure('channel_send', kimiLeakError))
|
|
133
|
+
return {
|
|
134
|
+
content: [{ type: 'text' as const, text: `channel_send denied: ${kimiLeakError}` }],
|
|
135
|
+
details: { ok: false, error: kimiLeakError },
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
|
|
124
139
|
const result = await router.send({
|
|
125
140
|
adapter,
|
|
126
141
|
workspace: params.workspace,
|
|
@@ -163,9 +178,9 @@ export function createChannelSendTool({ router, origin, logger = consoleChannelL
|
|
|
163
178
|
})
|
|
164
179
|
if (threadMismatch) hints.push(threadMismatch)
|
|
165
180
|
}
|
|
166
|
-
const
|
|
181
|
+
const body = hints.length > 0 ? `${baseText}${hints.join('')}` : baseText
|
|
167
182
|
return {
|
|
168
|
-
content: [{ type: 'text' as const, text:
|
|
183
|
+
content: [{ type: 'text' as const, text: `${TOOL_RESULT_PREFIX}${body}` }],
|
|
169
184
|
details,
|
|
170
185
|
}
|
|
171
186
|
},
|
|
@@ -181,6 +196,11 @@ export function createChannelSendTool({ router, origin, logger = consoleChannelL
|
|
|
181
196
|
//
|
|
182
197
|
// Only fires when the origin had a thread to begin with — channel-root
|
|
183
198
|
// sessions can't have a "missing thread" problem.
|
|
199
|
+
//
|
|
200
|
+
// Body is fenced via `fenceRuntimeNotice` for the same reason the
|
|
201
|
+
// consecutive-send hint is — see that helper's comment for the failure
|
|
202
|
+
// mode (Kimi-K2.x reading trailing tool-result prose as a chat instruction
|
|
203
|
+
// and replying to it in-character).
|
|
184
204
|
function threadMismatchHint(
|
|
185
205
|
origin: ChannelSendOrigin | undefined,
|
|
186
206
|
sent: { adapter: AdapterId; workspace: string; chat: string; thread: string | undefined },
|
|
@@ -191,10 +211,10 @@ function threadMismatchHint(
|
|
|
191
211
|
if (origin.adapter !== sent.adapter) return ''
|
|
192
212
|
if (origin.workspace !== sent.workspace) return ''
|
|
193
213
|
if (origin.chat !== sent.chat) return ''
|
|
194
|
-
return (
|
|
214
|
+
return fenceRuntimeNotice(
|
|
195
215
|
`note: this session's origin thread is ${JSON.stringify(origin.thread)} but you posted to channel root. ` +
|
|
196
|
-
|
|
197
|
-
|
|
216
|
+
`if breaking out of the thread was intentional, ignore this; otherwise prefer \`channel_reply\` ` +
|
|
217
|
+
`or pass \`thread: ${JSON.stringify(origin.thread)}\` on your next channel_send.`,
|
|
198
218
|
)
|
|
199
219
|
}
|
|
200
220
|
|
|
@@ -233,16 +253,28 @@ function upstreamEmptyResponseSentinelError(text: string | undefined): string {
|
|
|
233
253
|
)
|
|
234
254
|
}
|
|
235
255
|
|
|
256
|
+
// Returns a denial reason when `text` contains raw Kimi tool-call delimiters
// (as judged by `containsKimiToolDelimiter`), or '' when `text` is undefined
// or clean. A non-empty result means the send must not be forwarded.
function kimiToolCallLeakError(text: string | undefined): string {
  const leaked = text !== undefined && containsKimiToolDelimiter(text)
  if (!leaked) return ''
  return [
    'refusing to forward raw provider tool-call control tokens; these are chat-template ',
    'delimiters that should have been parsed into a real tool call upstream. ',
    'Re-issue the intended channel send as plain user-visible text only.',
  ].join('')
}
|
|
265
|
+
|
|
236
266
|
// Returns a behavioral hint to nudge the model toward yielding when it has
|
|
237
267
|
// been the only voice in the conversation for several messages. The router
|
|
238
268
|
// increments its counter AFTER router.send returns, so a count of 2 means
|
|
239
269
|
// "this is the second consecutive bot message in this chat:thread" — which
|
|
240
270
|
// is the first count where a hint is warranted. Empty string at count <= 1
|
|
241
271
|
// preserves the original tool-result text for the common single-reply case.
|
|
272
|
+
// Mirror of channel-reply.ts; body wrapped via `fenceRuntimeNotice`.
|
|
242
273
|
// Builds the fenced "you keep talking" hint for the Nth consecutive bot
// message. Returns '' for counts <= 1 so the common single-send case keeps
// its original tool-result text; count 2 gets a soft nudge, higher counts a
// firm one. The count is rendered as a proper English ordinal ("3rd", "11th",
// "21st") rather than always appending "th".
function consecutiveSendHint(countAfterSend: number): string {
  if (countAfterSend <= 1) return ''
  if (countAfterSend === 2) {
    return fenceRuntimeNotice(
      'this is your 2nd consecutive message in this conversation; continue only if the reply genuinely needs splitting.',
    )
  }
  // English ordinals: 11-13 always take "th"; otherwise the last digit decides.
  const lastTwoDigits = countAfterSend % 100
  const suffix =
    lastTwoDigits >= 11 && lastTwoDigits <= 13 ? 'th' : (['th', 'st', 'nd', 'rd'][countAfterSend % 10] ?? 'th')
  return fenceRuntimeNotice(
    `${countAfterSend}${suffix} consecutive message with no user reply; end your turn now unless the user explicitly asked for a multi-step response.`,
  )
}
|
|
@@ -27,6 +27,15 @@ export type CreateRestartToolOptions = {
|
|
|
27
27
|
// fixes. Required even when stream is absent so the type stays simple and
|
|
28
28
|
// the field's presence documents the runtime contract.
|
|
29
29
|
originatingSessionId: string
|
|
30
|
+
// Override the default 5s ACK budget. Production has no caller for this —
|
|
31
|
+
// 5s is generous against a real hostd on the same host. Test-only seam:
|
|
32
|
+
// restart.test.ts spawns a `Bun.serve` and awaits its HTTP roundtrip from
|
|
33
|
+
// the same parallel-test-runner that hosts dozens of other workers
|
|
34
|
+
// contending on libuv's I/O threads. Under that contention, an in-process
|
|
35
|
+
// 127.0.0.1 fetch can occasionally exceed 5s and the test's `expect(ok:
|
|
36
|
+
// true)` assertion flips to `ok: false, reason: 'daemon ack timeout'`.
|
|
37
|
+
// Optional so production callers keep the 5s default unchanged.
|
|
38
|
+
ackTimeoutMs?: number
|
|
30
39
|
}
|
|
31
40
|
|
|
32
41
|
export type RestartToolDetails = { ok: boolean; containerName: string; reason?: string }
|
|
@@ -45,9 +54,11 @@ export function createRestartTool({
|
|
|
45
54
|
hostdToken,
|
|
46
55
|
stream,
|
|
47
56
|
originatingSessionId,
|
|
57
|
+
ackTimeoutMs,
|
|
48
58
|
}: CreateRestartToolOptions) {
|
|
49
59
|
const doExit = exit ?? ((code: number) => process.exit(code))
|
|
50
60
|
const httpUrl = hostdUrl ?? process.env.TYPECLAW_HOSTD_URL
|
|
61
|
+
const ackBudget = ackTimeoutMs ?? ACK_TIMEOUT_MS
|
|
51
62
|
const httpToken = hostdToken ?? process.env.TYPECLAW_HOSTD_TOKEN
|
|
52
63
|
|
|
53
64
|
return defineTool({
|
|
@@ -78,8 +89,8 @@ export function createRestartTool({
|
|
|
78
89
|
const request = { kind: 'restart' as const, containerName, build }
|
|
79
90
|
const reply =
|
|
80
91
|
httpUrl && httpToken
|
|
81
|
-
? await sendHttp(request, { timeoutMs:
|
|
82
|
-
: await send(request, { timeoutMs:
|
|
92
|
+
? await sendHttp(request, { timeoutMs: ackBudget, url: httpUrl, token: httpToken })
|
|
93
|
+
: await send(request, { timeoutMs: ackBudget, socket: socketPath ?? containerSocketPath() })
|
|
83
94
|
if (!reply.ok) {
|
|
84
95
|
const details: RestartToolDetails = { ok: false, containerName, reason: reply.reason }
|
|
85
96
|
return {
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// Wraps a runtime-emitted notice body in canonical SYSTEM MESSAGE framing so
|
|
2
|
+
// persona-rich models cannot read the prose as a chat instruction from a
|
|
3
|
+
// human and respond to it in-character.
|
|
4
|
+
//
|
|
5
|
+
// The failure mode this exists to prevent: tool results reach the model as
|
|
6
|
+
// USER-role messages (provider tool-call contract — engines cannot tag them
|
|
7
|
+
// as system). The `TOOL_RESULT_PREFIX` already marks each result's leading
|
|
8
|
+
// position, but trailing natural-language hints (the consecutive-send nudge
|
|
9
|
+
// is the canonical case) still parse as conversational prose, and Kimi-K2.x
|
|
10
|
+
// has been observed in production responding to those hints in-character —
|
|
11
|
+
// an apology directly addressed at the human ("sorry for talking so much,
|
|
12
|
+
// I'll be quieter next time") when the only stimulus in the prompt was the
|
|
13
|
+
// router's "Nth consecutive message; end your turn now" hint. Four
|
|
14
|
+
// consecutive in-character replies to fenced-prose runtime hints in a
|
|
15
|
+
// single drain iteration is the observed shape.
|
|
16
|
+
//
|
|
17
|
+
// Framing convention is the same shape `composeTurnPrompt` uses for the
|
|
18
|
+
// loop-guard block in `router.ts` — bracketed marker, fence rules, and
|
|
19
|
+
// explicit "Do not acknowledge or reply to this notice" closer. The
|
|
20
|
+
// loop-guard block has been in production against Kimi for months without
|
|
21
|
+
// the misread we observed on the consecutive-send hint, which is why we
|
|
22
|
+
// reuse the exact same shape here.
|
|
23
|
+
//
|
|
24
|
+
// Applied unconditionally (not model-gated): the cost is ~40 tokens per
|
|
25
|
+
// hint emission, paid only on consecutive sends (where the hint is already
|
|
26
|
+
// firing), and the framing is safe for every model — well-behaved models
|
|
27
|
+
// read it and move on. Gating by model family would have required a
|
|
28
|
+
// traits table for one defense and would still need extending the moment
|
|
29
|
+
// a second model family exhibited the same misread, so we accept the
|
|
30
|
+
// universal cost in exchange for never having to remember to add a new
|
|
31
|
+
// family to a list.
|
|
32
|
+
/**
 * Wraps `body` in the SYSTEM MESSAGE fence: a `---` rule, a bold
 * "not from a human" header, the body itself, a closing sentence telling the
 * reader not to acknowledge or reply, and a final `---` rule.
 *
 * The result starts with two newlines, so it can be appended directly after
 * preceding text.
 *
 * @param body Notice text to embed verbatim between header and closer.
 * @returns The fenced notice string.
 */
export function fenceRuntimeNotice(body: string): string {
  const rule = '---'
  const header = '**[SYSTEM MESSAGE — not from a human]**'
  const closer =
    'This is an automated signal from the channel router, not a message ' +
    'from anyone in the chat. **Do not acknowledge or reply to this notice.**'

  return `\n\n${rule}\n${header}\n\n${body}\n\n${closer}\n${rule}`
}
|
|
@@ -130,7 +130,6 @@ export function createSpawnSubagentTool(options: CreateSpawnSubagentToolOptions)
|
|
|
130
130
|
startedAt,
|
|
131
131
|
status: 'running' as const,
|
|
132
132
|
abort: resolvedHandle.abort,
|
|
133
|
-
awaitCompletion: () => completion.then((c) => completionToFinalShape(c, now() - startedAt)),
|
|
134
133
|
}
|
|
135
134
|
liveRegistry.register(live)
|
|
136
135
|
|
|
@@ -6,9 +6,6 @@ import type { PermissionService } from '@/permissions'
|
|
|
6
6
|
import type { LiveSubagentRegistry, StatusSnapshot, SubagentProgressEvent } from '../live-subagents'
|
|
7
7
|
import type { SessionOrigin } from '../session-origin'
|
|
8
8
|
|
|
9
|
-
const DEFAULT_TIMEOUT_MS = 60_000
|
|
10
|
-
const MAX_TIMEOUT_MS = 300_000
|
|
11
|
-
|
|
12
9
|
export type SubagentOutputToolDetails =
|
|
13
10
|
| {
|
|
14
11
|
ok: true
|
|
@@ -57,43 +54,19 @@ export function createSubagentOutputTool(options: CreateSubagentOutputToolOption
|
|
|
57
54
|
'Fetch the current state of a subagent you previously spawned. Returns one of three statuses: ' +
|
|
58
55
|
"'running' (with a human-readable status_summary and a tail of recent progress events), " +
|
|
59
56
|
"'completed' (with the final message), or 'failed' (with the error). " +
|
|
60
|
-
'
|
|
61
|
-
'
|
|
62
|
-
'
|
|
57
|
+
'Returns immediately with a snapshot — never blocks. ' +
|
|
58
|
+
'For backgrounded spawns, end your turn after spawning and wait for the completion <system-reminder>; ' +
|
|
59
|
+
'then call this once to fetch the result. Use it for ad-hoc status checks too — never in a polling loop.',
|
|
63
60
|
parameters: Type.Object({
|
|
64
61
|
task_id: Type.String({
|
|
65
62
|
description: 'The task_id returned by a previous spawn_subagent call.',
|
|
66
63
|
}),
|
|
67
|
-
block: Type.Optional(
|
|
68
|
-
Type.Boolean({
|
|
69
|
-
description:
|
|
70
|
-
'If true, wait for the subagent to complete (or time out) before returning. Default false: return immediately with the current state.',
|
|
71
|
-
}),
|
|
72
|
-
),
|
|
73
|
-
timeout_ms: Type.Optional(
|
|
74
|
-
Type.Integer({
|
|
75
|
-
description: `When block=true, max milliseconds to wait (default ${DEFAULT_TIMEOUT_MS}, max ${MAX_TIMEOUT_MS}).`,
|
|
76
|
-
minimum: 1,
|
|
77
|
-
maximum: MAX_TIMEOUT_MS,
|
|
78
|
-
}),
|
|
79
|
-
),
|
|
80
64
|
}),
|
|
81
65
|
|
|
82
66
|
async execute(_toolCallId, params) {
|
|
83
67
|
if (permissions !== undefined && !permissions.has(getOrigin(), 'subagent.output')) {
|
|
84
68
|
return errorResult('subagent.output denied: insufficient permissions')
|
|
85
69
|
}
|
|
86
|
-
const live = liveRegistry.get(params.task_id)
|
|
87
|
-
if (live === undefined) {
|
|
88
|
-
return errorResult(`Unknown task_id: ${params.task_id}.`)
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
const wantsBlock = params.block === true && live.status === 'running'
|
|
92
|
-
if (wantsBlock) {
|
|
93
|
-
const timeoutMs = clampTimeout(params.timeout_ms)
|
|
94
|
-
await raceWithTimeout(live.awaitCompletion(), timeoutMs)
|
|
95
|
-
}
|
|
96
|
-
|
|
97
70
|
const snap = liveRegistry.snapshot(params.task_id, now())
|
|
98
71
|
if (snap === undefined) {
|
|
99
72
|
return errorResult(`Unknown task_id: ${params.task_id}.`)
|
|
@@ -103,27 +76,6 @@ export function createSubagentOutputTool(options: CreateSubagentOutputToolOption
|
|
|
103
76
|
})
|
|
104
77
|
}
|
|
105
78
|
|
|
106
|
-
function clampTimeout(value: number | undefined): number {
|
|
107
|
-
if (value === undefined) return DEFAULT_TIMEOUT_MS
|
|
108
|
-
return Math.min(Math.max(1, Math.floor(value)), MAX_TIMEOUT_MS)
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
async function raceWithTimeout<T>(promise: Promise<T>, timeoutMs: number): Promise<T | undefined> {
|
|
112
|
-
return new Promise<T | undefined>((resolve) => {
|
|
113
|
-
const timer = setTimeout(() => resolve(undefined), timeoutMs)
|
|
114
|
-
promise.then(
|
|
115
|
-
(value) => {
|
|
116
|
-
clearTimeout(timer)
|
|
117
|
-
resolve(value)
|
|
118
|
-
},
|
|
119
|
-
() => {
|
|
120
|
-
clearTimeout(timer)
|
|
121
|
-
resolve(undefined)
|
|
122
|
-
},
|
|
123
|
-
)
|
|
124
|
-
})
|
|
125
|
-
}
|
|
126
|
-
|
|
127
79
|
type ToolReturn = {
|
|
128
80
|
content: { type: 'text'; text: string }[]
|
|
129
81
|
details: SubagentOutputToolDetails
|