typeclaw 0.9.2 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/package.json +2 -2
  2. package/src/agent/index.ts +46 -11
  3. package/src/agent/restart-handoff/index.ts +91 -0
  4. package/src/agent/restart-handoff/paths.ts +11 -0
  5. package/src/agent/session-origin.ts +30 -10
  6. package/src/agent/subagent-completion-reminder.ts +4 -2
  7. package/src/agent/system-prompt.ts +1 -1
  8. package/src/agent/tools/restart.ts +42 -1
  9. package/src/agent/tools/skip-response.ts +157 -0
  10. package/src/bundled-plugins/memory/README.md +18 -2
  11. package/src/bundled-plugins/memory/index.ts +108 -6
  12. package/src/bundled-plugins/memory/memory-logger.ts +33 -24
  13. package/src/bundled-plugins/security/index.ts +19 -17
  14. package/src/bundled-plugins/security/permissions.ts +9 -8
  15. package/src/bundled-plugins/security/policies/cron-promotion.ts +26 -9
  16. package/src/bundled-plugins/security/policies/git-exfil.ts +23 -15
  17. package/src/bundled-plugins/security/policies/prompt-injection.ts +1 -1
  18. package/src/bundled-plugins/security/policies/role-promotion.ts +25 -18
  19. package/src/channels/adapters/github/auth-app.ts +53 -9
  20. package/src/channels/adapters/github/auth-pat.ts +4 -1
  21. package/src/channels/adapters/github/auth.ts +10 -0
  22. package/src/channels/adapters/github/event-permissions.ts +83 -0
  23. package/src/channels/adapters/github/inbound.ts +126 -1
  24. package/src/channels/adapters/github/index.ts +60 -66
  25. package/src/channels/adapters/github/outbound.ts +65 -17
  26. package/src/channels/adapters/github/permission-guidance.ts +169 -0
  27. package/src/channels/adapters/github/team-membership.ts +56 -0
  28. package/src/channels/router.ts +313 -10
  29. package/src/channels/schema.ts +22 -0
  30. package/src/channels/types.ts +1 -1
  31. package/src/cli/channel.ts +135 -38
  32. package/src/cli/cron.ts +1 -1
  33. package/src/cli/init.ts +133 -86
  34. package/src/cli/inspect-controller.ts +66 -0
  35. package/src/cli/inspect.ts +99 -14
  36. package/src/cli/role.ts +2 -2
  37. package/src/cli/run.ts +24 -5
  38. package/src/cli/tui.ts +34 -10
  39. package/src/cli/tunnel.ts +453 -14
  40. package/src/config/config.ts +35 -7
  41. package/src/config/providers.ts +82 -56
  42. package/src/cron/bridge.ts +25 -4
  43. package/src/hostd/daemon.ts +44 -24
  44. package/src/hostd/portbroker-manager.ts +19 -3
  45. package/src/init/dockerfile.ts +52 -0
  46. package/src/init/env-file.ts +66 -0
  47. package/src/init/gitignore.ts +8 -0
  48. package/src/init/hatching.ts +32 -5
  49. package/src/init/index.ts +131 -39
  50. package/src/init/validate-api-key.ts +31 -0
  51. package/src/inspect/index.ts +47 -6
  52. package/src/inspect/loop.ts +31 -0
  53. package/src/inspect/replay.ts +15 -1
  54. package/src/permissions/builtins.ts +29 -21
  55. package/src/permissions/permissions.ts +32 -5
  56. package/src/role-claim/code.ts +9 -9
  57. package/src/role-claim/controller.ts +3 -2
  58. package/src/role-claim/match-rule.ts +14 -19
  59. package/src/role-claim/pending.ts +2 -2
  60. package/src/run/codex-fetch-observer.ts +377 -0
  61. package/src/run/index.ts +12 -2
  62. package/src/server/index.ts +59 -1
  63. package/src/shared/protocol.ts +1 -1
  64. package/src/skills/typeclaw-channel-github/SKILL.md +45 -1
  65. package/src/skills/typeclaw-codex-cli/SKILL.md +1 -1
  66. package/src/skills/typeclaw-codex-cli/references/auth-flow.md +14 -1
  67. package/src/skills/typeclaw-config/SKILL.md +7 -1
  68. package/src/skills/typeclaw-config/references/recommended-mounts.md +233 -0
  69. package/src/skills/typeclaw-permissions/SKILL.md +24 -18
  70. package/src/skills/typeclaw-tunnels/SKILL.md +33 -1
  71. package/src/tui/index.ts +17 -5
  72. package/src/tunnels/index.ts +1 -0
  73. package/src/tunnels/manager.ts +18 -0
  74. package/src/tunnels/providers/cloudflare-named.ts +224 -0
  75. package/src/tunnels/types.ts +17 -1
  76. package/typeclaw.schema.json +120 -7
@@ -152,6 +152,29 @@ export function createPermissionService(opts: CreatePermissionServiceOptions = {
152
152
  }
153
153
  }
154
154
 
155
+ // Walk order: owner, trusted, custom roles (in REVERSE declaration order),
156
+ // member, guest. First role whose `match[]` covers the origin wins.
157
+ //
158
+ // Built-in tower: owner > trusted > member > guest. Pinning the tower
159
+ // ahead of any user-declared rule closes a load-bearing footgun in the
160
+ // previous pure-declaration-order resolver: declaring
161
+ // `member.match: ["*"]` before `owner.match: [...]` resolved every
162
+ // channel session — INCLUDING the owner's — to `member`, because the
163
+ // wildcard matched first. The rolePromotion guard then made it
164
+ // un-fixable from inside the demoted session (a member-resolved speaker
165
+ // cannot rewrite `roles` without a TUI-issued ack).
166
+ //
167
+ // Custom roles use REVERSE declaration order: later declarations override
168
+ // earlier ones. This matches the standard "later config wins" mental
169
+ // model — when an operator adds a new role with the same match-scope as
170
+ // an existing one (or appends a new author-pinned override to an existing
171
+ // broad rule), the newer entry takes precedence. The previous "earlier
172
+ // wins" was an arbitrary consequence of map iteration order rather than
173
+ // a deliberate semantic.
174
+ //
175
+ // Custom roles cannot self-promote above trusted (no inherent severity
176
+ // guarantee) and cannot demote themselves below member (declaring a custom
177
+ // role implies the operator wants it to win against bottom catch-alls).
155
178
  function buildRoleTable(
156
179
  roles: RolesConfig,
157
180
  pluginPermissions: readonly string[],
@@ -160,16 +183,20 @@ function buildRoleTable(
160
183
  const out: ResolvedRole[] = []
161
184
  const seen = new Set<string>()
162
185
 
163
- for (const name of Object.keys(roles)) {
164
- if (seen.has(name)) continue
186
+ const emit = (name: string): void => {
187
+ if (seen.has(name)) return
165
188
  seen.add(name)
166
189
  out.push(resolveOne(name, roles[name], pluginPermissions, ownerWildcardExclusions))
167
190
  }
168
191
 
169
- for (const name of BUILTIN_ROLE_NAMES) {
170
- if (seen.has(name)) continue
171
- out.push(resolveOne(name, undefined, pluginPermissions, ownerWildcardExclusions))
192
+ emit('owner')
193
+ emit('trusted')
194
+ const customRoles = Object.keys(roles).filter((name) => !isBuiltinRoleName(name))
195
+ for (let i = customRoles.length - 1; i >= 0; i--) {
196
+ emit(customRoles[i]!)
172
197
  }
198
+ emit('member')
199
+ emit('guest')
173
200
 
174
201
  return out
175
202
  }
@@ -1,17 +1,17 @@
1
1
  import { randomBytes } from 'node:crypto'
2
2
 
3
3
  // Role-claim codes are short, human-typeable tokens the operator sends from
4
- // their host CLI to the bot via a channel DM to prove ownership of that
5
- // channel identity. Shape: `claim-XXXX-YYYY` where each block is 4 chars
6
- // from a Crockford-style base32 alphabet (0-9 + A-Z minus I, L, O, U to
7
- // dodge OCR-confusable / profane shapes). 8 chars * 5 bits = 40 bits of
8
- // entropy, which is overkill for a TTL'd in-memory window but cheap to
9
- // display and dictate over voice.
4
+ // their host CLI to the bot in any chat (DM, group, channel) to prove
5
+ // ownership of that channel identity. Shape: `claim-XXXX-YYYY` where each
6
+ // block is 4 chars from a Crockford-style base32 alphabet (0-9 + A-Z minus
7
+ // I, L, O, U to dodge OCR-confusable / profane shapes). 8 chars * 5 bits =
8
+ // 40 bits of entropy, which is overkill for a TTL'd in-memory window but
9
+ // cheap to display and dictate over voice.
10
10
  //
11
11
  // The `claim-` prefix lets the channel router recognize potential claim
12
- // attempts in a DM body without scanning the whole text for hex blocks,
13
- // and distinguishes claim DMs from normal first-message text like "hi"
14
- // which would otherwise need a regex of its own to disambiguate.
12
+ // attempts in inbound text without scanning the whole body for hex blocks,
13
+ // and distinguishes claim messages from normal first-message text like
14
+ // "hi" which would otherwise need a regex of its own to disambiguate.
15
15
 
16
16
  export const CLAIM_CODE_PREFIX = 'claim-'
17
17
 
@@ -10,8 +10,9 @@ import { createPendingClaimRegistry, type PendingClaim, type PendingClaimRegistr
10
10
  //
11
11
  // 1. The host CLI (typeclaw role claim) opens a WS and sends `claim_start`.
12
12
  // 2. The WS server forwards that to controller.startClaim().
13
- // 3. The channel router's claimHandler (also wired here) intercepts DMs
14
- // bearing the code and calls controller.tryConsumeInbound().
13
+ // 3. The channel router's claimHandler (also wired here) intercepts any
14
+ // inbound bearing the code (DM, group, or channel) and calls
15
+ // controller.tryConsumeInbound().
15
16
  // 4. On consume, the controller writes to typeclaw.json#roles.<role>.match
16
17
  // via grantRole, then reloads the live PermissionService so the new
17
18
  // match rule takes effect without a container restart.
@@ -1,15 +1,19 @@
1
1
  // Builds a canonical match-rule DSL string from an inbound channel origin,
2
- // for the role table. Output shapes:
2
+ // for the role table. Output shape is always platform-wide + author:
3
3
  //
4
- // slack:T0123 author:U_ALICE
5
- // discord:9999 author:U_ALICE
6
- // telegram:42 author:U_ALICE
7
- // kakao:dm/<chatId> author:<authorId>
4
+ // slack:* author:<authorId>
5
+ // discord:* author:<authorId>
6
+ // telegram:* author:<authorId>
7
+ // kakao:* author:<authorId>
8
8
  //
9
- // The author qualifier is always emitted so a claim grants the specific
10
- // human, not the whole workspace. To grant the whole workspace, the
11
- // operator edits typeclaw.json by hand or runs a future `typeclaw role grant`
12
- // without --claim.
9
+ // "Platform-wide" means every chat the adapter sees on that platform —
10
+ // DMs, group chats, and threads alike — gated by the author qualifier so
11
+ // only this specific human is matched. The intent is: once an operator
12
+ // proves they control a channel identity (by sending a code to the bot),
13
+ // they keep their role wherever they speak from on the same platform. To
14
+ // scope tighter (e.g. one workspace, one chat), the operator edits
15
+ // typeclaw.json by hand; the claim flow is deliberately broad because
16
+ // re-claiming on every new chat would be tedious for the common case.
13
17
 
14
18
  import type { ChannelKey } from '@/channels/types'
15
19
 
@@ -31,14 +35,5 @@ const ADAPTER_TO_PLATFORM: Record<ChannelKey['adapter'], 'slack' | 'discord' | '
31
35
 
32
36
  export function formatClaimMatchRule(origin: PartialChannelOrigin): string {
33
37
  const platform = ADAPTER_TO_PLATFORM[origin.adapter]
34
- const authorQual = ` author:${origin.authorId}`
35
- if (origin.adapter === 'kakaotalk') {
36
- // Kakao has no workspace; routes use dm/group/open buckets. We can't
37
- // know which bucket from a partial origin alone (adapter-side classifies
38
- // it), so claim flows are restricted to DM and we emit the specific
39
- // chat-id form so the rule grants only this 1:1 conversation, not every
40
- // DM the agent is in.
41
- return `${platform}:dm/${origin.chat}${authorQual}`
42
- }
43
- return `${platform}:${origin.workspace}${authorQual}`
38
+ return `${platform}:* author:${origin.authorId}`
44
39
  }
@@ -21,8 +21,8 @@ export type PendingClaimRegistry = {
21
21
  cancel: (code: string) => boolean
22
22
  current: () => PendingClaim | null
23
23
  // Snapshot of consumption result without actually committing the grant.
24
- // The router calls this on every DM-shaped inbound; the grant only fires
25
- // when the result is 'consumed'.
24
+ // The router calls this on every claim-code-bearing inbound; the grant
25
+ // only fires when the result is 'consumed'.
26
26
  tryConsume: (
27
27
  code: string,
28
28
  origin: PartialChannelOrigin,
@@ -0,0 +1,377 @@
1
+ export type CodexFetchObserverLogger = {
2
+ info: (msg: string) => void
3
+ warn: (msg: string) => void
4
+ }
5
+
6
+ export type CodexFetchObserverOptions = {
7
+ logger?: CodexFetchObserverLogger
8
+ codexHost?: string
9
+ now?: () => number
10
+ // Override the default pre-headers (TTFB) deadline applied to the outer
11
+ // fetch(). When the codex backend silently holds a request without sending
12
+ // response headers, this is the timer that releases the request so
13
+ // `pi-coding-agent`'s `_isRetryableError` can retry. Default: 15_000 ms.
14
+ //
15
+ // Healthy Codex turns return response headers within ~1s (observed
16
+ // production p50: ~860ms). The first SSE event (`response.created`) is
17
+ // emitted before any model work begins and arrives within ~50ms of
18
+ // headers. Pathological-but-healthy upper bounds: TLS handshake on a cold
19
+ // connection (~2s), prompt-prefill on a cache miss with large input
20
+ // (~3s), Cloudflare PoP routing slowness (~2s) — sum ~7s. 15s is ~2x
21
+ // that, so anything past it is almost certainly the silent-hang failure
22
+ // mode rather than a real request making progress. False-positive cost
23
+ // is one retry (~5s extra); false-negative cost is the full Bun socket
24
+ // deadline (~268s). Aggressive wins.
25
+ ttfbMs?: number
26
+ // Override the sliding inter-chunk idle deadline applied to the SSE body
27
+ // reader. Resets on every chunk; if no bytes arrive within this window the
28
+ // body stream errors. Default: 300_000 ms, matches `openai/codex`'s Rust CLI
29
+ // `DEFAULT_STREAM_IDLE_TIMEOUT_MS`. Set to 0 to disable just this timer.
30
+ idleMs?: number
31
+ // Schedule fn for tests. Receives (delayMs, callback) and returns a handle
32
+ // the wrapper can pass to `clear`. Default: `setTimeout`/`clearTimeout`.
33
+ scheduler?: TimeoutScheduler
34
+ }
35
+
36
+ export type TimeoutScheduler = {
37
+ set: (delayMs: number, cb: () => void) => unknown
38
+ clear: (handle: unknown) => void
39
+ }
40
+
41
+ const DEFAULT_CODEX_HOST = 'chatgpt.com'
42
+ const CODEX_PATH_FRAGMENT = '/codex/responses'
43
+ const ENV_DISABLE_OBSERVER = 'TYPECLAW_CODEX_FETCH_OBSERVER'
44
+ const ENV_DISABLE_TIMEOUTS = 'TYPECLAW_CODEX_TIMEOUTS'
45
+ const ENV_TTFB_MS = 'TYPECLAW_CODEX_TTFB_MS'
46
+ const ENV_IDLE_MS = 'TYPECLAW_CODEX_IDLE_MS'
47
+ const DEFAULT_TTFB_MS = 15_000
48
+ const DEFAULT_IDLE_MS = 300_000
49
+ const LOG_PREFIX = '[codex-fetch]'
50
+
51
+ const defaultScheduler: TimeoutScheduler = {
52
+ set: (delayMs, cb) => setTimeout(cb, delayMs),
53
+ clear: (handle) => clearTimeout(handle as ReturnType<typeof setTimeout>),
54
+ }
55
+
56
+ const consoleLogger: CodexFetchObserverLogger = {
57
+ info: (m) => console.log(m),
58
+ warn: (m) => console.warn(m),
59
+ }
60
+
61
+ type InstallState = {
62
+ originalFetch: typeof fetch
63
+ uninstall: () => void
64
+ }
65
+
66
+ let installed: InstallState | null = null
67
+
68
// Decides whether a fetch call should get Codex phase-timing instrumentation.
//
// A call qualifies only when all of the following hold:
//   - the effective HTTP method is POST (`init.method` wins; otherwise the
//     method of a `Request` input; otherwise the fetch default of GET),
//   - the URL parses as absolute and its hostname equals `codexHost`,
//   - the URL path contains the Codex Responses path fragment.
// Anything else (other methods, other hosts, unparseable URLs) returns false.
function shouldObserve(input: RequestInfo | URL, init: RequestInit | undefined, codexHost: string): boolean {
  const fallbackMethod = input instanceof Request ? input.method : 'GET'
  const verb = (init?.method ?? fallbackMethod).toUpperCase()
  if (verb !== 'POST') return false

  const rawUrl = typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url

  let target: URL
  try {
    target = new URL(rawUrl)
  } catch {
    // Relative or malformed URLs cannot be matched against the host.
    return false
  }

  return target.hostname === codexHost && target.pathname.includes(CODEX_PATH_FRAGMENT)
}
88
+
89
// Renders a nullable string as a single-line, double-quoted log value.
//
// `null` becomes the bare token `null`. Any other value is wrapped in double
// quotes with backslashes, double quotes, CR, and LF escaped, so the record
// stays on one line and the closing quote is unambiguous. Backslashes are
// escaped first so the escapes added afterwards are not themselves re-escaped.
function quote(value: string | null): string {
  if (value === null) return 'null'
  const escaped = value
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\r/g, '\\r')
    .replace(/\n/g, '\\n')
  return `"${escaped}"`
}
93
+
94
// Builds one space-separated `key=value` log record, prefixed with LOG_PREFIX.
//
// Null fields are rendered as the bare token `null`. Only `error` goes through
// `quote()`; every other field is emitted verbatim.
function formatLine(fields: {
  status: number | null
  headersMs: number | null
  firstByteMs: number | null
  totalMs: number
  bodyBytes: number
  retryAfter: string | null
  requestId: string | null
  error: string | null
  cause: string | null
}): string {
  const orNull = (value: number | string | null): string => (value === null ? 'null' : String(value))

  const parts: string[] = [
    LOG_PREFIX,
    `status=${orNull(fields.status)}`,
    `headers_ms=${orNull(fields.headersMs)}`,
    `first_byte_ms=${orNull(fields.firstByteMs)}`,
    `total_ms=${fields.totalMs}`,
    `body_bytes=${fields.bodyBytes}`,
    `retry_after=${orNull(fields.retryAfter)}`,
    `request_id=${orNull(fields.requestId)}`,
    `error=${quote(fields.error)}`,
    `cause=${orNull(fields.cause)}`,
  ]
  return parts.join(' ')
}
118
+
119
// Reads a non-negative millisecond value from the environment variable `name`.
//
// Returns `fallback` when the variable is unset, empty, or not a plain decimal
// integer. Validation is strict on purpose: `Number.parseInt` accepts any
// numeric prefix, so `15_000` would silently become 15 ms and `0x10` would
// become 0 (which callers treat as "timer disabled"). A malformed value now
// falls back to the default instead of being half-parsed.
function readEnvMs(name: string, fallback: number): number {
  const raw = process.env[name]?.trim()
  if (raw === undefined || raw === '') return fallback
  // Optional leading `+`, then digits only: no sign, separators, or units.
  if (!/^\+?\d+$/.test(raw)) return fallback
  const parsed = Number(raw)
  return Number.isSafeInteger(parsed) ? parsed : fallback
}
126
+
127
+ type BodyTapConfig = {
128
+ idleMs: number
129
+ scheduler: TimeoutScheduler
130
+ }
131
+
132
// Wraps a response so that body streaming is timed, logged, and guarded by a
// sliding idle deadline.
//
// Parameters:
//   response   - the upstream response whose body is observed.
//   start      - timestamp (from `now`) taken when the request began.
//   headersMs  - elapsed ms until response headers arrived.
//   status, retryAfter, requestId - values copied into the log record.
//   now        - clock used for all elapsed-time fields.
//   logger     - receives exactly one `info` line per response.
//   config     - `idleMs` (0 disables the idle timer) and the timer scheduler.
//
// Returns the original response untouched when it has no body; otherwise a new
// Response with the same status, statusText, and headers whose body is the
// observed stream.
//
// Logging: one record is emitted per response (guarded by `settled`) — on a
// clean end of stream, on a read error / idle timeout, or when the consumer
// cancels. An idle timeout sets `cause=idle_timeout` and errors the returned
// body with the abort reason.
//
// Upstream release: every failure path AND consumer cancellation cancels the
// upstream reader. Without that, an idle timeout (delivered as a rejected
// `Promise.race`) or a consumer `cancel()` would only detach from the upstream
// body and leave the stalled stream open.
function attachBodyTimingTap(
  response: Response,
  start: number,
  headersMs: number,
  status: number,
  retryAfter: string | null,
  requestId: string | null,
  now: () => number,
  logger: CodexFetchObserverLogger,
  config: BodyTapConfig,
): Response {
  if (response.body === null) {
    logger.info(
      formatLine({
        status,
        headersMs,
        firstByteMs: null,
        totalMs: now() - start,
        bodyBytes: 0,
        retryAfter,
        requestId,
        error: null,
        cause: null,
      }),
    )
    return response
  }

  let firstByteMs: number | null = null
  let bodyBytes = 0
  let settled = false
  let cause: string | null = null

  // Emits the single log record for this response; later calls are no-ops.
  const settle = (error: string | null): void => {
    if (settled) return
    settled = true
    logger.info(
      formatLine({
        status,
        headersMs,
        firstByteMs,
        totalMs: now() - start,
        bodyBytes,
        retryAfter,
        requestId,
        error,
        cause,
      }),
    )
  }

  // Pass-through transform that records first-byte latency and byte count.
  const tap = new TransformStream<Uint8Array, Uint8Array>({
    transform(chunk, controller) {
      if (firstByteMs === null) firstByteMs = now() - start
      bodyBytes += chunk.byteLength
      controller.enqueue(chunk)
    },
    flush() {
      settle(null)
    },
  })

  const piped = response.body.pipeThrough(tap, { preventCancel: false })
  // Acquired outside `start` so that `cancel` can reach it too.
  const reader = piped.getReader()
  let cancelledByConsumer = false

  const idleController = config.idleMs > 0 ? new AbortController() : null
  let idleHandle: unknown = null
  // (Re)starts the sliding idle deadline; no-op when the idle timer is disabled.
  const armIdleTimer = (): void => {
    if (idleController === null) return
    if (idleHandle !== null) config.scheduler.clear(idleHandle)
    idleHandle = config.scheduler.set(config.idleMs, () => {
      cause = 'idle_timeout'
      idleController.abort(new Error(`Codex SSE body idle for ${config.idleMs}ms (typeclaw observer timeout)`))
    })
  }
  const disarmIdleTimer = (): void => {
    if (idleHandle !== null) {
      config.scheduler.clear(idleHandle)
      idleHandle = null
    }
  }

  // A single abort listener / promise is shared by every read so the number of
  // listeners on the signal stays at 1 regardless of chunk count.
  const observerBody = new ReadableStream<Uint8Array>({
    async start(controller) {
      armIdleTimer()
      let abortFired = false
      let onAbort: (() => void) | null = null
      const abortPromise = idleController
        ? new Promise<never>((_, reject) => {
            onAbort = () => {
              abortFired = true
              reject(idleController.signal.reason ?? new Error('idle timeout'))
            }
            if (idleController.signal.aborted) onAbort()
            else idleController.signal.addEventListener('abort', onAbort, { once: true })
          })
        : null
      // Mark the shared rejection as handled in case no race ever observes it.
      abortPromise?.catch(() => {})
      try {
        while (true) {
          const readPromise = reader.read()
          const result = abortPromise ? await Promise.race([readPromise, abortPromise]) : await readPromise
          // The consumer went away: `cancel` already logged and cancelled
          // upstream, and the controller can no longer be closed or fed.
          if (cancelledByConsumer) return
          // The read won the race but the idle abort fired in the meantime.
          if (abortFired) throw idleController?.signal.reason ?? new Error('idle timeout')
          if (result.done) {
            disarmIdleTimer()
            controller.close()
            return
          }
          armIdleTimer()
          controller.enqueue(result.value)
        }
      } catch (err) {
        disarmIdleTimer()
        if (cancelledByConsumer) return
        settle(err instanceof Error ? err.message : String(err))
        // Release the upstream body on every failure, including the idle
        // timeout that arrives here as a rejected race.
        reader.cancel(err).catch(() => {})
        controller.error(err)
      } finally {
        if (onAbort !== null && idleController !== null && !idleController.signal.aborted) {
          idleController.signal.removeEventListener('abort', onAbort)
        }
        reader.releaseLock()
      }
    },
    cancel(reason) {
      cancelledByConsumer = true
      disarmIdleTimer()
      const message = reason === undefined ? 'cancelled' : reason instanceof Error ? reason.message : String(reason)
      settle(message)
      // Propagate the cancellation so the upstream body is not left open.
      return reader.cancel(reason).catch(() => {})
    },
  })

  return new Response(observerBody, {
    status: response.status,
    statusText: response.statusText,
    headers: response.headers,
  })
}
281
+
282
// Replaces `globalThis.fetch` with a wrapper that observes Codex Responses
// requests, and returns a function that restores the original fetch.
//
// Behaviour:
//   - Returns a no-op uninstaller when TYPECLAW_CODEX_FETCH_OBSERVER is `off`.
//   - A second install while one is active logs a warning and returns the
//     existing uninstaller; fetch is not wrapped twice.
//   - Requests that `shouldObserve` rejects are forwarded untouched.
//   - Observed requests get a pre-headers (TTFB) deadline when `ttfbMs > 0`
//     and an inter-chunk idle deadline on the body when `idleMs > 0`. Both are
//     forced to 0 when TYPECLAW_CODEX_TIMEOUTS is `off`; otherwise explicit
//     options win over the TYPECLAW_CODEX_TTFB_MS / TYPECLAW_CODEX_IDLE_MS
//     environment values, which win over the defaults.
//   - A request that fails before headers logs one record and rethrows; a TTFB
//     timeout surfaces as a fresh Error carrying the timeout message.
//
// The TTFB signal is combined with the caller's effective signal. That is
// `init.signal` when provided, otherwise the signal of a `Request` input.
// Passing only the TTFB signal in `init` would replace the Request's own
// signal and lose caller-initiated aborts.
export function installCodexFetchObserver(opts: CodexFetchObserverOptions = {}): () => void {
  if (process.env[ENV_DISABLE_OBSERVER] === 'off') {
    return () => {}
  }
  const logger = opts.logger ?? consoleLogger
  if (installed !== null) {
    logger.warn(`${LOG_PREFIX} install called but observer already installed; ignoring`)
    return installed.uninstall
  }

  const codexHost = opts.codexHost ?? DEFAULT_CODEX_HOST
  const now = opts.now ?? Date.now
  const scheduler = opts.scheduler ?? defaultScheduler
  const timeoutsEnabled = process.env[ENV_DISABLE_TIMEOUTS] !== 'off'
  const ttfbMs = timeoutsEnabled ? (opts.ttfbMs ?? readEnvMs(ENV_TTFB_MS, DEFAULT_TTFB_MS)) : 0
  const idleMs = timeoutsEnabled ? (opts.idleMs ?? readEnvMs(ENV_IDLE_MS, DEFAULT_IDLE_MS)) : 0
  const originalFetch = globalThis.fetch

  const ttfbTimeoutError = (): Error =>
    new Error(`Codex fetch timed out before response headers after ${ttfbMs}ms (typeclaw observer timeout)`)

  const wrappedImpl = async (
    input: Parameters<typeof fetch>[0],
    init?: Parameters<typeof fetch>[1],
  ): Promise<Response> => {
    if (!shouldObserve(input, init, codexHost)) {
      return originalFetch(input, init)
    }
    const start = now()

    let ttfbCause: 'ttfb_timeout' | null = null
    let ttfbHandle: unknown = null
    let initWithSignal: RequestInit | undefined = init
    if (ttfbMs > 0) {
      const ttfbController = new AbortController()
      ttfbHandle = scheduler.set(ttfbMs, () => {
        ttfbCause = 'ttfb_timeout'
        ttfbController.abort(ttfbTimeoutError())
      })
      // An explicit `init.signal` (even null) overrides a Request's signal in
      // fetch(), so mirror that precedence when picking the caller's signal.
      const callerSignal = init?.signal !== undefined ? init.signal : input instanceof Request ? input.signal : null
      const signal = callerSignal ? AbortSignal.any([callerSignal, ttfbController.signal]) : ttfbController.signal
      initWithSignal = { ...init, signal }
    }

    let response: Response
    try {
      response = await originalFetch(input, initWithSignal)
    } catch (err) {
      if (ttfbHandle !== null) scheduler.clear(ttfbHandle)
      const isTtfbAbort = ttfbCause === 'ttfb_timeout'
      // Surface a stable timeout message instead of the runtime's abort error.
      const surfacedError = isTtfbAbort ? ttfbTimeoutError() : err
      const message = surfacedError instanceof Error ? surfacedError.message : String(surfacedError)
      logger.info(
        formatLine({
          status: null,
          headersMs: null,
          firstByteMs: null,
          totalMs: now() - start,
          bodyBytes: 0,
          retryAfter: null,
          requestId: null,
          error: message,
          cause: ttfbCause,
        }),
      )
      throw surfacedError
    }
    // Headers arrived: the TTFB deadline no longer applies.
    if (ttfbHandle !== null) scheduler.clear(ttfbHandle)
    const headersMs = now() - start
    const retryAfter = response.headers.get('retry-after')
    const requestId = response.headers.get('x-request-id')
    return attachBodyTimingTap(response, start, headersMs, response.status, retryAfter, requestId, now, logger, {
      idleMs,
      scheduler,
    })
  }

  // Carry over the `preconnect` static from the original fetch (falling back
  // to a no-op) so the wrapper remains a drop-in replacement.
  const wrapped = Object.assign(wrappedImpl, {
    preconnect: (originalFetch as { preconnect?: (url: string) => void }).preconnect ?? (() => {}),
  }) as typeof fetch

  globalThis.fetch = wrapped

  const uninstall = (): void => {
    if (installed === null) return
    // Only restore when nobody has replaced fetch again on top of the wrapper.
    if (globalThis.fetch === wrapped) {
      globalThis.fetch = originalFetch
    }
    installed = null
  }

  installed = { originalFetch, uninstall }
  return uninstall
}
package/src/run/index.ts CHANGED
@@ -59,11 +59,12 @@ import { createTunnelManager, type TunnelManager, type TunnelManagerOptions } fr
59
59
 
60
60
  import { BUNDLED_PLUGINS } from './bundled-plugins'
61
61
  import { buildChannelSessionFactory } from './channel-session-factory'
62
+ import { installCodexFetchObserver } from './codex-fetch-observer'
62
63
  import { createPluginRuntime, type PluginRuntime, type PluginSubagentEntry } from './plugin-runtime'
63
64
 
64
65
  type BunServer = ReturnType<Server['start']>
65
66
 
66
- export type TuiFactory = (options: TuiOptions) => { run: () => Promise<void> }
67
+ export type TuiFactory = (options: TuiOptions) => { run: () => Promise<unknown> }
67
68
 
68
69
  export type LoadCronFn = (agentDir: string, options?: { subagents?: SubagentRegistry }) => Promise<LoadCronResult>
69
70
  export type SchedulerFactory = (options: { cwd: string; file: CronFile; onFire: (job: CronJob) => void }) => Scheduler
@@ -86,7 +87,7 @@ export type StartAgentOptions = {
86
87
 
87
88
  export type StartAgentResult = {
88
89
  server: BunServer
89
- tuiPromise: Promise<void> | null
90
+ tuiPromise: Promise<unknown> | null
90
91
  scheduler: Scheduler | null
91
92
  cronConsumer: CronConsumer | null
92
93
  subagentConsumer: SubagentConsumer
@@ -113,6 +114,14 @@ export async function startAgent({
113
114
  }: StartAgentOptions): Promise<StartAgentResult> {
114
115
  const reloadRegistry = new ReloadRegistry()
115
116
 
117
+ // Wrap globalThis.fetch BEFORE any plugin/session/manager construction so
118
+ // every Codex Responses call from anywhere in the container is observed.
119
+ // Logs one `[codex-fetch]` line per matched request with phase timings.
120
+ // It never retries, but it does abort requests that exceed its TTFB or
121
+ // body-idle deadlines, while we investigate the recurring multi-minute
122
+ // Codex stalls (see issue #394). Opt out with TYPECLAW_CODEX_FETCH_OBSERVER=off.
123
+ const uninstallCodexFetchObserver = installCodexFetchObserver()
124
+
116
125
  // The host CLI sets TYPECLAW_CONTAINER_NAME when it `docker run`s us. When
117
126
  // running outside a typeclaw container (tests, ad-hoc `bun run typeclaw run`
118
127
  // outside docker), the env var is absent and the `restart` tool is omitted —
@@ -585,6 +594,7 @@ export async function startAgent({
585
594
  subagentCompletionBridge.stop()
586
595
  await tunnelManager.stop()
587
596
  await channelManager.stop()
597
+ uninstallCodexFetchObserver()
588
598
  }
589
599
 
590
600
  if (!attachTui) {
@@ -1,3 +1,4 @@
1
+ import { SessionManager } from '@mariozechner/pi-coding-agent'
1
2
  import type { Server as BunServer, ServerWebSocket } from 'bun'
2
3
 
3
4
  import {
@@ -10,6 +11,7 @@ import { runPluginDoctorChecks, runPluginDoctorFix } from '@/agent/doctor'
10
11
  import type { LiveSessionRegistry } from '@/agent/live-sessions'
11
12
  import type { LiveSubagentRegistry } from '@/agent/live-subagents'
12
13
  import { detectProviderError } from '@/agent/provider-error'
14
+ import { consumeRestartHandoff, type RestartHandoff } from '@/agent/restart-handoff'
13
15
  import type { SessionOrigin } from '@/agent/session-origin'
14
16
  import { parseSubagentCompletedPayload, renderSubagentCompletionReminder } from '@/agent/subagent-completion-reminder'
15
17
  import type { CreateSessionForSubagent } from '@/agent/subagents'
@@ -233,6 +235,42 @@ export function createServer({
233
235
  }: ServerOptions) {
234
236
  const sessionStates = new WeakMap<Ws, SessionState>()
235
237
  const callIdToWs = new Map<string, AnyOwnerWs>()
238
+
239
// Hands the pending restart handoff to AT MOST ONE caller per container
// lifetime.
//
// The first caller runs `consumeRestartHandoff(agentDir)` (per the original
// note, this checks `.typeclaw/restart-pending.json`) and receives its result.
// Callers that arrive while that consume is still in flight wait for it to
// finish and then get null. If they shared the resolved handoff, every
// concurrent first-open would reopen the same originating session file and
// publish its own post-restart kick. Once the consume has completed, all later
// callers get null without touching the file.
//
// Returns null when there is no agent directory, when no handoff is pending,
// or when consuming fails (the failure is logged, not thrown).
let handoffInFlight: Promise<RestartHandoff | null> | null = null
let handoffPending = true
async function takeRestartHandoff(): Promise<RestartHandoff | null> {
  if (!handoffPending) return null
  if (handoffInFlight !== null) {
    // Another open owns the handoff; wait so ordering stays serialized, but
    // do not hand the same handoff out twice. The in-flight promise never
    // rejects because failures are mapped to null below.
    await handoffInFlight
    return null
  }
  if (agentDir === undefined) {
    handoffPending = false
    return null
  }
  handoffInFlight = consumeRestartHandoff(agentDir).catch((err: unknown) => {
    const message = err instanceof Error ? err.message : String(err)
    logger.warn(`restart-handoff: failed to consume pending handoff: ${message}`)
    return null
  })
  const result = await handoffInFlight
  handoffPending = false
  handoffInFlight = null
  return result
}
261
+
262
// Reopens the session file named by a restart handoff.
//
// Returns null when no session factory is configured, or when
// `SessionManager.open` throws (the failure is logged as a warning). The path
// is the factory's session directory joined with the handoff's
// `originatingSessionFile` by a single `/`.
function resumeFromHandoff(handoff: RestartHandoff, factory: SessionFactory | undefined): SessionManager | null {
  if (factory === undefined) return null

  const sessionPath = factory.sessionDir() + '/' + handoff.originatingSessionFile
  try {
    return SessionManager.open(sessionPath)
  } catch (openError) {
    const detail = openError instanceof Error ? openError.message : String(openError)
    logger.warn(`restart-handoff: failed to reopen ${sessionPath}: ${detail}`)
    return null
  }
}
273
+
236
274
  const commandRunner: CommandRunner | undefined = commandRunnerFactory
237
275
  ? commandRunnerFactory({
238
276
  stdout(callId, chunk) {
@@ -397,7 +435,9 @@ export function createServer({
397
435
  if (rawWs.data.kind === 'inspect') return
398
436
  const ws = rawWs as Ws
399
437
  try {
400
- const sessionManager = sessionFactory?.createPersisted()
438
+ const handoff = await takeRestartHandoff()
439
+ const resumed = handoff !== null ? resumeFromHandoff(handoff, sessionFactory) : null
440
+ const sessionManager = resumed ?? sessionFactory?.createPersisted()
401
441
  const sessionFileId = sessionManager?.getSessionId() ?? ws.data.sessionId
402
442
  // Snapshot the runtime once so the entire session lifecycle for this
403
443
  // ws connection sees one consistent generation of registry+hooks. A
@@ -485,6 +525,24 @@ export function createServer({
485
525
  ...(runtimeVersion !== undefined ? { serverVersion: runtimeVersion } : {}),
486
526
  })
487
527
  console.log(`session ${sessionFileId}: open`)
528
+
529
+ // Fire the post-restart kick. The originator's JSONL already
530
+ // contains the `typeclaw.restart-self` custom message entry that
531
+ // the dying container appended (see subscribeRestartNotice in
532
+ // src/agent/index.ts). pi's buildSessionContext() hydrates that
533
+ // entry as a `role: "user"` LLM message on the next prompt, so
534
+ // a single-space kick is enough to trigger a turn — the entry's
535
+ // own text instructs the model to "briefly confirm the restart
536
+ // completed". Publish AFTER the session-target subscription is
537
+ // wired (state.unsubPrompts above) so the kick is enqueued, not
538
+ // dropped on the floor.
539
+ if (resumed !== null && stream) {
540
+ stream.publish({
541
+ target: { kind: 'session', sessionId: sessionFileId },
542
+ payload: { kind: 'prompt', text: ' ', delivery: 'queue' },
543
+ meta: { source: 'restart-handoff' },
544
+ })
545
+ }
488
546
  } catch (err) {
489
547
  const message = err instanceof Error ? err.message : String(err)
490
548
  console.error(`session ${ws.data.sessionId}: open failed: ${message}`)
@@ -28,7 +28,7 @@ export type TunnelRequestId = string
28
28
 
29
29
  export type TunnelSnapshot = {
30
30
  name: string
31
- provider: 'external' | 'cloudflare-quick'
31
+ provider: 'external' | 'cloudflare-quick' | 'cloudflare-named'
32
32
  for: { kind: 'channel'; name: string } | { kind: 'manual' }
33
33
  url: string | null
34
34
  status: 'stopped' | 'starting' | 'healthy' | 'unhealthy' | 'permanently-failed'