npm - typeclaw - Versions diffs - 0.9.2 → 0.11.0 - Mend

typeclaw 0.9.2 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

package/package.json +2 -2
package/src/agent/index.ts +46 -11
package/src/agent/restart-handoff/index.ts +91 -0
package/src/agent/restart-handoff/paths.ts +11 -0
package/src/agent/session-origin.ts +30 -10
package/src/agent/subagent-completion-reminder.ts +4 -2
package/src/agent/system-prompt.ts +1 -1
package/src/agent/tools/restart.ts +42 -1
package/src/agent/tools/skip-response.ts +157 -0
package/src/bundled-plugins/memory/README.md +18 -2
package/src/bundled-plugins/memory/index.ts +108 -6
package/src/bundled-plugins/memory/memory-logger.ts +33 -24
package/src/bundled-plugins/security/index.ts +19 -17
package/src/bundled-plugins/security/permissions.ts +9 -8
package/src/bundled-plugins/security/policies/cron-promotion.ts +26 -9
package/src/bundled-plugins/security/policies/git-exfil.ts +23 -15
package/src/bundled-plugins/security/policies/prompt-injection.ts +1 -1
package/src/bundled-plugins/security/policies/role-promotion.ts +25 -18
package/src/channels/adapters/github/auth-app.ts +53 -9
package/src/channels/adapters/github/auth-pat.ts +4 -1
package/src/channels/adapters/github/auth.ts +10 -0
package/src/channels/adapters/github/event-permissions.ts +83 -0
package/src/channels/adapters/github/inbound.ts +126 -1
package/src/channels/adapters/github/index.ts +60 -66
package/src/channels/adapters/github/outbound.ts +65 -17
package/src/channels/adapters/github/permission-guidance.ts +169 -0
package/src/channels/adapters/github/team-membership.ts +56 -0
package/src/channels/router.ts +313 -10
package/src/channels/schema.ts +22 -0
package/src/channels/types.ts +1 -1
package/src/cli/channel.ts +135 -38
package/src/cli/cron.ts +1 -1
package/src/cli/init.ts +133 -86
package/src/cli/inspect-controller.ts +66 -0
package/src/cli/inspect.ts +99 -14
package/src/cli/role.ts +2 -2
package/src/cli/run.ts +24 -5
package/src/cli/tui.ts +34 -10
package/src/cli/tunnel.ts +453 -14
package/src/config/config.ts +35 -7
package/src/config/providers.ts +82 -56
package/src/cron/bridge.ts +25 -4
package/src/hostd/daemon.ts +44 -24
package/src/hostd/portbroker-manager.ts +19 -3
package/src/init/dockerfile.ts +52 -0
package/src/init/env-file.ts +66 -0
package/src/init/gitignore.ts +8 -0
package/src/init/hatching.ts +32 -5
package/src/init/index.ts +131 -39
package/src/init/validate-api-key.ts +31 -0
package/src/inspect/index.ts +47 -6
package/src/inspect/loop.ts +31 -0
package/src/inspect/replay.ts +15 -1
package/src/permissions/builtins.ts +29 -21
package/src/permissions/permissions.ts +32 -5
package/src/role-claim/code.ts +9 -9
package/src/role-claim/controller.ts +3 -2
package/src/role-claim/match-rule.ts +14 -19
package/src/role-claim/pending.ts +2 -2
package/src/run/codex-fetch-observer.ts +377 -0
package/src/run/index.ts +12 -2
package/src/server/index.ts +59 -1
package/src/shared/protocol.ts +1 -1
package/src/skills/typeclaw-channel-github/SKILL.md +45 -1
package/src/skills/typeclaw-codex-cli/SKILL.md +1 -1
package/src/skills/typeclaw-codex-cli/references/auth-flow.md +14 -1
package/src/skills/typeclaw-config/SKILL.md +7 -1
package/src/skills/typeclaw-config/references/recommended-mounts.md +233 -0
package/src/skills/typeclaw-permissions/SKILL.md +24 -18
package/src/skills/typeclaw-tunnels/SKILL.md +33 -1
package/src/tui/index.ts +17 -5
package/src/tunnels/index.ts +1 -0
package/src/tunnels/manager.ts +18 -0
package/src/tunnels/providers/cloudflare-named.ts +224 -0
package/src/tunnels/types.ts +17 -1
package/typeclaw.schema.json +120 -7

package/src/permissions/permissions.ts CHANGED Viewed

@@ -152,6 +152,29 @@ export function createPermissionService(opts: CreatePermissionServiceOptions = {
   }
 }
+// Walk order: owner, trusted, custom roles (in REVERSE declaration order),
+// member, guest. First role whose `match[]` covers the origin wins.
+//
+// Built-in tower: owner > trusted > member > guest. Pinning the tower
+// ahead of any user-declared rule closes a load-bearing footgun in the
+// previous pure-declaration-order resolver: declaring
+// `member.match: ["*"]` before `owner.match: [...]` resolved every
+// channel session — INCLUDING the owner's — to `member`, because the
+// wildcard matched first. The rolePromotion guard then made it
+// un-fixable from inside the demoted session (a member-resolved speaker
+// cannot rewrite `roles` without a TUI-issued ack).
+//
+// Custom roles use REVERSE declaration order: later declarations override
+// earlier ones. This matches the standard "later config wins" mental
+// model — when an operator adds a new role with the same match-scope as
+// an existing one (or appends a new author-pinned override to an existing
+// broad rule), the newer entry takes precedence. The previous "earlier
+// wins" was an arbitrary consequence of map iteration order rather than
+// a deliberate semantic.
+//
+// Custom roles cannot self-promote above trusted (no inherent severity
+// guarantee) and cannot demote themselves below member (declaring a custom
+// role implies the operator wants it to win against bottom catch-alls).
 function buildRoleTable(
   roles: RolesConfig,
   pluginPermissions: readonly string[],
@@ -160,16 +183,20 @@ function buildRoleTable(
   const out: ResolvedRole[] = []
   const seen = new Set<string>()
-  for (const name of Object.keys(roles)) {
-    if (seen.has(name)) continue
+  const emit = (name: string): void => {
+    if (seen.has(name)) return
     seen.add(name)
     out.push(resolveOne(name, roles[name], pluginPermissions, ownerWildcardExclusions))
   }
-  for (const name of BUILTIN_ROLE_NAMES) {
-    if (seen.has(name)) continue
-    out.push(resolveOne(name, undefined, pluginPermissions, ownerWildcardExclusions))
+  emit('owner')
+  emit('trusted')
+  const customRoles = Object.keys(roles).filter((name) => !isBuiltinRoleName(name))
+  for (let i = customRoles.length - 1; i >= 0; i--) {
+    emit(customRoles[i]!)
   }
+  emit('member')
+  emit('guest')
   return out
 }

package/src/role-claim/code.ts CHANGED Viewed

@@ -1,17 +1,17 @@
 import { randomBytes } from 'node:crypto'
 // Role-claim codes are short, human-typeable tokens the operator sends from
-// their host CLI to the bot via a channel DM to prove ownership of that
-// channel identity. Shape: `claim-XXXX-YYYY` where each block is 4 chars
-// from a Crockford-style base32 alphabet (0-9 + A-Z minus I, L, O, U to
-// dodge OCR-confusable / profane shapes). 8 chars * 5 bits = 40 bits of
-// entropy, which is overkill for a TTL'd in-memory window but cheap to
-// display and dictate over voice.
+// their host CLI to the bot in any chat (DM, group, channel) to prove
+// ownership of that channel identity. Shape: `claim-XXXX-YYYY` where each
+// block is 4 chars from a Crockford-style base32 alphabet (0-9 + A-Z minus
+// I, L, O, U to dodge OCR-confusable / profane shapes). 8 chars * 5 bits =
+// 40 bits of entropy, which is overkill for a TTL'd in-memory window but
+// cheap to display and dictate over voice.
 //
 // The `claim-` prefix lets the channel router recognize potential claim
-// attempts in a DM body without scanning the whole text for hex blocks,
-// and distinguishes claim DMs from normal first-message text like "hi"
-// which would otherwise need a regex of its own to disambiguate.
+// attempts in inbound text without scanning the whole body for base32
+// blocks, and distinguishes claim messages from normal first-message text
+// like "hi" which would otherwise need a regex of its own to disambiguate.
 export const CLAIM_CODE_PREFIX = 'claim-'

package/src/role-claim/controller.ts CHANGED Viewed

@@ -10,8 +10,9 @@ import { createPendingClaimRegistry, type PendingClaim, type PendingClaimRegistr
 //
 //   1. The host CLI (typeclaw role claim) opens a WS and sends `claim_start`.
 //   2. The WS server forwards that to controller.startClaim().
-//   3. The channel router's claimHandler (also wired here) intercepts DMs
-//      bearing the code and calls controller.tryConsumeInbound().
+//   3. The channel router's claimHandler (also wired here) intercepts any
+//      inbound bearing the code (DM, group, or channel) and calls
+//      controller.tryConsumeInbound().
 //   4. On consume, the controller writes to typeclaw.json#roles.<role>.match
 //      via grantRole, then reloads the live PermissionService so the new
 //      match rule takes effect without a container restart.

package/src/role-claim/match-rule.ts CHANGED Viewed

@@ -1,15 +1,19 @@
 // Builds a canonical match-rule DSL string from an inbound channel origin,
-// for the role table. Output shapes:
+// for the role table. Output shape is always platform-wide + author:
 //
-//   slack:T0123 author:U_ALICE
-//   discord:9999 author:U_ALICE
-//   telegram:42 author:U_ALICE
-//   kakao:dm/<chatId> author:<authorId>
+//   slack:* author:<authorId>
+//   discord:* author:<authorId>
+//   telegram:* author:<authorId>
+//   kakao:* author:<authorId>
 //
-// The author qualifier is always emitted so a claim grants the specific
-// human, not the whole workspace. To grant the whole workspace, the
-// operator edits typeclaw.json by hand or runs a future `typeclaw role grant`
-// without --claim.
+// "Platform-wide" means every chat the adapter sees on that platform —
+// DMs, group chats, and threads alike — gated by the author qualifier so
+// only this specific human is matched. The intent is: once an operator
+// proves they control a channel identity (by sending a code to the bot),
+// they keep their role wherever they speak from on the same platform. To
+// scope tighter (e.g. one workspace, one chat), the operator edits
+// typeclaw.json by hand; the claim flow is deliberately broad because
+// re-claiming on every new chat would be tedious for the common case.
 import type { ChannelKey } from '@/channels/types'
@@ -31,14 +35,5 @@ const ADAPTER_TO_PLATFORM: Record<ChannelKey['adapter'], 'slack' | 'discord' | '
 export function formatClaimMatchRule(origin: PartialChannelOrigin): string {
   const platform = ADAPTER_TO_PLATFORM[origin.adapter]
-  const authorQual = ` author:${origin.authorId}`
-  if (origin.adapter === 'kakaotalk') {
-    // Kakao has no workspace; routes use dm/group/open buckets. We can't
-    // know which bucket from a partial origin alone (adapter-side classifies
-    // it), so claim flows are restricted to DM and we emit the specific
-    // chat-id form so the rule grants only this 1:1 conversation, not every
-    // DM the agent is in.
-    return `${platform}:dm/${origin.chat}${authorQual}`
-  }
-  return `${platform}:${origin.workspace}${authorQual}`
+  return `${platform}:* author:${origin.authorId}`
 }

package/src/role-claim/pending.ts CHANGED Viewed

@@ -21,8 +21,8 @@ export type PendingClaimRegistry = {
   cancel: (code: string) => boolean
   current: () => PendingClaim | null
   // Snapshot of consumption result without actually committing the grant.
-  // The router calls this on every DM-shaped inbound; the grant only fires
-  // when the result is 'consumed'.
+  // The router calls this on every claim-code-bearing inbound; the grant
+  // only fires when the result is 'consumed'.
   tryConsume: (
     code: string,
     origin: PartialChannelOrigin,

package/src/run/codex-fetch-observer.ts ADDED Viewed

@@ -0,0 +1,377 @@
+// Minimal logging sink used by the observer. `info` receives the
+// per-request `[codex-fetch]` timing lines; `warn` receives the notice
+// emitted when install is called while an observer is already installed.
+export type CodexFetchObserverLogger = {
+  info: (msg: string) => void
+  warn: (msg: string) => void
+}
+// Options accepted by `installCodexFetchObserver`. Every field is optional.
+// For the two deadlines the precedence is: explicit option, then the
+// matching environment variable, then the built-in default.
+export type CodexFetchObserverOptions = {
+  // Destination for observer output. Default: a logger that writes to
+  // `console.log` / `console.warn`.
+  logger?: CodexFetchObserverLogger
+  // Exact hostname a request URL must have to be observed. Default:
+  // `chatgpt.com`.
+  codexHost?: string
+  // Clock used for every elapsed-time field in the log line. Default:
+  // `Date.now`.
+  now?: () => number
+  // Override the default pre-headers (TTFB) deadline applied to the outer
+  // fetch(). When the codex backend silently holds a request without sending
+  // response headers, this is the timer that releases the request so
+  // `pi-coding-agent`'s `_isRetryableError` can retry. Default: 15_000 ms.
+  //
+  // Healthy Codex turns return response headers within ~1s (observed
+  // production p50: ~860ms). The first SSE event (`response.created`) is
+  // emitted before any model work begins and arrives within ~50ms of
+  // headers. Pathological-but-healthy upper bounds: TLS handshake on a cold
+  // connection (~2s), prompt-prefill on a cache miss with large input
+  // (~3s), Cloudflare PoP routing slowness (~2s) — sum ~7s. 15s is ~2x
+  // that, so anything past it is almost certainly the silent-hang failure
+  // mode rather than a real request making progress. False-positive cost
+  // is one retry (~5s extra); false-negative cost is the full Bun socket
+  // deadline (~268s). Aggressive wins.
+  ttfbMs?: number
+  // Override the sliding inter-chunk idle deadline applied to the SSE body
+  // reader. Resets on every chunk; if no bytes arrive within this window the
+  // body stream errors. Default: 300_000 ms, matches `openai/codex`'s Rust CLI
+  // `DEFAULT_STREAM_IDLE_TIMEOUT_MS`. Set to 0 to disable just this timer.
+  idleMs?: number
+  // Schedule fn for tests. Receives (delayMs, callback) and returns a handle
+  // the wrapper can pass to `clear`. Default: `setTimeout`/`clearTimeout`.
+  scheduler?: TimeoutScheduler
+}
+// Pluggable timer pair. `set` schedules `cb` to run after `delayMs` and
+// returns an opaque handle; `clear` cancels the timer identified by a handle
+// previously returned from `set`.
+export type TimeoutScheduler = {
+  set: (delayMs: number, cb: () => void) => unknown
+  clear: (handle: unknown) => void
+}
+// Hostname matched against the request URL when no `codexHost` is supplied.
+const DEFAULT_CODEX_HOST = 'chatgpt.com'
+// Substring the request pathname must contain for a request to be observed.
+const CODEX_PATH_FRAGMENT = '/codex/responses'
+// Setting this variable to `off` turns install into a no-op.
+const ENV_DISABLE_OBSERVER = 'TYPECLAW_CODEX_FETCH_OBSERVER'
+// Setting this variable to `off` forces both deadlines to 0 (disabled).
+const ENV_DISABLE_TIMEOUTS = 'TYPECLAW_CODEX_TIMEOUTS'
+// Environment overrides for the two deadlines, in milliseconds.
+const ENV_TTFB_MS = 'TYPECLAW_CODEX_TTFB_MS'
+const ENV_IDLE_MS = 'TYPECLAW_CODEX_IDLE_MS'
+// Deadlines used when neither an option nor an environment value is given.
+const DEFAULT_TTFB_MS = 15_000
+const DEFAULT_IDLE_MS = 300_000
+// Tag that starts every line the observer logs.
+const LOG_PREFIX = '[codex-fetch]'
+// Real-timer scheduler backed by `setTimeout` / `clearTimeout`.
+const defaultScheduler: TimeoutScheduler = {
+  set: (delayMs, cb) => setTimeout(cb, delayMs),
+  clear: (handle) => clearTimeout(handle as ReturnType<typeof setTimeout>),
+}
+// Fallback logger that writes to the console.
+const consoleLogger: CodexFetchObserverLogger = {
+  info: (m) => console.log(m),
+  warn: (m) => console.warn(m),
+}
+// Bookkeeping for the currently installed wrapper: the fetch that was in
+// place before install, and the function that restores it.
+type InstallState = {
+  originalFetch: typeof fetch
+  uninstall: () => void
+}
+// Non-null while a wrapper is installed; used to reject a second install.
+let installed: InstallState | null = null
+// Decides whether a fetch call targets the Codex Responses endpoint and
+// should therefore be instrumented. A call qualifies only when all three
+// hold: the effective method is POST (taken from `init`, else from a
+// `Request` input, else assumed GET), the URL's hostname equals `codexHost`
+// exactly, and the pathname contains the Codex path fragment. Anything
+// else — including other methods on the same host and unparseable URLs — is
+// passed through untouched.
+function shouldObserve(input: RequestInfo | URL, init: RequestInit | undefined, codexHost: string): boolean {
+  const requestMethod = input instanceof Request ? input.method : 'GET'
+  const effectiveMethod = init?.method ?? requestMethod
+  if (effectiveMethod.toUpperCase() !== 'POST') return false
+  const target = typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url
+  try {
+    const { hostname, pathname } = new URL(target)
+    return hostname === codexHost && pathname.includes(CODEX_PATH_FRAGMENT)
+  } catch {
+    // Not an absolute URL we can parse — never a Codex call.
+    return false
+  }
+}
+// Renders a nullable string as a double-quoted log field; `null` becomes the
+// bare word `null`. Backslashes, double quotes, and CR/LF are escaped so the
+// value can never terminate the quoted field early or split the log entry
+// across lines (error messages routinely contain both). Strings without
+// those characters render exactly as before.
+function quote(value: string | null): string {
+  if (value === null) return 'null'
+  const escaped = value
+    // Backslash first, so the escapes added below are not themselves doubled.
+    .replace(/\\/g, '\\\\')
+    .replace(/"/g, '\\"')
+    .replace(/\r/g, '\\r')
+    .replace(/\n/g, '\\n')
+  return `"${escaped}"`
+}
+// Builds the single space-separated `key=value` line logged for one observed
+// request. Field order is fixed. Missing (null) values are written as the
+// bare word `null`; only `error` goes through `quote`, every other field is
+// written raw.
+function formatLine(fields: {
+  status: number | null
+  headersMs: number | null
+  firstByteMs: number | null
+  totalMs: number
+  bodyBytes: number
+  retryAfter: string | null
+  requestId: string | null
+  error: string | null
+  cause: string | null
+}): string {
+  const orNull = (value: number | string | null): string => (value === null ? 'null' : String(value))
+  const parts: string[] = [LOG_PREFIX]
+  parts.push(`status=${orNull(fields.status)}`)
+  parts.push(`headers_ms=${orNull(fields.headersMs)}`)
+  parts.push(`first_byte_ms=${orNull(fields.firstByteMs)}`)
+  parts.push(`total_ms=${fields.totalMs}`)
+  parts.push(`body_bytes=${fields.bodyBytes}`)
+  parts.push(`retry_after=${orNull(fields.retryAfter)}`)
+  parts.push(`request_id=${orNull(fields.requestId)}`)
+  parts.push(`error=${quote(fields.error)}`)
+  parts.push(`cause=${orNull(fields.cause)}`)
+  return parts.join(' ')
+}
+// Reads a millisecond duration from the environment variable `name`.
+//
+// Returns `fallback` when the variable is unset, blank, or anything other
+// than a plain non-negative decimal integer. `Number.parseInt` on its own
+// accepts trailing garbage ("15s" -> 15, "1e3" -> 1), which would silently
+// turn a typo into a very different deadline, so the text is validated
+// before parsing.
+//
+// The result is capped at 2_147_483_647: Node documents that `setTimeout`
+// replaces any larger delay with 1ms, so an oversized value would make the
+// timer fire almost immediately instead of "practically never".
+function readEnvMs(name: string, fallback: number): number {
+  const raw = process.env[name]?.trim()
+  if (raw === undefined || raw === '') return fallback
+  if (!/^\+?\d+$/.test(raw)) return fallback
+  const parsed = Number.parseInt(raw, 10)
+  if (!Number.isFinite(parsed)) return fallback
+  return Math.min(parsed, 2_147_483_647)
+}
+// Settings the body tap needs: the idle deadline in milliseconds (a value of
+// 0 or less disables the idle timer) and the scheduler used to arm it.
+type BodyTapConfig = {
+  idleMs: number
+  scheduler: TimeoutScheduler
+}
+// Wraps `response` so its body is measured, and guarded by an idle deadline,
+// while it streams.
+//
+// Returns a new `Response` carrying the same status, statusText and headers
+// whose body re-emits every chunk of the original body. Exactly one
+// `[codex-fetch]` line is logged per response: at clean end of stream, on a
+// read error, on idle timeout (`cause=idle_timeout`), or when the consumer
+// cancels. A response with no body is logged immediately and returned as-is.
+//
+// Whenever the wrapper stops reading early (idle timeout, read error, or
+// consumer cancel) it cancels the upstream body so the underlying request is
+// released rather than left open with nobody reading it.
+//
+//   response   - the response produced by the underlying fetch
+//   start      - `now()` value captured before the request was issued
+//   headersMs  - elapsed ms until response headers arrived
+//   status, retryAfter, requestId - values copied verbatim into the log line
+//   now        - clock used for the elapsed-time fields
+//   logger     - sink for the log line
+//   config     - idle deadline and the scheduler that arms it
+function attachBodyTimingTap(
+  response: Response,
+  start: number,
+  headersMs: number,
+  status: number,
+  retryAfter: string | null,
+  requestId: string | null,
+  now: () => number,
+  logger: CodexFetchObserverLogger,
+  config: BodyTapConfig,
+): Response {
+  if (response.body === null) {
+    logger.info(
+      formatLine({
+        status,
+        headersMs,
+        firstByteMs: null,
+        totalMs: now() - start,
+        bodyBytes: 0,
+        retryAfter,
+        requestId,
+        error: null,
+        cause: null,
+      }),
+    )
+    return response
+  }
+  let firstByteMs: number | null = null
+  let bodyBytes = 0
+  let settled = false
+  let cause: string | null = null
+  // Logs the final line once; later calls are ignored so competing end-of-
+  // stream paths (flush, error, cancel) cannot double-log.
+  const settle = (error: string | null) => {
+    if (settled) return
+    settled = true
+    logger.info(
+      formatLine({
+        status,
+        headersMs,
+        firstByteMs,
+        totalMs: now() - start,
+        bodyBytes,
+        retryAfter,
+        requestId,
+        error,
+        cause,
+      }),
+    )
+  }
+  // Pass-through transform that records first-byte time and byte count.
+  const tap = new TransformStream<Uint8Array, Uint8Array>({
+    transform(chunk, controller) {
+      if (firstByteMs === null) firstByteMs = now() - start
+      bodyBytes += chunk.byteLength
+      controller.enqueue(chunk)
+    },
+    flush() {
+      settle(null)
+    },
+  })
+  const piped = response.body.pipeThrough(tap, { preventCancel: false })
+  const idleController = config.idleMs > 0 ? new AbortController() : null
+  let idleHandle: unknown = null
+  // (Re)starts the sliding idle deadline; a no-op when the timer is disabled.
+  const armIdleTimer = () => {
+    if (idleController === null) return
+    if (idleHandle !== null) config.scheduler.clear(idleHandle)
+    idleHandle = config.scheduler.set(config.idleMs, () => {
+      cause = 'idle_timeout'
+      idleController.abort(new Error(`Codex SSE body idle for ${config.idleMs}ms (typeclaw observer timeout)`))
+    })
+  }
+  const disarmIdleTimer = () => {
+    if (idleHandle !== null) {
+      config.scheduler.clear(idleHandle)
+      idleHandle = null
+    }
+  }
+  // Shared between `start` and `cancel` so a consumer-side cancel can reach
+  // the upstream body. `start` assigns the reader synchronously, before its
+  // first await, so it is set by the time `cancel` can run.
+  let upstreamReader: ReadableStreamDefaultReader<Uint8Array> | null = null
+  let consumerCancelled = false
+  // The idle abort listener is installed exactly once for the lifetime of the
+  // stream and removed in `finally`. Earlier shapes constructed a fresh
+  // `Promise.race` listener per chunk; if `reader.read()` won the race, the
+  // listener was never removed and closures accumulated on the signal across a
+  // long stream. Keeping one shared abort promise bounds the listener count to
+  // 1 regardless of chunk count.
+  const observerBody = new ReadableStream<Uint8Array>({
+    async start(controller) {
+      const reader = piped.getReader()
+      upstreamReader = reader
+      armIdleTimer()
+      let abortFired = false
+      let onAbort: (() => void) | null = null
+      const abortPromise = idleController
+        ? new Promise<never>((_, reject) => {
+            onAbort = () => {
+              abortFired = true
+              reject(idleController.signal.reason ?? new Error('idle timeout'))
+            }
+            if (idleController.signal.aborted) onAbort()
+            else idleController.signal.addEventListener('abort', onAbort, { once: true })
+          })
+        : null
+      // Swallow the shared rejection if no race ever observes it (clean stream
+      // end before any timeout). Without this, an aborted-after-close path
+      // could surface as an unhandled rejection on the runtime.
+      abortPromise?.catch(() => {})
+      try {
+        while (true) {
+          const readPromise = reader.read()
+          const result = abortPromise ? await Promise.race([readPromise, abortPromise]) : await readPromise
+          // The consumer cancelled while this read was pending. `cancel` has
+          // already logged and cancelled upstream; nothing is left to forward.
+          if (consumerCancelled) return
+          // The read won the race but the deadline fired in the same window.
+          if (abortFired) throw idleController!.signal.reason
+          const { done, value } = result
+          if (done) {
+            disarmIdleTimer()
+            controller.close()
+            return
+          }
+          armIdleTimer()
+          controller.enqueue(value)
+        }
+      } catch (err) {
+        disarmIdleTimer()
+        if (consumerCancelled) return
+        // An idle timeout normally lands here directly, because the rejected
+        // race throws out of the `await` above. Cancel upstream so the stalled
+        // body is actually released; merely dropping the reader lock would
+        // leave it open. Cancelling an already-errored stream rejects, hence
+        // the catch.
+        reader.cancel(err).catch(() => {})
+        const message = err instanceof Error ? err.message : String(err)
+        settle(message)
+        controller.error(err)
+      } finally {
+        if (onAbort !== null && idleController !== null && !idleController.signal.aborted) {
+          idleController.signal.removeEventListener('abort', onAbort)
+        }
+        upstreamReader = null
+        reader.releaseLock()
+      }
+    },
+    cancel(reason) {
+      consumerCancelled = true
+      disarmIdleTimer()
+      const message = reason === undefined ? 'cancelled' : reason instanceof Error ? reason.message : String(reason)
+      settle(message)
+      // Propagate the cancel upstream; this also resolves the read that
+      // `start` is currently awaiting, letting its loop exit and release the
+      // lock.
+      upstreamReader?.cancel(reason).catch(() => {})
+    },
+  })
+  return new Response(observerBody, {
+    status: response.status,
+    statusText: response.statusText,
+    headers: response.headers,
+  })
+}
+// Replaces `globalThis.fetch` with a wrapper that instruments Codex Responses
+// calls and returns a function that restores the previous fetch.
+//
+// Requests that `shouldObserve` rejects are forwarded untouched. Matched
+// requests get:
+//   - a pre-headers (TTFB) deadline that aborts the request when no response
+//     headers arrive in time (skipped when the deadline is 0), and
+//   - the body tap from `attachBodyTimingTap`, which logs one timing line and
+//     enforces the idle deadline.
+//
+// Install is a no-op (returning a no-op uninstaller) when
+// TYPECLAW_CODEX_FETCH_OBSERVER is `off`. A second install while one is
+// active logs a warning and returns the existing uninstaller. Both deadlines
+// are forced to 0 when TYPECLAW_CODEX_TIMEOUTS is `off`; otherwise each comes
+// from `opts`, then the environment, then the default.
+export function installCodexFetchObserver(opts: CodexFetchObserverOptions = {}): () => void {
+  if (process.env[ENV_DISABLE_OBSERVER] === 'off') {
+    return () => {}
+  }
+  const logger = opts.logger ?? consoleLogger
+  if (installed !== null) {
+    logger.warn(`${LOG_PREFIX} install called but observer already installed; ignoring`)
+    return installed.uninstall
+  }
+  const codexHost = opts.codexHost ?? DEFAULT_CODEX_HOST
+  const now = opts.now ?? Date.now
+  const scheduler = opts.scheduler ?? defaultScheduler
+  const timeoutsEnabled = process.env[ENV_DISABLE_TIMEOUTS] !== 'off'
+  const ttfbMs = timeoutsEnabled ? (opts.ttfbMs ?? readEnvMs(ENV_TTFB_MS, DEFAULT_TTFB_MS)) : 0
+  const idleMs = timeoutsEnabled ? (opts.idleMs ?? readEnvMs(ENV_IDLE_MS, DEFAULT_IDLE_MS)) : 0
+  const originalFetch = globalThis.fetch
+  const wrappedImpl = async (
+    input: Parameters<typeof fetch>[0],
+    init?: Parameters<typeof fetch>[1],
+  ): Promise<Response> => {
+    if (!shouldObserve(input, init, codexHost)) {
+      return originalFetch(input, init)
+    }
+    const start = now()
+    let ttfbCause: 'ttfb_timeout' | null = null
+    let ttfbHandle: unknown = null
+    let initWithSignal: RequestInit | undefined = init
+    if (ttfbMs > 0) {
+      const ttfbController = new AbortController()
+      ttfbHandle = scheduler.set(ttfbMs, () => {
+        ttfbCause = 'ttfb_timeout'
+        ttfbController.abort(
+          new Error(`Codex fetch timed out before response headers after ${ttfbMs}ms (typeclaw observer timeout)`),
+        )
+      })
+      // A `signal` in `init` overrides the one carried by a `Request` input,
+      // so injecting only the TTFB signal would silently drop a caller's
+      // abort wiring that lives on the Request. Combine with whichever signal
+      // the caller supplied: `init.signal` first, else the Request's own.
+      const callerSignal = init?.signal ?? (input instanceof Request ? input.signal : null)
+      const signal = callerSignal ? AbortSignal.any([callerSignal, ttfbController.signal]) : ttfbController.signal
+      initWithSignal = { ...init, signal }
+    }
+    let response: Response
+    try {
+      response = await originalFetch(input, initWithSignal)
+    } catch (err) {
+      if (ttfbHandle !== null) scheduler.clear(ttfbHandle)
+      // When our own deadline fired, surface a descriptive timeout error
+      // instead of the runtime's generic abort error.
+      const isTtfbAbort = ttfbCause === 'ttfb_timeout'
+      const surfacedError = isTtfbAbort
+        ? new Error(`Codex fetch timed out before response headers after ${ttfbMs}ms (typeclaw observer timeout)`)
+        : err
+      const message = surfacedError instanceof Error ? surfacedError.message : String(surfacedError)
+      logger.info(
+        formatLine({
+          status: null,
+          headersMs: null,
+          firstByteMs: null,
+          totalMs: now() - start,
+          bodyBytes: 0,
+          retryAfter: null,
+          requestId: null,
+          error: message,
+          cause: ttfbCause,
+        }),
+      )
+      throw surfacedError
+    }
+    // Headers arrived: the TTFB deadline no longer applies to this request.
+    if (ttfbHandle !== null) scheduler.clear(ttfbHandle)
+    const headersMs = now() - start
+    const retryAfter = response.headers.get('retry-after')
+    const requestId = response.headers.get('x-request-id')
+    return attachBodyTimingTap(response, start, headersMs, response.status, retryAfter, requestId, now, logger, {
+      idleMs,
+      scheduler,
+    })
+  }
+  // Preserve any static methods Bun attaches to `globalThis.fetch` (e.g.
+  // `preconnect`) so the wrapper is a drop-in replacement.
+  const wrapped = Object.assign(wrappedImpl, {
+    preconnect: (originalFetch as { preconnect?: (url: string) => void }).preconnect ?? (() => {}),
+  }) as typeof fetch
+  globalThis.fetch = wrapped
+  const uninstall = () => {
+    if (installed === null) return
+    // Only restore when our wrapper is still the active fetch; if something
+    // else replaced it since, leave that replacement in place.
+    if (globalThis.fetch === wrapped) {
+      globalThis.fetch = originalFetch
+    }
+    installed = null
+  }
+  installed = { originalFetch, uninstall }
+  return uninstall
+}

package/src/run/index.ts CHANGED Viewed

@@ -59,11 +59,12 @@ import { createTunnelManager, type TunnelManager, type TunnelManagerOptions } fr
 import { BUNDLED_PLUGINS } from './bundled-plugins'
 import { buildChannelSessionFactory } from './channel-session-factory'
+import { installCodexFetchObserver } from './codex-fetch-observer'
 import { createPluginRuntime, type PluginRuntime, type PluginSubagentEntry } from './plugin-runtime'
 type BunServer = ReturnType<Server['start']>
-export type TuiFactory = (options: TuiOptions) => { run: () => Promise<void> }
+export type TuiFactory = (options: TuiOptions) => { run: () => Promise<unknown> }
 export type LoadCronFn = (agentDir: string, options?: { subagents?: SubagentRegistry }) => Promise<LoadCronResult>
 export type SchedulerFactory = (options: { cwd: string; file: CronFile; onFire: (job: CronJob) => void }) => Scheduler
@@ -86,7 +87,7 @@ export type StartAgentOptions = {
 export type StartAgentResult = {
   server: BunServer
-  tuiPromise: Promise<void> | null
+  tuiPromise: Promise<unknown> | null
   scheduler: Scheduler | null
   cronConsumer: CronConsumer | null
   subagentConsumer: SubagentConsumer
@@ -113,6 +114,14 @@ export async function startAgent({
 }: StartAgentOptions): Promise<StartAgentResult> {
   const reloadRegistry = new ReloadRegistry()
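+  // Wrap globalThis.fetch BEFORE any plugin/session/manager construction so
+  // every Codex Responses call from anywhere in the container is observed.
+  // Logs one `[codex-fetch]` line per matched request with phase timings
+  // while we investigate the recurring multi-minute Codex stalls (see issue
+  // #394). The observer never retries, but it is not purely passive: by
+  // default it aborts a request that returns no response headers within the
+  // TTFB deadline and errors a body stream that stays idle past the idle
+  // deadline. Set TYPECLAW_CODEX_TIMEOUTS=off to disable both timers, or
+  // TYPECLAW_CODEX_FETCH_OBSERVER=off to skip the wrapper entirely.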
+  const uninstallCodexFetchObserver = installCodexFetchObserver()
   // The host CLI sets TYPECLAW_CONTAINER_NAME when it `docker run`s us. When
   // running outside a typeclaw container (tests, ad-hoc `bun run typeclaw run`
   // outside docker), the env var is absent and the `restart` tool is omitted —
@@ -585,6 +594,7 @@ export async function startAgent({
     subagentCompletionBridge.stop()
     await tunnelManager.stop()
     await channelManager.stop()
+    uninstallCodexFetchObserver()
   }
   if (!attachTui) {

package/src/server/index.ts CHANGED Viewed

@@ -1,3 +1,4 @@
+import { SessionManager } from '@mariozechner/pi-coding-agent'
 import type { Server as BunServer, ServerWebSocket } from 'bun'
 import {
@@ -10,6 +11,7 @@ import { runPluginDoctorChecks, runPluginDoctorFix } from '@/agent/doctor'
 import type { LiveSessionRegistry } from '@/agent/live-sessions'
 import type { LiveSubagentRegistry } from '@/agent/live-subagents'
 import { detectProviderError } from '@/agent/provider-error'
+import { consumeRestartHandoff, type RestartHandoff } from '@/agent/restart-handoff'
 import type { SessionOrigin } from '@/agent/session-origin'
 import { parseSubagentCompletedPayload, renderSubagentCompletionReminder } from '@/agent/subagent-completion-reminder'
 import type { CreateSessionForSubagent } from '@/agent/subagents'
@@ -233,6 +235,42 @@ export function createServer({
 }: ServerOptions) {
   const sessionStates = new WeakMap<Ws, SessionState>()
   const callIdToWs = new Map<string, AnyOwnerWs>()
+  // The first TUI WS open per container lifetime checks for
+  // `.typeclaw/restart-pending.json`; subsequent opens see null. The
+  // in-flight promise serializes concurrent first-opens: only the caller
+  // that started the consume() receives the handoff. Any caller that arrives
+  // while it is still in flight waits for it to finish and then gets null,
+  // so two TUIs reconnecting at the same instant cannot both reopen the
+  // originator's JSONL. Once the promise resolves, later opens see
+  // `handoffPending === false` and return null without checking the file.
+  // A failing consume() is logged and treated as "no handoff".
+  let handoffInFlight: Promise<RestartHandoff | null> | null = null
+  let handoffPending = true
+  async function takeRestartHandoff(): Promise<RestartHandoff | null> {
+    if (!handoffPending) return null
+    if (handoffInFlight !== null) {
+      // Someone else owns the handoff; wait so ordering is preserved, but do
+      // not hand the same resume target to a second session.
+      await handoffInFlight
+      return null
+    }
+    if (agentDir === undefined) {
+      handoffPending = false
+      return null
+    }
+    handoffInFlight = consumeRestartHandoff(agentDir).catch((err: unknown) => {
+      const message = err instanceof Error ? err.message : String(err)
+      logger.warn(`restart-handoff: failed to consume pending handoff: ${message}`)
+      return null
+    })
+    const result = await handoffInFlight
+    handoffPending = false
+    handoffInFlight = null
+    return result
+  }
+  // Reopens the session file named by `handoff` inside the factory's session
+  // directory. Yields null when no factory is configured or when opening the
+  // file throws; the failure is logged as a warning rather than propagated.
+  function resumeFromHandoff(handoff: RestartHandoff, factory: SessionFactory | undefined): SessionManager | null {
+    let reopened: SessionManager | null = null
+    if (factory !== undefined) {
+      const sessionPath = `${factory.sessionDir()}/${handoff.originatingSessionFile}`
+      try {
+        reopened = SessionManager.open(sessionPath)
+      } catch (failure) {
+        const message = failure instanceof Error ? failure.message : String(failure)
+        logger.warn(`restart-handoff: failed to reopen ${sessionPath}: ${message}`)
+      }
+    }
+    return reopened
+  }
   const commandRunner: CommandRunner | undefined = commandRunnerFactory
     ? commandRunnerFactory({
         stdout(callId, chunk) {
@@ -397,7 +435,9 @@ export function createServer({
           if (rawWs.data.kind === 'inspect') return
           const ws = rawWs as Ws
           try {
-            const sessionManager = sessionFactory?.createPersisted()
+            const handoff = await takeRestartHandoff()
+            const resumed = handoff !== null ? resumeFromHandoff(handoff, sessionFactory) : null
+            const sessionManager = resumed ?? sessionFactory?.createPersisted()
             const sessionFileId = sessionManager?.getSessionId() ?? ws.data.sessionId
             // Snapshot the runtime once so the entire session lifecycle for this
             // ws connection sees one consistent generation of registry+hooks. A
@@ -485,6 +525,24 @@ export function createServer({
               ...(runtimeVersion !== undefined ? { serverVersion: runtimeVersion } : {}),
             })
             console.log(`session ${sessionFileId}: open`)
+            // Fire the post-restart kick. The originator's JSONL already
+            // contains the `typeclaw.restart-self` custom message entry that
+            // the dying container appended (see subscribeRestartNotice in
+            // src/agent/index.ts). pi's buildSessionContext() hydrates that
+            // entry as a `role: "user"` LLM message on the next prompt, so
+            // a single-space kick is enough to trigger a turn — the entry's
+            // own text instructs the model to "briefly confirm the restart
+            // completed". Publish AFTER the session-target subscription is
+            // wired (state.unsubPrompts above) so the kick is enqueued, not
+            // dropped on the floor.
+            if (resumed !== null && stream) {
+              stream.publish({
+                target: { kind: 'session', sessionId: sessionFileId },
+                payload: { kind: 'prompt', text: ' ', delivery: 'queue' },
+                meta: { source: 'restart-handoff' },
+              })
+            }
           } catch (err) {
             const message = err instanceof Error ? err.message : String(err)
             console.error(`session ${ws.data.sessionId}: open failed: ${message}`)

package/src/shared/protocol.ts CHANGED Viewed

@@ -28,7 +28,7 @@ export type TunnelRequestId = string
 export type TunnelSnapshot = {
   name: string
-  provider: 'external' | 'cloudflare-quick'
+  provider: 'external' | 'cloudflare-quick' | 'cloudflare-named'
   for: { kind: 'channel'; name: string } | { kind: 'manual' }
   url: string | null
   status: 'stopped' | 'starting' | 'healthy' | 'unhealthy' | 'permanently-failed'