npm - agim-cli - Versions diffs - 1.1.1 → 1.1.2 - Mend

agim-cli 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/CHANGELOG.md +101 -0
package/README.md +7 -1
package/README.zh-CN.md +7 -1
package/dist/cli.js +265 -30
package/dist/cli.js.map +1 -1
package/dist/core/a2a.d.ts +36 -0
package/dist/core/a2a.d.ts.map +1 -0
package/dist/core/a2a.js +203 -0
package/dist/core/a2a.js.map +1 -0
package/dist/core/approval-bus.d.ts +40 -0
package/dist/core/approval-bus.d.ts.map +1 -1
package/dist/core/approval-bus.js +106 -0
package/dist/core/approval-bus.js.map +1 -1
package/dist/core/approval-router.d.ts.map +1 -1
package/dist/core/approval-router.js +31 -2
package/dist/core/approval-router.js.map +1 -1
package/dist/core/commands/a2a.d.ts +3 -0
package/dist/core/commands/a2a.d.ts.map +1 -0
package/dist/core/commands/a2a.js +148 -0
package/dist/core/commands/a2a.js.map +1 -0
package/dist/core/commands/job.d.ts.map +1 -1
package/dist/core/commands/job.js +11 -2
package/dist/core/commands/job.js.map +1 -1
package/dist/core/commands/outbox.d.ts +3 -0
package/dist/core/commands/outbox.d.ts.map +1 -0
package/dist/core/commands/outbox.js +92 -0
package/dist/core/commands/outbox.js.map +1 -0
package/dist/core/job-board.d.ts +122 -1
package/dist/core/job-board.d.ts.map +1 -1
package/dist/core/job-board.js +404 -21
package/dist/core/job-board.js.map +1 -1
package/dist/core/job-recovery.d.ts +48 -0
package/dist/core/job-recovery.d.ts.map +1 -0
package/dist/core/job-recovery.js +185 -0
package/dist/core/job-recovery.js.map +1 -0
package/dist/core/message-sink.d.ts +63 -0
package/dist/core/message-sink.d.ts.map +1 -0
package/dist/core/message-sink.js +296 -0
package/dist/core/message-sink.js.map +1 -0
package/dist/core/outbox.d.ts +71 -0
package/dist/core/outbox.d.ts.map +1 -0
package/dist/core/outbox.js +301 -0
package/dist/core/outbox.js.map +1 -0
package/dist/core/reminders.d.ts.map +1 -1
package/dist/core/reminders.js +12 -1
package/dist/core/reminders.js.map +1 -1
package/dist/core/restart-completion.d.ts.map +1 -1
package/dist/core/restart-completion.js +18 -1
package/dist/core/restart-completion.js.map +1 -1
package/dist/core/router.d.ts +8 -0
package/dist/core/router.d.ts.map +1 -1
package/dist/core/router.js +16 -0
package/dist/core/router.js.map +1 -1
package/dist/core/types.d.ts +22 -0
package/dist/core/types.d.ts.map +1 -1
package/dist/plugins/agents/claude-code/index.d.ts.map +1 -1
package/dist/plugins/agents/claude-code/index.js +5 -0
package/dist/plugins/agents/claude-code/index.js.map +1 -1
package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts +21 -0
package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts.map +1 -1
package/dist/plugins/agents/claude-code/mcp-approval-server.js +106 -0
package/dist/plugins/agents/claude-code/mcp-approval-server.js.map +1 -1
package/dist/plugins/agents/codex/index.d.ts.map +1 -1
package/dist/plugins/agents/codex/index.js +5 -0
package/dist/plugins/agents/codex/index.js.map +1 -1
package/dist/plugins/agents/opencode/opencode-stdio-adapter.d.ts.map +1 -1
package/dist/plugins/agents/opencode/opencode-stdio-adapter.js +5 -0
package/dist/plugins/agents/opencode/opencode-stdio-adapter.js.map +1 -1
package/dist/web/server.d.ts.map +1 -1
package/dist/web/server.js +28 -16
package/dist/web/server.js.map +1 -1
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,107 @@ All notable changes to this project will be documented in this file.
 ## [Unreleased]
+## [1.1.2] - 2026-05-15
+### Added — Task recovery (3-phase, all on by default)
+- **Outbox-backed IM delivery.** Every outbound message (replies, reminder
+  fires, approval prompts/receipts, restart-completion notices, web
+  notifications) now flows through a SQLite-backed outbox in
+  `~/.agim/outbox.db`. A worker drains pending rows with exponential
+  backoff (1s → 5s → 30s → 5min → 30min → 2h → `giving_up`). Replies
+  survive IM platform glitches, brief disconnects, and adapter restarts
+  — no more "agent answered but the user never got it" black holes.
+  - `/outbox status` aggregate counts.
+  - `/outbox list [N]` recent rows.
+  - `/outbox failed` rows in `giving_up`.
+  - `/outbox retry <id>` resurrect a `giving_up` row.
+  - Retention: 24h after `delivered` / `giving_up`. Override with
+    `IMHUB_OUTBOX_RETENTION_HOURS`.
+- **Inline-job tracking.** Every inbound agent-bound message now creates
+  a row in `jobs.db` with `kind='inline'`, going through the full
+  lifecycle `pending → running → completed → delivered` (or `failed`).
+  The state machine survives crashes; this is the substrate Phase 3
+  recovery and A2A both rely on.
+  - Visible in `/job list` and per-row in `/job check <id>`.
+  - Retention: 24h (vs 30d for `kind='job'` rows from explicit `/job
+    create`). Override with `IMHUB_INLINE_JOB_RETENTION_HOURS`.
+  - Kill switch: `IMHUB_INLINE_JOB_TRACKING=0` disables creation; the
+    user reply path falls back to the unchanged in-memory pipeline.
+- **Crash-recovery flow.** When agim exits (SIGTERM/SIGINT/crash), any
+  in-flight inline job is stamped `interrupted`. On startup, jobs
+  interrupted within the last 10 minutes get a per-thread retry prompt
+  through the outbox: `⚠️ 上次的消息被服务重启中断了：「<prompt 摘要>…」
+  回复 1 重发 / 2 取消（10 分钟内有效）`.
+  - Reply `1` → fresh inline job with the original prompt, old row
+    stamped `replaced_by=<new id>`.
+  - Reply `2` → old row marked `cancelled`.
+  - No reply → old row swept to `abandoned` by the next startup.
+  - Window override: `IMHUB_RECOVERY_WINDOW_MS` (default 600000).
+### Added — Agent-to-Agent (A2A) Layer 1
+- **`mcp__imhub__call_agent` MCP tool.** A running agent (claude-code /
+  opencode / codex / copilot) can now program-call another agent and
+  wait for the result inside its own tool loop. The tool description
+  uses imperative + Chinese trigger phrases so models invoke it
+  naturally when the user says e.g. "用 codex 帮我跑 git status" or
+  "ask opencode to run the tests".
+- **Guardrails enforced inside agim, not on the model**:
+  - `IMHUB_A2A_MAX_DEPTH` (default 3) — caps nested call chains.
+  - Self-call ban — `claude-code` can't recursively call `claude-code`.
+  - Workspace whitelist — callee must be in the caller's workspace
+    `agents[]`. Caller's `userId` carries through.
+  - `IMHUB_A2A_TIMEOUT_DEFAULT_MS` (default 600_000) — accumulation
+    timeout. Callee keeps running on timeout; caller stops waiting.
+  - Kill switch: `IMHUB_A2A_ENABLED=0` disables the whole feature.
+- **`/a2a stats | recent [N] | tree <id>`** — observability for A2A
+  traffic. Each callee row is linked to its caller via `parent_id`
+  + `call_depth`, so `/a2a tree <root>` renders the full chain.
+- **Audit integration** — `intent='a2a'` lights up in `/audit` queries
+  and feeds the same per-user budget that user-originated calls do.
+### Changed
+- `SIGTERM` now reuses the same graceful-shutdown path as `SIGINT`.
+  Prior to 1.1.2, `systemctl restart` invocations skipped session-
+  manager / WAL-checkpoint cleanup because only `SIGINT` was hooked.
+  Recommended systemd unit drop-in: `KillMode=mixed` so children get
+  `TimeoutStopSec` of grace before SIGKILL.
+- `jobs` table schema migrated (idempotent ALTER TABLE) with
+  `thread_key`, `kind`, `started_at`, `delivered_at`, `replaced_by`,
+  `last_outbox_id`, `parent_id`, `call_depth`. Pre-v1.0 databases
+  (which still had only the original 8 columns) are auto-upgraded
+  on first startup.
+- `MessengerAdapter.sendMessage` call sites consolidated through
+  `core/message-sink.ts`. Reminder polish / memo polish run upstream of
+  sink and are unaffected; sink only takes the final plain payload.
+### Fixed
+- Schema-vs-migrate ordering broke job-board on legacy databases. The
+  v1.0 ownership migration's `CREATE INDEX idx_jobs_creator ON
+  jobs(creator_id)` was inside the schema string, which `sqlite-helper`
+  exec()s **before** `migrateOwnership`. Against a pre-v1.0
+  `~/.agim/jobs.db` (no `creator_id` column yet) this threw
+  `no such column: creator_id` and disabled the entire job-board for
+  the lifetime of the process. The bug shipped silently in v1.0 and
+  only surfaced when Phase 2 forced everyone to actually look at the
+  jobs table. Column-dependent indexes now live inside `migrateColumns`,
+  after the ALTER TABLE pass.
+- Recovery retry path no longer creates orphan `pending` rows. The
+  retry interceptor's pre-created replacement job (so it could stamp
+  `replaced_by` atomically) is now reused by `handleMessage` instead
+  of being shadowed by a second `createInlineJob` call.
+### Reference docs
+- `docs/task-recovery-plan.md` — 3-phase design + risk register +
+  cross-session handoff guide.
+- `docs/task-recovery-testing.md` — grey-deploy step-by-step verifier.
+- `docs/agent-to-agent-plan.md` — 3-layer A2A design (L1 implemented;
+  L2 shared-artifacts and L3 workflow-DSL are documented for later).
 ## [1.1.1] - 2026-05-14
 ### Changed

package/README.md CHANGED Viewed

@@ -18,6 +18,9 @@
 ## Highlights
 - **5 messengers + email, 4+ agents** — WeChat (image / file / voice), Feishu, DingTalk (image / voice with server-side ASR), Telegram, Discord, Email (SMTP); Claude Code, Codex, Copilot, OpenCode, plus any ACP endpoint
+- **Agent-to-Agent (A2A)** — agents call other agents inline via `mcp__imhub__call_agent`. Just say *"用 codex 帮我跑 git status"* / *"ask opencode to run the tests"* — the active agent hands off, waits, integrates the reply. Guardrails: depth limit (default 3), self-call ban, workspace whitelist, per-user budget. Observability: `/a2a stats | recent | tree <id>`. Disable with `IMHUB_A2A_ENABLED=0`.
+- **Crash-safe delivery** — every outbound message (replies, reminders, approvals, restart notices) flows through a SQLite outbox; a worker drains pending rows with exponential backoff. IM glitches and brief disconnects no longer drop replies. `/outbox status | list | failed | retry <id>`.
+- **In-flight job recovery** — every inbound agent-bound message becomes a tracked inline job (`pending → running → completed → delivered`). When agim is restarted mid-flight, the next startup prompts each thread whose job was interrupted within the last 10 min: *"⚠️ 上次的消息被服务重启中断了：「…」回复 1 重发 / 2 取消"*. Per-job lifecycle visible in `/job list` and `/job check <id>`.
 - **`/remind` reminders subsystem** — one-shot + recurring (`每天8点喝水`); LLM auto-detects reminder intent in casual chat; LLM polishes delivery; agent MCP tools; web `/reminders` page; email + IM delivery
 - **`/memo` 5W1H persistent memory** — generic "what / who / when / where / how / why" notes with optional GPS capture (browser geolocation H5 + Baidu geocoder); permanent by default, transient bucket for parking spots / today's meeting; agents store + retrieve via MCP tools so casual mentions get remembered automatically
 - **Browser dashboard** — chat UI, tasks panel (jobs / schedules / approvals / health / files / audit), reminders panel, settings page with workspace CRUD
@@ -25,7 +28,7 @@
 - **Rich media in WeChat / Telegram / DingTalk** — receive images, files, videos; voice messages transcribed via WeChat STT, DingTalk's server-side ASR, OpenAI Whisper, or whisper.cpp (per-platform fallback chain)
 - **Smart routing** — intent classifier (CJK + ASCII), sticky sessions, circuit breaker, rate limiter
 - **Multi-tenant workspaces** — per-workspace agent whitelist, rate limits, command-level ACL
-- **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention
+- **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention (24h for auto-tracked inline jobs)
 - **Observability** — structured logging (pino + traceId), Prometheus metrics, audit log
 - **Security** — timing-safe auth, SSRF guards, credential file permissions, approval socket entropy
@@ -126,8 +129,11 @@ agim messengers            # List available messengers
 | `/remind …` | Reminders — see [Reminders](#reminders) below |
 | `/memo …` | 5W1H persistent memory — see [Memos](#memos) below (aliases `/记`, `/note`) |
 | `/job`, `/cron`, `/audit`, `/stats` | Manage jobs, cron schedules, audit, stats (`/schedule` aliases `/cron` until v0.4.0) |
+| `/outbox status\|list\|failed\|retry <id>` | Inspect & operate the persistent IM delivery queue (v1.1.2+) |
+| `/a2a stats\|recent\|tree <id>` | Agent-to-Agent observability — chain, latency, callee histogram (v1.1.2+) |
 | `/router status\|explain` | Inspect routing decisions |
 | `y` / `n` / `批准` / `拒绝` | Approve / deny Claude tool call (or reminder confirmation card) |
+| `1` / `2` | After a service restart, reply `1` to redo the interrupted message or `2` to cancel (10 min window) |
 ## Human-in-the-loop Tool Approval

package/README.zh-CN.md CHANGED Viewed

@@ -14,6 +14,9 @@
 ## 亮点
 - **5 种 IM + 邮件，4+ 种 Agent** — 微信（图片 / 文件 / 语音）、飞书、钉钉（图片 / 语音，自带服务端 ASR）、Telegram、Discord、Email（SMTP）；Claude Code、Codex、Copilot、OpenCode，以及任意 ACP 端点
+- **Agent 之间互调（A2A）** — Agent 可以在自己的工具循环里直接调另一个 Agent，比如直接说「用 codex 帮我跑 git status」/「让 opencode 跑一下测试」。护栏在 agim 里强制：递归深度限制（默认 3）、禁止自调、工作区白名单、按人预算共享；可观测：`/a2a stats | recent | tree <id>`；`IMHUB_A2A_ENABLED=0` 关闭
+- **投递不丢消息** — Agent 回复 / 提醒 / 审批卡片 / 重启通知 全部走 SQLite outbox + 指数退避 worker，IM 平台抖动或断连不再丢回复；`/outbox status | list | failed | retry <id>`
+- **崩溃中断可恢复** — 每条入站消息都建一条 inline job（`pending → running → completed → delivered`）。agim 重启时正在跑的任务会标 `interrupted`，下次启动 10 分钟窗口内对每条受影响的会话发：「⚠️ 上次的消息被服务重启中断了：「…」回复 1 重发 / 2 取消」。每条 job 的完整生命周期可在 `/job list` + `/job check <id>` 查看
 - **`/remind` 提醒子系统** — 一次性 + 定期（`每天8点喝水`）；非 slash 消息 LLM 自动识别意图；LLM 润色投递文本；Agent MCP 工具直接创建；Web `/reminders` 管理页；邮件 + IM 双通道投递
 - **`/memo` 5W1H 持久记忆库** — 通用「what / who / when / where / how / why」记事本，可选 GPS（浏览器地理定位 H5 + 百度地理编码）；默认永久保存，临时类（停车、当天会议）单独走 24h 桶；Agent 通过 MCP 工具自动落库 + 检索，闲聊里提到的地点 / 生日 / 待办都能记下
 - **浏览器控制台** — 对话界面、任务面板（Jobs / 调度 / 审批 / 健康 / 文件 / 审计）、提醒面板、设置页含工作区 CRUD
@@ -21,7 +24,7 @@
 - **微信 / Telegram / 钉钉 富媒体** — 接收图片、文件、视频；语音消息按平台走最佳转写链路：微信 STT、钉钉服务端 ASR、OpenAI Whisper、whisper.cpp
 - **智能路由** — 意图分类（中英文）、Sticky 会话、断路器、限流器
 - **多租户工作区** — 按工作区隔离 Agent 白名单、限流、命令级 ACL
-- **持久化任务与定时** — SQLite 落地，重启不丢，30 天自动修剪
+- **持久化任务与定时** — SQLite 落地，重启不丢，显式 `/job` 30 天 / 自动跟踪的 inline job 24 小时
 - **可观测** — 结构化日志（pino + traceId）、Prometheus 指标、审计日志
 - **安全** — 常量时鉴权、SSRF 防护、凭证文件权限、审批 socket 熵值
@@ -118,8 +121,11 @@ agim messengers            # 列出可用 IM
 | `/remind …` | 提醒子系统 — 详见 [提醒](#提醒) |
 | `/memo …` | 5W1H 持久记忆库 — 详见 [备忘](#备忘)（别名 `/记`、`/note`）|
 | `/job`、`/cron`、`/audit`、`/stats` | 管理任务、定时、审计、统计（`/schedule` 仍是 `/cron` 别名，v0.4.0 移除） |
+| `/outbox status\|list\|failed\|retry <id>` | 查看与重发 IM 投递队列（v1.1.2+） |
+| `/a2a stats\|recent\|tree <id>` | 查看 Agent 之间互调的统计 / 最近 N 次 / 调用链树（v1.1.2+） |
 | `/router status\|explain` | 查看路由策略 |
 | `y` / `n` / `批准` / `拒绝` | 同意 / 拒绝（工具调用 或 提醒确认卡片） |
+| `1` / `2` | 服务重启后看到「上次消息中断」提示时，回 `1` 重发 / `2` 取消（10 分钟内有效） |
 ## 工具调用人审

package/dist/cli.js CHANGED Viewed

@@ -32,6 +32,8 @@ import { consumeLocationContext, formatLocationAnnotation } from './core/locatio
 import { tryDetectReminderIntent } from './core/remind-intent.js';
 import { createReminder } from './core/reminders.js';
 import { setReminderConfirmNotifier } from './core/reminder-rpc.js';
+import { sink } from './core/message-sink.js';
+import { createInlineJob, markJobRunning, markJobCompleted, markJobFailed, linkJobOutbox, updateJobAgent, } from './core/job-board.js';
 import { checkMessengerConfig, checkAgentAvailability, runMessengerOnboarding, formatAgentInstallHint, formatMessengerStartError, loadConfig as loadOnboardingConfig, saveConfig as saveOnboardingConfig, } from './core/onboarding.js';
 import { startWebServer } from './web/server.js';
 import { startACPServer } from './core/acp-server.js';
@@ -260,6 +262,64 @@ program
             ctx.traceId = traceId;
             ctx.logger = createLogger({ traceId, platform: ctx.platform, component: 'cli' });
             ctx.logger.info({ event: 'message.received', text: ctx.message.text.substring(0, 120), userId: ctx.message.userId });
+            // Phase 3 — interrupted-job recovery reply. If the user has a
+            // pending "retry / cancel" prompt outstanding on this thread AND
+            // their reply parses as 1/重发/2/取消, consume the message here
+            // before any other interceptor (approval / reminder) sees it.
+            // hasPendingRecovery is a cheap map lookup; we only do the full
+            // parse when the gate is true.
+            const recoveryThreadKey = `${ctx.platform}:${ctx.channelId}:${ctx.message.threadId}`;
+            try {
+                const { hasPendingRecovery, tryHandleRecoveryReply } = await import('./core/job-recovery.js');
+                if (hasPendingRecovery(recoveryThreadKey)) {
+                    const outcome = tryHandleRecoveryReply(recoveryThreadKey, ctx.message.text);
+                    if (outcome.kind === 'retried') {
+                        ctx.logger.info({
+                            event: 'recovery.retried', oldJobId: outcome.oldJobId, newJobId: outcome.newJobId,
+                        });
+                        // Substitute the user's "1" with the original prompt so the
+                        // rest of the handler reruns the work. handleMessage reads
+                        // message.text again on its own, so this is the cheapest
+                        // way to re-enter the normal path without duplicating logic.
+                        ctx.message.text = outcome.entry.prompt;
+                        // Hand the freshly-created replacement job id to handleMessage
+                        // so it doesn't build a second inline row (the recovery flow
+                        // already created newJobId + stamped replaced_by on the old
+                        // row). Without this we'd accumulate orphan 'pending' rows
+                        // that retention eventually GCs — visible but harmless.
+                        ctx.inlineJobId = outcome.newJobId;
+                        // Send a short "好的，重发中..." acknowledgement first so the
+                        // user knows we accepted the choice (the actual agent reply
+                        // can take many seconds). No priority is passed to sink.deliver
+                        // here, so whatever default the sink applies is used
+                        // (presumably normal — confirm in core/message-sink).
+                        await sink.deliver({
+                            platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                            payload: '🔁 好的，重发中…',
+                            kind: 'text',
+                        });
+                        // Fall through to the normal handler below with the rewritten
+                        // message text.
+                    }
+                    else if (outcome.kind === 'cancelled') {
+                        ctx.logger.info({ event: 'recovery.cancelled', oldJobId: outcome.oldJobId });
+                        await sink.deliver({
+                            platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                            payload: '✅ 已取消，不再处理上次的消息。',
+                            kind: 'text',
+                        });
+                        return;
+                    }
+                    // outcome.kind === 'not-recovery-reply' → user typed something
+                    // else; fall through to normal handling so we don't swallow
+                    // unrelated messages.
+                }
+            }
+            catch (err) {
+                ctx.logger.warn({
+                    event: 'recovery.intercept_crashed',
+                    err: err instanceof Error ? err.message : String(err),
+                }, 'recovery intercept threw — falling through to normal flow');
+            }
             // Approval interception comes BEFORE the agent router. If a pending
             // approval exists for this thread and the message is a y/n-style
             // reply, we resolve the approval and stop. Anything else routes
@@ -295,7 +355,10 @@ program
             const replyDecision = tryConsumePendingReminderReply(tk, ctx.message.text);
             if (replyDecision) {
                 if (replyDecision.decision === 'cancel') {
-                    await messenger.sendMessage(ctx.message.threadId, '✅ 已忽略这条提醒建议');
+                    await sink.deliver({
+                        platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                        payload: '✅ 已忽略这条提醒建议', kind: 'text',
+                    });
                 }
                 else {
                     const p = replyDecision.pending;
@@ -311,11 +374,18 @@ program
                             recurrence: p.recurrence,
                         });
                         const recurLine = p.recurrence ? `\n   循环：${p.recurrence}` : '';
-                        await messenger.sendMessage(ctx.message.threadId, `✅ 已创建提醒 #${id}\n   触发：${p.fireAt.toLocaleString()}${recurLine}\n   内容：${p.text}`);
+                        await sink.deliver({
+                            platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                            payload: `✅ 已创建提醒 #${id}\n   触发：${p.fireAt.toLocaleString()}${recurLine}\n   内容：${p.text}`,
+                            kind: 'text',
+                        });
                     }
                     catch (err) {
                         const msg = err instanceof Error ? err.message : String(err);
-                        await messenger.sendMessage(ctx.message.threadId, `❌ 创建失败：${msg}`);
+                        await sink.deliver({
+                            platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                            payload: `❌ 创建失败：${msg}`, kind: 'text',
+                        });
                     }
                 }
                 ctx.logger.info({ event: 'message.consumed_by_pending_reminder', decision: replyDecision.decision });
@@ -354,7 +424,10 @@ program
                         reply = await handleStopCommand('', routeCtx);
                     else
                         reply = await handleStatusCommand('', routeCtx);
-                    await messenger.sendMessage(ctx.message.threadId, reply);
+                    await sink.deliver({
+                        platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                        payload: reply, kind: 'text',
+                    });
                     return;
                 }
             }
@@ -372,7 +445,10 @@ program
                         text: ctx.message.text,
                     });
                     if (detected.handled && detected.reply) {
-                        await messenger.sendMessage(ctx.message.threadId, detected.reply);
+                        await sink.deliver({
+                            platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
+                            payload: detected.reply, kind: 'text',
+                        });
                         ctx.logger.info({ event: 'message.proposed_reminder' });
                         return;
                     }
@@ -412,18 +488,54 @@ program
         });
         console.log('✅ Approval router wired to messengers');
     }
+    // Start the outbox worker AFTER all messengers have been registered/
+    // started. If we started it earlier, every queued row from a prior run
+    // would fail "messenger not registered" on the first tick and bounce
+    // into backoff — minor but pointless. Worker is single-threaded and
+    // ticks at IMHUB_OUTBOX_TICK_MS (default 1s).
+    sink.startWorker();
+    console.log('✅ Message-sink worker started');
+    // Phase 3 — startup recovery scan. Look at jobs.db for inline rows
+    // that got 'interrupted' on the previous shutdown and are still inside
+    // the recovery window (default 10 min, env IMHUB_RECOVERY_WINDOW_MS).
+    // For each, send a "回 1 重发 / 2 取消" prompt via sink — outbox will
+    // hold the notification until the messenger is fully connected. Older
+    // interrupted rows transition to 'abandoned' so we don't ask the user
+    // to retry something they sent half an hour ago.
+    try {
+        const { scanInterruptedAndNotify, sweepExpiredPending } = await import('./core/job-recovery.js');
+        const { notified, abandoned } = await scanInterruptedAndNotify();
+        if (notified > 0 || abandoned > 0) {
+            console.log(`✅ Job recovery: ${notified} retry-prompt(s) sent, ${abandoned} abandoned`);
+        }
+        // Tidy expired pending entries every minute. The in-memory map is
+        // cheap but a long-lived process accumulates entries from users who
+        // never reply.
+        const sweepTimer = setInterval(() => { sweepExpiredPending(); }, 60_000);
+        if (typeof sweepTimer === 'object' && sweepTimer && 'unref' in sweepTimer) {
+            sweepTimer.unref();
+        }
+    }
+    catch (err) {
+        console.warn('job-recovery module failed to scan:', err instanceof Error ? err.message : String(err));
+    }
     // Reminder confirm notifier — used by reminder-rpc.ts when an agent
     // creates a high-frequency + LLM-polish reminder (those go through a
     // y/n card before landing in the DB). Plain message via the same
     // messenger the agent is talking on.
     setReminderConfirmNotifier(async (ctx, message) => {
-        const messenger = registry.getMessenger(platformToMessengerName(ctx.platform))
-            ?? registry.getMessenger(ctx.platform);
-        if (!messenger) {
-            console.warn(`reminder-confirm: no messenger for platform "${ctx.platform}"`);
-            return;
-        }
-        await messenger.sendMessage(ctx.threadId, message);
+        // sink resolves the adapter at delivery time; we just need to canonicalize
+        // the messenger name (e.g. 'wechat' → 'wechat-ilink') so the worker picks
+        // the right adapter from the registry.
+        const canonical = platformToMessengerName(ctx.platform);
+        const platform = registry.getMessenger(canonical) ? canonical : ctx.platform;
+        await sink.deliver({
+            platform,
+            channelId: ctx.channelId,
+            threadId: ctx.threadId,
+            payload: message,
+            kind: 'text',
+        });
     });
     // ============================================
     // START WEB CHAT SERVER
@@ -476,12 +588,37 @@ program
     catch (err) {
         console.warn('restart-completion module failed to load:', err instanceof Error ? err.message : String(err));
     }
-    // Keep process alive
-    process.on('SIGINT', async () => {
-        console.log('\n👋 Shutting down...');
+    // Graceful shutdown. Registered for BOTH SIGINT (Ctrl-C from a terminal)
+    // and SIGTERM (systemd / docker stop). Prior to v1.1.2 only SIGINT was
+    // hooked, which meant `systemctl restart agim` skipped this entire path —
+    // sessionManager.stop, WAL checkpoints, etc. — and the next startup had
+    // to recover from a half-flushed state. The handler is idempotent against
+    // duplicate signals.
+    let shuttingDown = false;
+    const gracefulShutdown = async (signal) => {
+        if (shuttingDown)
+            return;
+        shuttingDown = true;
+        console.log(`\n👋 Received ${signal}, shutting down...`);
         sessionManager.stop();
         webServer?.close();
         acpServer?.close();
+        // Stop the outbox worker BEFORE messengers so no new send attempts
+        // race the adapter shutdown. Worker is single-threaded so a pending
+        // tick will finish its current row before we move on.
+        try {
+            sink.stopWorker();
+        }
+        catch { /* ignore */ }
+        // Phase 3: any inline job that was still pending/running gets stamped
+        // 'interrupted' so the next startup's recovery scan can offer the user
+        // a retry. Must happen BEFORE closeJobBoardDb() below, while the
+        // handle is still open.
+        try {
+            const { markRunningInlineJobsInterrupted } = await import('./core/job-board.js');
+            markRunningInlineJobsInterrupted(`agim shutdown (${signal})`);
+        }
+        catch { /* ignore */ }
         // Stop all messengers
         for (const name of registry.listMessengers()) {
             const messenger = registry.getMessenger(name);
@@ -527,8 +664,16 @@ program
             closeMemosDb(); // also stops the sweep timer
         }
         catch { /* ignore */ }
+        try {
+            const { stopOutboxRetentionSweep, closeOutboxDb } = await import('./core/outbox.js');
+            stopOutboxRetentionSweep();
+            closeOutboxDb();
+        }
+        catch { /* ignore */ }
         process.exit(0);
-    });
+    };
+    process.on('SIGINT', () => { void gracefulShutdown('SIGINT'); });
+    process.on('SIGTERM', () => { void gracefulShutdown('SIGTERM'); });
     // Wait forever
     await new Promise(() => { });
 });
@@ -581,6 +726,11 @@ async function handleMessage(ctx, defaultAgent) {
     let dismissThinking;
     const looksLikeApproval = /^\s*[yn]\s*$/i.test(message.text) ||
         /^\s*(批准|拒绝|同意|不同意|通过|可以|不可以|不行|✅|❌)\s*$/.test(message.text);
+    // Inline-job tracking id (Phase 2). Declared at function scope so the
+    // catch block below can mark the row 'failed' on any throw. -1 means
+    // "not tracked": either the message didn't reach willInvokeAgent, or
+    // createInlineJob fell back to in-memory (DB locked / IMHUB_INLINE_JOB_TRACKING=0).
+    let inlineJobId = -1;
     try {
         if (messenger.sendTyping) {
             messenger.sendTyping(message.threadId, true).catch(() => { });
@@ -642,6 +792,66 @@ async function handleMessage(ctx, defaultAgent) {
                 }
             };
         }
+        // Inline-job tracking: every agent-bound inbound message gets a row in
+        // the jobs table (kind='inline') so the state machine survives crashes
+        // and the Phase 3 startup scan can offer the user a retry. Built-ins
+        // (status/audit/router) return string immediately and never touch the
+        // agent — no point tracking them. Inline tracking can be killed wholesale
+        // via env IMHUB_INLINE_JOB_TRACKING=0 (createInlineJob returns -1, every
+        // mark*Job helper no-ops on id<=0). createInlineJob itself never throws —
+        // if the DB is locked or disk-full we get -1 and the user path runs
+        // unchanged on the in-memory fast path.
+        //
+        // We create the row BEFORE entering the runOnThread queue so a stuck
+        // queue (or runAgentInvocation crash before the agent runs) still leaves
+        // a 'pending' record we can attribute to the user. The agent name is
+        // initially defaultAgent and gets corrected from inside onAgentResolved
+        // once routing picks the real one.
+        //
+        // EXCEPTION: when the cli's recovery interceptor handed us a pre-created
+        // replacement-job id via ctx.inlineJobId, reuse it. The recovery path
+        // had to create the row up-front so it could stamp replaced_by on the
+        // old interrupted row atomically; building a second row here would
+        // leave the recovery-created one as an orphan 'pending' that retention
+        // eventually GCs.
+        if (willInvokeAgent) {
+            if (ctx.inlineJobId && ctx.inlineJobId > 0) {
+                inlineJobId = ctx.inlineJobId;
+            }
+            else {
+                inlineJobId = createInlineJob({
+                    agent: defaultAgent,
+                    prompt: message.text,
+                    threadKey: `${platform}:${ctx.channelId}:${message.threadId}`,
+                    creatorId: message.userId ?? '',
+                });
+            }
+            // A2A-L1: stash on routeCtx so callAgentWithHistory can forward it
+            // through AgentSendOpts → adapter.registerRun → RunContext. When the
+            // agent fires mcp__imhub__call_agent the bus then knows what
+            // parent_id to stamp on the callee inline-job row. User-originated
+            // runs are call_depth=0, and that is the only depth assigned here:
+            // the code below sets routeCtx.callDepth = 0 unconditionally.
+            if (inlineJobId > 0) {
+                routeCtx.parentJobId = inlineJobId;
+                // handleMessage is only the user-inbound path — callDepth is
+                // always 0 here. (A2A callees don't traverse this handler; they
+                // go through a2a.callAgentByName which passes its own callDepth
+                // directly to target.sendPrompt.)
+                routeCtx.callDepth = 0;
+            }
+        }
+        // Chain the agent-name correction onto the existing onAgentResolved
+        // (which already wires native session ids). Both run sequentially —
+        // the inline-job update is fast and best-effort.
+        const prevOnResolved = routeCtx.onAgentResolved;
+        if (prevOnResolved || inlineJobId > 0) {
+            routeCtx.onAgentResolved = async (resolvedAgent) => {
+                if (prevOnResolved)
+                    await prevOnResolved(resolvedAgent);
+                updateJobAgent(inlineJobId, resolvedAgent);
+            };
+        }
         // The thinking placeholder + agent invocation + response delivery are
         // wrapped into a closure so agent-bound messages can be serialized
         // per-thread via runOnThread (see core/thread-queue.ts). Without
@@ -660,6 +870,10 @@ async function handleMessage(ctx, defaultAgent) {
                     logger.debug({ err: String(err) }, 'sendThinking failed');
                 }
             }
+            // Mark inline job 'running' just before the agent actually starts —
+            // anything that throws before this leaves the row in 'pending' which
+            // is the correct "never ran" semantic.
+            markJobRunning(inlineJobId);
             const result = await routeMessage(parsed, routeCtx);
             const dismiss = async () => {
                 if (dismissThinking) {
@@ -674,7 +888,14 @@ async function handleMessage(ctx, defaultAgent) {
             if (typeof result === 'string') {
                 await stopTyping();
                 await dismiss();
-                await messenger.sendMessage(message.threadId, await maybePrefix(result));
+                markJobCompleted(inlineJobId, result);
+                const sinkRes = await sink.deliver({
+                    platform, channelId: ctx.channelId, threadId: message.threadId,
+                    payload: await maybePrefix(result), kind: 'text',
+                    jobId: inlineJobId > 0 ? inlineJobId : undefined,
+                });
+                if (sinkRes.outboxId > 0)
+                    linkJobOutbox(inlineJobId, sinkRes.outboxId);
                 logger.info({ event: 'message.sent', responseLen: result.length });
             }
             else {
@@ -686,10 +907,22 @@ async function handleMessage(ctx, defaultAgent) {
                 await stopTyping();
                 await dismiss();
                 if (fullResponse) {
-                    await messenger.sendMessage(message.threadId, await maybePrefix(fullResponse));
+                    markJobCompleted(inlineJobId, fullResponse);
+                    const sinkRes = await sink.deliver({
+                        platform, channelId: ctx.channelId, threadId: message.threadId,
+                        payload: await maybePrefix(fullResponse), kind: 'text',
+                        jobId: inlineJobId > 0 ? inlineJobId : undefined,
+                    });
+                    if (sinkRes.outboxId > 0)
+                        linkJobOutbox(inlineJobId, sinkRes.outboxId);
                     logger.info({ event: 'message.sent', responseLen: fullResponse.length });
                 }
                 else {
+                    // Empty response is treated as a failure of the inline job —
+                    // there's nothing to deliver and we don't want the row stuck in
+                    // 'running' forever. The user already sees nothing; the row gets
+                    // pruned out of /jobs lists.
+                    markJobFailed(inlineJobId, 'agent returned empty response');
                     logger.warn({ event: 'message.empty_response' });
                 }
             }
@@ -699,12 +932,11 @@ async function handleMessage(ctx, defaultAgent) {
             await runOnThread(queueKey, runAgentInvocation, {
                 onQueued: async (ahead) => {
                     logger.info({ event: 'message.queued', ahead, queueKey });
-                    try {
-                        await messenger.sendMessage(message.threadId, `📥 已收到（前面还有 ${ahead} 条在处理，稍后给你结果）`);
-                    }
-                    catch (err) {
-                        logger.debug({ err: String(err) }, 'queued-notice send failed');
-                    }
+                    await sink.deliver({
+                        platform, channelId: ctx.channelId, threadId: message.threadId,
+                        payload: `📥 已收到（前面还有 ${ahead} 条在处理，稍后给你结果）`,
+                        kind: 'text',
+                    });
                 },
             });
         }
@@ -722,12 +954,15 @@ async function handleMessage(ctx, defaultAgent) {
             }
             catch { /* ignore */ }
         }
-        try {
-            await messenger.sendMessage(message.threadId, '❌ An error occurred processing your message.');
-        }
-        catch {
-            // Ignore
-        }
+        // Mark inline job 'failed' before sending the user error — markJobFailed
+        // is a no-op for id<=0 (built-ins / inline tracking disabled) and for
+        // already-terminal rows. The error blurb still goes through sink so it
+        // survives an IM hiccup.
+        markJobFailed(inlineJobId, errMsg);
+        await sink.deliver({
+            platform, channelId: ctx.channelId, threadId: message.threadId,
+            payload: '❌ An error occurred processing your message.', kind: 'text',
+        }).catch(() => { });
     }
 }
 program