agim-cli 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/CHANGELOG.md +101 -0
  2. package/README.md +7 -1
  3. package/README.zh-CN.md +7 -1
  4. package/dist/cli.js +265 -30
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/a2a.d.ts +36 -0
  7. package/dist/core/a2a.d.ts.map +1 -0
  8. package/dist/core/a2a.js +203 -0
  9. package/dist/core/a2a.js.map +1 -0
  10. package/dist/core/approval-bus.d.ts +40 -0
  11. package/dist/core/approval-bus.d.ts.map +1 -1
  12. package/dist/core/approval-bus.js +106 -0
  13. package/dist/core/approval-bus.js.map +1 -1
  14. package/dist/core/approval-router.d.ts.map +1 -1
  15. package/dist/core/approval-router.js +31 -2
  16. package/dist/core/approval-router.js.map +1 -1
  17. package/dist/core/commands/a2a.d.ts +3 -0
  18. package/dist/core/commands/a2a.d.ts.map +1 -0
  19. package/dist/core/commands/a2a.js +148 -0
  20. package/dist/core/commands/a2a.js.map +1 -0
  21. package/dist/core/commands/job.d.ts.map +1 -1
  22. package/dist/core/commands/job.js +11 -2
  23. package/dist/core/commands/job.js.map +1 -1
  24. package/dist/core/commands/outbox.d.ts +3 -0
  25. package/dist/core/commands/outbox.d.ts.map +1 -0
  26. package/dist/core/commands/outbox.js +92 -0
  27. package/dist/core/commands/outbox.js.map +1 -0
  28. package/dist/core/job-board.d.ts +122 -1
  29. package/dist/core/job-board.d.ts.map +1 -1
  30. package/dist/core/job-board.js +404 -21
  31. package/dist/core/job-board.js.map +1 -1
  32. package/dist/core/job-recovery.d.ts +48 -0
  33. package/dist/core/job-recovery.d.ts.map +1 -0
  34. package/dist/core/job-recovery.js +185 -0
  35. package/dist/core/job-recovery.js.map +1 -0
  36. package/dist/core/message-sink.d.ts +63 -0
  37. package/dist/core/message-sink.d.ts.map +1 -0
  38. package/dist/core/message-sink.js +296 -0
  39. package/dist/core/message-sink.js.map +1 -0
  40. package/dist/core/outbox.d.ts +71 -0
  41. package/dist/core/outbox.d.ts.map +1 -0
  42. package/dist/core/outbox.js +301 -0
  43. package/dist/core/outbox.js.map +1 -0
  44. package/dist/core/reminders.d.ts.map +1 -1
  45. package/dist/core/reminders.js +12 -1
  46. package/dist/core/reminders.js.map +1 -1
  47. package/dist/core/restart-completion.d.ts.map +1 -1
  48. package/dist/core/restart-completion.js +18 -1
  49. package/dist/core/restart-completion.js.map +1 -1
  50. package/dist/core/router.d.ts +8 -0
  51. package/dist/core/router.d.ts.map +1 -1
  52. package/dist/core/router.js +16 -0
  53. package/dist/core/router.js.map +1 -1
  54. package/dist/core/types.d.ts +22 -0
  55. package/dist/core/types.d.ts.map +1 -1
  56. package/dist/plugins/agents/claude-code/index.d.ts.map +1 -1
  57. package/dist/plugins/agents/claude-code/index.js +5 -0
  58. package/dist/plugins/agents/claude-code/index.js.map +1 -1
  59. package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts +21 -0
  60. package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts.map +1 -1
  61. package/dist/plugins/agents/claude-code/mcp-approval-server.js +106 -0
  62. package/dist/plugins/agents/claude-code/mcp-approval-server.js.map +1 -1
  63. package/dist/plugins/agents/codex/index.d.ts.map +1 -1
  64. package/dist/plugins/agents/codex/index.js +5 -0
  65. package/dist/plugins/agents/codex/index.js.map +1 -1
  66. package/dist/plugins/agents/opencode/opencode-stdio-adapter.d.ts.map +1 -1
  67. package/dist/plugins/agents/opencode/opencode-stdio-adapter.js +5 -0
  68. package/dist/plugins/agents/opencode/opencode-stdio-adapter.js.map +1 -1
  69. package/dist/web/server.d.ts.map +1 -1
  70. package/dist/web/server.js +28 -16
  71. package/dist/web/server.js.map +1 -1
  72. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -4,6 +4,107 @@ All notable changes to this project will be documented in this file.
4
4
 
5
5
  ## [Unreleased]
6
6
 
7
+ ## [1.1.2] - 2026-05-15
8
+
9
+ ### Added — Task recovery (3-phase, all on by default)
10
+
11
+ - **Outbox-backed IM delivery.** Every outbound message (replies, reminder
12
+ fires, approval prompts/receipts, restart-completion notices, web
13
+ notifications) now flows through a SQLite-backed outbox in
14
+ `~/.agim/outbox.db`. A worker drains pending rows with exponential
15
+ backoff (1s → 5s → 30s → 5min → 30min → 2h → `giving_up`). Replies
16
+ survive IM platform glitches, brief disconnects, and adapter restarts
17
+ — no more "agent answered but the user never got it" black holes.
18
+ - `/outbox status` aggregate counts.
19
+ - `/outbox list [N]` recent rows.
20
+ - `/outbox failed` rows in `giving_up`.
21
+ - `/outbox retry <id>` resurrect a `giving_up` row.
22
+ - Retention: 24h after `delivered` / `giving_up`. Override with
23
+ `IMHUB_OUTBOX_RETENTION_HOURS`.
24
+ - **Inline-job tracking.** Every inbound agent-bound message now creates
25
+ a row in `jobs.db` with `kind='inline'`, going through the full
26
+ lifecycle `pending → running → completed → delivered` (or `failed`).
27
+ The state machine survives crashes; this is the substrate Phase 3
28
+ recovery and A2A both rely on.
29
+ - Visible in `/job list` and per-row in `/job check <id>`.
30
+ - Retention: 24h (vs 30d for `kind='job'` rows from explicit `/job
31
+ create`). Override with `IMHUB_INLINE_JOB_RETENTION_HOURS`.
32
+ - Kill switch: `IMHUB_INLINE_JOB_TRACKING=0` disables creation; the
33
+ user reply path falls back to the unchanged in-memory pipeline.
34
+ - **Crash-recovery flow.** When agim exits (SIGTERM/SIGINT/crash), any
35
+ in-flight inline job is stamped `interrupted`. On startup, jobs
36
+ interrupted within the last 10 minutes get a per-thread retry prompt
37
+ through the outbox: `⚠️ 上次的消息被服务重启中断了:「<prompt 摘要>…」
38
+ 回复 1 重发 / 2 取消(10 分钟内有效)`.
39
+ - Reply `1` → fresh inline job with the original prompt, old row
40
+ stamped `replaced_by=<new id>`.
41
+ - Reply `2` → old row marked `cancelled`.
42
+ - No reply → old row swept to `abandoned` by the next startup.
43
+ - Window override: `IMHUB_RECOVERY_WINDOW_MS` (default 600000).
44
+
45
+ ### Added — Agent-to-Agent (A2A) Layer 1
46
+
47
+ - **`mcp__imhub__call_agent` MCP tool.** A running agent (claude-code /
48
+ opencode / codex / copilot) can now program-call another agent and
49
+ wait for the result inside its own tool loop. The tool description
50
+ uses imperative + Chinese trigger phrases so models invoke it
51
+ naturally when the user says e.g. "用 codex 帮我跑 git status" or
52
+ "ask opencode to run the tests".
53
+ - **Guardrails enforced inside agim, not on the model**:
54
+ - `IMHUB_A2A_MAX_DEPTH` (default 3) — caps nested call chains.
55
+ - Self-call ban — `claude-code` can't recursively call `claude-code`.
56
+ - Workspace whitelist — callee must be in the caller's workspace
57
+ `agents[]`. Caller's `userId` carries through.
58
+ - `IMHUB_A2A_TIMEOUT_DEFAULT_MS` (default 600_000) — accumulation
59
+ timeout. Callee keeps running on timeout; caller stops waiting.
60
+ - Kill switch: `IMHUB_A2A_ENABLED=0` disables the whole feature.
61
+ - **`/a2a stats | recent [N] | tree <id>`** — observability for A2A
62
+ traffic. Each callee row is linked to its caller via `parent_id`
63
+ + `call_depth`, so `/a2a tree <root>` renders the full chain.
64
+ - **Audit integration** — `intent='a2a'` lights up in `/audit` queries
65
+ and feeds the same per-user budget that user-originated calls do.
66
+
67
+ ### Changed
68
+
69
+ - `SIGTERM` now reuses the same graceful-shutdown path as `SIGINT`.
70
+ Prior to 1.1.2, `systemctl restart` invocations skipped session-
71
+ manager / WAL-checkpoint cleanup because only `SIGINT` was hooked.
72
+ Recommended systemd unit drop-in: `KillMode=mixed` so children get
73
+ `TimeoutStopSec` of grace before SIGKILL.
74
+ - `jobs` table schema migrated (idempotent ALTER TABLE) with
75
+ `thread_key`, `kind`, `started_at`, `delivered_at`, `replaced_by`,
76
+ `last_outbox_id`, `parent_id`, `call_depth`. Pre-v1.0 databases
77
+ (which still had only the original 8 columns) are auto-upgraded
78
+ on first startup.
79
+ - `MessengerAdapter.sendMessage` call sites consolidated through
80
+ `core/message-sink.ts`. Reminder polish / memo polish run upstream of
81
+ sink and are unaffected; sink only takes the final plain payload.
82
+
83
+ ### Fixed
84
+
85
+ - Schema-vs-migrate ordering broke job-board on legacy databases. The
86
+ v1.0 ownership migration's `CREATE INDEX idx_jobs_creator ON
87
+ jobs(creator_id)` was inside the schema string, which `sqlite-helper`
88
+ exec()s **before** `migrateOwnership`. Against a pre-v1.0
89
+ `~/.agim/jobs.db` (no `creator_id` column yet) this threw
90
+ `no such column: creator_id` and disabled the entire job-board for
91
+ the lifetime of the process. The bug shipped silently in v1.0 and
92
+ only surfaced when Phase 2 forced everyone to actually look at the
93
+ jobs table. Column-dependent indexes now live inside `migrateColumns`,
94
+ after the ALTER TABLE pass.
95
+ - Recovery retry path no longer creates orphan `pending` rows. The
96
+ retry interceptor's pre-created replacement job (so it could stamp
97
+ `replaced_by` atomically) is now reused by `handleMessage` instead
98
+ of being shadowed by a second `createInlineJob` call.
99
+
100
+ ### Reference docs
101
+
102
+ - `docs/task-recovery-plan.md` — 3-phase design + risk register +
103
+ cross-session handoff guide.
104
+ - `docs/task-recovery-testing.md` — grey-deploy step-by-step verifier.
105
+ - `docs/agent-to-agent-plan.md` — 3-layer A2A design (L1 implemented;
106
+ L2 shared-artifacts and L3 workflow-DSL are documented for later).
107
+
7
108
  ## [1.1.1] - 2026-05-14
8
109
 
9
110
  ### Changed
package/README.md CHANGED
@@ -18,6 +18,9 @@
18
18
  ## Highlights
19
19
 
20
20
  - **5 messengers + email, 4+ agents** — WeChat (image / file / voice), Feishu, DingTalk (image / voice with server-side ASR), Telegram, Discord, Email (SMTP); Claude Code, Codex, Copilot, OpenCode, plus any ACP endpoint
21
+ - **Agent-to-Agent (A2A)** — agents call other agents inline via `mcp__imhub__call_agent`. Just say *"用 codex 帮我跑 git status"* / *"ask opencode to run the tests"* — the active agent hands off, waits, integrates the reply. Guardrails: depth limit (default 3), self-call ban, workspace whitelist, per-user budget. Observability: `/a2a stats | recent | tree <id>`. Disable with `IMHUB_A2A_ENABLED=0`.
22
+ - **Crash-safe delivery** — every outbound message (replies, reminders, approvals, restart notices) flows through a SQLite outbox; a worker drains pending rows with exponential backoff. IM glitches and brief disconnects no longer drop replies. `/outbox status | list | failed | retry <id>`.
23
+ - **In-flight job recovery** — every inbound agent-bound message becomes a tracked inline job (`pending → running → completed → delivered`). When agim is restarted mid-flight, the next startup sends a retry prompt to each thread whose job was interrupted within the last 10 minutes: *"⚠️ 上次的消息被服务重启中断了:「…」回复 1 重发 / 2 取消"*. The per-job lifecycle is visible in `/job list` and `/job check <id>`.
21
24
  - **`/remind` reminders subsystem** — one-shot + recurring (`每天8点喝水`); LLM auto-detects reminder intent in casual chat; LLM polishes delivery; agent MCP tools; web `/reminders` page; email + IM delivery
22
25
  - **`/memo` 5W1H persistent memory** — generic "what / who / when / where / how / why" notes with optional GPS capture (browser geolocation H5 + Baidu geocoder); permanent by default, transient bucket for parking spots / today's meeting; agents store + retrieve via MCP tools so casual mentions get remembered automatically
23
26
  - **Browser dashboard** — chat UI, tasks panel (jobs / schedules / approvals / health / files / audit), reminders panel, settings page with workspace CRUD
@@ -25,7 +28,7 @@
25
28
  - **Rich media in WeChat / Telegram / DingTalk** — receive images, files, videos; voice messages transcribed via WeChat STT, DingTalk's server-side ASR, OpenAI Whisper, or whisper.cpp (per-platform fallback chain)
26
29
  - **Smart routing** — intent classifier (CJK + ASCII), sticky sessions, circuit breaker, rate limiter
27
30
  - **Multi-tenant workspaces** — per-workspace agent whitelist, rate limits, command-level ACL
28
- - **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention
31
+ - **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention (24h for auto-tracked inline jobs)
29
32
  - **Observability** — structured logging (pino + traceId), Prometheus metrics, audit log
30
33
  - **Security** — timing-safe auth, SSRF guards, credential file permissions, approval socket entropy
31
34
 
@@ -126,8 +129,11 @@ agim messengers # List available messengers
126
129
  | `/remind …` | Reminders — see [Reminders](#reminders) below |
127
130
  | `/memo …` | 5W1H persistent memory — see [Memos](#memos) below (aliases `/记`, `/note`) |
128
131
  | `/job`, `/cron`, `/audit`, `/stats` | Manage jobs, cron schedules, audit, stats (`/schedule` aliases `/cron` until v0.4.0) |
132
+ | `/outbox status\|list\|failed\|retry <id>` | Inspect & operate the persistent IM delivery queue (v1.1.2+) |
133
+ | `/a2a stats\|recent\|tree <id>` | Agent-to-Agent observability — chain, latency, callee histogram (v1.1.2+) |
129
134
  | `/router status\|explain` | Inspect routing decisions |
130
135
  | `y` / `n` / `批准` / `拒绝` | Approve / deny Claude tool call (or reminder confirmation card) |
136
+ | `1` / `2` | After a service restart, reply `1` to redo the interrupted message or `2` to cancel (10 min window) |
131
137
 
132
138
  ## Human-in-the-loop Tool Approval
133
139
 
package/README.zh-CN.md CHANGED
@@ -14,6 +14,9 @@
14
14
  ## 亮点
15
15
 
16
16
  - **5 种 IM + 邮件,4+ 种 Agent** — 微信(图片 / 文件 / 语音)、飞书、钉钉(图片 / 语音,自带服务端 ASR)、Telegram、Discord、Email(SMTP);Claude Code、Codex、Copilot、OpenCode,以及任意 ACP 端点
17
+ - **Agent 之间互调(A2A)** — Agent 可以在自己的工具循环里直接调另一个 Agent,比如直接说「用 codex 帮我跑 git status」/「让 opencode 跑一下测试」。护栏在 agim 里强制:递归深度限制(默认 3)、禁止自调、工作区白名单、按人预算共享;可观测:`/a2a stats | recent | tree <id>`;`IMHUB_A2A_ENABLED=0` 关闭
18
+ - **投递不丢消息** — Agent 回复 / 提醒 / 审批卡片 / 重启通知 全部走 SQLite outbox + 指数退避 worker,IM 平台抖动或断连不再丢回复;`/outbox status | list | failed | retry <id>`
19
+ - **崩溃中断可恢复** — 每条发给 Agent 的入站消息都建一条 inline job(`pending → running → completed → delivered`)。agim 重启时正在跑的任务会标 `interrupted`,下次启动 10 分钟窗口内对每条受影响的会话发:「⚠️ 上次的消息被服务重启中断了:「…」回复 1 重发 / 2 取消」。每条 job 的完整生命周期可在 `/job list` + `/job check <id>` 查看
17
20
  - **`/remind` 提醒子系统** — 一次性 + 定期(`每天8点喝水`);非 slash 消息 LLM 自动识别意图;LLM 润色投递文本;Agent MCP 工具直接创建;Web `/reminders` 管理页;邮件 + IM 双通道投递
18
21
  - **`/memo` 5W1H 持久记忆库** — 通用「what / who / when / where / how / why」记事本,可选 GPS(浏览器地理定位 H5 + 百度地理编码);默认永久保存,临时类(停车、当天会议)单独走 24h 桶;Agent 通过 MCP 工具自动落库 + 检索,闲聊里提到的地点 / 生日 / 待办都能记下
19
22
  - **浏览器控制台** — 对话界面、任务面板(Jobs / 调度 / 审批 / 健康 / 文件 / 审计)、提醒面板、设置页含工作区 CRUD
@@ -21,7 +24,7 @@
21
24
  - **微信 / Telegram / 钉钉 富媒体** — 接收图片、文件、视频;语音消息按平台走最佳转写链路:微信 STT、钉钉服务端 ASR、OpenAI Whisper、whisper.cpp
22
25
  - **智能路由** — 意图分类(中英文)、Sticky 会话、断路器、限流器
23
26
  - **多租户工作区** — 按工作区隔离 Agent 白名单、限流、命令级 ACL
24
- - **持久化任务与定时** — SQLite 落地,重启不丢,30 天自动修剪
27
+ - **持久化任务与定时** — SQLite 落地,重启不丢,显式 `/job` 30 天 / 自动跟踪的 inline job 24 小时
25
28
  - **可观测** — 结构化日志(pino + traceId)、Prometheus 指标、审计日志
26
29
  - **安全** — 常量时鉴权、SSRF 防护、凭证文件权限、审批 socket 熵值
27
30
 
@@ -118,8 +121,11 @@ agim messengers # 列出可用 IM
118
121
  | `/remind …` | 提醒子系统 — 详见 [提醒](#提醒) |
119
122
  | `/memo …` | 5W1H 持久记忆库 — 详见 [备忘](#备忘)(别名 `/记`、`/note`)|
120
123
  | `/job`、`/cron`、`/audit`、`/stats` | 管理任务、定时、审计、统计(`/schedule` 仍是 `/cron` 别名,v0.4.0 移除) |
124
+ | `/outbox status\|list\|failed\|retry <id>` | 查看与重发 IM 投递队列(v1.1.2+) |
125
+ | `/a2a stats\|recent\|tree <id>` | 查看 Agent 之间互调的统计 / 最近 N 次 / 调用链树(v1.1.2+) |
121
126
  | `/router status\|explain` | 查看路由策略 |
122
127
  | `y` / `n` / `批准` / `拒绝` | 同意 / 拒绝(工具调用 或 提醒确认卡片) |
128
+ | `1` / `2` | 服务重启后看到「上次消息中断」提示时,回 `1` 重发 / `2` 取消(10 分钟内有效) |
123
129
 
124
130
  ## 工具调用人审
125
131
 
package/dist/cli.js CHANGED
@@ -32,6 +32,8 @@ import { consumeLocationContext, formatLocationAnnotation } from './core/locatio
32
32
  import { tryDetectReminderIntent } from './core/remind-intent.js';
33
33
  import { createReminder } from './core/reminders.js';
34
34
  import { setReminderConfirmNotifier } from './core/reminder-rpc.js';
35
+ import { sink } from './core/message-sink.js';
36
+ import { createInlineJob, markJobRunning, markJobCompleted, markJobFailed, linkJobOutbox, updateJobAgent, } from './core/job-board.js';
35
37
  import { checkMessengerConfig, checkAgentAvailability, runMessengerOnboarding, formatAgentInstallHint, formatMessengerStartError, loadConfig as loadOnboardingConfig, saveConfig as saveOnboardingConfig, } from './core/onboarding.js';
36
38
  import { startWebServer } from './web/server.js';
37
39
  import { startACPServer } from './core/acp-server.js';
@@ -260,6 +262,64 @@ program
260
262
  ctx.traceId = traceId;
261
263
  ctx.logger = createLogger({ traceId, platform: ctx.platform, component: 'cli' });
262
264
  ctx.logger.info({ event: 'message.received', text: ctx.message.text.substring(0, 120), userId: ctx.message.userId });
265
+ // Phase 3 — interrupted-job recovery reply. If the user has a
266
+ // pending "retry / cancel" prompt outstanding on this thread AND
267
+ // their reply parses as 1/重发/2/取消, consume the message here
268
+ // before any other interceptor (approval / reminder) sees it.
269
+ // hasPendingRecovery is a cheap map lookup; we only do the full
270
+ // parse when the gate is true.
271
+ const recoveryThreadKey = `${ctx.platform}:${ctx.channelId}:${ctx.message.threadId}`;
272
+ try {
273
+ const { hasPendingRecovery, tryHandleRecoveryReply } = await import('./core/job-recovery.js');
274
+ if (hasPendingRecovery(recoveryThreadKey)) {
275
+ const outcome = tryHandleRecoveryReply(recoveryThreadKey, ctx.message.text);
276
+ if (outcome.kind === 'retried') {
277
+ ctx.logger.info({
278
+ event: 'recovery.retried', oldJobId: outcome.oldJobId, newJobId: outcome.newJobId,
279
+ });
280
+ // Substitute the user's "1" with the original prompt so the
281
+ // rest of the handler reruns the work. handleMessage reads
282
+ // message.text again on its own, so this is the cheapest
283
+ // way to re-enter the normal path without duplicating logic.
284
+ ctx.message.text = outcome.entry.prompt;
285
+ // Hand the freshly-created replacement job id to handleMessage
286
+ // so it doesn't build a second inline row (the recovery flow
287
+ // already created newJobId + stamped replaced_by on the old
288
+ // row). Without this we'd accumulate orphan 'pending' rows
289
+ // that retention eventually GCs — visible but harmless.
290
+ ctx.inlineJobId = outcome.newJobId;
291
+ // Send a short "好的,重发中..." acknowledgement first so the
292
+ // user knows we accepted the choice (the actual agent reply
293
+ // can take many seconds). priority=normal — slot it after
294
+ // anything urgent in the outbox.
295
+ await sink.deliver({
296
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
297
+ payload: '🔁 好的,重发中…',
298
+ kind: 'text',
299
+ });
300
+ // Fall through to the normal handler below with the rewritten
301
+ // message text.
302
+ }
303
+ else if (outcome.kind === 'cancelled') {
304
+ ctx.logger.info({ event: 'recovery.cancelled', oldJobId: outcome.oldJobId });
305
+ await sink.deliver({
306
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
307
+ payload: '✅ 已取消,不再处理上次的消息。',
308
+ kind: 'text',
309
+ });
310
+ return;
311
+ }
312
+ // outcome.kind === 'not-recovery-reply' → user typed something
313
+ // else; fall through to normal handling so we don't swallow
314
+ // unrelated messages.
315
+ }
316
+ }
317
+ catch (err) {
318
+ ctx.logger.warn({
319
+ event: 'recovery.intercept_crashed',
320
+ err: err instanceof Error ? err.message : String(err),
321
+ }, 'recovery intercept threw — falling through to normal flow');
322
+ }
263
323
  // Approval interception comes BEFORE the agent router. If a pending
264
324
  // approval exists for this thread and the message is a y/n-style
265
325
  // reply, we resolve the approval and stop. Anything else routes
@@ -295,7 +355,10 @@ program
295
355
  const replyDecision = tryConsumePendingReminderReply(tk, ctx.message.text);
296
356
  if (replyDecision) {
297
357
  if (replyDecision.decision === 'cancel') {
298
- await messenger.sendMessage(ctx.message.threadId, '✅ 已忽略这条提醒建议');
358
+ await sink.deliver({
359
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
360
+ payload: '✅ 已忽略这条提醒建议', kind: 'text',
361
+ });
299
362
  }
300
363
  else {
301
364
  const p = replyDecision.pending;
@@ -311,11 +374,18 @@ program
311
374
  recurrence: p.recurrence,
312
375
  });
313
376
  const recurLine = p.recurrence ? `\n 循环:${p.recurrence}` : '';
314
- await messenger.sendMessage(ctx.message.threadId, `✅ 已创建提醒 #${id}\n 触发:${p.fireAt.toLocaleString()}${recurLine}\n 内容:${p.text}`);
377
+ await sink.deliver({
378
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
379
+ payload: `✅ 已创建提醒 #${id}\n 触发:${p.fireAt.toLocaleString()}${recurLine}\n 内容:${p.text}`,
380
+ kind: 'text',
381
+ });
315
382
  }
316
383
  catch (err) {
317
384
  const msg = err instanceof Error ? err.message : String(err);
318
- await messenger.sendMessage(ctx.message.threadId, `❌ 创建失败:${msg}`);
385
+ await sink.deliver({
386
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
387
+ payload: `❌ 创建失败:${msg}`, kind: 'text',
388
+ });
319
389
  }
320
390
  }
321
391
  ctx.logger.info({ event: 'message.consumed_by_pending_reminder', decision: replyDecision.decision });
@@ -354,7 +424,10 @@ program
354
424
  reply = await handleStopCommand('', routeCtx);
355
425
  else
356
426
  reply = await handleStatusCommand('', routeCtx);
357
- await messenger.sendMessage(ctx.message.threadId, reply);
427
+ await sink.deliver({
428
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
429
+ payload: reply, kind: 'text',
430
+ });
358
431
  return;
359
432
  }
360
433
  }
@@ -372,7 +445,10 @@ program
372
445
  text: ctx.message.text,
373
446
  });
374
447
  if (detected.handled && detected.reply) {
375
- await messenger.sendMessage(ctx.message.threadId, detected.reply);
448
+ await sink.deliver({
449
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
450
+ payload: detected.reply, kind: 'text',
451
+ });
376
452
  ctx.logger.info({ event: 'message.proposed_reminder' });
377
453
  return;
378
454
  }
@@ -412,18 +488,54 @@ program
412
488
  });
413
489
  console.log('✅ Approval router wired to messengers');
414
490
  }
491
+ // Start the outbox worker AFTER all messengers have been registered/
492
+ // started. If we started it earlier, every queued row from a prior run
493
+ // would fail "messenger not registered" on the first tick and bounce
494
+ // into backoff — minor but pointless. Worker is single-threaded and
495
+ // ticks at IMHUB_OUTBOX_TICK_MS (default 1s).
496
+ sink.startWorker();
497
+ console.log('✅ Message-sink worker started');
498
+ // Phase 3 — startup recovery scan. Look at jobs.db for inline rows
499
+ // that got 'interrupted' on the previous shutdown and are still inside
500
+ // the recovery window (default 10 min, env IMHUB_RECOVERY_WINDOW_MS).
501
+ // For each, send a "回 1 重发 / 2 取消" prompt via sink — outbox will
502
+ // hold the notification until the messenger is fully connected. Older
503
+ // interrupted rows transition to 'abandoned' so we don't ask the user
504
+ // to retry something they sent half an hour ago.
505
+ try {
506
+ const { scanInterruptedAndNotify, sweepExpiredPending } = await import('./core/job-recovery.js');
507
+ const { notified, abandoned } = await scanInterruptedAndNotify();
508
+ if (notified > 0 || abandoned > 0) {
509
+ console.log(`✅ Job recovery: ${notified} retry-prompt(s) sent, ${abandoned} abandoned`);
510
+ }
511
+ // Tidy expired pending entries every minute. The in-memory map is
512
+ // cheap but a long-lived process accumulates entries from users who
513
+ // never reply.
514
+ const sweepTimer = setInterval(() => { sweepExpiredPending(); }, 60_000);
515
+ if (typeof sweepTimer === 'object' && sweepTimer && 'unref' in sweepTimer) {
516
+ sweepTimer.unref();
517
+ }
518
+ }
519
+ catch (err) {
520
+ console.warn('job-recovery module failed to scan:', err instanceof Error ? err.message : String(err));
521
+ }
415
522
  // Reminder confirm notifier — used by reminder-rpc.ts when an agent
416
523
  // creates a high-frequency + LLM-polish reminder (those go through a
417
524
  // y/n card before landing in the DB). Plain message via the same
418
525
  // messenger the agent is talking on.
419
526
  setReminderConfirmNotifier(async (ctx, message) => {
420
- const messenger = registry.getMessenger(platformToMessengerName(ctx.platform))
421
- ?? registry.getMessenger(ctx.platform);
422
- if (!messenger) {
423
- console.warn(`reminder-confirm: no messenger for platform "${ctx.platform}"`);
424
- return;
425
- }
426
- await messenger.sendMessage(ctx.threadId, message);
527
+ // sink resolves the adapter at delivery time; we just need to canonicalize
528
+ // the messenger name (e.g. 'wechat' → 'wechat-ilink') so the worker picks
529
+ // the right adapter from the registry.
530
+ const canonical = platformToMessengerName(ctx.platform);
531
+ const platform = registry.getMessenger(canonical) ? canonical : ctx.platform;
532
+ await sink.deliver({
533
+ platform,
534
+ channelId: ctx.channelId,
535
+ threadId: ctx.threadId,
536
+ payload: message,
537
+ kind: 'text',
538
+ });
427
539
  });
428
540
  // ============================================
429
541
  // START WEB CHAT SERVER
@@ -476,12 +588,37 @@ program
476
588
  catch (err) {
477
589
  console.warn('restart-completion module failed to load:', err instanceof Error ? err.message : String(err));
478
590
  }
479
- // Keep process alive
480
- process.on('SIGINT', async () => {
481
- console.log('\n👋 Shutting down...');
591
+ // Graceful shutdown. Registered for BOTH SIGINT (Ctrl-C from a terminal)
592
+ // and SIGTERM (systemd / docker stop). Prior to v1.1.2 only SIGINT was
593
+ // hooked, which meant `systemctl restart agim` skipped this entire path —
594
+ // sessionManager.stop, WAL checkpoints, etc. — and the next startup had
595
+ // to recover from a half-flushed state. The handler is idempotent against
596
+ // duplicate signals.
597
+ let shuttingDown = false;
598
+ const gracefulShutdown = async (signal) => {
599
+ if (shuttingDown)
600
+ return;
601
+ shuttingDown = true;
602
+ console.log(`\n👋 Received ${signal}, shutting down...`);
482
603
  sessionManager.stop();
483
604
  webServer?.close();
484
605
  acpServer?.close();
606
+ // Stop the outbox worker BEFORE messengers so no new send attempts
607
+ // race the adapter shutdown. Worker is single-threaded so a pending
608
+ // tick will finish its current row before we move on.
609
+ try {
610
+ sink.stopWorker();
611
+ }
612
+ catch { /* ignore */ }
613
+ // Phase 3: any inline job that was still pending/running gets stamped
614
+ // 'interrupted' so the next startup's recovery scan can offer the user
615
+ // a retry. Must happen BEFORE closeJobBoardDb() below, while the
616
+ // handle is still open.
617
+ try {
618
+ const { markRunningInlineJobsInterrupted } = await import('./core/job-board.js');
619
+ markRunningInlineJobsInterrupted(`agim shutdown (${signal})`);
620
+ }
621
+ catch { /* ignore */ }
485
622
  // Stop all messengers
486
623
  for (const name of registry.listMessengers()) {
487
624
  const messenger = registry.getMessenger(name);
@@ -527,8 +664,16 @@ program
527
664
  closeMemosDb(); // also stops the sweep timer
528
665
  }
529
666
  catch { /* ignore */ }
667
+ try {
668
+ const { stopOutboxRetentionSweep, closeOutboxDb } = await import('./core/outbox.js');
669
+ stopOutboxRetentionSweep();
670
+ closeOutboxDb();
671
+ }
672
+ catch { /* ignore */ }
530
673
  process.exit(0);
531
- });
674
+ };
675
+ process.on('SIGINT', () => { void gracefulShutdown('SIGINT'); });
676
+ process.on('SIGTERM', () => { void gracefulShutdown('SIGTERM'); });
532
677
  // Wait forever
533
678
  await new Promise(() => { });
534
679
  });
@@ -581,6 +726,11 @@ async function handleMessage(ctx, defaultAgent) {
581
726
  let dismissThinking;
582
727
  const looksLikeApproval = /^\s*[yn]\s*$/i.test(message.text) ||
583
728
  /^\s*(批准|拒绝|同意|不同意|通过|可以|不可以|不行|✅|❌)\s*$/.test(message.text);
729
+ // Inline-job tracking id (Phase 2). Declared at function scope so the
730
+ // catch block below can mark the row 'failed' on any throw. -1 means
731
+ // "not tracked": either the message didn't reach willInvokeAgent, or
732
+ // createInlineJob fell back to in-memory (DB locked / IMHUB_INLINE_JOB_TRACKING=0).
733
+ let inlineJobId = -1;
584
734
  try {
585
735
  if (messenger.sendTyping) {
586
736
  messenger.sendTyping(message.threadId, true).catch(() => { });
@@ -642,6 +792,66 @@ async function handleMessage(ctx, defaultAgent) {
642
792
  }
643
793
  };
644
794
  }
795
+ // Inline-job tracking: every agent-bound inbound message gets a row in
796
+ // the jobs table (kind='inline') so the state machine survives crashes
797
+ // and the Phase 3 startup scan can offer the user a retry. Built-ins
798
+ // (status/audit/router) return string immediately and never touch the
799
+ // agent — no point tracking them. Inline tracking can be killed wholesale
800
+ // via env IMHUB_INLINE_JOB_TRACKING=0 (createInlineJob returns -1, every
801
+ // mark*Job helper no-ops on id<=0). createInlineJob itself never throws —
802
+ // if the DB is locked or disk-full we get -1 and the user path runs
803
+ // unchanged on the in-memory fast path.
804
+ //
805
+ // We create the row BEFORE entering the runOnThread queue so a stuck
806
+ // queue (or runAgentInvocation crash before the agent runs) still leaves
807
+ // a 'pending' record we can attribute to the user. The agent name is
808
+ // initially defaultAgent and gets corrected from inside onAgentResolved
809
+ // once routing picks the real one.
810
+ //
811
+ // EXCEPTION: when the cli's recovery interceptor handed us a pre-created
812
+ // replacement-job id via ctx.inlineJobId, reuse it. The recovery path
813
+ // had to create the row up-front so it could stamp replaced_by on the
814
+ // old interrupted row atomically; building a second row here would
815
+ // leave the recovery-created one as an orphan 'pending' that retention
816
+ // eventually GCs.
817
+ if (willInvokeAgent) {
818
+ if (ctx.inlineJobId && ctx.inlineJobId > 0) {
819
+ inlineJobId = ctx.inlineJobId;
820
+ }
821
+ else {
822
+ inlineJobId = createInlineJob({
823
+ agent: defaultAgent,
824
+ prompt: message.text,
825
+ threadKey: `${platform}:${ctx.channelId}:${message.threadId}`,
826
+ creatorId: message.userId ?? '',
827
+ });
828
+ }
829
+ // A2A-L1: stash on routeCtx so callAgentWithHistory can forward it
830
+ // through AgentSendOpts → adapter.registerRun → RunContext. When the
831
+ // agent fires mcp__imhub__call_agent the bus then knows what
832
+ // parent_id to stamp on the callee inline-job row. User-originated
833
+ // runs are call_depth=0; nested runs (A2A callees) never reach this
834
+ // handler — a2a.callAgentByName supplies their callDepth directly.
835
+ if (inlineJobId > 0) {
836
+ routeCtx.parentJobId = inlineJobId;
837
+ // handleMessage is only the user-inbound path — callDepth is
838
+ // always 0 here. (A2A callees don't traverse this handler; they
839
+ // go through a2a.callAgentByName which passes its own callDepth
840
+ // directly to target.sendPrompt.)
841
+ routeCtx.callDepth = 0;
842
+ }
843
+ }
844
+ // Chain the agent-name correction onto the existing onAgentResolved
845
+ // (which already wires native session ids). Both run sequentially —
846
+ // the inline-job update is fast and best-effort.
847
+ const prevOnResolved = routeCtx.onAgentResolved;
848
+ if (prevOnResolved || inlineJobId > 0) {
849
+ routeCtx.onAgentResolved = async (resolvedAgent) => {
850
+ if (prevOnResolved)
851
+ await prevOnResolved(resolvedAgent);
852
+ updateJobAgent(inlineJobId, resolvedAgent);
853
+ };
854
+ }
645
855
  // The thinking placeholder + agent invocation + response delivery are
646
856
  // wrapped into a closure so agent-bound messages can be serialized
647
857
  // per-thread via runOnThread (see core/thread-queue.ts). Without
@@ -660,6 +870,10 @@ async function handleMessage(ctx, defaultAgent) {
660
870
  logger.debug({ err: String(err) }, 'sendThinking failed');
661
871
  }
662
872
  }
873
+ // Mark inline job 'running' just before the agent actually starts —
874
+ // anything that throws before this leaves the row in 'pending' which
875
+ // is the correct "never ran" semantic.
876
+ markJobRunning(inlineJobId);
663
877
  const result = await routeMessage(parsed, routeCtx);
664
878
  const dismiss = async () => {
665
879
  if (dismissThinking) {
@@ -674,7 +888,14 @@ async function handleMessage(ctx, defaultAgent) {
674
888
  if (typeof result === 'string') {
675
889
  await stopTyping();
676
890
  await dismiss();
677
- await messenger.sendMessage(message.threadId, await maybePrefix(result));
891
+ markJobCompleted(inlineJobId, result);
892
+ const sinkRes = await sink.deliver({
893
+ platform, channelId: ctx.channelId, threadId: message.threadId,
894
+ payload: await maybePrefix(result), kind: 'text',
895
+ jobId: inlineJobId > 0 ? inlineJobId : undefined,
896
+ });
897
+ if (sinkRes.outboxId > 0)
898
+ linkJobOutbox(inlineJobId, sinkRes.outboxId);
678
899
  logger.info({ event: 'message.sent', responseLen: result.length });
679
900
  }
680
901
  else {
@@ -686,10 +907,22 @@ async function handleMessage(ctx, defaultAgent) {
686
907
  await stopTyping();
687
908
  await dismiss();
688
909
  if (fullResponse) {
689
- await messenger.sendMessage(message.threadId, await maybePrefix(fullResponse));
910
+ markJobCompleted(inlineJobId, fullResponse);
911
+ const sinkRes = await sink.deliver({
912
+ platform, channelId: ctx.channelId, threadId: message.threadId,
913
+ payload: await maybePrefix(fullResponse), kind: 'text',
914
+ jobId: inlineJobId > 0 ? inlineJobId : undefined,
915
+ });
916
+ if (sinkRes.outboxId > 0)
917
+ linkJobOutbox(inlineJobId, sinkRes.outboxId);
690
918
  logger.info({ event: 'message.sent', responseLen: fullResponse.length });
691
919
  }
692
920
  else {
921
+ // Empty response is treated as a failure of the inline job —
922
+ // there's nothing to deliver and we don't want the row stuck in
923
+ // 'running' forever. The user already sees nothing; the row gets
924
+ // pruned out of /jobs lists.
925
+ markJobFailed(inlineJobId, 'agent returned empty response');
693
926
  logger.warn({ event: 'message.empty_response' });
694
927
  }
695
928
  }
@@ -699,12 +932,11 @@ async function handleMessage(ctx, defaultAgent) {
699
932
  await runOnThread(queueKey, runAgentInvocation, {
700
933
  onQueued: async (ahead) => {
701
934
  logger.info({ event: 'message.queued', ahead, queueKey });
702
- try {
703
- await messenger.sendMessage(message.threadId, `📥 已收到(前面还有 ${ahead} 条在处理,稍后给你结果)`);
704
- }
705
- catch (err) {
706
- logger.debug({ err: String(err) }, 'queued-notice send failed');
707
- }
935
+ await sink.deliver({
936
+ platform, channelId: ctx.channelId, threadId: message.threadId,
937
+ payload: `📥 已收到(前面还有 ${ahead} 条在处理,稍后给你结果)`,
938
+ kind: 'text',
939
+ });
708
940
  },
709
941
  });
710
942
  }
@@ -722,12 +954,15 @@ async function handleMessage(ctx, defaultAgent) {
722
954
  }
723
955
  catch { /* ignore */ }
724
956
  }
725
- try {
726
- await messenger.sendMessage(message.threadId, '❌ An error occurred processing your message.');
727
- }
728
- catch {
729
- // Ignore
730
- }
957
+ // Mark inline job 'failed' before sending the user error — markJobFailed
958
+ // is a no-op for id<=0 (built-ins / inline tracking disabled) and for
959
+ // already-terminal rows. The error blurb still goes through sink so it
960
+ // survives an IM hiccup.
961
+ markJobFailed(inlineJobId, errMsg);
962
+ await sink.deliver({
963
+ platform, channelId: ctx.channelId, threadId: message.threadId,
964
+ payload: '❌ An error occurred processing your message.', kind: 'text',
965
+ }).catch(() => { });
731
966
  }
732
967
  }
733
968
  program