agim-cli 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +147 -0
  2. package/README.md +8 -1
  3. package/README.zh-CN.md +8 -1
  4. package/dist/cli.js +265 -30
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/a2a.d.ts +55 -0
  7. package/dist/core/a2a.d.ts.map +1 -0
  8. package/dist/core/a2a.js +254 -0
  9. package/dist/core/a2a.js.map +1 -0
  10. package/dist/core/approval-bus.d.ts +40 -0
  11. package/dist/core/approval-bus.d.ts.map +1 -1
  12. package/dist/core/approval-bus.js +125 -0
  13. package/dist/core/approval-bus.js.map +1 -1
  14. package/dist/core/approval-router.d.ts.map +1 -1
  15. package/dist/core/approval-router.js +31 -2
  16. package/dist/core/approval-router.js.map +1 -1
  17. package/dist/core/artifacts.d.ts +86 -0
  18. package/dist/core/artifacts.d.ts.map +1 -0
  19. package/dist/core/artifacts.js +306 -0
  20. package/dist/core/artifacts.js.map +1 -0
  21. package/dist/core/commands/a2a.d.ts +3 -0
  22. package/dist/core/commands/a2a.d.ts.map +1 -0
  23. package/dist/core/commands/a2a.js +162 -0
  24. package/dist/core/commands/a2a.js.map +1 -0
  25. package/dist/core/commands/job.d.ts.map +1 -1
  26. package/dist/core/commands/job.js +11 -2
  27. package/dist/core/commands/job.js.map +1 -1
  28. package/dist/core/commands/outbox.d.ts +3 -0
  29. package/dist/core/commands/outbox.d.ts.map +1 -0
  30. package/dist/core/commands/outbox.js +92 -0
  31. package/dist/core/commands/outbox.js.map +1 -0
  32. package/dist/core/job-board.d.ts +122 -1
  33. package/dist/core/job-board.d.ts.map +1 -1
  34. package/dist/core/job-board.js +432 -21
  35. package/dist/core/job-board.js.map +1 -1
  36. package/dist/core/job-recovery.d.ts +48 -0
  37. package/dist/core/job-recovery.d.ts.map +1 -0
  38. package/dist/core/job-recovery.js +185 -0
  39. package/dist/core/job-recovery.js.map +1 -0
  40. package/dist/core/message-sink.d.ts +63 -0
  41. package/dist/core/message-sink.d.ts.map +1 -0
  42. package/dist/core/message-sink.js +296 -0
  43. package/dist/core/message-sink.js.map +1 -0
  44. package/dist/core/outbox.d.ts +71 -0
  45. package/dist/core/outbox.d.ts.map +1 -0
  46. package/dist/core/outbox.js +301 -0
  47. package/dist/core/outbox.js.map +1 -0
  48. package/dist/core/reminders.d.ts.map +1 -1
  49. package/dist/core/reminders.js +12 -1
  50. package/dist/core/reminders.js.map +1 -1
  51. package/dist/core/restart-completion.d.ts.map +1 -1
  52. package/dist/core/restart-completion.js +18 -1
  53. package/dist/core/restart-completion.js.map +1 -1
  54. package/dist/core/router.d.ts +8 -0
  55. package/dist/core/router.d.ts.map +1 -1
  56. package/dist/core/router.js +16 -0
  57. package/dist/core/router.js.map +1 -1
  58. package/dist/core/types.d.ts +22 -0
  59. package/dist/core/types.d.ts.map +1 -1
  60. package/dist/plugins/agents/claude-code/index.d.ts.map +1 -1
  61. package/dist/plugins/agents/claude-code/index.js +5 -0
  62. package/dist/plugins/agents/claude-code/index.js.map +1 -1
  63. package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts +46 -0
  64. package/dist/plugins/agents/claude-code/mcp-approval-server.d.ts.map +1 -1
  65. package/dist/plugins/agents/claude-code/mcp-approval-server.js +158 -0
  66. package/dist/plugins/agents/claude-code/mcp-approval-server.js.map +1 -1
  67. package/dist/plugins/agents/codex/index.d.ts.map +1 -1
  68. package/dist/plugins/agents/codex/index.js +5 -0
  69. package/dist/plugins/agents/codex/index.js.map +1 -1
  70. package/dist/plugins/agents/opencode/opencode-stdio-adapter.d.ts.map +1 -1
  71. package/dist/plugins/agents/opencode/opencode-stdio-adapter.js +5 -0
  72. package/dist/plugins/agents/opencode/opencode-stdio-adapter.js.map +1 -1
  73. package/dist/web/server.d.ts.map +1 -1
  74. package/dist/web/server.js +28 -16
  75. package/dist/web/server.js.map +1 -1
  76. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -4,6 +4,153 @@ All notable changes to this project will be documented in this file.
4
4
 
5
5
  ## [Unreleased]
6
6
 
7
+ ## [1.1.3] - 2026-05-15
8
+
9
+ ### Added — Agent-to-Agent Layer 2 (shared artifacts)
10
+
11
+ - **File exchange between agents.** `mcp__imhub__call_agent` now accepts
12
+ optional `inputs[]` and `expectOutputs[]` so caller and callee can
13
+ hand off files instead of round-tripping bytes through prompts /
14
+ results.
15
+ - `inputs` entries pick a source: `fromAbsolutePath` (host file),
16
+ `fromCallerOutput` (caller's own `_agim-output/`), or inline
17
+ `content` (UTF-8 string).
18
+ - `expectOutputs` is a soft contract — missing files become entries
19
+ in `warnings`, never a hard failure.
20
+ - Each A2A run gets `~/.agim/artifacts/<jobId>/{_agim-input,_agim-output}/`.
21
+ The callee learns about its workspace through a `[A2A workspace]`
22
+ envelope prepended to its prompt: absolute paths, input inventory
23
+ with sizes, expected-output checklist.
24
+ - Response includes `artifactsDir`, `outputs[]` (name + bytes + mtime),
25
+ and optional `warnings[]`. Caller reads `outputs[]` and uses the
26
+ `Read` tool to pull file contents on demand.
27
+ - **Hard-link first, copy second.** `fromAbsolutePath` and
28
+ `fromCallerOutput` use `link(2)` when source and destination share a
29
+ filesystem — near-zero cost for read-only inputs. Falls back to plain
30
+ copy on EXDEV.
31
+ - **Size caps**: `IMHUB_A2A_ARTIFACT_MAX_BYTES_PER_FILE` (default 50 MB),
32
+ `IMHUB_A2A_ARTIFACT_MAX_BYTES_PER_JOB` (200 MB),
33
+ `IMHUB_A2A_ARTIFACT_MAX_FILES_PER_JOB` (200). Failed setup fails the
34
+ job before any tokens are spent.
35
+ - **Retention follows the inline-job row.** When job-board's sweep
36
+ deletes a row, `pruneArtifactsBatch` removes its directory.
37
+ Fire-and-forget — orphan dirs (if disk pruning fails) never block DB cleanup.
38
+ - **`/a2a tree <id>`** now lists `📎 <name> (<size>)` under each row
39
+ that left files in `_agim-output/`.
40
+
41
+ ### Changed
42
+
43
+ - `mcp__imhub__call_agent` description grew a paragraph documenting the
44
+ L2 fields; existing L1 callers (no `inputs` / `expectOutputs`) keep
45
+ the L1 string-in / string-out behavior unchanged.
46
+
47
+ ### Reference docs
48
+
49
+ - `docs/a2a-l2-plan.md` — full Layer 2 design, decision register, and
50
+ the L3 punt list (cross-thread artifacts, remote ACP artifacts,
51
+ versioning), which is documented but not implemented.
52
+
53
+ ## [1.1.2] - 2026-05-15
54
+
55
+ ### Added — Task recovery (3-phase, all on by default)
56
+
57
+ - **Outbox-backed IM delivery.** Every outbound message (replies, reminder
58
+ fires, approval prompts/receipts, restart-completion notices, web
59
+ notifications) now flows through a SQLite-backed outbox in
60
+ `~/.agim/outbox.db`. A worker drains pending rows with exponential
61
+ backoff (1s → 5s → 30s → 5min → 30min → 2h → `giving_up`). Replies
62
+ survive IM platform glitches, brief disconnects, and adapter restarts
63
+ — no more "agent answered but the user never got it" black holes.
64
+ - `/outbox status` aggregate counts.
65
+ - `/outbox list [N]` recent rows.
66
+ - `/outbox failed` rows in `giving_up`.
67
+ - `/outbox retry <id>` resurrect a `giving_up` row.
68
+ - Retention: 24h after `delivered` / `giving_up`. Override with
69
+ `IMHUB_OUTBOX_RETENTION_HOURS`.
70
+ - **Inline-job tracking.** Every inbound agent-bound message now creates
71
+ a row in `jobs.db` with `kind='inline'`, going through the full
72
+ lifecycle `pending → running → completed → delivered` (or `failed`).
73
+ The state machine survives crashes; this is the substrate Phase 3
74
+ recovery and A2A both rely on.
75
+ - Visible in `/job list` and per-row in `/job check <id>`.
76
+ - Retention: 24h (vs 30d for `kind='job'` rows from explicit `/job
77
+ create`). Override with `IMHUB_INLINE_JOB_RETENTION_HOURS`.
78
+ - Kill switch: `IMHUB_INLINE_JOB_TRACKING=0` disables creation; the
79
+ user reply path falls back to the unchanged in-memory pipeline.
80
+ - **Crash-recovery flow.** When agim exits (SIGTERM/SIGINT/crash), any
81
+ in-flight inline job is stamped `interrupted`. On startup, jobs
82
+ interrupted within the last 10 minutes get a per-thread retry prompt
83
+ through the outbox: `⚠️ 上次的消息被服务重启中断了:「<prompt 摘要>…」
84
+ 回复 1 重发 / 2 取消(10 分钟内有效)`.
85
+ - Reply `1` → fresh inline job with the original prompt, old row
86
+ stamped `replaced_by=<new id>`.
87
+ - Reply `2` → old row marked `cancelled`.
88
+ - No reply → old row swept to `abandoned` by the next startup.
89
+ - Window override: `IMHUB_RECOVERY_WINDOW_MS` (default 600000).
90
+
91
+ ### Added — Agent-to-Agent (A2A) Layer 1
92
+
93
+ - **`mcp__imhub__call_agent` MCP tool.** A running agent (claude-code /
94
+ opencode / codex / copilot) can now program-call another agent and
95
+ wait for the result inside its own tool loop. The tool description
96
+ uses imperative + Chinese trigger phrases so models invoke it
97
+ naturally when the user says e.g. "用 codex 帮我跑 git status" or
98
+ "ask opencode to run the tests".
99
+ - **Guardrails enforced inside agim, not on the model**:
100
+ - `IMHUB_A2A_MAX_DEPTH` (default 3) — caps nested call chains.
101
+ - Self-call ban — `claude-code` can't recursively call `claude-code`.
102
+ - Workspace whitelist — callee must be in the caller's workspace
103
+ `agents[]`. Caller's `userId` carries through.
104
+ - `IMHUB_A2A_TIMEOUT_DEFAULT_MS` (default 600_000) — accumulation
105
+ timeout. Callee keeps running on timeout; caller stops waiting.
106
+ - Kill switch: `IMHUB_A2A_ENABLED=0` disables the whole feature.
107
+ - **`/a2a stats | recent [N] | tree <id>`** — observability for A2A
108
+ traffic. Each callee row is linked to its caller via `parent_id`
109
+ + `call_depth`, so `/a2a tree <root>` renders the full chain.
110
+ - **Audit integration** — `intent='a2a'` lights up in `/audit` queries
111
+ and feeds the same per-user budget that user-originated calls do.
112
+
113
+ ### Changed
114
+
115
+ - `SIGTERM` now reuses the same graceful-shutdown path as `SIGINT`.
116
+ Prior to 1.1.2, `systemctl restart` invocations skipped
117
+ session-manager / WAL-checkpoint cleanup because only `SIGINT` was hooked.
118
+ Recommended systemd unit drop-in: `KillMode=mixed` so children get
119
+ `TimeoutStopSec` of grace before SIGKILL.
120
+ - `jobs` table schema migrated (idempotent ALTER TABLE) with
121
+ `thread_key`, `kind`, `started_at`, `delivered_at`, `replaced_by`,
122
+ `last_outbox_id`, `parent_id`, `call_depth`. Pre-v1.0 databases
123
+ (which still had only the original 8 columns) are auto-upgraded
124
+ on first startup.
125
+ - `MessengerAdapter.sendMessage` call sites consolidated through
126
+ `core/message-sink.ts`. Reminder polish / memo polish run upstream of
127
+ sink and are unaffected; sink only takes the final plain payload.
128
+
129
+ ### Fixed
130
+
131
+ - Schema-vs-migrate ordering broke job-board on legacy databases. The
132
+ v1.0 ownership migration's `CREATE INDEX idx_jobs_creator ON
133
+ jobs(creator_id)` was inside the schema string, which `sqlite-helper`
134
+ exec()s **before** `migrateOwnership`. Against a pre-v1.0
135
+ `~/.agim/jobs.db` (no `creator_id` column yet) this threw
136
+ `no such column: creator_id` and disabled the entire job-board for
137
+ the lifetime of the process. The bug shipped silently in v1.0 and
138
+ only surfaced when Phase 2 forced everyone to actually look at the
139
+ jobs table. Column-dependent indexes now live inside `migrateColumns`,
140
+ after the ALTER TABLE pass.
141
+ - Recovery retry path no longer creates orphan `pending` rows. The
142
+ retry interceptor's pre-created replacement job (so it could stamp
143
+ `replaced_by` atomically) is now reused by `handleMessage` instead
144
+ of being shadowed by a second `createInlineJob` call.
145
+
146
+ ### Reference docs
147
+
148
+ - `docs/task-recovery-plan.md` — 3-phase design + risk register +
149
+ cross-session handoff guide.
150
+ - `docs/task-recovery-testing.md` — grey-deploy step-by-step verifier.
151
+ - `docs/agent-to-agent-plan.md` — 3-layer A2A design (L1 implemented;
152
+ L2 shared-artifacts and L3 workflow-DSL are documented for later).
153
+
7
154
  ## [1.1.1] - 2026-05-14
8
155
 
9
156
  ### Changed
package/README.md CHANGED
@@ -18,6 +18,10 @@
18
18
  ## Highlights
19
19
 
20
20
  - **5 messengers + email, 4+ agents** — WeChat (image / file / voice), Feishu, DingTalk (image / voice with server-side ASR), Telegram, Discord, Email (SMTP); Claude Code, Codex, Copilot, OpenCode, plus any ACP endpoint
21
+ - **Agent-to-Agent (A2A)** — agents call other agents inline via `mcp__imhub__call_agent`. Just say *"用 codex 帮我跑 git status"* / *"ask opencode to run the tests"* — the active agent hands off, waits, integrates the reply. Guardrails: depth limit (default 3), self-call ban, workspace whitelist, per-user budget. Observability: `/a2a stats | recent | tree <id>`. Disable with `IMHUB_A2A_ENABLED=0`.
22
+ - **A2A shared artifacts (v1.1.3+)** — caller drops files into the callee's workspace via `inputs[]` (`fromAbsolutePath` / `fromCallerOutput` / inline `content`), and callee writes products to `~/.agim/artifacts/<jobId>/_agim-output/`. Returned `outputs[]` lets the caller read files on demand. Hard-link first, copy fallback. Size caps configurable.
23
+ - **Crash-safe delivery** — every outbound message (replies, reminders, approvals, restart notices) flows through a SQLite outbox; a worker drains pending rows with exponential backoff. IM glitches and brief disconnects no longer drop replies. `/outbox status | list | failed | retry <id>`.
24
+ - **In-flight job recovery** — every inbound message becomes a tracked inline job (`pending → running → completed → delivered`). When agim is restarted mid-flight, the next startup pings each thread whose job was interrupted within the last 10 min: *"⚠️ 上次的消息被服务重启中断了:「…」回复 1 重发 / 2 取消"*. Per-job lifecycle visible in `/job list` and `/job check <id>`.
21
25
  - **`/remind` reminders subsystem** — one-shot + recurring (`每天8点喝水`); LLM auto-detects reminder intent in casual chat; LLM polishes delivery; agent MCP tools; web `/reminders` page; email + IM delivery
22
26
  - **`/memo` 5W1H persistent memory** — generic "what / who / when / where / how / why" notes with optional GPS capture (browser geolocation H5 + Baidu geocoder); permanent by default, transient bucket for parking spots / today's meeting; agents store + retrieve via MCP tools so casual mentions get remembered automatically
23
27
  - **Browser dashboard** — chat UI, tasks panel (jobs / schedules / approvals / health / files / audit), reminders panel, settings page with workspace CRUD
@@ -25,7 +29,7 @@
25
29
  - **Rich media in WeChat / Telegram / DingTalk** — receive images, files, videos; voice messages transcribed via WeChat STT, DingTalk's server-side ASR, OpenAI Whisper, or whisper.cpp (per-platform fallback chain)
26
30
  - **Smart routing** — intent classifier (CJK + ASCII), sticky sessions, circuit breaker, rate limiter
27
31
  - **Multi-tenant workspaces** — per-workspace agent whitelist, rate limits, command-level ACL
28
- - **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention
32
+ - **Persistent jobs & cron** — SQLite-backed, survives restarts, 30-day retention (24h for auto-tracked inline jobs)
29
33
  - **Observability** — structured logging (pino + traceId), Prometheus metrics, audit log
30
34
  - **Security** — timing-safe auth, SSRF guards, credential file permissions, approval socket entropy
31
35
 
@@ -126,8 +130,11 @@ agim messengers # List available messengers
126
130
  | `/remind …` | Reminders — see [Reminders](#reminders) below |
127
131
  | `/memo …` | 5W1H persistent memory — see [Memos](#memos) below (aliases `/记`, `/note`) |
128
132
  | `/job`, `/cron`, `/audit`, `/stats` | Manage jobs, cron schedules, audit, stats (`/schedule` aliases `/cron` until v0.4.0) |
133
+ | `/outbox status\|list\|failed\|retry <id>` | Inspect & operate the persistent IM delivery queue (v1.1.2+) |
134
+ | `/a2a stats\|recent\|tree <id>` | Agent-to-Agent observability — chain, latency, callee histogram (v1.1.2+) |
129
135
  | `/router status\|explain` | Inspect routing decisions |
130
136
  | `y` / `n` / `批准` / `拒绝` | Approve / deny Claude tool call (or reminder confirmation card) |
137
+ | `1` / `2` | After a service restart, reply `1` to redo the interrupted message or `2` to cancel (10 min window) |
131
138
 
132
139
  ## Human-in-the-loop Tool Approval
133
140
 
package/README.zh-CN.md CHANGED
@@ -14,6 +14,10 @@
14
14
  ## 亮点
15
15
 
16
16
  - **5 种 IM + 邮件,4+ 种 Agent** — 微信(图片 / 文件 / 语音)、飞书、钉钉(图片 / 语音,自带服务端 ASR)、Telegram、Discord、Email(SMTP);Claude Code、Codex、Copilot、OpenCode,以及任意 ACP 端点
17
+ - **Agent 之间互调(A2A)** — Agent 可以在自己的工具循环里直接调另一个 Agent,比如直接说「用 codex 帮我跑 git status」/「让 opencode 跑一下测试」。护栏在 agim 里强制:递归深度限制(默认 3)、禁止自调、工作区白名单、按人预算共享;可观测:`/a2a stats | recent | tree <id>`;`IMHUB_A2A_ENABLED=0` 关闭
18
+ - **A2A 共享文件传递(v1.1.3+)** — caller 可通过 `inputs[]`(`fromAbsolutePath` / `fromCallerOutput` / 内联 `content`)把文件放进 callee 工作区;callee 把产物写到 `~/.agim/artifacts/<jobId>/_agim-output/`,返回的 `outputs[]` 让 caller 按需读。同文件系统优先 hard-link,否则复制。大小上限 env 可调
19
+ - **投递不丢消息** — Agent 回复 / 提醒 / 审批卡片 / 重启通知 全部走 SQLite outbox + 指数退避 worker,IM 平台抖动或断连不再丢回复;`/outbox status | list | failed | retry <id>`
20
+ - **崩溃中断可恢复** — 每条入站消息都建一条 inline job(`pending → running → completed → delivered`)。agim 重启时正在跑的任务会标 `interrupted`,下次启动 10 分钟窗口内对每条受影响的会话发:「⚠️ 上次的消息被服务重启中断了:「…」回复 1 重发 / 2 取消」。每条 job 的完整生命周期可在 `/job list` + `/job check <id>` 查看
17
21
  - **`/remind` 提醒子系统** — 一次性 + 定期(`每天8点喝水`);非 slash 消息 LLM 自动识别意图;LLM 润色投递文本;Agent MCP 工具直接创建;Web `/reminders` 管理页;邮件 + IM 双通道投递
18
22
  - **`/memo` 5W1H 持久记忆库** — 通用「what / who / when / where / how / why」记事本,可选 GPS(浏览器地理定位 H5 + 百度地理编码);默认永久保存,临时类(停车、当天会议)单独走 24h 桶;Agent 通过 MCP 工具自动落库 + 检索,闲聊里提到的地点 / 生日 / 待办都能记下
19
23
  - **浏览器控制台** — 对话界面、任务面板(Jobs / 调度 / 审批 / 健康 / 文件 / 审计)、提醒面板、设置页含工作区 CRUD
@@ -21,7 +25,7 @@
21
25
  - **微信 / Telegram / 钉钉 富媒体** — 接收图片、文件、视频;语音消息按平台走最佳转写链路:微信 STT、钉钉服务端 ASR、OpenAI Whisper、whisper.cpp
22
26
  - **智能路由** — 意图分类(中英文)、Sticky 会话、断路器、限流器
23
27
  - **多租户工作区** — 按工作区隔离 Agent 白名单、限流、命令级 ACL
24
- - **持久化任务与定时** — SQLite 落地,重启不丢,30 天自动修剪
28
+ - **持久化任务与定时** — SQLite 落地,重启不丢,显式 `/job` 30 天 / 自动跟踪的 inline job 24 小时
25
29
  - **可观测** — 结构化日志(pino + traceId)、Prometheus 指标、审计日志
26
30
  - **安全** — 常量时鉴权、SSRF 防护、凭证文件权限、审批 socket 熵值
27
31
 
@@ -118,8 +122,11 @@ agim messengers # 列出可用 IM
118
122
  | `/remind …` | 提醒子系统 — 详见 [提醒](#提醒) |
119
123
  | `/memo …` | 5W1H 持久记忆库 — 详见 [备忘](#备忘)(别名 `/记`、`/note`)|
120
124
  | `/job`、`/cron`、`/audit`、`/stats` | 管理任务、定时、审计、统计(`/schedule` 仍是 `/cron` 别名,v0.4.0 移除) |
125
+ | `/outbox status\|list\|failed\|retry <id>` | 查看与重发 IM 投递队列(v1.1.2+) |
126
+ | `/a2a stats\|recent\|tree <id>` | 查看 Agent 之间互调的统计 / 最近 N 次 / 调用链树(v1.1.2+) |
121
127
  | `/router status\|explain` | 查看路由策略 |
122
128
  | `y` / `n` / `批准` / `拒绝` | 同意 / 拒绝(工具调用 或 提醒确认卡片) |
129
+ | `1` / `2` | 服务重启后看到「上次消息中断」提示时,回 `1` 重发 / `2` 取消(10 分钟内有效) |
123
130
 
124
131
  ## 工具调用人审
125
132
 
package/dist/cli.js CHANGED
@@ -32,6 +32,8 @@ import { consumeLocationContext, formatLocationAnnotation } from './core/locatio
32
32
  import { tryDetectReminderIntent } from './core/remind-intent.js';
33
33
  import { createReminder } from './core/reminders.js';
34
34
  import { setReminderConfirmNotifier } from './core/reminder-rpc.js';
35
+ import { sink } from './core/message-sink.js';
36
+ import { createInlineJob, markJobRunning, markJobCompleted, markJobFailed, linkJobOutbox, updateJobAgent, } from './core/job-board.js';
35
37
  import { checkMessengerConfig, checkAgentAvailability, runMessengerOnboarding, formatAgentInstallHint, formatMessengerStartError, loadConfig as loadOnboardingConfig, saveConfig as saveOnboardingConfig, } from './core/onboarding.js';
36
38
  import { startWebServer } from './web/server.js';
37
39
  import { startACPServer } from './core/acp-server.js';
@@ -260,6 +262,64 @@ program
260
262
  ctx.traceId = traceId;
261
263
  ctx.logger = createLogger({ traceId, platform: ctx.platform, component: 'cli' });
262
264
  ctx.logger.info({ event: 'message.received', text: ctx.message.text.substring(0, 120), userId: ctx.message.userId });
265
+ // Phase 3 — interrupted-job recovery reply. If the user has a
266
+ // pending "retry / cancel" prompt outstanding on this thread AND
267
+ // their reply parses as 1/重发/2/取消, consume the message here
268
+ // before any other interceptor (approval / reminder) sees it.
269
+ // hasPendingRecovery is a cheap map lookup; we only do the full
270
+ // parse when the gate is true.
271
+ const recoveryThreadKey = `${ctx.platform}:${ctx.channelId}:${ctx.message.threadId}`;
272
+ try {
273
+ const { hasPendingRecovery, tryHandleRecoveryReply } = await import('./core/job-recovery.js');
274
+ if (hasPendingRecovery(recoveryThreadKey)) {
275
+ const outcome = tryHandleRecoveryReply(recoveryThreadKey, ctx.message.text);
276
+ if (outcome.kind === 'retried') {
277
+ ctx.logger.info({
278
+ event: 'recovery.retried', oldJobId: outcome.oldJobId, newJobId: outcome.newJobId,
279
+ });
280
+ // Substitute the user's "1" with the original prompt so the
281
+ // rest of the handler reruns the work. handleMessage reads
282
+ // message.text again on its own, so this is the cheapest
283
+ // way to re-enter the normal path without duplicating logic.
284
+ ctx.message.text = outcome.entry.prompt;
285
+ // Hand the freshly-created replacement job id to handleMessage
286
+ // so it doesn't build a second inline row (the recovery flow
287
+ // already created newJobId + stamped replaced_by on the old
288
+ // row). Without this we'd accumulate orphan 'pending' rows
289
+ // that retention eventually GCs — visible but harmless.
290
+ ctx.inlineJobId = outcome.newJobId;
291
+ // Send a short "好的,重发中..." acknowledgement first so the
292
+ // user knows we accepted the choice (the actual agent reply
293
+ // can take many seconds). priority=normal — slot it after
294
+ // anything urgent in the outbox.
295
+ await sink.deliver({
296
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
297
+ payload: '🔁 好的,重发中…',
298
+ kind: 'text',
299
+ });
300
+ // Fall through to the normal handler below with the rewritten
301
+ // message text.
302
+ }
303
+ else if (outcome.kind === 'cancelled') {
304
+ ctx.logger.info({ event: 'recovery.cancelled', oldJobId: outcome.oldJobId });
305
+ await sink.deliver({
306
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
307
+ payload: '✅ 已取消,不再处理上次的消息。',
308
+ kind: 'text',
309
+ });
310
+ return;
311
+ }
312
+ // outcome.kind === 'not-recovery-reply' → user typed something
313
+ // else; fall through to normal handling so we don't swallow
314
+ // unrelated messages.
315
+ }
316
+ }
317
+ catch (err) {
318
+ ctx.logger.warn({
319
+ event: 'recovery.intercept_crashed',
320
+ err: err instanceof Error ? err.message : String(err),
321
+ }, 'recovery intercept threw — falling through to normal flow');
322
+ }
263
323
  // Approval interception comes BEFORE the agent router. If a pending
264
324
  // approval exists for this thread and the message is a y/n-style
265
325
  // reply, we resolve the approval and stop. Anything else routes
@@ -295,7 +355,10 @@ program
295
355
  const replyDecision = tryConsumePendingReminderReply(tk, ctx.message.text);
296
356
  if (replyDecision) {
297
357
  if (replyDecision.decision === 'cancel') {
298
- await messenger.sendMessage(ctx.message.threadId, '✅ 已忽略这条提醒建议');
358
+ await sink.deliver({
359
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
360
+ payload: '✅ 已忽略这条提醒建议', kind: 'text',
361
+ });
299
362
  }
300
363
  else {
301
364
  const p = replyDecision.pending;
@@ -311,11 +374,18 @@ program
311
374
  recurrence: p.recurrence,
312
375
  });
313
376
  const recurLine = p.recurrence ? `\n 循环:${p.recurrence}` : '';
314
- await messenger.sendMessage(ctx.message.threadId, `✅ 已创建提醒 #${id}\n 触发:${p.fireAt.toLocaleString()}${recurLine}\n 内容:${p.text}`);
377
+ await sink.deliver({
378
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
379
+ payload: `✅ 已创建提醒 #${id}\n 触发:${p.fireAt.toLocaleString()}${recurLine}\n 内容:${p.text}`,
380
+ kind: 'text',
381
+ });
315
382
  }
316
383
  catch (err) {
317
384
  const msg = err instanceof Error ? err.message : String(err);
318
- await messenger.sendMessage(ctx.message.threadId, `❌ 创建失败:${msg}`);
385
+ await sink.deliver({
386
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
387
+ payload: `❌ 创建失败:${msg}`, kind: 'text',
388
+ });
319
389
  }
320
390
  }
321
391
  ctx.logger.info({ event: 'message.consumed_by_pending_reminder', decision: replyDecision.decision });
@@ -354,7 +424,10 @@ program
354
424
  reply = await handleStopCommand('', routeCtx);
355
425
  else
356
426
  reply = await handleStatusCommand('', routeCtx);
357
- await messenger.sendMessage(ctx.message.threadId, reply);
427
+ await sink.deliver({
428
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
429
+ payload: reply, kind: 'text',
430
+ });
358
431
  return;
359
432
  }
360
433
  }
@@ -372,7 +445,10 @@ program
372
445
  text: ctx.message.text,
373
446
  });
374
447
  if (detected.handled && detected.reply) {
375
- await messenger.sendMessage(ctx.message.threadId, detected.reply);
448
+ await sink.deliver({
449
+ platform: ctx.platform, channelId: ctx.channelId, threadId: ctx.message.threadId,
450
+ payload: detected.reply, kind: 'text',
451
+ });
376
452
  ctx.logger.info({ event: 'message.proposed_reminder' });
377
453
  return;
378
454
  }
@@ -412,18 +488,54 @@ program
412
488
  });
413
489
  console.log('✅ Approval router wired to messengers');
414
490
  }
491
+ // Start the outbox worker AFTER all messengers have been registered/
492
+ // started. If we started it earlier, every queued row from a prior run
493
+ // would fail "messenger not registered" on the first tick and bounce
494
+ // into backoff — minor but pointless. Worker is single-threaded and
495
+ // ticks at IMHUB_OUTBOX_TICK_MS (default 1s).
496
+ sink.startWorker();
497
+ console.log('✅ Message-sink worker started');
498
+ // Phase 3 — startup recovery scan. Look at jobs.db for inline rows
499
+ // that got 'interrupted' on the previous shutdown and are still inside
500
+ // the recovery window (default 10 min, env IMHUB_RECOVERY_WINDOW_MS).
501
+ // For each, send a "回 1 重发 / 2 取消" prompt via sink — outbox will
502
+ // hold the notification until the messenger is fully connected. Older
503
+ // interrupted rows transition to 'abandoned' so we don't ask the user
504
+ // to retry something they sent half an hour ago.
505
+ try {
506
+ const { scanInterruptedAndNotify, sweepExpiredPending } = await import('./core/job-recovery.js');
507
+ const { notified, abandoned } = await scanInterruptedAndNotify();
508
+ if (notified > 0 || abandoned > 0) {
509
+ console.log(`✅ Job recovery: ${notified} retry-prompt(s) sent, ${abandoned} abandoned`);
510
+ }
511
+ // Tidy expired pending entries every minute. The in-memory map is
512
+ // cheap but a long-lived process accumulates entries from users who
513
+ // never reply.
514
+ const sweepTimer = setInterval(() => { sweepExpiredPending(); }, 60_000);
515
+ if (typeof sweepTimer === 'object' && sweepTimer && 'unref' in sweepTimer) {
516
+ sweepTimer.unref();
517
+ }
518
+ }
519
+ catch (err) {
520
+ console.warn('job-recovery module failed to scan:', err instanceof Error ? err.message : String(err));
521
+ }
415
522
  // Reminder confirm notifier — used by reminder-rpc.ts when an agent
416
523
  // creates a high-frequency + LLM-polish reminder (those go through a
417
524
  // y/n card before landing in the DB). Plain message via the same
418
525
  // messenger the agent is talking on.
419
526
  setReminderConfirmNotifier(async (ctx, message) => {
420
- const messenger = registry.getMessenger(platformToMessengerName(ctx.platform))
421
- ?? registry.getMessenger(ctx.platform);
422
- if (!messenger) {
423
- console.warn(`reminder-confirm: no messenger for platform "${ctx.platform}"`);
424
- return;
425
- }
426
- await messenger.sendMessage(ctx.threadId, message);
527
+ // sink resolves the adapter at delivery time; we just need to canonicalize
528
+ // the messenger name (e.g. 'wechat' → 'wechat-ilink') so the worker picks
529
+ // the right adapter from the registry.
530
+ const canonical = platformToMessengerName(ctx.platform);
531
+ const platform = registry.getMessenger(canonical) ? canonical : ctx.platform;
532
+ await sink.deliver({
533
+ platform,
534
+ channelId: ctx.channelId,
535
+ threadId: ctx.threadId,
536
+ payload: message,
537
+ kind: 'text',
538
+ });
427
539
  });
428
540
  // ============================================
429
541
  // START WEB CHAT SERVER
@@ -476,12 +588,37 @@ program
476
588
  catch (err) {
477
589
  console.warn('restart-completion module failed to load:', err instanceof Error ? err.message : String(err));
478
590
  }
479
- // Keep process alive
480
- process.on('SIGINT', async () => {
481
- console.log('\n👋 Shutting down...');
591
+ // Graceful shutdown. Registered for BOTH SIGINT (Ctrl-C from a terminal)
592
+ // and SIGTERM (systemd / docker stop). Prior to v1.1.2 only SIGINT was
593
+ // hooked, which meant `systemctl restart agim` skipped this entire path —
594
+ // sessionManager.stop, WAL checkpoints, etc. — and the next startup had
595
+ // to recover from a half-flushed state. The handler is idempotent against
596
+ // duplicate signals.
597
+ let shuttingDown = false;
598
+ const gracefulShutdown = async (signal) => {
599
+ if (shuttingDown)
600
+ return;
601
+ shuttingDown = true;
602
+ console.log(`\n👋 Received ${signal}, shutting down...`);
482
603
  sessionManager.stop();
483
604
  webServer?.close();
484
605
  acpServer?.close();
606
+ // Stop the outbox worker BEFORE messengers so no new send attempts
607
+ // race the adapter shutdown. Worker is single-threaded so a pending
608
+ // tick will finish its current row before we move on.
609
+ try {
610
+ sink.stopWorker();
611
+ }
612
+ catch { /* ignore */ }
613
+ // Phase 3: any inline job that was still pending/running gets stamped
614
+ // 'interrupted' so the next startup's recovery scan can offer the user
615
+ // a retry. Must happen BEFORE closeJobBoardDb() below, while the
616
+ // handle is still open.
617
+ try {
618
+ const { markRunningInlineJobsInterrupted } = await import('./core/job-board.js');
619
+ markRunningInlineJobsInterrupted(`agim shutdown (${signal})`);
620
+ }
621
+ catch { /* ignore */ }
485
622
  // Stop all messengers
486
623
  for (const name of registry.listMessengers()) {
487
624
  const messenger = registry.getMessenger(name);
@@ -527,8 +664,16 @@ program
527
664
  closeMemosDb(); // also stops the sweep timer
528
665
  }
529
666
  catch { /* ignore */ }
667
+ try {
668
+ const { stopOutboxRetentionSweep, closeOutboxDb } = await import('./core/outbox.js');
669
+ stopOutboxRetentionSweep();
670
+ closeOutboxDb();
671
+ }
672
+ catch { /* ignore */ }
530
673
  process.exit(0);
531
- });
674
+ };
675
+ process.on('SIGINT', () => { void gracefulShutdown('SIGINT'); });
676
+ process.on('SIGTERM', () => { void gracefulShutdown('SIGTERM'); });
532
677
  // Wait forever
533
678
  await new Promise(() => { });
534
679
  });
@@ -581,6 +726,11 @@ async function handleMessage(ctx, defaultAgent) {
581
726
  let dismissThinking;
582
727
  const looksLikeApproval = /^\s*[yn]\s*$/i.test(message.text) ||
583
728
  /^\s*(批准|拒绝|同意|不同意|通过|可以|不可以|不行|✅|❌)\s*$/.test(message.text);
729
+ // Inline-job tracking id (Phase 2). Declared at function scope so the
730
+ // catch block below can mark the row 'failed' on any throw. -1 means
731
+ // "not tracked": either the message didn't reach willInvokeAgent, or
732
+ // createInlineJob fell back to in-memory (DB locked / IMHUB_INLINE_JOB_TRACKING=0).
733
+ let inlineJobId = -1;
584
734
  try {
585
735
  if (messenger.sendTyping) {
586
736
  messenger.sendTyping(message.threadId, true).catch(() => { });
@@ -642,6 +792,66 @@ async function handleMessage(ctx, defaultAgent) {
642
792
  }
643
793
  };
644
794
  }
795
+ // Inline-job tracking: every agent-bound inbound message gets a row in
796
+ // the jobs table (kind='inline') so the state machine survives crashes
797
+ // and the Phase 3 startup scan can offer the user a retry. Built-ins
798
+ // (status/audit/router) return a string immediately and never touch the
799
+ // agent — no point tracking them. Inline tracking can be killed wholesale
800
+ // via env IMHUB_INLINE_JOB_TRACKING=0 (createInlineJob returns -1, every
801
+ // mark*Job helper no-ops on id<=0). createInlineJob itself never throws —
802
+ // if the DB is locked or disk-full we get -1 and the user path runs
803
+ // unchanged on the in-memory fast path.
804
+ //
805
+ // We create the row BEFORE entering the runOnThread queue so a stuck
806
+ // queue (or runAgentInvocation crash before the agent runs) still leaves
807
+ // a 'pending' record we can attribute to the user. The agent name is
808
+ // initially defaultAgent and gets corrected from inside onAgentResolved
809
+ // once routing picks the real one.
810
+ //
811
+ // EXCEPTION: when the cli's recovery interceptor handed us a pre-created
812
+ // replacement-job id via ctx.inlineJobId, reuse it. The recovery path
813
+ // had to create the row up-front so it could stamp replaced_by on the
814
+ // old interrupted row atomically; building a second row here would
815
+ // leave the recovery-created one as an orphan 'pending' that retention
816
+ // eventually GCs.
817
+ if (willInvokeAgent) {
818
+ if (ctx.inlineJobId && ctx.inlineJobId > 0) {
819
+ inlineJobId = ctx.inlineJobId;
820
+ }
821
+ else {
822
+ inlineJobId = createInlineJob({
823
+ agent: defaultAgent,
824
+ prompt: message.text,
825
+ threadKey: `${platform}:${ctx.channelId}:${message.threadId}`,
826
+ creatorId: message.userId ?? '',
827
+ });
828
+ }
829
+ // A2A-L1: stash on routeCtx so callAgentWithHistory can forward it
830
+ // through AgentSendOpts → adapter.registerRun → RunContext. When the
831
+ // agent fires mcp__imhub__call_agent the bus then knows what
832
+ // parent_id to stamp on the callee inline-job row. User-originated
833
+ // runs are call_depth=0; nested runs (A2A callees) bypass this handler:
834
+ // a2a.callAgentByName passes their callDepth to target.sendPrompt.
835
+ if (inlineJobId > 0) {
836
+ routeCtx.parentJobId = inlineJobId;
837
+ // handleMessage is only the user-inbound path — callDepth is
838
+ // always 0 here. (A2A callees don't traverse this handler; they
839
+ // go through a2a.callAgentByName which passes its own callDepth
840
+ // directly to target.sendPrompt.)
841
+ routeCtx.callDepth = 0;
842
+ }
843
+ }
844
+ // Chain the agent-name correction onto the existing onAgentResolved
845
+ // (which already wires native session ids). Both run sequentially —
846
+ // the inline-job update is fast and best-effort.
847
+ const prevOnResolved = routeCtx.onAgentResolved;
848
+ if (prevOnResolved || inlineJobId > 0) {
849
+ routeCtx.onAgentResolved = async (resolvedAgent) => {
850
+ if (prevOnResolved)
851
+ await prevOnResolved(resolvedAgent);
852
+ updateJobAgent(inlineJobId, resolvedAgent);
853
+ };
854
+ }
645
855
  // The thinking placeholder + agent invocation + response delivery are
646
856
  // wrapped into a closure so agent-bound messages can be serialized
647
857
  // per-thread via runOnThread (see core/thread-queue.ts). Without
@@ -660,6 +870,10 @@ async function handleMessage(ctx, defaultAgent) {
660
870
  logger.debug({ err: String(err) }, 'sendThinking failed');
661
871
  }
662
872
  }
873
+ // Mark inline job 'running' just before the agent actually starts —
874
+ // anything that throws before this leaves the row in 'pending' which
875
+ // is the correct "never ran" semantic.
876
+ markJobRunning(inlineJobId);
663
877
  const result = await routeMessage(parsed, routeCtx);
664
878
  const dismiss = async () => {
665
879
  if (dismissThinking) {
@@ -674,7 +888,14 @@ async function handleMessage(ctx, defaultAgent) {
674
888
  if (typeof result === 'string') {
675
889
  await stopTyping();
676
890
  await dismiss();
677
- await messenger.sendMessage(message.threadId, await maybePrefix(result));
891
+ markJobCompleted(inlineJobId, result);
892
+ const sinkRes = await sink.deliver({
893
+ platform, channelId: ctx.channelId, threadId: message.threadId,
894
+ payload: await maybePrefix(result), kind: 'text',
895
+ jobId: inlineJobId > 0 ? inlineJobId : undefined,
896
+ });
897
+ if (sinkRes.outboxId > 0)
898
+ linkJobOutbox(inlineJobId, sinkRes.outboxId);
678
899
  logger.info({ event: 'message.sent', responseLen: result.length });
679
900
  }
680
901
  else {
@@ -686,10 +907,22 @@ async function handleMessage(ctx, defaultAgent) {
686
907
  await stopTyping();
687
908
  await dismiss();
688
909
  if (fullResponse) {
689
- await messenger.sendMessage(message.threadId, await maybePrefix(fullResponse));
910
+ markJobCompleted(inlineJobId, fullResponse);
911
+ const sinkRes = await sink.deliver({
912
+ platform, channelId: ctx.channelId, threadId: message.threadId,
913
+ payload: await maybePrefix(fullResponse), kind: 'text',
914
+ jobId: inlineJobId > 0 ? inlineJobId : undefined,
915
+ });
916
+ if (sinkRes.outboxId > 0)
917
+ linkJobOutbox(inlineJobId, sinkRes.outboxId);
690
918
  logger.info({ event: 'message.sent', responseLen: fullResponse.length });
691
919
  }
692
920
  else {
921
+ // Empty response is treated as a failure of the inline job —
922
+ // there's nothing to deliver and we don't want the row stuck in
923
+ // 'running' forever. The user already sees nothing; the row gets
924
+ // pruned out of /jobs lists.
925
+ markJobFailed(inlineJobId, 'agent returned empty response');
693
926
  logger.warn({ event: 'message.empty_response' });
694
927
  }
695
928
  }
@@ -699,12 +932,11 @@ async function handleMessage(ctx, defaultAgent) {
699
932
  await runOnThread(queueKey, runAgentInvocation, {
700
933
  onQueued: async (ahead) => {
701
934
  logger.info({ event: 'message.queued', ahead, queueKey });
702
- try {
703
- await messenger.sendMessage(message.threadId, `📥 已收到(前面还有 ${ahead} 条在处理,稍后给你结果)`);
704
- }
705
- catch (err) {
706
- logger.debug({ err: String(err) }, 'queued-notice send failed');
707
- }
935
+ await sink.deliver({
936
+ platform, channelId: ctx.channelId, threadId: message.threadId,
937
+ payload: `📥 已收到(前面还有 ${ahead} 条在处理,稍后给你结果)`,
938
+ kind: 'text',
939
+ });
708
940
  },
709
941
  });
710
942
  }
@@ -722,12 +954,15 @@ async function handleMessage(ctx, defaultAgent) {
722
954
  }
723
955
  catch { /* ignore */ }
724
956
  }
725
- try {
726
- await messenger.sendMessage(message.threadId, '❌ An error occurred processing your message.');
727
- }
728
- catch {
729
- // Ignore
730
- }
957
+ // Mark inline job 'failed' before sending the user error — markJobFailed
958
+ // is a no-op for id<=0 (built-ins / inline tracking disabled) and for
959
+ // already-terminal rows. The error blurb still goes through sink so it
960
+ // survives an IM hiccup.
961
+ markJobFailed(inlineJobId, errMsg);
962
+ await sink.deliver({
963
+ platform, channelId: ctx.channelId, threadId: message.threadId,
964
+ payload: '❌ An error occurred processing your message.', kind: 'text',
965
+ }).catch(() => { });
731
966
  }
732
967
  }
733
968
  program