npm - @fitlab-ai/agent-infra - Versions diffs - 0.7.4 → 0.7.6 - Mend

@fitlab-ai/agent-infra 0.7.4 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119) hide show

package/lib/task/commands/log.ts CHANGED Viewed

@@ -4,13 +4,15 @@ import { resolveTaskRef } from '../resolve-ref.ts';
 const USAGE = `Usage: ai task log <N | #N | TASK-id>
-Renders a task's activity log as a chronological timeline table.
+Renders a task's activity log as a per-step status table. A step's start and
+completion are paired onto one row: STARTED holds the start time, DONE the
+completion time (or '(in progress)' while still running).
   <ref>   Bare numeric / '#N' short id, or a full TASK-YYYYMMDD-HHMMSS id.
-Columns: # (timeline position) / TIME / STEP / AGENT / NOTE
+Columns: # (row) / STEP / AGENT / STARTED / DONE / NOTE
 `;
-const TABLE_HEADERS = ['#', 'TIME', 'STEP', 'AGENT', 'NOTE'] as const;
+const TABLE_HEADERS = ['#', 'STEP', 'AGENT', 'STARTED', 'DONE', 'NOTE'] as const;
 // The activity-log H2 heading is language-dependent (zh template / en template).
 const HEADING_RE = /^##\s+(活动日志|Activity Log)\s*$/;
@@ -23,6 +25,17 @@ const ENTRY_RE =
 type LogEntry = { time: string; step: string; agent: string; note: string };
+// One rendered row = one step instance. `started`/`done` are timestamps; an empty
+// `done` with a non-empty `started` means the step is still in flight, while an
+// empty `started` is a historical done-only entry (no start marker was written).
+type StepRow = { step: string; agent: string; started: string; done: string; note: string };
+// A start marker reuses the normal entry grammar and only suffixes its action
+// with ` [started]`; the matching done entry carries the identical base action
+// without the suffix. Pairing therefore keys on the base action (including any
+// `(Round N)`), so every round and every repeated execution pairs on its own.
+const STARTED_SUFFIX_RE = /\s*\[started\]\s*$/;
 function parseActivityLog(content: string): { sectionFound: boolean; entries: LogEntry[] } {
   const lines = content.split('\n');
   let i = 0;
@@ -44,6 +57,38 @@ function parseActivityLog(content: string): { sectionFound: boolean; entries: Lo
   return { sectionFound: true, entries: parsed.map((p) => p.entry) };
 }
+// Collapse a chronological entry list into per-step rows: a `[started]` marker
+// opens a row, the next matching done entry fills it in place (FIFO per base
+// action). Started-only rows stay in flight; done-only entries (legacy logs with
+// no start marker) render as standalone rows. Result order = first-seen order,
+// which is already ascending because `entries` is sorted ascending.
+function pairEntries(entries: LogEntry[]): StepRow[] {
+  const rows: StepRow[] = [];
+  const open = new Map<string, StepRow[]>();
+  for (const e of entries) {
+    const isStarted = STARTED_SUFFIX_RE.test(e.step);
+    const base = e.step.replace(STARTED_SUFFIX_RE, '');
+    if (isStarted) {
+      const row: StepRow = { step: base, agent: e.agent, started: e.time, done: '', note: e.note };
+      rows.push(row);
+      const queue = open.get(base);
+      if (queue) queue.push(row);
+      else open.set(base, [row]);
+    } else {
+      const pending = open.get(base)?.shift();
+      if (pending) {
+        // Done fills the open row; the done entry carries the meaningful note.
+        pending.done = e.time;
+        pending.agent = e.agent;
+        pending.note = e.note;
+      } else {
+        rows.push({ step: base, agent: e.agent, started: '', done: e.time, note: e.note });
+      }
+    }
+  }
+  return rows;
+}
 function log(args: string[] = []): void {
   if (args.length === 0 || args[0] === '--help' || args[0] === '-h') {
     process.stdout.write(USAGE);
@@ -70,11 +115,19 @@ function log(args: string[] = []): void {
     process.exitCode = 1;
     return;
   }
-  const rows = entries.map((e, idx) => [String(idx + 1), e.time, e.step, e.agent, e.note]);
+  const steps = pairEntries(entries);
+  const rows = steps.map((s, idx) => [
+    String(idx + 1),
+    s.step,
+    s.agent,
+    s.started,
+    s.done || (s.started ? '(in progress)' : ''),
+    s.note
+  ]);
   for (const line of formatTable(TABLE_HEADERS, rows, { zebra: Boolean(process.stdout.isTTY) })) {
     process.stdout.write(`${line}\n`);
   }
-  process.stdout.write(`Total: ${entries.length} entries\n`);
+  process.stdout.write(`Total: ${steps.length} steps\n`);
 }
-export { log, parseActivityLog };
+export { log, parseActivityLog, pairEntries };

package/lib/task/index.ts CHANGED Viewed

@@ -38,14 +38,9 @@ export async function runTask(args: string[]): Promise<void> {
   }
   switch (subcommand) {
-    case 'ls': {
-      const { ls } = await import('./commands/ls.ts');
-      ls(rest);
-      break;
-    }
-    case 'show': {
-      const { show } = await import('./commands/show.ts');
-      show(rest);
+    case 'cat': {
+      const { cat } = await import('./commands/cat.ts');
+      cat(rest);
       break;
     }
     case 'files': {
@@ -53,11 +48,6 @@ export async function runTask(args: string[]): Promise<void> {
       files(rest);
       break;
     }
-    case 'cat': {
-      const { cat } = await import('./commands/cat.ts');
-      cat(rest);
-      break;
-    }
     case 'grep': {
       const { grep } = await import('./commands/grep.ts');
       grep(rest);
@@ -68,6 +58,16 @@ export async function runTask(args: string[]): Promise<void> {
       log(rest);
       break;
     }
+    case 'ls': {
+      const { ls } = await import('./commands/ls.ts');
+      ls(rest);
+      break;
+    }
+    case 'show': {
+      const { show } = await import('./commands/show.ts');
+      show(rest);
+      break;
+    }
     case 'status': {
       const { status } = await import('./commands/status.ts');
       status(rest);

package/lib/update.ts CHANGED Viewed

@@ -119,7 +119,7 @@ function syncFileRegistry(config: UpdateConfig, platformType: string, enabledTUI
 async function cmdUpdate(): Promise<void> {
   console.log('');
-  console.log('  agent-infra update');
+  console.log('  ai update');
   console.log('  ==================================');
   console.log('');

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@fitlab-ai/agent-infra",
-  "version": "0.7.4",
+  "version": "0.7.6",
   "description": "Bootstrap tool for AI multi-tool collaboration infrastructure — works with Claude Code, Codex, Gemini CLI, and OpenCode",
   "license": "MIT",
   "type": "module",

package/templates/.agents/rules/README.en.md CHANGED Viewed

@@ -8,7 +8,7 @@ so you can quickly find "which ones to read" without opening each file.
 ## General Principles
-- [`no-mid-flow-questions.md`](no-mid-flow-questions.md) — Silence during SKILL runs: no user questions by default, plus two exceptions.
+- [`no-mid-flow-questions.md`](no-mid-flow-questions.md) — Silence during SKILL runs: no user questions by default, plus the exceptions the rule lists.
 - [`next-step-output.md`](next-step-output.md) — "Next step" output rules: task short-id rendering and the `Completed at` trailer.
 - [`version-stamp.md`](version-stamp.md) — How and when to stamp `agent_infra_version`.
 - [`debugging-guide.md`](debugging-guide.md) — Structured debugging flow: gather evidence → form hypothesis → verify hypothesis → fix the root cause; no blind patch-and-retry.
@@ -25,6 +25,7 @@ so you can quickly find "which ones to read" without opening each file.
 ## Task Workflow
 - [`task-management.md`](task-management.md) — Task intent detection and workflow-command mapping.
+- [`review-handshake.md`](review-handshake.md) — Three-stage bidirectional review handshake: four-state disposition, symmetric evidence, disagreement ledger, convergence and post-review commit gate.
 - [`task-short-id.md`](task-short-id.md) — Resolution, allocation and lifecycle of `#NN` / bare-number short ids.
 - [`milestone-inference.md`](milestone-inference.md) — Milestone inference for create-task / code-task / create-pr.
 - [`label-milestone-setup.md`](label-milestone-setup.md) — Platform commands to initialize labels / milestones.
@@ -35,7 +36,10 @@ so you can quickly find "which ones to read" without opening each file.
 - [`commit-and-pr.md`](commit-and-pr.md) — Conventional Commits message and PR conventions.
 - [`release-commands.md`](release-commands.md) — Read past releases, query merged PRs, publish release notes.
-## Testing & Cross-platform
+## Testing
 - [`testing-discipline.md`](testing-discipline.md) — Test-writing discipline: prefer structural asserts, no brittle wording matches.
-- [`cross-platform-tests.md`](cross-platform-tests.md) — Cross-platform test guards: express platform skips via `onPlatforms()`.
+## CLI
+- [`cli-help-format.md`](cli-help-format.md) — CLI help text conventions: unify display name on `ai`, `Usage:`+`Commands:` structure, alphabetical command order (top-level and namespace-level help only).

package/templates/.agents/rules/README.zh-CN.md CHANGED Viewed

@@ -7,7 +7,7 @@
 ## 通用准则
-- [`no-mid-flow-questions.md`](no-mid-flow-questions.md) — SKILL 执行期禁言：默认不向用户提问，及两类例外。
+- [`no-mid-flow-questions.md`](no-mid-flow-questions.md) — SKILL 执行期禁言：默认不向用户提问，及规则列明的例外。
 - [`next-step-output.md`](next-step-output.md) — 「下一步」输出规则：任务短号渲染与 `Completed at` 收尾行。
 - [`version-stamp.md`](version-stamp.md) — `agent_infra_version` 版本戳的取值命令与写入时机。
 - [`debugging-guide.md`](debugging-guide.md) — 结构化调试流程：收集证据→形成假设→验证假设→修复根因，禁止盲目改代码重试。
@@ -24,6 +24,7 @@
 ## 任务工作流
 - [`task-management.md`](task-management.md) — 任务语义识别与工作流命令映射。
+- [`review-handshake.md`](review-handshake.md) — 三阶段双向审查握手协议：四态处置、对称证据、分歧账本、收敛与 post-review commit 门禁。
 - [`task-short-id.md`](task-short-id.md) — 任务短号 `#NN` / 裸数字的解析、分配与生命周期。
 - [`milestone-inference.md`](milestone-inference.md) — create-task / code-task / create-pr 的 milestone 推断。
 - [`label-milestone-setup.md`](label-milestone-setup.md) — 初始化 label / milestone 的平台命令集。
@@ -34,7 +35,10 @@
 - [`commit-and-pr.md`](commit-and-pr.md) — Conventional Commits 提交信息与 PR 规范。
 - [`release-commands.md`](release-commands.md) — 读取历史 release、查询已合并 PR、发布 Release notes。
-## 测试与跨平台
+## 测试
 - [`testing-discipline.md`](testing-discipline.md) — 测试编写纪律：结构性断言优先，禁止脆弱的措辞匹配。
-- [`cross-platform-tests.md`](cross-platform-tests.md) — 跨平台测试守卫：用 `onPlatforms()` 表达平台跳过。
+## CLI
+- [`cli-help-format.md`](cli-help-format.md) — CLI help 文案约定：展示名统一 `ai`、`Usage:`+`Commands:` 结构、命令按字母序（仅顶层与命名空间级 help）。

package/templates/.agents/rules/cli-help-format.en.md ADDED Viewed

@@ -0,0 +1,49 @@
+# CLI help text conventions
+Unify the help text display structure, display name, and command ordering of the `ai` / `agent-infra` CLI so newly added subcommands follow them automatically and never drift across levels again. Read this file before adding or changing CLI help text.
+## Scope
+- **Display name `ai`**: applies to **all** user-facing help / usage / banner text — top-level, namespace-level, and the single-line usage / startup banners of leaf commands such as `merge` / `init` / `update`. The only exceptions: the top-level help first line keeps the brand + version line `agent-infra ${VERSION}`, and `@fitlab-ai/agent-infra` in package names / install commands / repo URLs stays as-is.
+- **Structure & ordering** (`Usage:` + `Commands:` structure, alphabetical command order): applies only to levels that carry a `Commands:` listing — top-level help (`bin/cli.ts`) and namespace-level help (e.g. `ai sandbox` / `ai task`). Leaf commands have only a single-line usage and need no `Commands:` structure.
+## Display name
+- Use **`ai`** as the command display name in help text (the recommended short form; `package.json`'s `bin` registers both `ai` and `agent-infra`).
+- Keep the top-level help first line as the brand + version line `agent-infra ${VERSION} - bootstrap ...` (it is the brand and version marker that several tests anchor on).
+- Keep `@fitlab-ai/agent-infra` in install methods, package names, and repo URLs as-is (those are package names, not command display names).
+## List structure
+Namespace-level and top-level help follow:
+```
+Usage: ai <ns> <command> [options]
+Commands:
+  <command>  <description aligned from two spaces>
+  ...
+Run 'ai <ns> <command> --help' for details.
+```
+- The `Commands:` block uses bare command names (no repeated binary name), two-space indent, descriptions aligned to the longest command name.
+- Namespace-level help ends with a `Run 'ai <ns> <command> --help' for details.` footer.
+- Top-level help has no uniform subcommand `--help` convention, so the footer is not required there; if an `Examples:` section exists, its command display name is also `ai`.
+## Ordering
+Command lists, `Examples`, and command enumerations embedded in descriptions are all sorted by the **first token of the command, in ascending alphabetical order**:
+- Multi-token commands (e.g. `vm status|start|stop`) sort by the first token (`vm`).
+- Commands with angle/square-bracket parameters sort by the command name (the bare word before the parameters).
+- Case-insensitive.
+## Checklist for adding a subcommand
+When adding a subcommand:
+1. Insert the command at the correct alphabetical position in `Commands:`.
+2. If it has examples, insert them at the alphabetical position in `Examples:`.
+3. If a top-level `task` / `sandbox` description has an embedded command enumeration, update its alphabetical order too.
+4. Sync the corresponding help test's **structural** assertions (whether the command appears, whether the `Usage:` / `Commands:` header exists); do not bind to full sentences (see [`testing-discipline.md`](testing-discipline.md)).

package/templates/.agents/rules/cli-help-format.zh-CN.md ADDED Viewed

@@ -0,0 +1,49 @@
+# CLI help 文案约定
+统一 `ai` / `agent-infra` CLI 的 help 文案展示结构、展示名与命令排序，让后续新增子命令自动遵守，避免跨层级再次漂移。新增或调整 CLI help 文案前先读取本文件。
+## 适用范围
+- **展示名 `ai`**：适用于**所有**面向用户的 help / usage / 交互横幅文案——顶层、命名空间级，以及 `merge` / `init` / `update` 等叶子命令的单行 usage 与启动横幅，统一用 `ai`。唯一例外：顶层 help 首行保留品牌 + 版本行 `agent-infra ${VERSION}`；包名 / 安装命令 / 仓库 URL 中的 `@fitlab-ai/agent-infra` 保持原样。
+- **结构与排序**（`Usage:` + `Commands:` 结构、命令按字母序）：仅适用于带 `Commands:` 子清单的层级——顶层 help（`bin/cli.ts`）与命名空间级 help（如 `ai sandbox` / `ai task`）。叶子命令只有单行 usage，无需 `Commands:` 结构。
+## 展示名
+- help 文案中的命令展示名统一用 **`ai`**（推荐简写，`package.json` 的 `bin` 同时注册 `ai` 与 `agent-infra`）。
+- 顶层 help 首行保留品牌 + 版本行 `agent-infra ${VERSION} - bootstrap ...`（这是品牌与版本标识，多处测试锚定它）。
+- 安装方式、包名、仓库 URL 中的 `@fitlab-ai/agent-infra` 等保持原样（是包名而非命令展示名）。
+## 列表结构
+命名空间级与顶层 help 统一为：
+```
+Usage: ai <ns> <command> [options]
+Commands:
+  <command>  <两空格起对齐的描述>
+  ...
+Run 'ai <ns> <command> --help' for details.
+```
+- `Commands:` 块用裸命令名（不重复二进制名），两空格缩进，描述按最长命令名对齐。
+- 命名空间级 help 末尾加 `Run 'ai <ns> <command> --help' for details.` footer。
+- 顶层 help 无统一子命令 `--help` 约定，故不强制加该 footer；如有 `Examples:` 段，命令展示名同样用 `ai`。
+## 排序
+命令清单、`Examples`、描述中内嵌的命令枚举，一律按**命令首 token 的字母升序**排列：
+- 多 token 命令（如 `vm status|start|stop`）按首 token（`vm`）排序。
+- 带尖括号 / 方括号参数的命令按命令名（参数前的裸词）排序。
+- 大小写不敏感。
+## 新增子命令检查清单
+新增一个子命令时：
+1. 把命令插入 `Commands:` 的字母序正确位置。
+2. 如有示例，插入 `Examples:` 的字母序位置。
+3. 若顶层 `task` / `sandbox` 等描述中有内嵌命令枚举，同步更新其字母序。
+4. 同步对应 help 测试的**结构性**断言（命令是否出现、`Usage:` / `Commands:` 头是否存在），不要绑定整句文案（见 [`testing-discipline.md`](testing-discipline.md)）。

package/templates/.agents/rules/no-mid-flow-questions.en.md CHANGED Viewed

@@ -1,7 +1,7 @@
 # General Rule - No Mid-Flow Questions During SKILL Execution
 > **Scope**: this rule applies to **all SKILL** executions.
-> Only the two exemption categories below may ask the user; any other mid-flow question is a violation.
+> Only the exemption categories listed below may ask the user; any other mid-flow question is a violation.
 ## Exemption Categories
@@ -27,9 +27,21 @@ SKILLs currently covered by this exemption:
 - `init-labels`: may confirm before deleting legacy labels not in the final mapping
 - `commit`: may stop and confirm when its plan conflicts with the user's uncommitted changes
+### Exemption 3: Entry-point requirement-sufficiency clarification
+Allowed only when a SKILL judges, **at its entry point**, whether the current task's requirement information is sufficient for a reliable analysis; it may then ask the user about the **missing requirement information** to converge the requirements. Constraints:
+- Limited to the `analyze-task` entry point; ask one question at a time and wait for the answer before asking the next;
+- Used only to fill requirement-sufficiency gaps; it must **not** be used to solicit implementation / technical-choice preferences (those still go into the artifact's `## Open Questions` per the default clause);
+- Exit the questioning and proceed to normal analysis once the question budget is reached or the user says "just analyze / skip".
+SKILLs currently covered by this exemption:
+- `analyze-task`: when the task description/requirements are insufficient for a reliable analysis, it may ask questions one at a time at the entry point to converge the requirements
 ## No-Mid-Flow-Questions Clause (default behavior)
-For every SKILL execution context not covered by the two exemptions above, the default behavior is:
+For every SKILL execution context not covered by any exemption above, the default behavior is:
 1. Do not call any user-question tool, including but not limited to `AskUserQuestion` and equivalent mechanisms that ask the user to choose.
 2. When uncertain, proceed with the most robust option without interrupting the flow. Use this priority order:
@@ -41,6 +53,17 @@ For every SKILL execution context not covered by the two exemptions above, the d
    - Meaning: the assumptions section records assumptions used for this run that may be revisited later; the open questions section records unresolved questions for human review
    - If the artifact template does not reserve these sections, append them as needed. If there are no assumptions or open questions, do not force empty sections.
+## Key Design Decision Marking And Ledgering
+When an open question is a key design decision that needs human judgment, the executor must mark the item with `[needs-human-decision]` and write the matching `HD-` row to task.md `## Review Disagreement Ledger` according to `.agents/rules/review-handshake.md`.
+Use these checks together:
+- **Source test**: can the conclusion be uniquely derived from the task description, existing requirements, code conventions, or an approved plan? If not, and multiple reasonable options exist, it is a choice.
+- **Impact test**: does the choice change scope, boundaries, defaults, or thresholds; is it irreversible / costly; or does it set a precedent for later tasks? Any hit upgrades it to a key design decision.
+- **Small-impact exemption**: if it is only a local, reversible, low-cost execution detail, record it under `## Assumptions` instead of upgrading it to a human ruling.
+- **Fallback**: when unsure whether it is key, treat it as key; `review-*` must check whether the executor failed to mark any item that should have been upgraded to `[needs-human-decision]`.
 ## Human Review Checkpoint Semantics
 A mandatory human review checkpoint means:

package/templates/.agents/rules/no-mid-flow-questions.zh-CN.md CHANGED Viewed

@@ -1,7 +1,7 @@
 # 通用规则 - SKILL 执行禁言
 > **适用范围**：本规则适用于**所有 SKILL** 的执行过程。
-> 仅以下两类例外可向用户提问；不属于这两类的发问一律按违规处理。
+> 仅以下列出的例外可向用户提问；不属于这些例外的发问一律按违规处理。
 ## 例外类型
@@ -27,9 +27,21 @@
 - `init-labels`：删除不在最终映射中的旧 label 前可确认
 - `commit`：检测到与用户未提交改动冲突时可停下确认
+### 例外 3：入口式需求充分性澄清
+仅当 SKILL 在**入口处**判断「当前任务的需求信息是否充分到可以可靠分析」时，可就**缺失的需求信息**向用户提问以收敛需求。约束：
+- 仅限 `analyze-task` 入口；一次只问一个问题，等用户回答后再问下一个；
+- 仅用于补齐需求充分性，**不得**借此征求实现方案 / 技术选型偏好（这些仍按禁言铁律写入产物的 `## 未决问题`）；
+- 达到提问预算上限或用户表示「直接分析 / skip」即退出提问，进入正常分析。
+当前归入本例外的 SKILL：
+- `analyze-task`：任务描述/需求信息不足以支撑可靠分析时，可在入口处逐个提问收敛需求
 ## 禁言条款（默认行为）
-不属于上述两类例外的所有 SKILL 执行场景，遵循以下默认行为：
+不属于上述任一例外的所有 SKILL 执行场景，遵循以下默认行为：
 1. **禁止调用**任何向用户发问的工具（包括但不限于 `AskUserQuestion` 及等价的「征求用户选择」机制）。
 2. **不确定时**，按「最稳健方案」自主推进，不中断对话。最稳健方案的判定优先级：
@@ -41,6 +53,17 @@
    - 含义：`假设` 段记录本次按某假设推进、未来若假设不成立可推翻；`未决问题` 段记录本次未决、需要人工审查时裁定的问题
    - 产物模板未预留这两段时，按需追加；没有假设或未决问题时不必强行写空段。
+## 关键设计决策标记与落账
+当未决问题属于需要人工裁定的关键设计决策时，执行方必须在该条目前标记 `[needs-human-decision]`，并按 `.agents/rules/review-handshake.md` 在 task.md `## 审查分歧账本` 写入 `HD-` 行。
+判定时同时使用以下检查：
+- **来源测试**：结论是否能从任务描述、既有需求、代码约定或已批准方案中唯一推出？若不能，且存在多个合理选项，则它是选择题。
+- **影响测试**：该选择是否改变范围、边界、默认值、阈值，是否不可逆 / 成本较高，或是否会扩散成后续任务先例？任一命中即升级为关键设计决策。
+- **小影响豁免**：若它只是局部、可逆、低成本的执行细节，写入 `## 假设` 即可，不升级为人工裁决。
+- **兜底**：无法判断是否关键时按关键处理；`review-*` 需要复核执行方是否漏标应升级的 `[needs-human-decision]`。
 ## 人工审查检查点语义
 「强制性人工审查检查点」（mandatory human review checkpoint）的语义是：

package/templates/.agents/rules/pr-sync.github.en.md CHANGED Viewed

@@ -32,7 +32,11 @@ Aggregation rules:
 - build the review-history table from `review-code*` and `code*`
 - extract the test summary from `code*`
 - if one artifact class is missing, treat it as "no data for this stage" and continue
-- Manual verification section: extract items requiring human confirmation/fallback from the "Assumptions"/"Open Questions" of the latest `plan*` and the "Environment-Blocked Findings"/"Self-Doubt" sections (i.e. env-blocked items) of the latest `review-code*`; when there are none, write the explicit placeholder `- None — no items require manual verification`, never leave it empty
+- Manual verification section: include only post-code-stage checks that still require a human to execute or judge and that the AI cannot close on its own.
+  - **Admission boundary**: the verification result depends on a real environment, permissions, account, external system, or human judgment, and cannot be closed by an agent rerunning tests, adding checks, or continuing the fix loop.
+  - **Sources**: `review-code*` "Environment-Blocked Findings", plus `code*` items that satisfy the boundary above.
+  - **Wording**: each retained item must state at least "what to verify + location (file/change/scope) + why only a human can verify it".
+  - **Empty rendering**: when there are no retained items, do NOT use the ⚠️ alarm style (it falsely implies a problem). Render the whole block as: heading `### ✅ No Manual Verification Needed` and a single line `No items in this change require manual confirmation.`, with no item list. Only use the `### ⚠️ Manual Verification Required` heading + item list when retained items exist.
 ## Comment Body Template
@@ -47,11 +51,7 @@ Use this canonical comment body template:
 **Updated At**: {current-time}
-### ⚠️ Manual Verification Required
-> Items in this change that need human confirmation/fallback; reviewers can reply under this comment once verified.
-- {manual-verify-item}
+{manual-verify-section}
 ### Key Technical Decisions
@@ -72,6 +72,8 @@ Use this canonical comment body template:
 *Generated by {agent} · Internal tracking: {task-id}*
 ```
+> Render `{manual-verify-section}` per the "manual verification section" aggregation rule above: with retained items → `### ⚠️ Manual Verification Required` heading + quote + item list; with none → `### ✅ No Manual Verification Needed` heading + a single line `No items in this change require manual confirmation.` (no ⚠️, no list).
 ## Comment Lookup And Update
 Fetch existing comments through the Issues comments API, not the dedicated PR comments API.

package/templates/.agents/rules/pr-sync.github.zh-CN.md CHANGED Viewed

@@ -32,7 +32,11 @@
 - 用 `review-code*` 与 `code*` 构建审查历程表
 - 从 `code*` 提取测试结果摘要
 - 某一类产物缺失时，按“无该阶段数据”处理并继续生成
-- 需人工校验段落：从最新 `plan*` 的「假设」「未决问题」与最新 `review-code*` 的「环境性遗留」「自我质疑」提取需人工确认/兜底事项；无任何事项时写显式占位 `- 无需人工校验事项`，不得留空
+- 需人工校验段落：只收进入 code 阶段后仍需人实际执行或判断、AI 无法自行关闭的校验点。
+  - **准入边界**：校验结论依赖真实环境、权限、账号、外部系统或人工判断，且无法通过 agent 重跑测试、补充检查或继续修复自行关闭。
+  - **来源**：`review-code*` 的「环境性遗留」，以及 `code*` 中满足上述边界的校验点。
+  - **写法**：每条保留项至少写明「校验什么 + 定位（文件/改动/范围）+ 为什么只能由人校验」。
+  - **空集渲染**：无保留项时，不要使用 ⚠️ 告警样式（会让人误以为有问题）。整段降级渲染为：标题 `### ✅ 无需人工校验`，正文一行 `本次改动无需人工确认事项。`，不带条目列表。有保留项时才用 `### ⚠️ 需人工校验` 标题 + 条目列表。
 ## 评论体模板
@@ -47,11 +51,7 @@
 **更新时间**：{当前时间}
-### ⚠️ 需人工校验
-> 本次改动中需人工确认/兜底的事项；reviewer 校验后可在本评论下回复收尾。
-- {manual-verify-item}
+{manual-verify-section}
 ### 关键技术决策
@@ -72,6 +72,8 @@
 *由 {agent} 自动生成 · 内部追踪：{task-id}*
 ```
+> `{manual-verify-section}` 按上文「需人工校验段落」聚合规则渲染：有保留项 → `### ⚠️ 需人工校验` 标题 + 引用说明 + 条目列表；无保留项 → `### ✅ 无需人工校验` 标题 + 一行 `本次改动无需人工确认事项。`（不带 ⚠️、不带列表）。
 ## 评论查找与更新
 已有评论必须通过 Issues comments API 获取，而不是单独的 PR comments API。

package/templates/.agents/rules/review-handshake.en.md ADDED Viewed

@@ -0,0 +1,97 @@
+# Bidirectional Review Handshake Protocol
+> Shared by executor and reviewer across all three stages (analysis / plan / code) when running the `review-*` and `*-task` skills.
+> This file is the **single source of truth** for the protocol; each SKILL only `Read`s it and never re-copies the vocabulary.
+## Core principles
+- **A review finding is input to be verified, not a command to execute.** The executor must verify each finding before disposing of it — neither rubber-stamping nor blindly refuting.
+- **Symmetric evidence burden**: every disposition, whether accept or refute, must carry **commensurate evidence**. "Accept" is not a zero-cost default path.
+- **Converge before advancing**: while any unclosed disagreement, alternative fix, cannot-judge, or post-review commit exists, do not silently advance to the next stage, archive, or merge.
+## Executor four-state disposition (`*-task` skills, when responding to the prior review round in Round ≥ 2)
+For each finding in the latest `review-*`, first Read/Grep the cited `file:line` / command, then assign one status:
+| Status | Meaning | Required evidence |
+|--------|---------|-------------------|
+| `accepted` | Valid; will fix as suggested | `file:line` of the fix, or the change to be applied this round |
+| `adjusted` | Valid, but an alternative fix is used | the alternative + why it is better; awaits reviewer confirmation |
+| `refuted` | After verification, judged invalid / hallucinated / based on a wrong `file:line` | counter-evidence (`file:line` or raw command output); awaits reviewer confirmation |
+| `cannot-judge` | Insufficient evidence to decide | the verification path attempted; handed to reviewer/human |
+## Reviewer hand-back duty (`review-*` skills, when re-reviewing the executor response)
+After the executor gives `adjusted` / `refuted` / `cannot-judge`, the reviewer must respond per item — neither merely restating the original finding nor ignoring the hand-back:
+- **Withdraw the finding** → set the ledger row to `confirmed` (accepts the refutation).
+- **Accept the alternative fix** → set to `confirmed`.
+- **Hold with new evidence** → set back to `open` (with new evidence, returned to the executor).
+- **Escalate to human** → set to `needs-human-decision`.
+## Convergence termination (loop guard)
+- The per-finding handshake round limit is `MAX_HANDSHAKE_ROUNDS`, default **3**, overridable via `review.maxHandshakeRounds` in `.agents/.airc.json`.
+- When a finding's `round` reaches the limit without entering a terminal state, it must be forced to `needs-human-decision`; the gate rejects rows that hit the limit without escalating.
+- `needs-human-decision` keeps blocking completion until a human records a ruling in the task.md `## 人工裁决` section and flips the row to `human-decided`.
+## Same-model convergence-bias mitigation (documentation-level discipline)
+The executor and reviewer are often the same/similar model and are naturally inclined to agree. When reviewing:
+1. **Read the evidence before the conclusion**: read the `git diff` / artifact itself and form findings independently **before** reading the executor's conclusions and responses, to avoid being anchored.
+2. **Default-skeptical framing**: treat "looks fine" as unverified; every clearance needs reproducible evidence (see the `Evidence` hard gate in each `review-*`).
+> The only mechanical lever is the **symmetric-evidence gate** (non-`open` ledger rows must carry evidence); model homogeneity itself is not mechanically checkable, so this section is discipline rather than a gate.
+## Mechanical ledger (task.md `## 审查分歧账本`)
+The single source of truth for disagreement state is the fixed `## 审查分歧账本` section in task.md — one parseable Markdown table. The phase-advance and `complete-task` gates read this section.
+```markdown
+## 审查分歧账本
+<!-- One row per review finding; state machine / evidence rules in .agents/rules/review-handshake.md. The phase-advance and complete-task gates read this section. -->
+| id | stage | round | severity | status | evidence |
+|----|-------|-------|----------|--------|----------|
+| CD-1 | code | 1 | blocker | open | review-code.md#1 |
+```
+- `id`: stage prefix + ordinal — analysis→`AN-`, plan→`PL-`, code→`CD-`; executor-raised human-ruling rows use `HD-`.
+- `stage` ∈ `{analysis, plan, code}` (plus the reserved value `post-review-commit`, used only for post-review exemption rows).
+- `status` legal enum: `open` / `accepted` / `adjusted` / `refuted` / `cannot-judge` / `confirmed` / `needs-human-decision` / `closed` / `human-decided`.
+- **Terminal set (gate passes)**: `{confirmed, closed, human-decided}`; everything else is blocking.
+- **Write responsibility**: `review-*` raises a finding → upsert an `open` row; `*-task` responds → set four-state and fill `evidence`, `round` +1; next `review-*` → `confirmed` / back to `open` / `needs-human-decision`; an executor fix verified by the next review → `closed`; a human ruling → `human-decided`.
+- **Backward compatible**: when task.md has no such section the gate treats it as no open disagreements and passes.
+### Executor-raised human-ruling rows
+When an executor marks an item in the artifact `## Open Questions` section as `[needs-human-decision]`, it must upsert the matching `HD-` row in task.md `## Review Disagreement Ledger`:
+```markdown
+| HD-1 | plan | - | decision | needs-human-decision | plan.md#HD-1 |
+```
+- `stage` is the stage where the decision arose: `analysis` / `plan` / `code`.
+- `round` is `-` because this is not a review-finding handshake round.
+- `severity` is always `decision`.
+- `status` starts as `needs-human-decision`, so the existing gate blocks it.
+- After a human records the ruling in task.md `## Human Rulings`, flip the matching `HD-` row to `human-decided` and point `evidence` to that ruling.
+## post-review commit gate (code stage only)
+- The highest-round `review-code` report records `Review Baseline Commit` (R, `git rev-parse HEAD`) and `Reviewed Diff Fingerprint` (F, full worktree diff fingerprint).
+- `commit` reads only the highest-round `review-code` artifact. When that artifact is Approved, the pre-commit HEAD equals R, and the staged diff fingerprint equals F, task.md receives `last_reviewed_commit` (B, the new commit SHA).
+- The `complete-task` `post-review-commit` gate prefers B; when B is absent or invalid, it falls back to R from the highest-round `review-code` artifact.
+- If new commits touch code / rule paths after B / R, the gate blocks and requires a fresh `review-code`.
+- **Exemption**: append a ledger row `| PRC-1 | post-review-commit | - | - | human-decided | <ruling note> |` recording that a human explicitly allowed those commits without re-review.
+## Gate behavior cheat sheet
+| Caller | `review-ledger` scope | `post-review-commit` |
+|--------|-----------------------|----------------------|
+| `plan-task` | only `analysis`-stage rows must be terminal | not attached |
+| `code-task` | `analysis` + `plan`-stage rows must be terminal | not attached |
+| `complete-task` | all stage rows must be terminal | attached (see above) |
+| `analyze-task` | not attached (first stage) | not attached |