npm - agentboss - Versions diffs - 0.1.1 → 0.1.3 - Mend

agentboss 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/client/dist/assets/{index-CsVml4AS.js → index-CT8rBVfX.js} +53 -49
package/client/dist/index.html +1 -1
package/package.json +1 -1
package/server/analysis/dimensions/judgement.js +111 -107
package/server/analysis/dimensions/llm-merge.js +59 -57
package/server/analysis/dimensions/output-quality.js +167 -167
package/server/analysis/dimensions/problem-definition.js +109 -104
package/server/analysis/job.js +37 -6
package/server/analysis/scoring-v2.js +12 -8
package/server/api/execution.js +94 -0
package/server/db/schema.js +5 -2
package/server/etl/opencode.js +5 -1
package/server/execution/job.js +141 -2
package/server/llm/advice-prompt.js +74 -11
package/server/llm/advice.js +50 -1
package/server/llm/mcp-classify.js +147 -0
package/server/llm/project-advice-prompt.js +106 -6
package/server/llm/project-advice.js +55 -2

package/server/llm/mcp-classify.js ADDED Viewed

@@ -0,0 +1,147 @@
+/**
+ * MCP / built-in tool classifier.
+ *
+ * Neither OpenCode's nor Claude Code's session DB tags tool calls with a
+ * provenance field — both stash the bare tool name ('read', 'bash',
+ * 'atlassian_getJiraIssue', …) into a single string.  But the two
+ * agents both follow a NAMING CONVENTION for MCP tools that we can pick
+ * up with a tiny rule set:
+ *
+ *   • Claude Code:  `mcp__<server>__<tool>`      (double underscore)
+ *   • OpenCode:     `<server>_<toolName>`        (single underscore;
+ *                    server name is the lowercased MCP server key)
+ *
+ * Built-in tools across both agents are short names with no separators
+ * (`read`, `bash`, `glob`, `todowrite`, …).  After the `mcp__` prefix
+ * check, names are matched against a whitelist; anything OUTSIDE the
+ * whitelist that contains an underscore separator we call MCP.  Names
+ * that contain no underscore and aren't whitelisted are reported as
+ * 'unknown' so the caller (currently: LLM prompts) can decide to
+ * mention the uncertainty or skip.
+ *
+ * No persistent state — pure functions, safe to call per-row.
+ *
+ * Source of truth for whitelist updates:
+ *   - OpenCode built-ins observed in production boss.db:
+ *       read, bash, edit, glob, grep, write, todowrite, question,
+ *       task, skill, websearch
+ *   - Claude Code built-ins (per docs):
+ *       Read, Write, Edit, Bash, Glob, Grep, LS, NotebookEdit,
+ *       NotebookRead, Task, TodoWrite, WebFetch, WebSearch
+ *
+ * Compared case-insensitively.
+ *
+ * @author Felix
+ */
+'use strict';
+const BUILTIN_NAMES = new Set([  // whitelist of built-in tool names; all entries are lowercase
+  // OpenCode built-ins (observed)
+  'read', 'bash', 'edit', 'glob', 'grep', 'write', 'todowrite',
+  'question', 'task', 'skill', 'websearch',
+  // Claude Code built-ins not already listed above (per public docs — kept lowercase for
+  // case-insensitive matching; the on-disk casing may vary slightly between versions)
+  'ls', 'webfetch', 'notebookedit', 'notebookread',
+]);
+/**
+ * Classify one tool name as built-in, MCP, or unknown.
+ *
+ * Rules are applied in order; the first match wins:
+ *   1. Name starts with `mcp__` (case-insensitive)          → 'mcp'
+ *   2. Lowercased name is in BUILTIN_NAMES                  → 'builtin'
+ *   3. Name contains an underscore that is neither its first
+ *      nor its last character                               → 'mcp'
+ *   4. Anything else                                        → 'unknown'
+ *
+ * Casing of the returned fields differs per branch: rule 1 returns
+ * `server` and `tool` exactly as they appear in `name`; rule 2 returns
+ * the lowercased name as `tool`; rule 3 lowercases `server` only; rule 4
+ * returns `name` unchanged as `tool`.
+ *
+ * NOTE(review): rule 3 splits at the FIRST underscore, so a server key
+ * that itself contains an underscore would be cut short — confirm such
+ * keys cannot occur.
+ * NOTE(review): for `mcp__` names where the remainder is empty or starts
+ * with `__`, the whole remainder is returned as `server` and `tool` is
+ * '' — confirm this is acceptable for malformed names.
+ *
+ * @param {string} name  raw tool_name from unified_tool_call
+ * @returns {{ kind: 'builtin'|'mcp'|'unknown', server: string|null, tool: string|null }}
+ *   `server` is null unless kind is 'mcp'.  `tool` is null only when
+ *   `name` is not a string or is empty.
+ */
+function classifyTool(name) {
+  if (typeof name !== 'string' || !name) {
+    return { kind: 'unknown', server: null, tool: null };
+  }
+  const lower = name.toLowerCase();  // used for the prefix test and the whitelist lookup only
+  // 1. Claude Code MCP convention: mcp__<server>__<tool>
+  if (lower.startsWith('mcp__')) {
+    const rest = name.slice(5);  // drop the 5-character prefix, keeping the original casing
+    const idx = rest.indexOf('__');
+    if (idx > 0) {  // a separator at index 0 would mean an empty server name
+      return {
+        kind: 'mcp',
+        server: rest.slice(0, idx),
+        tool: rest.slice(idx + 2),
+      };
+    }
+    // mcp__foo  (no usable second separator) — still definitely MCP; whole remainder becomes `server`
+    return { kind: 'mcp', server: rest, tool: '' };
+  }
+  // 2. Built-in whitelist (covers both agents).
+  if (BUILTIN_NAMES.has(lower)) {
+    return { kind: 'builtin', server: null, tool: lower };
+  }
+  // 3. OpenCode MCP convention: <server>_<toolName>
+  //    Split at the first underscore; not in whitelist, both halves non-empty.
+  const us = name.indexOf('_');
+  if (us > 0 && us < name.length - 1) {
+    return {
+      kind: 'mcp',
+      server: name.slice(0, us).toLowerCase(),
+      tool: name.slice(us + 1),
+    };
+  }
+  // 4. Anything else (no usable separator + not whitelisted): unknown.
+  //    Could be a newer built-in we haven't catalogued yet, or a
+  //    custom subagent.  We don't want to wrongly accuse it of being
+  //    MCP, so we punt to the caller.
+  return { kind: 'unknown', server: null, tool: name };
+}
+/**
+ * Aggregate a list of raw tool-usage rows into per-MCP-server stats.
+ *
+ * Each row is classified with classifyTool.  Built-in and unknown rows
+ * only add their `count` to the matching total; their `error_count` is
+ * not accumulated anywhere.  MCP rows are grouped by the server name
+ * returned by classifyTool ('(?)' when that name is empty).
+ *
+ * A null/undefined `rows` is treated as an empty list.  Missing or
+ * falsy `count` / `error_count` values are treated as 0.
+ *
+ * Each server's `tools` holds at most 8 distinct tool names, in the
+ * order they were first encountered (not ranked by call volume).
+ * `servers` is sorted by `calls`, highest first.
+ *
+ * NOTE(review): grouping is by exact server string, and classifyTool
+ * lowercases the server only for `<server>_<tool>` names — the same
+ * server reached through `mcp__Server__tool` could land in a separate
+ * bucket; confirm whether that matters.
+ * NOTE(review): a truthy non-numeric `count` would turn the totals into
+ * NaN — confirm rows always carry numeric counts.
+ *
+ * @param {{tool_name:string, count:number, error_count:number}[]} rows
+ * @returns {{
+ *   servers: { server:string, calls:number, errors:number, tools:string[] }[],
+ *   builtinCalls: number,
+ *   mcpCalls: number,
+ *   unknownCalls: number,
+ * }}
+ */
+function summariseMcpUsage(rows) {
+  const byServer = new Map();  // server name → running { server, calls, errors, tools:Set }
+  let builtinCalls = 0;
+  let mcpCalls = 0;
+  let unknownCalls = 0;
+  for (const r of rows || []) {
+    const c = Number(r.count || 0);
+    const e = Number(r.error_count || 0);
+    const k = classifyTool(r.tool_name);
+    if (k.kind === 'builtin') { builtinCalls += c; continue; }
+    if (k.kind === 'unknown') { unknownCalls += c; continue; }
+    mcpCalls += c;
+    const key = k.server || '(?)';  // placeholder bucket for an empty server name
+    if (!byServer.has(key)) {
+      byServer.set(key, { server: key, calls: 0, errors: 0, tools: new Set() });
+    }
+    const s = byServer.get(key);
+    s.calls += c;
+    s.errors += e;
+    if (k.tool) s.tools.add(k.tool);  // skips the empty tool name produced for `mcp__<server>`
+  }
+  const servers = Array.from(byServer.values())
+    .map((s) => ({
+      server: s.server,
+      calls: s.calls,
+      errors: s.errors,
+      tools: Array.from(s.tools).slice(0, 8),  // cap at 8 names, insertion order
+    }))
+    .sort((a, b) => b.calls - a.calls);  // descending by call count
+  return { servers, builtinCalls, mcpCalls, unknownCalls };
+}
+module.exports = {
+  classifyTool,       // classifier for a single tool name
+  summariseMcpUsage,  // per-server aggregation over usage rows
+  BUILTIN_NAMES,      // the lowercase built-in whitelist Set
+};

package/server/llm/project-advice-prompt.js CHANGED Viewed

@@ -209,6 +209,47 @@ function fmtWindow(ctx) {
   return `${ctx.windowFrom} → ${ctx.windowTo}`;
 }
+/**
+ * Render the cross-session MCP-server usage block for the prompt.
+ *
+ * Returns null when `usage` is falsy, or when the built-in, MCP and
+ * unclassified call counts are all zero, so the caller can skip the
+ * entire section.
+ *
+ * Output is intentionally compact — the LLM only needs the aggregate
+ * picture.  Raw transcripts are deliberately not fed into the prompt,
+ * so this is the project's only window onto tool reality.
+ *
+ * Layout: one totals line (the unclassified count is appended only when
+ * non-zero), then either a "no MCP servers" line or one row per server
+ * — the first 10 entries of `usage.servers`, with at most 6 tool names
+ * each — followed by a line counting the omitted servers when there are
+ * more than 10.
+ *
+ * NOTE(review): rows are emitted in the order `usage.servers` arrives;
+ * the section header claims "calls descending", which holds only if the
+ * producer already sorted the list — confirm.
+ * NOTE(review): `s.server.padEnd` assumes every entry has a string
+ * `server` — confirm against the producer.
+ *
+ * @param {?{servers?: {server:string, calls:number, errors:number, tools?:string[]}[],
+ *           builtinCalls?: number, mcpCalls?: number, unknownCalls?: number}} usage
+ * @returns {string|null} newline-joined block, or null when there is nothing to report
+ */
+function fmtMcpUsage(usage) {
+  if (!usage) return null;
+  const { servers = [], builtinCalls = 0, mcpCalls = 0, unknownCalls = 0 } = usage;
+  if (!builtinCalls && !mcpCalls && !unknownCalls) return null;  // no tool calls of any kind
+  const lines = [];
+  lines.push(
+    `总调用: 内置 ${builtinCalls} · MCP ${mcpCalls}` +
+    (unknownCalls ? ` · 未分类 ${unknownCalls}` : '')
+  );
+  if (servers.length === 0) {
+    lines.push('MCP 服务器: (无)');
+  } else {
+    lines.push('按 MCP 服务器(calls 降序):');
+    for (const s of servers.slice(0, 10)) {  // at most 10 server rows
+      const toolList = (s.tools || []).slice(0, 6).join(', ');  // at most 6 tool names per row
+      lines.push(
+        `  - ${s.server.padEnd(16)} ` +
+        `calls=${String(s.calls).padStart(4)} ` +
+        `err=${String(s.errors).padStart(3)}` +
+        (toolList ? `   tools=[${toolList}]` : '')
+      );
+    }
+    if (servers.length > 10) {
+      lines.push(`  …(其余 ${servers.length - 10} 个 MCP 服务器省略)`);
+    }
+  }
+  return lines.join('\n');
+}
 // ---------------------------------------------------------------------------
 //  Prompt
 // ---------------------------------------------------------------------------
@@ -221,6 +262,7 @@ function fmtWindow(ctx) {
  */
 function buildProjectAdvicePrompt(ctx) {
   const stats = ctx.stats || {};
+  const mcpBlock = fmtMcpUsage(ctx.mcpUsage);
   const truncatedNote =
     ctx.truncated === 'hard'
@@ -254,6 +296,7 @@ function buildProjectAdvicePrompt(ctx) {
   - 是否应建立 / 完善 opencode skill 或 subagent
   - 工具使用习惯(是否反复使用低效组合)
   - 项目级流程瓶颈(测试节奏、回退频率)
+  - **MCP 服务器的取舍**(详见下「MCP 服务器使用」一节)
 不要谈:
   - 任何具体业务 / 代码层面的对错
@@ -284,7 +327,7 @@ AdviceItem(项目级):
   "why":        "1-2 句话,说明这个问题在多个会话中如何重复或累积",
   "action":     "1 句话,项目级别可落地的改变(skill / 配置 / 流程)",
   "evidence":   "必须引用具体会话证据,例:出现于 7/12 个会话(sess-abc, sess-def, ...)",
-  "actionable": true | false,
+  "actionable": true | false,    // 见硬规则 6
   "executor":   "opencode" | "claude" | "manual",
   "cwd_hint":   "project_root"
 }
@@ -299,10 +342,48 @@ AdviceItem(项目级):
 4. 出现「分数」「等级」「Lx」「子分」「H1」「H2」「E1」「O1」字眼一律违规。
 5. action 必须是「项目级」可落地动作(写一个 skill / 改一个配置 / 形成
    一条惯例),不是「下次开场更具体一些」这种纯口头建议。
-6. actionable / executor / cwd_hint 规则与单 session 版本相同;manual
-   时 actionable 必须 false。
-7. 如果没有发现任何值得 project 级别报告的问题(全是个例),把所有
-   categories 设为空数组,summary 写「未发现项目级别的系统性协作问题」。
+6. **actionable=true 必须是「AI 在用户电脑上真的会落盘的改动」**。
+   只有同时满足下面三点才能填 true:
+   a. 明确产出物在文件系统里——新建/修改某个具体的
+      \`.opencode/skills/<name>/SKILL.md\` / 配置文件 / 模板 / 脚本 /
+      文档。一句话能说清"AI 将创建/修改 \`<相对路径>\`"。
+   b. 不依赖人类专属知识——AI 看着项目根目录就能做完;不需要密码、
+      内部 wiki、外部账号、决策权、跨部门沟通。
+   c. 是项目级可复用的改变(本来就是项目级 advice 的核心场景)。
+   只要任一点不满足 → actionable=false, executor='manual'。
+   **反例(下面这些必须 actionable=false / executor='manual')**:
+   - 「项目里大家以后都先写 README 再写代码」
+     —— 是改人的习惯,AI 没法落盘。
+   - 「换更便宜的默认模型」「关闭 reasoning」
+     —— 模型档位是用户在 client / 终端设的,不是 AI 改文件。
+   - 「定期回顾会话」「定期评审 token 成本」
+     —— 流程性建议,需要人来做。
+   - 「和团队沟通 AI 使用规范」「开个分享会」
+     —— 显然是人的事。
+   - **典型 actionable=true 的例子**:
+       · 在 \`.opencode/skills/<name>/SKILL.md\` 创建一个 skill
+         自动化"开场附文件路径"这件事;
+       · 在仓库根目录新建一份 \`AGENTS.md\` 写下项目惯例;
+       · 修改 \`.editorconfig\` / lint 配置补齐缺失规则。
+7. executor:
+   - 落盘类动作(写 skill、加配置、写文档文件) → 'opencode' 或 'claude';
+   - 一切让人类调整行为/认知/沟通/决策的建议 → 'manual'。
+   不会判断时,**默认填 'manual'**——错填 manual 只是少一个按钮;
+   错填 opencode 会让用户点了按钮后 AI 干一些莫名其妙的事。
+8. cwd_hint: 目前只能填 "project_root"。
+9. actionable 与 executor 必须一致:executor='manual' ⇔ actionable=false;
+   executor 是 'opencode'/'claude' ⇔ actionable=true。
+   **不允许出现 executor='opencode' 且 actionable=false 的组合**。
+10. 如果没有发现任何值得 project 级别报告的问题(全是个例),把所有
+    categories 设为空数组,summary 写「未发现项目级别的系统性协作问题」。
 # 项目基础
@@ -314,7 +395,26 @@ AdviceItem(项目级):
 总 token:       ${fmtNum(stats.totalTokens)}
 错误总数:       ${fmtNum(stats.totalErrors)}
-# 各会话的 per-session 结论 ${truncatedNote}
+${mcpBlock ? `# MCP 服务器使用(跨该项目所有会话累计)
+${mcpBlock}
+# 关于 MCP 的分析角度(仅在数据真的体现出问题时谈)
+- **某个 MCP 错误率高**(err / calls 偏高):证据足够时,建议项目级地
+  调整该服务器的使用方式(改 query 模板、改用其它来源、加 wrapper skill)。
+- **某个 MCP 调用频次极高且都来自少数会话**:可能是反复在拉同一类数据,
+  考虑包成一个 skill 或落到本地缓存文件。
+- **MCP 与内置工具调用比例失衡**(例如 MCP 调用为 0 但项目明显需要 Jira /
+  GitHub 数据,推断自会话 summary 里"上网搜"或"手抄"等迹象),
+  建议引入对应 MCP 服务器并写在惯例文档。
+- **多个 MCP 服务器各只用 1-2 次**:可能是探索性试用,不必报告;只在
+  错误率高或明显错配时谈。
+证据里点名具体 MCP 服务器名(例如 \`atlassian\`)和涉及的会话数。
+不要为了凑够 MCP 内容硬挑;没有信号就不要写 MCP 相关 AdviceItem。
+` : ''}# 各会话的 per-session 结论 ${truncatedNote}
 ${fmtSessionBlocks(ctx.sessions || [])}

package/server/llm/project-advice.js CHANGED Viewed

@@ -39,7 +39,8 @@ const {
   truncateContext,
   annotateContext,
 } = require('./project-advice-prompt');
-const { loadAdvice } = require('./advice');
+const { loadAdvice, looksLikeHumanAction } = require('./advice');
+const { summariseMcpUsage } = require('./mcp-classify');
 const {
   queryAll,
   queryOne,
@@ -254,6 +255,43 @@ function isCacheFresh(cached, currentIds) {
 //  Stats aggregation
 // ---------------------------------------------------------------------------
+/**
+ * Aggregate tool usage across the given sessions and split it into
+ * MCP-server-grouped + built-in counts.  Used only at project-level
+ * prompt assembly (session-level prompt already has its own per-session
+ * tool table; classification there happens inside the LLM).
+ *
+ * Runs one grouped query over `unified_tool_call` — per tool_name, the
+ * number of rows and the number of rows whose status is 'error' — and
+ * passes the rows to summariseMcpUsage, returning its result.
+ *
+ * When `sessionIds` is missing or empty, no query is issued and an
+ * all-zero shape with an empty `servers` list is returned directly
+ * (callers can decide to skip the prompt section instead of writing
+ * "(无)").
+ *
+ * Session ids are passed as bound parameters, never interpolated into
+ * the SQL text; only the `?` placeholders are generated.
+ *
+ * NOTE(review): one bound parameter is generated per session id; a very
+ * long id list may exceed the database driver's bound-parameter limit —
+ * confirm the expected upper bound, or chunk the query.
+ *
+ * @param {object} db  database handle, passed through to queryAll
+ * @param {string[]} sessionIds
+ * @returns {{
+ *   servers: { server, calls, errors, tools }[],
+ *   builtinCalls: number,
+ *   mcpCalls: number,
+ *   unknownCalls: number,
+ * }}
+ */
+function summariseProjectMcpUsage(db, sessionIds) {
+  if (!sessionIds || !sessionIds.length) {
+    return { servers: [], builtinCalls: 0, mcpCalls: 0, unknownCalls: 0 };
+  }
+  const placeholders = sessionIds.map(() => '?').join(',');  // one '?' per id
+  const rows = queryAll(
+    db,
+    `SELECT tool_name,
+            COUNT(*)                                          AS count,
+            SUM(CASE WHEN status='error' THEN 1 ELSE 0 END)   AS error_count
+       FROM unified_tool_call
+      WHERE session_id IN (${placeholders})
+      GROUP BY tool_name`,
+    sessionIds
+  );
+  return summariseMcpUsage(rows);
+}
 function summariseStats(db, sessions) {
   let totalCost = 0;
   let totalTokens = 0;
@@ -381,9 +419,16 @@ async function generateProjectAdvice(db, opts = {}) {
     // 6. assemble + truncate
     const stats = summariseStats(db, withAdvice);
+    // Cross-session MCP server usage — fed into the prompt so the LLM
+    // can spot under/over-used MCP servers at project scope.  We use ALL
+    // resolved sessions (raw), not just the ones with advice cached:
+    // even sessions without per-session advice still contribute real
+    // tool-call data to the picture.
+    const mcpUsage = summariseProjectMcpUsage(db, raw.map((r) => r.id));
     const ctx = annotateContext({
       project, scope, windowFrom, windowTo,
       stats,
+      mcpUsage,
       sessions: withAdvice,
     });
     const trimmed = truncateContext(ctx);
@@ -469,11 +514,19 @@ function normaliseItem(it) {
   const cwd_hint = ALL_CWD_HINTS.includes(it.cwd_hint) ? it.cwd_hint : 'project_root';
   if (executor === 'manual') actionable = false;
   if (actionable && executor === 'manual') executor = 'opencode';
+  // Same human-action safety net as the session-level normalizer.
+  const action = typeof it.action === 'string' ? it.action : '';
+  if (actionable && looksLikeHumanAction(action)) {
+    actionable = false;
+    executor = 'manual';
+  }
   return {
     severity,
     title:    typeof it.title === 'string'    ? it.title.trim()    : '',
     why:      typeof it.why === 'string'      ? it.why.trim()      : '',
-    action:   typeof it.action === 'string'   ? it.action.trim()   : '',
+    action:   action.trim(),
     evidence: typeof it.evidence === 'string' ? it.evidence.trim() : '',
     actionable,
     executor,