npm - agentboss - Versions diffs - 0.1.0 - Mend

agentboss 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/README.md +34 -0
package/bin/aboss.js +288 -0
package/client/dist/assets/index-C1wFD_Vo.css +1 -0
package/client/dist/assets/index-DBj1Ujlx.js +137 -0
package/client/dist/index.html +34 -0
package/package.json +64 -0
package/server/analysis/daily-aggregator.js +258 -0
package/server/analysis/difficulty.js +129 -0
package/server/analysis/dimensions/ai-knowledge.js +172 -0
package/server/analysis/dimensions/ai-tools.js +161 -0
package/server/analysis/dimensions/judgement.js +107 -0
package/server/analysis/dimensions/llm-merge.js +57 -0
package/server/analysis/dimensions/output-quality.js +167 -0
package/server/analysis/dimensions/problem-definition.js +104 -0
package/server/analysis/dimensions/system-thinking.js +225 -0
package/server/analysis/evidence-builder.js +104 -0
package/server/analysis/job.js +273 -0
package/server/analysis/report-builder.js +581 -0
package/server/analysis/scoring-v2.js +72 -0
package/server/analysis/text-signals.js +179 -0
package/server/analysis/thresholds-v2.js +358 -0
package/server/api/advice.js +124 -0
package/server/api/analysis.js +141 -0
package/server/api/execution.js +330 -0
package/server/api/metrics.js +277 -0
package/server/api/overview.js +308 -0
package/server/api/project.js +255 -0
package/server/api/reports.js +125 -0
package/server/api/sessions.js +118 -0
package/server/api/settings.js +119 -0
package/server/db/connection.js +175 -0
package/server/db/queries.js +1051 -0
package/server/db/schema.js +487 -0
package/server/etl/active-time.js +150 -0
package/server/etl/backfill-subagents.js +178 -0
package/server/etl/claude-code.js +826 -0
package/server/etl/detect.js +341 -0
package/server/etl/judge-filter.js +117 -0
package/server/etl/opencode.js +606 -0
package/server/execution/job.js +662 -0
package/server/execution/prompt.js +227 -0
package/server/execution/runner.js +218 -0
package/server/index.js +94 -0
package/server/llm/advice-prompt.js +339 -0
package/server/llm/advice.js +384 -0
package/server/llm/analysis-prompt.js +162 -0
package/server/llm/cli-runner.js +249 -0
package/server/llm/judge-prompts.js +179 -0
package/server/llm/judge.js +118 -0
package/server/llm/project-advice-prompt.js +332 -0
package/server/llm/project-advice.js +491 -0
package/server/llm/session-analyzer.js +122 -0
package/server/utils/project.js +80 -0

package/server/llm/advice-prompt.js ADDED Viewed

@@ -0,0 +1,339 @@
+/**
+ * Per-session AI-advice prompt builder.
+ *
+ * Wraps a session's context (basic meta, tool-call summary, message
+ * transcript) into a single instruction that asks the local CLI
+ * (opencode / claude) to return 5-category, evidence-backed improvement
+ * suggestions as strict JSON.
+ *
+ * Spec: docs/superpowers/specs/2026-06-13-session-advice-design.md
+ *
+ * # Inputs the prompt deliberately OMITS
+ *
+ * We do NOT feed the model any of our computed H1/H2/E1/O1 scores or
+ * sub-scores.  Earlier iterations did, and the model immediately
+ * collapsed into echoing those numbers back as "evidence" ("H1=63 偏低,
+ * 所以问题定义不清晰"), which is just a fancy way of laundering our own
+ * rule-based output through an LLM.  Stripping the scores forces the
+ * model to look at the conversation itself.  The prompt also explicitly
+ * forbids referencing scores, levels, or dimension keys in the output.
+ *
+ * # Scope of advice (intentional)
+ *
+ * The model is told to comment on the developer's COLLABORATION PATTERNS:
+ * how prompts were phrased, where context was missing, where flow could
+ * have been tighter, whether a skill/subagent would have helped, where
+ * cost/cache choices hurt.
+ *
+ * It is told NOT to critique the domain content of the conversation —
+ * e.g. "your React hook is wrong", "you should have used an INNER JOIN".
+ * That kind of feedback belongs in code review, not in a usage analytics
+ * tool, and the model lacks the runtime/repo context to do it safely.
+ *
+ * # Independence from judge
+ *
+ * Advice has its own sentinel (ADVICE_SENTINEL).  The judge prompts use
+ * JUDGE_SENTINEL.  Both are recognised by server/etl/judge-filter.js so
+ * the resulting CLI sessions get filtered out of the user's own data.
+ * This file does NOT import from judge-prompts, so changes there cannot
+ * silently affect advice generation.
+ *
+ * # Versioning
+ *
+ * Bump ADVICE_PROMPT_VERSION whenever the OUTPUT contract (categories,
+ * AdviceItem shape, hard rules) changes so that old cached advice is
+ * discarded and regenerated.  Cosmetic edits to wording, layout or ctx
+ * ordering do NOT need a bump.
+ *
+ * @author Felix
+ */
+'use strict';
+// ---------------------------------------------------------------------------
+//  Constants
+// ---------------------------------------------------------------------------
+/**
+ * Marker emitted at the very start of every advice prompt (see
+ * buildAdvicePrompt).  Mirrors the role of JUDGE_SENTINEL in
+ * judge-prompts.js but is defined here so advice is fully self-contained.
+ * Per the file header, server/etl/judge-filter.js recognises it together
+ * with JUDGE_SENTINEL.
+ */
+const ADVICE_SENTINEL = '[ABOSS-ADVICE]';
+/** Version of the JSON output contract; bump when that contract changes (see header). */
+const ADVICE_PROMPT_VERSION = 4;
+/**
+ * The five advice category keys the model MUST fill.
+ * NOTE(review): the prompt template in buildAdvicePrompt spells these keys
+ * out literally instead of interpolating this array, so the two have to be
+ * kept in sync by hand.
+ */
+const CATEGORIES = ['cost', 'accuracy', 'context', 'skills', 'workflow'];
+/** Soft size budget for the assembled prompt; the default for
+ *  truncateContext's `maxBytes`.  Beyond this we start dropping
+ *  mid-conversation messages (see truncateContext). */
+const DEFAULT_MAX_BYTES = 80_000;
+/** When truncating: keep the first HEAD_KEEP and last TAIL_KEEP messages
+ *  (of any role) verbatim, replace the middle with a single system
+ *  placeholder. */
+const HEAD_KEEP = 30;
+const TAIL_KEEP = 30;
+/** Per-message hard cap, in characters, applied when even head+tail
+ *  still overflow. */
+const MSG_HARD_CAP_CHARS = 600;
+// ---------------------------------------------------------------------------
+//  Truncation
+// ---------------------------------------------------------------------------
+/**
+ * Reduce ctx.messages so the resulting prompt stays within `maxBytes`.
+ *
+ * Strategy (applied in order):
+ *   1. If total < maxBytes → no-op.
+ *   2. Keep HEAD_KEEP + TAIL_KEEP messages, drop the middle, insert one
+ *      `[…省略 N 条…]` system placeholder.  Mark `truncated: true`.
+ *   3. Still over? Cap every remaining message text to MSG_HARD_CAP_CHARS.
+ *   4. Still over? Drop messages from the head (keep tail) until we fit;
+ *      mark `truncated: 'hard'`.
+ *
+ * @param {object} ctx  see buildAdvicePrompt() — must have .messages[]
+ * @param {number} [maxBytes=DEFAULT_MAX_BYTES]
+ * @returns {object} possibly-truncated ctx (shallow copy)
+ */
+function truncateContext(ctx, maxBytes = DEFAULT_MAX_BYTES) {
+  const out = { ...ctx, truncated: false, omittedMessages: 0 };
+  const messages = Array.isArray(ctx.messages) ? ctx.messages.slice() : [];
+  const FIXED_OVERHEAD = 4_000;
+  const estimateBytes = (msgs) =>
+    FIXED_OVERHEAD + msgs.reduce((n, m) => n + (m.text ? m.text.length : 0), 0);
+  if (estimateBytes(messages) <= maxBytes) {
+    out.messages = messages;
+    return out;
+  }
+  if (messages.length > HEAD_KEEP + TAIL_KEEP) {
+    const omitted = messages.length - HEAD_KEEP - TAIL_KEEP;
+    const head = messages.slice(0, HEAD_KEEP);
+    const tail = messages.slice(messages.length - TAIL_KEEP);
+    const placeholder = {
+      role: 'system',
+      text: `[…中段省略 ${omitted} 条消息…]`,
+    };
+    out.messages = head.concat([placeholder], tail);
+    out.truncated = true;
+    out.omittedMessages = omitted;
+  } else {
+    out.messages = messages;
+  }
+  if (estimateBytes(out.messages) <= maxBytes) return out;
+  out.messages = out.messages.map((m) => {
+    if (!m.text || m.text.length <= MSG_HARD_CAP_CHARS) return m;
+    return { ...m, text: m.text.slice(0, MSG_HARD_CAP_CHARS) + '…[trimmed]' };
+  });
+  if (estimateBytes(out.messages) <= maxBytes) return out;
+  while (out.messages.length > 2 && estimateBytes(out.messages) > maxBytes) {
+    out.messages.shift();
+  }
+  out.truncated = 'hard';
+  return out;
+}
+// ---------------------------------------------------------------------------
+//  Formatting helpers
+// ---------------------------------------------------------------------------
+/**
+ * Format a statistic for display in the prompt.
+ *
+ *   - null / undefined / NaN → '–' placeholder
+ *   - any other non-number   → String(n)
+ *   - integer                → en-US digit grouping (e.g. 12,345)
+ *   - any other number       → fixed to 3 decimal places
+ *
+ * @param {*} n
+ * @returns {string}
+ */
+function fmtNum(n) {
+  if (n == null || Number.isNaN(n)) return '–';
+  if (typeof n !== 'number') return String(n);
+  if (Number.isInteger(n)) return n.toLocaleString('en-US');
+  return n.toFixed(3);
+}
+/**
+ * Render the tool-usage summary as fixed-width text rows, one per tool.
+ *
+ * Only the first 20 entries are rendered, in the order given; nothing is
+ * sorted here, so any ranking must be done by the caller.  Each row shows
+ * the name (padded to 20), count, error count, rounded average duration
+ * in ms, and an args preview with whitespace runs collapsed to a single
+ * space and cut to 120 characters.  Missing numeric fields render as 0
+ * and a missing name as '?'.
+ *
+ * @param {Array<object>} tools  entries with name, count, errorCount,
+ *   avgDurationMs and argsPreview
+ * @returns {string} newline-joined rows, or '(无工具调用)' when `tools`
+ *   is empty or not an array
+ */
+function fmtToolTable(tools) {
+  if (!Array.isArray(tools) || tools.length === 0) return '(无工具调用)';
+  const rows = tools.slice(0, 20).map((t) =>
+    `  ${(t.name || '?').padEnd(20)} ` +
+    `count=${String(t.count ?? 0).padStart(4)} ` +
+    `err=${String(t.errorCount ?? 0).padStart(3)} ` +
+    `avg_ms=${String(Math.round(t.avgDurationMs ?? 0)).padStart(6)} ` +
+    `args="${(t.argsPreview || '').replace(/\s+/g, ' ').slice(0, 120)}"`
+  );
+  return rows.join('\n');
+}
+/**
+ * Render the transcript as `[ROLE] text` entries separated by a line
+ * containing `---`.
+ *
+ * Messages whose text is null, undefined or '' are skipped.  The role is
+ * upper-cased and padded to 9 characters; a missing role renders as '?'.
+ *
+ * NOTE(review): when the array is non-empty but every message is skipped,
+ * the result is an empty string rather than the '(无消息)' placeholder —
+ * confirm this is intended.
+ *
+ * @param {Array<object>} messages  entries with role and text
+ * @returns {string} formatted transcript, or '(无消息)' when `messages`
+ *   is empty or not an array
+ */
+function fmtMessages(messages) {
+  if (!Array.isArray(messages) || messages.length === 0) {
+    return '(无消息)';
+  }
+  return messages
+    .filter((m) => m.text != null && m.text !== '')
+    .map((m) => {
+      const role = (m.role || '?').toUpperCase().padEnd(9);
+      return `[${role}] ${m.text}`;
+    })
+    .join('\n---\n');
+}
+// ---------------------------------------------------------------------------
+//  Prompt
+// ---------------------------------------------------------------------------
+/**
+ * Assemble the full advice prompt as a single string.
+ *
+ * Layout: the ADVICE_SENTINEL line, the fixed instruction text (scope,
+ * output contract, hard rules), a block of basic session facts, the tool
+ * usage table, and finally the message transcript.
+ *
+ * Fields read from `ctx` (see spec §4.4 for the full shape):
+ *   - ctx.session: model, difficulty, durationMinutes, messageCount,
+ *     userCount, assistantCount, toolCallCount, errorCount, cost,
+ *     reverted, and tokens { input, output, reasoning, cacheRead,
+ *     cacheWrite }.  A missing session or tokens object is treated as
+ *     empty and its fields fall back to placeholders.
+ *   - ctx.toolBreakdown: rendered by fmtToolTable.
+ *   - ctx.messages: rendered by fmtMessages.
+ *   - ctx.truncated / ctx.omittedMessages: select the truncation note
+ *     placed next to the transcript heading.
+ *
+ * This function does not truncate anything itself; pass the result of
+ * truncateContext() when the transcript may exceed the size budget.
+ *
+ * @param {object} ctx  must be an object; the fields above are optional
+ * @returns {string}
+ */
+function buildAdvicePrompt(ctx) {
+  const s = ctx.session || {};
+  const t = s.tokens || {};
+  // 'hard' gets a generic warning; any other truthy value reports how many
+  // mid-conversation messages were omitted; falsy adds no note.
+  const truncatedNote =
+    ctx.truncated === 'hard'
+      ? '（注意:会话很长,已强力截断,部分内容缺失。）'
+      : ctx.truncated
+      ? `（注意:中段已省略 ${ctx.omittedMessages} 条消息。）`
+      : '';
+  return `${ADVICE_SENTINEL}（内部标记,忽略本行）
+你是一位 AI 协作教练。我给你一段开发者与 AI 编程助手的会话原文,以及
+非常少量的基础统计(模型、时长、token、工具次数)。你的任务:仔细
+阅读对话本身,评估开发者「如何使用 AI」,给出可执行的改进建议。
+# 关键原则:只看对话,不要套用任何指标
+我们的系统另外有一套 H1/H2/E1/O1 等评分体系,但本次评估**不会**把
+那些分数喂给你,也禁止你引用、推测或编造任何此类分数。
+  禁止出现的表达举例:
+    - 「H1=63 偏低,所以问题定义不清晰」
+    - 「子分 clarity=0.45,说明开场模糊」
+    - 「评分显示效率不足」
+    - 「L2 水平,有改进空间」
+  即使下文有的统计字段(消息数、token、错误次数)可以引用,也只能作为
+  「对话事实」的描述,不能换算成抽象分数或等级。
+每一条建议的 evidence 必须直接指向对话内容或基础统计中的具体事实,例如:
+  - 「第 3 条用户消息只说『改一下』,未指出文件路径」
+  - 「同一个 read 工具被连续调用 12 次,看起来在反复检索」
+  - 「开场没有给出任何代码片段或文件名」
+# 评估对象与禁区
+你要评估的是「协作方式」,不是「业务内容」:
+可以谈:
+  - 用户的提问方式(是否清晰、是否分步、是否说明意图)
+  - 上下文准备(是否提供文件路径、约束、示例、依赖)
+  - 工具与模型的使用(是否选对工具、是否过度调用、缓存命中率)
+  - 流程节奏(回退次数、纠偏速度、是否过早收敛)
+  - 是否适合引入一个 opencode skill 或 subagent 来自动化重复模式
+  - 成本与 token 经济性
+不要谈:
+  - 对话中讨论的具体技术 / 代码是否正确(如「你写的 SQL JOIN 错了」、
+    「这个 React Hook 用法不对」、「应该用更高效的算法」)
+  - 业务领域内的推荐(「应该改用 PostgreSQL 而不是 MySQL」)
+  - 任何需要运行代码 / 看完整仓库才能下的判断
+  上述话题属于代码评审,不属于协作分析,即使你能看出问题也不要写。
+如果对话里只有业务讨论、看不出可改进的协作模式,5 个类别都给空数组,
+summary 写「本会话以业务讨论为主,协作模式无明显问题」。
+# 输出契约
+只输出严格 JSON,不要 markdown 代码块,不要多余文字。结构如下:
+{
+  "summary": "≤60 字的一句话总评(只谈协作)",
+  "categories": {
+    "cost":     [AdviceItem, ...],
+    "accuracy": [AdviceItem, ...],
+    "context":  [AdviceItem, ...],
+    "skills":   [AdviceItem, ...],
+    "workflow": [AdviceItem, ...]
+  },
+  "rationale": "≤80 字综合理由(只谈协作)"
+}
+AdviceItem:
+{
+  "severity":   "high" | "medium" | "low",
+  "title":      "≤20 字",
+  "why":        "1 句话,说明协作上的问题",
+  "action":     "1 句话,具体可操作的改变(下次怎么做)",
+  "evidence":   "引自第 N 条消息 / 工具 X / 基础统计 — 必须是对话事实,不得引用任何评分",
+  "actionable": true | false,
+  "executor":   "opencode" | "claude" | "manual",
+  "cwd_hint":   "project_root"
+}
+# 硬规则
+1. 5 个 categories 键必须存在,无内容给空数组。
+2. 全部 AdviceItem 总数 ≤ 6 条,只挑最值得改的;按 severity 由高到低排。
+3. 每条 evidence 必须能在下文对话或基础统计中找到原话/原始数字;
+   出现「分数」「等级」「Lx」「子分」「H1」「H2」「E1」「O1」字眼一律视为违规。
+4. action 必须是「下次怎么做」级别的协作动作,不是「这段代码应该改成 X」。
+5. 类别定义:
+   - cost     省钱:模型档位、prompt 长度、工具调用次数、缓存利用。
+   - accuracy 提准确率(协作层):暴露隐藏假设、要求 AI 自检、加入验证步骤。
+                 不是「业务结论是否正确」。
+   - context  上下文准备:开场是否给出文件 / 依赖 / 约束 / 示例 / 期望输出。
+   - skills   推荐新建或使用 opencode skill / subagent。
+                 每条 action 给出:skill 名 + 触发条件 + 一句话用途。
+   - workflow 流程与节奏:拆解、迭代步幅、回退策略、人 ↔ AI 分工。
+6. actionable=true 的条件必须同时满足:
+   - 是「写代码 / 改文件 / 加 skill」类具体动作;
+   - 在原项目根目录运行 opencode/claude 就能完成,无需补充人类专属知识。
+   不属于这一类(如「下次开场用模板」「以后多用缓存」「对 AI 的指令更
+   具体」)的 → actionable=false, executor='manual'。
+7. executor:
+   - 显式动手做的任务(创建文件、改代码、写 skill) → 'opencode' 或 'claude';
+   - 单纯让人类调整行为的建议 → 'manual'。
+   不会判断时填 'opencode'。
+8. cwd_hint: 目前只能填 "project_root"。
+9. actionable 与 executor 必须一致:executor='manual' 时 actionable 必须 false;
+   executor 是 'opencode'/'claude' 时 actionable 通常 true。
+# 会话基础(只作事实参考,不要换算成分数)
+模型:      ${s.model || '未知'}
+难度:      ${s.difficulty ?? '?'} / 4
+时长:      ${fmtNum(s.durationMinutes)} 分钟
+消息:      ${fmtNum(s.messageCount)} 条  (用户 ${fmtNum(s.userCount)} / 助手 ${fmtNum(s.assistantCount)})
+工具调用:  ${fmtNum(s.toolCallCount)} 次  错误 ${fmtNum(s.errorCount)}
+Token:     in ${fmtNum(t.input)} / out ${fmtNum(t.output)} / reasoning ${fmtNum(t.reasoning)} / cacheR ${fmtNum(t.cacheRead)} / cacheW ${fmtNum(t.cacheWrite)}
+成本:      $${typeof s.cost === 'number' ? s.cost.toFixed(4) : '–'}
+已回退:    ${s.reverted ? '是' : '否'}
+# 工具使用 Top 20
+${fmtToolTable(ctx.toolBreakdown)}
+# 消息全文 ${truncatedNote}
+${fmtMessages(ctx.messages)}
+—— 输出 JSON ——`;
+}
+// Public API.  The fmt* helpers and the HEAD_KEEP / TAIL_KEEP /
+// MSG_HARD_CAP_CHARS tuning constants are not exported.
+module.exports = {
+  ADVICE_SENTINEL,
+  ADVICE_PROMPT_VERSION,
+  CATEGORIES,
+  DEFAULT_MAX_BYTES,
+  buildAdvicePrompt,
+  truncateContext,
+};