npm - @seanyao/roll - Versions diffs - 2026.522.2 → 2026.523.2 - Mend

@seanyao/roll 2026.522.2 → 2026.523.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/CHANGELOG.md +46 -0
package/bin/dream-test-quality-scan +110 -0
package/bin/roll +761 -82
package/lib/__pycache__/model_prices.cpython-314.pyc +0 -0
package/lib/__pycache__/prices_fetcher.cpython-314.pyc +0 -0
package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
package/lib/__pycache__/roll_render.cpython-314.pyc +0 -0
package/lib/i18n.sh +113 -0
package/lib/loop-fmt.py +62 -3
package/lib/model_prices.py +78 -38
package/lib/prices/snapshot-2026-05-22.json +20 -0
package/lib/prices_fetcher.py +285 -0
package/lib/roll-loop-status.py +88 -48
package/lib/roll_render.py +20 -8
package/package.json +1 -1
package/skills/roll-.dream/SKILL.md +59 -0
package/skills/roll-design/SKILL.md +4 -3
package/skills/roll-notes/SKILL.md +6 -3

package/lib/roll-loop-status.py CHANGED Viewed

@@ -153,7 +153,11 @@ def load_backlog(project_root: Optional[Path] = None) -> Dict[str, str]:
 # ════════════════════════════════════════════════════════════════════════════
 # Cycle aggregation — group events by cycle label; attach cron + story id
 # ════════════════════════════════════════════════════════════════════════════
-_STORY_ID_PAT = re.compile(r"\b([A-Z]+(?:-[A-Z]+)*-\d+)\b")
+# FIX-108: each segment was [A-Z]+ (letters only), so alphanumeric segments
+# like I18N / K8S / D2 / S3 failed to match — dashboard silently dropped any
+# story id with a mixed-letter-digit segment (US-I18N-001 etc.). Each segment
+# must still start with a letter (so "001-002" and digit-led 2FA don't match).
+_STORY_ID_PAT = re.compile(r"\b([A-Z][A-Z0-9]*(?:-[A-Z][A-Z0-9]*)*-\d+)\b")
 _PR_NUM_PAT = re.compile(r"/pull/(\d+)")
 def _extract_story_id(ev_detail: str) -> Optional[str]:
@@ -356,21 +360,29 @@ def backfill_usage_from_claude_sessions(cycles: List[Dict[str, Any]], slug: str)
         # Path 1: usage event written by loop-fmt at result time.
         ue = cy.get("usage_event")
         if isinstance(ue, dict) and (ue.get("input_tokens") or ue.get("output_tokens")):
-            cy["input_tokens"]  = int(ue.get("input_tokens")  or 0)
-            cy["output_tokens"] = int(ue.get("output_tokens") or 0)
+            cy["input_tokens"]          = int(ue.get("input_tokens")          or 0)
+            cy["output_tokens"]         = int(ue.get("output_tokens")         or 0)
+            cy["cache_creation_tokens"] = int(ue.get("cache_creation_tokens") or 0)
+            cy["cache_read_tokens"]     = int(ue.get("cache_read_tokens")     or 0)
             cy["model"] = ue.get("model")
-            # US-VIEW-010: aggregate now sums per-turn usage tokens, so the
-            # totals in `ue` reflect the whole cycle. Always compute cost at
-            # list price for cross-account comparability — supersedes FIX-060
-            # which preferred cost_reported_usd as a workaround for
-            # last-event-only token counts (that root cause is now gone).
-            cy["cost_list"] = mp.compute_list_cost(
-                ue.get("model"),
-                input_tokens=ue.get("input_tokens", 0),
-                output_tokens=ue.get("output_tokens", 0),
-                cache_creation_tokens=ue.get("cache_creation_tokens", 0),
-                cache_read_tokens=ue.get("cache_read_tokens", 0),
-            )
+            # US-VIEW-014: prefer the cost frozen at cycle_end so a later
+            # prices refresh never rewrites a historical cycle's cost. Only
+            # legacy events (pre-US-VIEW-014) fall back to recomputing — and
+            # the row gets a muted [legacy] tag so it can't be mistaken for
+            # the authoritative value.
+            persisted = ue.get("cost_list_usd")
+            if persisted is not None:
+                cy["cost_list"]        = float(persisted)
+                cy["cost_list_legacy"] = False
+            else:
+                cy["cost_list"] = mp.compute_list_cost(
+                    ue.get("model"),
+                    input_tokens=ue.get("input_tokens", 0),
+                    output_tokens=ue.get("output_tokens", 0),
+                    cache_creation_tokens=ue.get("cache_creation_tokens", 0),
+                    cache_read_tokens=ue.get("cache_read_tokens", 0),
+                )
+                cy["cost_list_legacy"] = True
             if ue.get("duration_ms") and not cy.get("duration_s"):
                 cy["duration_s"] = int(ue["duration_ms"] / 1000)
             continue
@@ -380,8 +392,10 @@ def backfill_usage_from_claude_sessions(cycles: List[Dict[str, Any]], slug: str)
         u = load_claude_session_usage(cy.get("label", ""), slug)
         if not u:
             continue
-        cy["input_tokens"]  = int(u.get("input_tokens")  or 0)
-        cy["output_tokens"] = int(u.get("output_tokens") or 0)
+        cy["input_tokens"]          = int(u.get("input_tokens")          or 0)
+        cy["output_tokens"]         = int(u.get("output_tokens")         or 0)
+        cy["cache_creation_tokens"] = int(u.get("cache_creation_tokens") or 0)
+        cy["cache_read_tokens"]     = int(u.get("cache_read_tokens")     or 0)
         cy["model"] = u["model"]
         cy["cost_list"] = mp.compute_list_cost(
             u["model"],
@@ -390,25 +404,39 @@ def backfill_usage_from_claude_sessions(cycles: List[Dict[str, Any]], slug: str)
             cache_creation_tokens=u["cache_creation_tokens"],
             cache_read_tokens=u["cache_read_tokens"],
         )
+        # US-VIEW-014: session salvage never has a frozen cycle_end cost, so
+        # this path is always legacy.
+        cy["cost_list_legacy"] = True
         if u.get("duration_ms") and not cy.get("duration_s"):
             cy["duration_s"] = int(u["duration_ms"] / 1000)
 def load_pr_merges_from_git(days: int) -> Dict[str, Dict[str, Any]]:
     """Repair fallback: when events.ndjson dropped the pr / cycle_end events
-    for a cycle (events writer regressions), git log still has the merge
-    commit `Merge pull request #N from seanyao/loop/cycle-LABEL`. Extract
-    PR number + story IDs from the merge subject + body so orphan cycles
-    can be reclassified done instead of permanently '⏵ running'."""
+    for a cycle (events writer regressions, or cycle_end fired before PR
+    merged), git log still has the merge commit. Two known subject formats:
+      - Branch-named (Merge commit / older squash): "Merge pull request #N
+        from seanyao/loop/cycle-LABEL" — the branch name carries the label.
+      - Squash with default-title (newer GitHub UI / `gh pr merge --squash`):
+        "loop cycle LABEL (#N)" — space-separated, no slash.
+    FIX-107: the old --grep="loop/cycle-" + label_re missed the squash
+    subject entirely, so PRs merged AFTER cycle_end never got their
+    pr_outcome promoted to 'merged' on the dashboard.
+    """
     try:
         out = subprocess.check_output(
             ["git", "log", f"--since={days + 1} days ago",
-             "--grep=loop/cycle-", "--format=%H|||%s|||%b<<<END>>>"],
+             "--grep=loop[ /]cycle", "--extended-regexp",
+             "--format=%H|||%s|||%b<<<END>>>"],
             text=True, errors="ignore"
         )
     except Exception:
         return {}
     result: Dict[str, Dict[str, Any]] = {}
-    label_re  = re.compile(r"loop/cycle-([A-Za-z0-9-]+)")
+    # Accept both `loop/cycle-LABEL` and `loop cycle LABEL`; the separator
+    # after `cycle` may be `-` or whitespace. LABEL = YYYYMMDD-HHMMSS-PID.
+    label_re  = re.compile(r"loop[ /]cycle[-\s](\d{8}-\d+-\d+)")
     pr_re     = re.compile(r"#(\d+)")
     story_re  = re.compile(r"\b([A-Z]+(?:-[A-Z]+)*-\d+)\b")
     for chunk in out.split("<<<END>>>"):
@@ -557,7 +585,8 @@ def rollup_for_day(day_cycles: List[Dict[str, Any]]) -> Dict[str, Any]:
     # reads all 4 fields), but they don't represent the model's actual work.
     r = {"cycles": len(day_cycles), "prs": 0, "failed": 0,
          "duration_s": 0, "cost": 0.0,
-         "input_tokens": 0, "output_tokens": 0}
+         "input_tokens": 0, "output_tokens": 0,
+         "cache_creation_tokens": 0, "cache_read_tokens": 0}
     for cy in day_cycles:
         if cy.get("outcome") == "fail":
             r["failed"] += 1
@@ -567,6 +596,10 @@ def rollup_for_day(day_cycles: List[Dict[str, Any]]) -> Dict[str, Any]:
             r["input_tokens"] += cy["input_tokens"]
         if cy.get("output_tokens"):
             r["output_tokens"] += cy["output_tokens"]
+        if cy.get("cache_creation_tokens"):
+            r["cache_creation_tokens"] += cy["cache_creation_tokens"]
+        if cy.get("cache_read_tokens"):
+            r["cache_read_tokens"] += cy["cache_read_tokens"]
         # US-VIEW-011: rollup only counts cycles whose PR actually merged.
         # Backward compat: rows where pr_outcome is missing but pr URL exists
         # (no `pr` event after the writer upgrade ran for that cycle) are
@@ -634,10 +667,13 @@ def render(events, cron, state, backlog, *, days=3, lang="both", now=None,
                     c("dim", "run ") + c("fg", "roll loop on", bold=True) +
                     c("dim", " to enable"))
             eb_zh = c("dim", "  未安装 · 运行 ") + c("fg", "roll loop on") + c("dim", " 启用")
-        elif install_state == "disabled":
-            eb_l = (c("amber", "◌ installed/off", bold=True) + c("muted", "   ") +
-                    c("dim", "loop disabled — run ") + c("fg", "roll loop on", bold=True))
-            eb_zh = c("dim", "  未启用 · 运行 ") + c("fg", "roll loop on") + c("dim", " 启用")
+        elif install_state in ("stale", "disabled"):
+            # FIX-098: 'stale' = plist on disk but agent not registered in launchd.
+            # 'disabled' kept for back-compat (old install_state values). Both mean
+            # the user needs to run 'roll loop on' to bootstrap the agent.
+            eb_l = (c("amber", "◌ STALE — plist present, not loaded", bold=True) + c("muted", "   ") +
+                    c("dim", "run ") + c("fg", "roll loop on", bold=True) + c("dim", " to repair"))
+            eb_zh = c("dim", "  Plist 存在但未加载 · 运行 ") + c("fg", "roll loop on") + c("dim", " 修复")
         else:
             eb_l = (c("blue", "● IDLE", bold=True) + c("muted", " · ") +
                     c("dim", "enabled · next run ") + c("fg", _next_cron_hint(state), bold=True))
@@ -723,11 +759,12 @@ def render(events, cron, state, backlog, *, days=3, lang="both", now=None,
            yest_color="amber" if yest["failed"] > 0 else "dim",
            yest_suffix="⚠" if yest["failed"] > 0 else "")
     metric_dur("duration", today["duration_s"], yest["duration_s"], d2["duration_s"], partial=is_partial)
-    # US-VIEW-012: input + output as two separate rows. cache_read no longer
-    # surfaces here — true cost is on the "cost" line below (computed from all
-    # 4 token kinds via list price). This row labels what the model actually
-    # processed and generated for this cycle.
+    # US-VIEW-017: show all 4 token components so the cost is explainable.
+    # cache_creation (↑) and cache_read (↓) typically account for 80-90% of
+    # cost — hiding them makes the cost line incomprehensible.
     metric_tokens("input tokens",  today["input_tokens"],  yest["input_tokens"],  d2["input_tokens"],  partial=is_partial)
+    metric_tokens("cache writes",  today["cache_creation_tokens"], yest["cache_creation_tokens"], d2["cache_creation_tokens"], partial=is_partial)
+    metric_tokens("cache reads",   today["cache_read_tokens"],     yest["cache_read_tokens"],     d2["cache_read_tokens"],     partial=is_partial)
     metric_tokens("output tokens", today["output_tokens"], yest["output_tokens"], d2["output_tokens"], partial=is_partial)
     metric_dollar("cost",   today["cost"],      yest["cost"],      d2["cost"],       partial=is_partial)
@@ -784,15 +821,18 @@ def _read_plist_loop_minute() -> int:
 def _detect_install_state() -> str:
-    """FIX-095: classify the launchd install state of the loop service.
+    """FIX-095 / FIX-098: classify the launchd install state of the loop service.
     Returns one of:
       'not-installed' — no plist for com.roll.loop.<slug> in ~/Library/LaunchAgents/
-      'disabled'      — plist exists but launchctl print-disabled shows '=> disabled'
-      'enabled'       — plist exists and no disable override is set
-    Pre-FIX-095, the v2 view rendered '● IDLE' for all three states, leaving
-    users unable to tell whether the loop was actually installed/enabled.
+      'stale'         — plist on disk but agent NOT registered in launchd
+                        (happens after roll loop off + roll update without roll loop on)
+      'enabled'       — plist on disk AND registered in launchd
+    FIX-098: switched from `launchctl print-disabled` (disabled-overrides DB) to
+    `launchctl print gui/<uid>/<label>` which probes the actual launchd registry.
+    The old approach returned false-positive 'enabled' when the disabled-overrides
+    DB had no entry for the label (no entry means "not explicitly disabled", not "loaded").
     """
     slug = project_slug()
     label = f"com.roll.loop.{slug}"
@@ -801,17 +841,17 @@ def _detect_install_state() -> str:
         return "not-installed"
     try:
         uid = os.getuid()
-        out = subprocess.run(
-            ["launchctl", "print-disabled", f"gui/{uid}"],
-            capture_output=True, text=True, timeout=2,
-        ).stdout or ""
-        for line in out.splitlines():
-            if f'"{label}"' in line and "=> disabled" in line:
-                return "disabled"
+        result = subprocess.run(
+            ["launchctl", "print", f"gui/{uid}/{label}"],
+            capture_output=True, timeout=2,
+        )
+        if result.returncode == 0:
+            return "enabled"
+        return "stale"
     except Exception:
-        # launchctl missing or timed out — best-effort fall through to enabled.
-        pass
-    return "enabled"
+        # launchctl missing or timed out — assume stale (safe: user sees STALE
+        # banner and is told to run 'roll loop on' to repair).
+        return "stale"
 def _next_cron_hint(state: Dict[str, str], zh: bool = False) -> str:

package/lib/roll_render.py CHANGED Viewed

@@ -298,12 +298,19 @@ def cycle_row(cy: Dict[str, Any], backlog: Dict[str, str]) -> None:
         from datetime import datetime as _dt, timezone as _tz
         dur_s = int((_dt.now(_tz.utc) - cy["start"]).total_seconds())
     dur = fmt_dur(dur_s) if dur_s else "—"
-    # US-VIEW-012: token column shows model's real work as input/output. Cache
-    # creation / cache read are kept in events.ndjson for cost math but never
-    # surface in the UI — they would inflate the visible number to 10–100× the
-    # "real" work done by the model on this cycle. fmt_tokens(0) already
-    # returns "—", so a cycle missing usage_event prints as "—/—".
-    tok = f"{fmt_tokens(cy.get('input_tokens') or 0)}/{fmt_tokens(cy.get('output_tokens') or 0)}"
+    # US-VIEW-017: show all 4 token components when cache data is available.
+    # Format: "in/cw↑ cr↓/out" (cache writes ↑, cache reads ↓).
+    # Falls back to "in/out" for cycles that predate cache tracking.
+    inp = cy.get('input_tokens') or 0
+    out_tok = cy.get('output_tokens') or 0
+    cw  = cy.get('cache_creation_tokens') or 0
+    cr  = cy.get('cache_read_tokens') or 0
+    if cw or cr:
+        tok = (f"{fmt_tokens(inp)}"
+               f"/{fmt_tokens(cw)}↑ {fmt_tokens(cr)}↓"
+               f"/{fmt_tokens(out_tok)}")
+    else:
+        tok = f"{fmt_tokens(inp)}/{fmt_tokens(out_tok)}"
     # cost prefers the backfilled list-price; falls back to cron.log when
     # the claude session log isn't available (only the latest cycle).
     if cy.get("cost_list") is not None:
@@ -343,14 +350,19 @@ def cycle_row(cy: Dict[str, Any], backlog: Dict[str, str]) -> None:
             "open":   ("dim",   "…"),
         }.get(pr_outcome, ("dim", "…"))
         pr_marker = " " + c(mark_c, f"#{pr_num} {mark_sym}")
+    # US-VIEW-014: pre-US-VIEW-014 events (no frozen cost_list_usd at
+    # cycle_end) get a muted [legacy] suffix — the number is recomputed on
+    # the fly and can shift with future price changes, unlike the frozen
+    # values written by current loop-fmt.
+    legacy_marker = " " + c("muted", "[legacy]") if cy.get("cost_list_legacy") else ""
     inner = (
         "  " + c(glyph_c, glyph, bold=True) + "  " +
         c(time_c, pad(time_str, 5), bold=(outcome == "fail")) + "   " +
         c("muted", pad(dur, 4, "r")) + "  " +
-        c("muted", pad(tok, 11, "r")) + "  " +
+        c("muted", pad(tok, 26)) + "  " +
         model_seg +
         c("muted", pad(cost, 7, "r")) + "   " +
-        c(sid_c, ids_str, bold=True) + pr_marker
+        c(sid_c, ids_str, bold=True) + pr_marker + legacy_marker
     )
     # Subtle red bg on failure rows so a fail can't be missed at a glance.
     if outcome == "fail" and USE_COLOR:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@seanyao/roll",
-  "version": "2026.522.2",
+  "version": "2026.523.2",
   "description": "Roll — Roll out features with AI agents",
   "scripts": {
     "test": "bash tests/run.sh"

package/skills/roll-.dream/SKILL.md CHANGED Viewed

@@ -224,6 +224,65 @@ Add after `## 文档覆盖度` section:
 {发现内容列表 或 "文档新鲜度良好，无滞后或缺失项。"}
 ```
+### Scan 7 — Test Quality (rubric-driven)
+Apply the test-quality rubric at [guide/en/testing/quality-rubric.md](../../guide/en/testing/quality-rubric.md)
+(Chinese: [quality-rubric.zh.md](../../guide/zh/testing/quality-rubric.md)) against every file under
+`tests/`. The rubric publishes six anti-pattern categories (❶..❻); each has a
+**Signals** subsection that lists the matching heuristics. Scan 7 is purely a
+mechanical apply-the-rubric step — no new logic.
+**Per-category signals** — read from the rubric, summarized here:
+| Marker | Anti-pattern | Cheapest signal |
+|--------|--------------|-----------------|
+| ❶ | Hardcoded business data | Bare numeric / version / pricing literal inside `[[ "$output" == *"..."*` that matches a value also defined in `lib/` |
+| ❷ | Over-mocking real boundaries | `function git() {` / `function gh() {` overrides at the top of a unit test |
+| ❸ | Asserting implementation details | `grep '_internal_helper'` against output; assertions on `.roll/internal/*` paths |
+| ❹ | Fixture order coupling | `setup_file` writes shared mutable state without per-test reset |
+| ❺ | Testing private functions | Test sources a `lib/` file and calls a `_underscore_prefixed` helper directly |
+| ❻ | Asserting framework behavior | References to `$BATS_TEST_NUMBER`, `$BATS_SUITE_NAME` in assertions |
+**Rate cap — 每轮 ≤ 5 条 test-quality REFACTOR entries**. A single dream cycle may
+surface more than 5 findings; the dream scan must rank them by severity (❶ > ❷ > ❸ > ❹ > ❺ > ❻,
+and within a category, by occurrence count) and persist only the top 5 to BACKLOG.
+Remaining findings go into the dream log under `## 测试质量` but are not made
+into REFACTOR rows — this prevents the backlog from being drowned in test-debt
+on the first scan after rubric publication.
+**REFACTOR entry format** — same as other scans, but tagged with category:
+```markdown
+| REFACTOR-XXX | docs: <one-line description> [test-quality:❶] — flagged by dream YYYY-MM-DD | 📋 Todo |
+```
+The `[test-quality:❶]` (through `❻`) tag is **required** so downstream filtering
+(e.g. "show me all ❶ items still open") is mechanical. The marker character must
+match the rubric exactly.
+**Optional helper** — `bin/dream-test-quality-scan` is a thin shell script
+maintainers can invoke ad-hoc to dry-run the ❶ detector against a single file
+or directory (see `bin/dream-test-quality-scan --help`). The dream skill itself
+does **not** depend on the helper — Scan 7 is the AI agent applying the rubric.
+The helper just exists so a maintainer (or this skill's smoke test) can confirm
+the ❶ heuristic still finds known instances.
+#### Dream Log Section (Scan 7)
+Add after `## 文档新鲜度` section:
+```markdown
+## 测试质量
+- 本轮发现 {N} 项（写入 BACKLOG 的前 5 项见下；剩余 {M} 项仅记录于本日志）
+- ❶ 硬编码业务数据：{count}
+- ❷ 过度 mock：{count}
+- ❸ 断言实现细节：{count}
+- ❹ Fixture 顺序耦合：{count}
+- ❺ 测私有函数：{count}
+- ❻ 断言框架行为：{count}
+{命中文件列表 或 "未发现可治理的测试反模式。"}
+```
 ## Output
 ### REFACTOR Entry (.roll/backlog.md)

package/skills/roll-design/SKILL.md CHANGED Viewed

@@ -118,9 +118,10 @@ Document structure (two-layer separation):
 **Important rules:**
 1. Plan files go in `.roll/features/<feature>-plan.md` (**no longer using** `docs/plans/`)
 2. US details go in the corresponding `.roll/features/<feature>.md`
-3. .roll/backlog.md only contains index rows (one row per US), **do not write** AC / Files / Notes
-4. Domain model files go in `.roll/domain/` — create on first greenfield design, update incrementally
-5. **Do not** write to `~/.kimi/` or any global config directory
+3. **FIX / IDEA detail files use ID-prefixed filenames**: `.roll/features/<epic>/FIX-097.md`, not `.roll/features/<epic>/some-descriptive-slug.md`. Reason: a single FIX is one card, not a long-lived feature; the ID is the most stable handle, descriptive slugs date quickly and break links. US can keep feature-slug naming (US lives inside a multi-Story feature file). Quick lookup: `ls .roll/features/<epic>/FIX-*.md` finds all bugs in that area without grepping content.
+4. .roll/backlog.md only contains index rows (one row per US), **do not write** AC / Files / Notes
+5. Domain model files go in `.roll/domain/` — create on first greenfield design, update incrementally
+6. **Do not** write to `~/.kimi/` or any global config directory
 **File path resolution order:**
 1. Determine Feature ownership (based on the requirement domain: compiler / ingest / qa / ...)

package/skills/roll-notes/SKILL.md CHANGED Viewed

@@ -29,7 +29,7 @@ $roll-notes 今天的 code review 给了很好的反馈
 ## Behavior
-1. **Determine file path**: `notes/YYYY-MM-DD.md` relative to project root
+1. **Determine file path**: `.roll/notes/YYYY-MM-DD.md` relative to project root (parallel to `.roll/dream/` and `.roll/briefs/` — notes is project metadata, not source)
 2. **Get current time**: Use `Asia/Shanghai` timezone (`TZ=Asia/Shanghai date`)
 3. **Read existing entries for style**: Before writing, read the last 2–3 entries
    in the same file. Analyze their style: heading format, voice/tone,
@@ -95,6 +95,9 @@ $roll-notes 今天的 code review 给了很好的反馈
 ## File location
 ```
-notes/
-  └── YYYY-MM-DD.md
+.roll/
+  └── notes/
+        └── YYYY-MM-DD.md
 ```
+注：notes 是项目元数据（与 `.roll/dream/` / `.roll/briefs/` 同级），不入 git；由 dream/brief 等下游 skill 跨日聚合。