npm - @seanyao/roll - Versions diffs - 0.5.0 → 2.602.2 - Mend

@seanyao/roll 0.5.0 → 2.602.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181) hide show

package/CHANGELOG.md +736 -0
package/LICENSE +21 -0
package/README.md +65 -165
package/bin/dream-test-quality-scan +110 -0
package/bin/roll +15030 -814
package/conventions/config.yaml +17 -1
package/conventions/global/AGENTS.md +146 -100
package/conventions/global/CLAUDE.md +1 -21
package/conventions/global/GEMINI.md +8 -22
package/conventions/global/project_rules.md +9 -0
package/conventions/templates/backend-service/AGENTS.md +30 -81
package/conventions/templates/backend-service/GEMINI.md +3 -3
package/conventions/templates/backend-service/project_rules.md +16 -0
package/conventions/templates/cli/AGENTS.md +31 -58
package/conventions/templates/cli/CLAUDE.md +3 -5
package/conventions/templates/cli/GEMINI.md +3 -3
package/conventions/templates/cli/project_rules.md +16 -0
package/conventions/templates/frontend-only/AGENTS.md +29 -64
package/conventions/templates/frontend-only/GEMINI.md +3 -3
package/conventions/templates/frontend-only/project_rules.md +14 -0
package/conventions/templates/fullstack/AGENTS.md +31 -79
package/conventions/templates/fullstack/CLAUDE.md +1 -1
package/conventions/templates/fullstack/GEMINI.md +3 -3
package/conventions/templates/fullstack/project_rules.md +15 -0
package/lib/README.md +42 -0
package/lib/__pycache__/github_sync.cpython-314.pyc +0 -0
package/lib/__pycache__/loop-fmt.cpython-314.pyc +0 -0
package/lib/__pycache__/loop_result_eval.cpython-314.pyc +0 -0
package/lib/__pycache__/loop_unstick.cpython-314.pyc +0 -0
package/lib/__pycache__/model_prices.cpython-314.pyc +0 -0
package/lib/__pycache__/prices_fetcher.cpython-314.pyc +0 -0
package/lib/__pycache__/roll-home.cpython-314.pyc +0 -0
package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
package/lib/__pycache__/roll_git.cpython-314.pyc +0 -0
package/lib/__pycache__/roll_render.cpython-314.pyc +0 -0
package/lib/__pycache__/slides-render.cpython-314.pyc +0 -0
package/lib/agent_usage/README.md +49 -0
package/lib/agent_usage/__init__.py +108 -0
package/lib/agent_usage/__pycache__/__init__.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/gemini.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/kimi.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/openai.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/pi.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/pi_emit.cpython-314.pyc +0 -0
package/lib/agent_usage/__pycache__/qwen.cpython-314.pyc +0 -0
package/lib/agent_usage/gemini.py +127 -0
package/lib/agent_usage/kimi.py +278 -0
package/lib/agent_usage/kimi_emit.py +123 -0
package/lib/agent_usage/openai.py +126 -0
package/lib/agent_usage/pi.py +200 -0
package/lib/agent_usage/pi_emit.py +135 -0
package/lib/agent_usage/qwen.py +128 -0
package/lib/backfill-pi-usage.py +243 -0
package/lib/changelog_audit.py +155 -0
package/lib/changelog_generate.py +263 -0
package/lib/context_feed_budget.sh +194 -0
package/lib/github_sync.py +876 -0
package/lib/i18n/README.md +54 -0
package/lib/i18n/agent.sh +75 -0
package/lib/i18n/alert.sh +20 -0
package/lib/i18n/backlog.sh +96 -0
package/lib/i18n/brief.sh +5 -0
package/lib/i18n/changelog.sh +5 -0
package/lib/i18n/ci.sh +15 -0
package/lib/i18n/debug.sh +0 -0
package/lib/i18n/doctor.sh +44 -0
package/lib/i18n/dream.sh +0 -0
package/lib/i18n/init.sh +91 -0
package/lib/i18n/lang.sh +10 -0
package/lib/i18n/loop.sh +140 -0
package/lib/i18n/migrate.sh +74 -0
package/lib/i18n/offboard.sh +31 -0
package/lib/i18n/onboard.sh +0 -0
package/lib/i18n/peer.sh +41 -0
package/lib/i18n/peer_help.sh +25 -0
package/lib/i18n/peer_reset.sh +7 -0
package/lib/i18n/peer_status.sh +5 -0
package/lib/i18n/prices.sh +3 -0
package/lib/i18n/prices_refresh.sh +17 -0
package/lib/i18n/prices_show.sh +7 -0
package/lib/i18n/propose.sh +0 -0
package/lib/i18n/release.sh +0 -0
package/lib/i18n/research.sh +0 -0
package/lib/i18n/review_pr.sh +0 -0
package/lib/i18n/sentinel.sh +0 -0
package/lib/i18n/setup.sh +3 -0
package/lib/i18n/shared.sh +157 -0
package/lib/i18n/skills/roll-brief.sh +47 -0
package/lib/i18n/skills/roll-build.sh +97 -0
package/lib/i18n/skills/roll-design.sh +18 -0
package/lib/i18n/skills/roll-fix.sh +53 -0
package/lib/i18n/skills/roll-loop.sh +28 -0
package/lib/i18n/skills/roll-onboard.sh +33 -0
package/lib/i18n/skills_catalog.sh +30 -0
package/lib/i18n/slides.sh +3 -0
package/lib/i18n/slides_build.sh +38 -0
package/lib/i18n/slides_delete.sh +19 -0
package/lib/i18n/slides_list.sh +14 -0
package/lib/i18n/slides_logs.sh +12 -0
package/lib/i18n/slides_new.sh +15 -0
package/lib/i18n/slides_preview.sh +14 -0
package/lib/i18n/slides_templates.sh +7 -0
package/lib/i18n/status.sh +21 -0
package/lib/i18n/update.sh +24 -0
package/lib/i18n.sh +211 -0
package/lib/loop-exit-summary.py +393 -0
package/lib/loop-fmt.py +589 -0
package/lib/loop_pick_agent.py +316 -0
package/lib/loop_result_eval.py +469 -0
package/lib/loop_unstick.py +180 -0
package/lib/model_prices.py +194 -0
package/lib/prices/README.md +35 -0
package/lib/prices/snapshot-2026-05-22.json +22 -0
package/lib/prices/snapshot-2026-05-23-deepseek.json +15 -0
package/lib/prices/snapshot-2026-05-23-kimi.json +15 -0
package/lib/prices_fetcher.py +285 -0
package/lib/roll-backlog.py +225 -0
package/lib/roll-brief.py +286 -0
package/lib/roll-help.py +158 -0
package/lib/roll-home.py +556 -0
package/lib/roll-init.py +156 -0
package/lib/roll-loop-status.py +1683 -0
package/lib/roll-loop-story.py +191 -0
package/lib/roll-onboard-render.py +378 -0
package/lib/roll-peer.py +252 -0
package/lib/roll-plan-validate.py +386 -0
package/lib/roll-setup.py +102 -0
package/lib/roll-status.py +367 -0
package/lib/roll_git.py +41 -0
package/lib/roll_render.py +414 -0
package/lib/slides/components/README.md +123 -0
package/lib/slides/components/cards-2.html +9 -0
package/lib/slides/components/cards-3.html +9 -0
package/lib/slides/components/cards-4.html +9 -0
package/lib/slides/components/compare.html +22 -0
package/lib/slides/components/highlight.html +9 -0
package/lib/slides/components/pipeline.html +12 -0
package/lib/slides/components/plain.html +7 -0
package/lib/slides/components/quote.html +4 -0
package/lib/slides/components/timeline.html +9 -0
package/lib/slides/templates/introduction-v3.html +571 -0
package/lib/slides/templates/pitch.html +0 -0
package/lib/slides-render.py +778 -0
package/lib/slides-validate.py +357 -0
package/lib/test_quality_gate.py +143 -0
package/package.json +8 -7
package/skills/roll-.changelog/SKILL.md +406 -33
package/skills/roll-.clarify/SKILL.md +5 -2
package/skills/roll-.dream/SKILL.md +374 -0
package/skills/roll-.echo/SKILL.md +5 -2
package/skills/roll-.qa/SKILL.md +57 -3
package/skills/roll-.review/SKILL.md +42 -3
package/skills/roll-brief/SKILL.md +209 -0
package/skills/roll-build/SKILL.md +308 -63
package/skills/roll-debug/SKILL.md +341 -162
package/skills/roll-debug/injectable-bb.js +263 -0
package/skills/roll-deck/SKILL.md +296 -0
package/skills/roll-design/ENGINEERING_CHECKLIST.md +1 -1
package/skills/roll-design/SKILL.md +733 -94
package/skills/roll-doc/SKILL.md +595 -0
package/skills/roll-doctor/SKILL.md +192 -0
package/skills/roll-fix/SKILL.md +149 -32
package/skills/{roll-jot → roll-idea}/SKILL.md +18 -10
package/skills/roll-loop/SKILL.md +579 -0
package/skills/roll-notes/SKILL.md +103 -0
package/skills/roll-onboard/SKILL.md +234 -0
package/skills/roll-peer/SKILL.md +336 -0
package/skills/roll-propose/SKILL.md +157 -0
package/skills/roll-review-pr/SKILL.md +58 -0
package/skills/roll-sentinel/SKILL.md +11 -2
package/skills/roll-spar/SKILL.md +8 -6
package/template/.github/workflows/ci.yml +5 -2
package/template/AGENTS.md +20 -74
package/skills/roll-research/SKILL.md +0 -307
package/skills/roll-research/references/schema.json +0 -162
package/skills/roll-research/scripts/md_to_pdf.py +0 -289
package/tools/roll-fetch/SKILL.md +0 -182
package/tools/roll-fetch/package.json +0 -15
package/tools/roll-fetch/smart-web-fetch.js +0 -558
package/tools/roll-probe/SKILL.md +0 -84
/package/template/{BACKLOG.md → .roll/backlog.md} +0 -0

package/lib/loop_result_eval.py ADDED Viewed

@@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+"""Score one loop cycle's *result* against a multi-dimensional rubric (US-EVAL-001).
+This is the pure-function ground floor of loop-result-eval. It defines the
+rubric — which dimensions exist, how each maps a cycle's *facts* to a 0..1
+score, and how the weighted dimensions roll up into a single 1..10 cycle
+score — and nothing else. It does NOT collect facts, read runs.jsonl, or
+talk to git/gh; that wiring lands in US-EVAL-002.
+Distinct from skill self-scoring (US-SKILL-010..015): that is the agent's
+*subjective* self-review of a single skill run, written by the agent into
+``.roll/notes/*.md``. This is an *objective* result eval, computed from cycle
+facts with zero extra tokens, destined for the runs.jsonl ``result_eval`` block.
+Dimensions (each scored on 0..1; see ``DIMENSIONS`` for weights):
+    outcome         did the cycle actually merge into main?
+                    1.0 merged · 0.0 not merged · unknown if merge state absent
+    correctness     is the produced PR's CI green?
+                    1.0 green · 0.0 red · unknown if no CI signal
+    scope_fidelity  did the cycle complete the story it was routed to (vs
+                    going idle, picking an already-Done story, or drifting)?
+                    1.0 completed · 0.0 idle / wrong / drifted
+    quality         did the cycle add/adjust tests and avoid immediate rework?
+                    1.0 tcr_count>=1 and no follow-up rework FIX · 0.5 tests
+                    but a rework FIX landed · 0.0 tcr_count of 0 · unknown if
+                    tcr_count is absent or not a number
+    efficiency      duration vs the story's est_min budget.
+                    1.0 within budget · graded down past it · unknown if no
+                    duration or no est_min to compare against
+    cleanliness     no orphan worktrees/branches and no ALERTs raised.
+                    1.0 clean · 0.0 alerts or orphans present
+Each dimension may evaluate to the sentinel ``UNKNOWN`` when its required
+facts are absent (e.g. CI could not be fetched). Unknown dimensions are
+*excluded* from the weighted sum and the weights of the remaining dimensions
+are renormalised, so a missing fact never silently scores 0 (AC of US-EVAL-002).
+The 1..10 cycle score is::
+    weighted = sum(score_i * weight_i for known dims) / sum(weight_i for known dims)
+    cycle_score = round(1 + weighted * 9)        # 0.0 → 1, 1.0 → 10
+``result_eval`` schema (the block US-EVAL-002 writes into runs.jsonl)::
+    {
+      "version": 1,
+      "score": <int 1..10>,
+      "dims": { "<dim>": <float 0..1> | "unknown", ... }
+    }
+Backward compatibility: older runs.jsonl records simply have no ``result_eval``
+key; consumers must treat its absence as "not scored" rather than an error.
+CLI (used by the bats unit test) — reads a JSON facts object from --facts or
+stdin and prints the result_eval JSON::
+    loop_result_eval.py --facts '{"status":"merged","ci":"green",...}'
+    echo '{...}' | loop_result_eval.py
+Exit codes:
+  0 — scored
+  1 — bad/unreadable facts JSON
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+# Sentinel for a dimension whose facts are unavailable this cycle. Distinct
+# from a real 0.0 score (which means "measured, and bad").
+UNKNOWN = "unknown"
+# Value written to the "version" field of every result_eval block.
+SCHEMA_VERSION = 1
+# Rubric: dimension name → weight. Centralised single source of truth —
+# tunable here, but intentionally NOT a user-facing high-frequency knob.
+# Weights are relative; they are renormalised over the known dimensions, so
+# their absolute scale does not matter, only their ratio.
+# The tuple order is also the order in which dimensions are scored and in
+# which signals are reported.
+DIMENSIONS = (
+    ("outcome", 3.0),         # merged into main is what ultimately matters
+    ("correctness", 2.0),     # green CI on the produced PR
+    ("scope_fidelity", 2.0),  # did the right, intended work
+    ("quality", 1.0),         # tests added, no immediate rework
+    ("efficiency", 1.0),      # within the story's time budget
+    ("cleanliness", 1.0),     # no orphans / alerts
+)
+# Name → weight lookup built from DIMENSIONS.
+DIM_WEIGHTS = dict(DIMENSIONS)
+def _truthy_merged(facts) -> bool:
+    """A cycle counts as merged when status==merged or merged flag is set."""
+    if str(facts.get("status", "")).strip().lower() == "merged":
+        return True
+    return bool(facts.get("merged"))
+def _score_outcome(facts):
+    """1.0 merged · 0.0 not merged. Unknown only when there is no signal at
+    all (no status and no explicit merged flag)."""
+    if "merged" not in facts and not facts.get("status"):
+        return UNKNOWN
+    return 1.0 if _truthy_merged(facts) else 0.0
+def _score_correctness(facts):
+    """CI verdict: green → 1.0, red/failing → 0.0, otherwise unknown."""
+    ci = facts.get("ci")
+    if ci is None or str(ci).strip() == "":
+        return UNKNOWN
+    ci = str(ci).strip().lower()
+    if ci in ("green", "pass", "passing", "success"):
+        return 1.0
+    if ci in ("red", "fail", "failing", "failure"):
+        return 0.0
+    return UNKNOWN
+def _score_scope_fidelity(facts):
+    """Did the cycle complete the story it was routed to?
+    idle / no story picked → 0.0. A story routed but ending without a built
+    artefact (drifted / picked-already-Done) → 0.0. Routed and present in
+    built[] → 1.0.
+    """
+    status = str(facts.get("status", "")).strip().lower()
+    if status == "idle" or not facts.get("routed_story"):
+        return 0.0
+    built = facts.get("built") or []
+    routed = facts.get("routed_story")
+    if isinstance(built, list) and routed in built:
+        return 1.0
+    # Routed but nothing built for it → drifted / no-op.
+    return 0.0
+def _score_quality(facts):
+    """Tests added/adjusted and no immediate rework.
+    tcr_count missing → unknown (no test signal). >=1 with no rework FIX →
+    1.0; >=1 but a rework FIX landed → 0.5; 0 → 0.0.
+    """
+    tcr = facts.get("tcr_count")
+    if tcr is None:
+        return UNKNOWN
+    try:
+        tcr = int(tcr)
+    except (TypeError, ValueError):
+        return UNKNOWN
+    if tcr <= 0:
+        return 0.0
+    if facts.get("rework_fix"):
+        return 0.5
+    return 1.0
+def _score_efficiency(facts):
+    """duration_sec vs est_min budget. Unknown when either is missing.
+    Within budget → 1.0. Over budget grades down linearly to a 0.2 floor at
+    3x the budget (a cycle that blows way past est is bad but not zero).
+    """
+    duration_sec = facts.get("duration_sec")
+    est_min = facts.get("est_min")
+    if duration_sec is None or est_min is None:
+        return UNKNOWN
+    try:
+        duration_min = float(duration_sec) / 60.0
+        budget = float(est_min)
+    except (TypeError, ValueError):
+        return UNKNOWN
+    if budget <= 0:
+        return UNKNOWN
+    if duration_min <= budget:
+        return 1.0
+    overrun = duration_min / budget  # >1
+    # 1x → 1.0, 3x → 0.2, clamped.
+    graded = 1.0 - (overrun - 1.0) * 0.4
+    return max(0.2, min(1.0, graded))
+def _score_cleanliness(facts):
+    """No ALERTs and no orphan worktrees/branches → 1.0, else 0.0."""
+    alerts = facts.get("alerts") or []
+    orphans = facts.get("orphans") or []
+    if alerts or orphans:
+        return 0.0
+    return 1.0
+# Dispatch table: rubric dimension name → the function that scores it from a
+# cycle's facts. score_dimensions() looks up every name listed in DIMENSIONS
+# here, so each DIMENSIONS entry needs a matching key.
+_SCORERS = {
+    "outcome": _score_outcome,
+    "correctness": _score_correctness,
+    "scope_fidelity": _score_scope_fidelity,
+    "quality": _score_quality,
+    "efficiency": _score_efficiency,
+    "cleanliness": _score_cleanliness,
+}
+def score_dimensions(facts: dict) -> dict:
+    """Return {dim: float 0..1 | UNKNOWN} for every rubric dimension."""
+    facts = facts or {}
+    return {name: _SCORERS[name](facts) for name, _ in DIMENSIONS}
+def aggregate(dims: dict) -> int:
+    """Roll the per-dimension scores up into a 1..10 cycle score.
+    Unknown dimensions are excluded and the remaining weights renormalised.
+    When every dimension is unknown, returns the neutral midpoint (5).
+    """
+    num = 0.0
+    den = 0.0
+    for name, weight in DIMENSIONS:
+        s = dims.get(name, UNKNOWN)
+        if s == UNKNOWN:
+            continue
+        num += float(s) * weight
+        den += weight
+    if den == 0:
+        return 5  # no measurable dimension → neutral
+    weighted = num / den  # 0..1
+    return int(round(1 + weighted * 9))
+def score_cycle(facts: dict) -> dict:
+    """Compute the full ``result_eval`` block for one cycle's facts."""
+    dims = score_dimensions(facts)
+    return {
+        "version": SCHEMA_VERSION,
+        "score": aggregate(dims),
+        "dims": dims,
+    }
+# ─────────────────────────────────────────────────────────────────────────────
+# US-EVAL-004: self-evolution signals — repeated low-score patterns.
+#
+# This is the pure *detection* half. Given an ordered (oldest→newest) list of
+# runs.jsonl records, it finds dimensions that have been low (0.0) for N cycles
+# in a row and turns each into a structured improvement *signal*. It does NOT
+# write the brief, touch the backlog, or dedup against history — that side-
+# effecting wiring lives in bin/roll, which dedups on each signal's stable
+# ``key`` so the same standing pattern is surfaced once, not every cycle.
+#
+# A signal is advisory only: it is meant to be surfaced in the brief's
+# improvement-signal section and to seed a *candidate* backlog draft marked
+# "📋 待人确认" — never to auto-activate a story or auto-edit code.
+# ─────────────────────────────────────────────────────────────────────────────
+# How many consecutive low cycles a dimension must show before it is a signal.
+SIGNAL_STREAK = 3
+# Per-dimension signal metadata: the candidate backlog item kind (FIX vs IDEA)
+# and a human-facing description of what the streak means. A dimension that
+# keeps measuring 0.0 means the loop is reliably failing that axis, so most map
+# to FIX; scope_fidelity (repeatedly idle / off-scope) and efficiency
+# (repeatedly over budget) are process IDEAs.
+# NOTE(review): _score_efficiency never returns less than 0.2, while
+# detect_signals only counts scores <= 0.0, so the efficiency entry cannot fire
+# for blocks produced by this rubric — confirm whether that is intended.
+_SIGNAL_META = {
+    "outcome":        ("FIX",  "cycles keep failing to merge into main"),
+    "correctness":    ("FIX",  "produced PRs keep failing CI"),
+    "scope_fidelity": ("IDEA", "cycles keep going idle or off-scope"),
+    "quality":        ("FIX",  "cycles keep landing without test activity"),
+    "efficiency":     ("IDEA", "cycles keep blowing past their est_min budget"),
+    "cleanliness":    ("FIX",  "cycles keep leaving orphans / raising ALERTs"),
+}
+def _result_eval_of(record):
+    """Pull a usable result_eval block out of a record, or None.
+    Accepts either a full runs.jsonl record ({..., "result_eval": {...}}) or a
+    bare result_eval block ({"score":.., "dims":{...}})."""
+    if not isinstance(record, dict):
+        return None
+    ev = record.get("result_eval", record)
+    if isinstance(ev, dict) and isinstance(ev.get("dims"), dict):
+        return ev
+    return None
+def detect_signals(records, streak: int = SIGNAL_STREAK):
+    """Detect repeated-low-score patterns over an ordered record list.
+    ``records`` is oldest→newest. A dimension fires a signal when its most
+    recent ``streak`` *scored* cycles all measure exactly 0.0 (low) on it —
+    "unknown" cycles are skipped (they neither confirm nor break the streak,
+    so a missing CI signal does not mask a real failing streak). Each signal
+    is a dict::
+        {
+          "key": "lowdim:<dim>",      # stable id for dedup
+          "dim": "<dim>",
+          "kind": "FIX" | "IDEA",
+          "streak": <int>,            # how many low cycles in a row
+          "summary": "<one-line human description>",
+        }
+    Returns signals in DIMENSIONS order (deterministic, locale-independent).
+    """
+    try:
+        streak = int(streak)
+    except (TypeError, ValueError):
+        streak = SIGNAL_STREAK
+    if streak < 1:
+        streak = 1
+    evals = [ev for ev in (_result_eval_of(r) for r in (records or [])) if ev]
+    signals = []
+    for name, _weight in DIMENSIONS:
+        # Walk newest→oldest, counting a leading run of known-low scores.
+        run = 0
+        for ev in reversed(evals):
+            v = (ev.get("dims") or {}).get(name, UNKNOWN)
+            if v == UNKNOWN or v is None:
+                continue  # unknown neither extends nor breaks the streak
+            try:
+                fv = float(v)
+            except (TypeError, ValueError):
+                continue
+            if fv <= 0.0:
+                run += 1
+            else:
+                break  # a known-good cycle breaks the streak
+        if run >= streak:
+            kind, why = _SIGNAL_META.get(name, ("IDEA", "repeated low score"))
+            signals.append({
+                "key": "lowdim:" + name,
+                "dim": name,
+                "kind": kind,
+                "streak": run,
+                "summary": "%s for %d cycles in a row" % (why, run),
+            })
+    return signals
+# ─────────────────────────────────────────────────────────────────────────────
+# US-AGENT-030: per-(agent × story_type) historical hit-rate aggregation.
+#
+# This is the pure read-model the adaptive in-tier nudge (lib/loop_pick_agent.py
+# nudge_within_tier) consumes. Given runs.jsonl records, it computes — for every
+# observed (agent, story_type) pair — the share of *scored* cycles that landed a
+# "hit" (a high result_eval.score), plus the sample size that share rests on.
+#
+# Crucially distinct from the US-AGENT-022-retired soft preference: that ranked
+# agents by an opaque, unbounded, implicitly-decaying history with no audit
+# trail. This is a flat, deterministic, locale-independent count over the
+# records handed in — same records in → same numbers out, every time. The nudge
+# layer adds the sample floor and the on/off switch; this function only counts.
+#
+# A "hit" is a cycle whose result_eval.score is at or above HIT_SCORE_MIN. Using
+# the rolled-up 1..10 cycle score (not a single dimension) keeps the signal the
+# same one the dashboard already trends, so the audit story is one number.
+# ─────────────────────────────────────────────────────────────────────────────
+# A cycle counts as a "hit" for its (agent, story_type) when its rolled-up
+# result_eval.score is at least this. 8/10 = "clearly good cycle". Centralised
+# constant; intentionally not a user knob (keeps the nudge's input deterministic
+# and explainable). agent_story_hit_rates compares with >=, so a score of
+# exactly 8 is a hit.
+HIT_SCORE_MIN = 8
+def agent_story_hit_rates(records):
+    """Aggregate per-(agent, story_type) hit-rate + sample size from records.
+    ``records`` is an iterable of runs.jsonl record dicts (order irrelevant —
+    the result is a flat count, so it is deterministic regardless of input
+    order). A record contributes to a pair only when it carries a non-empty
+    ``agent`` and ``story_type`` and a usable ``result_eval.score`` (records
+    without a score are simply not counted — never treated as a 0 hit).
+    Returns a dict keyed by ``"<agent>\\x1f<story_type>"`` (unit-separator so
+    the key round-trips through JSON and shell without ambiguity) →
+    ``{"agent":.., "story_type":.., "hit_rate": float 0..1, "sample_n": int}``.
+    A hit is ``result_eval.score >= HIT_SCORE_MIN``.
+    """
+    # pair-key → [hits, sample_n]
+    tally = {}
+    for r in (records or []):
+        if not isinstance(r, dict):
+            continue
+        agent = r.get("agent")
+        stype = r.get("story_type")
+        if not agent or not stype:
+            continue
+        ev = r.get("result_eval")
+        if not isinstance(ev, dict):
+            continue
+        score = ev.get("score")
+        if not isinstance(score, (int, float)):
+            continue
+        key = "%s\x1f%s" % (agent, stype)
+        slot = tally.get(key)
+        if slot is None:
+            slot = [0, 0]
+            tally[key] = slot
+        slot[1] += 1
+        if float(score) >= HIT_SCORE_MIN:
+            slot[0] += 1
+    out = {}
+    for key in sorted(tally):
+        hits, n = tally[key]
+        agent, stype = key.split("\x1f", 1)
+        out[key] = {
+            "agent": agent,
+            "story_type": stype,
+            "hit_rate": (hits / n) if n else 0.0,
+            "sample_n": n,
+        }
+    return out
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Score a loop cycle result.")
+    parser.add_argument("--facts", default=None,
+                        help="cycle facts as a JSON object; reads stdin if omitted")
+    parser.add_argument("--signals", action="store_true",
+                        help="read a JSON array of runs records from --facts/stdin "
+                             "and emit detected self-evolution signals")
+    parser.add_argument("--streak", type=int, default=SIGNAL_STREAK,
+                        help="consecutive low cycles required to fire a signal")
+    parser.add_argument("--hit-rates", action="store_true",
+                        help="read a JSON array of runs records from --facts/stdin "
+                             "and emit per-(agent × story_type) hit-rate + sample_n "
+                             "(US-AGENT-030 adaptive-nudge read model)")
+    args = parser.parse_args()
+    raw = args.facts if args.facts is not None else sys.stdin.read()
+    if args.hit_rates:
+        try:
+            records = json.loads(raw) if raw.strip() else []
+        except (ValueError, AttributeError) as exc:
+            print(f"loop_result_eval: bad records JSON: {exc}", file=sys.stderr)
+            return 1
+        if not isinstance(records, list):
+            print("loop_result_eval: --hit-rates expects a JSON array", file=sys.stderr)
+            return 1
+        print(json.dumps(agent_story_hit_rates(records), sort_keys=True))
+        return 0
+    if args.signals:
+        try:
+            records = json.loads(raw) if raw.strip() else []
+        except (ValueError, AttributeError) as exc:
+            print(f"loop_result_eval: bad records JSON: {exc}", file=sys.stderr)
+            return 1
+        if not isinstance(records, list):
+            print("loop_result_eval: --signals expects a JSON array", file=sys.stderr)
+            return 1
+        print(json.dumps(detect_signals(records, args.streak), sort_keys=True))
+        return 0
+    try:
+        facts = json.loads(raw) if raw.strip() else {}
+    except (ValueError, AttributeError) as exc:
+        print(f"loop_result_eval: bad facts JSON: {exc}", file=sys.stderr)
+        return 1
+    if not isinstance(facts, dict):
+        print("loop_result_eval: facts must be a JSON object", file=sys.stderr)
+        return 1
+    print(json.dumps(score_cycle(facts), sort_keys=True))
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/lib/loop_unstick.py ADDED Viewed

@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""FIX-112: revert 🔨 In Progress stories whose latest cycle definitively
+failed and has been quiet for a while. Default safe gate is conservative:
+- Story row is currently 🔨 In Progress in backlog
+- Most recent `pick_todo <story_id>` event in events-<slug>.ndjson lives in
+  a cycle whose `cycle_end` outcome is one of: failed | aborted | blocked
+- That cycle_end timestamp is at least N hours ago (default 4)
+Stories that match are flipped back to 📋 Todo and an ALERT note is appended
+to the per-project ALERT file. Stories still actively running, or claimed
+by a human / agent for legitimate work (no failed cycle_end), are left alone.
+Usage:
+  python3 lib/loop_unstick.py            # apply (writes backlog + ALERT)
+  python3 lib/loop_unstick.py --dry-run  # report what would change, write nothing
+  python3 lib/loop_unstick.py --ttl-hours 8
+Returns 0 always (idempotent). Prints one line per reverted story.
+"""
+from __future__ import annotations
+import argparse, json, os, re, sys, time
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+# Directory holding this file (symlinks resolved), pushed to the front of
+# sys.path when it is not already listed.
+# NOTE(review): presumably so sibling lib/ modules can be imported — confirm.
+_LIB_DIR = os.path.dirname(os.path.realpath(__file__))
+if _LIB_DIR not in sys.path:
+    sys.path.insert(0, _LIB_DIR)
+# FIX-108-compatible: accept multi-segment story IDs (US-VIEW-011, US-I18N-001)
+# and alphanumeric segments (K8S, D2). Each segment must start with an
+# uppercase letter and the ID must end in "-<digits>". Leading whitespace and
+# one optional "[" before the ID are tolerated; group 1 is the ID itself.
+ID_RE   = re.compile(r"^\s*\[?([A-Z][A-Z0-9]*(?:-[A-Z][A-Z0-9]*)*-\d+)")
+# The backtick character (code point 96).
+TICK    = chr(96)
+def _shared_root() -> Path:
+    # bin/roll uses _SHARED_ROOT, lib/roll-home.py uses ROLL_SHARED_ROOT.
+    # Honor both so tests that sandbox either name work transparently.
+    root = os.environ.get("ROLL_SHARED_ROOT") or os.environ.get("_SHARED_ROOT")
+    return Path(root or os.path.expanduser("~/.shared/roll"))
+def _project_slug() -> str:
+    try:
+        import subprocess, hashlib
+        path = os.path.realpath(os.getcwd())
+        common = subprocess.check_output(
+            ["git", "-C", path, "rev-parse", "--git-common-dir"],
+            stderr=subprocess.DEVNULL, text=True,
+        ).strip()
+        if common.endswith("/.git"):
+            path = common[:-5]
+    except Exception:
+        path = os.path.realpath(os.getcwd())
+    import hashlib
+    base = re.sub(r"[^A-Za-z0-9]+", "-", os.path.basename(path)).strip("-")
+    h = hashlib.md5(path.encode()).hexdigest()[:6]
+    return f"{base}-{h}"
+def _read_events(slug: str) -> list:
+    path = _shared_root() / "loop" / f"events-{slug}.ndjson"
+    out = []
+    if not path.exists():
+        return out
+    with path.open(errors="ignore") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                ev = json.loads(line)
+                ts = ev.get("ts", "")
+                ev["_ts"] = datetime.fromisoformat(ts.replace("Z", "+00:00")) if ts else None
+                out.append(ev)
+            except Exception:
+                continue
+    return out
+def _scan_in_progress(backlog: Path) -> list:
+    """Return list of (line_index, story_id, raw_line) for rows that are 🔨 In Progress."""
+    if not backlog.exists():
+        return []
+    rows = []
+    for i, line in enumerate(backlog.open(errors="ignore")):
+        if "| 🔨 In Progress |" not in line:
+            continue
+        if not line.startswith("|"):
+            continue
+        parts = [p.strip() for p in line.split("|")]
+        if len(parts) < 4:
+            continue
+        m = ID_RE.match(parts[1])
+        if not m:
+            continue
+        rows.append((i, m.group(1), line.rstrip("\n")))
+    return rows
+def _cycle_end_for_pick(events: list, story_id: str):
+    """Return (cycle_end_ts, outcome) of the latest cycle that picked
+    story_id, or None if no such cycle / cycle still running."""
+    # Walk events back to front looking for the latest pick_todo matching story_id
+    latest_pick = None
+    for ev in reversed(events):
+        if ev.get("stage") == "pick_todo" and ev.get("detail") == story_id:
+            latest_pick = ev
+            break
+    if not latest_pick:
+        return None
+    label = latest_pick.get("label", "")
+    # Look forward (from the pick) for cycle_end with the same label
+    pick_idx = events.index(latest_pick)
+    for ev in events[pick_idx + 1:]:
+        if ev.get("stage") == "cycle_end" and ev.get("label", "").endswith(label):
+            return ev.get("_ts"), ev.get("outcome", "")
+    return None
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--dry-run", action="store_true")
+    ap.add_argument("--ttl-hours", type=float, default=4.0,
+                    help="Minimum hours since failed cycle_end before reverting (default 4)")
+    ap.add_argument("--backlog", default=".roll/backlog.md")
+    args = ap.parse_args()
+    backlog = Path(args.backlog)
+    if not backlog.exists():
+        print(f"backlog not found: {backlog}", file=sys.stderr)
+        return 0
+    slug = _project_slug()
+    events = _read_events(slug)
+    in_progress = _scan_in_progress(backlog)
+    if not in_progress:
+        return 0
+    now = datetime.now(timezone.utc)
+    cutoff = now - timedelta(hours=args.ttl_hours)
+    candidates_to_revert = []
+    failed_outcomes = {"failed", "aborted", "blocked"}
+    for line_idx, sid, raw in in_progress:
+        result = _cycle_end_for_pick(events, sid)
+        if not result:
+            continue  # still running OR no failed cycle yet — leave alone
+        end_ts, outcome = result
+        if outcome not in failed_outcomes:
+            continue
+        if not end_ts or end_ts > cutoff:
+            continue  # too recent
+        age_hours = (now - end_ts).total_seconds() / 3600
+        candidates_to_revert.append((line_idx, sid, raw, outcome, age_hours))
+    if not candidates_to_revert:
+        return 0
+    if args.dry_run:
+        for line_idx, sid, raw, outcome, age in candidates_to_revert:
+            print(f"would-revert {sid} (cycle ended {outcome} {age:.1f}h ago)")
+        return 0
+    # Apply: read backlog, flip status, write back.
+    lines = backlog.read_text(errors="ignore").splitlines(keepends=True)
+    for line_idx, sid, raw, outcome, age in candidates_to_revert:
+        lines[line_idx] = lines[line_idx].replace("| 🔨 In Progress |", "| 📋 Todo |")
+    backlog.write_text("".join(lines))
+    # Append ALERT
+    alert_file = _shared_root() / "loop" / f"ALERT-{slug}.md"
+    alert_file.parent.mkdir(parents=True, exist_ok=True)
+    with alert_file.open("a") as f:
+        for line_idx, sid, raw, outcome, age in candidates_to_revert:
+            ts = now.strftime("%Y-%m-%dT%H:%M:%SZ")
+            f.write(f"[{ts}] unstick: reverted {sid} (cycle ended {outcome} {age:.1f}h ago, > {args.ttl_hours}h TTL)\n")
+    for line_idx, sid, raw, outcome, age in candidates_to_revert:
+        print(f"reverted {sid} (cycle ended {outcome} {age:.1f}h ago)")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())