@event4u/agent-config 4.9.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/implement-ticket.md +5 -4
- package/.agent-src/rules/language-and-tone.md +4 -10
- package/.agent-src/skills/command-routing/SKILL.md +5 -4
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +73 -0
- package/CONTRIBUTING.md +19 -0
- package/README.md +11 -0
- package/dist/cli/registry.js +0 -2
- package/dist/cli/registry.js.map +1 -1
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +5 -5
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +1 -1
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +2 -2
- package/dist/discovery/trust-report.md +1 -1
- package/dist/discovery/workspaces.json +2 -2
- package/dist/mcp/registry-manifest.json +2 -2
- package/dist/router.json +1 -1671
- package/docs/benchmark.md +20 -8
- package/docs/benchmarks.md +11 -0
- package/docs/contracts/benchmark-corpus-spec.md +31 -3
- package/docs/contracts/command-surface-tiers.md +1 -1
- package/docs/contracts/hook-architecture-v1.md +33 -0
- package/docs/contracts/migrate-command.md +197 -0
- package/docs/contracts/settings-api.md +2 -1
- package/docs/contracts/value-dashboard-spec.md +374 -0
- package/docs/contracts/value-report-schema.md +150 -0
- package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
- package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
- package/docs/migration/v1-to-v2.md +40 -27
- package/docs/value.md +84 -0
- package/package.json +8 -8
- package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/scripts/_cli/cmd_migrate.py +264 -102
- package/scripts/_cli/cmd_settings_migrate.py +2 -1
- package/scripts/_dispatch.bash +147 -49
- package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/scripts/_lib/install_regenerator.py +129 -0
- package/scripts/_lib/value_ladder.py +599 -0
- package/scripts/_lib/value_report.py +441 -0
- package/scripts/bench_rtk_savings.py +320 -0
- package/scripts/compile_router.py +19 -5
- package/scripts/expected_perms.json +1 -1
- package/scripts/first_run_gate_hook.py +178 -0
- package/scripts/hook_manifest.yaml +16 -7
- package/scripts/hooks/dispatch_hook.py +27 -0
- package/scripts/hooks/dispatch_issues.py +136 -0
- package/scripts/hooks_doctor.py +40 -1
- package/scripts/install.py +25 -21
- package/scripts/lint_agents_layout.py +5 -4
- package/scripts/lint_bench_corpus.py +86 -4
- package/scripts/lint_global_paths.py +4 -3
- package/scripts/lint_marketplace_install_completeness.py +188 -0
- package/scripts/lint_value_dashboard.py +218 -0
- package/scripts/render_benchmark_md.py +6 -2
- package/scripts/render_value_md.py +355 -0
- package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
- package/scripts/roadmap_progress_hook.py +23 -0
- package/scripts/router_telemetry.py +470 -0
- package/scripts/validate_frontmatter.py +23 -9
- package/scripts/_cli/cmd_migrate_to_global.py +0 -415
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Reproduce the silent marketplace-install gap that this roadmap fixes.
|
|
3
|
+
#
|
|
4
|
+
# Phase 0 of `road-to-hooks-actually-fire-in-consumers`.
|
|
5
|
+
#
|
|
6
|
+
# Simulates a consumer project that ran `/plugin install
|
|
7
|
+
# agent-config@event4u-agent-config` but NEVER ran `agent-config init`.
|
|
8
|
+
# The plugin's hooks.json fires under Claude's lifecycle, but every
|
|
9
|
+
# resolved command (`"$CLAUDE_PROJECT_DIR"/agent-config dispatch:hook
|
|
10
|
+
# …`) errors out silently because the prerequisites do not exist.
|
|
11
|
+
#
|
|
12
|
+
# Expected output (the bug):
|
|
13
|
+
# - Dispatcher exits 0 (never-block contract)
|
|
14
|
+
# - NO `agents/roadmaps-progress.md` written
|
|
15
|
+
# - NO state file under `agents/runtime/state/`
|
|
16
|
+
# - Hook tried, hook failed, no trace left behind
|
|
17
|
+
#
|
|
18
|
+
# Once Phases 1-4 land, the same script should produce a
|
|
19
|
+
# `dispatch-issues.jsonl` entry naming the missing artefact.
|
|
20
|
+
|
|
21
|
+
set -euo pipefail
|
|
22
|
+
|
|
23
|
+
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
24
|
+
TMPDIR_BASE="${TMPDIR:-/tmp}"
|
|
25
|
+
CONSUMER_ROOT="$(mktemp -d "$TMPDIR_BASE/marketplace-install-gap-XXXXXX")"
|
|
26
|
+
EVIDENCE_FILE="$REPO_ROOT/agents/evidence/analysis/hooks-marketplace-gap-2026-05-29.md"
|
|
27
|
+
|
|
28
|
+
# Remove every temporary artefact this run created. Invoked from the EXIT
# trap, so it must tolerate variables that were never assigned (the script
# runs under `set -u`) and must never fail itself.
cleanup() {
  rm -rf "$CONSUMER_ROOT" 2>/dev/null || true
  # DISPATCH_STDERR is only created later in the script; if `set -e` aborts
  # the run after that point the capture file would otherwise be leaked.
  if [ -n "${DISPATCH_STDERR:-}" ]; then
    rm -f "$DISPATCH_STDERR" 2>/dev/null || true
  fi
}
|
|
31
|
+
trap cleanup EXIT
|
|
32
|
+
|
|
33
|
+
echo "==> Setting up synthetic marketplace-install consumer at: $CONSUMER_ROOT"
|
|
34
|
+
|
|
35
|
+
# 1. Write only the marketplace-install end-state (.claude/settings.json
|
|
36
|
+
# with the plugin enabled). Nothing else — no symlink, no regenerator,
|
|
37
|
+
# no .augment/, no agents/runtime/state/.
|
|
38
|
+
mkdir -p "$CONSUMER_ROOT/.claude"
|
|
39
|
+
cat > "$CONSUMER_ROOT/.claude/settings.json" <<'JSON'
|
|
40
|
+
{
|
|
41
|
+
"enabledPlugins": {
|
|
42
|
+
"agent-config@event4u-agent-config": true
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
JSON
|
|
46
|
+
|
|
47
|
+
# 2. Fake roadmap so a hook on path_prefix `agents/roadmaps/` has a
|
|
48
|
+
# target to react to. (For Phase 0 we don't actually run a hook
|
|
49
|
+
# that depends on this — but it documents the file layout.)
|
|
50
|
+
mkdir -p "$CONSUMER_ROOT/agents/roadmaps"
|
|
51
|
+
cat > "$CONSUMER_ROOT/agents/roadmaps/road-to-fake.md" <<'MD'
|
|
52
|
+
---
|
|
53
|
+
complexity: lightweight
|
|
54
|
+
---
|
|
55
|
+
# Roadmap: fake
|
|
56
|
+
|
|
57
|
+
## Phase 1
|
|
58
|
+
|
|
59
|
+
- [ ] **Step 1:** placeholder
|
|
60
|
+
MD
|
|
61
|
+
|
|
62
|
+
# 3. Capture the missing-artefact inventory BEFORE we try anything.
|
|
63
|
+
echo
|
|
64
|
+
echo "==> Missing-artefact inventory:"
|
|
65
|
+
INVENTORY=""
|
|
66
|
+
for artefact in \
|
|
67
|
+
".claude/settings.json (plugin enabled)" \
|
|
68
|
+
"agent-config symlink" \
|
|
69
|
+
".augment/scripts/update_roadmap_progress.py" \
|
|
70
|
+
".agent-src/scripts/update_roadmap_progress.py" \
|
|
71
|
+
".agent-src.uncondensed/scripts/update_roadmap_progress.py" \
|
|
72
|
+
".git/hooks/pre-commit" \
|
|
73
|
+
"agents/runtime/state/"
|
|
74
|
+
do
|
|
75
|
+
# Strip parenthetical for the existence check.
|
|
76
|
+
path_only="${artefact% (*}"
|
|
77
|
+
if [ -e "$CONSUMER_ROOT/$path_only" ] || [ -L "$CONSUMER_ROOT/$path_only" ]; then
|
|
78
|
+
status="present"
|
|
79
|
+
else
|
|
80
|
+
status="MISSING"
|
|
81
|
+
fi
|
|
82
|
+
line=" $status: $artefact"
|
|
83
|
+
INVENTORY="$INVENTORY$line"$'\n'
|
|
84
|
+
echo "$line"
|
|
85
|
+
done
|
|
86
|
+
|
|
87
|
+
# 4. Emit a synthetic PostToolUse JSON envelope on stdin to the
|
|
88
|
+
# dispatch hook, simulating what Claude Code would send when an
|
|
89
|
+
# agent writes to the fake roadmap.
|
|
90
|
+
echo
|
|
91
|
+
echo "==> Invoking dispatch hook with synthetic PostToolUse envelope..."
|
|
92
|
+
ENVELOPE=$(cat <<JSON
|
|
93
|
+
{
|
|
94
|
+
"session_id": "repro-marketplace-gap",
|
|
95
|
+
"transcript_path": "/dev/null",
|
|
96
|
+
"cwd": "$CONSUMER_ROOT",
|
|
97
|
+
"hook_event_name": "PostToolUse",
|
|
98
|
+
"tool_name": "Write",
|
|
99
|
+
"tool_input": {
|
|
100
|
+
"file_path": "$CONSUMER_ROOT/agents/roadmaps/road-to-fake.md"
|
|
101
|
+
},
|
|
102
|
+
"tool_response": {}
|
|
103
|
+
}
|
|
104
|
+
JSON
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# Capture the dispatcher's stderr in a scratch file so it can be shown
# (truncated) after the exit code has been recorded.
DISPATCH_STDERR="$(mktemp "$TMPDIR_BASE/dispatch-stderr-XXXXXX")"
DISPATCH_EXIT=0
# Run inside a subshell so the `cd` does not leak into the rest of the
# script; `|| DISPATCH_EXIT=$?` keeps `set -e` from aborting on a non-zero
# dispatcher exit, which is itself a result worth reporting.
echo "$ENVELOPE" | (
  cd "$CONSUMER_ROOT"
  CLAUDE_PROJECT_DIR="$CONSUMER_ROOT" python3 "$REPO_ROOT/scripts/hooks/dispatch_hook.py" \
    --platform claude --event post_tool_use --native-event PostToolUse \
    2>"$DISPATCH_STDERR"
) || DISPATCH_EXIT=$?

echo " dispatcher exit code: $DISPATCH_EXIT"
echo " dispatcher stderr:"
# Truncate first, then indent: with `sed … | head`, head exiting early can
# kill sed with SIGPIPE, and under `set -o pipefail` + `set -e` that would
# abort the repro whenever the dispatcher is noisy. Output is identical.
head -n 20 "$DISPATCH_STDERR" | sed 's/^/ /'
|
|
119
|
+
|
|
120
|
+
# 5. Confirm the silent-no-op shape.
|
|
121
|
+
echo
|
|
122
|
+
echo "==> Verifying the silent no-op:"
|
|
123
|
+
|
|
124
|
+
DASHBOARD_EXISTS="no"
|
|
125
|
+
if [ -e "$CONSUMER_ROOT/agents/roadmaps-progress.md" ]; then
|
|
126
|
+
DASHBOARD_EXISTS="yes"
|
|
127
|
+
fi
|
|
128
|
+
echo " agents/roadmaps-progress.md written: $DASHBOARD_EXISTS (expected: no)"
|
|
129
|
+
|
|
130
|
+
STATE_FILES=0
|
|
131
|
+
if [ -d "$CONSUMER_ROOT/agents/runtime/state" ]; then
|
|
132
|
+
STATE_FILES=$(find "$CONSUMER_ROOT/agents/runtime/state" -type f 2>/dev/null | wc -l | tr -d ' ')
|
|
133
|
+
fi
|
|
134
|
+
echo " state files under agents/runtime/state/: $STATE_FILES (expected: 0)"
|
|
135
|
+
|
|
136
|
+
# Phase-1-aware check: after that phase lands, dispatch-issues.jsonl
|
|
137
|
+
# should exist with at least one entry. Today (pre-Phase-1) it does
|
|
138
|
+
# not. The script reports both shapes.
|
|
139
|
+
DISPATCH_ISSUES="no"
|
|
140
|
+
if [ -e "$CONSUMER_ROOT/agents/runtime/state/dispatch-issues.jsonl" ]; then
|
|
141
|
+
DISPATCH_ISSUES="yes"
|
|
142
|
+
fi
|
|
143
|
+
echo " agents/runtime/state/dispatch-issues.jsonl: $DISPATCH_ISSUES (pre-Phase-1: no; post-Phase-1: yes)"
|
|
144
|
+
|
|
145
|
+
rm -f "$DISPATCH_STDERR"
|
|
146
|
+
|
|
147
|
+
# 6. Append evidence.
|
|
148
|
+
mkdir -p "$(dirname "$EVIDENCE_FILE")"
|
|
149
|
+
{
|
|
150
|
+
printf '## %s — repro run\n\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
|
151
|
+
printf 'Tmp consumer root: `%s`\n\n' "$CONSUMER_ROOT"
|
|
152
|
+
printf 'Inventory:\n\n'
|
|
153
|
+
printf '```\n%s```\n\n' "$INVENTORY"
|
|
154
|
+
printf 'Dispatcher exit: `%s`\n' "$DISPATCH_EXIT"
|
|
155
|
+
printf 'Dashboard written: `%s`\n' "$DASHBOARD_EXISTS"
|
|
156
|
+
printf 'State files: `%s`\n' "$STATE_FILES"
|
|
157
|
+
printf 'dispatch-issues.jsonl: `%s`\n\n' "$DISPATCH_ISSUES"
|
|
158
|
+
} >> "$EVIDENCE_FILE"
|
|
159
|
+
|
|
160
|
+
echo
|
|
161
|
+
# Quote the expansion inside the pattern so REPO_ROOT is stripped literally
# rather than being interpreted as a glob (paths containing * ? [ ]).
echo "==> Evidence appended to: ${EVIDENCE_FILE#"$REPO_ROOT"/}"
|
|
@@ -132,6 +132,29 @@ def run(stdin_text: str, *, consumer_root: Path, verbose: bool = False) -> int:
|
|
|
132
132
|
|
|
133
133
|
script = _resolve_regenerator(consumer_root)
|
|
134
134
|
if script is None:
|
|
135
|
+
# Phase 1 of road-to-hooks-actually-fire-in-consumers: log
|
|
136
|
+
# dispatch issue directly (this hook runs as a subprocess from
|
|
137
|
+
# the universal dispatcher; routing through the dispatcher
|
|
138
|
+
# would add latency for no benefit).
|
|
139
|
+
try:
|
|
140
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent / "hooks"))
|
|
141
|
+
from dispatch_issues import log_dispatch_issue # noqa: PLC0415
|
|
142
|
+
log_dispatch_issue(
|
|
143
|
+
workspace_root=consumer_root,
|
|
144
|
+
hook="roadmap-progress",
|
|
145
|
+
issue="prerequisite_missing",
|
|
146
|
+
detail=(
|
|
147
|
+
"update_roadmap_progress.py not found at any of: "
|
|
148
|
+
".augment/scripts/, .agent-src/scripts/, "
|
|
149
|
+
".agent-src.uncondensed/scripts/"
|
|
150
|
+
),
|
|
151
|
+
resolution=(
|
|
152
|
+
"./agent-config hooks:install --regen "
|
|
153
|
+
"(or ./agent-config init)"
|
|
154
|
+
),
|
|
155
|
+
)
|
|
156
|
+
except (ImportError, OSError):
|
|
157
|
+
pass # observability never breaks the hook
|
|
135
158
|
if verbose:
|
|
136
159
|
print("roadmap-progress-hook: regenerator not found, skipping",
|
|
137
160
|
file=sys.stderr)
|
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Local replay of the router against a corpus — pure, no API spend.
|
|
3
|
+
|
|
4
|
+
Phase 3 of `agents/roadmaps/road-to-value-dashboard-netto-cuts.md`.
|
|
5
|
+
|
|
6
|
+
For each prompt in a corpus, applies the same trigger-match logic
|
|
7
|
+
agent hosts would apply at runtime against `dist/router.json`:
|
|
8
|
+
|
|
9
|
+
- kernel rules: always active (no triggers, always-on by definition).
|
|
10
|
+
- tier_1 + tier_2 rules: active iff any trigger matches the prompt
|
|
11
|
+
text (plus optional open-files / command context).
|
|
12
|
+
|
|
13
|
+
Trigger semantics implemented:
|
|
14
|
+
|
|
15
|
+
| Type | Match rule |
|
|
16
|
+
|----------------|------------------------------------------------------------------|
|
|
17
|
+
| `keyword` | case-insensitive substring inside the prompt text |
|
|
18
|
+
| `phrase` | case-insensitive substring (multi-word) inside the prompt text |
|
|
19
|
+
| `command` | case-sensitive prefix on `command:` field (optional context) |
|
|
20
|
+
| `intent` | informational only — never auto-matches; counted separately |
|
|
21
|
+
| `path_prefix` | prefix match against any path in `open_files` (optional context) |
|
|
22
|
+
| `file_pattern` | fnmatch against any path in `open_files` (optional context) |
|
|
23
|
+
|
|
24
|
+
Rules a task expects to fire only via `intent` (or a router coverage
|
|
25
|
+
gap) the static replay cannot see are declared in the corpus field
|
|
26
|
+
`replay_opaque_triggers`. They surface in `intended_vs_observed_match`
|
|
27
|
+
under `replay_opaque` and are excluded from both `missed_intended`
|
|
28
|
+
(no false drift) and `unintended_activations`.
|
|
29
|
+
|
|
30
|
+
Reports go to `internal/bench/reports/router-telemetry/<UTC>.json`
|
|
31
|
+
with three blocks:
|
|
32
|
+
|
|
33
|
+
- `per_trigger_hits` — count of times each trigger fired
|
|
34
|
+
- `per_rule_activations` — count of times each rule activated
|
|
35
|
+
- `panel_b_untouchable_rules` — tier-1 rules that activated on ≥ 1
|
|
36
|
+
Track B task; hard floor for Phase 5
|
|
37
|
+
|
|
38
|
+
Sample size is capped per corpus (`--sample-cap`, default 200).
|
|
39
|
+
Larger corpora are replayed deterministically over the first N
|
|
40
|
+
sorted-by-id prompts.
|
|
41
|
+
|
|
42
|
+
Honours `--quiet` per the script-output convention.
|
|
43
|
+
"""
|
|
44
|
+
from __future__ import annotations
|
|
45
|
+
|
|
46
|
+
import argparse
|
|
47
|
+
import fnmatch
|
|
48
|
+
import json
|
|
49
|
+
import sys
|
|
50
|
+
from datetime import datetime, timezone
|
|
51
|
+
from pathlib import Path
|
|
52
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
53
|
+
|
|
54
|
+
try:
|
|
55
|
+
import yaml
|
|
56
|
+
except ImportError:
|
|
57
|
+
yaml = None # type: ignore[assignment]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
61
|
+
ROUTER_JSON = REPO_ROOT / "dist" / "router.json"
|
|
62
|
+
DEFAULT_OUT_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "router-telemetry"
|
|
63
|
+
DEFAULT_SAMPLE_CAP = 200
|
|
64
|
+
|
|
65
|
+
# Track B corpus = the Panel B evidence basis; rules that fire on its
|
|
66
|
+
# tasks are the attribution map and become the untouchable set.
|
|
67
|
+
TRACK_B_CORPUS_REL = "internal/bench/corpora/ab-trackb.yaml"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _log(msg: str, quiet: bool, *, err: bool = False) -> None:
|
|
71
|
+
if err:
|
|
72
|
+
print(msg, file=sys.stderr)
|
|
73
|
+
elif not quiet:
|
|
74
|
+
print(msg)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _utc_iso() -> str:
|
|
78
|
+
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Trigger matching ────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def trigger_matches(
    trigger: Dict[str, Any],
    prompt: str,
    open_files: Optional[Iterable[str]] = None,
    command: Optional[str] = None,
) -> bool:
    """Decide whether a single router trigger fires for the given context.

    The trigger's type is determined by which key it carries; when several
    are present the first of keyword, phrase, command, path_prefix,
    file_pattern wins. ``intent`` triggers and unknown shapes never match.
    """
    haystack = prompt.lower()

    # Text triggers: case-insensitive substring of the prompt.
    for text_kind in ("keyword", "phrase"):
        if text_kind in trigger:
            return str(trigger[text_kind]).lower() in haystack

    # Command trigger: case-sensitive prefix; needs a command in context.
    if "command" in trigger:
        return bool(command) and command.startswith(str(trigger["command"]))

    # Path triggers: need open files in context.
    if "path_prefix" in trigger:
        if not open_files:
            return False
        prefix = str(trigger["path_prefix"])
        for candidate in open_files:
            if str(candidate).startswith(prefix):
                return True
        return False

    if "file_pattern" in trigger:
        if not open_files:
            return False
        pattern = str(trigger["file_pattern"])
        for candidate in open_files:
            if fnmatch.fnmatch(str(candidate), pattern):
                return True
        return False

    # `intent` triggers are informational only; unknown trigger shapes are
    # treated the same way.
    return False
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def match_prompt(
    router: Dict[str, Any],
    prompt: str,
    profile: str = "full",
    open_files: Optional[Iterable[str]] = None,
    command: Optional[str] = None,
) -> Dict[str, Any]:
    """Return the matched triggers and activated rules for one prompt.

    ``tier_1`` rules are always evaluated; ``tier_2`` rules only when
    ``profile == 'full'``. Every id listed under ``kernel`` is reported as
    activated unconditionally, after the tiered rules.

    Returns a dict with:
      - ``matched_triggers``: one ``{tier, rule, trigger}`` per trigger hit
      - ``activated_rules``: one ``{tier, rule}`` per rule with at least one hit,
        followed by the kernel entries
    """
    # The signature accepts any Iterable, but the files are scanned once per
    # path trigger. Materialise them so a one-shot iterator is not exhausted
    # by the first trigger and silently empty for every later one.
    files: Optional[List[str]] = list(open_files) if open_files is not None else None

    tiers = [("tier_1", router.get("tier_1") or [])]
    if profile == "full":
        tiers.append(("tier_2", router.get("tier_2") or []))

    matched_triggers: List[Dict[str, Any]] = []
    activated_rules: List[Dict[str, Any]] = []

    for tier_name, rules in tiers:
        for rule in rules:
            rule_id = rule.get("id")
            hits = [
                {"tier": tier_name, "rule": rule_id, "trigger": trig}
                for trig in rule.get("triggers") or []
                if trigger_matches(trig, prompt, files, command)
            ]
            if hits:
                matched_triggers.extend(hits)
                activated_rules.append({"tier": tier_name, "rule": rule_id})

    # Kernel rules carry no triggers; they are always active.
    activated_rules.extend(
        {"tier": "kernel", "rule": kid} for kid in router.get("kernel") or []
    )

    return {
        "matched_triggers": matched_triggers,
        "activated_rules": activated_rules,
    }
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ── Corpus loading ──────────────────────────────────────────────────────
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _safe_yaml_load(path: Path) -> Optional[Dict[str, Any]]:
    """Load *path* as a YAML mapping, or return ``None`` when that is not possible.

    Returns ``None`` when PyYAML is unavailable, the file does not exist, the
    document fails to parse, or its root is not a mapping. An empty document
    yields ``{}``.
    """
    if yaml is None or not path.exists():
        return None
    try:
        # Read as UTF-8 explicitly so results do not depend on the platform's
        # locale encoding.
        loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError:
        return None
    if not loaded:
        return {}
    # A list/scalar root would break callers that use `.get`; treat it like
    # an unreadable corpus instead of crashing later.
    return loaded if isinstance(loaded, dict) else None
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def load_corpus_prompts(
    corpus_path: Path, sample_cap: int
) -> List[Dict[str, Any]]:
    """Return per-prompt entries capped at *sample_cap*, sorted by id.

    Each entry: ``{id, text, intended_triggers, replay_opaque_triggers,
    open_files, command}``. Entries are read from both the ``tasks:`` and
    ``prompts:`` sections; the text comes from ``prompt`` or, failing that,
    ``text``. Entries without an id or text are dropped, as are malformed
    (non-mapping) entries and non-list sections. Optional list fields that
    are missing or not lists default to ``[]``; a missing command is ``None``.
    A negative *sample_cap* is treated as 0.
    """
    data = _safe_yaml_load(corpus_path)
    if not data:
        return []

    def _as_list(value: Any) -> List[Any]:
        # Missing, null, and wrongly-typed fields all collapse to "no items".
        return value if isinstance(value, list) else []

    out: List[Dict[str, Any]] = []
    # Track B uses `tasks:`, dev uses `prompts:`.
    for key in ("tasks", "prompts"):
        for entry in _as_list(data.get(key)):
            if not isinstance(entry, dict):
                continue  # malformed corpus row; nothing to replay
            pid = str(entry.get("id", ""))
            text = entry.get("prompt") or entry.get("text") or ""
            if not (pid and text):
                continue
            command = entry.get("command") or None
            out.append(
                {
                    "id": pid,
                    "text": str(text),
                    "intended_triggers": [
                        str(t) for t in _as_list(entry.get("intended_triggers"))
                    ],
                    "replay_opaque_triggers": [
                        str(t) for t in _as_list(entry.get("replay_opaque_triggers"))
                    ],
                    "open_files": [str(p) for p in _as_list(entry.get("open_files"))],
                    "command": str(command) if command else None,
                }
            )
    out.sort(key=lambda item: item["id"])
    # Clamp so a negative cap cannot turn the slice into "all but the last N".
    return out[: max(sample_cap, 0)]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ── Aggregation ─────────────────────────────────────────────────────────
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def aggregate_replay(
    router: Dict[str, Any],
    corpora: List[Tuple[str, Path]],
    sample_cap: int,
    profile: str,
) -> Dict[str, Any]:
    """Replay every corpus through the router and aggregate the hits.

    Args:
        router: parsed router document passed through to ``match_prompt``.
        corpora: ``(name, path)`` pairs; the name tags the corpus in the output.
        sample_cap: per-corpus prompt cap passed to ``load_corpus_prompts``.
        profile: routing profile passed to ``match_prompt``.

    Returns:
        A dict with ``per_trigger_hits``, ``per_rule_activations`` (by tier),
        ``panel_b_untouchable_rules`` (tier-1 rules seen on the ``ab-trackb``
        corpus), ``panel_b_tier2_drivers``, ``per_corpus_summary``,
        ``intended_vs_observed_match`` and ``unintended_activation_histogram``.

    Kernel activations are excluded from all counters. Counters are filled in
    router order and ``top_rules`` ties are broken by rule id, so the output
    does not depend on set iteration order (i.e. on string hash randomisation).
    """
    per_trigger_hits: Dict[str, int] = {}
    per_rule_activations: Dict[str, Dict[str, int]] = {}
    panel_b_seen_tier1: set = set()
    panel_b_seen_tier2: set = set()
    per_corpus_summary: List[Dict[str, Any]] = []
    intended_vs_observed: List[Dict[str, Any]] = []
    unintended_histogram: Dict[str, int] = {}

    for corpus_name, corpus_path in corpora:
        prompts = load_corpus_prompts(corpus_path, sample_cap)
        corpus_rule_hits: Dict[str, int] = {}
        for entry in prompts:
            pid = entry["id"]
            intended = entry["intended_triggers"]
            opaque = entry["replay_opaque_triggers"]
            result = match_prompt(
                router,
                entry["text"],
                profile=profile,
                open_files=entry["open_files"] or None,
                command=entry["command"],
            )
            for fired in result["matched_triggers"]:
                key = f"{fired['rule']}::{json.dumps(fired['trigger'], sort_keys=True)}"
                per_trigger_hits[key] = per_trigger_hits.get(key, 0) + 1

            # De-duplicate (tier, rule) pairs while keeping router order.
            # Kernel is skipped: always-on by definition, no signal.
            seen_in_prompt = dict.fromkeys(
                (act["tier"], act["rule"])
                for act in result["activated_rules"]
                if act["rule"] is not None and act["tier"] != "kernel"
            )
            activated_ids = {rid for _tier, rid in seen_in_prompt}
            for tier, rid in seen_in_prompt:
                tier_counts = per_rule_activations.setdefault(tier, {})
                tier_counts[rid] = tier_counts.get(rid, 0) + 1
                corpus_rule_hits[rid] = corpus_rule_hits.get(rid, 0) + 1
                if corpus_name == "ab-trackb":
                    if tier == "tier_1":
                        panel_b_seen_tier1.add(rid)
                    elif tier == "tier_2":
                        panel_b_seen_tier2.add(rid)

            # Council R3 honesty floor: surface intended vs observed.
            # `replay_opaque` rules fire at runtime only via `intent`
            # triggers (or router gaps) the deterministic replay cannot
            # see — they are NOT counted as `missed_intended` (that would
            # be false drift) and NOT counted as `unintended_activations`.
            if intended or opaque:
                intended_set = set(intended)
                opaque_set = set(opaque)
                unintended = sorted(activated_ids - intended_set - opaque_set)
                intended_vs_observed.append(
                    {
                        "corpus": corpus_name,
                        "task": pid,
                        "intended": sorted(intended),
                        "replay_opaque": sorted(opaque),
                        "hit": sorted(intended_set & activated_ids),
                        "missed_intended": sorted(intended_set - activated_ids),
                        "unintended_activations": unintended,
                    }
                )
                # Council R3 #3: inter-rule conflict histogram.
                for rid in unintended:
                    unintended_histogram[rid] = unintended_histogram.get(rid, 0) + 1

        per_corpus_summary.append(
            {
                "corpus": corpus_name,
                "prompts_replayed": len(prompts),
                "unique_rules_activated": len(corpus_rule_hits),
                # Highest count first; rule id breaks ties deterministically.
                "top_rules": sorted(
                    corpus_rule_hits.items(), key=lambda kv: (-kv[1], str(kv[0]))
                )[:10],
            }
        )

    return {
        "per_trigger_hits": per_trigger_hits,
        "per_rule_activations": per_rule_activations,
        "panel_b_untouchable_rules": sorted(panel_b_seen_tier1),
        "panel_b_tier2_drivers": sorted(panel_b_seen_tier2),
        "per_corpus_summary": per_corpus_summary,
        "intended_vs_observed_match": intended_vs_observed,
        "unintended_activation_histogram": sorted(
            unintended_histogram.items(), key=lambda kv: -kv[1]
        ),
    }
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
# ── Reports ─────────────────────────────────────────────────────────────
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def write_report(
    aggregate: Dict[str, Any],
    out_dir: Path,
    corpora_paths: List[Path],
    sample_cap: int,
    profile: str,
) -> Path:
    """Write the telemetry report and refresh ``latest.json``.

    Creates *out_dir* if needed, writes ``<UTC stamp>.json`` plus an identical
    ``latest.json``, and returns the path of the stamped file. The payload is
    a schema/config header followed by every key of *aggregate*.
    """

    def _display(path: Path) -> str:
        # Corpora passed via --corpus may be relative or live outside the
        # repo; relative_to() raises ValueError for those, so fall back to
        # the path as given instead of aborting the report.
        try:
            return str(path.relative_to(REPO_ROOT))
        except ValueError:
            return str(path)

    out_dir.mkdir(parents=True, exist_ok=True)
    # Read the clock once so the filename and `generated_at` always agree.
    generated_at = _utc_iso()
    stamp = generated_at.replace(":", "-")
    out_path = out_dir / f"{stamp}.json"
    latest = out_dir / "latest.json"
    payload = {
        "schema_version": 1,
        "schema_id": "router-telemetry-v1",
        "generated_at": generated_at,
        "config": {
            "router": "dist/router.json",
            "profile": profile,
            "sample_cap_per_corpus": sample_cap,
            "corpora": [_display(p) for p in corpora_paths],
        },
        **aggregate,
    }
    text = json.dumps(payload, indent=2, ensure_ascii=False) + "\n"
    # ensure_ascii=False can emit non-ASCII characters; pin the encoding so
    # the write does not fail or vary with the platform locale.
    out_path.write_text(text, encoding="utf-8")
    latest.write_text(text, encoding="utf-8")
    return out_path
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def find_never_matched_tier1(router: Dict[str, Any], activations: Dict[str, Any]) -> List[str]:
    """Return tier-1 rule ids with zero activations — dead-rule candidates.

    Args:
        router: parsed router document; ``tier_1`` may be missing or null.
        activations: per-tier activation counts; only ``tier_1`` is read.

    Returns:
        Sorted ids of tier-1 rules that do not appear in the tier-1
        activation map. Rules without an id are ignored.
    """
    activated = activations.get("tier_1") or {}
    # `or []` tolerates an explicit null tier, matching match_prompt.
    rule_ids = (rule.get("id") for rule in router.get("tier_1") or [])
    return sorted(rid for rid in rule_ids if rid and rid not in activated)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
# ── Entry point ─────────────────────────────────────────────────────────
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def run(
    corpora: List[Tuple[str, Path]],
    out_dir: Path = DEFAULT_OUT_DIR,
    sample_cap: int = DEFAULT_SAMPLE_CAP,
    profile: str = "full",
    quiet: bool = False,
) -> int:
    """Replay *corpora* against ``dist/router.json`` and write the report.

    Returns 0 on success and 1 when the router file is missing or is not
    valid JSON (the reason is logged to stderr).
    """
    if not ROUTER_JSON.exists():
        _log(f"router not found: {ROUTER_JSON}", quiet, err=True)
        return 1
    try:
        router = json.loads(ROUTER_JSON.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        _log(f"failed to parse router: {exc}", quiet, err=True)
        return 1

    _log(
        f"router_telemetry: replaying {len(corpora)} corpora · "
        f"cap={sample_cap} prompts/corpus · profile={profile}",
        quiet,
    )
    agg = aggregate_replay(router, corpora, sample_cap, profile)
    never_matched = find_never_matched_tier1(router, agg["per_rule_activations"])
    agg["never_matched_tier1"] = never_matched

    out_path = write_report(
        agg, out_dir, [p for _name, p in corpora], sample_cap, profile
    )
    # --out may point outside the repo (or be relative); relative_to() raises
    # ValueError there, which would fail the run after the report was
    # already written. Show the path as-is in that case.
    try:
        shown: Path = out_path.relative_to(REPO_ROOT)
    except ValueError:
        shown = out_path
    # NOTE(review): this summary is emitted even under --quiet because
    # quiet=False is passed explicitly — presumably deliberate; confirm
    # against the script-output convention.
    _log(
        f"router_telemetry: wrote {shown} · "
        f"panel_b_untouchable={len(agg['panel_b_untouchable_rules'])} · "
        f"never_matched_tier1={len(never_matched)}",
        quiet=False,
    )
    return 0
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments in *argv* (without the program name)."""
    # Under `python -OO` docstrings are stripped and __doc__ is None; fall
    # back to no description instead of crashing on None.splitlines().
    doc_lines = (__doc__ or "").splitlines()
    p = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    p.add_argument(
        "--corpus",
        action="append",
        default=[],
        metavar="NAME:PATH",
        help="Repeatable. NAME tags the corpus in the report; PATH is the YAML.",
    )
    p.add_argument(
        "--sample-cap",
        type=int,
        default=DEFAULT_SAMPLE_CAP,
        help="Max prompts per corpus (default %(default)s).",
    )
    p.add_argument(
        "--profile",
        choices=["balanced", "full"],
        default="full",
        help="Routing profile (default 'full' — includes tier-2 rules).",
    )
    p.add_argument(
        "--out",
        type=Path,
        default=DEFAULT_OUT_DIR,
        help="Output directory (default %(default)s).",
    )
    p.add_argument("--quiet", action="store_true")
    return p.parse_args(argv)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _default_corpora() -> List[Tuple[str, Path]]:
    """Build the default corpus manifest.

    The manifest is the three original corpora followed by every ``*.yaml``
    file under ``internal/bench/corpora/router-coverage/`` (sorted), so new
    coverage corpora are picked up without editing this script.
    """
    builtin: List[Tuple[str, Path]] = [
        ("ab-trackb", REPO_ROOT / TRACK_B_CORPUS_REL),
        ("dev", REPO_ROOT / "tests/eval/corpus-dev.yaml"),
        ("non-dev", REPO_ROOT / "tests/eval/corpus-non-dev.yaml"),
    ]
    coverage_dir = REPO_ROOT / "internal" / "bench" / "corpora" / "router-coverage"
    if not coverage_dir.is_dir():
        return builtin
    # The "router-coverage:<stem>" tag keeps discovered corpora visually
    # distinct from the original three in the report.
    discovered = [
        (f"router-coverage:{path.stem}", path)
        for path in sorted(coverage_dir.glob("*.yaml"))
    ]
    return builtin + discovered
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def main(argv: List[str] | None = None) -> int:
    """CLI entry point; returns the process exit code.

    Without ``--corpus`` the default manifest is replayed. Each ``--corpus``
    value must be ``NAME:PATH`` with both parts non-empty (split on the first
    colon); otherwise an error is printed to stderr and 1 is returned.
    """
    args = parse_args(sys.argv[1:] if argv is None else argv)
    if not args.corpus:
        corpora = _default_corpora()
    else:
        corpora = []
        for spec in args.corpus:
            name, sep, path = spec.partition(":")
            name, path = name.strip(), path.strip()
            # Reject "name", ":path" and "name:" alike — an empty NAME gives
            # an untagged corpus and an empty PATH resolves to ".".
            if not (sep and name and path):
                print(f"--corpus expects NAME:PATH, got {spec!r}", file=sys.stderr)
                return 1
            corpora.append((name, Path(path)))
    return run(
        corpora,
        out_dir=args.out,
        sample_cap=args.sample_cap,
        profile=args.profile,
        quiet=args.quiet,
    )
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
if __name__ == "__main__":
|
|
470
|
+
raise SystemExit(main())
|