@event4u/agent-config 2.13.0 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/memory/learn-low-impact.md +143 -0
- package/.agent-src/rules/ask-when-uncertain.md +10 -6
- package/.agent-src/rules/copilot-routing.md +1 -1
- package/.agent-src/rules/devcontainer-routing.md +1 -1
- package/.agent-src/rules/external-reference-deep-dive.md +1 -1
- package/.agent-src/rules/fast-path-marker-visibility.md +38 -0
- package/.agent-src/rules/low-impact-corpus-privacy-floor.md +74 -0
- package/.agent-src/rules/symfony-routing.md +1 -1
- package/.agent-src/skills/ai-council/SKILL.md +208 -8
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.claude-plugin/marketplace.json +2 -1
- package/CHANGELOG.md +299 -124
- package/README.md +6 -6
- package/config/gitignore-block.txt +6 -0
- package/docs/architecture.md +12 -12
- package/docs/archive/CHANGELOG-pre-2.11.0.md +141 -0
- package/docs/catalog.md +10 -7
- package/docs/contracts/adr-architectural-consensus-mechanism.md +4 -3
- package/docs/contracts/adr-level-6-productization.md +7 -9
- package/docs/contracts/ai-council-config.md +492 -20
- package/docs/contracts/command-clusters.md +1 -1
- package/docs/contracts/command-surface-tiers.md +3 -2
- package/docs/contracts/cost-profile-defaults.md +5 -0
- package/docs/contracts/decision-engine-gates.md +5 -0
- package/docs/contracts/decision-trace-v1.md +2 -2
- package/docs/contracts/file-ownership-matrix.json +1735 -72
- package/docs/contracts/installed-tools-lockfile.md +2 -1
- package/docs/contracts/low-impact-corpus-format.md +95 -0
- package/docs/contracts/mcp-beta-criteria.md +6 -5
- package/docs/contracts/mcp-cloud-scope.md +5 -4
- package/docs/contracts/multi-tool-projection-fidelity.md +8 -2
- package/docs/contracts/release-trunk-sync.md +4 -3
- package/docs/contracts/tier-3-contrib-plugin.md +5 -6
- package/docs/getting-started.md +2 -2
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +2 -1
- package/docs/installation.md +32 -0
- package/package.json +1 -1
- package/scripts/_cli/cmd_doctor.py +134 -0
- package/scripts/ai_council/airgap.py +165 -0
- package/scripts/ai_council/cli_hints.py +123 -0
- package/scripts/ai_council/clients.py +787 -5
- package/scripts/ai_council/compile_corpus.py +178 -0
- package/scripts/ai_council/confidence_gate.py +156 -0
- package/scripts/ai_council/config.py +1007 -11
- package/scripts/ai_council/consensus.py +41 -2
- package/scripts/ai_council/events_log.py +137 -0
- package/scripts/ai_council/learn_low_impact_preview.py +252 -0
- package/scripts/ai_council/low_impact.py +714 -0
- package/scripts/ai_council/low_impact_corpus.py +466 -0
- package/scripts/ai_council/low_impact_intake.py +163 -0
- package/scripts/ai_council/modes.py +6 -1
- package/scripts/ai_council/necessity.py +782 -0
- package/scripts/ai_council/orchestrator.py +252 -14
- package/scripts/ai_council/probation_gate.py +152 -0
- package/scripts/ai_council/redact_low_impact_entry.py +155 -0
- package/scripts/ai_council/replay.py +155 -0
- package/scripts/ai_council/session.py +19 -1
- package/scripts/ai_council/shadow_dispatch.py +235 -0
- package/scripts/ai_council/solo_dispatch.py +226 -0
- package/scripts/audit_cloud_compatibility.py +74 -0
- package/scripts/audit_command_surface.py +363 -0
- package/scripts/check_council_layout.py +11 -0
- package/scripts/council_cli.py +1046 -15
- package/scripts/install.sh +12 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Decision-replay artefact for council sessions (Phase 9).
|
|
2
|
+
|
|
3
|
+
Produces a per-session ``decision-replay.md`` that surfaces the audit
|
|
4
|
+
trail GPT review of PR #148 called out as missing: for each top
|
|
5
|
+
finding, the consensus_strength, agreeing-members with their key
|
|
6
|
+
argument, dissenting-members with their counter-argument, the
|
|
7
|
+
evidence-quality verdict, and a final synthesis verdict line.
|
|
8
|
+
|
|
9
|
+
The artefact is a pure projection of the consensus data plus the
|
|
10
|
+
per-member deliberation texts — no extra model calls. Schema is
|
|
11
|
+
documented in ``docs/contracts/ai-council-config.md`` under
|
|
12
|
+
"Decision-replay schema".
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Iterable, Sequence
|
|
19
|
+
|
|
20
|
+
from scripts.ai_council.clients import CouncilResponse
|
|
21
|
+
from scripts.ai_council.consensus import (
|
|
22
|
+
ConsensusMetadata,
|
|
23
|
+
Finding,
|
|
24
|
+
FindingScore,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class DecisionReplayInputs:
    """Bundle accepted by :func:`render_decision_replay`.

    ``include_member_arguments`` toggles the redacted-vs-full output.
    When ``False`` the artefact emits consensus + dissent COUNT only —
    no per-member arguments — for sharing without leaking which model
    framed which point.
    """

    # Findings to render; the renderer orders them by consensus_strength,
    # strongest first.
    findings: Sequence[Finding]
    # Per-scorer scores; the renderer selects them by ``finding_id`` and
    # indexes them by ``scorer``.
    scores: Sequence[FindingScore]
    # Consensus metadata looked up by finding id. A finding without an
    # entry is rendered with a zero-strength placeholder.
    metadata: dict[str, ConsensusMetadata]
    deliberation: Sequence[CouncilResponse]  # last-round per-member texts
    # Original question. When non-blank it is rendered as a blockquote,
    # whitespace-collapsed and capped at 400 characters.
    original_ask: str = ""
    # True renders per-member arguments; False renders counts only.
    include_member_arguments: bool = True
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _verdict(strength: float) -> str:
|
|
47
|
+
"""Single-word verdict band for a consensus_strength."""
|
|
48
|
+
if strength > 0.7:
|
|
49
|
+
return "Strong"
|
|
50
|
+
if strength > 0.4:
|
|
51
|
+
return "Moderate"
|
|
52
|
+
return "Weak"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _scorer_argument(
|
|
56
|
+
scorer: str,
|
|
57
|
+
member_texts: dict[str, str],
|
|
58
|
+
score: FindingScore | None,
|
|
59
|
+
) -> str:
|
|
60
|
+
"""Return the one-line key argument for ``scorer`` on a finding.
|
|
61
|
+
|
|
62
|
+
Prefers the scorer's ``reason`` field (rich, contextual) and falls
|
|
63
|
+
back to the truncated deliberation snippet so the audit trail never
|
|
64
|
+
surfaces an empty argument.
|
|
65
|
+
"""
|
|
66
|
+
if score and score.reason:
|
|
67
|
+
flat = " ".join(score.reason.split())
|
|
68
|
+
if len(flat) > 200:
|
|
69
|
+
flat = flat[:199].rstrip() + "…"
|
|
70
|
+
return flat
|
|
71
|
+
snippet = member_texts.get(scorer, "")
|
|
72
|
+
flat = " ".join(snippet.split())
|
|
73
|
+
if not flat:
|
|
74
|
+
return "no argument captured"
|
|
75
|
+
if len(flat) > 200:
|
|
76
|
+
flat = flat[:199].rstrip() + "…"
|
|
77
|
+
return flat
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _scores_for_finding(
|
|
81
|
+
fid: str, scores: Iterable[FindingScore],
|
|
82
|
+
) -> dict[str, FindingScore]:
|
|
83
|
+
return {s.scorer: s for s in scores if s.finding_id == fid}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def render_decision_replay(inputs: DecisionReplayInputs) -> str:
    """Build the Markdown body of ``decision-replay.md``.

    Layout, top to bottom: an H1 title, the original ask as a blockquote
    (only when non-blank), one ``## <finding-id> — <truncated text>``
    section per finding ordered by consensus_strength descending, and a
    footer naming the artefact mode so readers can tell whether member
    arguments were redacted. When there are no findings, a single
    placeholder line replaces the sections and no footer is written.
    """

    def _placeholder(finding_id: str) -> ConsensusMetadata:
        # Zero-strength stand-in for a finding with no metadata entry.
        return ConsensusMetadata(
            finding_id=finding_id, consensus_strength=0.0, dissent_count=0,
            scorers=(), mean_score=0.0,
        )

    def _strength(finding: Finding) -> float:
        return inputs.metadata.get(
            finding.id, _placeholder(finding.id),
        ).consensus_strength

    # Deliberation text per member, keyed "<provider>:<model>".
    member_texts = {}
    for response in inputs.deliberation:
        member_texts[f"{response.provider}:{response.model}"] = response.text or ""

    ordered = sorted(inputs.findings, key=_strength, reverse=True)

    out: list[str] = ["# Decision Replay\n"]
    if inputs.original_ask.strip():
        ask = " ".join(inputs.original_ask.split())
        if len(ask) > 400:
            ask = ask[:399].rstrip() + "…"
        out.append(f"> {ask}\n")

    if not ordered:
        out.append("*No findings were extracted for this session.*\n")
        return "\n".join(out).rstrip() + "\n"

    for finding in ordered:
        meta = inputs.metadata.get(finding.id)
        if meta is None:
            meta = _placeholder(finding.id)

        heading = " ".join(finding.text.split())
        if len(heading) > 120:
            heading = heading[:119].rstrip() + "…"
        verdict = _verdict(meta.consensus_strength)

        out.append(f"## {finding.id} — {heading}\n")
        summary = "".join((
            f"- **Consensus**: {verdict} ({meta.consensus_strength:.2f})\n",
            f"- **Evidence quality**: {meta.evidence_quality} "
            f"(mean {meta.mean_score:.1f}/10)\n",
            f"- **Agreement**: {meta.concur_count}/"
            f"{meta.concur_count + meta.dissent_count} members concur, "
            f"{meta.dissent_count} dissent\n",
        ))
        out.append(summary)

        if inputs.include_member_arguments:
            score_map = _scores_for_finding(finding.id, inputs.scores)
            supporters = [
                name for name in meta.scorers
                if score_map.get(name) and score_map[name].agree
            ]
            objectors = list(meta.dissent_reasons)
            if supporters:
                out.append("**Agreeing members**:")
                for name in supporters:
                    text = _scorer_argument(name, member_texts, score_map.get(name))
                    out.append(f"- _{name}_ — {text}")
                out.append("")
            if objectors:
                out.append("**Dissenting members**:")
                # The reason paired with each dissenter is not rendered;
                # the argument shown comes from _scorer_argument.
                for name, _reason in objectors:
                    text = _scorer_argument(name, member_texts, score_map.get(name))
                    out.append(f"- _{name}_ — {text}")
                out.append("")

        out.append(f"**Synthesis verdict**: {verdict} consensus — {finding.source} sourced.\n")

    if inputs.include_member_arguments:
        mode_label = "full"
    else:
        mode_label = "redacted (counts only)"
    out.append(f"---\n\n_artefact mode: {mode_label}_\n")
    return "\n".join(out).rstrip() + "\n"
|
|
@@ -71,14 +71,32 @@ def _utc_timestamp() -> str:
|
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def _serialise_response(r: CouncilResponse) -> dict[str, object]:
|
|
74
|
-
|
|
74
|
+
"""Project a `CouncilResponse` into the manifest schema.
|
|
75
|
+
|
|
76
|
+
Phase 5 / Step 1 — surface ``transport``, ``billable``,
|
|
77
|
+
``subscription_label``, ``cost_usd``, and ``tokens_estimated`` so
|
|
78
|
+
the audit trail can distinguish flat-rate CLI calls from billable
|
|
79
|
+
api / community-CLI calls. When ``tokens_estimated`` is true the
|
|
80
|
+
token counts are kept (heuristic) but flagged so consumers can
|
|
81
|
+
null or disclaim them.
|
|
82
|
+
"""
|
|
83
|
+
meta = r.metadata or {}
|
|
84
|
+
payload: dict[str, object] = {
|
|
75
85
|
"provider": r.provider,
|
|
76
86
|
"model": r.model,
|
|
77
87
|
"input_tokens": r.input_tokens,
|
|
78
88
|
"output_tokens": r.output_tokens,
|
|
79
89
|
"latency_ms": r.latency_ms,
|
|
80
90
|
"error": r.error,
|
|
91
|
+
"transport": meta.get("transport", "api"),
|
|
92
|
+
"billable": bool(meta.get("billable", True)),
|
|
93
|
+
"tokens_estimated": bool(meta.get("tokens_estimated", False)),
|
|
81
94
|
}
|
|
95
|
+
if meta.get("subscription_label"):
|
|
96
|
+
payload["subscription_label"] = meta["subscription_label"]
|
|
97
|
+
if "cost_usd" in meta:
|
|
98
|
+
payload["cost_usd"] = meta["cost_usd"]
|
|
99
|
+
return payload
|
|
82
100
|
|
|
83
101
|
|
|
84
102
|
def _load_retention_days(settings_path: Path | None = None) -> int:
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""Shadow-mode dispatch for low-impact solo-member decisions (step-9 P10).
|
|
2
|
+
|
|
3
|
+
When ``low_impact.dispatch: single`` is active, a Bernoulli-sampled subset
|
|
4
|
+
of decisions is shadowed through the full council so disagreement between
|
|
5
|
+
the solo verdict and the council verdict can be measured. The shadow log
|
|
6
|
+
lives at ``agents/council-shadow-log.jsonl`` and is subject to the same
|
|
7
|
+
privacy floor as the low-impact corpus: redactor-refused entries are
|
|
8
|
+
dropped, not softened.
|
|
9
|
+
|
|
10
|
+
The flip from ``single`` back to ``full`` is a user decision; this module
|
|
11
|
+
emits data and an SLO banner, nothing else.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
import random
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Iterable
|
|
23
|
+
|
|
24
|
+
from scripts.ai_council.bundler import redact
|
|
25
|
+
|
|
26
|
+
# Relative path of the shadow JSONL log.
SHADOW_LOG_PATH = Path("agents/council-shadow-log.jsonl")

# Disagreement-rate boundaries read by ``slo_status``: a rate below WARN
# is "OK", a rate below BREACH is "WARN", anything else is "BREACH".
SLO_THRESHOLD_WARN = 0.05
SLO_THRESHOLD_BREACH = 0.08
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class ShadowDecision:
    """One shadow-log row pairing a solo verdict with the council verdict."""

    # ISO-8601 UTC timestamp with second precision, as produced by
    # ``record_shadow_decision``.
    timestamp: str
    # First 16 hex characters of the SHA-256 of the redacted query.
    query_hash: str
    solo_verdict: str
    full_verdict: str
    # True when ``solo_verdict == full_verdict``.
    agreed: bool
    #: Step-9 P13 — True when the confidence gate auto-escalated this
    #: decision to the full council. Distinguishes "silent disagreement"
    #: (escalated=False, agreed=False) from "gate-caught" (escalated=True)
    #: in the SLO banner.
    escalated: bool = False
    escalation_reason: str = "ok"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def should_shadow(
    sample_rate: float,
    *,
    rng: random.Random | None = None,
) -> bool:
    """Make one Bernoulli draw at ``sample_rate`` clamped to ``[0, 1]``.

    The draw comes from ``rng`` when one is supplied, otherwise from the
    module-level ``random`` generator. Returns True when the drawn value
    falls strictly below the clamped rate.
    """
    source = random if rng is None else rng
    clamped = min(1.0, sample_rate)
    clamped = max(0.0, clamped)
    return source.random() < clamped
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _hash_query(query: str) -> str:
    """Return the first 16 hex characters of SHA-256 over the redacted query."""
    digest = hashlib.sha256()
    digest.update(redact(query).encode("utf-8"))
    return digest.hexdigest()[:16]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _privacy_dropped(redacted: str) -> bool:
|
|
63
|
+
stripped = redacted.strip()
|
|
64
|
+
if not stripped:
|
|
65
|
+
return True
|
|
66
|
+
return stripped.startswith("[redacted")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def record_shadow_decision(
    log_path: Path,
    *,
    query: str,
    solo_verdict: str,
    full_verdict: str,
    escalated: bool = False,
    escalation_reason: str = "ok",
) -> ShadowDecision | None:
    """Append one JSONL row to ``log_path`` and return it as a decision.

    Returns ``None`` without writing anything when the redacted query
    trips the privacy floor (do not soften). Otherwise the parent
    directory is created if needed and a single row is appended.

    ``escalated`` / ``escalation_reason`` come from the confidence gate
    (step-9 P13). When ``escalated`` is True, ``solo_verdict`` is the
    rejected solo response and ``full_verdict`` is the council's verdict
    that actually answered the user.
    """
    if _privacy_dropped(redact(query)):
        return None

    # Only the query is redacted and hashed here; both verdicts are
    # stored exactly as given.
    row = {
        "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "query_hash": _hash_query(query),
        "solo_verdict": solo_verdict,
        "full_verdict": full_verdict,
        "agreed": (solo_verdict == full_verdict),
        "escalated": escalated,
        "escalation_reason": escalation_reason,
    }
    decision = ShadowDecision(**row)

    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(row) + "\n")
    return decision
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _iter_log(log_path: Path) -> Iterable[dict]:
|
|
114
|
+
if not log_path.exists():
|
|
115
|
+
return
|
|
116
|
+
with log_path.open("r", encoding="utf-8") as f:
|
|
117
|
+
for line in f:
|
|
118
|
+
line = line.strip()
|
|
119
|
+
if not line:
|
|
120
|
+
continue
|
|
121
|
+
try:
|
|
122
|
+
yield json.loads(line)
|
|
123
|
+
except json.JSONDecodeError:
|
|
124
|
+
continue
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def compute_disagreement_rate(
    log_path: Path,
    *,
    window_days: int = 7,
    now: datetime | None = None,
) -> tuple[float, int]:
    """``(disagreement_rate, sample_count)`` over the rolling window.

    Counts a row as "disagreed" when ``agreed`` is falsy regardless of
    the escalation flag — a gate-caught split is still a sign that
    solo mode was wrong on that decision. :func:`compute_escalation_rate`
    breaks the same window down by ``escalated=True`` for the banner.

    Args:
        log_path: Shadow JSONL log to scan.
        window_days: Rows older than this many days before ``now`` are
            ignored.
        now: End of the window; defaults to the current UTC time. A
            naive value is read as UTC.

    Returns:
        ``(0.0, 0)`` when no row falls inside the window, otherwise the
        share of in-window rows that disagreed and the in-window count.
        Rows whose ``timestamp`` is missing, not a string, or not
        ISO-8601 are skipped and not counted.
    """
    reference = now or datetime.now(timezone.utc)
    if reference.tzinfo is None:
        # Naive row timestamps are read as UTC below; do the same for the
        # reference so naive and aware values are never compared.
        reference = reference.replace(tzinfo=timezone.utc)
    cutoff = reference - timedelta(days=window_days)
    total = 0
    disagreed = 0
    for row in _iter_log(log_path):
        raw_ts = row.get("timestamp", "")
        if not isinstance(raw_ts, str):
            # e.g. ``"timestamp": null`` — cannot be parsed, skip the row.
            continue
        try:
            ts = datetime.fromisoformat(raw_ts.replace("Z", "+00:00"))
        except ValueError:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=timezone.utc)
        if ts < cutoff:
            continue
        total += 1
        if not row.get("agreed", True):
            disagreed += 1
    if total == 0:
        return 0.0, 0
    return disagreed / total, total
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def compute_escalation_rate(
    log_path: Path,
    *,
    window_days: int = 7,
    now: datetime | None = None,
) -> tuple[float, int]:
    """``(escalation_rate, sample_count)`` — fraction with ``escalated=True``.

    Step-9 P13 — separates gate-caught escalations from silent
    disagreement so the banner can name the dominant failure mode.

    Args:
        log_path: Shadow JSONL log to scan.
        window_days: Rows older than this many days before ``now`` are
            ignored.
        now: End of the window; defaults to the current UTC time. A
            naive value is read as UTC.

    Returns:
        ``(0.0, 0)`` when no row falls inside the window, otherwise the
        share of in-window rows with a truthy ``escalated`` flag and the
        in-window count. Rows whose ``timestamp`` is missing, not a
        string, or not ISO-8601 are skipped and not counted.
    """
    reference = now or datetime.now(timezone.utc)
    if reference.tzinfo is None:
        # Naive row timestamps are read as UTC below; do the same for the
        # reference so naive and aware values are never compared.
        reference = reference.replace(tzinfo=timezone.utc)
    cutoff = reference - timedelta(days=window_days)
    total = 0
    escalated = 0
    for row in _iter_log(log_path):
        raw_ts = row.get("timestamp", "")
        if not isinstance(raw_ts, str):
            # e.g. ``"timestamp": null`` — cannot be parsed, skip the row.
            continue
        try:
            ts = datetime.fromisoformat(raw_ts.replace("Z", "+00:00"))
        except ValueError:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=timezone.utc)
        if ts < cutoff:
            continue
        total += 1
        if row.get("escalated", False):
            escalated += 1
    if total == 0:
        return 0.0, 0
    return escalated / total, total
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def slo_status(rate: float) -> str:
    """Classify a disagreement rate as ``"OK"``, ``"WARN"`` or ``"BREACH"``.

    A rate below ``SLO_THRESHOLD_WARN`` is OK, a rate below
    ``SLO_THRESHOLD_BREACH`` is WARN, and anything else is BREACH.
    """
    ceilings = (
        (SLO_THRESHOLD_WARN, "OK"),
        (SLO_THRESHOLD_BREACH, "WARN"),
    )
    for ceiling, label in ceilings:
        if rate < ceiling:
            return label
    return "BREACH"
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def slo_banner(
    rate: float,
    sample_count: int,
    *,
    escalation_rate: float | None = None,
) -> str:
    """Format the one-line shadow SLO banner.

    With zero samples the banner only says that none exist yet.
    Otherwise it names the status from :func:`slo_status`, the
    disagreement percentage and the sample count, followed by a
    status-specific hint. When ``escalation_rate`` is given, a tail is
    appended with the share of decisions the confidence gate caught
    before they reached the user (step-9 P13).
    """
    pct = rate * 100
    status = slo_status(rate)
    if sample_count == 0:
        return "[shadow SLO] no samples yet"

    hints = {
        "OK": "(<5%)",
        "WARN": "(5–8% — consider reverting to low_impact.dispatch: full)",
    }
    if status in hints:
        label, hint = status, hints[status]
    else:
        # Any status other than OK / WARN is reported as a breach.
        label, hint = "BREACH", "(>8% — revert to low_impact.dispatch: full)"

    banner = (
        f"[shadow SLO] {label} · {pct:.1f}% disagreement over "
        f"{sample_count} samples {hint}"
    )
    if escalation_rate is None:
        return banner
    return banner + f" · {escalation_rate * 100:.1f}% auto-escalated"
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""Solo-member dispatch — step-9 P9 (U2).
|
|
2
|
+
|
|
3
|
+
Picks the first enabled, auth-valid member from
|
|
4
|
+
``routing.solo_member_fallback_chain`` so low-impact decisions can
|
|
5
|
+
optionally route to a single member instead of the full council. The
|
|
6
|
+
selection is intentionally side-effect-free: callers own logging,
|
|
7
|
+
dispatch, and the all-invalid → full-council fallback.
|
|
8
|
+
|
|
9
|
+
Iron Law: a None selection from :func:`select_solo_member` is the
|
|
10
|
+
caller's signal to fall back to the full council with a WARN log —
|
|
11
|
+
NEVER to fail the decision. The dispatcher must never break a
|
|
12
|
+
user's flow because a CLI was offline.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import time
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from typing import Callable, Mapping
|
|
21
|
+
|
|
22
|
+
from scripts.ai_council.config import MemberConfig, RoutingConfig
|
|
23
|
+
from scripts.ai_council.confidence_gate import (
|
|
24
|
+
EscalationDecision,
|
|
25
|
+
should_escalate,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
#: TTL, in seconds (15 minutes), for cached auth-probe results. Lazy
#: probe per session; bumped forward whenever a probe is re-run.
_AUTH_CACHE_TTL_SECONDS = 15 * 60

#: Env var that forces every solo-dispatch path back to full council
#: for the current invocation. Honored by :func:`select_solo_member`
#: and surfaced through :func:`force_full_council`. Only the exact
#: value ``1`` switches it on.
FORCE_FULL_ENV = "AGENT_CONFIG_FORCE_FULL_COUNCIL"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class AuthCacheEntry:
    """One auth-probe result with the expiry it was cached against."""

    # Outcome of the probe: True when the member passed the auth check.
    valid: bool
    # Expiry instant on the same clock as the ``now`` values given to
    # ``AuthCache``; the entry counts as a miss once ``expires_at <= now``.
    expires_at: float
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class AuthCache:
    """Per-process, in-memory store of auth-probe verdicts by name."""

    entries: dict[str, AuthCacheEntry] = field(default_factory=dict)

    def get(self, name: str, *, now: float) -> bool | None:
        """Return the cached verdict for ``name``, or ``None`` on a miss.

        An entry whose ``expires_at`` is at or before ``now`` counts as
        a miss; it is left in place, not evicted.
        """
        hit = self.entries.get(name)
        if hit is None:
            return None
        expired = hit.expires_at <= now
        return None if expired else hit.valid

    def set(self, name: str, *, valid: bool, now: float) -> None:
        """Store ``valid`` for ``name`` with a fresh TTL counted from ``now``."""
        deadline = now + _AUTH_CACHE_TTL_SECONDS
        self.entries[name] = AuthCacheEntry(valid=valid, expires_at=deadline)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def force_full_council(env: Mapping[str, str] | None = None) -> bool:
    """Tell whether the force-full-council override is switched on.

    Reads ``FORCE_FULL_ENV`` from ``env``, or from ``os.environ`` when
    ``env`` is ``None``. Only the exact string ``1`` counts; every other
    value, truthy-looking or not, is rejected on purpose — the override
    is a hard one-bit switch, not a free-form bool.
    """
    source = os.environ if env is None else env
    flag = source.get(FORCE_FULL_ENV, "")
    return flag == "1"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def select_solo_member(
    routing: RoutingConfig,
    members: Mapping[str, MemberConfig],
    *,
    auth_cache: AuthCache,
    probe: Callable[[str, float], bool],
    now: float | None = None,
    env: Mapping[str, str] | None = None,
) -> str | None:
    """Pick the first enabled, auth-valid entry of the fallback chain.

    ``routing.solo_member_fallback_chain`` is walked in order. An entry
    is skipped when its member is missing from ``members`` or disabled.
    Otherwise the auth cache decides; on a cache miss the member is
    probed with ``routing.auth_check_timeout_seconds`` and the outcome
    is cached. The name of the first valid member is returned, or
    ``None`` when the whole chain is unavailable.

    ``probe(name, timeout_s) -> bool`` is supplied by the caller. It
    MUST honor ``timeout_s`` and return False on timeout so the
    dispatcher cannot stall on a wedged CLI.

    When ``AGENT_CONFIG_FORCE_FULL_COUNCIL=1`` is set the function
    returns ``None`` straight away, as if the chain were unavailable.
    The caller still owns the WARN log + full-council escalation.
    """
    if force_full_council(env):
        return None

    clock = time.monotonic() if now is None else now
    timeout_s = routing.auth_check_timeout_seconds

    for name in routing.solo_member_fallback_chain:
        member = members.get(name)
        if member is None or not member.enabled:
            continue

        known = auth_cache.get(name, now=clock)
        if known is True:
            return name
        if known is False:
            continue

        # Cache miss: probe once and remember the outcome.
        try:
            ok = bool(probe(name, timeout_s))
        except Exception:
            # A probe that raises counts as auth-invalid so the walk
            # moves on to the next entry. Nothing is logged here;
            # callers should log probe failures in production.
            ok = False
        auth_cache.set(name, valid=ok, now=clock)
        if ok:
            return name

    return None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass(frozen=True)
class SoloDispatchResult:
    """Outcome of :func:`dispatch_with_escalation`.

    ``verdict`` is the final answer text returned to the caller.
    ``escalated`` is True when the solo response was rejected by the
    confidence gate and the full council ran. ``solo_member`` /
    ``solo_response`` are populated even on escalation so the shadow
    log can record both sides without re-running the solo step.
    """

    # Solo response when not escalated; otherwise the full-council result.
    verdict: str
    escalated: bool
    escalation_reason: str  # 'low_confidence' | 'split' | 'refusal' | 'short_response' | 'ok' | 'no_solo_member'
    # The three solo_* fields are None only when no solo member could be
    # selected (``escalation_reason == 'no_solo_member'``).
    solo_member: str | None
    solo_response: str | None
    solo_confidence: float | None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def dispatch_with_escalation(
    routing: RoutingConfig,
    members: Mapping[str, MemberConfig],
    *,
    auth_cache: AuthCache,
    probe: Callable[[str, float], bool],
    run_solo: Callable[[str], str],
    run_full: Callable[[], str],
    confidence_floor: float,
    now: float | None = None,
    env: Mapping[str, str] | None = None,
) -> SoloDispatchResult:
    """Run a solo dispatch, escalating to the full council when needed.

    Step-9 P13 — defense-in-depth on top of shadow-mode SLO.

    Steps:

    1. ``select_solo_member`` chooses the chain entry.
    2. No entry available → ``run_full`` runs at once and the result is
       tagged ``no_solo_member``.
    3. Otherwise ``run_solo`` runs and its response is judged by
       :func:`scripts.ai_council.confidence_gate.should_escalate`
       against ``confidence_floor``.
    4. If the gate says escalate, ``run_full`` runs and its verdict is
       returned; the solo response stays on the result for shadow
       logging.
    5. If not, the solo response is returned unchanged.

    ``run_solo(name) -> str`` and ``run_full() -> str`` are caller-
    supplied; this module owns no LLM transport. Callers MUST raise
    on transport errors — escalation is for *content* low-confidence,
    not infrastructure failures (those bubble up to the orchestrator's
    own retry / fallback policy).
    """
    chosen = select_solo_member(
        routing,
        members,
        auth_cache=auth_cache,
        probe=probe,
        now=now,
        env=env,
    )
    if chosen is None:
        return SoloDispatchResult(
            verdict=run_full(),
            escalated=True,
            escalation_reason="no_solo_member",
            solo_member=None,
            solo_response=None,
            solo_confidence=None,
        )

    solo_text = run_solo(chosen)
    gate: EscalationDecision = should_escalate(solo_text, floor=confidence_floor)

    if gate.escalate:
        # Solo answer rejected: the council's answer becomes the verdict.
        final, escalated, reason = run_full(), True, gate.reason
    else:
        final, escalated, reason = solo_text, False, "ok"

    return SoloDispatchResult(
        verdict=final,
        escalated=escalated,
        escalation_reason=reason,
        solo_member=chosen,
        solo_response=solo_text,
        solo_confidence=gate.confidence,
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Public alias of the private TTL constant, defined ahead of the export
# list that names it.
AUTH_CACHE_TTL_SECONDS = _AUTH_CACHE_TTL_SECONDS

__all__ = [
    "AUTH_CACHE_TTL_SECONDS",
    "AuthCache",
    "AuthCacheEntry",
    "FORCE_FULL_ENV",
    "SoloDispatchResult",
    "dispatch_with_escalation",
    "force_full_council",
    "select_solo_member",
]
|