@event4u/agent-config 2.13.0 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agents/user/accept.md +117 -0
- package/.agent-src/commands/agents/user/init.md +163 -0
- package/.agent-src/commands/agents/user/review.md +107 -0
- package/.agent-src/commands/agents/user/show.md +109 -0
- package/.agent-src/commands/agents/user/update.md +98 -0
- package/.agent-src/commands/agents/user.md +66 -0
- package/.agent-src/commands/agents.md +2 -0
- package/.agent-src/commands/memory/learn-low-impact.md +143 -0
- package/.agent-src/rules/ask-when-uncertain.md +10 -6
- package/.agent-src/rules/copilot-routing.md +1 -1
- package/.agent-src/rules/devcontainer-routing.md +1 -1
- package/.agent-src/rules/external-reference-deep-dive.md +1 -1
- package/.agent-src/rules/fast-path-marker-visibility.md +38 -0
- package/.agent-src/rules/low-impact-corpus-privacy-floor.md +74 -0
- package/.agent-src/rules/symfony-routing.md +1 -1
- package/.agent-src/skills/ai-council/SKILL.md +208 -8
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.claude-plugin/marketplace.json +8 -1
- package/CHANGELOG.md +328 -124
- package/README.md +21 -6
- package/config/agent-settings.template.yml +4 -0
- package/config/gitignore-block.txt +17 -0
- package/docs/architecture.md +12 -12
- package/docs/archive/CHANGELOG-pre-2.11.0.md +141 -0
- package/docs/catalog.md +16 -7
- package/docs/contracts/adr-architectural-consensus-mechanism.md +4 -3
- package/docs/contracts/adr-level-6-productization.md +7 -9
- package/docs/contracts/agent-user-schema.md +165 -0
- package/docs/contracts/ai-council-config.md +492 -20
- package/docs/contracts/command-clusters.md +2 -2
- package/docs/contracts/command-surface-tiers.md +3 -2
- package/docs/contracts/cost-profile-defaults.md +5 -0
- package/docs/contracts/decision-engine-gates.md +5 -0
- package/docs/contracts/decision-trace-v1.md +2 -2
- package/docs/contracts/file-ownership-matrix.json +1961 -108
- package/docs/contracts/installed-tools-lockfile.md +2 -1
- package/docs/contracts/low-impact-corpus-format.md +95 -0
- package/docs/contracts/mcp-beta-criteria.md +6 -5
- package/docs/contracts/mcp-cloud-scope.md +5 -4
- package/docs/contracts/multi-tool-projection-fidelity.md +8 -2
- package/docs/contracts/release-trunk-sync.md +4 -3
- package/docs/contracts/tier-3-contrib-plugin.md +5 -6
- package/docs/examples/agent-user.example.md +21 -0
- package/docs/getting-started.md +2 -2
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +2 -1
- package/docs/installation.md +32 -0
- package/package.json +1 -1
- package/scripts/_cli/cmd_doctor.py +134 -0
- package/scripts/ai_council/airgap.py +165 -0
- package/scripts/ai_council/cli_hints.py +123 -0
- package/scripts/ai_council/clients.py +787 -5
- package/scripts/ai_council/compile_corpus.py +178 -0
- package/scripts/ai_council/confidence_gate.py +156 -0
- package/scripts/ai_council/config.py +1007 -11
- package/scripts/ai_council/consensus.py +41 -2
- package/scripts/ai_council/events_log.py +137 -0
- package/scripts/ai_council/learn_low_impact_preview.py +252 -0
- package/scripts/ai_council/low_impact.py +714 -0
- package/scripts/ai_council/low_impact_corpus.py +466 -0
- package/scripts/ai_council/low_impact_intake.py +163 -0
- package/scripts/ai_council/modes.py +6 -1
- package/scripts/ai_council/necessity.py +782 -0
- package/scripts/ai_council/orchestrator.py +252 -14
- package/scripts/ai_council/probation_gate.py +152 -0
- package/scripts/ai_council/redact_low_impact_entry.py +155 -0
- package/scripts/ai_council/replay.py +155 -0
- package/scripts/ai_council/session.py +19 -1
- package/scripts/ai_council/shadow_dispatch.py +235 -0
- package/scripts/ai_council/solo_dispatch.py +226 -0
- package/scripts/audit_cloud_compatibility.py +74 -0
- package/scripts/audit_command_surface.py +363 -0
- package/scripts/check_council_layout.py +11 -0
- package/scripts/council_cli.py +1046 -15
- package/scripts/install.sh +12 -0
|
@@ -0,0 +1,782 @@
|
|
|
1
|
+
"""Council-necessity classifier (Phase 6).
|
|
2
|
+
|
|
3
|
+
Heuristic pre-flight that decides whether the request actually warrants
|
|
4
|
+
a council deliberation. Three verdicts drive three exit paths in the
|
|
5
|
+
dispatcher (skip / educate / proceed). See
|
|
6
|
+
``docs/contracts/ai-council-config.md`` for the trigger lists and the
|
|
7
|
+
toggle schema.
|
|
8
|
+
|
|
9
|
+
The classifier is **shape-based**, not semantic — it scans the prompt
|
|
10
|
+
for marker words associated with each bucket. False positives are
|
|
11
|
+
preferable to false negatives on the `necessary` side (an extra council
|
|
12
|
+
run is cheaper than a missed strategic decision); the educate path
|
|
13
|
+
exists exactly to let the user override a wrong `unnecessary` verdict.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Literal
|
|
21
|
+
|
|
22
|
+
NecessityVerdict = Literal["necessary", "borderline", "unnecessary"]
|
|
23
|
+
Invocation = Literal["agent", "user_explicit"]
|
|
24
|
+
|
|
25
|
+
#: Length tier cut-offs in characters (stripped prompt). Tier names are
|
|
26
|
+
#: used in :func:`classify_size_fit` rationales; tweak only with a
|
|
27
|
+
#: parametrised test update.
|
|
28
|
+
_SHORT_PROMPT_MAX = 200
|
|
29
|
+
_MEDIUM_PROMPT_MAX = 800
|
|
30
|
+
|
|
31
|
+
#: Lenses where the size classifier never suggests a downgrade. Debate
|
|
32
|
+
#: is structurally expensive but also depends on top-tier reasoning to
|
|
33
|
+
#: produce useful dissent — surfacing a downgrade prompt mid-debate
|
|
34
|
+
#: degrades signal-to-noise.
|
|
35
|
+
_NO_DOWNGRADE_LENSES = frozenset({"debate"})
|
|
36
|
+
|
|
37
|
+
#: Trigger words that flag a prompt as `necessary`. Each entry must be
|
|
38
|
+
#: a lowercase, whole-word match — surrounding word boundaries are
|
|
39
|
+
#: enforced by :func:`_count_matches`. Buckets:
|
|
40
|
+
#:
|
|
41
|
+
#: - architecture: structural / boundary / cross-component decisions
|
|
42
|
+
#: - tradeoff: multi-stakeholder or multi-axis trade-off shape
|
|
43
|
+
#: - ambiguity: explicit uncertainty markers in the prompt
|
|
44
|
+
#: - strategic: decision verbs that move the artefact across a fork
|
|
45
|
+
NECESSARY_TRIGGERS: dict[str, tuple[str, ...]] = {
|
|
46
|
+
"architecture": (
|
|
47
|
+
"architecture", "architectural", "system design", "boundary",
|
|
48
|
+
"boundaries", "coupling", "decouple", "monorepo", "microservice",
|
|
49
|
+
"microservices", "service boundary", "module boundary",
|
|
50
|
+
"refactor strategy", "migration plan", "rewrite", "redesign",
|
|
51
|
+
),
|
|
52
|
+
"tradeoff": (
|
|
53
|
+
"trade-off", "tradeoff", "trade off", "stakeholder", "stakeholders",
|
|
54
|
+
"competing", "tension", "balance", "weigh", "pros and cons",
|
|
55
|
+
"alternatives", "options", "vs", "versus",
|
|
56
|
+
),
|
|
57
|
+
"ambiguity": (
|
|
58
|
+
"unsure", "uncertain", "ambiguous", "unclear", "not sure",
|
|
59
|
+
"don't know", "dont know", "open question", "controversial",
|
|
60
|
+
"debate", "second opinion", "sanity check",
|
|
61
|
+
),
|
|
62
|
+
"strategic": (
|
|
63
|
+
"should we", "shall we", "do we", "roadmap", "long-term",
|
|
64
|
+
"strategic", "strategy", "vision", "direction", "decision",
|
|
65
|
+
"decide", "choose", "select", "approach", "policy",
|
|
66
|
+
),
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
#: Trigger words that flag a prompt as `unnecessary`. Same matching
#: rules as :data:`NECESSARY_TRIGGERS`. Buckets:
#:
#: - bugfix: localised defect / error / crash hunt
#: - syntax: tooling / format / lint level
#: - single_file: implementation scoped to one file or function
#: - lookup: information retrieval, not deliberation
#:
#: NOTE(review): ``rename`` is listed under both ``syntax`` and
#: ``single_file``. :func:`_count_matches` sums the per-bucket counts, so
#: a single "rename" in a prompt contributes 2 to the total hit count —
#: confirm this double weighting is intended.
UNNECESSARY_TRIGGERS: dict[str, tuple[str, ...]] = {
    "bugfix": (
        "bug", "bugfix", "fix bug", "crash", "error", "exception",
        "stack trace", "traceback", "failing test", "fails", "broken",
        "regression",
    ),
    "syntax": (
        "syntax", "typo", "format", "formatting", "lint", "linter",
        "indent", "indentation", "rename", "import order",
    ),
    "single_file": (
        "this function", "this method", "this file", "one-line",
        "one liner", "small change", "rename", "extract method",
        "extract function", "add a getter", "add a setter",
    ),
    "lookup": (
        "what is", "what's", "what does", "how does", "look up",
        "documentation", "docs", "example", "snippet", "syntax of",
        "api of",
    ),
}
|
|
97
|
+
|
|
98
|
+
#: Lenses where the necessity bar is tighter — debate is expensive, so
|
|
99
|
+
#: a `borderline` verdict on the `debate` lens gets nudged toward
|
|
100
|
+
#: `unnecessary` when no `necessary` marker is present. `pr` lens fires
|
|
101
|
+
#: on diffs and stays neutral. Other lenses use the default scoring.
|
|
102
|
+
_STRICT_LENSES = frozenset({"debate"})
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass(frozen=True)
class ClassificationResult:
    """Immutable record describing one necessity classification.

    Attributes:
        verdict: Where the prompt landed — ``necessary``,
            ``borderline`` or ``unnecessary``.
        category: Name of the trigger bucket that matched best
            (``architecture``, ``bugfix``, ``lookup``, …), or
            ``"unclassified"`` if no marker fired at all.
        rationale: Single-line, human-readable justification, short
            enough to show inline in session.md or on the educate path.
        necessary_hits: How many `necessary` triggers matched.
        unnecessary_hits: How many `unnecessary` triggers matched.
    """

    verdict: NecessityVerdict
    category: str
    rationale: str
    necessary_hits: int
    unnecessary_hits: int
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
_WORD_RE_CACHE: dict[str, re.Pattern[str]] = {}


def _compile(trigger: str) -> re.Pattern[str]:
    """Return the whole-word, case-insensitive regex for ``trigger``.

    The trigger is escaped literally and wrapped in ``\\b`` word
    boundaries, so ``"vs"`` matches ``"a vs b"`` but not ``"canvas"``.
    Single words and multi-word phrases are handled identically.
    Compiled patterns are memoised in :data:`_WORD_RE_CACHE`, keyed by
    the trigger string exactly as passed in.

    Args:
        trigger: Literal word or phrase to look for.

    Returns:
        The compiled pattern (the same object on every call for a given
        trigger).
    """
    cached = _WORD_RE_CACHE.get(trigger)
    if cached is None:
        # One pattern shape for every trigger: the former
        # whitespace / no-whitespace split produced identical regexes.
        cached = re.compile(
            r"\b" + re.escape(trigger) + r"\b", flags=re.IGNORECASE,
        )
        _WORD_RE_CACHE[trigger] = cached
    return cached
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _count_matches(
    text: str, triggers: dict[str, tuple[str, ...]],
) -> tuple[int, str | None]:
    """Tally trigger matches and name the strongest bucket.

    Each trigger contributes at most one hit (presence, not number of
    occurrences). The returned pair is ``(total_hits, top_bucket)``
    where ``top_bucket`` is the bucket with the highest hit count, or
    ``None`` when nothing matched. On a tie the bucket that appears
    first in ``triggers`` wins, because a later bucket only takes over
    with a strictly larger count — dict insertion order therefore acts
    as the priority order.
    """
    overall = 0
    leader: str | None = None
    leader_hits = 0
    for name, phrases in triggers.items():
        found = sum(1 for phrase in phrases if _compile(phrase).search(text))
        overall += found
        if found > leader_hits:
            leader, leader_hits = name, found
    return overall, leader
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def classify_necessity(
    prompt: str,
    lens: str = "analysis",
    invocation: Invocation = "agent",
) -> ClassificationResult:
    """Classify a council request as necessary / borderline / unnecessary.

    The verdict is derived purely from how many `necessary` and
    `unnecessary` trigger phrases occur in the prompt:

    * two or more `necessary` hits that outnumber the `unnecessary`
      hits → ``necessary``;
    * two or more `unnecessary` hits and no `necessary` hit →
      ``unnecessary``;
    * everything else (a single hit on either side, mixed signals, or
      no hits at all) → ``borderline``.

    Args:
        prompt: The raw prompt text the council would deliberate on.
            Whitespace-stripped; empty input maps to ``unnecessary`` /
            ``"empty"``.
        lens: Active lens (``analysis``, ``debate``, ``pr``, …). Strict
            lenses (currently ``debate``) raise the bar — a borderline
            verdict with no `necessary` hits flips to ``unnecessary``.
        invocation: Source signal — ``agent`` or ``user_explicit``.
            Not read by this function and does not change the verdict;
            the dispatcher routes on the pair ``(verdict, invocation)``.

    Returns:
        :class:`ClassificationResult` with verdict, top-matched
        category, one-line rationale, and raw hit counts (useful for
        tests and session.md provenance).
    """
    text = (prompt or "").strip()
    if not text:
        return ClassificationResult(
            verdict="unnecessary",
            category="empty",
            rationale="Empty prompt — nothing to deliberate.",
            necessary_hits=0,
            unnecessary_hits=0,
        )

    n_hits, n_bucket = _count_matches(text, NECESSARY_TRIGGERS)
    u_hits, u_bucket = _count_matches(text, UNNECESSARY_TRIGGERS)

    verdict: NecessityVerdict
    if n_hits >= 2 and n_hits > u_hits:
        # Strong necessary signal that outweighs any unnecessary one.
        verdict = "necessary"
        category = n_bucket or "unclassified"
        rationale = (
            f"Matched {n_hits} `necessary` trigger(s) in bucket "
            f"`{category}`; council deliberation typically warranted."
        )
    elif n_hits == 1 and u_hits == 0:
        # A lone necessary marker: two or more with no unnecessary hits
        # would already have been caught by the branch above, so this
        # case can only ever be borderline.
        verdict = "borderline"
        category = n_bucket or "unclassified"
        rationale = (
            f"{n_hits} `necessary` marker(s) in `{category}`, no "
            f"`unnecessary` markers — leaning toward deliberation."
        )
    elif u_hits >= 2 and n_hits == 0:
        # Strong unnecessary signal with nothing pulling the other way.
        verdict = "unnecessary"
        category = u_bucket or "unclassified"
        rationale = (
            f"Matched {u_hits} `unnecessary` trigger(s) in bucket "
            f"`{category}`; council typically does not add value here."
        )
    elif u_hits == 1 and n_hits == 0:
        # A lone unnecessary marker — not enough to skip the council.
        verdict = "borderline"
        category = u_bucket or "unclassified"
        rationale = (
            f"{u_hits} `unnecessary` marker(s) in `{category}`, no "
            f"`necessary` markers — leaning away from deliberation."
        )
    else:
        # Mixed or no markers — borderline by default.
        verdict = "borderline"
        category = (n_bucket or u_bucket) or "unclassified"
        rationale = (
            f"Mixed signals: necessary={n_hits}, unnecessary={u_hits}. "
            f"Borderline — proceed with a one-line note in session.md."
        )

    # Lens-strictness pass: debate-tier lenses nudge borderline →
    # unnecessary when no `necessary` marker is present, to prevent
    # expensive debate runs on trivial questions. The category chosen
    # above is kept; only verdict and rationale change.
    if lens in _STRICT_LENSES and verdict == "borderline" and n_hits == 0:
        verdict = "unnecessary"
        rationale = (
            f"Lens `{lens}` is strict (expensive deliberation); "
            f"borderline with zero `necessary` markers → unnecessary."
        )

    return ClassificationResult(
        verdict=verdict,
        category=category,
        rationale=rationale,
        necessary_hits=n_hits,
        unnecessary_hits=u_hits,
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def educate_message(result: ClassificationResult, lens: str) -> str:
    """Build the user-facing "educate" paragraph for the dispatcher.

    Used on the `user_explicit + unnecessary` path only. The skill layer
    follows it with a numbered-options prompt (1=proceed, 2=skip); the
    CLI prints it as plain text and exits non-zero unless
    ``--proceed-anyway`` was given.

    Args:
        result: Classification whose ``category`` and
            ``unnecessary_hits`` are quoted in the message.
        lens: Name of the active lens, quoted in the message.

    Returns:
        Two paragraphs separated by a blank line: the explanation and
        the re-run hint.
    """
    headline = (
        f"This request looks like `{result.category}` "
        f"({result.unnecessary_hits} matching marker(s)) on the "
        f"`{lens}` lens."
    )
    guidance = (
        "Council typically adds value when the request "
        "involves architectural trade-offs, multi-stakeholder "
        "decisions, or strategic direction — not for localised bug "
        "fixes, syntax / formatting work, or lookups."
    )
    rerun_hint = "Re-run with `--proceed-anyway` to invoke the council anyway."
    return f"{headline} {guidance}\n\n{rerun_hint}"
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# --- Phase 7: Model-size classifier + downgrade suggestion ---------------
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@dataclass(frozen=True)
class SizeFitVerdict:
    """Immutable result of a model-size fit check.

    Attributes:
        fit: ``True`` if ``current_model`` suits the shape of the
            prompt; ``False`` if a cheaper / faster sibling on the same
            ladder is expected to answer equally well.
        suggested_model: The ladder entry to switch to when
            ``fit=False``; ``None`` when ``fit=True`` (nothing to swap).
        reason: Single-line, human-readable justification.
        current_index: Zero-based position of ``current_model`` on the
            ladder (0 = smallest), or ``-1`` if the model is not on the
            ladder at all.
        length_tier: Prompt length bucket — ``"short"``, ``"medium"``
            or ``"long"``.
        complexity_hits: Number of `necessary`-bucket markers found in
            the prompt, used as a proxy for "needs a big model".
    """

    fit: bool
    suggested_model: str | None
    reason: str
    current_index: int
    length_tier: Literal["short", "medium", "long"]
    complexity_hits: int
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _length_tier(text: str) -> Literal["short", "medium", "long"]:
    """Bucket ``text`` by character count.

    Fewer than ``_SHORT_PROMPT_MAX`` characters is ``"short"``, fewer
    than ``_MEDIUM_PROMPT_MAX`` is ``"medium"``, anything else is
    ``"long"``. Both cut-offs are exclusive upper bounds.
    """
    size = len(text)
    if size >= _SHORT_PROMPT_MAX:
        return "medium" if size < _MEDIUM_PROMPT_MAX else "long"
    return "short"
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def classify_size_fit(
    prompt: str,
    current_model: str,
    ladder: tuple[str, ...] | list[str],
    lens: str = "analysis",
) -> SizeFitVerdict:
    """Decide whether ``current_model`` fits the prompt shape.

    Heuristic — never suggests an UP-tier swap (Phase 7 is downgrade-
    only). The checks run in this order; the first that applies wins:

    1. ``current_model`` is not on the ladder → keep (index ``-1``).
    2. ``current_model`` is already the smallest rung → keep.
    3. ``lens`` is a no-downgrade lens → keep.
    4. Two or more complexity markers, or a long prompt → keep.
    5. Short or medium prompt with zero complexity markers → suggest
       the next rung down.
    6. Otherwise (exactly one complexity marker) → keep.

    Args:
        prompt: raw prompt text the council would deliberate on.
            ``None`` / empty is treated as an empty string.
        current_model: model id currently selected for the member.
        ladder: provider's `model_ladder` ordered smallest → largest.
            When ``current_model`` is not on the ladder, returns
            ``fit=True`` with an explanatory reason (no downgrade
            suggested — caller should configure the ladder first).
        lens: active lens; ``debate`` lens disables downgrade
            suggestions to keep dissent quality high.

    Returns:
        :class:`SizeFitVerdict`.
    """
    text = (prompt or "").strip()
    tier = _length_tier(text)
    n_hits, _ = _count_matches(text.lower(), NECESSARY_TRIGGERS)
    ladder_list = list(ladder or ())

    def _keep(reason: str, index: int) -> SizeFitVerdict:
        # All "no swap" exits differ only in reason and ladder index.
        return SizeFitVerdict(
            fit=True,
            suggested_model=None,
            reason=reason,
            current_index=index,
            length_tier=tier,
            complexity_hits=n_hits,
        )

    try:
        idx = ladder_list.index(current_model)
    except ValueError:
        return _keep(
            f"`{current_model}` is not on the configured ladder "
            f"({ladder_list or 'empty'}) — no downgrade path.",
            -1,
        )

    if idx == 0:
        return _keep(
            f"`{current_model}` is already on the smallest tier.", idx,
        )

    if lens in _NO_DOWNGRADE_LENSES:
        return _keep(
            f"Lens `{lens}` keeps the top tier for dissent quality; "
            f"no downgrade suggested.",
            idx,
        )

    if n_hits >= 2 or tier == "long":
        return _keep(
            f"Complexity warrants the current tier "
            f"(length={tier}, complexity_hits={n_hits}).",
            idx,
        )

    if n_hits == 0:
        # Only short / medium prompts reach this point, and idx >= 1 is
        # guaranteed by the smallest-tier return above, so the rung
        # below always exists.
        suggested = ladder_list[idx - 1]
        if tier == "short":
            reason = (
                f"Short prompt ({len(text)} chars) with no complexity "
                f"markers — `{suggested}` should answer as well."
            )
        else:
            reason = (
                f"Medium-length prompt with no complexity markers — "
                f"`{suggested}` likely sufficient."
            )
        return SizeFitVerdict(
            fit=False,
            suggested_model=suggested,
            reason=reason,
            current_index=idx,
            length_tier=tier,
            complexity_hits=n_hits,
        )

    # Exactly one complexity marker on a short / medium prompt.
    return _keep(
        f"Length / complexity balance keeps current tier "
        f"(length={tier}, complexity_hits={n_hits}).",
        idx,
    )
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def downgrade_message(verdict: SizeFitVerdict, current_model: str) -> str:
    """Build the user-facing downgrade suggestion.

    The dispatcher emits this when ``model_downgrade`` is enabled and
    ``classify_size_fit`` came back with ``fit=False``. At the agent
    surface it is followed by one numbered-options prompt (1=use
    suggested / 2=keep current / 3=skip this member).

    Args:
        verdict: Size-fit result; its ``suggested_model`` and ``reason``
            are quoted verbatim.
        current_model: Model id currently selected for the member.

    Returns:
        A single-paragraph message.
    """
    oversized = (
        f"Current model `{current_model}` looks oversized for this request."
    )
    proposal = f"Suggested: `{verdict.suggested_model}`"
    why = f"(reason: {verdict.reason})."
    return " ".join((oversized, proposal, why))
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
# --- Phase 10: Five-class impact classifier + routing --------------------
|
|
465
|
+
|
|
466
|
+
ImpactClass = Literal[
|
|
467
|
+
"trivial", "low_impact", "medium_impact", "high_impact", "user_required",
|
|
468
|
+
]
|
|
469
|
+
|
|
470
|
+
#: Classes that are structurally LOCKED to ``user`` routing. The
|
|
471
|
+
#: schema validator in ``config.py`` rejects any attempt to remap
|
|
472
|
+
#: these via ``decision_resolution.<class>.mode``. Iron Law per the
|
|
473
|
+
#: roadmap: security / auth / billing / tenant-boundary / migration /
|
|
474
|
+
#: production-destructive decisions always reach the user.
|
|
475
|
+
LOCKED_IMPACT_CLASSES: frozenset[ImpactClass] = frozenset(
|
|
476
|
+
{"high_impact", "user_required"},
|
|
477
|
+
)
|
|
478
|
+
|
|
479
|
+
#: User-fence markers that force ``user_required`` regardless of any
|
|
480
|
+
#: other signal. Mirrors the "fenced step" language in
|
|
481
|
+
#: ``scope-control``: when the user has set a review gate, the agent
|
|
482
|
+
#: never auto-routes the question away from them.
|
|
483
|
+
_USER_FENCE_MARKERS: tuple[str, ...] = (
|
|
484
|
+
"ask me", "review first", "plan only", "don't decide", "do not decide",
|
|
485
|
+
"wait for me", "I'll decide", "i will decide", "let me decide",
|
|
486
|
+
"frag mich", "warte auf mich",
|
|
487
|
+
)
|
|
488
|
+
|
|
489
|
+
#: Trigger words per impact class. Whole-word match via
|
|
490
|
+
#: :func:`_count_matches`. Ordered by structural severity — when a
|
|
491
|
+
#: prompt matches multiple classes, the higher-severity class wins
|
|
492
|
+
#: (handled by the override precedence in :func:`classify_impact`).
|
|
493
|
+
IMPACT_TRIGGERS: dict[ImpactClass, tuple[str, ...]] = {
|
|
494
|
+
"trivial": (
|
|
495
|
+
"naming", "rename", "name this", "what should i call",
|
|
496
|
+
"whitespace", "indent", "indentation", "comment style",
|
|
497
|
+
"import order", "import ordering", "variable case", "snake_case",
|
|
498
|
+
"camelcase", "typo", "spacing",
|
|
499
|
+
),
|
|
500
|
+
"low_impact": (
|
|
501
|
+
"service vs repository", "repository vs service", "idiom",
|
|
502
|
+
"dto", "dto vs array", "value object", "job vs sync",
|
|
503
|
+
"queue vs sync", "test extension", "test suffix", "trait vs class",
|
|
504
|
+
"helper vs static", "use composition", "use inheritance",
|
|
505
|
+
),
|
|
506
|
+
"medium_impact": (
|
|
507
|
+
"api shape", "endpoint shape", "contract change", "contract update",
|
|
508
|
+
"cross-module", "cross module", "module boundary", "package boundary",
|
|
509
|
+
"interface change", "signature change", "breaking change",
|
|
510
|
+
),
|
|
511
|
+
"high_impact": (
|
|
512
|
+
"security", "auth", "authentication", "authorization", "permission",
|
|
513
|
+
"tenant", "tenants", "tenant boundary", "migration", "schema migration",
|
|
514
|
+
"production", "prod database", "destructive", "drop table", "truncate",
|
|
515
|
+
"delete column", "billing", "secret", "secrets", "api key",
|
|
516
|
+
"credentials", "encryption", "sso", "oauth", "iam",
|
|
517
|
+
"policy change", "data retention", "personal data", "pii",
|
|
518
|
+
),
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
@dataclass(frozen=True)
class ImpactVerdict:
    """Immutable result of an impact classification (Phase 10).

    Attributes:
        impact_class: The assigned :data:`ImpactClass`.
        confidence: Self-rated certainty in the verdict, 0.0–1.0. The
            routing layer's ``confidence_threshold`` gate reads it:
            a high-confidence ``low_impact`` skips council, a
            low-confidence one falls through to council (Phase 11) or
            the user.
        rationale: Single-line explanation fit for inline display in
            session.md; names the matched trigger bucket where there
            is one.
        category: Trigger bucket that matched best, or
            ``"unclassified"`` when no marker fired and the class is a
            default.
    """

    impact_class: ImpactClass
    confidence: float
    rationale: str
    category: str
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def classify_impact(question_text: str) -> ImpactVerdict:
    """Classify a pending agent question by stakes / blast-radius.

    Heuristic, keyword-shape based — no LLM call, fully explainable.
    Precedence (highest wins): user-fence marker → high_impact markers
    → medium_impact markers → low_impact markers → trivial markers →
    default fallback. Confidence is rule-based and reflects how many
    distinct markers fired, not learned probability:
    ``min(1.0, 0.5 + 0.15 * hits)``, with ``high_impact`` never
    reported below ``0.85``.

    Args:
        question_text: The pending question text the agent is about
            to surface. Whitespace-stripped before scanning; empty
            input maps to ``user_required`` / confidence ``1.0`` (no
            agent should silently resolve an empty question).

    Returns:
        :class:`ImpactVerdict` with class, confidence, rationale,
        and matched bucket.
    """
    text = (question_text or "").strip()
    if not text:
        return ImpactVerdict(
            impact_class="user_required",
            confidence=1.0,
            rationale="Empty question — user must clarify.",
            category="empty",
        )

    lower = text.lower()

    # User fence → user_required, beats every other signal. The agent
    # never auto-routes around an explicit review gate.
    for marker in _USER_FENCE_MARKERS:
        if _compile(marker).search(lower):
            return ImpactVerdict(
                impact_class="user_required",
                confidence=1.0,
                rationale=(
                    f"User-fence marker (`{marker}`) — explicit review "
                    f"gate, routes to user regardless of topic."
                ),
                category="user_fence",
            )

    # Severity precedence: walk the classes from most to least severe
    # and stop at the first one with at least one hit. Less severe
    # classes cannot change the outcome, so they are never scanned.
    for cls in ("high_impact", "medium_impact", "low_impact", "trivial"):
        hits, bucket = _count_matches(lower, {cls: IMPACT_TRIGGERS[cls]})
        if not hits:
            continue
        bucket = bucket or cls
        confidence = min(1.0, 0.5 + 0.15 * hits)
        # high_impact is Iron-Law: a single explicit marker already
        # counts as strong evidence, so its confidence has a floor of
        # 0.85 and is never reported lower.
        if cls == "high_impact":
            confidence = max(confidence, 0.85)
        return ImpactVerdict(
            impact_class=cls,
            confidence=confidence,
            rationale=(
                f"Matched {hits} `{cls}` marker(s) in bucket `{bucket}` — "
                f"confidence {confidence:.2f}."
            ),
            category=bucket,
        )

    # No markers fired — default to medium_impact / low confidence so
    # the routing layer falls through to council or user rather than
    # silently letting the agent resolve.
    return ImpactVerdict(
        impact_class="medium_impact",
        confidence=0.3,
        rationale=(
            "No impact markers fired — defaulting to `medium_impact` at "
            "low confidence; routing layer should escalate."
        ),
        category="unclassified",
    )
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def load_validated_phrases(corpus_path: "object") -> tuple[str, ...]:
    """Load the normalised ``## Validated`` question strings of one corpus.

    This function is a forwarding shim only: the actual parsing is done by
    :func:`scripts.ai_council.low_impact_corpus.load_validated_phrases`
    (the hardened parser from step-9 P4). Routing deliberately uses this
    lenient loader so that a broken corpus never blocks classification;
    strict contract validation is available separately through
    :func:`scripts.ai_council.low_impact_corpus.parse_corpus_strict`.

    Args:
        corpus_path: Corpus location, handed to the delegate unchanged.

    Returns:
        Whatever the delegate returns (annotated as a tuple of strings).
    """
    # Resolved at call time rather than at module import time.
    # NOTE(review): presumably this avoids an import cycle — confirm.
    from scripts.ai_council.low_impact_corpus import (
        load_validated_phrases as _corpus_loader,
    )

    return _corpus_loader(corpus_path)
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def classify_impact_with_corpus(
    question_text: str,
    corpus_paths: "tuple[object, ...] | None" = None,
) -> ImpactVerdict:
    """Classify a question, letting a Validated corpus hit win (Phase 12).

    The marker-based :func:`classify_impact` verdict is computed first and
    is returned as-is when any of the following holds:

    * its class is in :data:`LOCKED_IMPACT_CLASSES` (the Iron Law: a
      locked verdict is never overridden by a corpus hit),
    * ``corpus_paths`` is ``None`` or empty,
    * the question is empty after normalisation,
    * no ``## Validated`` phrase equals the normalised question.

    Otherwise a ``low_impact`` verdict with confidence ``0.9`` and
    category ``corpus_validated`` is returned.

    Args:
        question_text: Raw question text; ``None``/empty is tolerated for
            the normalisation step.
        corpus_paths: Corpora to consult, in priority order (project-local
            first, upstream seed second). Scanning stops at the first
            match, so later corpora are not loaded once a hit is found.

    Returns:
        Either the marker-based verdict or the corpus-validated
        ``low_impact`` verdict described above.
    """
    verdict = classify_impact(question_text)

    # Locked classes and the "no corpus configured" case both keep the
    # marker-based verdict untouched.
    if verdict.impact_class in LOCKED_IMPACT_CLASSES or not corpus_paths:
        return verdict

    # Normalise: lowercase, turn punctuation into spaces, collapse runs of
    # whitespace, trim. Comparison below is exact equality on this form.
    lowered = (question_text or "").lower()
    depunctuated = re.sub(r"[^\w\s]", " ", lowered)
    needle = re.sub(r"\s+", " ", depunctuated).strip()
    if not needle:
        return verdict

    # Lazy generator: each corpus is loaded only when reached, and the
    # scan short-circuits on the first equal phrase.
    corpus_hit = any(
        needle == candidate
        for corpus in corpus_paths
        for candidate in load_validated_phrases(corpus)
    )
    if not corpus_hit:
        return verdict

    return ImpactVerdict(
        impact_class="low_impact",
        confidence=0.9,
        rationale=(
            "Matched Validated corpus entry — routing as "
            "`low_impact` (Phase 12 learning loop)."
        ),
        category="corpus_validated",
    )
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
# The three destinations a pending question can be dispatched to.
ResolutionMode = Literal["agent", "council", "user"]
|
|
688
|
+
# The same modes as an ordered tuple; route_decision moves one position to
# the right when the verdict's confidence is below the configured threshold.
_RESOLUTION_RUNGS: tuple[ResolutionMode, ...] = ("agent", "council", "user")
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
@dataclass(frozen=True)
class DecisionRouting:
    """Immutable result of routing one pending question (Phase 10).

    Pairs an :class:`ImpactVerdict` with the resolution mode chosen for it
    after the per-class ``DecisionResolutionEntry`` from config, the
    Iron-Law lock and the confidence gate have been applied. The mode is
    what the chokepoint should dispatch to.

    Attributes:
        verdict: The impact classification the routing was derived from.
        mode: Resolution mode after Iron-Law + confidence-gate.
        upgraded: ``True`` when the confidence threshold moved the mode
            one rung up (e.g. ``agent`` → ``council``).
        rationale: Single-line explanation, suitable for session.md.
    """

    verdict: ImpactVerdict
    mode: ResolutionMode
    upgraded: bool
    rationale: str
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def route_decision(
    question_text: str,
    classes: dict[str, "object"],
) -> DecisionRouting:
    """Classify + route a pending agent question.

    Resolution order:

    1. No config entry for the classified impact class → ``user``.
    2. Impact class in :data:`LOCKED_IMPACT_CLASSES` → ``user``
       (Iron Law; config and confidence are ignored).
    3. Configured mode is not one of ``agent`` / ``council`` / ``user``
       → ``user``, reported as an upgrade. This fail-closed check applies
       regardless of confidence, so a misconfigured mode can never be
       returned as the final mode.
    4. Configured mode below ``user`` and confidence under the threshold
       → one rung up (``agent`` → ``council`` → ``user``).
    5. Otherwise the configured mode is used unchanged.

    Args:
        question_text: The text the agent was about to ask the user.
        classes: Mapping ``impact_class -> DecisionResolutionEntry``
            (typed loosely to keep this module free of a config
            import cycle). Each entry should expose ``mode`` and
            ``confidence_threshold`` attributes; missing attributes
            default to ``"user"`` and ``0.6`` respectively.

    Returns:
        :class:`DecisionRouting` with the final mode. Iron Law:
        :data:`LOCKED_IMPACT_CLASSES` always returns ``mode="user"``
        regardless of config or confidence.
    """
    verdict = classify_impact(question_text)
    entry = classes.get(verdict.impact_class)
    if entry is None:
        # No config — Iron-Law fallback to user.
        return DecisionRouting(
            verdict=verdict,
            mode="user",
            upgraded=False,
            rationale=(
                f"No routing entry for `{verdict.impact_class}` — "
                "defaulting to user (Iron-Law fallback)."
            ),
        )

    base_mode: ResolutionMode = getattr(entry, "mode", "user")  # type: ignore[assignment]
    threshold: float = getattr(entry, "confidence_threshold", 0.6)

    if verdict.impact_class in LOCKED_IMPACT_CLASSES:
        return DecisionRouting(
            verdict=verdict,
            mode="user",
            upgraded=False,
            rationale=(
                f"`{verdict.impact_class}` is Iron-Law locked to `user` "
                "— bypass refused."
            ),
        )

    mode: ResolutionMode
    if base_mode not in _RESOLUTION_RUNGS:
        # Unknown configured mode: fail closed to the user regardless of
        # confidence, instead of handing an invalid mode to the caller.
        mode = "user"
        upgraded = True
    elif base_mode != "user" and verdict.confidence < threshold:
        # base_mode is "agent" or "council" here, so the next rung exists.
        mode = _RESOLUTION_RUNGS[_RESOLUTION_RUNGS.index(base_mode) + 1]
        upgraded = True
    else:
        mode = base_mode
        upgraded = False

    rationale = (
        f"Class `{verdict.impact_class}` (confidence "
        f"{verdict.confidence:.2f}, threshold {threshold:.2f}) → "
        f"mode `{mode}`"
        + (f" (upgraded from `{base_mode}`)" if upgraded else "")
        + "."
    )
    return DecisionRouting(
        verdict=verdict,
        mode=mode,
        upgraded=upgraded,
        rationale=rationale,
    )
|