npm - @event4u/agent-config - Versions diffs - 1.15.0 → 1.17.0 - Mend

@event4u/agent-config 1.15.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (354) hide show

package/scripts/ai_council/_one_off_roundtrip.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""One-off Phase-1 round-trip runner.
+Used exactly once to generate the evidence artefact required to lift
+the capture-only fence on `road-to-ai-council.md` Phase 2+ and the
+end-to-end verification on `road-to-council-modes.md` Phase 2a.
+Not part of the public CLI surface — `/council` remains the supported
+entry point. This script is committed under `scripts/ai_council/` so
+the evidence is reproducible from the git history alone.
+Invocation:
+    .venv/bin/python -m scripts.ai_council._one_off_roundtrip
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from scripts.ai_council.bundler import bundle_roadmap
+from scripts.ai_council.clients import AnthropicClient, load_anthropic_key
+from scripts.ai_council.orchestrator import (
+    CostBudget,
+    CouncilQuestion,
+    consult,
+    estimate,
+)
+from scripts.ai_council.pricing import estimate_cost, load_prices
+from scripts.ai_council.project_context import detect_project_context
+from scripts.ai_council.session import SessionManifest, save as save_session
+REPO_ROOT = Path(__file__).resolve().parents[2]  # third ancestor of this file: the directory that holds scripts/
+ROADMAP_PATH = REPO_ROOT / "agents/roadmaps/road-to-council-modes.md"  # roadmap that main() bundles and sends for review
+ORIGINAL_ASK = (  # German-language request; main() passes it as original_ask and stores it in the session manifest
+    "Bitte review die folgende Roadmap (council-modes Phase 2c "
+    "Playwright). Die Maintainer-Recommendations für Q1-Q5 sind im "
+    "Block 'Decisions Required' bereits hinterlegt. Frage: sollten "
+    "wir die Recommendations annehmen wie sie sind, oder gibt es "
+    "blinde Flecken die wir vor dem Lift der capture-only fence "
+    "kläeren sollten?"  # NOTE(review): "kläeren" looks like a typo for "klären"; left untouched because the text is sent verbatim
+)
+def main() -> int:
+    api_key = load_anthropic_key()
+    client = AnthropicClient(api_key=api_key)
+    context = bundle_roadmap(ROADMAP_PATH)
+    project = detect_project_context(REPO_ROOT)
+    table = load_prices()
+    question = CouncilQuestion(
+        mode="roadmap",
+        user_prompt=context.text,
+        max_tokens=2048,
+    )
+    estimates = estimate(
+        question, [client], table,
+        project=project, original_ask=ORIGINAL_ASK,
+    )
+    print(f"[estimate] {client.name}/{client.model}: "
+          f"~{estimates[0].input_tokens} in + {estimates[0].output_tokens} out "
+          f"= ${estimates[0].total_usd:.4f}")
+    budget = CostBudget(
+        max_input_tokens=50_000,
+        max_output_tokens=20_000,
+        max_calls=10,
+        max_total_usd=0.50,
+    )
+    print(f"[consult] calling {client.name}/{client.model} ...")
+    responses = consult(
+        [client], question, budget,
+        table=table, project=project, original_ask=ORIGINAL_ASK,
+    )
+    if not responses or responses[0].error:
+        err = responses[0].error if responses else "no response"
+        print(f"[error] {err}", file=sys.stderr)
+        return 1
+    r = responses[0]
+    actual = estimate_cost(r.provider, r.model, r.input_tokens, r.output_tokens, table)
+    actual_usd = actual.total_usd
+    print(f"[done] tokens: {r.input_tokens} in / {r.output_tokens} out · "
+          f"latency: {r.latency_ms} ms · actual ${actual_usd:.4f}")
+    manifest = SessionManifest(
+        mode="roadmap",
+        artefact=str(ROADMAP_PATH.relative_to(REPO_ROOT)),
+        original_ask=ORIGINAL_ASK,
+        members=[f"{r.provider}/{r.model}"],
+        rounds=1,
+        cost_usd_estimated=estimates[0].total_usd,
+        cost_usd_actual=actual_usd,
+        extra={"purpose": "Phase 1 ai-council round-trip + Phase 2a council-modes E2E evidence"},
+    )
+    session_dir = save_session(manifest=manifest, responses=responses)
+    print(f"[saved] {session_dir.relative_to(REPO_ROOT)}/")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

package/scripts/ai_council/_one_off_rule_hardening_v1.py ADDED Viewed

@@ -0,0 +1,251 @@
+"""Council review of road-to-rule-hardening.md v1.
+Three independent self-check rules silently skipped within one
+session despite being valid, loaded, and active. Host agent drafted
+a 6-phase lightweight roadmap proposing a 3-tier hardening model
+(Mechanical / Nudge / Inherent-soft).
+Council task: validate the architecture and the pilot order before
+the host agent autonomously implements Tier 1 hooks.
+Invocation:
+    .venv/bin/python -m scripts.ai_council._one_off_rule_hardening_v1
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from scripts.ai_council.bundler import bundle_prompt
+from scripts.ai_council.clients import (
+    AnthropicClient,
+    OpenAIClient,
+    load_anthropic_key,
+    load_openai_key,
+)
+from scripts.ai_council.orchestrator import (
+    CostBudget,
+    CouncilQuestion,
+    consult,
+    estimate,
+)
+from scripts.ai_council.pricing import estimate_cost, load_prices
+from scripts.ai_council.project_context import detect_project_context
+from scripts.ai_council.session import SessionManifest, save as save_session
+REPO_ROOT = Path(__file__).resolve().parents[2]  # third ancestor of this file: the directory that holds scripts/
+ROADMAP_PATH = REPO_ROOT / "agents/roadmaps/road-to-rule-hardening.md"  # roadmap whose text main() appends to PROMPT_HEADER
+ORIGINAL_ASK = (  # short task statement; main() passes it as original_ask and stores it in the session manifest
+    "Three independent self-check rules silently skipped within one "
+    "session: model-recommendation (task-start gate), context-hygiene "
+    "(turn-count handoff), roadmap-progress-sync (file-write side "
+    "effect). All valid, loaded, active. Host agent drafted a 6-phase "
+    "lightweight roadmap proposing a 3-tier hardening model. Council "
+    "task: validate the architecture and the pilot ordering before "
+    "implementation."
+)
+PROMPT_HEADER = """\
+# Council Review — road-to-rule-hardening.md v1
+## Background (verbatim, do not re-frame)
+Within a single session the host agent observed three rules failing
+the same way:
+| Rule | Trigger that should have fired | What happened |
+|---|---|---|
+| `model-recommendation` | Opus recommendation at task-start | silently skipped |
+| `context-hygiene` | Handoff at turn-count >= 20/40/60 | silently skipped |
+| `roadmap-progress-sync` | Regenerate dashboard on roadmap touch | silently skipped |
+All three are valid auto-load rules in `.augment/rules/`, all three
+are part of the active rule set surfaced to the agent. None of them
+fired. Hypothesis: they share a structural property — the trigger
+is observable only inside the agent, the check runs in the agent's
+head, no deterministic gate sits between decision and output. When
+the agent is in flow (multi-tool work, file edits, council
+orchestration), the self-check is the first thing to be dropped.
+The host agent drafted a 6-phase lightweight roadmap proposing a
+3-tier model:
+- **Tier 1 — Mechanical.** Hook + deterministic check.
+  Agent-independent.
+- **Tier 2 — Nudge.** Hook detects, marker injected, agent
+  formulates the response.
+- **Tier 3 — Inherent soft.** No platform mechanism exists. Either
+  accept as self-check, convert to user-invoked `/`-command, or
+  deprecate.
+Pilot order proposed by user: roadmap-progress-sync (1) → onboarding-gate
+(3) → context-hygiene (2). Hook surface available today: Augment
+PostToolUse / Stop, Claude Code Stop / SessionStart. Cursor / Cline /
+Windsurf parity is explicitly out of scope for this roadmap.
+Existing precedent: `chat-history-cadence` is the only rule already
+mechanically hardened (heartbeat marker pattern). Inventory:
+57 rules total, 18 contain self-check phrases (`MUST`, `MANDATORY`,
+`pre-send`, `before drafting`), 6 mention hooks today.
+## Your task
+Review road-to-rule-hardening.md v1 (full text appended below).
+Be adversarial — the host will autonomously execute the pilots, so
+catch architectural mistakes now, not after Phase 4.
+1. **Three-tier model:** is Mechanical / Nudge / Inherent-soft the
+   right partition, or does it collapse meaningful distinctions?
+   Specifically: should "Tier 2 Nudge" exist at all, or is it just a
+   weaker Tier 1?
+2. **Pilot order (1, 3, 2):** does roadmap-progress-sync prove the
+   pattern, or is it too narrow to generalise? Is per-turn counter
+   (context-hygiene) actually feasible cross-platform, or should it
+   move to Tier 3?
+3. **Failure-class generalisation:** are there self-check rules in
+   the inventory that the audit will MISS because they fire so
+   rarely the agent has not yet observed a skip? Name 1-2 likely
+   candidates.
+4. **Cross-platform scope:** roadmap defers Cursor/Cline/Windsurf.
+   Is this honest scope or hidden tech debt that will silently
+   block Phase 4 rollout?
+5. **Tier 3 disposition:** the roadmap allows accept-as-soft as a
+   valid disposition. Is that a real choice, or a way to declare
+   victory without solving anything?
+## Output contract (STRICT)
+For EACH of the six phases:
+```
+### Phase N — <title>
+**Verdict:** <ACCEPT | PARTIAL | REJECT>
+**What v1 gets right (1 sentence):** ...
+**What v1 misses or over-reaches (1-2 sentences):** ...
+**Concrete change to v2 (binding):** ...
+```
+Then five answers to the questions above (numbered 1-5, ≤ 3 sentences
+each).
+Then a final block:
+```
+### Greenlight verdict
+<one of: FULL GREENLIGHT — proceed with pilots / CONDITIONAL GREENLIGHT
+— apply N revisions then proceed / BLOCKED — major rework needed>
+**Binding revisions for v2 (numbered, ≤ 6):** ...
+**Pilot order recommendation:** <1,3,2 | 1,2,3 | 3,1,2 | other>
+**One-line architectural risk you would still proceed with:** ...
+```
+Total response budget: ≤ 1500 words. Do not re-write the roadmap.
+"""  # prompt preamble; main() joins it with the roadmap text using a "---" separator before bundling
+def _read(path: Path) -> str:
+    return path.read_text(encoding="utf-8") if path.exists() else ""
+def main() -> int:
+    anthropic = AnthropicClient(api_key=load_anthropic_key(), model="claude-sonnet-4-5")
+    openai = OpenAIClient(api_key=load_openai_key(), model="gpt-4o")
+    members = [anthropic, openai]
+    roadmap_text = _read(ROADMAP_PATH)
+    if not roadmap_text:
+        print(f"[error] roadmap not found: {ROADMAP_PATH}", file=sys.stderr)
+        return 1
+    bundle_text = "\n\n---\n\n".join([
+        PROMPT_HEADER,
+        "## Roadmap v1 (verbatim, the artefact to validate)\n\n" + roadmap_text,
+    ])
+    context = bundle_prompt(bundle_text)
+    project = detect_project_context(REPO_ROOT)
+    table = load_prices()
+    question = CouncilQuestion(
+        mode="prompt",
+        user_prompt=context.text,
+        max_tokens=4096,
+    )
+    estimates = estimate(
+        question, members, table, project=project, original_ask=ORIGINAL_ASK,
+    )
+    print("=== ESTIMATE (single round, max tokens) ===")
+    total_est = 0.0
+    for c, e in zip(members, estimates):
+        print(f"  {c.name}/{c.model}: ~{e.input_tokens} in + {e.output_tokens} out = ${e.total_usd:.4f}")
+        total_est += e.total_usd
+    print(f"  TOTAL per round (max): ${total_est:.4f}")
+    print()
+    budget = CostBudget(
+        max_input_tokens=200_000,
+        max_output_tokens=80_000,
+        max_calls=20,
+        max_total_usd=2.50,
+    )
+    rounds_collected: list[list] = []
+    def _on_round_complete(round_idx: int, round_responses) -> None:
+        rounds_collected.append(list(round_responses))
+        print(f"=== ROUND {round_idx + 1} COMPLETE ===")
+        for r in round_responses:
+            if r.error:
+                print(f"  [error] {r.provider}/{r.model}: {r.error}")
+                continue
+            actual = estimate_cost(r.provider, r.model, r.input_tokens, r.output_tokens, table)
+            print(f"  [done] {r.provider}/{r.model}: {r.input_tokens} in / "
+                  f"{r.output_tokens} out · {r.latency_ms} ms · ${actual.total_usd:.4f}")
+        print()
+    print("=== CONSULT (1 round) ===")
+    consult(
+        members, question, budget,
+        rounds=1,
+        on_round_complete=_on_round_complete,
+        table=table, project=project, original_ask=ORIGINAL_ASK,
+    )
+    if not rounds_collected:
+        print("[error] no rounds completed", file=sys.stderr)
+        return 1
+    actual_total = 0.0
+    for round_responses in rounds_collected:
+        for r in round_responses:
+            if r.error:
+                continue
+            actual = estimate_cost(r.provider, r.model, r.input_tokens, r.output_tokens, table)
+            actual_total += actual.total_usd
+    print(f"=== TOTAL ACTUAL: ${actual_total:.4f} ===")
+    final_round = rounds_collected[-1]
+    if not [r for r in final_round if not r.error]:
+        return 1
+    manifest = SessionManifest(
+        mode="prompt",
+        artefact=str(ROADMAP_PATH.relative_to(REPO_ROOT)),
+        original_ask=ORIGINAL_ASK,
+        members=[f"{r.provider}/{r.model}" for r in final_round],
+        rounds=len(rounds_collected),
+        cost_usd_estimated=total_est,
+        cost_usd_actual=actual_total,
+        extra={"purpose": "Council review of road-to-rule-hardening v1"},
+    )
+    session_dir = save_session(manifest=manifest, responses=rounds_collected)
+    print(f"[saved] {session_dir.relative_to(REPO_ROOT)}/")
+    return 1 if any(r.error for round_r in rounds_collected for r in round_r) else 0
+if __name__ == "__main__":
+    raise SystemExit(main())

package/scripts/ai_council/_one_off_structural_open_questions.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""One-off council run for the three open questions in road-to-structural-optimization.md v2.
+Forces a STRUCTURED multiple-choice verdict per question across 2 rounds
+(debate mode). Saves the session under agents/council-sessions/.
+Invocation:
+    .venv/bin/python -m scripts.ai_council._one_off_structural_open_questions
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from scripts.ai_council.bundler import bundle_roadmap
+from scripts.ai_council.clients import (
+    AnthropicClient,
+    OpenAIClient,
+    load_anthropic_key,
+    load_openai_key,
+)
+from scripts.ai_council.orchestrator import (
+    CostBudget,
+    CouncilQuestion,
+    consult,
+    estimate,
+)
+from scripts.ai_council.pricing import estimate_cost, load_prices
+from scripts.ai_council.project_context import detect_project_context
+from scripts.ai_council.session import SessionManifest, save as save_session
+REPO_ROOT = Path(__file__).resolve().parents[2]  # third ancestor of this file: the directory that holds scripts/
+ROADMAP_PATH = REPO_ROOT / "agents/roadmaps/road-to-structural-optimization.md"  # roadmap that main() bundles as context
+ORIGINAL_ASK = (  # short task statement; main() passes it as original_ask and stores it in the session manifest
+    "Lock the three open design questions on road-to-structural-optimization.md v2 "
+    "with binding A/B verdicts + 2-3 sentence rationale each. The roadmap text "
+    "is the context. Be decisive — say A or B (or C with new option). Avoid "
+    "'depends' or 'further investigation needed'."
+)
+OPEN_QUESTIONS_PROMPT = """\
+# Lock-the-Decision Council Round
+You have already reviewed road-to-structural-optimization.md v2 in two prior \
+rounds (architectural / sequencing critique). The roadmap author now needs \
+**binding verdicts** on the three open design questions. Stop re-reviewing the \
+roadmap. Pick A, B, or propose C with explicit justification.
+## Output Contract (STRICT)
+For each question, produce exactly this block:
+```
+### Q<n>: <short title>
+**Verdict:** <A | B | C>
+**Confidence:** <Low | Medium | High>
+**Rationale (2-4 sentences):** <why this option, what failure mode of the other you avoid>
+**Risk if wrong:** <one sentence — what breaks if your verdict is the wrong call>
+**Rollback if wrong:** <one sentence — how to recover if production reveals the choice was bad>
+```
+If you propose C (new option), keep the same block structure.
+## Q1 — Phase 3a Skill Consolidation Shape
+The four `judge-*` skills (`judge-bug-hunter`, `judge-code-quality`, \
+`judge-security-auditor`, `judge-test-coverage`) share procedure but \
+have distinct persona voices. Two consolidation shapes:
+- **A.** Keep four separate skills. Extract the shared procedure to \
+  `contexts/judges/judge-shared-procedure.md`. Each skill's SKILL.md \
+  loads the context via `load_context:` and adds its persona-specific \
+  delta (review heuristics, persona prompt, examples).
+- **B.** Single skill `judge` with `mode:` parameter \
+  (`mode: bug-hunter | code-quality | security-auditor | test-coverage`). \
+  One SKILL.md dispatches; persona contexts live at \
+  `contexts/judges/personas/<mode>.md` and are loaded based on `mode:`.
+**Decision criterion:** which preserves persona voice better at LLM \
+runtime, and which is more maintainable when a fifth judge persona is \
+added in 6 months?
+## Q2 — Phase 6 chat-history Rule Consolidation Shape
+Three rules (`chat-history-cadence`, `chat-history-ownership`, \
+`chat-history-visibility`) overlap on triggers and surface but each \
+encodes a distinct concern. Two consolidation shapes:
+- **A.** One rule `chat-history` + three contexts \
+  (`contexts/chat-history/cadence.md`, `.../ownership.md`, \
+  `.../visibility.md`). The rule body holds the unified trigger language \
+  and routes to the right context section based on the matched anchor.
+- **B.** Router rule `chat-history` + three thin specialist rules \
+  (`chat-history-cadence` etc.) reduced to <30 LOC each. Router fires \
+  first, dispatches to one specialist based on signal type \
+  (heartbeat / ownership-detection / cadence-decision).
+**Decision criterion:** which is more maintainable when a fourth concern \
+(e.g., `chat-history-archive` for log rotation) is added in 12 months, \
+and which has lower cognitive load for an agent reading the rule first \
+time mid-task?
+## Q3 — Phase 5 Safety-Floor Rule Endorsement
+Phase 5 tightens the always-rule budget. The four safety-floor rules are:
+- `non-destructive-by-default` (~4,607 chars)
+- `commit-policy` (~2,800 chars)
+- `scope-control` (~3,900 chars)
+- `verify-before-complete` (~3,200 chars)
+Should these be in scope for slimming?
+- **A.** Endorse keeping all four UNTOUCHED. Slimming risks weakening Iron \
+  Laws under budget pressure. Phase 5 hits target without them.
+- **B.** Allow slimming with stricter 2A.4-style obligation-diff (every \
+  MUST/NEVER preserved verbatim, mechanics moved to context). Treat them \
+  like normal always-rules.
+**Decision criterion:** is the marginal budget gain worth the residual \
+risk of an Iron Law regression slipping through the obligation-diff \
+gate?
+## Final Output
+After the three blocks, add:
+```
+### Cross-question coupling
+<2-3 sentences: do your verdicts on Q1/Q2/Q3 reinforce or conflict with \
+each other? Are there hidden dependencies between them?>
+```
+Do not write an executive summary. Do not re-review v2. Pick, justify, \
+ship. Total response budget: ≤ 1500 words.
+"""  # main() prepends this text to the bundled roadmap, separated by a "---" line
+def main() -> int:
+    anthropic = AnthropicClient(api_key=load_anthropic_key(), model="claude-sonnet-4-5")
+    openai = OpenAIClient(api_key=load_openai_key(), model="gpt-4o")
+    members = [anthropic, openai]
+    context = bundle_roadmap(ROADMAP_PATH)
+    project = detect_project_context(REPO_ROOT)
+    table = load_prices()
+    user_prompt = OPEN_QUESTIONS_PROMPT + "\n\n---\n\n" + context.text
+    question = CouncilQuestion(
+        mode="roadmap",
+        user_prompt=user_prompt,
+        max_tokens=4096,
+    )
+    estimates = estimate(
+        question, members, table, project=project, original_ask=ORIGINAL_ASK,
+    )
+    print("=== ESTIMATE (single round, max tokens) ===")
+    total_est = 0.0
+    for c, e in zip(members, estimates):
+        print(f"  {c.name}/{c.model}: ~{e.input_tokens} in + {e.output_tokens} out = ${e.total_usd:.4f}")
+        total_est += e.total_usd
+    print(f"  TOTAL per round (max): ${total_est:.4f}")
+    print(f"  TOTAL 2 rounds (max): ${total_est * 2:.4f}")
+    print()
+    budget = CostBudget(
+        max_input_tokens=200_000,
+        max_output_tokens=80_000,
+        max_calls=20,
+        max_total_usd=2.50,
+    )
+    rounds_collected: list[list] = []
+    def _on_round_complete(round_idx: int, round_responses) -> None:
+        rounds_collected.append(list(round_responses))
+        print(f"=== ROUND {round_idx + 1} COMPLETE ===")
+        for r in round_responses:
+            if r.error:
+                print(f"  [error] {r.provider}/{r.model}: {r.error}")
+                continue
+            actual = estimate_cost(r.provider, r.model, r.input_tokens, r.output_tokens, table)
+            print(f"  [done] {r.provider}/{r.model}: {r.input_tokens} in / "
+                  f"{r.output_tokens} out · {r.latency_ms} ms · ${actual.total_usd:.4f}")
+        print()
+    print("=== CONSULT (2 rounds, debate mode) ===")
+    consult(
+        members, question, budget,
+        rounds=2,
+        on_round_complete=_on_round_complete,
+        table=table, project=project, original_ask=ORIGINAL_ASK,
+    )
+    if not rounds_collected:
+        print("[error] no rounds completed", file=sys.stderr)
+        return 1
+    actual_total = 0.0
+    for round_responses in rounds_collected:
+        for r in round_responses:
+            if r.error:
+                continue
+            actual = estimate_cost(r.provider, r.model, r.input_tokens, r.output_tokens, table)
+            actual_total += actual.total_usd
+    print(f"=== TOTAL ACTUAL: ${actual_total:.4f} (across {len(rounds_collected)} rounds) ===")
+    final_round = rounds_collected[-1]
+    if not [r for r in final_round if not r.error]:
+        return 1
+    manifest = SessionManifest(
+        mode="roadmap",
+        artefact=str(ROADMAP_PATH.relative_to(REPO_ROOT)),
+        original_ask=ORIGINAL_ASK,
+        members=[f"{r.provider}/{r.model}" for r in final_round],
+        rounds=len(rounds_collected),
+        cost_usd_estimated=total_est * 2,
+        cost_usd_actual=actual_total,
+        extra={"purpose": "council lock on three open questions of structural-optimization v2"},
+    )
+    session_dir = save_session(manifest=manifest, responses=rounds_collected)
+    print(f"[saved] {session_dir.relative_to(REPO_ROOT)}/")
+    return 1 if any(r.error for round_r in rounds_collected for r in round_r) else 0
+if __name__ == "__main__":
+    raise SystemExit(main())