kairos-chain 3.19.1 → 3.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +71 -0
- data/lib/kairos_mcp/version.rb +1 -1
- data/templates/knowledge/multi_llm_review_workflow/multi_llm_review_workflow.md +106 -2
- data/templates/skillsets/agent/config/agent.yml +15 -2
- data/templates/skillsets/agent/lib/agent/review_hint.rb +84 -0
- data/templates/skillsets/agent/lib/agent/trigger_validator.rb +69 -0
- data/templates/skillsets/agent/lib/agent.rb +2 -0
- data/templates/skillsets/agent/test/test_agent_complexity_review.rb +205 -4
- data/templates/skillsets/agent/test/test_decide_prompt_contract.rb +91 -0
- data/templates/skillsets/agent/test/test_review_hint.rb +168 -0
- data/templates/skillsets/agent/tools/agent_start.rb +14 -0
- data/templates/skillsets/agent/tools/agent_step.rb +319 -20
- data/templates/skillsets/llm_client/lib/llm_client/cursor_adapter.rb +9 -0
- data/templates/skillsets/llm_client/test/test_cursor_adapter_model.rb +111 -0
- data/templates/skillsets/multi_llm_review/config/multi_llm_review.yml +35 -1
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/build_review_bundle.rb +170 -0
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/consensus.rb +19 -1
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/feedback_formatter.rb +58 -0
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/sanitizer.rb +184 -0
- data/templates/skillsets/multi_llm_review/skillset.json +8 -4
- data/templates/skillsets/multi_llm_review/test/test_feedback_formatter.rb +97 -0
- data/templates/skillsets/multi_llm_review/test/test_multi_llm_review.rb +155 -1
- data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_bundle.rb +167 -0
- data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb +249 -0
- data/templates/skillsets/multi_llm_review/test/test_sanitizer.rb +213 -0
- data/templates/skillsets/multi_llm_review/tools/multi_llm_review.rb +80 -21
- data/templates/skillsets/multi_llm_review/tools/multi_llm_review_bundle.rb +139 -0
- data/templates/skillsets/multi_llm_review/tools/multi_llm_review_collect.rb +20 -3
- data/templates/skillsets/multi_llm_review/tools/multi_llm_review_wait.rb +313 -0
- metadata +16 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 20e2a223137f51dc61025e57dd6fd205a8f702ef923b5b0b2e0d464a308d279f
|
|
4
|
+
data.tar.gz: 51627cb487cf5fc2e46b8e6055bf36e0cf5c8839f15cb2c2236f2ff56efa2def
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9fd4b17a28bdc06b7b19195e7274ef41e4ba7a93fc70e2c3a38083c58eae85ea8dfd432cb7cc6219cf7ba49ba637dbed12c48ea12312f4a3f26f46dfb936438f
|
|
7
|
+
data.tar.gz: fadb35fdbf47eeebfc9b667685452a0c222ecb396dbb2031de08ac27b2df1de1a3985fc41c03e4e767d22a58ec98a77d52c2059e921f8084d1663a2a099bdd1d
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,77 @@ All notable changes to the `kairos-chain` gem will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
+
## [3.24.0] - 2026-04-27
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- **multi_llm_review_wait MCP tool** (Phase 1.5) — optional blocking gate
|
|
12
|
+
between `multi_llm_review` (Phase 1) and `multi_llm_review_collect`
|
|
13
|
+
(Phase 2). Wraps the existing `WaitForWorker.wait` polling loop and
|
|
14
|
+
exposes 6 distinct status codes (`ready`, `still_pending`, `crashed`,
|
|
15
|
+
`unknown_token`, `already_collected`, `past_collect_deadline`) each with
|
|
16
|
+
a `next_action` recovery hint pointing at the right next tool.
|
|
17
|
+
- **`next_action` hint on `multi_llm_review` delegation_pending response** —
|
|
18
|
+
structured `{tool, args, purpose}` field nudging the orchestrator to call
|
|
19
|
+
`multi_llm_review_wait` after persona Agent dispatch. MCP does not enforce
|
|
20
|
+
ordering; this is a hint, not a constraint, but in practice LLMs follow
|
|
21
|
+
it reliably.
|
|
22
|
+
- **Path A vs Path B disambiguation in workflow knowledge doc** — surfaces
|
|
23
|
+
the long-implicit distinction between the host-tracked Bash workflow
|
|
24
|
+
(Claude Code's `Bash(background)` pattern, statusbar shows `XX shells`)
|
|
25
|
+
and the MCP-managed SkillSet (detached worker, no host-side tracking,
|
|
26
|
+
polling required).
|
|
27
|
+
- New config keys under `delegation.parallel`:
|
|
28
|
+
`wait_poll_interval_seconds: 1.0`, `wait_max_default_seconds: 600`,
|
|
29
|
+
`wait_max_hard_cap_seconds: 1800`, `wait_still_pending_streak_limit: 3`.
|
|
30
|
+
- Streak guard: 3 consecutive `still_pending` returns escalate to
|
|
31
|
+
`crashed/wait_exhausted` so a wedged worker cannot trap the orchestrator
|
|
32
|
+
in an infinite wait loop.
|
|
33
|
+
- 14 new tests in `test_multi_llm_review_wait.rb` covering all status
|
|
34
|
+
paths, streak persistence/reset, hard cap clamping, deadline-remaining
|
|
35
|
+
clamping, and backward compatibility (collect still works without wait).
|
|
36
|
+
|
|
37
|
+
### Changed
|
|
38
|
+
|
|
39
|
+
- `multi_llm_review` SkillSet version 0.4.0 → 0.5.0.
|
|
40
|
+
- `delegation` instruction text now mentions wait → collect chain.
|
|
41
|
+
|
|
42
|
+
### Notes
|
|
43
|
+
|
|
44
|
+
- Backward compatible: callers that skip wait and call collect directly
|
|
45
|
+
still work via collect's existing internal polling.
|
|
46
|
+
- Design review (Codex GPT-5.5 + Cursor Composer-2 + Claude Team Opus 4.7)
|
|
47
|
+
produced 3/3 REVISE with 6-7 P1 issues; revisions R1-R14 captured in
|
|
48
|
+
handoff L2 `multi_llm_review_wait_tool_handoff` before implementation.
|
|
49
|
+
|
|
50
|
+
## [3.23.3] - 2026-04-27
|
|
51
|
+
|
|
52
|
+
### Documentation
|
|
53
|
+
|
|
54
|
+
- **multi_llm_review_workflow knowledge** — Added "Async/Parallel Collect
|
|
55
|
+
Timing — Iron Rule" subsection. Documents the workflow constraint that the
|
|
56
|
+
orchestrator must call `multi_llm_review_collect` immediately after persona
|
|
57
|
+
Agent reviews complete, without intervening user dialogue. Explains the
|
|
58
|
+
underlying mechanics (LLM is not event-driven; collect already polls
|
|
59
|
+
internally at 0.5s intervals; token expiry vs subprocess completion). Adds
|
|
60
|
+
recommended flow, anti-pattern, and manual recovery instructions.
|
|
61
|
+
- Updated stale `must_collect_by` default reference (600s → 1800s).
|
|
62
|
+
|
|
63
|
+
## [3.23.2] - 2026-04-26
|
|
64
|
+
|
|
65
|
+
### Fixed
|
|
66
|
+
|
|
67
|
+
- **multi_llm_review collect_deadline bug** — `timeout_seconds_override` no longer
|
|
68
|
+
leaves the orchestrator's submission window shorter than the worker lifespan.
|
|
69
|
+
In the async/parallel path, `collect_deadline` is now auto-extended to cover
|
|
70
|
+
`worker self_timeout + poll margin` so raising `timeout_seconds_override`
|
|
71
|
+
alone keeps the token alive while the worker is healthy.
|
|
72
|
+
- New `collect_deadline_seconds_override` argument on `multi_llm_review` for
|
|
73
|
+
explicit control of the orchestrator's submission window.
|
|
74
|
+
- Default `delegation.collect_deadline_seconds` raised from `600` (10 min) to
|
|
75
|
+
`1800` (30 min) to better fit interactive runs where user dialogue intervenes
|
|
76
|
+
between Phase 1 and `multi_llm_review_collect`.
|
|
77
|
+
|
|
7
78
|
## [3.17.0] - 2026-04-22
|
|
8
79
|
|
|
9
80
|
### Added
|
data/lib/kairos_mcp/version.rb
CHANGED
|
@@ -29,6 +29,54 @@ This skill covers:
|
|
|
29
29
|
For **WHO** (which LLM is good at what), see: `multi_llm_reviewer_evaluation`
|
|
30
30
|
For **development lifecycle** (design → implement → verify), see: `design_to_implementation_workflow`
|
|
31
31
|
|
|
32
|
+
## Two Execution Paths (read this first)
|
|
33
|
+
|
|
34
|
+
There are **two distinct execution paths** with the same name "multi-LLM review".
|
|
35
|
+
They differ in subprocess lifecycle ownership and completion-detection mechanics.
|
|
36
|
+
Pick the right one for your environment:
|
|
37
|
+
|
|
38
|
+
### Path A — Host-tracked (Bash workflow)
|
|
39
|
+
|
|
40
|
+
- **Trigger**: orchestrator (LLM) calls Claude Code's `Bash` tool with
|
|
41
|
+
`run_in_background: true` to spawn `claude -p`, `codex exec`, `agent -p` directly.
|
|
42
|
+
- **Process parent**: Claude Code (the host harness).
|
|
43
|
+
- **Completion detection**: **event-driven**. Claude Code's shell tracker monitors
|
|
44
|
+
the spawned shells; when they exit, the LLM is notified through the standard
|
|
45
|
+
tool-result mechanism. Statusbar shows `XX shells` while reviewers are running.
|
|
46
|
+
- **When to use**: interactive Claude Code sessions for one-off Tier 3 reviews.
|
|
47
|
+
- **Reference**: see "Orchestration Template" section below for the canonical
|
|
48
|
+
`Bash(background)` pattern.
|
|
49
|
+
|
|
50
|
+
### Path B — MCP-managed (multi_llm_review SkillSet)
|
|
51
|
+
|
|
52
|
+
- **Trigger**: orchestrator calls the MCP tool `multi_llm_review`.
|
|
53
|
+
- **Process parent**: the kairos-chain Ruby gem (MCP server). The gem forks a
|
|
54
|
+
detached worker (`bin/dispatch_worker.rb`) which calls `Process.setsid` and
|
|
55
|
+
spawns CLI reviewers as a separate session leader.
|
|
56
|
+
- **Completion detection**: **polling required**. Claude Code is not the parent,
|
|
57
|
+
so the spawned subprocesses do NOT appear in the `XX shells` statusbar count.
|
|
58
|
+
The orchestrator must call `multi_llm_review_collect` (and optionally
|
|
59
|
+
`multi_llm_review_wait` first) to observe completion.
|
|
60
|
+
- **When to use**: portable execution (other MCP hosts, autonomous Agent SkillSet),
|
|
61
|
+
or any case where you want the consensus computation done server-side.
|
|
62
|
+
- **Recommended chain (3-step)**: `multi_llm_review` → `multi_llm_review_wait` →
|
|
63
|
+
`multi_llm_review_collect`. Each Phase-1/1.5 response carries a `next_action`
|
|
64
|
+
hint pointing at the next tool. wait is optional but recommended — without it,
|
|
65
|
+
collect's internal polling still covers worker completion, but recovery hints
|
|
66
|
+
for `still_pending`, `crashed`, and `past_collect_deadline` are less explicit.
|
|
67
|
+
- **Reference**: see "Orchestrator Delegation Protocol" + "Async/Parallel Collect
|
|
68
|
+
Timing — Iron Rule" sections below.
|
|
69
|
+
|
|
70
|
+
### Quick selector
|
|
71
|
+
|
|
72
|
+
| Question | Answer |
|
|
73
|
+
|----------|--------|
|
|
74
|
+
| Are you in an interactive Claude Code session and just need one review? | **Path A** |
|
|
75
|
+
| Do you need this to work in Cursor / autonomous mode / other MCP host? | **Path B** |
|
|
76
|
+
| Do you want the consensus result inside the MCP tool response? | **Path B** |
|
|
77
|
+
| Did you observe `XX shells` in the statusbar last time it worked? | That was Path A |
|
|
78
|
+
| Did the run produce a `collect_token` and a `pending/<token>/` directory? | That was Path B |
|
|
79
|
+
|
|
32
80
|
## Roles
|
|
33
81
|
|
|
34
82
|
| Role | Who | Responsibility |
|
|
@@ -331,8 +379,8 @@ cross-model subprocess reviewers give epistemic diversity. The two are complemen
|
|
|
331
379
|
|
|
332
380
|
**Failure modes**:
|
|
333
381
|
- `expired_or_unknown_token`: orchestrator missed `must_collect_by` deadline
|
|
334
|
-
(default 600s), or token never existed. The pending
|
|
335
|
-
`multi_llm_review` again from scratch.
|
|
382
|
+
(default 1800s since v3.23.2; was 600s), or token never existed. The pending
|
|
383
|
+
review is gone; call `multi_llm_review` again from scratch.
|
|
336
384
|
- `error: invalid orchestrator_reviews`: persona count outside 2-4 or missing
|
|
337
385
|
required fields. Fix and retry collect with the same token.
|
|
338
386
|
- All-subprocess-failed at Call 1: returns error immediately; no token issued.
|
|
@@ -340,6 +388,62 @@ cross-model subprocess reviewers give epistemic diversity. The two are complemen
|
|
|
340
388
|
**Default**: `orchestrator_strategy` defaults to `"exclude"` (back-compat). Use
|
|
341
389
|
`"delegate"` explicitly until validated by use.
|
|
342
390
|
|
|
391
|
+
#### Async/Parallel Collect Timing — Iron Rule
|
|
392
|
+
|
|
393
|
+
When `delegation.parallel.default: true` (the v3.x default), Call 1 returns
|
|
394
|
+
`delegation_pending` **immediately** (~50ms) and a detached worker runs the
|
|
395
|
+
subprocess reviewers in parallel with the orchestrator's persona Agent
|
|
396
|
+
reviews. This is faster, but introduces a timing trap:
|
|
397
|
+
|
|
398
|
+
> **The orchestrator MUST call `multi_llm_review_collect` immediately after
|
|
399
|
+
> the persona Agent reviews complete — without intervening user dialogue,
|
|
400
|
+
> unrelated tool calls, or context switches.**
|
|
401
|
+
|
|
402
|
+
Why this matters:
|
|
403
|
+
|
|
404
|
+
- The LLM is **not event-driven**. When the worker finishes writing
|
|
405
|
+
`subprocess_status: "done"` to `state.json`, nothing wakes the orchestrator.
|
|
406
|
+
The orchestrator only notices when it next calls `multi_llm_review_collect`.
|
|
407
|
+
- `multi_llm_review_collect` already polls internally at
|
|
408
|
+
`poll_interval_seconds: 0.5` for up to `collect_max_wait_seconds: 420` (7min)
|
|
409
|
+
per call. Polling is not the bottleneck — the bottleneck is the orchestrator
|
|
410
|
+
forgetting to call collect at all.
|
|
411
|
+
- The token expires at `collect_deadline` (default 30min since v3.23.2). If
|
|
412
|
+
user dialogue or other work intervenes between persona Agent completion and
|
|
413
|
+
the collect call, the token can expire while the subprocess results sit
|
|
414
|
+
ready and unread on disk.
|
|
415
|
+
|
|
416
|
+
Recommended orchestrator flow (single LLM turn, no detours):
|
|
417
|
+
|
|
418
|
+
```
|
|
419
|
+
1. multi_llm_review(...) → receive delegation_pending + collect_token
|
|
420
|
+
2. Spawn persona Agent reviews (Agent tool, parallel, 2-4 personas)
|
|
421
|
+
3. As soon as ALL personas return → multi_llm_review_collect(collect_token, ...)
|
|
422
|
+
4. Return final consensus to user
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
Anti-pattern (do NOT do this):
|
|
426
|
+
|
|
427
|
+
```
|
|
428
|
+
1. multi_llm_review(...) → delegation_pending
|
|
429
|
+
2. Run persona Agent reviews
|
|
430
|
+
3. ❌ "By the way, while we wait, let me explain X to the user…"
|
|
431
|
+
4. ❌ User asks an unrelated question, conversation drifts
|
|
432
|
+
5. ❌ 30+ minutes later, finally try collect → expired_or_unknown_token
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
If the orchestrator is genuinely interrupted (user explicitly switches topic,
|
|
436
|
+
or persona Agent itself takes a long time and the orchestrator wants to
|
|
437
|
+
report progress), it should still **call collect first** — collect returns
|
|
438
|
+
quickly if the worker is already done, or blocks up to 7min if not. Either
|
|
439
|
+
way, the token stays alive and consensus is captured before resuming side
|
|
440
|
+
work.
|
|
441
|
+
|
|
442
|
+
Manual recovery if expiry happens: subprocess results are persisted at
|
|
443
|
+
`.kairos/multi_llm_review/pending/<token>/subprocess_results.json` and remain
|
|
444
|
+
readable until GC. Read them directly and synthesize manually, then re-run
|
|
445
|
+
`multi_llm_review` for fresh results if needed.
|
|
446
|
+
|
|
343
447
|
### Critical CLI Notes
|
|
344
448
|
|
|
345
449
|
- **Cursor Agent stdin**: `cat file | agent -p -` does NOT work. Use file-reference:
|
|
@@ -107,12 +107,25 @@ complexity_review:
|
|
|
107
107
|
post_act_review: true # enable medium-complexity post-ACT review
|
|
108
108
|
# Multi-LLM review integration (Gate 5.5c)
|
|
109
109
|
multi_llm_review:
|
|
110
|
-
|
|
111
|
-
|
|
110
|
+
# Phase 12 PR2: KairosChain framework default — multi-LLM review is on by
|
|
111
|
+
# default as structural compensation for current LLM metacognition limits.
|
|
112
|
+
# See knowledge: project_multi_llm_review_default_philosophy.
|
|
113
|
+
# Override with `enabled: false` if you specifically need legacy single-LLM
|
|
114
|
+
# behavior; `system_upgrade` 3-way merge preserves user overrides.
|
|
115
|
+
enabled: true
|
|
116
|
+
# rule_or_hint (default): OR-floor — review fires when EITHER the deterministic
|
|
117
|
+
# rule (trigger_on ∩ complexity[:signals]) fires OR the LLM-emitted
|
|
118
|
+
# review_hint.needed=true. The hint is ADDITIVE only; it cannot suppress the
|
|
119
|
+
# rule (Phase 12 §3.2 trust boundary).
|
|
120
|
+
# rule_only: legacy/diagnostic — ignore review_hint, rule path only.
|
|
121
|
+
# unknown values: fail-closed to rule_only (most restrictive).
|
|
122
|
+
trigger_mode: rule_or_hint
|
|
123
|
+
trigger_on: # validated against KNOWN_SIGNALS at session start (§12)
|
|
112
124
|
- l0_change
|
|
113
125
|
- design_scope
|
|
114
126
|
max_concurrent: 2 # override Dispatcher default
|
|
115
127
|
timeout_seconds: 300
|
|
128
|
+
insufficient_first_attempt: retry # retry | checkpoint (§3.8)
|
|
116
129
|
# Reviewer roster delegates to multi_llm_review/config/multi_llm_review.yml
|
|
117
130
|
|
|
118
131
|
# Audit
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module KairosMcp
|
|
4
|
+
module SkillSets
|
|
5
|
+
module Agent
|
|
6
|
+
# Strict validator for the LLM-emitted review_hint structure (Phase 12 §3.6).
|
|
7
|
+
#
|
|
8
|
+
# Critical property: this hint is ADDITIVE only (§3.2 OR-floor). A return value
|
|
9
|
+
# of `false` does NOT suppress rule-based triggers — rule_fired || hint_needed.
|
|
10
|
+
# Therefore malformed hints fail-through to `false` (no review request from
|
|
11
|
+
# this side), and the OR-floor lets the deterministic rule path fire on its own.
|
|
12
|
+
#
|
|
13
|
+
# Schema:
|
|
14
|
+
# review_hint: {
|
|
15
|
+
# needed: Boolean,
|
|
16
|
+
# reason: String | nil,
|
|
17
|
+
# urgency: 'low' | 'medium' | 'high' | nil
|
|
18
|
+
# }
|
|
19
|
+
class ReviewHint
|
|
20
|
+
VALID_URGENCY = %w[low medium high].freeze
|
|
21
|
+
|
|
22
|
+
# PR3 hardening: per-process counter of validation failures, exposed for
|
|
23
|
+
# observability. agent_status / introspection tools may read this to
|
|
24
|
+
# surface drift (e.g., DECIDE LLM repeatedly emitting malformed hints).
|
|
25
|
+
# Reset by tests via reset_failure_count!.
|
|
26
|
+
@failure_count = 0
|
|
27
|
+
class << self
|
|
28
|
+
attr_reader :failure_count
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def self.reset_failure_count!
|
|
32
|
+
@failure_count = 0
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Parse and validate. Returns boolean.
|
|
36
|
+
# On any validation failure, returns false (and logs + increments counter).
|
|
37
|
+
# The counter exposes audit signal without forcing a chain_record dependency
|
|
38
|
+
# in this hot path (Phase 12 kairos Prop 3: recognition without raise/break,
|
|
39
|
+
# but observable through @failure_count + log).
|
|
40
|
+
def self.parse(raw, logger: nil)
|
|
41
|
+
return false unless raw.is_a?(Hash)
|
|
42
|
+
|
|
43
|
+
needed = raw['needed']
|
|
44
|
+
unless needed == true || needed == false
|
|
45
|
+
note_failure(logger, "review_hint.needed must be boolean, got #{needed.inspect}")
|
|
46
|
+
return false
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
reason = raw['reason']
|
|
50
|
+
unless reason.nil? || reason.is_a?(String)
|
|
51
|
+
note_failure(logger, "review_hint.reason must be string or nil, got #{reason.class}")
|
|
52
|
+
return false
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
urgency = raw['urgency']
|
|
56
|
+
unless urgency.nil? || VALID_URGENCY.include?(urgency)
|
|
57
|
+
note_failure(logger, "review_hint.urgency must be one of #{VALID_URGENCY} or nil, got #{urgency.inspect}")
|
|
58
|
+
return false
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
needed
|
|
62
|
+
rescue StandardError => e
|
|
63
|
+
note_failure(logger, "review_hint parse error: #{e.class}: #{e.message}")
|
|
64
|
+
false
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def self.note_failure(logger, msg)
|
|
68
|
+
@failure_count += 1
|
|
69
|
+
log(logger, msg)
|
|
70
|
+
end
|
|
71
|
+
private_class_method :note_failure
|
|
72
|
+
|
|
73
|
+
def self.log(logger, msg)
|
|
74
|
+
if logger
|
|
75
|
+
logger.warn("[review_hint] #{msg}")
|
|
76
|
+
else
|
|
77
|
+
warn "[review_hint] #{msg}"
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
private_class_method :log
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module KairosMcp
|
|
4
|
+
module SkillSets
|
|
5
|
+
module Agent
|
|
6
|
+
# Validates agent.yml's complexity_review.multi_llm_review.trigger_on against
|
|
7
|
+
# the actual signal vocabulary produced by assess_decision_complexity.
|
|
8
|
+
#
|
|
9
|
+
# Phase 12 §12 / v0.4 P-1. Aligned with agent_step.rb:1005-1031.
|
|
10
|
+
#
|
|
11
|
+
# Fail-loud at session start (not silently at first cycle) so typos like
|
|
12
|
+
# "l0_chagne" don't bypass the review gate at runtime.
|
|
13
|
+
class TriggerValidator
|
|
14
|
+
KNOWN_SIGNALS = %w[
|
|
15
|
+
high_risk
|
|
16
|
+
many_steps
|
|
17
|
+
design_scope
|
|
18
|
+
l0_change
|
|
19
|
+
core_files
|
|
20
|
+
multi_file
|
|
21
|
+
state_mutation
|
|
22
|
+
].freeze
|
|
23
|
+
|
|
24
|
+
class ConfigurationError < StandardError; end
|
|
25
|
+
|
|
26
|
+
# @param trigger_on [Array<String>] from agent.yml
|
|
27
|
+
# @param multi_cfg [Hash, nil] complexity_review.multi_llm_review subtree;
|
|
28
|
+
# when provided, validate! warns on the rule_only + enabled + empty
|
|
29
|
+
# trigger_on combination (review gate effectively disabled despite enabled:true).
|
|
30
|
+
# @return [Array<String>] the validated, stringified signals (echo of input)
|
|
31
|
+
# @raise [ConfigurationError] on unknown signal name
|
|
32
|
+
def self.validate!(trigger_on, multi_cfg: nil)
|
|
33
|
+
stringified = Array(trigger_on).map(&:to_s)
|
|
34
|
+
if stringified.empty?
|
|
35
|
+
warn_if_review_unreachable(multi_cfg)
|
|
36
|
+
return []
|
|
37
|
+
end
|
|
38
|
+
unknown = stringified - KNOWN_SIGNALS
|
|
39
|
+
unless unknown.empty?
|
|
40
|
+
raise ConfigurationError,
|
|
41
|
+
"agent.yml complexity_review.multi_llm_review.trigger_on contains " \
|
|
42
|
+
"unknown signals: #{unknown.inspect}. Known: #{KNOWN_SIGNALS.inspect}"
|
|
43
|
+
end
|
|
44
|
+
stringified
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# PR3 hardening: surface configuration that would silently disable the
|
|
48
|
+
# review gate. enabled:true + trigger_on:[] under rule_only mode means
|
|
49
|
+
# rule never fires; under rule_or_hint, only LLM hints can ever trigger
|
|
50
|
+
# which is unreliable. Either case warrants an operator warning.
|
|
51
|
+
def self.warn_if_review_unreachable(multi_cfg)
|
|
52
|
+
return unless multi_cfg.is_a?(Hash) && multi_cfg['enabled']
|
|
53
|
+
mode = multi_cfg['trigger_mode'] || 'rule_or_hint'
|
|
54
|
+
if mode == 'rule_only'
|
|
55
|
+
warn '[trigger_validator] WARNING: enabled:true but trigger_on:[] with ' \
|
|
56
|
+
'trigger_mode:rule_only — review gate cannot fire. ' \
|
|
57
|
+
'Either populate trigger_on (e.g. [l0_change, design_scope]) or ' \
|
|
58
|
+
'set trigger_mode:rule_or_hint to allow LLM hints to trigger review.'
|
|
59
|
+
else
|
|
60
|
+
warn '[trigger_validator] note: trigger_on:[] under rule_or_hint — ' \
|
|
61
|
+
'review gate fires only on LLM-emitted review_hint.needed=true. ' \
|
|
62
|
+
'Consider adding [l0_change, design_scope] for structural floor.'
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
private_class_method :warn_if_review_unreachable
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -345,11 +345,17 @@ assert "test_merge_llm_cannot_lower: LLM low + structural high → final high" d
|
|
|
345
345
|
result[:level] == 'high'
|
|
346
346
|
end
|
|
347
347
|
|
|
348
|
-
assert "test_merge_llm_same_level: LLM medium + structural medium → final medium" do
|
|
348
|
+
assert "test_merge_llm_same_level: LLM medium + structural medium → final medium (signals trust-quarantined)" do
|
|
349
|
+
# Phase 12 §3.2 / v0.4 P-2 trust boundary: LLM-emitted signals MUST NOT enter
|
|
350
|
+
# :signals (which is the OR-floor input). They go into :complexity_hint_signals
|
|
351
|
+
# (advisory only). This test verifies the quarantine.
|
|
349
352
|
structural = { level: 'medium', signals: ['high_risk'] }
|
|
350
353
|
llm_hint = { 'level' => 'medium', 'signals' => ['moderate_scope'] }
|
|
351
354
|
result = step.send(:merge_complexity, structural, llm_hint)
|
|
352
|
-
result[:level] == 'medium' &&
|
|
355
|
+
result[:level] == 'medium' &&
|
|
356
|
+
result[:signals] == ['high_risk'] &&
|
|
357
|
+
!result[:signals].include?('moderate_scope') &&
|
|
358
|
+
result[:complexity_hint_signals].include?('moderate_scope')
|
|
353
359
|
end
|
|
354
360
|
|
|
355
361
|
assert "test_merge_nil_hint: nil LLM hint → structural unchanged" do
|
|
@@ -358,11 +364,13 @@ assert "test_merge_nil_hint: nil LLM hint → structural unchanged" do
|
|
|
358
364
|
result[:level] == 'medium'
|
|
359
365
|
end
|
|
360
366
|
|
|
361
|
-
assert "test_merge_symbol_keys: symbol-key llm_hint works" do
|
|
367
|
+
assert "test_merge_symbol_keys: symbol-key llm_hint works (signals quarantined to hint field)" do
|
|
362
368
|
structural = { level: 'low', signals: [] }
|
|
363
369
|
llm_hint = { level: 'high', signals: ['deep'] }
|
|
364
370
|
result = step.send(:merge_complexity, structural, llm_hint)
|
|
365
|
-
result[:level] == 'medium' &&
|
|
371
|
+
result[:level] == 'medium' &&
|
|
372
|
+
result[:signals] == [] &&
|
|
373
|
+
result[:complexity_hint_signals].include?('deep')
|
|
366
374
|
end
|
|
367
375
|
|
|
368
376
|
# ============================================================
|
|
@@ -573,6 +581,199 @@ assert "test_multi_llm_review_prompt: L0 review prompt generated" do
|
|
|
573
581
|
prompt.include?('L0 Change Review') && prompt.include?('evolve skill') && prompt.include?('test_goal')
|
|
574
582
|
end
|
|
575
583
|
|
|
584
|
+
# ============================================================
|
|
585
|
+
# Phase 12 §3.11 / PR3.5 — chain reject_unsanitized_for_chain_inline
|
|
586
|
+
# ============================================================
|
|
587
|
+
section "L0 chain-record reject (PR3.5)"
|
|
588
|
+
|
|
589
|
+
assert "safe content passes (returns nil)" do
|
|
590
|
+
step.send(:reject_unsanitized_for_chain_inline, 'plain safe text').nil?
|
|
591
|
+
end
|
|
592
|
+
|
|
593
|
+
assert "raw delimiter rejected" do
|
|
594
|
+
reason = step.send(:reject_unsanitized_for_chain_inline, 'see </artifact> end')
|
|
595
|
+
reason && reason.include?('raw delimiter')
|
|
596
|
+
end
|
|
597
|
+
|
|
598
|
+
assert "fullwidth delimiter rejected (NFKC)" do
|
|
599
|
+
reason = step.send(:reject_unsanitized_for_chain_inline, 'tag ＜artifact＞ here')
|
|
600
|
+
reason && reason.include?('raw delimiter')
|
|
601
|
+
end
|
|
602
|
+
|
|
603
|
+
assert "case variant rejected" do
|
|
604
|
+
reason = step.send(:reject_unsanitized_for_chain_inline, '<ARTIFACT>')
|
|
605
|
+
reason && reason.include?('raw delimiter')
|
|
606
|
+
end
|
|
607
|
+
|
|
608
|
+
assert "HTML entity encoded delimiter rejected (PR3.5 fix)" do
|
|
609
|
+
reason = step.send(:reject_unsanitized_for_chain_inline, 'persistent &lt;artifact&gt; here')
|
|
610
|
+
reason && reason.include?('encoded delimiter')
|
|
611
|
+
end
|
|
612
|
+
|
|
613
|
+
assert "URL-encoded delimiter rejected (PR3.5 fix)" do
|
|
614
|
+
reason = step.send(:reject_unsanitized_for_chain_inline, 'persistent %3Cartifact%3E here')
|
|
615
|
+
reason && reason.include?('encoded delimiter')
|
|
616
|
+
end
|
|
617
|
+
|
|
618
|
+
assert "nil returns nil (don't crash)" do
|
|
619
|
+
step.send(:reject_unsanitized_for_chain_inline, nil).nil?
|
|
620
|
+
end
|
|
621
|
+
|
|
622
|
+
assert "empty string returns nil" do
|
|
623
|
+
step.send(:reject_unsanitized_for_chain_inline, '').nil?
|
|
624
|
+
end
|
|
625
|
+
|
|
626
|
+
# ============================================================
|
|
627
|
+
# Phase 12 §10 KAIROS_TEST_FORCE_REVIEW env flag (PR3)
|
|
628
|
+
# ============================================================
|
|
629
|
+
section "KAIROS_TEST_FORCE_REVIEW env flag"
|
|
630
|
+
|
|
631
|
+
assert "flag unset → not forced" do
|
|
632
|
+
ENV.delete('KAIROS_TEST_FORCE_REVIEW')
|
|
633
|
+
ENV.delete('KAIROS_ENV')
|
|
634
|
+
step.send(:review_force_enabled?) == false
|
|
635
|
+
end
|
|
636
|
+
|
|
637
|
+
assert "flag=true outside production → forced" do
|
|
638
|
+
ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
|
|
639
|
+
ENV['KAIROS_ENV'] = 'development'
|
|
640
|
+
result = step.send(:review_force_enabled?)
|
|
641
|
+
ENV.delete('KAIROS_TEST_FORCE_REVIEW')
|
|
642
|
+
ENV.delete('KAIROS_ENV')
|
|
643
|
+
result == true
|
|
644
|
+
end
|
|
645
|
+
|
|
646
|
+
assert "flag=true with KAIROS_ENV=production → IGNORED" do
|
|
647
|
+
ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
|
|
648
|
+
ENV['KAIROS_ENV'] = 'production'
|
|
649
|
+
result = step.send(:review_force_enabled?)
|
|
650
|
+
ENV.delete('KAIROS_TEST_FORCE_REVIEW')
|
|
651
|
+
ENV.delete('KAIROS_ENV')
|
|
652
|
+
result == false
|
|
653
|
+
end
|
|
654
|
+
|
|
655
|
+
assert "flag=true with KAIROS_ENV=PRODUCTION (case insensitive) → IGNORED" do
|
|
656
|
+
ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
|
|
657
|
+
ENV['KAIROS_ENV'] = 'PRODUCTION'
|
|
658
|
+
result = step.send(:review_force_enabled?)
|
|
659
|
+
ENV.delete('KAIROS_TEST_FORCE_REVIEW')
|
|
660
|
+
ENV.delete('KAIROS_ENV')
|
|
661
|
+
result == false
|
|
662
|
+
end
|
|
663
|
+
|
|
664
|
+
assert "flag=anything-else → not forced" do
|
|
665
|
+
ENV['KAIROS_TEST_FORCE_REVIEW'] = '1' # not 'true' literally
|
|
666
|
+
result = step.send(:review_force_enabled?)
|
|
667
|
+
ENV.delete('KAIROS_TEST_FORCE_REVIEW')
|
|
668
|
+
result == false
|
|
669
|
+
end
|
|
670
|
+
|
|
671
|
+
# ============================================================
|
|
672
|
+
# Phase 12 §3.10 schema_version validation (PR2)
|
|
673
|
+
# ============================================================
|
|
674
|
+
section "Schema version validation (fail-closed)"
|
|
675
|
+
|
|
676
|
+
assert "current schema (v=1, f=1) → no error" do
|
|
677
|
+
step.send(:schema_version_check, {
|
|
678
|
+
'verdict_schema_version' => 1, 'feedback_text_schema_version' => 1
|
|
679
|
+
}).nil?
|
|
680
|
+
end
|
|
681
|
+
|
|
682
|
+
assert "missing verdict_schema_version → reject" do
|
|
683
|
+
msg = step.send(:schema_version_check, { 'feedback_text_schema_version' => 1 })
|
|
684
|
+
msg && msg.include?('verdict_schema_version missing')
|
|
685
|
+
end
|
|
686
|
+
|
|
687
|
+
assert "missing feedback_text_schema_version → reject" do
|
|
688
|
+
msg = step.send(:schema_version_check, { 'verdict_schema_version' => 1 })
|
|
689
|
+
msg && msg.include?('feedback_text_schema_version missing')
|
|
690
|
+
end
|
|
691
|
+
|
|
692
|
+
assert "newer verdict_schema_version → reject (fail-closed)" do
|
|
693
|
+
msg = step.send(:schema_version_check, {
|
|
694
|
+
'verdict_schema_version' => 99, 'feedback_text_schema_version' => 1
|
|
695
|
+
})
|
|
696
|
+
msg && msg.include?('newer than supported')
|
|
697
|
+
end
|
|
698
|
+
|
|
699
|
+
assert "newer feedback_text_schema_version → reject" do
|
|
700
|
+
msg = step.send(:schema_version_check, {
|
|
701
|
+
'verdict_schema_version' => 1, 'feedback_text_schema_version' => 99
|
|
702
|
+
})
|
|
703
|
+
msg && msg.include?('newer than supported')
|
|
704
|
+
end
|
|
705
|
+
|
|
706
|
+
# ============================================================
|
|
707
|
+
# Phase 12 §3.2 OR-floor trigger (PR2)
|
|
708
|
+
# ============================================================
|
|
709
|
+
section "OR-floor trigger logic"
|
|
710
|
+
|
|
711
|
+
assert "rule_or_hint: rule fires (l0_change in signals) → review needed" do
|
|
712
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
713
|
+
'trigger_on' => ['l0_change', 'design_scope'] }
|
|
714
|
+
complexity = { signals: ['l0_change'] }
|
|
715
|
+
decision = { 'review_hint' => { 'needed' => false } }
|
|
716
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
|
|
717
|
+
end
|
|
718
|
+
|
|
719
|
+
assert "rule_or_hint: needed:false CANNOT suppress rule (critical OR-floor property)" do
|
|
720
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
721
|
+
'trigger_on' => ['l0_change'] }
|
|
722
|
+
complexity = { signals: ['l0_change'] } # rule fires
|
|
723
|
+
decision = { 'review_hint' => { 'needed' => false } } # hint suppresses... but rule wins
|
|
724
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
|
|
725
|
+
end
|
|
726
|
+
|
|
727
|
+
assert "rule_or_hint: hint:true raises gate when rule does NOT fire" do
|
|
728
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
729
|
+
'trigger_on' => ['l0_change'] }
|
|
730
|
+
complexity = { signals: ['high_risk'] } # rule does not fire
|
|
731
|
+
decision = { 'review_hint' => { 'needed' => true, 'urgency' => 'high' } }
|
|
732
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
|
|
733
|
+
end
|
|
734
|
+
|
|
735
|
+
assert "rule_or_hint: neither rule nor hint → no review" do
|
|
736
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
737
|
+
'trigger_on' => ['l0_change'] }
|
|
738
|
+
complexity = { signals: ['high_risk'] }
|
|
739
|
+
decision = { 'review_hint' => { 'needed' => false } }
|
|
740
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
|
|
741
|
+
end
|
|
742
|
+
|
|
743
|
+
assert "rule_or_hint: malformed review_hint falls through to rule (additive contract)" do
|
|
744
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
745
|
+
'trigger_on' => ['l0_change'] }
|
|
746
|
+
complexity = { signals: ['l0_change'] }
|
|
747
|
+
decision = { 'review_hint' => { 'needed' => 'yes' } } # malformed string
|
|
748
|
+
# ReviewHint.parse → false; OR-floor still uses rule which fires
|
|
749
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
|
|
750
|
+
end
|
|
751
|
+
|
|
752
|
+
assert "rule_only: ignores review_hint even when needed:true" do
|
|
753
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_only',
|
|
754
|
+
'trigger_on' => ['l0_change'] }
|
|
755
|
+
complexity = { signals: ['high_risk'] } # rule does NOT fire
|
|
756
|
+
decision = { 'review_hint' => { 'needed' => true, 'urgency' => 'high' } }
|
|
757
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
|
|
758
|
+
end
|
|
759
|
+
|
|
760
|
+
assert "unknown trigger_mode → fail-closed to rule_only" do
|
|
761
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'evil_typo',
|
|
762
|
+
'trigger_on' => ['l0_change'] }
|
|
763
|
+
complexity = { signals: ['high_risk'] } # rule does not fire
|
|
764
|
+
decision = { 'review_hint' => { 'needed' => true } } # would have raised under rule_or_hint
|
|
765
|
+
# fails closed to rule_only → hint ignored → false
|
|
766
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
|
|
767
|
+
end
|
|
768
|
+
|
|
769
|
+
assert "missing review_hint key in payload is treated as no hint" do
|
|
770
|
+
cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
|
|
771
|
+
'trigger_on' => ['l0_change'] }
|
|
772
|
+
complexity = { signals: ['high_risk'] }
|
|
773
|
+
decision = {} # no review_hint
|
|
774
|
+
step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
|
|
775
|
+
end
|
|
776
|
+
|
|
576
777
|
# ============================================================
|
|
577
778
|
# Summary
|
|
578
779
|
# ============================================================
|