kairos-chain 3.19.1 → 3.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +71 -0
  3. data/lib/kairos_mcp/version.rb +1 -1
  4. data/templates/knowledge/multi_llm_review_workflow/multi_llm_review_workflow.md +106 -2
  5. data/templates/skillsets/agent/config/agent.yml +15 -2
  6. data/templates/skillsets/agent/lib/agent/review_hint.rb +84 -0
  7. data/templates/skillsets/agent/lib/agent/trigger_validator.rb +69 -0
  8. data/templates/skillsets/agent/lib/agent.rb +2 -0
  9. data/templates/skillsets/agent/test/test_agent_complexity_review.rb +205 -4
  10. data/templates/skillsets/agent/test/test_decide_prompt_contract.rb +91 -0
  11. data/templates/skillsets/agent/test/test_review_hint.rb +168 -0
  12. data/templates/skillsets/agent/tools/agent_start.rb +14 -0
  13. data/templates/skillsets/agent/tools/agent_step.rb +319 -20
  14. data/templates/skillsets/llm_client/lib/llm_client/cursor_adapter.rb +9 -0
  15. data/templates/skillsets/llm_client/test/test_cursor_adapter_model.rb +111 -0
  16. data/templates/skillsets/multi_llm_review/config/multi_llm_review.yml +35 -1
  17. data/templates/skillsets/multi_llm_review/lib/multi_llm_review/build_review_bundle.rb +170 -0
  18. data/templates/skillsets/multi_llm_review/lib/multi_llm_review/consensus.rb +19 -1
  19. data/templates/skillsets/multi_llm_review/lib/multi_llm_review/feedback_formatter.rb +58 -0
  20. data/templates/skillsets/multi_llm_review/lib/multi_llm_review/sanitizer.rb +184 -0
  21. data/templates/skillsets/multi_llm_review/skillset.json +8 -4
  22. data/templates/skillsets/multi_llm_review/test/test_feedback_formatter.rb +97 -0
  23. data/templates/skillsets/multi_llm_review/test/test_multi_llm_review.rb +155 -1
  24. data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_bundle.rb +167 -0
  25. data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb +249 -0
  26. data/templates/skillsets/multi_llm_review/test/test_sanitizer.rb +213 -0
  27. data/templates/skillsets/multi_llm_review/tools/multi_llm_review.rb +80 -21
  28. data/templates/skillsets/multi_llm_review/tools/multi_llm_review_bundle.rb +139 -0
  29. data/templates/skillsets/multi_llm_review/tools/multi_llm_review_collect.rb +20 -3
  30. data/templates/skillsets/multi_llm_review/tools/multi_llm_review_wait.rb +313 -0
  31. metadata +16 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef45a8e58aedc056cd5303abb6463a66503da146fc2bb7abe8ee018569fde8e0
4
- data.tar.gz: c62bc1328bc6a22e577cb1a8dad2de501bf184dd9022acb207257a4cfaa49955
3
+ metadata.gz: 20e2a223137f51dc61025e57dd6fd205a8f702ef923b5b0b2e0d464a308d279f
4
+ data.tar.gz: 51627cb487cf5fc2e46b8e6055bf36e0cf5c8839f15cb2c2236f2ff56efa2def
5
5
  SHA512:
6
- metadata.gz: 9e6236cd2860aa3cf244361962de9660b04aa47e0eed03689bb16dd4e95f3f6e1f9f6fe7377e9c0b5b6be87842346757c8b273f6ac8564d293f5cd2157baa727
7
- data.tar.gz: e83f1c792045a5141c9914d6ef09d1ac4ecc33ae6664e27316cf9ab82987c8c5555981869ec18fbb0309cefe071f5d62eb748b6cd2d92b1d2ff9835bf5f5fed3
6
+ metadata.gz: 9fd4b17a28bdc06b7b19195e7274ef41e4ba7a93fc70e2c3a38083c58eae85ea8dfd432cb7cc6219cf7ba49ba637dbed12c48ea12312f4a3f26f46dfb936438f
7
+ data.tar.gz: fadb35fdbf47eeebfc9b667685452a0c222ecb396dbb2031de08ac27b2df1de1a3985fc41c03e4e767d22a58ec98a77d52c2059e921f8084d1663a2a099bdd1d
data/CHANGELOG.md CHANGED
@@ -4,6 +4,77 @@ All notable changes to the `kairos-chain` gem will be documented in this file.
4
4
 
5
5
  This project follows [Semantic Versioning](https://semver.org/).
6
6
 
7
+ ## [3.24.0] - 2026-04-27
8
+
9
+ ### Added
10
+
11
+ - **multi_llm_review_wait MCP tool** (Phase 1.5) — optional blocking gate
12
+ between `multi_llm_review` (Phase 1) and `multi_llm_review_collect`
13
+ (Phase 2). Wraps the existing `WaitForWorker.wait` polling loop and
14
+ exposes 6 distinct status codes (`ready`, `still_pending`, `crashed`,
15
+ `unknown_token`, `already_collected`, `past_collect_deadline`) each with
16
+ a `next_action` recovery hint pointing at the right next tool.
17
+ - **`next_action` hint on `multi_llm_review` delegation_pending response** —
18
+ structured `{tool, args, purpose}` field nudging the orchestrator to call
19
+ `multi_llm_review_wait` after persona Agent dispatch. MCP does not enforce
20
+ ordering; this is a hint, not a constraint, but in practice LLMs follow
21
+ it reliably.
22
+ - **Path A vs Path B disambiguation in workflow knowledge doc** — surfaces
23
+ the long-implicit distinction between the host-tracked Bash workflow
24
+ (Claude Code's `Bash(background)` pattern, statusbar shows `XX shells`)
25
+ and the MCP-managed SkillSet (detached worker, no host-side tracking,
26
+ polling required).
27
+ - New config keys under `delegation.parallel`:
28
+ `wait_poll_interval_seconds: 1.0`, `wait_max_default_seconds: 600`,
29
+ `wait_max_hard_cap_seconds: 1800`, `wait_still_pending_streak_limit: 3`.
30
+ - Streak guard: 3 consecutive `still_pending` returns escalate to
31
+ `crashed/wait_exhausted` so a wedged worker cannot trap the orchestrator
32
+ in an infinite wait loop.
33
+ - 14 new tests in `test_multi_llm_review_wait.rb` covering all status
34
+ paths, streak persistence/reset, hard cap clamping, deadline-remaining
35
+ clamping, and backward compatibility (collect still works without wait).
36
+
37
+ ### Changed
38
+
39
+ - `multi_llm_review` SkillSet version 0.4.0 → 0.5.0.
40
+ - `delegation` instruction text now mentions the wait → collect chain.
41
+
42
+ ### Notes
43
+
44
+ - Backward compatible: callers that skip wait and call collect directly
45
+ still work via collect's existing internal polling.
46
+ - Design review (Codex GPT-5.5 + Cursor Composer-2 + Claude Team Opus 4.7)
47
+ produced 3/3 REVISE with 6-7 P1 issues; revisions R1-R14 captured in
48
+ handoff L2 `multi_llm_review_wait_tool_handoff` before implementation.
49
+
50
+ ## [3.23.3] - 2026-04-27
51
+
52
+ ### Documentation
53
+
54
+ - **multi_llm_review_workflow knowledge** — Added "Async/Parallel Collect
55
+ Timing — Iron Rule" subsection. Documents the workflow constraint that the
56
+ orchestrator must call `multi_llm_review_collect` immediately after persona
57
+ Agent reviews complete, without intervening user dialogue. Explains the
58
+ underlying mechanics (LLM is not event-driven; collect already polls
59
+ internally at 0.5s intervals; token expiry vs subprocess completion). Adds
60
+ recommended flow, anti-pattern, and manual recovery instructions.
61
+ - Updated stale `must_collect_by` default reference (600s → 1800s).
62
+
63
+ ## [3.23.2] - 2026-04-26
64
+
65
+ ### Fixed
66
+
67
+ - **multi_llm_review collect_deadline bug** — `timeout_seconds_override` no longer
68
+ leaves the orchestrator's submission window shorter than the worker lifespan.
69
+ In the async/parallel path, `collect_deadline` is now auto-extended to cover
70
+ `worker self_timeout + poll margin` so raising `timeout_seconds_override`
71
+ alone keeps the token alive while the worker is healthy.
72
+ - New `collect_deadline_seconds_override` argument on `multi_llm_review` for
73
+ explicit control of the orchestrator's submission window.
74
+ - Default `delegation.collect_deadline_seconds` raised from `600` (10 min) to
75
+ `1800` (30 min) to better fit interactive runs where user dialogue intervenes
76
+ between Phase 1 and `multi_llm_review_collect`.
77
+
7
78
  ## [3.17.0] - 2026-04-22
8
79
 
9
80
  ### Added
@@ -1,4 +1,4 @@
1
1
  module KairosMcp
2
- VERSION = "3.19.1"
2
+ VERSION = "3.24.0"
3
3
  CHANGELOG_URL = "https://github.com/masaomi/KairosChain_2026/blob/main/CHANGELOG.md"
4
4
  end
@@ -29,6 +29,54 @@ This skill covers:
29
29
  For **WHO** (which LLM is good at what), see: `multi_llm_reviewer_evaluation`
30
30
  For **development lifecycle** (design → implement → verify), see: `design_to_implementation_workflow`
31
31
 
32
+ ## Two Execution Paths (read this first)
33
+
34
+ There are **two distinct execution paths** with the same name "multi-LLM review".
35
+ They differ in subprocess lifecycle ownership and completion-detection mechanics.
36
+ Pick the right one for your environment:
37
+
38
+ ### Path A — Host-tracked (Bash workflow)
39
+
40
+ - **Trigger**: orchestrator (LLM) calls Claude Code's `Bash` tool with
41
+ `run_in_background: true` to spawn `claude -p`, `codex exec`, `agent -p` directly.
42
+ - **Process parent**: Claude Code (the host harness).
43
+ - **Completion detection**: **event-driven**. Claude Code's shell tracker monitors
44
+ the spawned shells; when they exit, the LLM is notified through the standard
45
+ tool-result mechanism. Statusbar shows `XX shells` while reviewers are running.
46
+ - **When to use**: interactive Claude Code sessions for one-off Tier 3 reviews.
47
+ - **Reference**: see "Orchestration Template" section below for the canonical
48
+ `Bash(background)` pattern.
49
+
50
+ ### Path B — MCP-managed (multi_llm_review SkillSet)
51
+
52
+ - **Trigger**: orchestrator calls the MCP tool `multi_llm_review`.
53
+ - **Process parent**: the kairos-chain Ruby gem (MCP server). The gem forks a
54
+ detached worker (`bin/dispatch_worker.rb`) which calls `Process.setsid` to become
54
+ its own session leader and then spawns the CLI reviewers.
56
+ - **Completion detection**: **polling required**. Claude Code is not the parent,
57
+ so the spawned subprocesses do NOT appear in the `XX shells` statusbar count.
58
+ The orchestrator must call `multi_llm_review_collect` (and optionally
59
+ `multi_llm_review_wait` first) to observe completion.
60
+ - **When to use**: portable execution (other MCP hosts, autonomous Agent SkillSet),
61
+ or any case where you want the consensus computation done server-side.
62
+ - **Recommended chain (3-step)**: `multi_llm_review` → `multi_llm_review_wait` →
63
+ `multi_llm_review_collect`. Each Phase-1/1.5 response carries a `next_action`
64
+ hint pointing at the next tool. The wait step is optional but recommended — without it,
65
+ collect's internal polling still covers worker completion, but recovery hints
66
+ for `still_pending`, `crashed`, and `past_collect_deadline` are less explicit.
67
+ - **Reference**: see "Orchestrator Delegation Protocol" + "Async/Parallel Collect
68
+ Timing — Iron Rule" sections below.
69
+
70
+ ### Quick selector
71
+
72
+ | Question | Answer |
73
+ |----------|--------|
74
+ | Are you in an interactive Claude Code session and just need one review? | **Path A** |
75
+ | Do you need this to work in Cursor / autonomous mode / other MCP host? | **Path B** |
76
+ | Do you want the consensus result inside the MCP tool response? | **Path B** |
77
+ | Did you observe `XX shells` in the statusbar last time it worked? | That was Path A |
78
+ | Did the run produce a `collect_token` and a `pending/<token>/` directory? | That was Path B |
79
+
32
80
  ## Roles
33
81
 
34
82
  | Role | Who | Responsibility |
@@ -331,8 +379,8 @@ cross-model subprocess reviewers give epistemic diversity. The two are complemen
331
379
 
332
380
  **Failure modes**:
333
381
  - `expired_or_unknown_token`: orchestrator missed `must_collect_by` deadline
334
- (default 600s), or token never existed. The pending review is gone; call
335
- `multi_llm_review` again from scratch.
382
+ (default 1800s since v3.23.2; was 600s), or token never existed. The pending
383
+ review is gone; call `multi_llm_review` again from scratch.
336
384
  - `error: invalid orchestrator_reviews`: persona count outside 2-4 or missing
337
385
  required fields. Fix and retry collect with the same token.
338
386
  - All-subprocess-failed at Call 1: returns error immediately; no token issued.
@@ -340,6 +388,62 @@ cross-model subprocess reviewers give epistemic diversity. The two are complemen
340
388
  **Default**: `orchestrator_strategy` defaults to `"exclude"` (back-compat). Use
341
389
  `"delegate"` explicitly until validated by use.
342
390
 
391
+ #### Async/Parallel Collect Timing — Iron Rule
392
+
393
+ When `delegation.parallel.default: true` (the v3.x default), Call 1 returns
394
+ `delegation_pending` **immediately** (~50ms) and a detached worker runs the
395
+ subprocess reviewers in parallel with the orchestrator's persona Agent
396
+ reviews. This is faster, but introduces a timing trap:
397
+
398
+ > **The orchestrator MUST call `multi_llm_review_collect` immediately after
399
+ > the persona Agent reviews complete — without intervening user dialogue,
400
+ > unrelated tool calls, or context switches.**
401
+
402
+ Why this matters:
403
+
404
+ - The LLM is **not event-driven**. When the worker finishes writing
405
+ `subprocess_status: "done"` to `state.json`, nothing wakes the orchestrator.
406
+ The orchestrator only notices when it next calls `multi_llm_review_collect`.
407
+ - `multi_llm_review_collect` already polls internally at
408
+ `poll_interval_seconds: 0.5` for up to `collect_max_wait_seconds: 420` (7min)
409
+ per call. Polling is not the bottleneck — the bottleneck is the orchestrator
410
+ forgetting to call collect at all.
411
+ - The token expires at `collect_deadline` (default 30min since v3.23.2). If
412
+ user dialogue or other work intervenes between persona Agent completion and
413
+ the collect call, the token can expire while the subprocess results sit
414
+ ready and unread on disk.
415
+
416
+ Recommended orchestrator flow (single LLM turn, no detours):
417
+
418
+ ```
419
+ 1. multi_llm_review(...) → receive delegation_pending + collect_token
420
+ 2. Spawn persona Agent reviews (Agent tool, parallel, 2-4 personas)
421
+ 3. As soon as ALL personas return → multi_llm_review_collect(collect_token, ...)
422
+ 4. Return final consensus to user
423
+ ```
424
+
425
+ Anti-pattern (do NOT do this):
426
+
427
+ ```
428
+ 1. multi_llm_review(...) → delegation_pending
429
+ 2. Run persona Agent reviews
430
+ 3. ❌ "By the way, while we wait, let me explain X to the user…"
431
+ 4. ❌ User asks an unrelated question, conversation drifts
432
+ 5. ❌ 30+ minutes later, finally try collect → expired_or_unknown_token
433
+ ```
434
+
435
+ If the orchestrator is genuinely interrupted (user explicitly switches topic,
436
+ or a persona Agent itself takes a long time and the orchestrator wants to
437
+ report progress), it should still **call collect first** — collect returns
438
+ quickly if the worker is already done, or blocks up to 7min if not. Either
439
+ way, the token stays alive and consensus is captured before resuming side
440
+ work.
441
+
442
+ Manual recovery if expiry happens: subprocess results are persisted at
443
+ `.kairos/multi_llm_review/pending/<token>/subprocess_results.json` and remain
444
+ readable until GC. Read them directly and synthesize manually, then re-run
445
+ `multi_llm_review` for fresh results if needed.
446
+
343
447
  ### Critical CLI Notes
344
448
 
345
449
  - **Cursor Agent stdin**: `cat file | agent -p -` does NOT work. Use file-reference:
@@ -107,12 +107,25 @@ complexity_review:
107
107
  post_act_review: true # enable medium-complexity post-ACT review
108
108
  # Multi-LLM review integration (Gate 5.5c)
109
109
  multi_llm_review:
110
- enabled: false # opt-in: enable for multi-LLM review during autonomous mode
111
- trigger_on: # complexity signals that trigger multi-LLM review
110
+ # Phase 12 PR2: KairosChain framework default — multi-LLM review is on by
111
+ # default as structural compensation for current LLM metacognition limits.
112
+ # See knowledge: project_multi_llm_review_default_philosophy.
113
+ # Override with `enabled: false` if you specifically need legacy single-LLM
114
+ # behavior; `system_upgrade` 3-way merge preserves user overrides.
115
+ enabled: true
116
+ # rule_or_hint (default): OR-floor — review fires when EITHER the deterministic
117
+ # rule (trigger_on ∩ complexity[:signals]) fires OR the LLM-emitted
118
+ # review_hint.needed=true. The hint is ADDITIVE only; it cannot suppress the
119
+ # rule (Phase 12 §3.2 trust boundary).
120
+ # rule_only: legacy/diagnostic — ignore review_hint, rule path only.
121
+ # unknown values: fail-closed to rule_only (most restrictive).
122
+ trigger_mode: rule_or_hint
123
+ trigger_on: # validated against KNOWN_SIGNALS at session start (§12)
112
124
  - l0_change
113
125
  - design_scope
114
126
  max_concurrent: 2 # override Dispatcher default
115
127
  timeout_seconds: 300
128
+ insufficient_first_attempt: retry # retry | checkpoint (§3.8)
116
129
  # Reviewer roster delegates to multi_llm_review/config/multi_llm_review.yml
117
130
 
118
131
  # Audit
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
module KairosMcp
  module SkillSets
    module Agent
      # Strict validator for the LLM-emitted review_hint structure (Phase 12 §3.6).
      #
      # Critical property: this hint is ADDITIVE only (§3.2 OR-floor). A return value
      # of `false` does NOT suppress rule-based triggers — rule_fired || hint_needed.
      # Therefore malformed hints fail-through to `false` (no review request from
      # this side), and the OR-floor lets the deterministic rule path fire on its own.
      #
      # Schema (string keys only):
      #   review_hint: {
      #     needed:  Boolean,
      #     reason:  String | nil,
      #     urgency: 'low' | 'medium' | 'high' | nil
      #   }
      class ReviewHint
        VALID_URGENCY = %w[low medium high].freeze

        # PR3 hardening: per-process counter of validation failures, exposed for
        # observability. agent_status / introspection tools may read this to
        # surface drift (e.g., DECIDE LLM repeatedly emitting malformed hints).
        # Reset by tests via reset_failure_count!.
        @failure_count = 0
        class << self
          attr_reader :failure_count
        end

        # Resets the per-process failure counter to zero.
        #
        # @return [Integer] 0
        def self.reset_failure_count!
          @failure_count = 0
        end

        # Parses and validates a raw review_hint.
        #
        # An absent hint (nil) is not a failure: it returns false silently.
        # Any other malformed input — a non-Hash value, a non-boolean `needed`,
        # a non-String `reason`, an unknown `urgency` — returns false, logs a
        # warning and increments the failure counter exactly once. This method
        # never raises (Phase 12 kairos Prop 3: recognition without raise/break,
        # but observable through @failure_count + log).
        #
        # @param raw [Hash, nil, Object] the review_hint value from the decision payload
        # @param logger [#warn, nil] optional logger; stderr is used when nil
        # @return [Boolean] the validated `needed` flag, or false on any failure
        def self.parse(raw, logger: nil)
          # Missing hint: nothing to validate, nothing to report.
          return false if raw.nil?

          # A present-but-wrong-typed hint is drift worth surfacing, so it is
          # counted and logged like every other validation failure.
          unless raw.is_a?(Hash)
            note_failure(logger, "review_hint must be a Hash, got #{raw.class}")
            return false
          end

          needed = raw['needed']
          unless needed == true || needed == false
            note_failure(logger, "review_hint.needed must be boolean, got #{needed.inspect}")
            return false
          end

          reason = raw['reason']
          unless reason.nil? || reason.is_a?(String)
            note_failure(logger, "review_hint.reason must be string or nil, got #{reason.class}")
            return false
          end

          urgency = raw['urgency']
          unless urgency.nil? || VALID_URGENCY.include?(urgency)
            note_failure(logger, "review_hint.urgency must be one of #{VALID_URGENCY} or nil, got #{urgency.inspect}")
            return false
          end

          needed
        rescue StandardError => e
          note_failure(logger, "review_hint parse error: #{e.class}: #{e.message}")
          false
        end

        # Records one validation failure: bumps the counter, then logs.
        def self.note_failure(logger, msg)
          @failure_count += 1
          log(logger, msg)
        end
        private_class_method :note_failure

        # Emits a warning through the supplied logger, or stderr when none is given.
        # A logger that itself raises must not turn a soft validation failure into
        # an exception (or a double-counted failure), so it degrades to stderr.
        def self.log(logger, msg)
          line = "[review_hint] #{msg}"
          if logger
            logger.warn(line)
          else
            warn line
          end
        rescue StandardError => e
          warn "#{line} (logger failed: #{e.class})"
        end
        private_class_method :log
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
module KairosMcp
  module SkillSets
    module Agent
      # Validates agent.yml's complexity_review.multi_llm_review.trigger_on against
      # the actual signal vocabulary produced by assess_decision_complexity.
      #
      # Phase 12 §12 / v0.4 P-1. Aligned with agent_step.rb:1005-1031.
      #
      # Fail-loud at session start (not silently at first cycle) so typos like
      # "l0_chagne" don't bypass the review gate at runtime.
      class TriggerValidator
        KNOWN_SIGNALS = %w[
          high_risk
          many_steps
          design_scope
          l0_change
          core_files
          multi_file
          state_mutation
        ].freeze

        class ConfigurationError < StandardError; end

        # @param trigger_on [Array<String>] from agent.yml
        # @param multi_cfg [Hash, nil] complexity_review.multi_llm_review subtree
        #   (string keys); when provided and `enabled` is truthy, an empty
        #   trigger_on produces an operator warning on stderr whose wording
        #   depends on trigger_mode.
        # @return [Array<String>] the validated, stringified signals (echo of input)
        # @raise [ConfigurationError] on unknown signal name
        def self.validate!(trigger_on, multi_cfg: nil)
          stringified = Array(trigger_on).map(&:to_s)
          if stringified.empty?
            warn_if_review_unreachable(multi_cfg)
            return []
          end
          unknown = stringified - KNOWN_SIGNALS
          unless unknown.empty?
            raise ConfigurationError,
                  "agent.yml complexity_review.multi_llm_review.trigger_on contains " \
                  "unknown signals: #{unknown.inspect}. Known: #{KNOWN_SIGNALS.inspect}"
          end
          stringified
        end

        # PR3 hardening: surface configuration that would silently disable the
        # review gate. enabled:true + trigger_on:[] under rule_only mode means
        # rule never fires; under rule_or_hint, only LLM hints can ever trigger
        # which is unreliable. Either case warrants an operator warning.
        #
        # An unrecognized trigger_mode fails closed to rule_only (see the
        # trigger_mode comment in agent.yml), so it gets the "cannot fire"
        # warning rather than the softer rule_or_hint note.
        def self.warn_if_review_unreachable(multi_cfg)
          return unless multi_cfg.is_a?(Hash) && multi_cfg['enabled']

          # A missing trigger_mode means the documented default, rule_or_hint.
          mode = multi_cfg['trigger_mode'] || 'rule_or_hint'
          case mode
          when 'rule_or_hint'
            warn '[trigger_validator] note: trigger_on:[] under rule_or_hint — ' \
                 'review gate fires only on LLM-emitted review_hint.needed=true. ' \
                 'Consider adding [l0_change, design_scope] for structural floor.'
          when 'rule_only'
            warn '[trigger_validator] WARNING: enabled:true but trigger_on:[] with ' \
                 'trigger_mode:rule_only — review gate cannot fire. ' \
                 'Either populate trigger_on (e.g. [l0_change, design_scope]) or ' \
                 'set trigger_mode:rule_or_hint to allow LLM hints to trigger review.'
          else
            warn '[trigger_validator] WARNING: enabled:true but trigger_on:[] with ' \
                 "unknown trigger_mode:#{mode.inspect} (fails closed to rule_only) — " \
                 'review gate cannot fire. ' \
                 'Either populate trigger_on (e.g. [l0_change, design_scope]) or ' \
                 'set trigger_mode:rule_or_hint to allow LLM hints to trigger review.'
          end
        end
        private_class_method :warn_if_review_unreachable
      end
    end
  end
end
@@ -4,3 +4,5 @@ require_relative 'agent/session'
4
4
  require_relative 'agent/message_format'
5
5
  require_relative 'agent/cognitive_loop'
6
6
  require_relative 'agent/mandate_adapter'
7
+ require_relative 'agent/review_hint'
8
+ require_relative 'agent/trigger_validator'
@@ -345,11 +345,17 @@ assert "test_merge_llm_cannot_lower: LLM low + structural high → final high" d
345
345
  result[:level] == 'high'
346
346
  end
347
347
 
348
- assert "test_merge_llm_same_level: LLM medium + structural medium → final medium" do
348
+ assert "test_merge_llm_same_level: LLM medium + structural medium → final medium (signals trust-quarantined)" do
349
+ # Phase 12 §3.2 / v0.4 P-2 trust boundary: LLM-emitted signals MUST NOT enter
350
+ # :signals (which is the OR-floor input). They go into :complexity_hint_signals
351
+ # (advisory only). This test verifies the quarantine.
349
352
  structural = { level: 'medium', signals: ['high_risk'] }
350
353
  llm_hint = { 'level' => 'medium', 'signals' => ['moderate_scope'] }
351
354
  result = step.send(:merge_complexity, structural, llm_hint)
352
- result[:level] == 'medium' && result[:signals].include?('moderate_scope')
355
+ result[:level] == 'medium' &&
356
+ result[:signals] == ['high_risk'] &&
357
+ !result[:signals].include?('moderate_scope') &&
358
+ result[:complexity_hint_signals].include?('moderate_scope')
353
359
  end
354
360
 
355
361
  assert "test_merge_nil_hint: nil LLM hint → structural unchanged" do
@@ -358,11 +364,13 @@ assert "test_merge_nil_hint: nil LLM hint → structural unchanged" do
358
364
  result[:level] == 'medium'
359
365
  end
360
366
 
361
- assert "test_merge_symbol_keys: symbol-key llm_hint works" do
367
+ assert "test_merge_symbol_keys: symbol-key llm_hint works (signals quarantined to hint field)" do
362
368
  structural = { level: 'low', signals: [] }
363
369
  llm_hint = { level: 'high', signals: ['deep'] }
364
370
  result = step.send(:merge_complexity, structural, llm_hint)
365
- result[:level] == 'medium' && result[:signals].include?('deep')
371
+ result[:level] == 'medium' &&
372
+ result[:signals] == [] &&
373
+ result[:complexity_hint_signals].include?('deep')
366
374
  end
367
375
 
368
376
  # ============================================================
@@ -573,6 +581,199 @@ assert "test_multi_llm_review_prompt: L0 review prompt generated" do
573
581
  prompt.include?('L0 Change Review') && prompt.include?('evolve skill') && prompt.include?('test_goal')
574
582
  end
575
583
 
584
+ # ============================================================
585
+ # Phase 12 §3.11 / PR3.5 — chain reject_unsanitized_for_chain_inline
586
+ # ============================================================
587
+ section "L0 chain-record reject (PR3.5)"
588
+
589
+ assert "safe content passes (returns nil)" do
590
+ step.send(:reject_unsanitized_for_chain_inline, 'plain safe text').nil?
591
+ end
592
+
593
+ assert "raw delimiter rejected" do
594
+ reason = step.send(:reject_unsanitized_for_chain_inline, 'see </artifact> end')
595
+ reason && reason.include?('raw delimiter')
596
+ end
597
+
598
+ assert "fullwidth delimiter rejected (NFKC)" do
599
+ reason = step.send(:reject_unsanitized_for_chain_inline, 'tag <artifact> here')
600
+ reason && reason.include?('raw delimiter')
601
+ end
602
+
603
+ assert "case variant rejected" do
604
+ reason = step.send(:reject_unsanitized_for_chain_inline, '<ARTIFACT>')
605
+ reason && reason.include?('raw delimiter')
606
+ end
607
+
608
+ assert "HTML entity encoded delimiter rejected (PR3.5 fix)" do
609
+ reason = step.send(:reject_unsanitized_for_chain_inline, 'persistent &lt;artifact&gt; here')
610
+ reason && reason.include?('encoded delimiter')
611
+ end
612
+
613
+ assert "URL-encoded delimiter rejected (PR3.5 fix)" do
614
+ reason = step.send(:reject_unsanitized_for_chain_inline, 'persistent %3Cartifact%3E here')
615
+ reason && reason.include?('encoded delimiter')
616
+ end
617
+
618
+ assert "nil returns nil (don't crash)" do
619
+ step.send(:reject_unsanitized_for_chain_inline, nil).nil?
620
+ end
621
+
622
+ assert "empty string returns nil" do
623
+ step.send(:reject_unsanitized_for_chain_inline, '').nil?
624
+ end
625
+
626
+ # ============================================================
627
+ # Phase 12 §10 KAIROS_TEST_FORCE_REVIEW env flag (PR3)
628
+ # ============================================================
629
+ section "KAIROS_TEST_FORCE_REVIEW env flag"
630
+
631
+ assert "flag unset → not forced" do
632
+ ENV.delete('KAIROS_TEST_FORCE_REVIEW')
633
+ ENV.delete('KAIROS_ENV')
634
+ step.send(:review_force_enabled?) == false
635
+ end
636
+
637
+ assert "flag=true outside production → forced" do
638
+ ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
639
+ ENV['KAIROS_ENV'] = 'development'
640
+ result = step.send(:review_force_enabled?)
641
+ ENV.delete('KAIROS_TEST_FORCE_REVIEW')
642
+ ENV.delete('KAIROS_ENV')
643
+ result == true
644
+ end
645
+
646
+ assert "flag=true with KAIROS_ENV=production → IGNORED" do
647
+ ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
648
+ ENV['KAIROS_ENV'] = 'production'
649
+ result = step.send(:review_force_enabled?)
650
+ ENV.delete('KAIROS_TEST_FORCE_REVIEW')
651
+ ENV.delete('KAIROS_ENV')
652
+ result == false
653
+ end
654
+
655
+ assert "flag=true with KAIROS_ENV=PRODUCTION (case insensitive) → IGNORED" do
656
+ ENV['KAIROS_TEST_FORCE_REVIEW'] = 'true'
657
+ ENV['KAIROS_ENV'] = 'PRODUCTION'
658
+ result = step.send(:review_force_enabled?)
659
+ ENV.delete('KAIROS_TEST_FORCE_REVIEW')
660
+ ENV.delete('KAIROS_ENV')
661
+ result == false
662
+ end
663
+
664
+ assert "flag=anything-else → not forced" do
665
+ ENV['KAIROS_TEST_FORCE_REVIEW'] = '1' # not 'true' literally
666
+ result = step.send(:review_force_enabled?)
667
+ ENV.delete('KAIROS_TEST_FORCE_REVIEW')
668
+ result == false
669
+ end
670
+
671
+ # ============================================================
672
+ # Phase 12 §3.10 schema_version validation (PR2)
673
+ # ============================================================
674
+ section "Schema version validation (fail-closed)"
675
+
676
+ assert "current schema (v=1, f=1) → no error" do
677
+ step.send(:schema_version_check, {
678
+ 'verdict_schema_version' => 1, 'feedback_text_schema_version' => 1
679
+ }).nil?
680
+ end
681
+
682
+ assert "missing verdict_schema_version → reject" do
683
+ msg = step.send(:schema_version_check, { 'feedback_text_schema_version' => 1 })
684
+ msg && msg.include?('verdict_schema_version missing')
685
+ end
686
+
687
+ assert "missing feedback_text_schema_version → reject" do
688
+ msg = step.send(:schema_version_check, { 'verdict_schema_version' => 1 })
689
+ msg && msg.include?('feedback_text_schema_version missing')
690
+ end
691
+
692
+ assert "newer verdict_schema_version → reject (fail-closed)" do
693
+ msg = step.send(:schema_version_check, {
694
+ 'verdict_schema_version' => 99, 'feedback_text_schema_version' => 1
695
+ })
696
+ msg && msg.include?('newer than supported')
697
+ end
698
+
699
+ assert "newer feedback_text_schema_version → reject" do
700
+ msg = step.send(:schema_version_check, {
701
+ 'verdict_schema_version' => 1, 'feedback_text_schema_version' => 99
702
+ })
703
+ msg && msg.include?('newer than supported')
704
+ end
705
+
706
+ # ============================================================
707
+ # Phase 12 §3.2 OR-floor trigger (PR2)
708
+ # ============================================================
709
+ section "OR-floor trigger logic"
710
+
711
+ assert "rule_or_hint: rule fires (l0_change in signals) → review needed" do
712
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
713
+ 'trigger_on' => ['l0_change', 'design_scope'] }
714
+ complexity = { signals: ['l0_change'] }
715
+ decision = { 'review_hint' => { 'needed' => false } }
716
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
717
+ end
718
+
719
+ assert "rule_or_hint: needed:false CANNOT suppress rule (critical OR-floor property)" do
720
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
721
+ 'trigger_on' => ['l0_change'] }
722
+ complexity = { signals: ['l0_change'] } # rule fires
723
+ decision = { 'review_hint' => { 'needed' => false } } # hint suppresses... but rule wins
724
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
725
+ end
726
+
727
+ assert "rule_or_hint: hint:true raises gate when rule does NOT fire" do
728
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
729
+ 'trigger_on' => ['l0_change'] }
730
+ complexity = { signals: ['high_risk'] } # rule does not fire
731
+ decision = { 'review_hint' => { 'needed' => true, 'urgency' => 'high' } }
732
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
733
+ end
734
+
735
+ assert "rule_or_hint: neither rule nor hint → no review" do
736
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
737
+ 'trigger_on' => ['l0_change'] }
738
+ complexity = { signals: ['high_risk'] }
739
+ decision = { 'review_hint' => { 'needed' => false } }
740
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
741
+ end
742
+
743
+ assert "rule_or_hint: malformed review_hint falls through to rule (additive contract)" do
744
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
745
+ 'trigger_on' => ['l0_change'] }
746
+ complexity = { signals: ['l0_change'] }
747
+ decision = { 'review_hint' => { 'needed' => 'yes' } } # malformed string
748
+ # ReviewHint.parse → false; OR-floor still uses rule which fires
749
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == true
750
+ end
751
+
752
+ assert "rule_only: ignores review_hint even when needed:true" do
753
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_only',
754
+ 'trigger_on' => ['l0_change'] }
755
+ complexity = { signals: ['high_risk'] } # rule does NOT fire
756
+ decision = { 'review_hint' => { 'needed' => true, 'urgency' => 'high' } }
757
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
758
+ end
759
+
760
+ assert "unknown trigger_mode → fail-closed to rule_only" do
761
+ cfg = { 'enabled' => true, 'trigger_mode' => 'evil_typo',
762
+ 'trigger_on' => ['l0_change'] }
763
+ complexity = { signals: ['high_risk'] } # rule does not fire
764
+ decision = { 'review_hint' => { 'needed' => true } } # would have raised under rule_or_hint
765
+ # fails closed to rule_only → hint ignored → false
766
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
767
+ end
768
+
769
+ assert "missing review_hint key in payload is treated as no hint" do
770
+ cfg = { 'enabled' => true, 'trigger_mode' => 'rule_or_hint',
771
+ 'trigger_on' => ['l0_change'] }
772
+ complexity = { signals: ['high_risk'] }
773
+ decision = {} # no review_hint
774
+ step.send(:multi_llm_review_needed?, cfg, complexity, decision) == false
775
+ end
776
+
576
777
  # ============================================================
577
778
  # Summary
578
779
  # ============================================================