kairos-chain 3.24.1 → 3.24.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 27275c4874d145bd69a309c8e89d96a371fdb614f6b0e9fdde6d22f810d907de
4
- data.tar.gz: 904a59d5332379a17162ad8dd371df5b0024a0f5eff01f652c5322b2f9bcbc57
3
+ metadata.gz: fb099806eedb198afc167cbb810110ab7bffac2fceea3684eb14a24e3e7b46fb
4
+ data.tar.gz: 61223eff1c6cd146eea47d44ab0ee95506a8baa6e15a0f1f5c2e929e9d44a5b1
5
5
  SHA512:
6
- metadata.gz: 49790dfd44ccc67e64bb58cf41bc1aeba3c0a79c5693329f47aa1b4024016215f63cf62691f4dbb162fbe31b6ebee5cf38fe78b6350c81ea6dbb958e480c902b
7
- data.tar.gz: 4fc908310610778a521837702b50bc427d3234a7680988fc22b683c8296e893f83652dacf1cbe4f178a19d9cd2031833c56d634024c54cd73f5f8b5a58133e55
6
+ metadata.gz: 91d4a86fc2df06025fefb5f0252e9f019d85cb9fc31f6ceec0d2e0bbe1209c8d24277e7448faace8ced8ba28c889dee0af7f68fbf50d6dda6da27bbb3366e588
7
+ data.tar.gz: b6847081bc03d40d2ae77ec317d52955ba7fdc2597b91d10addba2a052067339171542316810d8e1dc95c5f4eb35eb5963425f7f82961ae13bbc5e3bfa8e2f50
@@ -1,4 +1,4 @@
1
1
  module KairosMcp
2
- VERSION = "3.24.1"
2
+ VERSION = "3.24.3"
3
3
  CHANGELOG_URL = "https://github.com/masaomi/KairosChain_2026/blob/main/CHANGELOG.md"
4
4
  end
@@ -53,16 +53,16 @@ module KairosMcp
53
53
  # from codex 5.5 + cursor).
54
54
  # Guarded by `defined?` so non-worker consumers (MCP direct call)
55
55
  # that never load multi_llm_review/main_state don't NameError.
56
- bracket = defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
57
- if bracket
58
- KairosMcp::SkillSets::MultiLlmReview::MainState.enter_call!
59
- end
60
- begin
61
- result = CallRouter.perform(args, @config)
62
- ensure
63
- if bracket
64
- KairosMcp::SkillSets::MultiLlmReview::MainState.exit_call!
56
+ # v3.24.3: use with_call to enforce ensure-bracketed enter/exit.
57
+ # enter_call!/exit_call! are now private; with_call is the only
58
+ # supported pattern. defined?-guard preserved so non-worker
59
+ # consumers (MCP direct call) don't NameError.
60
+ if defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
61
+ result = KairosMcp::SkillSets::MultiLlmReview::MainState.with_call do
62
+ CallRouter.perform(args, @config)
65
63
  end
64
+ else
65
+ result = CallRouter.perform(args, @config)
66
66
  end
67
67
  # Shape matches BaseTool#text_content (symbol :text key) — what
68
68
  # Dispatcher consumes today via `b[:text] || b['text']`.
@@ -103,28 +103,31 @@ def self_timeout_at_from_state(token, request)
103
103
  end
104
104
  end
105
105
 
106
- # Pulse thread: touches worker.tick IFF main is alive (counter advanced OR
107
- # still inside an adapter.call within its expected timeout). (C3b/P0-3)
106
+ # Pulse thread: touches worker.tick IFF main is alive. v3.24.3 uses the
107
+ # per-thread (counter, in_flight, oldest_ts) snapshot from MainState and
108
+ # delegates the alive decision to MainState.compute_alive (pure function,
109
+ # unit-testable). Emits a diagnostic log line every ~6s (>=5s gap checked on a 2s loop) so future incidents
110
+ # can be diagnosed from worker.log without filesystem mtime archaeology.
108
111
  pulse_thread = Thread.new do
109
112
  begin
110
113
  last_counter = -1
111
- # Loaded below; read now for the timeout window
112
- max_call_t = 300
113
- call_margin = 60
114
+ log_emit_at = 0
115
+ threshold = 360 # max_call_t (300) + call_margin (60)
114
116
  loop do
115
- # MainState.snapshot reads ts FIRST then counter, per the v0.3.2
116
- # reader-ordering invariant. Pulse must use snapshot (not raw struct
117
- # reads) so any future change to the invariant is observed here.
118
- counter, ts = MLR::MainState.snapshot
119
- alive =
120
- if counter != last_counter
121
- true
122
- elsif ts
123
- (Process.clock_gettime(Process::CLOCK_MONOTONIC) - ts) < (max_call_t + call_margin)
124
- else
125
- false
126
- end
117
+ counter, in_flight, oldest_ts = MLR::MainState.snapshot
118
+ now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
119
+ alive = MLR::MainState.compute_alive(
120
+ counter, last_counter, in_flight, oldest_ts, now, threshold
121
+ )
127
122
  FileUtils.touch(PS.worker_tick_path(token)) if alive
123
+
124
+ if now - log_emit_at >= 5
125
+ oldest_age = oldest_ts ? (now - oldest_ts).round(1) : nil
126
+ warn "[pulse] counter=#{counter} in_flight=#{in_flight} " \
127
+ "oldest_age=#{oldest_age || 'nil'}s alive=#{alive}"
128
+ log_emit_at = now
129
+ end
130
+
128
131
  last_counter = counter
129
132
  sleep 2
130
133
  end
@@ -263,8 +266,9 @@ begin
263
266
  review_context: request['review_context'] || 'independent'
264
267
  )
265
268
 
266
- # Advance counter so pulse observes "progress since dispatch entered".
267
- MLR::MainState.exit_call!
269
+ # v3.24.3: counter-only signal (no enter_call!/exit_call! pair). bump_counter!
270
+ # advances pulse's progress signal without touching ts_by_thread.
271
+ MLR::MainState.bump_counter!
268
272
  check_shutdown!(token)
269
273
 
270
274
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0
@@ -132,7 +132,10 @@ module KairosMcp
132
132
 
133
133
  def bump_main_state_counter
134
134
  return unless defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
135
- KairosMcp::SkillSets::MultiLlmReview::MainState.exit_call!
135
+ # v3.24.3: counter-only bump. exit_call! is private; bump_counter!
136
+ # is the public counter-only progress signal (does not touch
137
+ # ts_by_thread).
138
+ KairosMcp::SkillSets::MultiLlmReview::MainState.bump_counter!
136
139
  rescue StandardError
137
140
  nil
138
141
  end
@@ -3,61 +3,123 @@
3
3
  module KairosMcp
4
4
  module SkillSets
5
5
  module MultiLlmReview
6
- # Main-thread liveness state for the worker's pulse mechanism (v0.3 P0-3,
7
- # v0.3.2 C3b). Read by the pulse thread to decide whether worker.tick
8
- # should be touched; written by the main thread around each adapter.call.
6
+ # ──────────────────────────────────────────────────────────────────
7
+ # MainState — main-thread liveness state for the worker pulse
8
+ # ──────────────────────────────────────────────────────────────────
9
9
  #
10
- # ORDERING INVARIANT (v0.3.2 C3b):
11
- # exit_call! increments `counter` FIRST, clears `in_llm_call_since_mono`
12
- # SECOND. A torn two-field read by the pulse thread therefore always
13
- # lands in one of:
14
- # (old_counter, old_ts) — in-call, recent → alive
15
- # (new_counter, old_ts) — counter advanced → alive
16
- # (new_counter, nil) — exit complete → alive via counter
17
- # Never (old_counter, nil), which would look stalled.
10
+ # Tracks per-thread enter/exit timestamps so the pulse thread can tell
11
+ # whether the worker's main path is still progressing through LLM calls.
12
+ # Replaces the v0.3.2 process-global single-ts design which raced under
13
+ # parallel reviewer threads (incident token 5b75ff8c-..., 2026-04-27).
18
14
  #
19
- # MRI atomicity note: integer accessor and flonum Float accessor reads
20
- # are each atomic via GVL-serialized method dispatch; the PAIR is not.
21
- # The invariant above makes pair torn reads benign.
22
- MAIN_STATE = Struct.new(:counter, :in_llm_call_since_mono).new(0, nil)
15
+ # ORDERING / ATOMICITY INVARIANTS (v3.24.3):
16
+ #
17
+ # 1. counter and ts_by_thread mutations AND reads are bracketed by a
18
+ # single Mutex (MUTEX). Readers (snapshot) take the same mutex, so
19
+ # they never observe a torn (counter, ts_by_thread) pair.
20
+ # Replaces the v0.3.2 "ts-first/counter-second" ordering invariant
21
+ # which assumed single-threaded callers.
22
+ #
23
+ # 2. with_call { ... } is the ONLY supported call-bracketing pattern.
24
+ # Direct enter_call!/exit_call! calls are private (see
25
+ # private_class_method below). This guarantees that any exception
26
+ # from the LLM call propagates AFTER ts_by_thread has been cleaned
27
+ # up (via `ensure exit_call!`), preventing per-thread entry leaks.
28
+ #
29
+ # 3. Thread.current.object_id is used as the per-thread key. MRI's
30
+ # object_id stays stable for the lifetime of a Thread object;
31
+ # reuse only happens after the Thread has been GC'd. Within a
32
+ # single with_call invocation, the Thread is on-stack and therefore
33
+ # not GC-eligible, so the key is unique.
34
+ #
35
+ # 4. Mutex#synchronize is Thread.kill-safe under MRI (Ruby's internal
36
+ # `ensure unlock`). The `ensure exit_call!` inside with_call also
37
+ # runs under Thread.kill, so cleanup is guaranteed even if the
38
+ # dispatch thread is forcibly terminated.
39
+ #
40
+ # 5. NON-REENTRANT: nested with_call on the same thread is NOT
41
+ # supported. The inner enter_call! would overwrite the outer
42
+ # ts_by_thread[tid], and the outer ensure exit_call! would delete
43
+ # the entry while the inner call is still tracked. Current
44
+ # multi_llm_review code paths never nest LLM calls; if a future
45
+ # adapter calls another LLM, this contract must be revisited.
46
+ MAIN_STATE = Struct.new(:counter, :ts_by_thread).new(0, {})
47
+ MUTEX = Mutex.new
23
48
 
24
49
  module MainState
25
50
  module_function
26
51
 
27
- # Called immediately before adapter.call enters a blocking LLM syscall.
28
- def enter_call!
29
- MAIN_STATE.in_llm_call_since_mono =
30
- Process.clock_gettime(Process::CLOCK_MONOTONIC)
52
+ # PUBLIC: bracket an LLM call. The block runs between enter_call!
53
+ # and exit_call!; ensure guarantees exit_call! even on exception or
54
+ # Thread.kill. Returns the value of the block.
55
+ def with_call
56
+ enter_call!
57
+ yield
58
+ ensure
59
+ exit_call!
31
60
  end
32
61
 
33
- # Called in the `ensure` block around adapter.call. Must be idempotent:
34
- # if enter_call! never ran (e.g., exception before entry), clearing a
35
- # nil timestamp is a no-op and counter is still bumped so a pulse read
36
- # observes progress.
37
- def exit_call!
38
- MAIN_STATE.counter += 1 # INVARIANT: counter first
39
- MAIN_STATE.in_llm_call_since_mono = nil # then clear timestamp
62
+ # PUBLIC: counter-only progress signal. Used by dispatcher's join
63
+ # cleanup loop where there is no LLM call in flight but the main
64
+ # thread is still doing useful work (joining worker threads). Does
65
+ # NOT touch ts_by_thread.
66
+ def bump_counter!
67
+ MUTEX.synchronize { MAIN_STATE.counter += 1 }
40
68
  end
41
69
 
42
- # Read current state as a plain Array snapshot.
43
- #
44
- # READER ORDERING (mirrors writer's C3b invariant): read ts FIRST,
45
- # counter SECOND. If reader observes ts == nil, the writer MUST
46
- # already have completed counter+=1 (writer writes counter before ts).
47
- # Therefore (old_counter, nil) is unreachable by any reader using
48
- # this snapshot. The pulse thread uses this helper — do not change
49
- # the order without also changing the writer invariant.
70
+ # PUBLIC: snapshot of current state. Returns (counter, in_flight,
71
+ # oldest_ts). in_flight = ts_by_thread.size; oldest_ts = min of
72
+ # in-flight ts (nil if idle). Always atomic via MUTEX.
50
73
  def snapshot
51
- ts = MAIN_STATE.in_llm_call_since_mono
52
- counter = MAIN_STATE.counter
53
- [counter, ts]
74
+ MUTEX.synchronize do
75
+ ts_values = MAIN_STATE.ts_by_thread.values
76
+ [MAIN_STATE.counter, ts_values.size, ts_values.min]
77
+ end
78
+ end
79
+
80
+ # PUBLIC PURE FUNCTION: determine alive state from a snapshot
81
+ # tuple. Extracted so unit tests can table-drive the four branches
82
+ # without forking a worker. The pulse thread calls this with the
83
+ # result of snapshot().
84
+ def compute_alive(counter, last_counter, in_flight, oldest_ts, now_mono, threshold_seconds)
85
+ if counter != last_counter
86
+ true # progress observed
87
+ elsif in_flight > 0 && oldest_ts
88
+ (now_mono - oldest_ts) < threshold_seconds # in-call, recent
89
+ elsif in_flight > 0
90
+ true # in-call but ts not visible (transient)
91
+ else
92
+ false # idle, no progress
93
+ end
54
94
  end
55
95
 
56
- # Reset for tests. NOT safe for runtime use.
96
+ # TEST API: clear all state. NOT safe for runtime use.
57
97
  def reset!
58
- MAIN_STATE.counter = 0
59
- MAIN_STATE.in_llm_call_since_mono = nil
98
+ MUTEX.synchronize do
99
+ MAIN_STATE.counter = 0
100
+ MAIN_STATE.ts_by_thread.clear
101
+ end
102
+ end
103
+
104
+ # ── private (do not call from outside MainState; use with_call) ──
105
+
106
+ def enter_call!
107
+ tid = Thread.current.object_id
108
+ MUTEX.synchronize do
109
+ MAIN_STATE.ts_by_thread[tid] =
110
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
111
+ end
112
+ end
113
+ private_class_method :enter_call!
114
+
115
+ def exit_call!
116
+ tid = Thread.current.object_id
117
+ MUTEX.synchronize do
118
+ MAIN_STATE.counter += 1
119
+ MAIN_STATE.ts_by_thread.delete(tid)
120
+ end
60
121
  end
122
+ private_class_method :exit_call!
61
123
  end
62
124
  end
63
125
  end
@@ -6,17 +6,45 @@ module KairosMcp
6
6
  # Phase 2's polling loop for the detached worker's subprocess_results.json.
7
7
  # Returns one of four outcomes:
8
8
  # :ready — subprocess_results.json parsed successfully
9
- # :crashed — state.subprocess_status terminal OR heartbeat stale
9
+ # :crashed — state.subprocess_status == crashed/self_timed_out
10
+ # OR state == done but results never parseable within
11
+ # wall-clock budget (reason: done_but_no_results)
12
+ # OR heartbeat stale (only while non-terminal state)
10
13
  # OR pid present but no heartbeat within grace OR
11
14
  # no pid/heartbeat within startup grace
12
15
  # :timeout — wall-clock max_wait exceeded with live worker
13
16
  # (raises on unexpected errors from PendingState)
17
+ #
18
+ # v3.24.2: 'done' state now bypasses the heartbeat staleness check.
19
+ # The heartbeat thread is killed in the worker's ensure block, so
20
+ # mtime stops advancing the moment the worker transitions to 'done'.
21
+ # Without this bypass, a transient parse-mid-rename of
22
+ # subprocess_results.json combined with the killed heartbeat could
23
+ # surface a false-positive 'heartbeat_stale' for a successfully
24
+ # completed worker.
14
25
  module WaitForWorker
15
26
  STARTUP_GRACE_DEFAULT = 30
16
27
  HEARTBEAT_STALE_DEFAULT = 15
17
28
  POLL_INTERVAL_DEFAULT = 0.5
18
29
  SUSPEND_JUMP_THRESHOLD = 5.0
19
30
 
31
+ # All possible :crashed outcome reasons. Single source of truth for
32
+ # the crash-reason taxonomy; operators grep these in worker.log and
33
+ # next_action redispatch hints. v3.24.3 declares the constant; usage
34
+ # sites still use string literals (replacement scheduled for v3.24.4
35
+ # to avoid bundling unrelated refactors).
36
+ CRASH_REASONS = %w[
37
+ heartbeat_stale
38
+ heartbeat_never_started
39
+ worker_never_started
40
+ done_but_no_results
41
+ crashed
42
+ self_timed_out
43
+ wait_exhausted
44
+ internal_error
45
+ malformed_state
46
+ ].freeze
47
+
20
48
  module_function
21
49
 
22
50
  def wait(token, opts = {})
@@ -48,17 +76,40 @@ module KairosMcp
48
76
  # transient parse mid-rename — keep polling
49
77
  end
50
78
 
51
- # 2. Explicit crash marker from worker
79
+ # 2. Explicit terminal status from worker
52
80
  state = PendingState.load_state(token)
53
- if state && (state['subprocess_status'] == 'crashed' ||
54
- state['subprocess_status'] == 'self_timed_out')
55
- return {
56
- status: :crashed,
57
- reason: state['crash_reason'] || state['subprocess_status'],
58
- pid: read_pid(token),
59
- pgid: read_pgid_from_file(token),
60
- log_tail: tail_log(token)
61
- }
81
+ if state
82
+ status = state['subprocess_status']
83
+ if status == 'crashed' || status == 'self_timed_out'
84
+ return {
85
+ status: :crashed,
86
+ reason: state['crash_reason'] || status,
87
+ pid: read_pid(token),
88
+ pgid: read_pgid_from_file(token),
89
+ log_tail: tail_log(token)
90
+ }
91
+ end
92
+
93
+ # Worker exited cleanly. subprocess_results.json should be (or
94
+ # imminently become) loadable via step 1 on a subsequent poll.
95
+ # The heartbeat thread is intentionally killed at worker exit
96
+ # (dispatch_worker.rb ensure block), so the heartbeat-stale
97
+ # check below would false-positive. Skip liveness checks while
98
+ # 'done', and rely on step 1 retry until results parse or the
99
+ # wall-clock budget exhausts.
100
+ if status == 'done'
101
+ if now_mono > deadline
102
+ return {
103
+ status: :crashed,
104
+ reason: 'done_but_no_results',
105
+ pid: read_pid(token),
106
+ pgid: read_pgid_from_file(token),
107
+ log_tail: tail_log(token)
108
+ }
109
+ end
110
+ sleep poll_interval
111
+ next
112
+ end
62
113
  end
63
114
 
64
115
  # 3. Heartbeat-based liveness checks
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
+ # v3.24.3: per-thread MainState concurrency tests. Covers the per-thread
4
+ # Hash invariants that fix the v0.3.2 single-ts process-global race
5
+ # (incident token 5b75ff8c-..., 2026-04-27).
6
+
7
+ require 'minitest/autorun'
8
+ require_relative '../lib/multi_llm_review/main_state'
9
+
10
+ module KairosMcp
11
+ module SkillSets
12
+ module MultiLlmReview
13
+ class TestMainStateConcurrency < Minitest::Test
14
+ def setup
15
+ MainState.reset!
16
+ end
17
+
18
+ # T1 enter, T2 enter, T1 exit. Verify oldest_ts becomes T2's ts
19
+ # (not stuck at T1's). This is the exact scenario that v0.3.2 broke
20
+ # under: T1.exit cleared the single global ts while T2 was still
21
+ # in-call.
22
+ def test_oldest_ts_advances_when_first_enter_exits
23
+ enter_order = Queue.new
24
+ can_exit_t1 = Queue.new
25
+ can_exit_t2 = Queue.new
26
+
27
+ t1_ts = nil
28
+ t2_ts = nil
29
+
30
+ t1 = Thread.new do
31
+ MainState.with_call do
32
+ # capture our ts via snapshot
33
+ _, _, oldest_ts = MainState.snapshot
34
+ t1_ts = oldest_ts
35
+ enter_order << :t1
36
+ can_exit_t1.pop # wait for main to release
37
+ end
38
+ end
39
+
40
+ # Wait for t1 to enter
41
+ assert_equal :t1, enter_order.pop
42
+
43
+ t2 = Thread.new do
44
+ MainState.with_call do
45
+ enter_order << :t2
46
+ can_exit_t2.pop
47
+ end
48
+ end
49
+
50
+ # Wait for t2 to enter
51
+ assert_equal :t2, enter_order.pop
52
+
53
+ # Both in flight. Capture snapshot.
54
+ _, in_flight, oldest_ts_both = MainState.snapshot
55
+ assert_equal 2, in_flight
56
+ assert_equal t1_ts, oldest_ts_both, 'oldest_ts is T1 (earliest enter)'
57
+
58
+ # Now grab T2's ts before T1 exits
59
+ # Since T2 entered after T1, T2's ts > T1's ts.
60
+ # After T1 exits, oldest_ts must become T2's ts.
61
+
62
+ can_exit_t1 << :go
63
+ t1.join
64
+
65
+ _, in_flight_after, oldest_ts_after = MainState.snapshot
66
+ assert_equal 1, in_flight_after, 'T2 still in-flight'
67
+ refute_nil oldest_ts_after
68
+ assert oldest_ts_after > t1_ts,
69
+ "oldest_ts must advance past T1's anchor after T1 exits " \
70
+ "(was #{t1_ts}, now #{oldest_ts_after})"
71
+
72
+ can_exit_t2 << :go
73
+ t2.join
74
+
75
+ # Both exited
76
+ counter, in_flight_final, oldest_ts_final = MainState.snapshot
77
+ assert_equal 2, counter
78
+ assert_equal 0, in_flight_final
79
+ assert_nil oldest_ts_final
80
+ end
81
+
82
+ # 4 threads cycling enter/exit 250 times each = 1000 total cycles.
83
+ # Verifies counter and ts_by_thread stay consistent under contention.
84
+ def test_concurrent_with_call_stress
85
+ srand(20260427) # deterministic seed
86
+ n_threads = 4
87
+ cycles_per_thread = 250
88
+ start_at = Time.now
89
+
90
+ threads = n_threads.times.map do
91
+ Thread.new do
92
+ cycles_per_thread.times do
93
+ MainState.with_call { }
94
+ end
95
+ end
96
+ end
97
+ threads.each(&:join)
98
+
99
+ elapsed = Time.now - start_at
100
+ assert elapsed < 10, "stress test took #{elapsed.round(2)}s, budget 10s"
101
+
102
+ counter, in_flight, oldest_ts = MainState.snapshot
103
+ assert_equal n_threads * cycles_per_thread, counter
104
+ assert_equal 0, in_flight, 'ts_by_thread leaked entries'
105
+ assert_nil oldest_ts
106
+ end
107
+
108
+ # If with_call raises mid-block across many threads, ts_by_thread
109
+ # must still be cleaned for every thread.
110
+ def test_concurrent_with_call_exception_cleanup
111
+ n_threads = 4
112
+ threads = n_threads.times.map do |i|
113
+ Thread.new do
114
+ begin
115
+ MainState.with_call { raise "boom from thread #{i}" }
116
+ rescue StandardError
117
+ # expected
118
+ end
119
+ end
120
+ end
121
+ threads.each(&:join)
122
+
123
+ counter, in_flight, oldest_ts = MainState.snapshot
124
+ assert_equal n_threads, counter, 'counter bumps even on exception'
125
+ assert_equal 0, in_flight, 'ts_by_thread must be cleaned on exception'
126
+ assert_nil oldest_ts
127
+ end
128
+
129
+ # bump_counter! may interleave with concurrent with_call (both take MUTEX) but must not
130
+ # corrupt ts_by_thread or under-count counter.
131
+ def test_bump_counter_concurrent_with_with_call
132
+ n_threads = 4
133
+ n_bumps = 100
134
+ n_cycles = 100
135
+
136
+ bump_threads = n_threads.times.map do
137
+ Thread.new { n_bumps.times { MainState.bump_counter! } }
138
+ end
139
+ call_threads = n_threads.times.map do
140
+ Thread.new { n_cycles.times { MainState.with_call { } } }
141
+ end
142
+ (bump_threads + call_threads).each(&:join)
143
+
144
+ counter, in_flight, oldest_ts = MainState.snapshot
145
+ assert_equal n_threads * (n_bumps + n_cycles), counter
146
+ assert_equal 0, in_flight
147
+ assert_nil oldest_ts
148
+ end
149
+ end
150
+ end
151
+ end
152
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ # v3.24.3: pure unit tests for MainState.compute_alive — table-driven
4
+ # coverage of all 4 branches plus threshold boundaries. No worker fork,
5
+ # no thread, no filesystem.
6
+
7
+ require 'minitest/autorun'
8
+ require_relative '../lib/multi_llm_review/main_state'
9
+
10
+ module KairosMcp
11
+ module SkillSets
12
+ module MultiLlmReview
13
+ class TestMainStateAlive < Minitest::Test
14
+ THRESHOLD = 360.0
15
+
16
+ # branch 1: counter advanced
17
+ def test_counter_advanced_is_alive
18
+ assert_equal true,
19
+ MainState.compute_alive(5, 4, 0, nil, 100.0, THRESHOLD)
20
+ assert_equal true,
21
+ MainState.compute_alive(5, 4, 2, 50.0, 1000.0, THRESHOLD)
22
+ end
23
+
24
+ # branch 2: in-call, recent (counter unchanged)
25
+ def test_in_flight_within_threshold_is_alive
26
+ # oldest_ts = 100, now = 100 + 359 = 459 → diff 359 < 360
27
+ assert_equal true,
28
+ MainState.compute_alive(5, 5, 1, 100.0, 459.0, THRESHOLD)
29
+ end
30
+
31
+ def test_in_flight_at_threshold_boundary_is_dead
32
+ # oldest_ts = 100, now = 100 + 360 = 460 → diff 360 NOT < 360
33
+ assert_equal false,
34
+ MainState.compute_alive(5, 5, 1, 100.0, 460.0, THRESHOLD)
35
+ end
36
+
37
+ def test_in_flight_past_threshold_is_dead
38
+ # oldest_ts = 100, now = 100 + 361
39
+ assert_equal false,
40
+ MainState.compute_alive(5, 5, 1, 100.0, 461.0, THRESHOLD)
41
+ end
42
+
43
+ # branch 3: in-call but oldest_ts nil (defensive — unreachable in
44
+ # practice because snapshot is mutex-atomic)
45
+ def test_in_flight_with_nil_ts_is_alive
46
+ assert_equal true,
47
+ MainState.compute_alive(5, 5, 1, nil, 1000.0, THRESHOLD)
48
+ end
49
+
50
+ # branch 4: idle
51
+ def test_idle_no_progress_is_dead
52
+ assert_equal false,
53
+ MainState.compute_alive(5, 5, 0, nil, 1000.0, THRESHOLD)
54
+ end
55
+
56
+ # Counter advance dominates threshold check
57
+ def test_counter_advanced_overrides_stale_ts
58
+ # Even if oldest_ts is way past threshold, counter advance => alive.
59
+ assert_equal true,
60
+ MainState.compute_alive(6, 5, 1, 100.0, 9999.0, THRESHOLD)
61
+ end
62
+
63
+ # First iteration of pulse loop: last_counter = -1, counter = 0,
64
+ # in_flight = 0, ts = nil. Worker just spawned, no calls yet.
65
+ # Counter advanced from -1 to 0 → alive=true.
66
+ def test_first_iteration_with_zero_counter
67
+ assert_equal true,
68
+ MainState.compute_alive(0, -1, 0, nil, 0.0, THRESHOLD)
69
+ end
70
+
71
+ # last_counter == counter == 0, in_flight==0, ts nil → idle, dead
72
+ def test_second_iteration_no_calls_yet
73
+ assert_equal false,
74
+ MainState.compute_alive(0, 0, 0, nil, 5.0, THRESHOLD)
75
+ end
76
+
77
+ # Custom threshold (e.g. lower for testing)
78
+ def test_custom_threshold
79
+ # threshold=10, oldest 100, now 109 → diff 9 < 10 → alive
80
+ assert_equal true,
81
+ MainState.compute_alive(5, 5, 1, 100.0, 109.0, 10.0)
82
+ # threshold=10, oldest 100, now 110 → diff 10 NOT < 10 → dead
83
+ assert_equal false,
84
+ MainState.compute_alive(5, 5, 1, 100.0, 110.0, 10.0)
85
+ end
86
+ end
87
+ end
88
+ end
89
+ end
@@ -168,6 +168,47 @@ module KairosMcp
168
168
  assert_equal 'multi_llm_review', payload['next_action']['tool']
169
169
  end
170
170
 
171
+ # v3.24.2: state == 'done' must NOT be misclassified as
172
+ # heartbeat_stale. Worker kills the heartbeat thread on exit, so
173
+ # mtime stops advancing the moment status becomes 'done'. Without
174
+ # the fix, a stale heartbeat ages past the threshold and wait
175
+ # returns crashed even though the worker completed successfully.
176
+ def test_done_with_stale_heartbeat_does_not_false_positive_crash
177
+ write_state('subprocess_status' => 'done')
178
+ # Heartbeat present but stale (older than 15s threshold).
179
+ hb_path = PendingState.worker_heartbeat_path(@token)
180
+ FileUtils.touch(hb_path)
181
+ File.utime(Time.now - 60, Time.now - 60, hb_path)
182
+ PendingState.write_worker_pid(@token, { 'pid' => Process.pid, 'pgid' => Process.pid })
183
+
184
+ # No subprocess_results.json yet — simulates the parse-mid-rename
185
+ # window. wait should NOT return heartbeat_stale; it should poll
186
+ # until budget exhausts and return done_but_no_results.
187
+ payload = call_wait('max_wait_seconds' => 1)
188
+ assert_equal 'crashed', payload['status']
189
+ assert_equal 'done_but_no_results', payload['crashed_reason']
190
+ refute_equal 'heartbeat_stale', payload['crashed_reason']
191
+ end
192
+
193
+ def test_done_with_results_returns_ready
194
+ write_state('subprocess_status' => 'done')
195
+ # Heartbeat is stale (worker killed its thread) but results are present.
196
+ hb_path = PendingState.worker_heartbeat_path(@token)
197
+ FileUtils.touch(hb_path)
198
+ File.utime(Time.now - 60, Time.now - 60, hb_path)
199
+ PendingState.write_subprocess_results(@token, {
200
+ 'token' => @token,
201
+ 'completed_at' => Time.now.iso8601,
202
+ 'elapsed_seconds' => 12.3,
203
+ 'results' => [{ 'role_label' => 'r1', 'status' => 'success' }],
204
+ 'exit_summary' => { 'successful' => 1, 'errored' => 0, 'skipped' => 0 }
205
+ })
206
+
207
+ payload = call_wait('max_wait_seconds' => 1)
208
+ assert_equal 'ready', payload['status']
209
+ assert_equal 1, payload['subprocess_done']
210
+ end
211
+
171
212
  # ── hard cap ─────────────────────────────────────────────────────
172
213
  # Hard cap is enforced before WaitForWorker is invoked. We verify the
173
214
  # clamping logic without actually waiting for the cap by checking the
@@ -328,80 +328,80 @@ module KairosMcp
328
328
  end
329
329
  end
330
330
 
331
- # ── MainState ordering invariant ─────────────────────────────────
331
+ # ── MainState (v3.24.3 per-thread invariants) ───────────────────
332
+ # Replaces v0.3.2 C3b ordering tests. v3.24.3 uses Mutex-bracketed
333
+ # per-thread ts_by_thread Hash; the prior "counter-before-ts" ordering
334
+ # invariant no longer applies (Mutex provides atomicity). Tests now
335
+ # cover: snapshot 3-tuple shape, with_call ensure semantics, private
336
+ # enter_call!/exit_call!, and bump_counter! semantics. Comprehensive
337
+ # parallel-thread coverage is in test_main_state.rb.
332
338
 
333
339
  class TestMainState < Minitest::Test
334
340
  def setup
335
341
  MainState.reset!
336
342
  end
337
343
 
338
- def test_initial_state
339
- assert_equal [0, nil], MainState.snapshot
344
+ def test_initial_snapshot_is_idle
345
+ counter, in_flight, oldest_ts = MainState.snapshot
346
+ assert_equal 0, counter
347
+ assert_equal 0, in_flight
348
+ assert_nil oldest_ts
340
349
  end
341
350
 
342
- def test_enter_call_sets_monotonic_timestamp
343
- MainState.enter_call!
344
- counter, ts = MainState.snapshot
345
- assert_equal 0, counter, 'enter_call! should not bump counter'
346
- refute_nil ts
347
- assert_kind_of Float, ts
351
+ def test_with_call_brackets_counter_and_ts
352
+ before, in_flight_before, ts_before = MainState.snapshot
353
+ assert_equal 0, in_flight_before
354
+ assert_nil ts_before
355
+
356
+ ts_during_block = nil
357
+ MainState.with_call do
358
+ _c, in_flight, oldest_ts = MainState.snapshot
359
+ ts_during_block = oldest_ts
360
+ assert_equal 1, in_flight
361
+ refute_nil oldest_ts
362
+ end
363
+
364
+ counter, in_flight, oldest_ts = MainState.snapshot
365
+ assert_equal before + 1, counter, 'with_call must bump counter on exit'
366
+ assert_equal 0, in_flight
367
+ assert_nil oldest_ts, 'ts must be cleared after with_call returns'
368
+ assert_kind_of Float, ts_during_block
348
369
  end
349
370
 
350
- def test_exit_call_increments_counter_first
351
- MainState.enter_call!
352
- before = MAIN_STATE.counter
353
- MainState.exit_call!
354
- counter, ts = MainState.snapshot
355
- assert_equal before + 1, counter
356
- assert_nil ts
371
+ def test_with_call_ensures_cleanup_on_exception
372
+ assert_raises(RuntimeError) do
373
+ MainState.with_call { raise 'boom' }
374
+ end
375
+ counter, in_flight, oldest_ts = MainState.snapshot
376
+ assert_equal 1, counter, 'counter still bumps on exception (ensure)'
377
+ assert_equal 0, in_flight, 'ts_by_thread must be cleaned on exception'
378
+ assert_nil oldest_ts
357
379
  end
358
380
 
359
- def test_exit_call_idempotent_without_enter
360
- MainState.exit_call!
361
- MainState.exit_call!
362
- counter, ts = MainState.snapshot
363
- assert_equal 2, counter
364
- assert_nil ts
381
+ def test_bump_counter_does_not_touch_ts_by_thread
382
+ before_counter, before_in_flight, _ = MainState.snapshot
383
+ MainState.bump_counter!
384
+ counter, in_flight, oldest_ts = MainState.snapshot
385
+ assert_equal before_counter + 1, counter
386
+ assert_equal before_in_flight, in_flight, 'bump_counter! must not touch ts_by_thread'
387
+ assert_nil oldest_ts
365
388
  end
366
389
 
367
- def test_multiple_enter_exit_cycles
368
- 3.times do
369
- MainState.enter_call!
370
- refute_nil MAIN_STATE.in_llm_call_since_mono
371
- MainState.exit_call!
372
- assert_nil MAIN_STATE.in_llm_call_since_mono
373
- end
374
- assert_equal 3, MAIN_STATE.counter
390
+ def test_enter_call_is_private
391
+ assert_raises(NoMethodError) { MainState.enter_call! }
375
392
  end
376
393
 
377
- # Ordering invariant (v0.3.2 C3b): exit_call! writes counter BEFORE
378
- # clearing in_llm_call_since_mono. Verified by instrumenting the
379
- # setter methods to record their invocation order — this avoids the
380
- # flakiness of GVL-dependent contention tests.
381
- def test_exit_call_writes_counter_before_clearing_timestamp
394
+ def test_exit_call_is_private
395
+ assert_raises(NoMethodError) { MainState.exit_call! }
396
+ end
397
+
398
+ def test_reset_clears_all_state
399
+ MainState.with_call { } # bumps counter to 1
382
400
  MainState.reset!
383
- MainState.enter_call!
384
- calls = []
385
- MAIN_STATE.singleton_class.class_eval do
386
- alias_method :_orig_counter=, :counter=
387
- alias_method :_orig_ts=, :in_llm_call_since_mono=
388
- define_method(:counter=) { |v| calls << :counter; send(:_orig_counter=, v) }
389
- define_method(:in_llm_call_since_mono=) { |v| calls << :ts; send(:_orig_ts=, v) }
390
- end
391
- begin
392
- MainState.exit_call!
393
- ensure
394
- MAIN_STATE.singleton_class.class_eval do
395
- remove_method :counter=
396
- remove_method :in_llm_call_since_mono=
397
- alias_method :counter=, :_orig_counter=
398
- alias_method :in_llm_call_since_mono=, :_orig_ts=
399
- remove_method :_orig_counter=
400
- remove_method :_orig_ts=
401
- end
402
- end
403
- assert_equal %i[counter ts], calls,
404
- 'exit_call! must write counter BEFORE clearing ts (C3b ordering invariant)'
401
+ counter, in_flight, oldest_ts = MainState.snapshot
402
+ assert_equal 0, counter
403
+ assert_equal 0, in_flight
404
+ assert_nil oldest_ts
405
405
  end
406
406
  end
407
407
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kairos-chain
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.24.1
4
+ version: 3.24.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masaomi Hatakeyama
@@ -495,6 +495,8 @@ files:
495
495
  - templates/skillsets/multi_llm_review/skillset.json
496
496
  - templates/skillsets/multi_llm_review/test/test_dispatcher_usage.rb
497
497
  - templates/skillsets/multi_llm_review/test/test_feedback_formatter.rb
498
+ - templates/skillsets/multi_llm_review/test/test_main_state.rb
499
+ - templates/skillsets/multi_llm_review/test/test_main_state_alive.rb
498
500
  - templates/skillsets/multi_llm_review/test/test_multi_llm_review.rb
499
501
  - templates/skillsets/multi_llm_review/test/test_multi_llm_review_bundle.rb
500
502
  - templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb