kairos-chain 3.24.1 → 3.24.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kairos_mcp/version.rb +1 -1
- data/templates/skillsets/llm_client/lib/llm_client/headless.rb +9 -9
- data/templates/skillsets/multi_llm_review/bin/dispatch_worker.rb +23 -19
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/dispatcher.rb +4 -1
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/main_state.rb +102 -40
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/wait_for_worker.rb +62 -11
- data/templates/skillsets/multi_llm_review/test/test_main_state.rb +152 -0
- data/templates/skillsets/multi_llm_review/test/test_main_state_alive.rb +89 -0
- data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb +41 -0
- data/templates/skillsets/multi_llm_review/test/test_pending_state_v3.rb +57 -57
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fb099806eedb198afc167cbb810110ab7bffac2fceea3684eb14a24e3e7b46fb
|
|
4
|
+
data.tar.gz: 61223eff1c6cd146eea47d44ab0ee95506a8baa6e15a0f1f5c2e929e9d44a5b1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 91d4a86fc2df06025fefb5f0252e9f019d85cb9fc31f6ceec0d2e0bbe1209c8d24277e7448faace8ced8ba28c889dee0af7f68fbf50d6dda6da27bbb3366e588
|
|
7
|
+
data.tar.gz: b6847081bc03d40d2ae77ec317d52955ba7fdc2597b91d10addba2a052067339171542316810d8e1dc95c5f4eb35eb5963425f7f82961ae13bbc5e3bfa8e2f50
|
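data/lib/kairos_mcp/version.rb
CHANGED
|
[version.rb hunk (+1 -1) is missing from this extract]
|
data/templates/skillsets/llm_client/lib/llm_client/headless.rb
CHANGED
|
@@ -53,16 +53,16 @@ module KairosMcp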
|
|
|
53
53
|
# from codex 5.5 + cursor).
|
|
54
54
|
# Guarded by `defined?` so non-worker consumers (MCP direct call)
|
|
55
55
|
# that never load multi_llm_review/main_state don't NameError.
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
result =
|
|
62
|
-
|
|
63
|
-
if bracket
|
|
64
|
-
KairosMcp::SkillSets::MultiLlmReview::MainState.exit_call!
|
|
56
|
+
# v3.24.3: use with_call to enforce ensure-bracketed enter/exit.
|
|
57
|
+
# enter_call!/exit_call! are now private; with_call is the only
|
|
58
|
+
# supported pattern. defined?-guard preserved so non-worker
|
|
59
|
+
# consumers (MCP direct call) don't NameError.
|
|
60
|
+
if defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
|
|
61
|
+
result = KairosMcp::SkillSets::MultiLlmReview::MainState.with_call do
|
|
62
|
+
CallRouter.perform(args, @config)
|
|
65
63
|
end
|
|
64
|
+
else
|
|
65
|
+
result = CallRouter.perform(args, @config)
|
|
66
66
|
end
|
|
67
67
|
# Shape matches BaseTool#text_content (symbol :text key) — what
|
|
68
68
|
# Dispatcher consumes today via `b[:text] || b['text']`.
|
|
data/templates/skillsets/multi_llm_review/bin/dispatch_worker.rb
CHANGED
|
@@ -103,28 +103,31 @@ def self_timeout_at_from_state(token, request)
|
|
|
103
103
|
end
|
|
104
104
|
end
|
|
105
105
|
|
|
106
|
-
# Pulse thread: touches worker.tick IFF main is alive
|
|
107
|
-
#
|
|
106
|
+
# Pulse thread: touches worker.tick IFF main is alive. v3.24.3 uses the
|
|
107
|
+
# per-thread (counter, in_flight, oldest_ts) snapshot from MainState and
|
|
108
|
+
# delegates the alive decision to MainState.compute_alive (pure function,
|
|
109
|
+
# unit-testable). Emits a diagnostic log line every ~5s so future incidents
|
|
110
|
+
# can be diagnosed from worker.log without filesystem mtime archaeology.
|
|
108
111
|
pulse_thread = Thread.new do
|
|
109
112
|
begin
|
|
110
113
|
last_counter = -1
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
call_margin = 60
|
|
114
|
+
log_emit_at = 0
|
|
115
|
+
threshold = 360 # max_call_t (300) + call_margin (60)
|
|
114
116
|
loop do
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if counter != last_counter
|
|
121
|
-
true
|
|
122
|
-
elsif ts
|
|
123
|
-
(Process.clock_gettime(Process::CLOCK_MONOTONIC) - ts) < (max_call_t + call_margin)
|
|
124
|
-
else
|
|
125
|
-
false
|
|
126
|
-
end
|
|
117
|
+
counter, in_flight, oldest_ts = MLR::MainState.snapshot
|
|
118
|
+
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
119
|
+
alive = MLR::MainState.compute_alive(
|
|
120
|
+
counter, last_counter, in_flight, oldest_ts, now, threshold
|
|
121
|
+
)
|
|
127
122
|
FileUtils.touch(PS.worker_tick_path(token)) if alive
|
|
123
|
+
|
|
124
|
+
if now - log_emit_at >= 5
|
|
125
|
+
oldest_age = oldest_ts ? (now - oldest_ts).round(1) : nil
|
|
126
|
+
warn "[pulse] counter=#{counter} in_flight=#{in_flight} " \
|
|
127
|
+
"oldest_age=#{oldest_age || 'nil'}s alive=#{alive}"
|
|
128
|
+
log_emit_at = now
|
|
129
|
+
end
|
|
130
|
+
|
|
128
131
|
last_counter = counter
|
|
129
132
|
sleep 2
|
|
130
133
|
end
|
|
@@ -263,8 +266,9 @@ begin
|
|
|
263
266
|
review_context: request['review_context'] || 'independent'
|
|
264
267
|
)
|
|
265
268
|
|
|
266
|
-
#
|
|
267
|
-
|
|
269
|
+
# v3.24.3: counter-only signal (no enter_call!/exit_call! pair). bump_counter!
|
|
270
|
+
# advances pulse's progress signal without touching ts_by_thread.
|
|
271
|
+
MLR::MainState.bump_counter!
|
|
268
272
|
check_shutdown!(token)
|
|
269
273
|
|
|
270
274
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0
|
|
data/templates/skillsets/multi_llm_review/lib/multi_llm_review/dispatcher.rb
CHANGED
|
@@ -132,7 +132,10 @@ module KairosMcp
|
|
|
132
132
|
|
|
133
133
|
def bump_main_state_counter
|
|
134
134
|
return unless defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
|
|
135
|
-
|
|
135
|
+
# v3.24.3: counter-only bump. exit_call! is private; bump_counter!
|
|
136
|
+
# is the public counter-only progress signal (does not touch
|
|
137
|
+
# ts_by_thread).
|
|
138
|
+
KairosMcp::SkillSets::MultiLlmReview::MainState.bump_counter!
|
|
136
139
|
rescue StandardError
|
|
137
140
|
nil
|
|
138
141
|
end
|
|
data/templates/skillsets/multi_llm_review/lib/multi_llm_review/main_state.rb
CHANGED
|
@@ -3,61 +3,123 @@
|
|
|
3
3
|
module KairosMcp
|
|
4
4
|
module SkillSets
|
|
5
5
|
module MultiLlmReview
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
6
|
+
# ──────────────────────────────────────────────────────────────────
|
|
7
|
+
# MainState — main-thread liveness state for the worker pulse
|
|
8
|
+
# ──────────────────────────────────────────────────────────────────
|
|
9
9
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
# (old_counter, old_ts) — in-call, recent → alive
|
|
15
|
-
# (new_counter, old_ts) — counter advanced → alive
|
|
16
|
-
# (new_counter, nil) — exit complete → alive via counter
|
|
17
|
-
# Never (old_counter, nil), which would look stalled.
|
|
10
|
+
# Tracks per-thread enter/exit timestamps so the pulse thread can tell
|
|
11
|
+
# whether the worker's main path is still progressing through LLM calls.
|
|
12
|
+
# Replaces the v0.3.2 process-global single-ts design which raced under
|
|
13
|
+
# parallel reviewer threads (incident token 5b75ff8c-..., 2026-04-27).
|
|
18
14
|
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
|
|
15
|
+
# ORDERING / ATOMICITY INVARIANTS (v3.24.3):
|
|
16
|
+
#
|
|
17
|
+
# 1. counter and ts_by_thread mutations AND reads are bracketed by a
|
|
18
|
+
# single Mutex (MUTEX). Readers (snapshot) take the same mutex, so
|
|
19
|
+
# they never observe a torn (counter, ts_by_thread) pair.
|
|
20
|
+
# Replaces the v0.3.2 "ts-first/counter-second" ordering invariant
|
|
21
|
+
# which assumed single-threaded callers.
|
|
22
|
+
#
|
|
23
|
+
# 2. with_call { ... } is the ONLY supported call-bracketing pattern.
|
|
24
|
+
# Direct enter_call!/exit_call! calls are private (see
|
|
25
|
+
# private_class_method below). This guarantees that any exception
|
|
26
|
+
# from the LLM call propagates AFTER ts_by_thread has been cleaned
|
|
27
|
+
# up (via `ensure exit_call!`), preventing per-thread entry leaks.
|
|
28
|
+
#
|
|
29
|
+
# 3. Thread.current.object_id is used as the per-thread key. MRI's
|
|
30
|
+
# object_id stays stable for the lifetime of a Thread object;
|
|
31
|
+
# reuse only happens after the Thread has been GC'd. Within a
|
|
32
|
+
# single with_call invocation, the Thread is on-stack and therefore
|
|
33
|
+
# not GC-eligible, so the key is unique.
|
|
34
|
+
#
|
|
35
|
+
# 4. Mutex#synchronize is Thread.kill-safe under MRI (Ruby's internal
|
|
36
|
+
# `ensure unlock`). The `ensure exit_call!` inside with_call also
|
|
37
|
+
# runs under Thread.kill, so cleanup is guaranteed even if the
|
|
38
|
+
# dispatch thread is forcibly terminated.
|
|
39
|
+
#
|
|
40
|
+
# 5. NON-REENTRANT: nested with_call on the same thread is NOT
|
|
41
|
+
# supported. The inner enter_call! would overwrite the outer
|
|
42
|
+
# ts_by_thread[tid], and the outer ensure exit_call! would delete
|
|
43
|
+
# the entry while the inner call is still tracked. Current
|
|
44
|
+
# multi_llm_review code paths never nest LLM calls; if a future
|
|
45
|
+
# adapter calls another LLM, this contract must be revisited.
|
|
46
|
+
MAIN_STATE = Struct.new(:counter, :ts_by_thread).new(0, {})
|
|
47
|
+
MUTEX = Mutex.new
|
|
23
48
|
|
|
24
49
|
module MainState
|
|
25
50
|
module_function
|
|
26
51
|
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
52
|
+
# PUBLIC: bracket an LLM call. The block runs between enter_call!
|
|
53
|
+
# and exit_call!; ensure guarantees exit_call! even on exception or
|
|
54
|
+
# Thread.kill. Returns the value of the block.
|
|
55
|
+
def with_call
|
|
56
|
+
enter_call!
|
|
57
|
+
yield
|
|
58
|
+
ensure
|
|
59
|
+
exit_call!
|
|
31
60
|
end
|
|
32
61
|
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
#
|
|
36
|
-
#
|
|
37
|
-
def exit_call!
|
|
38
|
-
MAIN_STATE.counter += 1
|
|
39
|
-
MAIN_STATE.in_llm_call_since_mono = nil # then clear timestamp
|
|
62
|
+
# PUBLIC: counter-only progress signal. Used by dispatcher's join
|
|
63
|
+
# cleanup loop where there is no LLM call in flight but the main
|
|
64
|
+
# thread is still doing useful work (joining worker threads). Does
|
|
65
|
+
# NOT touch ts_by_thread.
|
|
66
|
+
def bump_counter!
|
|
67
|
+
MUTEX.synchronize { MAIN_STATE.counter += 1 }
|
|
40
68
|
end
|
|
41
69
|
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
# counter SECOND. If reader observes ts == nil, the writer MUST
|
|
46
|
-
# already have completed counter+=1 (writer writes counter before ts).
|
|
47
|
-
# Therefore (old_counter, nil) is unreachable by any reader using
|
|
48
|
-
# this snapshot. The pulse thread uses this helper — do not change
|
|
49
|
-
# the order without also changing the writer invariant.
|
|
70
|
+
# PUBLIC: snapshot of current state. Returns (counter, in_flight,
|
|
71
|
+
# oldest_ts). in_flight = ts_by_thread.size; oldest_ts = min of
|
|
72
|
+
# in-flight ts (nil if idle). Always atomic via MUTEX.
|
|
50
73
|
def snapshot
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
74
|
+
MUTEX.synchronize do
|
|
75
|
+
ts_values = MAIN_STATE.ts_by_thread.values
|
|
76
|
+
[MAIN_STATE.counter, ts_values.size, ts_values.min]
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# PUBLIC PURE FUNCTION: determine alive state from a snapshot
|
|
81
|
+
# tuple. Extracted so unit tests can table-drive the four branches
|
|
82
|
+
# without forking a worker. The pulse thread calls this with the
|
|
83
|
+
# result of snapshot().
|
|
84
|
+
def compute_alive(counter, last_counter, in_flight, oldest_ts, now_mono, threshold_seconds)
|
|
85
|
+
if counter != last_counter
|
|
86
|
+
true # progress observed
|
|
87
|
+
elsif in_flight > 0 && oldest_ts
|
|
88
|
+
(now_mono - oldest_ts) < threshold_seconds # in-call, recent
|
|
89
|
+
elsif in_flight > 0
|
|
90
|
+
true # in-call but ts not visible (transient)
|
|
91
|
+
else
|
|
92
|
+
false # idle, no progress
|
|
93
|
+
end
|
|
54
94
|
end
|
|
55
95
|
|
|
56
|
-
#
|
|
96
|
+
# TEST API: clear all state. NOT safe for runtime use.
|
|
57
97
|
def reset!
|
|
58
|
-
|
|
59
|
-
|
|
98
|
+
MUTEX.synchronize do
|
|
99
|
+
MAIN_STATE.counter = 0
|
|
100
|
+
MAIN_STATE.ts_by_thread.clear
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# ── private (do not call from outside MainState; use with_call) ──
|
|
105
|
+
|
|
106
|
+
def enter_call!
|
|
107
|
+
tid = Thread.current.object_id
|
|
108
|
+
MUTEX.synchronize do
|
|
109
|
+
MAIN_STATE.ts_by_thread[tid] =
|
|
110
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
private_class_method :enter_call!
|
|
114
|
+
|
|
115
|
+
def exit_call!
|
|
116
|
+
tid = Thread.current.object_id
|
|
117
|
+
MUTEX.synchronize do
|
|
118
|
+
MAIN_STATE.counter += 1
|
|
119
|
+
MAIN_STATE.ts_by_thread.delete(tid)
|
|
120
|
+
end
|
|
60
121
|
end
|
|
122
|
+
private_class_method :exit_call!
|
|
61
123
|
end
|
|
62
124
|
end
|
|
63
125
|
end
|
|
data/templates/skillsets/multi_llm_review/lib/multi_llm_review/wait_for_worker.rb
CHANGED
|
@@ -6,17 +6,45 @@ module KairosMcp
|
|
|
6
6
|
# Phase 2's polling loop for the detached worker's subprocess_results.json.
|
|
7
7
|
# Returns one of four outcomes:
|
|
8
8
|
# :ready — subprocess_results.json parsed successfully
|
|
9
|
-
# :crashed — state.subprocess_status
|
|
9
|
+
# :crashed — state.subprocess_status == crashed/self_timed_out
|
|
10
|
+
# OR state == done but results never parseable within
|
|
11
|
+
# wall-clock budget (reason: done_but_no_results)
|
|
12
|
+
# OR heartbeat stale (only while non-terminal state)
|
|
10
13
|
# OR pid present but no heartbeat within grace OR
|
|
11
14
|
# no pid/heartbeat within startup grace
|
|
12
15
|
# :timeout — wall-clock max_wait exceeded with live worker
|
|
13
16
|
# (raises on unexpected errors from PendingState)
|
|
17
|
+
#
|
|
18
|
+
# v3.24.2: 'done' state now bypasses the heartbeat staleness check.
|
|
19
|
+
# The heartbeat thread is killed in the worker's ensure block, so
|
|
20
|
+
# mtime stops advancing the moment the worker transitions to 'done'.
|
|
21
|
+
# Without this bypass, a transient parse-mid-rename of
|
|
22
|
+
# subprocess_results.json combined with the killed heartbeat could
|
|
23
|
+
# surface a false-positive 'heartbeat_stale' for a successfully
|
|
24
|
+
# completed worker.
|
|
14
25
|
module WaitForWorker
|
|
15
26
|
STARTUP_GRACE_DEFAULT = 30
|
|
16
27
|
HEARTBEAT_STALE_DEFAULT = 15
|
|
17
28
|
POLL_INTERVAL_DEFAULT = 0.5
|
|
18
29
|
SUSPEND_JUMP_THRESHOLD = 5.0
|
|
19
30
|
|
|
31
|
+
# All possible :crashed outcome reasons. Single source of truth for
|
|
32
|
+
# the crash-reason taxonomy; operators grep these in worker.log and
|
|
33
|
+
# next_action redispatch hints. v3.24.3 declares the constant; usage
|
|
34
|
+
# sites still use string literals (replacement scheduled for v3.24.4
|
|
35
|
+
# to avoid bundling unrelated refactors).
|
|
36
|
+
CRASH_REASONS = %w[
|
|
37
|
+
heartbeat_stale
|
|
38
|
+
heartbeat_never_started
|
|
39
|
+
worker_never_started
|
|
40
|
+
done_but_no_results
|
|
41
|
+
crashed
|
|
42
|
+
self_timed_out
|
|
43
|
+
wait_exhausted
|
|
44
|
+
internal_error
|
|
45
|
+
malformed_state
|
|
46
|
+
].freeze
|
|
47
|
+
|
|
20
48
|
module_function
|
|
21
49
|
|
|
22
50
|
def wait(token, opts = {})
|
|
@@ -48,17 +76,40 @@ module KairosMcp
|
|
|
48
76
|
# transient parse mid-rename — keep polling
|
|
49
77
|
end
|
|
50
78
|
|
|
51
|
-
# 2. Explicit
|
|
79
|
+
# 2. Explicit terminal status from worker
|
|
52
80
|
state = PendingState.load_state(token)
|
|
53
|
-
if state
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
81
|
+
if state
|
|
82
|
+
status = state['subprocess_status']
|
|
83
|
+
if status == 'crashed' || status == 'self_timed_out'
|
|
84
|
+
return {
|
|
85
|
+
status: :crashed,
|
|
86
|
+
reason: state['crash_reason'] || status,
|
|
87
|
+
pid: read_pid(token),
|
|
88
|
+
pgid: read_pgid_from_file(token),
|
|
89
|
+
log_tail: tail_log(token)
|
|
90
|
+
}
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Worker exited cleanly. subprocess_results.json should be (or
|
|
94
|
+
# imminently become) loadable via step 1 on a subsequent poll.
|
|
95
|
+
# The heartbeat thread is intentionally killed at worker exit
|
|
96
|
+
# (dispatch_worker.rb ensure block), so the heartbeat-stale
|
|
97
|
+
# check below would false-positive. Skip liveness checks while
|
|
98
|
+
# 'done', and rely on step 1 retry until results parse or the
|
|
99
|
+
# wall-clock budget exhausts.
|
|
100
|
+
if status == 'done'
|
|
101
|
+
if now_mono > deadline
|
|
102
|
+
return {
|
|
103
|
+
status: :crashed,
|
|
104
|
+
reason: 'done_but_no_results',
|
|
105
|
+
pid: read_pid(token),
|
|
106
|
+
pgid: read_pgid_from_file(token),
|
|
107
|
+
log_tail: tail_log(token)
|
|
108
|
+
}
|
|
109
|
+
end
|
|
110
|
+
sleep poll_interval
|
|
111
|
+
next
|
|
112
|
+
end
|
|
62
113
|
end
|
|
63
114
|
|
|
64
115
|
# 3. Heartbeat-based liveness checks
|
|
data/templates/skillsets/multi_llm_review/test/test_main_state.rb
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# v3.24.3: per-thread MainState concurrency tests. Covers the per-thread
|
|
4
|
+
# Hash invariants that fix the v0.3.2 single-ts process-global race
|
|
5
|
+
# (incident token 5b75ff8c-..., 2026-04-27).
|
|
6
|
+
|
|
7
|
+
require 'minitest/autorun'
|
|
8
|
+
require_relative '../lib/multi_llm_review/main_state'
|
|
9
|
+
|
|
10
|
+
module KairosMcp
|
|
11
|
+
module SkillSets
|
|
12
|
+
module MultiLlmReview
|
|
13
|
+
class TestMainStateConcurrency < Minitest::Test
|
|
14
|
+
def setup
|
|
15
|
+
MainState.reset!
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# T1 enter, T2 enter, T1 exit. Verify oldest_ts becomes T2's ts
|
|
19
|
+
# (not stuck at T1's). This is the exact scenario that v0.3.2 broke
|
|
20
|
+
# under: T1.exit cleared the single global ts while T2 was still
|
|
21
|
+
# in-call.
|
|
22
|
+
def test_oldest_ts_advances_when_first_enter_exits
|
|
23
|
+
enter_order = Queue.new
|
|
24
|
+
can_exit_t1 = Queue.new
|
|
25
|
+
can_exit_t2 = Queue.new
|
|
26
|
+
|
|
27
|
+
t1_ts = nil
|
|
28
|
+
t2_ts = nil
|
|
29
|
+
|
|
30
|
+
t1 = Thread.new do
|
|
31
|
+
MainState.with_call do
|
|
32
|
+
# capture our ts via snapshot
|
|
33
|
+
_, _, oldest_ts = MainState.snapshot
|
|
34
|
+
t1_ts = oldest_ts
|
|
35
|
+
enter_order << :t1
|
|
36
|
+
can_exit_t1.pop # wait for main to release
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Wait for t1 to enter
|
|
41
|
+
assert_equal :t1, enter_order.pop
|
|
42
|
+
|
|
43
|
+
t2 = Thread.new do
|
|
44
|
+
MainState.with_call do
|
|
45
|
+
enter_order << :t2
|
|
46
|
+
can_exit_t2.pop
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Wait for t2 to enter
|
|
51
|
+
assert_equal :t2, enter_order.pop
|
|
52
|
+
|
|
53
|
+
# Both in flight. Capture snapshot.
|
|
54
|
+
_, in_flight, oldest_ts_both = MainState.snapshot
|
|
55
|
+
assert_equal 2, in_flight
|
|
56
|
+
assert_equal t1_ts, oldest_ts_both, 'oldest_ts is T1 (earliest enter)'
|
|
57
|
+
|
|
58
|
+
# Now grab T2's ts before T1 exits
|
|
59
|
+
# Since T2 entered after T1, T2's ts > T1's ts.
|
|
60
|
+
# After T1 exits, oldest_ts must become T2's ts.
|
|
61
|
+
|
|
62
|
+
can_exit_t1 << :go
|
|
63
|
+
t1.join
|
|
64
|
+
|
|
65
|
+
_, in_flight_after, oldest_ts_after = MainState.snapshot
|
|
66
|
+
assert_equal 1, in_flight_after, 'T2 still in-flight'
|
|
67
|
+
refute_nil oldest_ts_after
|
|
68
|
+
assert oldest_ts_after > t1_ts,
|
|
69
|
+
"oldest_ts must advance past T1's anchor after T1 exits " \
|
|
70
|
+
"(was #{t1_ts}, now #{oldest_ts_after})"
|
|
71
|
+
|
|
72
|
+
can_exit_t2 << :go
|
|
73
|
+
t2.join
|
|
74
|
+
|
|
75
|
+
# Both exited
|
|
76
|
+
counter, in_flight_final, oldest_ts_final = MainState.snapshot
|
|
77
|
+
assert_equal 2, counter
|
|
78
|
+
assert_equal 0, in_flight_final
|
|
79
|
+
assert_nil oldest_ts_final
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# 4 threads cycling enter/exit 250 times each = 1000 total cycles.
|
|
83
|
+
# Verifies counter and ts_by_thread stay consistent under contention.
|
|
84
|
+
def test_concurrent_with_call_stress
|
|
85
|
+
srand(20260427) # deterministic seed
|
|
86
|
+
n_threads = 4
|
|
87
|
+
cycles_per_thread = 250
|
|
88
|
+
start_at = Time.now
|
|
89
|
+
|
|
90
|
+
threads = n_threads.times.map do
|
|
91
|
+
Thread.new do
|
|
92
|
+
cycles_per_thread.times do
|
|
93
|
+
MainState.with_call { }
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
threads.each(&:join)
|
|
98
|
+
|
|
99
|
+
elapsed = Time.now - start_at
|
|
100
|
+
assert elapsed < 10, "stress test took #{elapsed.round(2)}s, budget 10s"
|
|
101
|
+
|
|
102
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
103
|
+
assert_equal n_threads * cycles_per_thread, counter
|
|
104
|
+
assert_equal 0, in_flight, 'ts_by_thread leaked entries'
|
|
105
|
+
assert_nil oldest_ts
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# If with_call raises mid-block across many threads, ts_by_thread
|
|
109
|
+
# must still be cleaned for every thread.
|
|
110
|
+
def test_concurrent_with_call_exception_cleanup
|
|
111
|
+
n_threads = 4
|
|
112
|
+
threads = n_threads.times.map do |i|
|
|
113
|
+
Thread.new do
|
|
114
|
+
begin
|
|
115
|
+
MainState.with_call { raise "boom from thread #{i}" }
|
|
116
|
+
rescue StandardError
|
|
117
|
+
# expected
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
threads.each(&:join)
|
|
122
|
+
|
|
123
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
124
|
+
assert_equal n_threads, counter, 'counter bumps even on exception'
|
|
125
|
+
assert_equal 0, in_flight, 'ts_by_thread must be cleaned on exception'
|
|
126
|
+
assert_nil oldest_ts
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# bump_counter! is racy with concurrent with_call but must not
|
|
130
|
+
# corrupt ts_by_thread or under-count counter.
|
|
131
|
+
def test_bump_counter_concurrent_with_with_call
|
|
132
|
+
n_threads = 4
|
|
133
|
+
n_bumps = 100
|
|
134
|
+
n_cycles = 100
|
|
135
|
+
|
|
136
|
+
bump_threads = n_threads.times.map do
|
|
137
|
+
Thread.new { n_bumps.times { MainState.bump_counter! } }
|
|
138
|
+
end
|
|
139
|
+
call_threads = n_threads.times.map do
|
|
140
|
+
Thread.new { n_cycles.times { MainState.with_call { } } }
|
|
141
|
+
end
|
|
142
|
+
(bump_threads + call_threads).each(&:join)
|
|
143
|
+
|
|
144
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
145
|
+
assert_equal n_threads * (n_bumps + n_cycles), counter
|
|
146
|
+
assert_equal 0, in_flight
|
|
147
|
+
assert_nil oldest_ts
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
data/templates/skillsets/multi_llm_review/test/test_main_state_alive.rb
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# v3.24.3: pure unit tests for MainState.compute_alive — table-driven
|
|
4
|
+
# coverage of all 4 branches plus threshold boundaries. No worker fork,
|
|
5
|
+
# no thread, no filesystem.
|
|
6
|
+
|
|
7
|
+
require 'minitest/autorun'
|
|
8
|
+
require_relative '../lib/multi_llm_review/main_state'
|
|
9
|
+
|
|
10
|
+
module KairosMcp
|
|
11
|
+
module SkillSets
|
|
12
|
+
module MultiLlmReview
|
|
13
|
+
class TestMainStateAlive < Minitest::Test
|
|
14
|
+
THRESHOLD = 360.0
|
|
15
|
+
|
|
16
|
+
# branch 1: counter advanced
|
|
17
|
+
def test_counter_advanced_is_alive
|
|
18
|
+
assert_equal true,
|
|
19
|
+
MainState.compute_alive(5, 4, 0, nil, 100.0, THRESHOLD)
|
|
20
|
+
assert_equal true,
|
|
21
|
+
MainState.compute_alive(5, 4, 2, 50.0, 1000.0, THRESHOLD)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# branch 2: in-call, recent (counter unchanged)
|
|
25
|
+
def test_in_flight_within_threshold_is_alive
|
|
26
|
+
# oldest_ts = 100, now = 100 + 359 = 459 → diff 359 < 360
|
|
27
|
+
assert_equal true,
|
|
28
|
+
MainState.compute_alive(5, 5, 1, 100.0, 459.0, THRESHOLD)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def test_in_flight_at_threshold_boundary_is_dead
|
|
32
|
+
# oldest_ts = 100, now = 100 + 360 = 460 → diff 360 NOT < 360
|
|
33
|
+
assert_equal false,
|
|
34
|
+
MainState.compute_alive(5, 5, 1, 100.0, 460.0, THRESHOLD)
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def test_in_flight_past_threshold_is_dead
|
|
38
|
+
# oldest_ts = 100, now = 100 + 361
|
|
39
|
+
assert_equal false,
|
|
40
|
+
MainState.compute_alive(5, 5, 1, 100.0, 461.0, THRESHOLD)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# branch 3: in-call but oldest_ts nil (defensive — unreachable in
|
|
44
|
+
# practice because snapshot is mutex-atomic)
|
|
45
|
+
def test_in_flight_with_nil_ts_is_alive
|
|
46
|
+
assert_equal true,
|
|
47
|
+
MainState.compute_alive(5, 5, 1, nil, 1000.0, THRESHOLD)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# branch 4: idle
|
|
51
|
+
def test_idle_no_progress_is_dead
|
|
52
|
+
assert_equal false,
|
|
53
|
+
MainState.compute_alive(5, 5, 0, nil, 1000.0, THRESHOLD)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Counter advance dominates threshold check
|
|
57
|
+
def test_counter_advanced_overrides_stale_ts
|
|
58
|
+
# Even if oldest_ts is way past threshold, counter advance => alive.
|
|
59
|
+
assert_equal true,
|
|
60
|
+
MainState.compute_alive(6, 5, 1, 100.0, 9999.0, THRESHOLD)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# First iteration of pulse loop: last_counter = -1, counter = 0,
|
|
64
|
+
# in_flight = 0, ts = nil. Worker just spawned, no calls yet.
|
|
65
|
+
# Counter advanced from -1 to 0 → alive=true.
|
|
66
|
+
def test_first_iteration_with_zero_counter
|
|
67
|
+
assert_equal true,
|
|
68
|
+
MainState.compute_alive(0, -1, 0, nil, 0.0, THRESHOLD)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# last_counter == counter == 0, in_flight==0, ts nil → idle, dead
|
|
72
|
+
def test_second_iteration_no_calls_yet
|
|
73
|
+
assert_equal false,
|
|
74
|
+
MainState.compute_alive(0, 0, 0, nil, 5.0, THRESHOLD)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Custom threshold (e.g. lower for testing)
|
|
78
|
+
def test_custom_threshold
|
|
79
|
+
# threshold=10, oldest 100, now 109 → diff 9 < 10 → alive
|
|
80
|
+
assert_equal true,
|
|
81
|
+
MainState.compute_alive(5, 5, 1, 100.0, 109.0, 10.0)
|
|
82
|
+
# threshold=10, oldest 100, now 110 → diff 10 NOT < 10 → dead
|
|
83
|
+
assert_equal false,
|
|
84
|
+
MainState.compute_alive(5, 5, 1, 100.0, 110.0, 10.0)
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
|
data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb
CHANGED
|
@@ -168,6 +168,47 @@ module KairosMcp
|
|
|
168
168
|
assert_equal 'multi_llm_review', payload['next_action']['tool']
|
|
169
169
|
end
|
|
170
170
|
|
|
171
|
+
# v3.24.2: state == 'done' must NOT be misclassified as
|
|
172
|
+
# heartbeat_stale. Worker kills the heartbeat thread on exit, so
|
|
173
|
+
# mtime stops advancing the moment status becomes 'done'. Without
|
|
174
|
+
# the fix, a stale heartbeat ages past the threshold and wait
|
|
175
|
+
# returns crashed even though the worker completed successfully.
|
|
176
|
+
def test_done_with_stale_heartbeat_does_not_false_positive_crash
|
|
177
|
+
write_state('subprocess_status' => 'done')
|
|
178
|
+
# Heartbeat present but stale (older than 15s threshold).
|
|
179
|
+
hb_path = PendingState.worker_heartbeat_path(@token)
|
|
180
|
+
FileUtils.touch(hb_path)
|
|
181
|
+
File.utime(Time.now - 60, Time.now - 60, hb_path)
|
|
182
|
+
PendingState.write_worker_pid(@token, { 'pid' => Process.pid, 'pgid' => Process.pid })
|
|
183
|
+
|
|
184
|
+
# No subprocess_results.json yet — simulates the parse-mid-rename
|
|
185
|
+
# window. wait should NOT return heartbeat_stale; it should poll
|
|
186
|
+
# until budget exhausts and return done_but_no_results.
|
|
187
|
+
payload = call_wait('max_wait_seconds' => 1)
|
|
188
|
+
assert_equal 'crashed', payload['status']
|
|
189
|
+
assert_equal 'done_but_no_results', payload['crashed_reason']
|
|
190
|
+
refute_equal 'heartbeat_stale', payload['crashed_reason']
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def test_done_with_results_returns_ready
|
|
194
|
+
write_state('subprocess_status' => 'done')
|
|
195
|
+
# Heartbeat is stale (worker killed its thread) but results are present.
|
|
196
|
+
hb_path = PendingState.worker_heartbeat_path(@token)
|
|
197
|
+
FileUtils.touch(hb_path)
|
|
198
|
+
File.utime(Time.now - 60, Time.now - 60, hb_path)
|
|
199
|
+
PendingState.write_subprocess_results(@token, {
|
|
200
|
+
'token' => @token,
|
|
201
|
+
'completed_at' => Time.now.iso8601,
|
|
202
|
+
'elapsed_seconds' => 12.3,
|
|
203
|
+
'results' => [{ 'role_label' => 'r1', 'status' => 'success' }],
|
|
204
|
+
'exit_summary' => { 'successful' => 1, 'errored' => 0, 'skipped' => 0 }
|
|
205
|
+
})
|
|
206
|
+
|
|
207
|
+
payload = call_wait('max_wait_seconds' => 1)
|
|
208
|
+
assert_equal 'ready', payload['status']
|
|
209
|
+
assert_equal 1, payload['subprocess_done']
|
|
210
|
+
end
|
|
211
|
+
|
|
171
212
|
# ── hard cap ─────────────────────────────────────────────────────
|
|
172
213
|
# Hard cap is enforced before WaitForWorker is invoked. We verify the
|
|
173
214
|
# clamping logic without actually waiting for the cap by checking the
|
|
data/templates/skillsets/multi_llm_review/test/test_pending_state_v3.rb
CHANGED
|
@@ -328,80 +328,80 @@ module KairosMcp
|
|
|
328
328
|
end
|
|
329
329
|
end
|
|
330
330
|
|
|
331
|
-
# ── MainState
|
|
331
|
+
# ── MainState (v3.24.3 per-thread invariants) ───────────────────
|
|
332
|
+
# Replaces v0.3.2 C3b ordering tests. v3.24.3 uses Mutex-bracketed
|
|
333
|
+
# per-thread ts_by_thread Hash; the prior "counter-before-ts" ordering
|
|
334
|
+
# invariant no longer applies (Mutex provides atomicity). Tests now
|
|
335
|
+
# cover: snapshot 3-tuple shape, with_call ensure semantics, private
|
|
336
|
+
# enter_call!/exit_call!, and bump_counter! semantics. Comprehensive
|
|
337
|
+
# parallel-thread coverage is in test_main_state.rb.
|
|
332
338
|
|
|
333
339
|
class TestMainState < Minitest::Test
|
|
334
340
|
def setup
|
|
335
341
|
MainState.reset!
|
|
336
342
|
end
|
|
337
343
|
|
|
338
|
-
def
|
|
339
|
-
|
|
344
|
+
def test_initial_snapshot_is_idle
|
|
345
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
346
|
+
assert_equal 0, counter
|
|
347
|
+
assert_equal 0, in_flight
|
|
348
|
+
assert_nil oldest_ts
|
|
340
349
|
end
|
|
341
350
|
|
|
342
|
-
def
|
|
343
|
-
MainState.
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
351
|
+
def test_with_call_brackets_counter_and_ts
|
|
352
|
+
before, in_flight_before, ts_before = MainState.snapshot
|
|
353
|
+
assert_equal 0, in_flight_before
|
|
354
|
+
assert_nil ts_before
|
|
355
|
+
|
|
356
|
+
ts_during_block = nil
|
|
357
|
+
MainState.with_call do
|
|
358
|
+
_c, in_flight, oldest_ts = MainState.snapshot
|
|
359
|
+
ts_during_block = oldest_ts
|
|
360
|
+
assert_equal 1, in_flight
|
|
361
|
+
refute_nil oldest_ts
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
365
|
+
assert_equal before + 1, counter, 'with_call must bump counter on exit'
|
|
366
|
+
assert_equal 0, in_flight
|
|
367
|
+
assert_nil oldest_ts, 'ts must be cleared after with_call returns'
|
|
368
|
+
assert_kind_of Float, ts_during_block
|
|
348
369
|
end
|
|
349
370
|
|
|
350
|
-
def
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
counter,
|
|
355
|
-
assert_equal
|
|
356
|
-
|
|
371
|
+
def test_with_call_ensures_cleanup_on_exception
|
|
372
|
+
assert_raises(RuntimeError) do
|
|
373
|
+
MainState.with_call { raise 'boom' }
|
|
374
|
+
end
|
|
375
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
376
|
+
assert_equal 1, counter, 'counter still bumps on exception (ensure)'
|
|
377
|
+
assert_equal 0, in_flight, 'ts_by_thread must be cleaned on exception'
|
|
378
|
+
assert_nil oldest_ts
|
|
357
379
|
end
|
|
358
380
|
|
|
359
|
-
def
|
|
360
|
-
MainState.
|
|
361
|
-
MainState.
|
|
362
|
-
counter,
|
|
363
|
-
assert_equal
|
|
364
|
-
|
|
381
|
+
def test_bump_counter_does_not_touch_ts_by_thread
|
|
382
|
+
before_counter, before_in_flight, _ = MainState.snapshot
|
|
383
|
+
MainState.bump_counter!
|
|
384
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
385
|
+
assert_equal before_counter + 1, counter
|
|
386
|
+
assert_equal before_in_flight, in_flight, 'bump_counter! must not touch ts_by_thread'
|
|
387
|
+
assert_nil oldest_ts
|
|
365
388
|
end
|
|
366
389
|
|
|
367
|
-
def
|
|
368
|
-
|
|
369
|
-
MainState.enter_call!
|
|
370
|
-
refute_nil MAIN_STATE.in_llm_call_since_mono
|
|
371
|
-
MainState.exit_call!
|
|
372
|
-
assert_nil MAIN_STATE.in_llm_call_since_mono
|
|
373
|
-
end
|
|
374
|
-
assert_equal 3, MAIN_STATE.counter
|
|
390
|
+
def test_enter_call_is_private
|
|
391
|
+
assert_raises(NoMethodError) { MainState.enter_call! }
|
|
375
392
|
end
|
|
376
393
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
def
|
|
394
|
+
def test_exit_call_is_private
|
|
395
|
+
assert_raises(NoMethodError) { MainState.exit_call! }
|
|
396
|
+
end
|
|
397
|
+
|
|
398
|
+
def test_reset_clears_all_state
|
|
399
|
+
MainState.with_call { } # bumps counter to 1
|
|
382
400
|
MainState.reset!
|
|
383
|
-
MainState.
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
alias_method :_orig_ts=, :in_llm_call_since_mono=
|
|
388
|
-
define_method(:counter=) { |v| calls << :counter; send(:_orig_counter=, v) }
|
|
389
|
-
define_method(:in_llm_call_since_mono=) { |v| calls << :ts; send(:_orig_ts=, v) }
|
|
390
|
-
end
|
|
391
|
-
begin
|
|
392
|
-
MainState.exit_call!
|
|
393
|
-
ensure
|
|
394
|
-
MAIN_STATE.singleton_class.class_eval do
|
|
395
|
-
remove_method :counter=
|
|
396
|
-
remove_method :in_llm_call_since_mono=
|
|
397
|
-
alias_method :counter=, :_orig_counter=
|
|
398
|
-
alias_method :in_llm_call_since_mono=, :_orig_ts=
|
|
399
|
-
remove_method :_orig_counter=
|
|
400
|
-
remove_method :_orig_ts=
|
|
401
|
-
end
|
|
402
|
-
end
|
|
403
|
-
assert_equal %i[counter ts], calls,
|
|
404
|
-
'exit_call! must write counter BEFORE clearing ts (C3b ordering invariant)'
|
|
401
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
402
|
+
assert_equal 0, counter
|
|
403
|
+
assert_equal 0, in_flight
|
|
404
|
+
assert_nil oldest_ts
|
|
405
405
|
end
|
|
406
406
|
end
|
|
407
407
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kairos-chain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.24.1
|
|
4
|
+
version: 3.24.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Masaomi Hatakeyama
|
|
@@ -495,6 +495,8 @@ files:
|
|
|
495
495
|
- templates/skillsets/multi_llm_review/skillset.json
|
|
496
496
|
- templates/skillsets/multi_llm_review/test/test_dispatcher_usage.rb
|
|
497
497
|
- templates/skillsets/multi_llm_review/test/test_feedback_formatter.rb
|
|
498
|
+
- templates/skillsets/multi_llm_review/test/test_main_state.rb
|
|
499
|
+
- templates/skillsets/multi_llm_review/test/test_main_state_alive.rb
|
|
498
500
|
- templates/skillsets/multi_llm_review/test/test_multi_llm_review.rb
|
|
499
501
|
- templates/skillsets/multi_llm_review/test/test_multi_llm_review_bundle.rb
|
|
500
502
|
- templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb
|