kairos-chain 3.24.0 → 3.24.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/lib/kairos_mcp/version.rb +1 -1
- data/templates/skillsets/llm_client/lib/llm_client/headless.rb +9 -9
- data/templates/skillsets/multi_llm_review/bin/dispatch_worker.rb +23 -19
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/dispatcher.rb +4 -1
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/main_state.rb +102 -40
- data/templates/skillsets/multi_llm_review/lib/multi_llm_review/wait_for_worker.rb +62 -11
- data/templates/skillsets/multi_llm_review/test/test_main_state.rb +152 -0
- data/templates/skillsets/multi_llm_review/test/test_main_state_alive.rb +89 -0
- data/templates/skillsets/multi_llm_review/test/test_multi_llm_review_wait.rb +195 -0
- data/templates/skillsets/multi_llm_review/test/test_pending_state_v3.rb +57 -57
- data/templates/skillsets/multi_llm_review/tools/multi_llm_review_wait.rb +156 -92
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fb099806eedb198afc167cbb810110ab7bffac2fceea3684eb14a24e3e7b46fb
|
|
4
|
+
data.tar.gz: 61223eff1c6cd146eea47d44ab0ee95506a8baa6e15a0f1f5c2e929e9d44a5b1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 91d4a86fc2df06025fefb5f0252e9f019d85cb9fc31f6ceec0d2e0bbe1209c8d24277e7448faace8ced8ba28c889dee0af7f68fbf50d6dda6da27bbb3366e588
|
|
7
|
+
data.tar.gz: b6847081bc03d40d2ae77ec317d52955ba7fdc2597b91d10addba2a052067339171542316810d8e1dc95c5f4eb35eb5963425f7f82961ae13bbc5e3bfa8e2f50
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,54 @@ All notable changes to the `kairos-chain` gem will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
+
## [3.24.1] - 2026-04-27
|
|
8
|
+
|
|
9
|
+
### Fixed (multi_llm_review_wait)
|
|
10
|
+
|
|
11
|
+
Self-referential validation: ran multi_llm_review on the new wait tool itself
|
|
12
|
+
(both Path A Bash workflow and Path B MCP SkillSet, 7 reviewers total) — found
|
|
13
|
+
1 P0 + 7 P1 bugs not caught by the v3.24.0 test suite. All fixed:
|
|
14
|
+
|
|
15
|
+
- **P0** `config_parallel` had a dead `unless ... || true` guard, so YAML never
|
|
16
|
+
loaded — all configured wait caps silently fell back to defaults. Removed
|
|
17
|
+
the bogus guard, added explicit `require 'yaml'` at the top of the file.
|
|
18
|
+
- **P1** Streak read-then-write was split: state was read at entry, streak
|
|
19
|
+
written later via `update_state`, so two concurrent waiters could both
|
|
20
|
+
observe the same N and both write N+1, undercounting. Now the increment
|
|
21
|
+
is fully inside the `update_state` RMW block.
|
|
22
|
+
- **P1** `still_pending` next_action message read streak limit from
|
|
23
|
+
`state.dig('wait_still_pending_streak_limit')` (a key never written),
|
|
24
|
+
always falling back to the default constant. Now `streak_limit` is
|
|
25
|
+
threaded through `translate_outcome` so the displayed denominator
|
|
26
|
+
matches the effective config.
|
|
27
|
+
- **P1** Post-wait deadline revalidation was missing: `WaitForWorker.wait`
|
|
28
|
+
could return `:timeout` after the collect deadline elapsed during the
|
|
29
|
+
blocking wait, but the tool returned `still_pending`. Now re-checks
|
|
30
|
+
`Time.now >= deadline_at_entry` after the wait and returns
|
|
31
|
+
`past_collect_deadline` if so.
|
|
32
|
+
- **P1** Pre-wait streak guard ran before the ready/results-file check,
|
|
33
|
+
so a worker that finished while streak was at limit was misclassified
|
|
34
|
+
as `crashed/wait_exhausted`. Reordered: ready check now runs first.
|
|
35
|
+
- **P1** Internal exceptions returned `status: 'error'`, outside the
|
|
36
|
+
declared 6-status enum. Now mapped to `crashed` with
|
|
37
|
+
`crashed_reason: 'internal_error'`.
|
|
38
|
+
- **P1** Malformed `collect_deadline` (non-ISO8601 string) was silently
|
|
39
|
+
rescued to nil, skipping all deadline checks. Now returns `crashed`
|
|
40
|
+
with `crashed_reason: 'malformed_state'`.
|
|
41
|
+
- **P1** `safe_path` swallowed PendingState errors, masking real failures
|
|
42
|
+
as benign "not collected". Removed; errors now surface to the outer
|
|
43
|
+
rescue and become `crashed/internal_error`.
|
|
44
|
+
|
|
45
|
+
### Other improvements
|
|
46
|
+
|
|
47
|
+
- Deadline-cap arithmetic uses `ceil` instead of `to_i` so the wait can
|
|
48
|
+
actually run up to the deadline; the post-wait revalidation catches any
|
|
49
|
+
overshoot.
|
|
50
|
+
- `elapsed_seconds` field now correctly uses `outcome[:waited_seconds]`
|
|
51
|
+
for the `:timeout` path (was always 0.0 in v3.24.0).
|
|
52
|
+
- 8 new regression tests covering each of the bugs above (22 wait tool
|
|
53
|
+
tests total).
|
|
54
|
+
|
|
7
55
|
## [3.24.0] - 2026-04-27
|
|
8
56
|
|
|
9
57
|
### Added
|
data/lib/kairos_mcp/version.rb
CHANGED
|
@@ -53,16 +53,16 @@ module KairosMcp
|
|
|
53
53
|
# from codex 5.5 + cursor).
|
|
54
54
|
# Guarded by `defined?` so non-worker consumers (MCP direct call)
|
|
55
55
|
# that never load multi_llm_review/main_state don't NameError.
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
result =
|
|
62
|
-
|
|
63
|
-
if bracket
|
|
64
|
-
KairosMcp::SkillSets::MultiLlmReview::MainState.exit_call!
|
|
56
|
+
# v3.24.3: use with_call to enforce ensure-bracketed enter/exit.
|
|
57
|
+
# enter_call!/exit_call! are now private; with_call is the only
|
|
58
|
+
# supported pattern. defined?-guard preserved so non-worker
|
|
59
|
+
# consumers (MCP direct call) don't NameError.
|
|
60
|
+
if defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
|
|
61
|
+
result = KairosMcp::SkillSets::MultiLlmReview::MainState.with_call do
|
|
62
|
+
CallRouter.perform(args, @config)
|
|
65
63
|
end
|
|
64
|
+
else
|
|
65
|
+
result = CallRouter.perform(args, @config)
|
|
66
66
|
end
|
|
67
67
|
# Shape matches BaseTool#text_content (symbol :text key) — what
|
|
68
68
|
# Dispatcher consumes today via `b[:text] || b['text']`.
|
|
@@ -103,28 +103,31 @@ def self_timeout_at_from_state(token, request)
|
|
|
103
103
|
end
|
|
104
104
|
end
|
|
105
105
|
|
|
106
|
-
# Pulse thread: touches worker.tick IFF main is alive
|
|
107
|
-
#
|
|
106
|
+
# Pulse thread: touches worker.tick IFF main is alive. v3.24.3 uses the
|
|
107
|
+
# per-thread (counter, in_flight, oldest_ts) snapshot from MainState and
|
|
108
|
+
# delegates the alive decision to MainState.compute_alive (pure function,
|
|
109
|
+
# unit-testable). Emits a diagnostic log line every ~5s so future incidents
|
|
110
|
+
# can be diagnosed from worker.log without filesystem mtime archaeology.
|
|
108
111
|
pulse_thread = Thread.new do
|
|
109
112
|
begin
|
|
110
113
|
last_counter = -1
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
call_margin = 60
|
|
114
|
+
log_emit_at = 0
|
|
115
|
+
threshold = 360 # max_call_t (300) + call_margin (60)
|
|
114
116
|
loop do
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if counter != last_counter
|
|
121
|
-
true
|
|
122
|
-
elsif ts
|
|
123
|
-
(Process.clock_gettime(Process::CLOCK_MONOTONIC) - ts) < (max_call_t + call_margin)
|
|
124
|
-
else
|
|
125
|
-
false
|
|
126
|
-
end
|
|
117
|
+
counter, in_flight, oldest_ts = MLR::MainState.snapshot
|
|
118
|
+
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
119
|
+
alive = MLR::MainState.compute_alive(
|
|
120
|
+
counter, last_counter, in_flight, oldest_ts, now, threshold
|
|
121
|
+
)
|
|
127
122
|
FileUtils.touch(PS.worker_tick_path(token)) if alive
|
|
123
|
+
|
|
124
|
+
if now - log_emit_at >= 5
|
|
125
|
+
oldest_age = oldest_ts ? (now - oldest_ts).round(1) : nil
|
|
126
|
+
warn "[pulse] counter=#{counter} in_flight=#{in_flight} " \
|
|
127
|
+
"oldest_age=#{oldest_age || 'nil'}s alive=#{alive}"
|
|
128
|
+
log_emit_at = now
|
|
129
|
+
end
|
|
130
|
+
|
|
128
131
|
last_counter = counter
|
|
129
132
|
sleep 2
|
|
130
133
|
end
|
|
@@ -263,8 +266,9 @@ begin
|
|
|
263
266
|
review_context: request['review_context'] || 'independent'
|
|
264
267
|
)
|
|
265
268
|
|
|
266
|
-
#
|
|
267
|
-
|
|
269
|
+
# v3.24.3: counter-only signal (no enter_call!/exit_call! pair). bump_counter!
|
|
270
|
+
# advances pulse's progress signal without touching ts_by_thread.
|
|
271
|
+
MLR::MainState.bump_counter!
|
|
268
272
|
check_shutdown!(token)
|
|
269
273
|
|
|
270
274
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0
|
|
@@ -132,7 +132,10 @@ module KairosMcp
|
|
|
132
132
|
|
|
133
133
|
def bump_main_state_counter
|
|
134
134
|
return unless defined?(KairosMcp::SkillSets::MultiLlmReview::MainState)
|
|
135
|
-
|
|
135
|
+
# v3.24.3: counter-only bump. exit_call! is private; bump_counter!
|
|
136
|
+
# is the public counter-only progress signal (does not touch
|
|
137
|
+
# ts_by_thread).
|
|
138
|
+
KairosMcp::SkillSets::MultiLlmReview::MainState.bump_counter!
|
|
136
139
|
rescue StandardError
|
|
137
140
|
nil
|
|
138
141
|
end
|
|
@@ -3,61 +3,123 @@
|
|
|
3
3
|
module KairosMcp
|
|
4
4
|
module SkillSets
|
|
5
5
|
module MultiLlmReview
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
6
|
+
# ──────────────────────────────────────────────────────────────────
|
|
7
|
+
# MainState — main-thread liveness state for the worker pulse
|
|
8
|
+
# ──────────────────────────────────────────────────────────────────
|
|
9
9
|
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
# (old_counter, old_ts) — in-call, recent → alive
|
|
15
|
-
# (new_counter, old_ts) — counter advanced → alive
|
|
16
|
-
# (new_counter, nil) — exit complete → alive via counter
|
|
17
|
-
# Never (old_counter, nil), which would look stalled.
|
|
10
|
+
# Tracks per-thread enter/exit timestamps so the pulse thread can tell
|
|
11
|
+
# whether the worker's main path is still progressing through LLM calls.
|
|
12
|
+
# Replaces the v0.3.2 process-global single-ts design which raced under
|
|
13
|
+
# parallel reviewer threads (incident token 5b75ff8c-..., 2026-04-27).
|
|
18
14
|
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
|
|
15
|
+
# ORDERING / ATOMICITY INVARIANTS (v3.24.3):
|
|
16
|
+
#
|
|
17
|
+
# 1. counter and ts_by_thread mutations AND reads are bracketed by a
|
|
18
|
+
# single Mutex (MUTEX). Readers (snapshot) take the same mutex, so
|
|
19
|
+
# they never observe a torn (counter, ts_by_thread) pair.
|
|
20
|
+
# Replaces the v0.3.2 "ts-first/counter-second" ordering invariant
|
|
21
|
+
# which assumed single-threaded callers.
|
|
22
|
+
#
|
|
23
|
+
# 2. with_call { ... } is the ONLY supported call-bracketing pattern.
|
|
24
|
+
# Direct enter_call!/exit_call! calls are private (see
|
|
25
|
+
# private_class_method below). This guarantees that any exception
|
|
26
|
+
# from the LLM call propagates AFTER ts_by_thread has been cleaned
|
|
27
|
+
# up (via `ensure exit_call!`), preventing per-thread entry leaks.
|
|
28
|
+
#
|
|
29
|
+
# 3. Thread.current.object_id is used as the per-thread key. MRI's
|
|
30
|
+
# object_id stays stable for the lifetime of a Thread object;
|
|
31
|
+
# reuse only happens after the Thread has been GC'd. Within a
|
|
32
|
+
# single with_call invocation, the Thread is on-stack and therefore
|
|
33
|
+
# not GC-eligible, so the key is unique.
|
|
34
|
+
#
|
|
35
|
+
# 4. Mutex#synchronize is Thread.kill-safe under MRI (Ruby's internal
|
|
36
|
+
# `ensure unlock`). The `ensure exit_call!` inside with_call also
|
|
37
|
+
# runs under Thread.kill, so cleanup is guaranteed even if the
|
|
38
|
+
# dispatch thread is forcibly terminated.
|
|
39
|
+
#
|
|
40
|
+
# 5. NON-REENTRANT: nested with_call on the same thread is NOT
|
|
41
|
+
# supported. The inner enter_call! would overwrite the outer
|
|
42
|
+
# ts_by_thread[tid], and the outer ensure exit_call! would delete
|
|
43
|
+
# the entry while the inner call is still tracked. Current
|
|
44
|
+
# multi_llm_review code paths never nest LLM calls; if a future
|
|
45
|
+
# adapter calls another LLM, this contract must be revisited.
|
|
46
|
+
MAIN_STATE = Struct.new(:counter, :ts_by_thread).new(0, {})
|
|
47
|
+
MUTEX = Mutex.new
|
|
23
48
|
|
|
24
49
|
module MainState
|
|
25
50
|
module_function
|
|
26
51
|
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
52
|
+
# PUBLIC: bracket an LLM call. The block runs between enter_call!
|
|
53
|
+
# and exit_call!; ensure guarantees exit_call! even on exception or
|
|
54
|
+
# Thread.kill. Returns the value of the block.
|
|
55
|
+
def with_call
|
|
56
|
+
enter_call!
|
|
57
|
+
yield
|
|
58
|
+
ensure
|
|
59
|
+
exit_call!
|
|
31
60
|
end
|
|
32
61
|
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
#
|
|
36
|
-
#
|
|
37
|
-
def
|
|
38
|
-
MAIN_STATE.counter += 1
|
|
39
|
-
MAIN_STATE.in_llm_call_since_mono = nil # then clear timestamp
|
|
62
|
+
# PUBLIC: counter-only progress signal. Used by dispatcher's join
|
|
63
|
+
# cleanup loop where there is no LLM call in flight but the main
|
|
64
|
+
# thread is still doing useful work (joining worker threads). Does
|
|
65
|
+
# NOT touch ts_by_thread.
|
|
66
|
+
def bump_counter!
|
|
67
|
+
MUTEX.synchronize { MAIN_STATE.counter += 1 }
|
|
40
68
|
end
|
|
41
69
|
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
# counter SECOND. If reader observes ts == nil, the writer MUST
|
|
46
|
-
# already have completed counter+=1 (writer writes counter before ts).
|
|
47
|
-
# Therefore (old_counter, nil) is unreachable by any reader using
|
|
48
|
-
# this snapshot. The pulse thread uses this helper — do not change
|
|
49
|
-
# the order without also changing the writer invariant.
|
|
70
|
+
# PUBLIC: snapshot of current state. Returns (counter, in_flight,
|
|
71
|
+
# oldest_ts). in_flight = ts_by_thread.size; oldest_ts = min of
|
|
72
|
+
# in-flight ts (nil if idle). Always atomic via MUTEX.
|
|
50
73
|
def snapshot
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
74
|
+
MUTEX.synchronize do
|
|
75
|
+
ts_values = MAIN_STATE.ts_by_thread.values
|
|
76
|
+
[MAIN_STATE.counter, ts_values.size, ts_values.min]
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# PUBLIC PURE FUNCTION: determine alive state from a snapshot
|
|
81
|
+
# tuple. Extracted so unit tests can table-drive the four branches
|
|
82
|
+
# without forking a worker. The pulse thread calls this with the
|
|
83
|
+
# result of snapshot().
|
|
84
|
+
def compute_alive(counter, last_counter, in_flight, oldest_ts, now_mono, threshold_seconds)
|
|
85
|
+
if counter != last_counter
|
|
86
|
+
true # progress observed
|
|
87
|
+
elsif in_flight > 0 && oldest_ts
|
|
88
|
+
(now_mono - oldest_ts) < threshold_seconds # in-call, recent
|
|
89
|
+
elsif in_flight > 0
|
|
90
|
+
true # in-call but ts not visible (transient)
|
|
91
|
+
else
|
|
92
|
+
false # idle, no progress
|
|
93
|
+
end
|
|
54
94
|
end
|
|
55
95
|
|
|
56
|
-
#
|
|
96
|
+
# TEST API: clear all state. NOT safe for runtime use.
|
|
57
97
|
def reset!
|
|
58
|
-
|
|
59
|
-
|
|
98
|
+
MUTEX.synchronize do
|
|
99
|
+
MAIN_STATE.counter = 0
|
|
100
|
+
MAIN_STATE.ts_by_thread.clear
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# ── private (do not call from outside MainState; use with_call) ──
|
|
105
|
+
|
|
106
|
+
def enter_call!
|
|
107
|
+
tid = Thread.current.object_id
|
|
108
|
+
MUTEX.synchronize do
|
|
109
|
+
MAIN_STATE.ts_by_thread[tid] =
|
|
110
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
private_class_method :enter_call!
|
|
114
|
+
|
|
115
|
+
def exit_call!
|
|
116
|
+
tid = Thread.current.object_id
|
|
117
|
+
MUTEX.synchronize do
|
|
118
|
+
MAIN_STATE.counter += 1
|
|
119
|
+
MAIN_STATE.ts_by_thread.delete(tid)
|
|
120
|
+
end
|
|
60
121
|
end
|
|
122
|
+
private_class_method :exit_call!
|
|
61
123
|
end
|
|
62
124
|
end
|
|
63
125
|
end
|
|
@@ -6,17 +6,45 @@ module KairosMcp
|
|
|
6
6
|
# Phase 2's polling loop for the detached worker's subprocess_results.json.
|
|
7
7
|
# Returns one of the following outcomes:
|
|
8
8
|
# :ready — subprocess_results.json parsed successfully
|
|
9
|
-
# :crashed — state.subprocess_status
|
|
9
|
+
# :crashed — state.subprocess_status == crashed/self_timed_out
|
|
10
|
+
# OR state == done but results never parseable within
|
|
11
|
+
# wall-clock budget (reason: done_but_no_results)
|
|
12
|
+
# OR heartbeat stale (only while non-terminal state)
|
|
10
13
|
# OR pid present but no heartbeat within grace OR
|
|
11
14
|
# no pid/heartbeat within startup grace
|
|
12
15
|
# :timeout — wall-clock max_wait exceeded with live worker
|
|
13
16
|
# (raises on unexpected errors from PendingState)
|
|
17
|
+
#
|
|
18
|
+
# v3.24.2: 'done' state now bypasses the heartbeat staleness check.
|
|
19
|
+
# The heartbeat thread is killed in the worker's ensure block, so
|
|
20
|
+
# mtime stops advancing the moment the worker transitions to 'done'.
|
|
21
|
+
# Without this bypass, a transient parse-mid-rename of
|
|
22
|
+
# subprocess_results.json combined with the killed heartbeat could
|
|
23
|
+
# surface a false-positive 'heartbeat_stale' for a successfully
|
|
24
|
+
# completed worker.
|
|
14
25
|
module WaitForWorker
|
|
15
26
|
STARTUP_GRACE_DEFAULT = 30
|
|
16
27
|
HEARTBEAT_STALE_DEFAULT = 15
|
|
17
28
|
POLL_INTERVAL_DEFAULT = 0.5
|
|
18
29
|
SUSPEND_JUMP_THRESHOLD = 5.0
|
|
19
30
|
|
|
31
|
+
# All possible :crashed outcome reasons. Single source of truth for
|
|
32
|
+
# the crash-reason taxonomy; operators grep these in worker.log and
|
|
33
|
+
# next_action redispatch hints. v3.24.3 declares the constant; usage
|
|
34
|
+
# sites still use string literals (replacement scheduled for v3.24.4
|
|
35
|
+
# to avoid bundling unrelated refactors).
|
|
36
|
+
CRASH_REASONS = %w[
|
|
37
|
+
heartbeat_stale
|
|
38
|
+
heartbeat_never_started
|
|
39
|
+
worker_never_started
|
|
40
|
+
done_but_no_results
|
|
41
|
+
crashed
|
|
42
|
+
self_timed_out
|
|
43
|
+
wait_exhausted
|
|
44
|
+
internal_error
|
|
45
|
+
malformed_state
|
|
46
|
+
].freeze
|
|
47
|
+
|
|
20
48
|
module_function
|
|
21
49
|
|
|
22
50
|
def wait(token, opts = {})
|
|
@@ -48,17 +76,40 @@ module KairosMcp
|
|
|
48
76
|
# transient parse mid-rename — keep polling
|
|
49
77
|
end
|
|
50
78
|
|
|
51
|
-
# 2. Explicit
|
|
79
|
+
# 2. Explicit terminal status from worker
|
|
52
80
|
state = PendingState.load_state(token)
|
|
53
|
-
if state
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
81
|
+
if state
|
|
82
|
+
status = state['subprocess_status']
|
|
83
|
+
if status == 'crashed' || status == 'self_timed_out'
|
|
84
|
+
return {
|
|
85
|
+
status: :crashed,
|
|
86
|
+
reason: state['crash_reason'] || status,
|
|
87
|
+
pid: read_pid(token),
|
|
88
|
+
pgid: read_pgid_from_file(token),
|
|
89
|
+
log_tail: tail_log(token)
|
|
90
|
+
}
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Worker exited cleanly. subprocess_results.json should be (or
|
|
94
|
+
# imminently become) loadable via step 1 on a subsequent poll.
|
|
95
|
+
# The heartbeat thread is intentionally killed at worker exit
|
|
96
|
+
# (dispatch_worker.rb ensure block), so the heartbeat-stale
|
|
97
|
+
# check below would false-positive. Skip liveness checks while
|
|
98
|
+
# 'done', and rely on step 1 retry until results parse or the
|
|
99
|
+
# wall-clock budget exhausts.
|
|
100
|
+
if status == 'done'
|
|
101
|
+
if now_mono > deadline
|
|
102
|
+
return {
|
|
103
|
+
status: :crashed,
|
|
104
|
+
reason: 'done_but_no_results',
|
|
105
|
+
pid: read_pid(token),
|
|
106
|
+
pgid: read_pgid_from_file(token),
|
|
107
|
+
log_tail: tail_log(token)
|
|
108
|
+
}
|
|
109
|
+
end
|
|
110
|
+
sleep poll_interval
|
|
111
|
+
next
|
|
112
|
+
end
|
|
62
113
|
end
|
|
63
114
|
|
|
64
115
|
# 3. Heartbeat-based liveness checks
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# v3.24.3: per-thread MainState concurrency tests. Covers the per-thread
|
|
4
|
+
# Hash invariants that fix the v0.3.2 single-ts process-global race
|
|
5
|
+
# (incident token 5b75ff8c-..., 2026-04-27).
|
|
6
|
+
|
|
7
|
+
require 'minitest/autorun'
|
|
8
|
+
require_relative '../lib/multi_llm_review/main_state'
|
|
9
|
+
|
|
10
|
+
module KairosMcp
|
|
11
|
+
module SkillSets
|
|
12
|
+
module MultiLlmReview
|
|
13
|
+
class TestMainStateConcurrency < Minitest::Test
|
|
14
|
+
def setup
|
|
15
|
+
MainState.reset!
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# T1 enter, T2 enter, T1 exit. Verify oldest_ts becomes T2's ts
|
|
19
|
+
# (not stuck at T1's). This is the exact scenario that v0.3.2 broke
|
|
20
|
+
# under: T1.exit cleared the single global ts while T2 was still
|
|
21
|
+
# in-call.
|
|
22
|
+
def test_oldest_ts_advances_when_first_enter_exits
|
|
23
|
+
enter_order = Queue.new
|
|
24
|
+
can_exit_t1 = Queue.new
|
|
25
|
+
can_exit_t2 = Queue.new
|
|
26
|
+
|
|
27
|
+
t1_ts = nil
|
|
28
|
+
t2_ts = nil
|
|
29
|
+
|
|
30
|
+
t1 = Thread.new do
|
|
31
|
+
MainState.with_call do
|
|
32
|
+
# capture our ts via snapshot
|
|
33
|
+
_, _, oldest_ts = MainState.snapshot
|
|
34
|
+
t1_ts = oldest_ts
|
|
35
|
+
enter_order << :t1
|
|
36
|
+
can_exit_t1.pop # wait for main to release
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Wait for t1 to enter
|
|
41
|
+
assert_equal :t1, enter_order.pop
|
|
42
|
+
|
|
43
|
+
t2 = Thread.new do
|
|
44
|
+
MainState.with_call do
|
|
45
|
+
enter_order << :t2
|
|
46
|
+
can_exit_t2.pop
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Wait for t2 to enter
|
|
51
|
+
assert_equal :t2, enter_order.pop
|
|
52
|
+
|
|
53
|
+
# Both in flight. Capture snapshot.
|
|
54
|
+
_, in_flight, oldest_ts_both = MainState.snapshot
|
|
55
|
+
assert_equal 2, in_flight
|
|
56
|
+
assert_equal t1_ts, oldest_ts_both, 'oldest_ts is T1 (earliest enter)'
|
|
57
|
+
|
|
58
|
+
# T2's own ts is not captured here; the ordering argument suffices:
|
|
59
|
+
# Since T2 entered after T1, T2's ts > T1's ts.
|
|
60
|
+
# After T1 exits, oldest_ts must become T2's ts.
|
|
61
|
+
|
|
62
|
+
can_exit_t1 << :go
|
|
63
|
+
t1.join
|
|
64
|
+
|
|
65
|
+
_, in_flight_after, oldest_ts_after = MainState.snapshot
|
|
66
|
+
assert_equal 1, in_flight_after, 'T2 still in-flight'
|
|
67
|
+
refute_nil oldest_ts_after
|
|
68
|
+
assert oldest_ts_after > t1_ts,
|
|
69
|
+
"oldest_ts must advance past T1's anchor after T1 exits " \
|
|
70
|
+
"(was #{t1_ts}, now #{oldest_ts_after})"
|
|
71
|
+
|
|
72
|
+
can_exit_t2 << :go
|
|
73
|
+
t2.join
|
|
74
|
+
|
|
75
|
+
# Both exited
|
|
76
|
+
counter, in_flight_final, oldest_ts_final = MainState.snapshot
|
|
77
|
+
assert_equal 2, counter
|
|
78
|
+
assert_equal 0, in_flight_final
|
|
79
|
+
assert_nil oldest_ts_final
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# 4 threads cycling enter/exit 250 times each = 1000 total cycles.
|
|
83
|
+
# Verifies counter and ts_by_thread stay consistent under contention.
|
|
84
|
+
def test_concurrent_with_call_stress
|
|
85
|
+
srand(20260427) # deterministic seed
|
|
86
|
+
n_threads = 4
|
|
87
|
+
cycles_per_thread = 250
|
|
88
|
+
start_at = Time.now
|
|
89
|
+
|
|
90
|
+
threads = n_threads.times.map do
|
|
91
|
+
Thread.new do
|
|
92
|
+
cycles_per_thread.times do
|
|
93
|
+
MainState.with_call { }
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
threads.each(&:join)
|
|
98
|
+
|
|
99
|
+
elapsed = Time.now - start_at
|
|
100
|
+
assert elapsed < 10, "stress test took #{elapsed.round(2)}s, budget 10s"
|
|
101
|
+
|
|
102
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
103
|
+
assert_equal n_threads * cycles_per_thread, counter
|
|
104
|
+
assert_equal 0, in_flight, 'ts_by_thread leaked entries'
|
|
105
|
+
assert_nil oldest_ts
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# If with_call raises mid-block across many threads, ts_by_thread
|
|
109
|
+
# must still be cleaned for every thread.
|
|
110
|
+
def test_concurrent_with_call_exception_cleanup
|
|
111
|
+
n_threads = 4
|
|
112
|
+
threads = n_threads.times.map do |i|
|
|
113
|
+
Thread.new do
|
|
114
|
+
begin
|
|
115
|
+
MainState.with_call { raise "boom from thread #{i}" }
|
|
116
|
+
rescue StandardError
|
|
117
|
+
# expected
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
threads.each(&:join)
|
|
122
|
+
|
|
123
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
124
|
+
assert_equal n_threads, counter, 'counter bumps even on exception'
|
|
125
|
+
assert_equal 0, in_flight, 'ts_by_thread must be cleaned on exception'
|
|
126
|
+
assert_nil oldest_ts
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# bump_counter! may interleave with concurrent with_call but must not
|
|
130
|
+
# corrupt ts_by_thread or under-count counter.
|
|
131
|
+
def test_bump_counter_concurrent_with_with_call
|
|
132
|
+
n_threads = 4
|
|
133
|
+
n_bumps = 100
|
|
134
|
+
n_cycles = 100
|
|
135
|
+
|
|
136
|
+
bump_threads = n_threads.times.map do
|
|
137
|
+
Thread.new { n_bumps.times { MainState.bump_counter! } }
|
|
138
|
+
end
|
|
139
|
+
call_threads = n_threads.times.map do
|
|
140
|
+
Thread.new { n_cycles.times { MainState.with_call { } } }
|
|
141
|
+
end
|
|
142
|
+
(bump_threads + call_threads).each(&:join)
|
|
143
|
+
|
|
144
|
+
counter, in_flight, oldest_ts = MainState.snapshot
|
|
145
|
+
assert_equal n_threads * (n_bumps + n_cycles), counter
|
|
146
|
+
assert_equal 0, in_flight
|
|
147
|
+
assert_nil oldest_ts
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|