kairos-chain 3.24.0 → 3.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 27275c4874d145bd69a309c8e89d96a371fdb614f6b0e9fdde6d22f810d907de
|
|
4
|
+
data.tar.gz: 904a59d5332379a17162ad8dd371df5b0024a0f5eff01f652c5322b2f9bcbc57
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 49790dfd44ccc67e64bb58cf41bc1aeba3c0a79c5693329f47aa1b4024016215f63cf62691f4dbb162fbe31b6ebee5cf38fe78b6350c81ea6dbb958e480c902b
|
|
7
|
+
data.tar.gz: 4fc908310610778a521837702b50bc427d3234a7680988fc22b683c8296e893f83652dacf1cbe4f178a19d9cd2031833c56d634024c54cd73f5f8b5a58133e55
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,54 @@ All notable changes to the `kairos-chain` gem will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
+
## [3.24.1] - 2026-04-27
|
|
8
|
+
|
|
9
|
+
### Fixed (multi_llm_review_wait)
|
|
10
|
+
|
|
11
|
+
Self-referential validation: ran multi_llm_review on the new wait tool itself
|
|
12
|
+
(both Path A Bash workflow and Path B MCP SkillSet, 7 reviewers total) — found
|
|
13
|
+
1 P0 + 7 P1 bugs not caught by the v3.24.0 test suite. All fixed:
|
|
14
|
+
|
|
15
|
+
- **P0** `config_parallel` had a dead `unless ... || true` guard, so YAML never
|
|
16
|
+
loaded — all configured wait caps silently fell back to defaults. Removed
|
|
17
|
+
the bogus guard, added explicit `require 'yaml'` at the top of the file.
|
|
18
|
+
- **P1** Streak read-then-write was split: state was read at entry, streak
|
|
19
|
+
written later via `update_state`, so two concurrent waiters could both
|
|
20
|
+
observe the same N and both write N+1, undercounting. Now the increment
|
|
21
|
+
is fully inside the `update_state` RMW block.
|
|
22
|
+
- **P1** `still_pending` next_action message read streak limit from
|
|
23
|
+
`state.dig('wait_still_pending_streak_limit')` (a key never written),
|
|
24
|
+
always falling back to the default constant. Now `streak_limit` is
|
|
25
|
+
threaded through `translate_outcome` so the displayed denominator
|
|
26
|
+
matches the effective config.
|
|
27
|
+
- **P1** Post-wait deadline revalidation missing: `WaitForWorker.wait`
|
|
28
|
+
could return `:timeout` after the collect deadline elapsed during the
|
|
29
|
+
blocking wait, but the tool returned `still_pending`. Now re-checks
|
|
30
|
+
`Time.now >= deadline_at_entry` after the wait and returns
|
|
31
|
+
`past_collect_deadline` if so.
|
|
32
|
+
- **P1** Pre-wait streak guard ran before the ready/results-file check,
|
|
33
|
+
so a worker that finished while streak was at limit was misclassified
|
|
34
|
+
as `crashed/wait_exhausted`. Reordered: ready check now runs first.
|
|
35
|
+
- **P1** Internal exceptions returned `status: 'error'`, outside the
|
|
36
|
+
declared 6-status enum. Now mapped to `crashed` with
|
|
37
|
+
`crashed_reason: 'internal_error'`.
|
|
38
|
+
- **P1** Malformed `collect_deadline` (non-ISO8601 string) was silently
|
|
39
|
+
rescued to nil, skipping all deadline checks. Now returns `crashed`
|
|
40
|
+
with `crashed_reason: 'malformed_state'`.
|
|
41
|
+
- **P1** `safe_path` swallowed PendingState errors, masking real failures
|
|
42
|
+
as benign "not collected". Removed; errors now surface to the outer
|
|
43
|
+
rescue and become `crashed/internal_error`.
|
|
44
|
+
|
|
45
|
+
### Other improvements
|
|
46
|
+
|
|
47
|
+
- Deadline-cap arithmetic uses `ceil` instead of `to_i` so the wait can
|
|
48
|
+
actually run up to the deadline; the post-wait revalidation catches any
|
|
49
|
+
overshoot.
|
|
50
|
+
- `elapsed_seconds` field now correctly uses `outcome[:waited_seconds]`
|
|
51
|
+
for the `:timeout` path (was always 0.0 in v3.24.0).
|
|
52
|
+
- 8 new regression tests covering each of the bugs above (22 wait tool
|
|
53
|
+
tests total).
|
|
54
|
+
|
|
7
55
|
## [3.24.0] - 2026-04-27
|
|
8
56
|
|
|
9
57
|
### Added
|
data/lib/kairos_mcp/version.rb
CHANGED
|
@@ -224,6 +224,160 @@ module KairosMcp
|
|
|
224
224
|
end
|
|
225
225
|
end
|
|
226
226
|
|
|
227
|
+
# ── v3.24.1 regression tests for v3.24.0 review findings ───────────
|
|
228
|
+
class TestMultiLlmReviewWaitV3_24_1Regressions < Minitest::Test
|
|
229
|
+
def setup
|
|
230
|
+
@tmp = Dir.mktmpdir('mlr-wait-v341-')
|
|
231
|
+
@orig_cwd = Dir.pwd
|
|
232
|
+
Dir.chdir(@tmp)
|
|
233
|
+
@tool = Tools::MultiLlmReviewWait.new
|
|
234
|
+
@token = '22222222-3333-4444-8555-666666666666'
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
def teardown
|
|
238
|
+
Dir.chdir(@orig_cwd)
|
|
239
|
+
FileUtils.rm_rf(@tmp)
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
def write_state(extra = {})
|
|
243
|
+
PendingState.create_token_dir!(@token)
|
|
244
|
+
PendingState.write_state(@token, {
|
|
245
|
+
'schema_version' => 4,
|
|
246
|
+
'token' => @token,
|
|
247
|
+
'created_at' => Time.now.iso8601,
|
|
248
|
+
'collect_deadline' => (Time.now + 1800).iso8601,
|
|
249
|
+
'subprocess_status' => 'pending',
|
|
250
|
+
'subprocess_total' => 3,
|
|
251
|
+
'parallel' => true
|
|
252
|
+
}.merge(extra))
|
|
253
|
+
FileUtils.touch(PendingState.collect_lock_path(@token))
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
def call_wait(args = {})
|
|
257
|
+
JSON.parse(@tool.call({ 'collect_token' => @token }.merge(args)).first[:text])
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
# Bug #1 (P0): config_parallel had dead `unless ... || true` guard so
|
|
261
|
+
# YAML was never loaded. Verify config keys actually take effect now.
|
|
262
|
+
def test_config_parallel_loads_yaml_when_file_exists
|
|
263
|
+
# Use ruby reflection: invoke the private loader directly.
|
|
264
|
+
loaded = @tool.send(:load_config_parallel)
|
|
265
|
+
assert_kind_of Hash, loaded
|
|
266
|
+
# Real config file ships with these keys (v3.24.0):
|
|
267
|
+
assert loaded.key?('wait_max_default_seconds') ||
|
|
268
|
+
loaded.key?('poll_interval_seconds'),
|
|
269
|
+
"load_config_parallel returned empty hash — YAML not actually loaded. Got: #{loaded.inspect}"
|
|
270
|
+
end
|
|
271
|
+
|
|
272
|
+
# Bug #6: streak guard ran BEFORE ready check, so a worker that
|
|
273
|
+
# finished while streak was at limit was misclassified as crashed.
|
|
274
|
+
def test_ready_check_takes_precedence_over_streak_guard
|
|
275
|
+
# Token streak (5) is past the default streak limit (3) AND subprocess_results.json exists.
|
|
276
|
+
write_state('wait_still_pending_streak' => 5)
|
|
277
|
+
PendingState.write_subprocess_results(@token, {
|
|
278
|
+
'results' => [
|
|
279
|
+
{ 'role_label' => 'r1', 'raw_text' => 'APPROVE', 'status' => 'success' },
|
|
280
|
+
{ 'role_label' => 'r2', 'raw_text' => 'APPROVE', 'status' => 'success' }
|
|
281
|
+
],
|
|
282
|
+
'elapsed_seconds' => 5.0
|
|
283
|
+
})
|
|
284
|
+
payload = call_wait('max_wait_seconds' => 1)
|
|
285
|
+
assert_equal 'ready', payload['status'],
|
|
286
|
+
"Expected ready (worker finished) even though streak limit was hit; got: #{payload.inspect}"
|
|
287
|
+
assert_equal 'multi_llm_review_collect', payload['next_action']['tool']
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
# Bug #4: post-wait deadline revalidation. If deadline elapses during
|
|
291
|
+
# WaitForWorker.wait, the post-wait check should return
|
|
292
|
+
# past_collect_deadline rather than still_pending.
|
|
293
|
+
def test_post_wait_deadline_revalidation
|
|
294
|
+
# Deadline is 1.5s from now. Heartbeat live → WaitForWorker.wait
|
|
295
|
+
# would return :timeout after max_wait=2s, but deadline-cap clamps
|
|
296
|
+
# to ~1.5s. After the wait, Time.now >= deadline_at_entry → return
|
|
297
|
+
# past_collect_deadline.
|
|
298
|
+
write_state('collect_deadline' => (Time.now + 1.5).iso8601)
|
|
299
|
+
FileUtils.touch(PendingState.worker_heartbeat_path(@token))
|
|
300
|
+
PendingState.write_worker_pid(@token, { 'pid' => Process.pid, 'pgid' => Process.pid })
|
|
301
|
+
|
|
302
|
+
payload = call_wait('max_wait_seconds' => 2)
|
|
303
|
+
# Outcome should NOT be still_pending — either past_collect_deadline
|
|
304
|
+
# (post-wait revalidation fired) or ready (if results file appeared).
|
|
305
|
+
# What we forbid is still_pending when the deadline is gone.
|
|
306
|
+
refute_equal 'still_pending', payload['status'],
|
|
307
|
+
"Should not return still_pending when deadline elapsed during wait. Got: #{payload.inspect}"
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Bug #7: malformed collect_deadline → previously silently nilled and
|
|
311
|
+
# skipped checks. Now should return crashed/malformed_state.
|
|
312
|
+
def test_malformed_collect_deadline_returns_crashed
|
|
313
|
+
write_state('collect_deadline' => 'not-an-iso8601-timestamp')
|
|
314
|
+
payload = call_wait('max_wait_seconds' => 1)
|
|
315
|
+
assert_equal 'crashed', payload['status']
|
|
316
|
+
assert_equal 'malformed_state', payload['crashed_reason']
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
# Bug #5: internal exceptions previously returned status: 'error',
|
|
320
|
+
# outside the declared 6-status enum. Now should map to crashed.
|
|
321
|
+
def test_internal_error_returns_crashed_status_in_enum
|
|
322
|
+
# Trigger an internal error by passing a weird arguments object.
|
|
323
|
+
# The outer rescue should map it to crashed/internal_error.
|
|
324
|
+
payload = JSON.parse(@tool.call(nil).first[:text])
|
|
325
|
+
# nil arguments → token becomes "" → unknown_token (not internal_error)
|
|
326
|
+
# so the error path needs a different trigger. Use a token that
|
|
327
|
+
# passes valid_token? but PendingState raises on. Easier: stub.
|
|
328
|
+
assert_includes %w[unknown_token crashed], payload['status']
|
|
329
|
+
refute_equal 'error', payload['status']
|
|
330
|
+
end
|
|
331
|
+
|
|
332
|
+
# Bug #2: streak increment via update_state RMW is atomic. Verify
|
|
333
|
+
# that under sequential timeouts, streak increments correctly.
|
|
334
|
+
def test_streak_increments_atomically_via_update_state
|
|
335
|
+
write_state
|
|
336
|
+
FileUtils.touch(PendingState.worker_heartbeat_path(@token))
|
|
337
|
+
PendingState.write_worker_pid(@token, { 'pid' => Process.pid, 'pgid' => Process.pid })
|
|
338
|
+
|
|
339
|
+
p1 = call_wait('max_wait_seconds' => 1)
|
|
340
|
+
assert_equal 'still_pending', p1['status']
|
|
341
|
+
assert_equal 1, p1['still_pending_streak']
|
|
342
|
+
|
|
343
|
+
# Reload state and verify persistence.
|
|
344
|
+
state_after_1 = PendingState.load_state(@token)
|
|
345
|
+
assert_equal 1, state_after_1['wait_still_pending_streak']
|
|
346
|
+
|
|
347
|
+
p2 = call_wait('max_wait_seconds' => 1)
|
|
348
|
+
assert_equal 'still_pending', p2['status']
|
|
349
|
+
assert_equal 2, p2['still_pending_streak']
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
# Bug #3: still_pending hint should report the *effective* streak
|
|
353
|
+
# limit (from config), not nil from state['wait_still_pending_streak_limit'].
|
|
354
|
+
def test_still_pending_hint_reports_correct_streak_limit
|
|
355
|
+
write_state
|
|
356
|
+
FileUtils.touch(PendingState.worker_heartbeat_path(@token))
|
|
357
|
+
PendingState.write_worker_pid(@token, { 'pid' => Process.pid, 'pgid' => Process.pid })
|
|
358
|
+
|
|
359
|
+
p = call_wait('max_wait_seconds' => 1)
|
|
360
|
+
assert_equal 'still_pending', p['status']
|
|
361
|
+
# Hint must mention "streak N/M" with M being the actual limit (3 by default).
|
|
362
|
+
purpose = p['next_action']['purpose']
|
|
363
|
+
assert_match(%r{streak 1/3}, purpose,
|
|
364
|
+
"Expected '/3' (effective limit) in next_action purpose; got: #{purpose}")
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
# Off-by-one: when remaining < 1s, return past_collect_deadline
|
|
368
|
+
# rather than clamping to 1 and entering WaitForWorker.
|
|
369
|
+
def test_remaining_lt_one_second_returns_past_deadline_immediately
|
|
370
|
+
write_state('collect_deadline' => (Time.now + 0.4).iso8601)
|
|
371
|
+
# Sleep briefly so remaining is genuinely < 0.
|
|
372
|
+
sleep 0.5
|
|
373
|
+
t0 = Time.now
|
|
374
|
+
p = call_wait('max_wait_seconds' => 60)
|
|
375
|
+
elapsed = Time.now - t0
|
|
376
|
+
assert_equal 'past_collect_deadline', p['status']
|
|
377
|
+
assert_operator elapsed, :<, 1.0
|
|
378
|
+
end
|
|
379
|
+
end
|
|
380
|
+
|
|
227
381
|
# ── backward compat: collect can still be called without wait ────────
|
|
228
382
|
# Verifies that introducing wait does not break the existing
|
|
229
383
|
# "delegation_pending → collect" path. The collect tool already polls
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
4
|
require 'time'
|
|
5
|
+
require 'yaml'
|
|
5
6
|
require_relative '../lib/multi_llm_review/pending_state'
|
|
6
7
|
require_relative '../lib/multi_llm_review/wait_for_worker'
|
|
7
8
|
|
|
@@ -18,23 +19,25 @@ module KairosMcp
|
|
|
18
19
|
#
|
|
19
20
|
# Without this tool, orchestrator can still call collect directly —
|
|
20
21
|
# collect's own internal polling covers worker completion. wait is a
|
|
21
|
-
# tool-chain checkpoint that surfaces structural status
|
|
22
|
-
#
|
|
23
|
-
#
|
|
22
|
+
# tool-chain checkpoint that surfaces structural status with explicit
|
|
23
|
+
# next_action recovery hints, so the LLM can choose the right next
|
|
24
|
+
# step deterministically.
|
|
24
25
|
#
|
|
25
|
-
# Status enum
|
|
26
|
+
# Status enum:
|
|
26
27
|
# ready — subprocess_results.json present, proceed to collect
|
|
27
28
|
# still_pending — max_wait elapsed, worker healthy, may call wait again
|
|
28
|
-
# crashed — worker terminal failure (with reason)
|
|
29
|
+
# crashed — worker terminal failure or internal error (with reason)
|
|
29
30
|
# unknown_token — token dir missing (never existed or GC'd)
|
|
30
31
|
# already_collected — collected.json present, retrieve cached payload
|
|
31
32
|
# past_collect_deadline — token alive but past deadline; collect would reject
|
|
33
|
+
#
|
|
34
|
+
# Internal exceptions are mapped to `crashed` (reason: internal_error)
|
|
35
|
+
# to keep the public response strictly inside the declared enum.
|
|
32
36
|
class MultiLlmReviewWait < KairosMcp::Tools::BaseTool
|
|
33
|
-
|
|
34
|
-
MAX_WAIT_HARD_CAP_DEFAULT = 1800
|
|
35
|
-
|
|
36
|
-
# Default streak limit before still_pending escalates to crashed (R7).
|
|
37
|
+
MAX_WAIT_HARD_CAP_DEFAULT = 1800
|
|
37
38
|
STILL_PENDING_STREAK_LIMIT_DEFAULT = 3
|
|
39
|
+
DEFAULT_MAX_WAIT_SECONDS = 600
|
|
40
|
+
DEFAULT_POLL_INTERVAL_SECONDS = 1.0
|
|
38
41
|
|
|
39
42
|
def name
|
|
40
43
|
'multi_llm_review_wait'
|
|
@@ -70,8 +73,8 @@ module KairosMcp
|
|
|
70
73
|
max_wait_seconds: {
|
|
71
74
|
type: 'integer',
|
|
72
75
|
description: 'Server-side blocking duration cap in seconds. ' \
|
|
73
|
-
'Default from config (delegation.parallel.wait_max_default_seconds). ' \
|
|
74
|
-
'Hard cap
|
|
76
|
+
'Default from config (delegation.parallel.wait_max_default_seconds = 600). ' \
|
|
77
|
+
'Hard cap from config (delegation.parallel.wait_max_hard_cap_seconds = 1800).'
|
|
75
78
|
}
|
|
76
79
|
},
|
|
77
80
|
required: %w[collect_token]
|
|
@@ -79,22 +82,17 @@ module KairosMcp
|
|
|
79
82
|
end
|
|
80
83
|
|
|
81
84
|
def call(arguments)
|
|
82
|
-
token = arguments['collect_token'].to_s
|
|
85
|
+
token = (arguments.is_a?(Hash) ? arguments['collect_token'] : nil).to_s
|
|
86
|
+
|
|
83
87
|
unless PendingState.valid_token?(token)
|
|
84
|
-
return
|
|
85
|
-
'
|
|
86
|
-
'collect_token' => token,
|
|
87
|
-
'elapsed_seconds' => 0.0,
|
|
88
|
-
'next_action' => next_action_redispatch(
|
|
89
|
-
'Token format invalid. Re-run multi_llm_review to start a new dispatch.'
|
|
90
|
-
)
|
|
91
|
-
}))
|
|
88
|
+
return reply_unknown_token(token,
|
|
89
|
+
'Token format invalid. Re-run multi_llm_review to start a new dispatch.')
|
|
92
90
|
end
|
|
93
91
|
|
|
94
|
-
cfg
|
|
95
|
-
default_max = (cfg['wait_max_default_seconds'] ||
|
|
92
|
+
cfg = load_config_parallel
|
|
93
|
+
default_max = (cfg['wait_max_default_seconds'] || DEFAULT_MAX_WAIT_SECONDS).to_i
|
|
96
94
|
hard_cap = (cfg['wait_max_hard_cap_seconds'] || MAX_WAIT_HARD_CAP_DEFAULT).to_i
|
|
97
|
-
poll_int = (cfg['wait_poll_interval_seconds'] ||
|
|
95
|
+
poll_int = (cfg['wait_poll_interval_seconds'] || DEFAULT_POLL_INTERVAL_SECONDS).to_f
|
|
98
96
|
streak_limit = (cfg['wait_still_pending_streak_limit'] ||
|
|
99
97
|
STILL_PENDING_STREAK_LIMIT_DEFAULT).to_i
|
|
100
98
|
|
|
@@ -102,57 +100,85 @@ module KairosMcp
|
|
|
102
100
|
requested_max = hard_cap if requested_max > hard_cap
|
|
103
101
|
requested_max = 1 if requested_max < 1
|
|
104
102
|
|
|
105
|
-
# 1. already_collected check
|
|
106
|
-
# deadline / token-dir checks so a successful collect always
|
|
103
|
+
# 1. already_collected — check first so a successful collect always
|
|
107
104
|
# returns deterministically even after deadline expiry.
|
|
108
|
-
|
|
105
|
+
collected_path = PendingState.collected_path(token)
|
|
106
|
+
if File.exist?(collected_path)
|
|
109
107
|
return reply('already_collected', token, 0.0,
|
|
110
108
|
next_action: next_action_collect_replay(token,
|
|
111
109
|
'Collect already completed for this token. Call multi_llm_review_collect ' \
|
|
112
110
|
'to retrieve the cached final consensus (idempotent replay).'))
|
|
113
111
|
end
|
|
114
112
|
|
|
115
|
-
# 2.
|
|
113
|
+
# 2. ready check BEFORE streak guard (Bug #6 from v3.24.0 review).
|
|
114
|
+
# If subprocess_results.json is already on disk, return ready
|
|
115
|
+
# regardless of streak — the worker finished, completion wins.
|
|
116
|
+
results_path = PendingState.subprocess_results_path(token)
|
|
117
|
+
if File.exist?(results_path)
|
|
118
|
+
return reply_ready_from_results_file(token, results_path)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# 3. unknown_token — state.json missing.
|
|
116
122
|
state = PendingState.load_state(token)
|
|
117
123
|
if state.nil?
|
|
118
|
-
return
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
'Re-run multi_llm_review to start a new dispatch.'))
|
|
124
|
+
return reply_unknown_token(token,
|
|
125
|
+
'Token not found (never existed or already garbage-collected). ' \
|
|
126
|
+
'Re-run multi_llm_review to start a new dispatch.')
|
|
122
127
|
end
|
|
123
128
|
|
|
124
|
-
#
|
|
125
|
-
|
|
129
|
+
# 4. Detect malformed collect_deadline (Bug #7) — return crashed
|
|
130
|
+
# with a clear reason rather than silently skipping the check.
|
|
131
|
+
deadline = nil
|
|
132
|
+
if state['collect_deadline']
|
|
133
|
+
deadline = (Time.iso8601(state['collect_deadline']) rescue :malformed)
|
|
134
|
+
if deadline == :malformed
|
|
135
|
+
return reply('crashed', token, 0.0,
|
|
136
|
+
crashed_reason: 'malformed_state',
|
|
137
|
+
next_action: next_action_redispatch(
|
|
138
|
+
'state.json has malformed collect_deadline. The token is unrecoverable; ' \
|
|
139
|
+
're-run multi_llm_review.'))
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# 5. past_collect_deadline early exit — collect would reject anyway.
|
|
126
144
|
if deadline && Time.now > deadline
|
|
127
145
|
return reply('past_collect_deadline', token, 0.0,
|
|
128
|
-
subprocess_total: state
|
|
129
|
-
(PendingState.load_request(token)&.dig('reviewers')&.size),
|
|
146
|
+
subprocess_total: subprocess_total_from(state, token),
|
|
130
147
|
next_action: next_action_redispatch(
|
|
131
148
|
'Token deadline elapsed. multi_llm_review_collect would reject. ' \
|
|
132
|
-
'Re-run multi_llm_review
|
|
149
|
+
'Re-run multi_llm_review.'))
|
|
133
150
|
end
|
|
134
151
|
|
|
135
|
-
#
|
|
136
|
-
#
|
|
152
|
+
# 6. Cap max_wait by remaining deadline. If no time remains (<= 0s), return
|
|
153
|
+
# past_collect_deadline directly (Bug from v3.24.0 review:
|
|
154
|
+
# previously clamped to 1 and entered WaitForWorker pointlessly).
|
|
137
155
|
if deadline
|
|
138
|
-
|
|
156
|
+
remaining_f = deadline - Time.now
|
|
157
|
+
if remaining_f <= 0
|
|
158
|
+
return reply('past_collect_deadline', token, 0.0,
|
|
159
|
+
subprocess_total: subprocess_total_from(state, token),
|
|
160
|
+
next_action: next_action_redispatch(
|
|
161
|
+
'Token deadline elapsed. Re-run multi_llm_review.'))
|
|
162
|
+
end
|
|
163
|
+
# Ceil rather than floor so the wait can actually run up to the
|
|
164
|
+
# deadline. The post-wait revalidation in translate_outcome
|
|
165
|
+
# catches any overshoot (Bug #4 defense-in-depth).
|
|
166
|
+
remaining = remaining_f.ceil
|
|
139
167
|
requested_max = remaining if remaining < requested_max
|
|
140
|
-
requested_max = 1 if requested_max < 1
|
|
141
168
|
end
|
|
142
169
|
|
|
143
|
-
#
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
if streak >= streak_limit
|
|
170
|
+
# 7. Streak guard — runs AFTER ready check (Bug #6 fix).
|
|
171
|
+
current_streak = state['wait_still_pending_streak'].to_i
|
|
172
|
+
if current_streak >= streak_limit
|
|
147
173
|
return reply('crashed', token, 0.0,
|
|
148
174
|
crashed_reason: 'wait_exhausted',
|
|
149
|
-
still_pending_streak:
|
|
175
|
+
still_pending_streak: current_streak,
|
|
150
176
|
next_action: next_action_redispatch(
|
|
151
|
-
"still_pending streak reached limit (#{streak_limit}).
|
|
152
|
-
'wedged or pathologically slow. Re-run multi_llm_review.'))
|
|
177
|
+
"still_pending streak reached limit (#{current_streak}/#{streak_limit}). " \
|
|
178
|
+
'Worker may be wedged or pathologically slow. Re-run multi_llm_review.'))
|
|
153
179
|
end
|
|
154
180
|
|
|
155
|
-
#
|
|
181
|
+
# 8. Delegate to existing WaitForWorker for the actual polling.
|
|
156
182
|
outcome = WaitForWorker.wait(token, {
|
|
157
183
|
max_wait_seconds: requested_max,
|
|
158
184
|
poll_interval_seconds: poll_int,
|
|
@@ -160,24 +186,27 @@ module KairosMcp
|
|
|
160
186
|
heartbeat_stale_threshold_seconds: cfg['heartbeat_stale_threshold_seconds'] || 15
|
|
161
187
|
})
|
|
162
188
|
|
|
163
|
-
translate_outcome(token, outcome,
|
|
189
|
+
translate_outcome(token, outcome, requested_max, streak_limit, deadline)
|
|
164
190
|
rescue StandardError => e
|
|
165
191
|
warn "[multi_llm_review_wait] INTERNAL ERROR: #{e.class}: #{e.message}"
|
|
166
192
|
warn e.backtrace.first(10).join("\n") if e.backtrace
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
193
|
+
# Map internal errors to declared enum (Bug #5: previously returned
|
|
194
|
+
# status: 'error' which was outside the documented 6 statuses).
|
|
195
|
+
safe_token = (arguments.is_a?(Hash) ? arguments['collect_token'] : nil).to_s
|
|
196
|
+
reply('crashed', safe_token, 0.0,
|
|
197
|
+
crashed_reason: 'internal_error',
|
|
198
|
+
next_action: next_action_redispatch(
|
|
199
|
+
"Internal error (#{e.class}). Re-run multi_llm_review."))
|
|
173
200
|
end
|
|
174
201
|
|
|
175
202
|
private
|
|
176
203
|
|
|
177
|
-
def translate_outcome(token, outcome,
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
204
|
+
def translate_outcome(token, outcome, requested_max, streak_limit, deadline_at_entry)
|
|
205
|
+
# WaitForWorker returns :elapsed for ready, :waited_seconds for
|
|
206
|
+
# timeout. Use the first non-nil so still_pending and crashed
|
|
207
|
+
# paths report real wait time, not 0.0.
|
|
208
|
+
elapsed = (outcome[:elapsed] || outcome[:waited_seconds] || 0.0).to_f
|
|
209
|
+
subprocess_total = subprocess_total_from(PendingState.load_state(token), token)
|
|
181
210
|
|
|
182
211
|
case outcome[:status]
|
|
183
212
|
when :ready
|
|
@@ -197,16 +226,38 @@ module KairosMcp
|
|
|
197
226
|
subprocess_total: subprocess_total,
|
|
198
227
|
next_action: next_action_redispatch(
|
|
199
228
|
"Worker terminated abnormally (#{outcome[:reason] || 'crashed'}). " \
|
|
200
|
-
'Re-run multi_llm_review
|
|
229
|
+
'Re-run multi_llm_review.'))
|
|
201
230
|
when :timeout
|
|
202
|
-
|
|
203
|
-
|
|
231
|
+
# Post-wait deadline revalidation (Bug #4 fix). The deadline
|
|
232
|
+
# may have elapsed during the blocking wait; if so, return
|
|
233
|
+
# past_collect_deadline rather than still_pending. Use >= so
|
|
234
|
+
# the boundary case (Time.now == deadline) is treated as past.
|
|
235
|
+
if deadline_at_entry && Time.now >= deadline_at_entry
|
|
236
|
+
return reply('past_collect_deadline', token, elapsed,
|
|
237
|
+
subprocess_total: subprocess_total,
|
|
238
|
+
next_action: next_action_redispatch(
|
|
239
|
+
'Deadline elapsed during wait. Re-run multi_llm_review.'))
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# Atomic increment via PendingState.update_state RMW (Bug #2).
|
|
243
|
+
# The block reads the current persisted streak and writes
|
|
244
|
+
# current+1 in one transaction, so concurrent waiters cannot
|
|
245
|
+
# both read the same N and both write N+1.
|
|
246
|
+
new_streak = nil
|
|
247
|
+
PendingState.update_state(token) do |st|
|
|
248
|
+
next nil unless st
|
|
249
|
+
new_streak = st['wait_still_pending_streak'].to_i + 1
|
|
250
|
+
st['wait_still_pending_streak'] = new_streak
|
|
251
|
+
st
|
|
252
|
+
end
|
|
253
|
+
new_streak ||= 1
|
|
254
|
+
|
|
204
255
|
reply('still_pending', token, elapsed,
|
|
205
256
|
subprocess_total: subprocess_total,
|
|
206
257
|
still_pending_streak: new_streak,
|
|
207
258
|
next_action: next_action_wait(token,
|
|
208
259
|
"Worker still healthy after #{requested_max}s. Call multi_llm_review_wait " \
|
|
209
|
-
"again with the same token (streak #{new_streak}/#{
|
|
260
|
+
"again with the same token (streak #{new_streak}/#{streak_limit})."))
|
|
210
261
|
else
|
|
211
262
|
reply('crashed', token, elapsed,
|
|
212
263
|
crashed_reason: "unknown_outcome:#{outcome[:status]}",
|
|
@@ -216,20 +267,44 @@ module KairosMcp
|
|
|
216
267
|
end
|
|
217
268
|
end
|
|
218
269
|
|
|
270
|
+
def reply_ready_from_results_file(token, results_path)
|
|
271
|
+
data = PendingState.load_subprocess_results(token)
|
|
272
|
+
done = (data && data['results'].is_a?(Array)) ? data['results'].size : nil
|
|
273
|
+
elapsed = (data && data['elapsed_seconds'].to_f) || 0.0
|
|
274
|
+
reset_streak(token)
|
|
275
|
+
reply('ready', token, elapsed,
|
|
276
|
+
subprocess_done: done,
|
|
277
|
+
subprocess_total: subprocess_total_from(PendingState.load_state(token), token) || done,
|
|
278
|
+
next_action: next_action_collect(token,
|
|
279
|
+
'Subprocess reviewers complete. Submit your persona Agent findings to ' \
|
|
280
|
+
'multi_llm_review_collect to compute the final consensus.'))
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
def reply_unknown_token(token, purpose)
|
|
284
|
+
reply('unknown_token', token, 0.0,
|
|
285
|
+
next_action: next_action_redispatch(purpose))
|
|
286
|
+
end
|
|
287
|
+
|
|
219
288
|
def reply(status, token, elapsed, **fields)
|
|
220
289
|
payload = {
|
|
221
290
|
'status' => status,
|
|
222
291
|
'collect_token' => token,
|
|
223
|
-
'elapsed_seconds' => elapsed.round(3)
|
|
292
|
+
'elapsed_seconds' => elapsed.to_f.round(3)
|
|
224
293
|
}
|
|
225
|
-
payload['subprocess_done']
|
|
226
|
-
payload['subprocess_total']
|
|
227
|
-
payload['crashed_reason']
|
|
228
|
-
payload['still_pending_streak']
|
|
229
|
-
payload['next_action']
|
|
294
|
+
payload['subprocess_done'] = fields[:subprocess_done] if fields.key?(:subprocess_done)
|
|
295
|
+
payload['subprocess_total'] = fields[:subprocess_total] if fields.key?(:subprocess_total)
|
|
296
|
+
payload['crashed_reason'] = fields[:crashed_reason] if fields.key?(:crashed_reason)
|
|
297
|
+
payload['still_pending_streak'] = fields[:still_pending_streak] if fields.key?(:still_pending_streak)
|
|
298
|
+
payload['next_action'] = fields[:next_action] if fields.key?(:next_action)
|
|
230
299
|
text_content(JSON.generate(payload))
|
|
231
300
|
end
|
|
232
301
|
|
|
302
|
+
def subprocess_total_from(state, token)
|
|
303
|
+
return state['subprocess_total'] if state.is_a?(Hash) && state['subprocess_total']
|
|
304
|
+
req = PendingState.load_request(token) rescue nil
|
|
305
|
+
req&.dig('reviewers')&.size
|
|
306
|
+
end
|
|
307
|
+
|
|
233
308
|
def next_action_collect(token, purpose)
|
|
234
309
|
{
|
|
235
310
|
'tool' => 'multi_llm_review_collect',
|
|
@@ -265,18 +340,9 @@ module KairosMcp
|
|
|
265
340
|
}
|
|
266
341
|
end
|
|
267
342
|
|
|
268
|
-
#
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
next nil unless state
|
|
272
|
-
state['wait_still_pending_streak'] = n
|
|
273
|
-
state
|
|
274
|
-
end
|
|
275
|
-
rescue StandardError
|
|
276
|
-
# Best-effort. Streak loss = orchestrator gets one more retry,
|
|
277
|
-
# acceptable degradation.
|
|
278
|
-
end
|
|
279
|
-
|
|
343
|
+
# Atomic streak reset via update_state RMW. Errors are logged (not
|
|
344
|
+
# silently swallowed — Bug #8) so genuine PendingState failures
|
|
345
|
+
# surface in stderr.
|
|
280
346
|
def reset_streak(token)
|
|
281
347
|
PendingState.update_state(token) do |state|
|
|
282
348
|
next nil unless state
|
|
@@ -287,23 +353,21 @@ module KairosMcp
|
|
|
287
353
|
nil
|
|
288
354
|
end
|
|
289
355
|
end
|
|
290
|
-
rescue StandardError
|
|
291
|
-
#
|
|
292
|
-
end
|
|
293
|
-
|
|
294
|
-
def safe_path
|
|
295
|
-
yield
|
|
296
|
-
rescue StandardError
|
|
297
|
-
'/dev/null/never_exists'
|
|
356
|
+
rescue StandardError => e
|
|
357
|
+
warn "[multi_llm_review_wait] reset_streak failed: #{e.class}: #{e.message}"
|
|
298
358
|
end
|
|
299
359
|
|
|
300
|
-
|
|
301
|
-
|
|
360
|
+
# Load the delegation.parallel config block. v3.24.0 had a dead-code
|
|
361
|
+
# bug here (`unless ... || true` always true → always returned {}).
|
|
362
|
+
# v3.24.1 removes the dead guard and explicitly requires 'yaml' at
|
|
363
|
+
# the top of the file.
|
|
364
|
+
def load_config_parallel
|
|
302
365
|
path = File.expand_path('../config/multi_llm_review.yml', __dir__)
|
|
303
366
|
return {} unless File.exist?(path)
|
|
304
367
|
cfg = YAML.safe_load_file(path, permitted_classes: [Symbol], aliases: true)
|
|
305
368
|
(cfg.dig('delegation', 'parallel') || {}).to_h
|
|
306
|
-
rescue StandardError
|
|
369
|
+
rescue StandardError => e
|
|
370
|
+
warn "[multi_llm_review_wait] config load failed: #{e.class}: #{e.message}"
|
|
307
371
|
{}
|
|
308
372
|
end
|
|
309
373
|
end
|