sneakoscope 2.0.15 → 2.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -3
- package/crates/sks-core/Cargo.lock +1 -1
- package/crates/sks-core/Cargo.toml +1 -1
- package/crates/sks-core/src/main.rs +1 -1
- package/dist/.sks-build-stamp.json +4 -4
- package/dist/bin/sks.js +1 -1
- package/dist/cli/command-registry.js +1 -1
- package/dist/commands/proof.js +21 -0
- package/dist/commands/zellij-slot-pane.js +7 -1
- package/dist/core/agents/agent-orchestrator.js +68 -3
- package/dist/core/agents/agent-scheduler.js +217 -86
- package/dist/core/agents/agent-schema.js +1 -1
- package/dist/core/agents/native-cli-session-swarm.js +97 -27
- package/dist/core/agents/native-cli-worker.js +56 -7
- package/dist/core/agents/parallel-runtime-proof.js +276 -0
- package/dist/core/agents/runtime-proof-summary.js +75 -0
- package/dist/core/codex-control/codex-task-runner.js +32 -4
- package/dist/core/codex-control/model-call-concurrency.js +106 -0
- package/dist/core/commands/naruto-command.js +65 -8
- package/dist/core/commands/team-command.js +6 -487
- package/dist/core/commands/team-legacy-observe-command.js +182 -0
- package/dist/core/db-safety.js +49 -6
- package/dist/core/feature-registry.js +4 -2
- package/dist/core/fsx.js +1 -1
- package/dist/core/git/git-worktree-capability.js +18 -0
- package/dist/core/git/git-worktree-manager.js +80 -0
- package/dist/core/git/git-worktree-pool.js +4 -0
- package/dist/core/hooks-runtime.js +41 -4
- package/dist/core/init.js +1 -0
- package/dist/core/mad-db/mad-db-capability.js +42 -2
- package/dist/core/mad-db/mad-db-ledger.js +14 -0
- package/dist/core/mad-db/mad-db-policy-resolver.js +2 -0
- package/dist/core/mad-db/mad-db-result-lifecycle.js +136 -0
- package/dist/core/naruto/naruto-concurrency-governor.js +14 -1
- package/dist/core/release/release-gate-affected-selector.js +47 -5
- package/dist/core/release/release-gate-dag.js +5 -1
- package/dist/core/release/release-gate-scheduler.js +2 -1
- package/dist/core/routes.js +3 -1
- package/dist/core/version.js +1 -1
- package/dist/core/zellij/zellij-slot-pane-renderer.js +74 -1
- package/dist/core/zellij/zellij-slot-telemetry.js +81 -3
- package/dist/core/zellij/zellij-ui-mode.js +12 -2
- package/dist/scripts/prepublish-release-check-or-fast.js +3 -3
- package/dist/scripts/release-speed-summary.js +23 -1
- package/package.json +38 -3
- package/schemas/agents/parallel-runtime-proof.schema.json +79 -0
|
@@ -4,6 +4,7 @@ import { MAX_AGENT_COUNT } from './agent-schema.js';
|
|
|
4
4
|
import { appendAgentWorkQueueEvent, completeWorkItem, createAgentWorkQueue, enqueueFollowUpWorkItems, leaseNextWorkItem, pendingWorkItems, writeAgentWorkQueue } from './agent-work-queue.js';
|
|
5
5
|
import { closeWorkerSlotsAfterDrain, createAgentWorkerSlots, markWorkerSlotGenerationClosed, openWorkerSlotGeneration, writeAgentWorkerSlots } from './agent-worker-slot.js';
|
|
6
6
|
import { closeAgentSessionGeneration, createAgentSessionGeneration, writeAgentSessionGeneration } from './agent-session-generation.js';
|
|
7
|
+
import { appendParallelRuntimeEvent } from './parallel-runtime-proof.js';
|
|
7
8
|
export const AGENT_SCHEDULER_SCHEMA = 'sks.agent-scheduler.v1';
|
|
8
9
|
export const AGENT_SCHEDULER_EVENT_SCHEMA = 'sks.agent-scheduler-event.v1';
|
|
9
10
|
export async function runAgentScheduler(input) {
|
|
@@ -19,6 +20,12 @@ export async function runAgentScheduler(input) {
|
|
|
19
20
|
});
|
|
20
21
|
const active = new Map();
|
|
21
22
|
const results = [];
|
|
23
|
+
const schedulerStartedAt = Date.now();
|
|
24
|
+
let lastUtilizationUpdateMs = schedulerStartedAt;
|
|
25
|
+
let activeSlotTimeMs = 0;
|
|
26
|
+
let batchCounter = 0;
|
|
27
|
+
let batchLaunchSpanTotalMs = 0;
|
|
28
|
+
let batchDispatchInProgress = false;
|
|
22
29
|
let state = buildState(input.missionId, targetActiveSlots, queue, slots, active, {
|
|
23
30
|
status: 'initializing',
|
|
24
31
|
refillDelayMs: input.refillDelayMs || 0,
|
|
@@ -27,7 +34,7 @@ export async function runAgentScheduler(input) {
|
|
|
27
34
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_initialized' }, input.onSchedulerEvent);
|
|
28
35
|
await refillSlots(null);
|
|
29
36
|
while (active.size > 0 || pendingWorkItems(queue).length > 0) {
|
|
30
|
-
if (active.size === 0 && pendingWorkItems(queue).length > 0) {
|
|
37
|
+
if (!batchDispatchInProgress && active.size === 0 && pendingWorkItems(queue).length > 0) {
|
|
31
38
|
state.blockers.push('scheduler_pending_queue_without_active_sessions');
|
|
32
39
|
state.status = 'blocked';
|
|
33
40
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_blocked', pending_count: pendingWorkItems(queue).length }, input.onSchedulerEvent);
|
|
@@ -38,6 +45,7 @@ export async function runAgentScheduler(input) {
|
|
|
38
45
|
if (!entry)
|
|
39
46
|
continue;
|
|
40
47
|
const activeCountBeforeClose = active.size;
|
|
48
|
+
accumulateActiveSlotTime();
|
|
41
49
|
active.delete(settled.session_id);
|
|
42
50
|
const resultStatus = settled.result?.status === 'done' ? 'completed' : settled.result?.status === 'blocked' ? 'blocked' : 'failed';
|
|
43
51
|
completeWorkItem(queue, entry.work_item_id, settled.session_id, resultStatus, settled.error || null);
|
|
@@ -65,6 +73,7 @@ export async function runAgentScheduler(input) {
|
|
|
65
73
|
const pendingAfterClose = pendingWorkItems(queue).length;
|
|
66
74
|
if (pendingAfterClose > 0)
|
|
67
75
|
state.expected_backfill_count += 1;
|
|
76
|
+
updateUtilizationMetrics();
|
|
68
77
|
await writeAll(input.root, state, slots, queue, active, {
|
|
69
78
|
event_type: 'session_completed',
|
|
70
79
|
session_id: settled.session_id,
|
|
@@ -80,6 +89,7 @@ export async function runAgentScheduler(input) {
|
|
|
80
89
|
closed_at_ms: Date.now()
|
|
81
90
|
} : null);
|
|
82
91
|
}
|
|
92
|
+
updateUtilizationMetrics();
|
|
83
93
|
state.status = 'draining';
|
|
84
94
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_draining' }, input.onSchedulerEvent);
|
|
85
95
|
slots = closeWorkerSlotsAfterDrain(slots);
|
|
@@ -94,6 +104,7 @@ export async function runAgentScheduler(input) {
|
|
|
94
104
|
state.all_generations_closed = true;
|
|
95
105
|
if (!state.pending_queue_drained)
|
|
96
106
|
state.blockers.push('scheduler_pending_queue_not_drained');
|
|
107
|
+
updateUtilizationMetrics();
|
|
97
108
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_drained' }, input.onSchedulerEvent);
|
|
98
109
|
return {
|
|
99
110
|
schema: 'sks.agent-scheduler-result.v1',
|
|
@@ -105,9 +116,180 @@ export async function runAgentScheduler(input) {
|
|
|
105
116
|
};
|
|
106
117
|
async function refillSlots(backfill) {
|
|
107
118
|
state.status = 'running';
|
|
119
|
+
const launches = collectLaunchBatch();
|
|
120
|
+
if (!launches.length)
|
|
121
|
+
return;
|
|
122
|
+
batchDispatchInProgress = true;
|
|
123
|
+
const batchId = `batch-${Date.now().toString(36)}-${batchCounter++}`;
|
|
124
|
+
const batchStart = Date.now();
|
|
108
125
|
const launchEvents = [];
|
|
109
|
-
|
|
110
|
-
|
|
126
|
+
try {
|
|
127
|
+
for (const launch of launches)
|
|
128
|
+
slots[launch.slotIndex] = launch.openedSlot;
|
|
129
|
+
await Promise.all(launches.map((launch) => writeAgentSessionGeneration(input.root, launch.generation)));
|
|
130
|
+
await writeAll(input.root, state, slots, queue, active, {
|
|
131
|
+
event_type: 'batch_dispatch_started',
|
|
132
|
+
batch_id: batchId,
|
|
133
|
+
launch_count: launches.length,
|
|
134
|
+
session_ids: launches.map((launch) => launch.generation.session_id)
|
|
135
|
+
}, input.onSchedulerEvent);
|
|
136
|
+
await appendParallelRuntimeEvent(input.root, input.missionId, {
|
|
137
|
+
event_type: 'batch_dispatch_started',
|
|
138
|
+
slot_id: null,
|
|
139
|
+
generation_index: null,
|
|
140
|
+
session_id: null,
|
|
141
|
+
pid: null,
|
|
142
|
+
backend: 'scheduler',
|
|
143
|
+
placement: 'unknown',
|
|
144
|
+
batch_id: batchId,
|
|
145
|
+
meta: { launch_count: launches.length, active_count_before: active.size }
|
|
146
|
+
}).catch(() => undefined);
|
|
147
|
+
for (const launch of launches) {
|
|
148
|
+
const { slot, openedSlot, generation, agent, workItem } = launch;
|
|
149
|
+
await appendParallelRuntimeEvent(input.root, input.missionId, {
|
|
150
|
+
event_type: 'slot_reserved',
|
|
151
|
+
slot_id: slot.slot_id,
|
|
152
|
+
generation_index: generation.generation_index,
|
|
153
|
+
session_id: generation.session_id,
|
|
154
|
+
pid: null,
|
|
155
|
+
backend: 'scheduler',
|
|
156
|
+
placement: 'unknown',
|
|
157
|
+
batch_id: batchId,
|
|
158
|
+
meta: { work_item_id: workItem.id }
|
|
159
|
+
}).catch(() => undefined);
|
|
160
|
+
await appendParallelRuntimeEvent(input.root, input.missionId, {
|
|
161
|
+
event_type: 'worker_launch_invoked',
|
|
162
|
+
slot_id: slot.slot_id,
|
|
163
|
+
generation_index: generation.generation_index,
|
|
164
|
+
session_id: generation.session_id,
|
|
165
|
+
pid: null,
|
|
166
|
+
backend: 'scheduler',
|
|
167
|
+
placement: 'unknown',
|
|
168
|
+
batch_id: batchId,
|
|
169
|
+
meta: { work_item_id: workItem.id }
|
|
170
|
+
}).catch(() => undefined);
|
|
171
|
+
const promise = Promise.resolve()
|
|
172
|
+
.then(() => input.launchSession({ agent, workItem, generation, slot: openedSlot, queue, state }))
|
|
173
|
+
.then((result) => ({
|
|
174
|
+
result,
|
|
175
|
+
session_id: generation.session_id,
|
|
176
|
+
slot_id: slot.slot_id,
|
|
177
|
+
generation_index: generation.generation_index,
|
|
178
|
+
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
179
|
+
}))
|
|
180
|
+
.catch((err) => ({
|
|
181
|
+
result: {
|
|
182
|
+
schema: 'sks.agent-result.v1',
|
|
183
|
+
mission_id: input.missionId,
|
|
184
|
+
agent_id: agent.id,
|
|
185
|
+
session_id: generation.session_id,
|
|
186
|
+
persona_id: agent.persona_id,
|
|
187
|
+
task_slice_id: workItem.id,
|
|
188
|
+
status: 'failed',
|
|
189
|
+
backend: 'fake',
|
|
190
|
+
summary: err instanceof Error ? err.message : String(err),
|
|
191
|
+
findings: [],
|
|
192
|
+
proposed_changes: [],
|
|
193
|
+
changed_files: [],
|
|
194
|
+
lease_compliance: { ok: true, violations: [] },
|
|
195
|
+
artifacts: [],
|
|
196
|
+
blockers: ['scheduler_launch_failed'],
|
|
197
|
+
confidence: 'failed',
|
|
198
|
+
handoff_notes: '',
|
|
199
|
+
unverified: [],
|
|
200
|
+
writes: [],
|
|
201
|
+
recursion_guard: { ok: true, violations: [] },
|
|
202
|
+
verification: { status: 'failed', checks: [] },
|
|
203
|
+
source_intelligence_refs: input.sourceIntelligenceRefs || null,
|
|
204
|
+
goal_mode_ref: input.goalModeRef || null
|
|
205
|
+
},
|
|
206
|
+
session_id: generation.session_id,
|
|
207
|
+
slot_id: slot.slot_id,
|
|
208
|
+
generation_index: generation.generation_index,
|
|
209
|
+
error: err instanceof Error ? err.message : String(err),
|
|
210
|
+
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
211
|
+
}));
|
|
212
|
+
accumulateActiveSlotTime();
|
|
213
|
+
active.set(generation.session_id, { slot_id: slot.slot_id, work_item_id: workItem.id, session_id: generation.session_id, promise });
|
|
214
|
+
}
|
|
215
|
+
await appendAgentWorkQueueEvent(input.root, 'batch_work_items_dispatched', {
|
|
216
|
+
batch_id: batchId,
|
|
217
|
+
launch_count: launches.length,
|
|
218
|
+
session_ids: launches.map((launch) => launch.generation.session_id),
|
|
219
|
+
work_item_ids: launches.map((launch) => launch.workItem.id)
|
|
220
|
+
});
|
|
221
|
+
for (const launch of launches)
|
|
222
|
+
await appendAgentWorkQueueEvent(input.root, 'work_item_dispatched', { work_item_id: launch.workItem.id, session_id: launch.generation.session_id, slot_id: launch.slot.slot_id });
|
|
223
|
+
if (backfill) {
|
|
224
|
+
const firstLaunch = launches[0];
|
|
225
|
+
const refillLatencyMs = Math.max(0, Date.now() - backfill.closed_at_ms);
|
|
226
|
+
state.backfill_count += 1;
|
|
227
|
+
state.refill_latency_events_ms.push(refillLatencyMs);
|
|
228
|
+
state.refill_latency_p95_ms = percentile95(state.refill_latency_events_ms);
|
|
229
|
+
launchEvents.push({
|
|
230
|
+
event_type: 'backfill_event',
|
|
231
|
+
closed_session_id: backfill.closed_session_id,
|
|
232
|
+
new_session_id: firstLaunch?.generation.session_id || null,
|
|
233
|
+
slot_id: firstLaunch?.slot.slot_id || null,
|
|
234
|
+
batch_id: batchId,
|
|
235
|
+
launch_count: launches.length,
|
|
236
|
+
active_count_before: backfill.active_count_before,
|
|
237
|
+
active_count_after: active.size,
|
|
238
|
+
refill_latency_ms: refillLatencyMs
|
|
239
|
+
});
|
|
240
|
+
backfill = null;
|
|
241
|
+
}
|
|
242
|
+
else {
|
|
243
|
+
for (const launch of launches)
|
|
244
|
+
launchEvents.push({
|
|
245
|
+
event_type: 'session_launched',
|
|
246
|
+
session_id: launch.generation.session_id,
|
|
247
|
+
slot_id: launch.slot.slot_id,
|
|
248
|
+
work_item_id: launch.workItem.id,
|
|
249
|
+
active_count_after: active.size
|
|
250
|
+
});
|
|
251
|
+
}
|
|
252
|
+
if (input.refillDelayMs && input.refillDelayMs > 0)
|
|
253
|
+
await delay(input.refillDelayMs);
|
|
254
|
+
const launchSpanMs = Math.max(0, Date.now() - batchStart);
|
|
255
|
+
batchLaunchSpanTotalMs += launchSpanMs;
|
|
256
|
+
state.batch_dispatch_count += 1;
|
|
257
|
+
state.largest_batch_size = Math.max(state.largest_batch_size, launches.length);
|
|
258
|
+
if (state.first_batch_launch_span_ms === 0)
|
|
259
|
+
state.first_batch_launch_span_ms = launchSpanMs;
|
|
260
|
+
state.average_batch_launch_span_ms = Math.round(batchLaunchSpanTotalMs / Math.max(1, state.batch_dispatch_count));
|
|
261
|
+
updateUtilizationMetrics();
|
|
262
|
+
await appendParallelRuntimeEvent(input.root, input.missionId, {
|
|
263
|
+
event_type: 'batch_dispatch_completed',
|
|
264
|
+
slot_id: null,
|
|
265
|
+
generation_index: null,
|
|
266
|
+
session_id: null,
|
|
267
|
+
pid: null,
|
|
268
|
+
backend: 'scheduler',
|
|
269
|
+
placement: 'unknown',
|
|
270
|
+
batch_id: batchId,
|
|
271
|
+
meta: { launch_count: launches.length, launch_span_ms: launchSpanMs, active_count_after: active.size }
|
|
272
|
+
}).catch(() => undefined);
|
|
273
|
+
await writeAll(input.root, state, slots, queue, active, {
|
|
274
|
+
event_type: 'batch_dispatch_completed',
|
|
275
|
+
batch_id: batchId,
|
|
276
|
+
launch_count: launches.length,
|
|
277
|
+
launch_span_ms: launchSpanMs,
|
|
278
|
+
active_count_after: active.size,
|
|
279
|
+
session_ids: launches.map((launch) => launch.generation.session_id)
|
|
280
|
+
}, input.onSchedulerEvent);
|
|
281
|
+
}
|
|
282
|
+
finally {
|
|
283
|
+
batchDispatchInProgress = false;
|
|
284
|
+
}
|
|
285
|
+
for (const event of launchEvents)
|
|
286
|
+
await appendJsonl(path.join(input.root, 'agent-scheduler-events.jsonl'), { schema: AGENT_SCHEDULER_EVENT_SCHEMA, ts: nowIso(), ...event });
|
|
287
|
+
}
|
|
288
|
+
function collectLaunchBatch() {
|
|
289
|
+
const launches = [];
|
|
290
|
+
const reservedSlots = new Set();
|
|
291
|
+
while (active.size + launches.length < targetActiveSlots && pendingWorkItems(queue).length > 0) {
|
|
292
|
+
const slotIndex = slots.findIndex((slot, index) => slot.status === 'idle' && !reservedSlots.has(index));
|
|
111
293
|
if (slotIndex < 0)
|
|
112
294
|
break;
|
|
113
295
|
const slot = slots[slotIndex];
|
|
@@ -133,90 +315,25 @@ export async function runAgentScheduler(input) {
|
|
|
133
315
|
goalModeRef: workItem.goal_mode_ref
|
|
134
316
|
});
|
|
135
317
|
workItem.running_session_id = generation.session_id;
|
|
136
|
-
await writeAgentSessionGeneration(input.root, generation);
|
|
137
|
-
const agent = buildAgentForGeneration(slot, generation, workItem);
|
|
138
318
|
const openedSlot = openWorkerSlotGeneration(slot, generation);
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
session_id: generation.session_id,
|
|
143
|
-
slot_id: slot.slot_id,
|
|
144
|
-
generation_index: generation.generation_index,
|
|
145
|
-
work_item_id: workItem.id
|
|
146
|
-
}, input.onSchedulerEvent);
|
|
147
|
-
const promise = Promise.resolve()
|
|
148
|
-
.then(() => input.launchSession({ agent, workItem, generation, slot: openedSlot, queue, state }))
|
|
149
|
-
.then((result) => ({
|
|
150
|
-
result,
|
|
151
|
-
session_id: generation.session_id,
|
|
152
|
-
slot_id: slot.slot_id,
|
|
153
|
-
generation_index: generation.generation_index,
|
|
154
|
-
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
155
|
-
}))
|
|
156
|
-
.catch((err) => ({
|
|
157
|
-
result: {
|
|
158
|
-
schema: 'sks.agent-result.v1',
|
|
159
|
-
mission_id: input.missionId,
|
|
160
|
-
agent_id: agent.id,
|
|
161
|
-
session_id: generation.session_id,
|
|
162
|
-
persona_id: agent.persona_id,
|
|
163
|
-
task_slice_id: workItem.id,
|
|
164
|
-
status: 'failed',
|
|
165
|
-
backend: 'fake',
|
|
166
|
-
summary: err instanceof Error ? err.message : String(err),
|
|
167
|
-
findings: [],
|
|
168
|
-
proposed_changes: [],
|
|
169
|
-
changed_files: [],
|
|
170
|
-
lease_compliance: { ok: true, violations: [] },
|
|
171
|
-
artifacts: [],
|
|
172
|
-
blockers: ['scheduler_launch_failed'],
|
|
173
|
-
confidence: 'failed',
|
|
174
|
-
handoff_notes: '',
|
|
175
|
-
unverified: [],
|
|
176
|
-
writes: [],
|
|
177
|
-
recursion_guard: { ok: true, violations: [] },
|
|
178
|
-
verification: { status: 'failed', checks: [] },
|
|
179
|
-
source_intelligence_refs: input.sourceIntelligenceRefs || null,
|
|
180
|
-
goal_mode_ref: input.goalModeRef || null
|
|
181
|
-
},
|
|
182
|
-
session_id: generation.session_id,
|
|
183
|
-
slot_id: slot.slot_id,
|
|
184
|
-
generation_index: generation.generation_index,
|
|
185
|
-
error: err instanceof Error ? err.message : String(err),
|
|
186
|
-
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
187
|
-
}));
|
|
188
|
-
active.set(generation.session_id, { slot_id: slot.slot_id, work_item_id: workItem.id, session_id: generation.session_id, promise });
|
|
189
|
-
await appendAgentWorkQueueEvent(input.root, 'work_item_dispatched', { work_item_id: workItem.id, session_id: generation.session_id, slot_id: slot.slot_id });
|
|
190
|
-
if (backfill) {
|
|
191
|
-
const refillLatencyMs = Math.max(0, Date.now() - backfill.closed_at_ms);
|
|
192
|
-
state.backfill_count += 1;
|
|
193
|
-
state.refill_latency_events_ms.push(refillLatencyMs);
|
|
194
|
-
state.refill_latency_p95_ms = percentile95(state.refill_latency_events_ms);
|
|
195
|
-
launchEvents.push({
|
|
196
|
-
event_type: 'backfill_event',
|
|
197
|
-
closed_session_id: backfill.closed_session_id,
|
|
198
|
-
new_session_id: generation.session_id,
|
|
199
|
-
slot_id: slot.slot_id,
|
|
200
|
-
active_count_before: backfill.active_count_before,
|
|
201
|
-
active_count_after: active.size,
|
|
202
|
-
refill_latency_ms: refillLatencyMs
|
|
203
|
-
});
|
|
204
|
-
backfill = null;
|
|
205
|
-
}
|
|
206
|
-
else {
|
|
207
|
-
launchEvents.push({
|
|
208
|
-
event_type: 'session_launched',
|
|
209
|
-
session_id: generation.session_id,
|
|
210
|
-
slot_id: slot.slot_id,
|
|
211
|
-
work_item_id: workItem.id,
|
|
212
|
-
active_count_after: active.size
|
|
213
|
-
});
|
|
214
|
-
}
|
|
215
|
-
if (input.refillDelayMs && input.refillDelayMs > 0)
|
|
216
|
-
await delay(input.refillDelayMs);
|
|
319
|
+
const agent = buildAgentForGeneration(slot, generation, workItem);
|
|
320
|
+
launches.push({ slotIndex, slot, openedSlot, generation, agent, workItem, provisionalSessionId });
|
|
321
|
+
reservedSlots.add(slotIndex);
|
|
217
322
|
}
|
|
218
|
-
|
|
219
|
-
|
|
323
|
+
return launches;
|
|
324
|
+
}
|
|
325
|
+
function updateUtilizationMetrics() {
|
|
326
|
+
accumulateActiveSlotTime();
|
|
327
|
+
state.wall_time_ms = Math.max(0, Date.now() - schedulerStartedAt);
|
|
328
|
+
state.active_slot_time_ms = activeSlotTimeMs;
|
|
329
|
+
const denominator = Math.max(1, state.wall_time_ms * targetActiveSlots);
|
|
330
|
+
state.scheduler_utilization = Number(Math.min(1, state.active_slot_time_ms / denominator).toFixed(3));
|
|
331
|
+
}
|
|
332
|
+
function accumulateActiveSlotTime() {
|
|
333
|
+
const now = Date.now();
|
|
334
|
+
const delta = Math.max(0, now - lastUtilizationUpdateMs);
|
|
335
|
+
activeSlotTimeMs += active.size * delta;
|
|
336
|
+
lastUtilizationUpdateMs = now;
|
|
220
337
|
}
|
|
221
338
|
}
|
|
222
339
|
export function normalizeTargetActiveSlots(value, maxActiveSlots = MAX_AGENT_COUNT) {
|
|
@@ -261,7 +378,14 @@ function buildState(missionId, targetActiveSlots, queue, slots, active, opts) {
|
|
|
261
378
|
pending_queue_drained: pendingCount === 0,
|
|
262
379
|
all_slots_closed_after_drain: slots.length > 0 && slots.every((slot) => slot.status === 'closed'),
|
|
263
380
|
all_generations_closed: false,
|
|
264
|
-
blockers: [...(previous?.blockers || [])]
|
|
381
|
+
blockers: [...(previous?.blockers || [])],
|
|
382
|
+
batch_dispatch_count: previous?.batch_dispatch_count || 0,
|
|
383
|
+
largest_batch_size: previous?.largest_batch_size || 0,
|
|
384
|
+
first_batch_launch_span_ms: previous?.first_batch_launch_span_ms || 0,
|
|
385
|
+
average_batch_launch_span_ms: previous?.average_batch_launch_span_ms || 0,
|
|
386
|
+
scheduler_utilization: previous?.scheduler_utilization || 0,
|
|
387
|
+
active_slot_time_ms: previous?.active_slot_time_ms || 0,
|
|
388
|
+
wall_time_ms: previous?.wall_time_ms || 0
|
|
265
389
|
};
|
|
266
390
|
}
|
|
267
391
|
async function writeAll(root, currentState, slots, queue, active, event, onSchedulerEvent) {
|
|
@@ -289,6 +413,13 @@ async function writeAll(root, currentState, slots, queue, active, event, onSched
|
|
|
289
413
|
currentState.blocked = nextState.blocked;
|
|
290
414
|
currentState.pending_queue_drained = nextState.pending_queue_drained;
|
|
291
415
|
currentState.all_slots_closed_after_drain = nextState.all_slots_closed_after_drain;
|
|
416
|
+
currentState.batch_dispatch_count = nextState.batch_dispatch_count;
|
|
417
|
+
currentState.largest_batch_size = nextState.largest_batch_size;
|
|
418
|
+
currentState.first_batch_launch_span_ms = nextState.first_batch_launch_span_ms;
|
|
419
|
+
currentState.average_batch_launch_span_ms = nextState.average_batch_launch_span_ms;
|
|
420
|
+
currentState.scheduler_utilization = nextState.scheduler_utilization;
|
|
421
|
+
currentState.active_slot_time_ms = nextState.active_slot_time_ms;
|
|
422
|
+
currentState.wall_time_ms = nextState.wall_time_ms;
|
|
292
423
|
await writeAgentWorkQueue(root, queue);
|
|
293
424
|
await writeAgentWorkerSlots(root, slots);
|
|
294
425
|
await writeJsonAtomic(path.join(root, 'agent-scheduler-state.json'), currentState);
|
|
@@ -13,7 +13,7 @@ export const DEFAULT_AGENT_CONCURRENCY = 5;
|
|
|
13
13
|
// ceiling to up to 100 concurrent clone sessions. Only the naruto path opts into this
|
|
14
14
|
// cap; every other roster/scheduler caller keeps MAX_AGENT_COUNT as the default.
|
|
15
15
|
export const MAX_NARUTO_AGENT_COUNT = 100;
|
|
16
|
-
export const DEFAULT_NARUTO_CLONES =
|
|
16
|
+
export const DEFAULT_NARUTO_CLONES = 32;
|
|
17
17
|
export const AGENT_BACKENDS = ['fake', 'process', 'codex-sdk', 'zellij', 'ollama', 'local-llm'];
|
|
18
18
|
export function normalizeAgentBackend(input) {
|
|
19
19
|
const value = String(input || 'codex-sdk');
|
|
@@ -8,8 +8,9 @@ import { closeWorkerPane, openWorkerPane } from '../zellij/zellij-worker-pane-ma
|
|
|
8
8
|
import { closeWorkerInRightColumn, recordHeadlessWorkerInRightColumn } from '../zellij/zellij-right-column-manager.js';
|
|
9
9
|
import { resolveProviderContext } from '../provider/provider-context.js';
|
|
10
10
|
import { buildZellijSlotPaneCommand } from '../zellij/zellij-slot-pane-renderer.js';
|
|
11
|
-
import {
|
|
11
|
+
import { resolveZellijWorkerPaneUiMode } from '../zellij/zellij-ui-mode.js';
|
|
12
12
|
import { appendZellijSlotTelemetry } from '../zellij/zellij-slot-telemetry.js';
|
|
13
|
+
import { appendParallelRuntimeEvent } from './parallel-runtime-proof.js';
|
|
13
14
|
export const NATIVE_CLI_SESSION_SWARM_SCHEMA = 'sks.agent-native-cli-session-swarm.v1';
|
|
14
15
|
export function createNativeCliSessionSwarmRecorder(root, input) {
|
|
15
16
|
return new NativeCliSessionSwarmRecorder(root, input);
|
|
@@ -175,6 +176,16 @@ class NativeCliSessionSwarmRecorder {
|
|
|
175
176
|
record.pid = child.pid || null;
|
|
176
177
|
record.process_id = child.pid || null;
|
|
177
178
|
record.status = 'running';
|
|
179
|
+
await appendParallelRuntimeEvent(this.root, this.input.missionId, {
|
|
180
|
+
event_type: 'worker_process_spawned',
|
|
181
|
+
slot_id: ctx.agent.slot_id || ctx.agent.id || null,
|
|
182
|
+
generation_index: ctx.agent.generation_index || null,
|
|
183
|
+
session_id: ctx.agent.session_id || null,
|
|
184
|
+
pid: child.pid || null,
|
|
185
|
+
backend: this.input.backend,
|
|
186
|
+
placement: record.worker_placement === 'headless' ? 'headless' : 'process',
|
|
187
|
+
worktree_id: worktree?.id || null
|
|
188
|
+
}).catch(() => undefined);
|
|
178
189
|
await this.telemetry(ctx, {
|
|
179
190
|
eventType: 'worker_spawned',
|
|
180
191
|
status: 'launching',
|
|
@@ -267,7 +278,8 @@ class NativeCliSessionSwarmRecorder {
|
|
|
267
278
|
route: this.input.route,
|
|
268
279
|
serviceTier: this.input.fastModePolicy.service_tier
|
|
269
280
|
});
|
|
270
|
-
const uiMode =
|
|
281
|
+
const uiMode = resolveZellijWorkerPaneUiMode(Array.isArray(input.ctx.opts.args) ? input.ctx.opts.args : [], process.env);
|
|
282
|
+
const liveWorkerPane = uiMode !== 'compact-slots';
|
|
271
283
|
const workerEnv = {
|
|
272
284
|
...(input.ctx.opts.env || {}),
|
|
273
285
|
...fastModeEnv(this.input.fastModePolicy),
|
|
@@ -289,7 +301,7 @@ class NativeCliSessionSwarmRecorder {
|
|
|
289
301
|
artifacts: [path.join(input.workerDirRel, 'worker-intake.json'), input.heartbeatRel, input.resultRel],
|
|
290
302
|
logTail: `zellij=${sessionName}`
|
|
291
303
|
});
|
|
292
|
-
const workerCommand =
|
|
304
|
+
const workerCommand = liveWorkerPane
|
|
293
305
|
? buildPaneWorkerCommand({
|
|
294
306
|
args: input.args,
|
|
295
307
|
stdoutPath: path.join(this.root, input.stdoutRel),
|
|
@@ -321,6 +333,30 @@ class NativeCliSessionSwarmRecorder {
|
|
|
321
333
|
mode: uiMode,
|
|
322
334
|
watch: true
|
|
323
335
|
});
|
|
336
|
+
const processRun = liveWorkerPane
|
|
337
|
+
? null
|
|
338
|
+
: await this.spawnCompactSlotWorkerProcess({
|
|
339
|
+
args: input.args,
|
|
340
|
+
cwd: workerCwd,
|
|
341
|
+
env: workerEnv,
|
|
342
|
+
stdoutRel: input.stdoutRel,
|
|
343
|
+
stderrRel: input.stderrRel
|
|
344
|
+
});
|
|
345
|
+
if (processRun?.pid) {
|
|
346
|
+
input.record.pid = processRun.pid;
|
|
347
|
+
input.record.process_id = processRun.pid;
|
|
348
|
+
await appendParallelRuntimeEvent(this.root, this.input.missionId, {
|
|
349
|
+
event_type: 'worker_process_spawned',
|
|
350
|
+
slot_id: slotId,
|
|
351
|
+
generation_index: Number(input.ctx.agent.generation_index || 1),
|
|
352
|
+
session_id: input.ctx.agent.session_id || null,
|
|
353
|
+
pid: processRun.pid,
|
|
354
|
+
backend: this.input.backend,
|
|
355
|
+
placement: 'zellij-pane',
|
|
356
|
+
worktree_id: worktree?.id || null
|
|
357
|
+
}).catch(() => undefined);
|
|
358
|
+
await this.record(input.record);
|
|
359
|
+
}
|
|
324
360
|
let paneRecord;
|
|
325
361
|
try {
|
|
326
362
|
paneRecord = await openWorkerPane({
|
|
@@ -367,8 +403,10 @@ class NativeCliSessionSwarmRecorder {
|
|
|
367
403
|
if (input.zellijReservation)
|
|
368
404
|
this.releaseVisibleZellijReservation(input.zellijReservation);
|
|
369
405
|
}
|
|
370
|
-
const
|
|
371
|
-
|
|
406
|
+
const zellijRequired = process.env.SKS_REQUIRE_ZELLIJ === '1';
|
|
407
|
+
const launchBlockers = zellijRequired ? paneRecord.blockers || [] : [];
|
|
408
|
+
const launchWarnings = zellijRequired ? [] : paneRecord.blockers || [];
|
|
409
|
+
input.record.command_line = ['zellij', '--session', sessionName, 'action', 'new-pane', '--direction', paneRecord.direction_applied, '--name', paneRecord.pane_name, '--', 'sh', '-lc', liveWorkerPane ? '<native-cli-worker-command>' : '<zellij-slot-pane-renderer-command>'];
|
|
372
410
|
input.record.zellij_session_name = sessionName;
|
|
373
411
|
input.record.zellij_pane_id = paneRecord.pane_id || null;
|
|
374
412
|
input.record.zellij_pane_id_source = paneRecord.pane_id_source;
|
|
@@ -382,9 +420,10 @@ class NativeCliSessionSwarmRecorder {
|
|
|
382
420
|
input.record.provider_context = paneRecord.provider_context;
|
|
383
421
|
input.record.worktree = worktree;
|
|
384
422
|
input.record.zellij_ui_mode = uiMode;
|
|
385
|
-
input.record.slot_visualization =
|
|
423
|
+
input.record.slot_visualization = liveWorkerPane ? 'worker-command-pane' : 'zellij-slot-pane-renderer';
|
|
386
424
|
input.record.status = launchBlockers.length ? 'failed' : 'running';
|
|
387
425
|
input.record.blockers = launchBlockers;
|
|
426
|
+
input.record.warnings = [...(input.record.warnings || []), ...launchWarnings];
|
|
388
427
|
await this.telemetry(input.ctx, {
|
|
389
428
|
eventType: 'worker_spawned',
|
|
390
429
|
status: launchBlockers.length ? 'failed' : 'launching',
|
|
@@ -422,27 +461,18 @@ class NativeCliSessionSwarmRecorder {
|
|
|
422
461
|
goal_mode_ref: input.ctx.agent.goal_mode_ref || null
|
|
423
462
|
});
|
|
424
463
|
}
|
|
425
|
-
const
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
stderrRel: input.stderrRel
|
|
464
|
+
const heartbeatSeen = await waitForWorkerHeartbeat(path.join(this.root, input.heartbeatRel), Number(process.env.SKS_ZELLIJ_WORKER_HEARTBEAT_TIMEOUT_MS || 5000));
|
|
465
|
+
if (heartbeatSeen) {
|
|
466
|
+
await this.telemetry(input.ctx, {
|
|
467
|
+
eventType: 'heartbeat',
|
|
468
|
+
status: 'running',
|
|
469
|
+
artifacts: [input.heartbeatRel],
|
|
470
|
+
logTail: await tailFile(path.join(this.root, input.heartbeatRel), 600)
|
|
433
471
|
});
|
|
434
|
-
if (processRun?.pid) {
|
|
435
|
-
input.record.pid = processRun.pid;
|
|
436
|
-
input.record.process_id = processRun.pid;
|
|
437
|
-
await this.record(input.record);
|
|
438
472
|
}
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
status: 'running',
|
|
443
|
-
artifacts: [input.heartbeatRel],
|
|
444
|
-
logTail: await tailFile(path.join(this.root, input.heartbeatRel), 600)
|
|
445
|
-
});
|
|
473
|
+
else {
|
|
474
|
+
input.record.warnings = [...(input.record.warnings || []), 'zellij_worker_heartbeat_missing_launch_warning'];
|
|
475
|
+
}
|
|
446
476
|
await appendJsonl(path.join(this.root, input.workerDirRel, 'zellij-worker-pane-events.jsonl'), {
|
|
447
477
|
schema: 'sks.zellij-worker-pane-event.v1',
|
|
448
478
|
ts: nowIso(),
|
|
@@ -505,8 +535,10 @@ class NativeCliSessionSwarmRecorder {
|
|
|
505
535
|
const heartbeatOk = await hasHeartbeat(path.join(this.root, input.heartbeatRel));
|
|
506
536
|
input.record.blockers = [
|
|
507
537
|
...(parsed ? parsed.blockers || [] : ['zellij_worker_result_timeout']),
|
|
508
|
-
...(heartbeatOk ? [] : [
|
|
538
|
+
...(heartbeatOk ? [] : [])
|
|
509
539
|
];
|
|
540
|
+
if (!heartbeatOk)
|
|
541
|
+
input.record.warnings = [...(input.record.warnings || []), 'zellij_worker_heartbeat_missing'];
|
|
510
542
|
paneRecord = await closeWorkerPane({
|
|
511
543
|
root: this.root,
|
|
512
544
|
paneRecord,
|
|
@@ -593,6 +625,24 @@ class NativeCliSessionSwarmRecorder {
|
|
|
593
625
|
log_tail: input.logTail || '',
|
|
594
626
|
blockers: input.blockers || []
|
|
595
627
|
}).catch(() => undefined);
|
|
628
|
+
const parallelEvent = mapTelemetryToParallelEvent(input.eventType);
|
|
629
|
+
if (parallelEvent) {
|
|
630
|
+
await appendParallelRuntimeEvent(this.root, this.input.missionId, {
|
|
631
|
+
event_type: parallelEvent,
|
|
632
|
+
slot_id: String(ctx.agent?.slot_id || ctx.agent?.id || 'slot-001'),
|
|
633
|
+
generation_index: Number(ctx.agent?.generation_index || 1),
|
|
634
|
+
session_id: ctx.agent?.session_id == null ? null : String(ctx.agent.session_id),
|
|
635
|
+
pid: null,
|
|
636
|
+
backend: this.input.backend,
|
|
637
|
+
placement: normalizeParallelPlacement(ctx.opts?.workerPlacement || this.input.workerPlacement || (input.status === 'headless' ? 'headless' : 'unknown')),
|
|
638
|
+
worktree_id: ctx.agent?.worktree?.id || ctx.slice?.worktree?.id || null,
|
|
639
|
+
meta: {
|
|
640
|
+
status: input.status,
|
|
641
|
+
artifacts: input.artifacts || [],
|
|
642
|
+
blockers: input.blockers || []
|
|
643
|
+
}
|
|
644
|
+
}).catch(() => undefined);
|
|
645
|
+
}
|
|
596
646
|
}
|
|
597
647
|
async persist() {
|
|
598
648
|
this.writeLock = this.writeLock.catch(() => undefined).then(async () => {
|
|
@@ -692,7 +742,10 @@ export function buildPaneWorkerCommand(input) {
|
|
|
692
742
|
const holdMs = Math.max(0, Number(process.env.SKS_ZELLIJ_WORKER_PANE_HOLD_MS || 1500));
|
|
693
743
|
const hold = holdMs > 0 ? `sleep ${shellQuote(String(Math.min(30, holdMs / 1000)))}` : ':';
|
|
694
744
|
const header = input.header ? `printf '%s\\n' ${shellQuote(input.header)} | tee -a ${shellQuote(input.stdoutPath)};` : '';
|
|
695
|
-
|
|
745
|
+
const exitPath = `${input.heartbeatPath}.exit`;
|
|
746
|
+
const visibleCommand = `(${command}; printf '%s' "$?" > ${shellQuote(exitPath)}) 2>&1 | tee -a ${shellQuote(input.stdoutPath)}`;
|
|
747
|
+
const readExit = `code=$(cat ${shellQuote(exitPath)} 2>/dev/null || printf '1'); rm -f ${shellQuote(exitPath)}`;
|
|
748
|
+
return `${envPrefix.join(' ')} ${header} ${visibleCommand}; ${readExit}; ${heartbeat}; ${hold}; exit $code`.trim();
|
|
696
749
|
}
|
|
697
750
|
function buildPaneWorkerHeader(input) {
|
|
698
751
|
return [
|
|
@@ -765,6 +818,23 @@ function firstString(values) {
|
|
|
765
818
|
}
|
|
766
819
|
return null;
|
|
767
820
|
}
|
|
821
|
+
function mapTelemetryToParallelEvent(eventType) {
|
|
822
|
+
if (eventType === 'slot_reserved')
|
|
823
|
+
return 'slot_reserved';
|
|
824
|
+
if (eventType === 'heartbeat')
|
|
825
|
+
return 'worker_heartbeat_seen';
|
|
826
|
+
if (eventType === 'worker_completed')
|
|
827
|
+
return 'worker_completed';
|
|
828
|
+
if (eventType === 'worker_failed')
|
|
829
|
+
return 'worker_failed';
|
|
830
|
+
return null;
|
|
831
|
+
}
|
|
832
|
+
function normalizeParallelPlacement(value) {
|
|
833
|
+
const text = String(value || '');
|
|
834
|
+
if (text === 'zellij-pane' || text === 'process' || text === 'headless')
|
|
835
|
+
return text;
|
|
836
|
+
return 'unknown';
|
|
837
|
+
}
|
|
768
838
|
async function tailFile(file, max) {
|
|
769
839
|
try {
|
|
770
840
|
const text = await fs.promises.readFile(file, 'utf8');
|