sneakoscope 2.0.15 → 2.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/crates/sks-core/Cargo.lock +1 -1
- package/crates/sks-core/Cargo.toml +1 -1
- package/crates/sks-core/src/main.rs +1 -1
- package/dist/.sks-build-stamp.json +4 -4
- package/dist/bin/sks.js +1 -1
- package/dist/cli/command-registry.js +1 -1
- package/dist/core/agents/agent-orchestrator.js +66 -3
- package/dist/core/agents/agent-scheduler.js +204 -86
- package/dist/core/agents/agent-schema.js +1 -1
- package/dist/core/agents/native-cli-session-swarm.js +87 -21
- package/dist/core/agents/parallel-runtime-proof.js +217 -0
- package/dist/core/codex-control/codex-task-runner.js +32 -4
- package/dist/core/codex-control/model-call-concurrency.js +106 -0
- package/dist/core/commands/naruto-command.js +48 -5
- package/dist/core/commands/team-command.js +0 -176
- package/dist/core/db-safety.js +34 -6
- package/dist/core/fsx.js +1 -1
- package/dist/core/git/git-worktree-capability.js +18 -0
- package/dist/core/git/git-worktree-manager.js +80 -0
- package/dist/core/git/git-worktree-pool.js +4 -0
- package/dist/core/mad-db/mad-db-capability.js +33 -1
- package/dist/core/mad-db/mad-db-ledger.js +14 -0
- package/dist/core/mad-db/mad-db-policy-resolver.js +2 -0
- package/dist/core/naruto/naruto-concurrency-governor.js +14 -1
- package/dist/core/version.js +1 -1
- package/dist/core/zellij/zellij-slot-telemetry.js +56 -1
- package/dist/scripts/release-speed-summary.js +2 -0
- package/package.json +25 -1
- package/schemas/agents/parallel-runtime-proof.schema.json +48 -0
package/README.md
CHANGED
|
@@ -16,7 +16,7 @@ Set up this agent project with Sneakoscope Codex. Use [[mandarange/Sneakoscope-C
|
|
|
16
16
|
|
|
17
17
|
## Current Release
|
|
18
18
|
|
|
19
|
-
SKS **2.0.
|
|
19
|
+
SKS **2.0.16** is the real parallelism closure release. It proves Naruto/agent runtime concurrency with PID, launch overlap, wall-clock speedup, active/headless worker, model-call, worktree allocation, and incremental Zellij telemetry evidence while keeping Naruto as the execution SSOT.
|
|
20
20
|
|
|
21
21
|
What changed:
|
|
22
22
|
|
|
@@ -4,7 +4,7 @@ use std::io::{self, Read, Seek, SeekFrom};
|
|
|
4
4
|
fn main() {
|
|
5
5
|
let mut args = std::env::args().skip(1);
|
|
6
6
|
match args.next().as_deref() {
|
|
7
|
-
Some("--version") => println!("sks-rs 2.0.
|
|
7
|
+
Some("--version") => println!("sks-rs 2.0.16"),
|
|
8
8
|
Some("compact-info") => {
|
|
9
9
|
let mut input = String::new();
|
|
10
10
|
let _ = io::stdin().read_to_string(&mut input);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema": "sks.dist-build-stamp.v1",
|
|
3
3
|
"package_name": "sneakoscope",
|
|
4
|
-
"package_version": "2.0.
|
|
5
|
-
"source_digest": "
|
|
6
|
-
"source_file_count":
|
|
7
|
-
"built_at_source_time":
|
|
4
|
+
"package_version": "2.0.16",
|
|
5
|
+
"source_digest": "cafc32cad87d3b6c7aeb0ec0e8e56258f830b35e71fd919440a8c1b95b78432a",
|
|
6
|
+
"source_file_count": 2187,
|
|
7
|
+
"built_at_source_time": 1780927200051
|
|
8
8
|
}
|
package/dist/bin/sks.js
CHANGED
|
@@ -119,7 +119,7 @@ export const COMMANDS = {
|
|
|
119
119
|
commit: entry('stable', 'Create a simple git commit', 'dist/commands/commit.js', directCommand(() => import('../commands/commit.js'), 'dist/commands/commit.js')),
|
|
120
120
|
'commit-and-push': entry('stable', 'Create a simple git commit and push', 'dist/commands/commit-and-push.js', directCommand(() => import('../commands/commit-and-push.js'), 'dist/commands/commit-and-push.js')),
|
|
121
121
|
dfix: entry('stable', 'Run DFix diagnose/plan/patch/verify loop', 'dist/core/commands/dfix-command.js', commandArgsCommand(() => import('../core/commands/dfix-command.js'), 'dfixCommand', 'dist/core/commands/dfix-command.js')),
|
|
122
|
-
team: entry('beta', '
|
|
122
|
+
team: entry('beta', 'Deprecated Team alias; create redirects to Naruto, observe legacy Team missions', 'dist/core/commands/team-command.js', argsCommand(() => import('../core/commands/team-command.js'), 'team', 'dist/core/commands/team-command.js')),
|
|
123
123
|
agent: entry('beta', 'Run native multi-session agent missions', 'dist/core/commands/agent-command.js', argsCommand(() => import('../core/commands/agent-command.js'), 'agentCommand', 'dist/core/commands/agent-command.js')),
|
|
124
124
|
'with-local-llm': entry('beta', 'Enable or inspect local Ollama worker backend', 'dist/core/commands/local-model-command.js', argsCommand(() => import('../core/commands/local-model-command.js'), 'localModelCommand', 'dist/core/commands/local-model-command.js')),
|
|
125
125
|
naruto: entry('labs', 'Run $Naruto shadow-clone swarm (up to 100 parallel sessions)', 'dist/core/commands/naruto-command.js', argsCommand(() => import('../core/commands/naruto-command.js'), 'narutoCommand', 'dist/core/commands/naruto-command.js')),
|
|
@@ -54,7 +54,7 @@ import { CODEX_AGENT_WORKER_RESULT_SCHEMA_ID, codexAgentWorkerResultSchema } fro
|
|
|
54
54
|
import { resolveLocalCollaborationPolicy, localCollaborationParticipated } from '../local-llm/local-collaboration-policy.js';
|
|
55
55
|
import { runFinalGptReviewStage } from '../pipeline/final-gpt-review-stage.js';
|
|
56
56
|
import { selectFinalGptPatchSource } from '../pipeline/final-gpt-patch-stage.js';
|
|
57
|
-
import { allocateWorkerWorktree } from '../git/git-worktree-manager.js';
|
|
57
|
+
import { allocateWorkerWorktree, allocateWorkerWorktreesBatch } from '../git/git-worktree-manager.js';
|
|
58
58
|
import { exportGitWorktreeDiff } from '../git/git-worktree-diff.js';
|
|
59
59
|
import { buildGitWorktreePatchEnvelope } from '../git/git-worktree-patch-envelope.js';
|
|
60
60
|
import { checkpointWorkerWorktree } from '../git/git-worktree-checkpoint.js';
|
|
@@ -63,6 +63,7 @@ import { createGitIntegrationWorktree } from '../git/git-integration-worktree.js
|
|
|
63
63
|
import { applyGitWorktreeMergeQueue } from '../git/git-worktree-merge-queue.js';
|
|
64
64
|
import { crossRebaseIdleWorktrees } from '../git/git-worktree-cross-rebase.js';
|
|
65
65
|
import { gitOutputLine, runGitCommand } from '../git/git-worktree-runner.js';
|
|
66
|
+
import { writeParallelRuntimeProof } from './parallel-runtime-proof.js';
|
|
66
67
|
export async function runNativeAgentOrchestrator(opts = {}) {
|
|
67
68
|
const root = path.resolve(opts.root || process.cwd());
|
|
68
69
|
const prompt = String(opts.prompt || 'Native agent run');
|
|
@@ -289,9 +290,46 @@ export async function runNativeAgentOrchestrator(opts = {}) {
|
|
|
289
290
|
diffs: [],
|
|
290
291
|
checkpoints: [],
|
|
291
292
|
cleanup: [],
|
|
293
|
+
prewarmed_allocations: [],
|
|
292
294
|
blockers: []
|
|
293
295
|
};
|
|
294
296
|
await writeJsonAtomic(path.join(ledgerRoot, 'agent-git-worktree-runtime.json'), gitWorktreeRuntime);
|
|
297
|
+
const preparedWorktreeAllocations = new Map();
|
|
298
|
+
if (gitWorktreePolicy?.mode === 'git-worktree') {
|
|
299
|
+
const writeSlices = uniqueWritableSlicesForWorktrees(partition.slices, Math.max(1, targetActiveSlots));
|
|
300
|
+
if (writeSlices.length) {
|
|
301
|
+
const prewarmed = await allocateWorkerWorktreesBatch({
|
|
302
|
+
root: gitWorktreePolicy.main_repo_root || root,
|
|
303
|
+
missionId,
|
|
304
|
+
workers: writeSlices.map((slice, index) => ({
|
|
305
|
+
workerId: String(slice.owner_agent_id || slice.owner || `worker-${index + 1}`),
|
|
306
|
+
slotId: String(slice.owner_agent_id || slice.owner || `slot-${index + 1}`),
|
|
307
|
+
generationIndex: 1
|
|
308
|
+
})),
|
|
309
|
+
maxParallel: Math.min(targetActiveSlots, Number(process.env.SKS_NARUTO_GIT_WORKTREE_CAP || targetActiveSlots))
|
|
310
|
+
}).catch((err) => {
|
|
311
|
+
gitWorktreeRuntime.blockers.push('git_worktree_batch_prewarm_failed:' + (err instanceof Error ? err.message : String(err)));
|
|
312
|
+
gitWorktreeRuntime.ok = false;
|
|
313
|
+
return [];
|
|
314
|
+
});
|
|
315
|
+
gitWorktreeRuntime.prewarmed_allocations = prewarmed.map((allocation) => ({
|
|
316
|
+
worker_id: allocation.worker_id,
|
|
317
|
+
slot_id: allocation.slot_id,
|
|
318
|
+
ok: allocation.ok,
|
|
319
|
+
worktree_path: allocation.worktree_path,
|
|
320
|
+
branch: allocation.branch,
|
|
321
|
+
blockers: allocation.blockers
|
|
322
|
+
}));
|
|
323
|
+
for (const allocation of prewarmed) {
|
|
324
|
+
if (allocation.ok)
|
|
325
|
+
preparedWorktreeAllocations.set(String(allocation.worker_id), allocation);
|
|
326
|
+
else
|
|
327
|
+
gitWorktreeRuntime.blockers.push(...allocation.blockers);
|
|
328
|
+
}
|
|
329
|
+
gitWorktreeRuntime.ok = gitWorktreeRuntime.blockers.length === 0;
|
|
330
|
+
await writeJsonAtomic(path.join(ledgerRoot, 'agent-git-worktree-runtime.json'), gitWorktreeRuntime);
|
|
331
|
+
}
|
|
332
|
+
}
|
|
295
333
|
const nativeCliSwarm = createNativeCliSessionSwarmRecorder(ledgerRoot, {
|
|
296
334
|
missionId,
|
|
297
335
|
requestedAgents: Number(opts.agents || roster.agent_count || targetActiveSlots),
|
|
@@ -329,7 +367,8 @@ export async function runNativeAgentOrchestrator(opts = {}) {
|
|
|
329
367
|
agent,
|
|
330
368
|
slice,
|
|
331
369
|
policy: gitWorktreePolicy,
|
|
332
|
-
runtime: gitWorktreeRuntime
|
|
370
|
+
runtime: gitWorktreeRuntime,
|
|
371
|
+
preparedAllocation: preparedWorktreeAllocations.get(String(agent.id || '')) || null
|
|
333
372
|
});
|
|
334
373
|
const runtimeAgent = workerWorktree ? { ...agent, worktree: workerWorktree.context } : agent;
|
|
335
374
|
const runtimeSlice = workerWorktree ? { ...slice, worktree: workerWorktree.context } : slice;
|
|
@@ -432,6 +471,13 @@ export async function runNativeAgentOrchestrator(opts = {}) {
|
|
|
432
471
|
}
|
|
433
472
|
});
|
|
434
473
|
await nativeCliSwarm.finalize();
|
|
474
|
+
const parallelRuntimeProof = await writeParallelRuntimeProof(ledgerRoot, missionId, {
|
|
475
|
+
requestedWorkers: Number(opts.agents || roster.agent_count || targetActiveSlots),
|
|
476
|
+
targetActiveSlots,
|
|
477
|
+
visiblePanes: visualLaneCount,
|
|
478
|
+
expectedWorkerRuntimeMs: targetActiveSlots >= 10 ? 8000 : targetActiveSlots >= 2 ? 2000 : 25,
|
|
479
|
+
minActiveWorkers: Math.min(targetActiveSlots, desiredWorkItemCount)
|
|
480
|
+
});
|
|
435
481
|
const results = scheduler.results;
|
|
436
482
|
const nativeCliSessionProof = await writeNativeCliSessionProof(ledgerRoot, {
|
|
437
483
|
requestedAgents: Number(opts.agents || roster.agent_count || targetActiveSlots),
|
|
@@ -600,6 +646,7 @@ export async function runNativeAgentOrchestrator(opts = {}) {
|
|
|
600
646
|
gpt_final_arbiter: gptFinalArbiter,
|
|
601
647
|
final_gpt_patch_stage: finalGptPatchStage,
|
|
602
648
|
patch_swarm: patchSwarm,
|
|
649
|
+
parallel_runtime_proof: parallelRuntimeProof,
|
|
603
650
|
proof
|
|
604
651
|
};
|
|
605
652
|
}
|
|
@@ -620,6 +667,22 @@ function withFinalGptPatchEnvelopes(results, patchEnvelopes = []) {
|
|
|
620
667
|
next[0] = { ...next[0], patch_envelopes: patchEnvelopes };
|
|
621
668
|
return next;
|
|
622
669
|
}
|
|
670
|
+
/**
 * Select the write-capable slices that should receive a pre-warmed git
 * worktree, keeping at most one slice per owner.
 *
 * A slice qualifies when its `write_paths` is a non-empty array. The owner key
 * is the first truthy value among `owner_agent_id`, `owner` and `id`
 * (stringified). Slices with no owner key, or whose owner was already
 * selected, are skipped.
 *
 * @param {Array<object>} [slices=[]] Candidate slices; a non-array yields `[]`.
 * @param {number} [limit] Maximum number of slices to return. Values below 1
 *   are raised to 1; a missing or non-numeric limit leaves the result uncapped.
 * @returns {Array<object>} The selected slices (same object references), in
 *   input order.
 */
function uniqueWritableSlicesForWorktrees(slices = [], limit) {
    if (!Array.isArray(slices)) {
        return [];
    }
    // Math.max(1, undefined) is NaN, and `length >= NaN` is always false, so a
    // missing limit has always meant "no cap". Spell that out once, outside the
    // loop, instead of re-deriving it on every iteration.
    const requested = Math.max(1, limit);
    const cap = Number.isNaN(requested) ? Infinity : requested;
    const selected = [];
    const seenOwners = new Set();
    for (const slice of slices) {
        const writable = Array.isArray(slice?.write_paths) && slice.write_paths.length > 0;
        if (!writable) {
            continue;
        }
        const owner = String(slice.owner_agent_id || slice.owner || slice.id || '');
        if (owner === '' || seenOwners.has(owner)) {
            continue;
        }
        seenOwners.add(owner);
        selected.push(slice);
        if (selected.length >= cap) {
            break;
        }
    }
    return selected;
}
|
|
623
686
|
function applyNarutoWorkGraphToPartition(partition, graph, roster, targetActiveSlots, parentPrompt = '') {
|
|
624
687
|
const activeRoster = (Array.isArray(roster?.roster) ? roster.roster : []).slice(0, Math.max(1, targetActiveSlots));
|
|
625
688
|
const activeAgentIds = new Set(activeRoster.map((row) => String(row.id || '')).filter(Boolean));
|
|
@@ -873,7 +936,7 @@ async function prepareWorkerGitWorktree(input) {
|
|
|
873
936
|
if (!sliceHasWritePaths && !agentWriteCapable)
|
|
874
937
|
return null;
|
|
875
938
|
const generationIndex = Math.max(1, Math.floor(Number(input.agent.generation_index || 1)));
|
|
876
|
-
const allocation = await allocateWorkerWorktree({
|
|
939
|
+
const allocation = input.preparedAllocation || await allocateWorkerWorktree({
|
|
877
940
|
repoRoot: input.policy.main_repo_root || input.root,
|
|
878
941
|
missionId: input.missionId,
|
|
879
942
|
workerId: String(input.agent.id || input.slice.id || 'worker'),
|
|
@@ -4,6 +4,7 @@ import { MAX_AGENT_COUNT } from './agent-schema.js';
|
|
|
4
4
|
import { appendAgentWorkQueueEvent, completeWorkItem, createAgentWorkQueue, enqueueFollowUpWorkItems, leaseNextWorkItem, pendingWorkItems, writeAgentWorkQueue } from './agent-work-queue.js';
|
|
5
5
|
import { closeWorkerSlotsAfterDrain, createAgentWorkerSlots, markWorkerSlotGenerationClosed, openWorkerSlotGeneration, writeAgentWorkerSlots } from './agent-worker-slot.js';
|
|
6
6
|
import { closeAgentSessionGeneration, createAgentSessionGeneration, writeAgentSessionGeneration } from './agent-session-generation.js';
|
|
7
|
+
import { appendParallelRuntimeEvent } from './parallel-runtime-proof.js';
|
|
7
8
|
export const AGENT_SCHEDULER_SCHEMA = 'sks.agent-scheduler.v1';
|
|
8
9
|
export const AGENT_SCHEDULER_EVENT_SCHEMA = 'sks.agent-scheduler-event.v1';
|
|
9
10
|
export async function runAgentScheduler(input) {
|
|
@@ -19,6 +20,10 @@ export async function runAgentScheduler(input) {
|
|
|
19
20
|
});
|
|
20
21
|
const active = new Map();
|
|
21
22
|
const results = [];
|
|
23
|
+
const schedulerStartedAt = Date.now();
|
|
24
|
+
let batchCounter = 0;
|
|
25
|
+
let batchLaunchSpanTotalMs = 0;
|
|
26
|
+
let batchDispatchInProgress = false;
|
|
22
27
|
let state = buildState(input.missionId, targetActiveSlots, queue, slots, active, {
|
|
23
28
|
status: 'initializing',
|
|
24
29
|
refillDelayMs: input.refillDelayMs || 0,
|
|
@@ -27,7 +32,7 @@ export async function runAgentScheduler(input) {
|
|
|
27
32
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_initialized' }, input.onSchedulerEvent);
|
|
28
33
|
await refillSlots(null);
|
|
29
34
|
while (active.size > 0 || pendingWorkItems(queue).length > 0) {
|
|
30
|
-
if (active.size === 0 && pendingWorkItems(queue).length > 0) {
|
|
35
|
+
if (!batchDispatchInProgress && active.size === 0 && pendingWorkItems(queue).length > 0) {
|
|
31
36
|
state.blockers.push('scheduler_pending_queue_without_active_sessions');
|
|
32
37
|
state.status = 'blocked';
|
|
33
38
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_blocked', pending_count: pendingWorkItems(queue).length }, input.onSchedulerEvent);
|
|
@@ -94,6 +99,7 @@ export async function runAgentScheduler(input) {
|
|
|
94
99
|
state.all_generations_closed = true;
|
|
95
100
|
if (!state.pending_queue_drained)
|
|
96
101
|
state.blockers.push('scheduler_pending_queue_not_drained');
|
|
102
|
+
updateUtilizationMetrics();
|
|
97
103
|
await writeAll(input.root, state, slots, queue, active, { event_type: 'scheduler_drained' }, input.onSchedulerEvent);
|
|
98
104
|
return {
|
|
99
105
|
schema: 'sks.agent-scheduler-result.v1',
|
|
@@ -105,9 +111,179 @@ export async function runAgentScheduler(input) {
|
|
|
105
111
|
};
|
|
106
112
|
/**
 * Dispatch one batch of pending work items into idle worker slots.
 *
 * The batch comes from `collectLaunchBatch()`; when it is empty the function
 * returns right after marking the scheduler as running. Otherwise it, in order:
 *   1. installs the opened slots and persists every session generation,
 *   2. records `batch_dispatch_started` (scheduler state + runtime event log),
 *   3. for each launch, records `slot_reserved` / `worker_launch_invoked`, then
 *      starts `input.launchSession(...)` and registers its promise in `active`,
 *   4. appends the work-queue dispatch events,
 *   5. updates backfill latency or queues `session_launched` events,
 *   6. updates batch span / utilization metrics and records
 *      `batch_dispatch_completed`.
 * Queued scheduler events are appended to `agent-scheduler-events.jsonl` after
 * the `batchDispatchInProgress` flag has been cleared.
 *
 * @param {object|null} backfill Details of the session whose close triggered
 *   this refill (`closed_session_id`, `closed_at_ms`, `active_count_before`
 *   are read), or a falsy value for an ordinary fill.
 * @returns {Promise<void>}
 */
async function refillSlots(backfill) {
    state.status = 'running';
    const launches = collectLaunchBatch();
    if (launches.length === 0) {
        return;
    }
    batchDispatchInProgress = true;
    const batchId = `batch-${Date.now().toString(36)}-${batchCounter++}`;
    const batchStart = Date.now();
    const launchEvents = [];
    const sessionIds = () => launches.map(({ generation }) => generation.session_id);
    const errorText = (err) => (err instanceof Error ? err.message : String(err));
    // Runtime-proof telemetry is best-effort: a failed append must never abort
    // the dispatch, so rejections are deliberately discarded here.
    const recordRuntimeEvent = (eventType, launch, meta) => appendParallelRuntimeEvent(input.root, input.missionId, {
        event_type: eventType,
        slot_id: launch ? launch.slot.slot_id : null,
        generation_index: launch ? launch.generation.generation_index : null,
        session_id: launch ? launch.generation.session_id : null,
        pid: null,
        backend: 'scheduler',
        placement: 'unknown',
        batch_id: batchId,
        meta
    }).catch(() => undefined);
    try {
        launches.forEach((launch) => {
            slots[launch.slotIndex] = launch.openedSlot;
        });
        await Promise.all(launches.map(({ generation }) => writeAgentSessionGeneration(input.root, generation)));
        await writeAll(input.root, state, slots, queue, active, {
            event_type: 'batch_dispatch_started',
            batch_id: batchId,
            launch_count: launches.length,
            session_ids: sessionIds()
        }, input.onSchedulerEvent);
        await recordRuntimeEvent('batch_dispatch_started', null, { launch_count: launches.length, active_count_before: active.size });
        for (const launch of launches) {
            const { slot, openedSlot, generation, agent, workItem } = launch;
            await recordRuntimeEvent('slot_reserved', launch, { work_item_id: workItem.id });
            await recordRuntimeEvent('worker_launch_invoked', launch, { work_item_id: workItem.id });
            // Evaluated lazily, inside the promise chain, exactly where the
            // outcome object is built.
            const identity = () => ({
                session_id: generation.session_id,
                slot_id: slot.slot_id,
                generation_index: generation.generation_index
            });
            const closeReportPath = () => path.join(generation.artifact_dir, 'agent-terminal-close-report.json');
            // A launch failure is converted into a synthetic failed result so the
            // promise stored in `active` always fulfills.
            const failedResult = (err) => ({
                schema: 'sks.agent-result.v1',
                mission_id: input.missionId,
                agent_id: agent.id,
                session_id: generation.session_id,
                persona_id: agent.persona_id,
                task_slice_id: workItem.id,
                status: 'failed',
                backend: 'fake',
                summary: errorText(err),
                findings: [],
                proposed_changes: [],
                changed_files: [],
                lease_compliance: { ok: true, violations: [] },
                artifacts: [],
                blockers: ['scheduler_launch_failed'],
                confidence: 'failed',
                handoff_notes: '',
                unverified: [],
                writes: [],
                recursion_guard: { ok: true, violations: [] },
                verification: { status: 'failed', checks: [] },
                source_intelligence_refs: input.sourceIntelligenceRefs || null,
                goal_mode_ref: input.goalModeRef || null
            });
            const promise = Promise.resolve()
                .then(() => input.launchSession({ agent, workItem, generation, slot: openedSlot, queue, state }))
                .then((result) => ({
                    result,
                    ...identity(),
                    terminal_close_report_path: closeReportPath()
                }))
                .catch((err) => ({
                    result: failedResult(err),
                    ...identity(),
                    error: errorText(err),
                    terminal_close_report_path: closeReportPath()
                }));
            active.set(generation.session_id, {
                slot_id: slot.slot_id,
                work_item_id: workItem.id,
                session_id: generation.session_id,
                promise
            });
        }
        await appendAgentWorkQueueEvent(input.root, 'batch_work_items_dispatched', {
            batch_id: batchId,
            launch_count: launches.length,
            session_ids: sessionIds(),
            work_item_ids: launches.map(({ workItem }) => workItem.id)
        });
        for (const { workItem, generation, slot } of launches) {
            await appendAgentWorkQueueEvent(input.root, 'work_item_dispatched', { work_item_id: workItem.id, session_id: generation.session_id, slot_id: slot.slot_id });
        }
        if (backfill) {
            // One backfill event per batch, attributed to the first launch.
            const [firstLaunch] = launches;
            const refillLatencyMs = Math.max(0, Date.now() - backfill.closed_at_ms);
            state.backfill_count += 1;
            state.refill_latency_events_ms.push(refillLatencyMs);
            state.refill_latency_p95_ms = percentile95(state.refill_latency_events_ms);
            launchEvents.push({
                event_type: 'backfill_event',
                closed_session_id: backfill.closed_session_id,
                new_session_id: firstLaunch?.generation.session_id || null,
                slot_id: firstLaunch?.slot.slot_id || null,
                batch_id: batchId,
                launch_count: launches.length,
                active_count_before: backfill.active_count_before,
                active_count_after: active.size,
                refill_latency_ms: refillLatencyMs
            });
        }
        else {
            launchEvents.push(...launches.map(({ generation, slot, workItem }) => ({
                event_type: 'session_launched',
                session_id: generation.session_id,
                slot_id: slot.slot_id,
                work_item_id: workItem.id,
                active_count_after: active.size
            })));
        }
        if (input.refillDelayMs > 0) {
            await delay(input.refillDelayMs);
        }
        // The span is measured after the optional refill delay, so it includes it.
        const launchSpanMs = Math.max(0, Date.now() - batchStart);
        batchLaunchSpanTotalMs += launchSpanMs;
        state.batch_dispatch_count += 1;
        state.largest_batch_size = Math.max(state.largest_batch_size, launches.length);
        if (state.first_batch_launch_span_ms === 0) {
            state.first_batch_launch_span_ms = launchSpanMs;
        }
        state.average_batch_launch_span_ms = Math.round(batchLaunchSpanTotalMs / Math.max(1, state.batch_dispatch_count));
        updateUtilizationMetrics();
        await recordRuntimeEvent('batch_dispatch_completed', null, { launch_count: launches.length, launch_span_ms: launchSpanMs, active_count_after: active.size });
        await writeAll(input.root, state, slots, queue, active, {
            event_type: 'batch_dispatch_completed',
            batch_id: batchId,
            launch_count: launches.length,
            launch_span_ms: launchSpanMs,
            active_count_after: active.size,
            session_ids: sessionIds()
        }, input.onSchedulerEvent);
    }
    finally {
        // Always clear the flag, even when a write inside the batch throws.
        batchDispatchInProgress = false;
    }
    for (const event of launchEvents) {
        await appendJsonl(path.join(input.root, 'agent-scheduler-events.jsonl'), { schema: AGENT_SCHEDULER_EVENT_SCHEMA, ts: nowIso(), ...event });
    }
}
|
|
282
|
+
function collectLaunchBatch() {
|
|
283
|
+
const launches = [];
|
|
284
|
+
const reservedSlots = new Set();
|
|
285
|
+
while (active.size + launches.length < targetActiveSlots && pendingWorkItems(queue).length > 0) {
|
|
286
|
+
const slotIndex = slots.findIndex((slot, index) => slot.status === 'idle' && !reservedSlots.has(index));
|
|
111
287
|
if (slotIndex < 0)
|
|
112
288
|
break;
|
|
113
289
|
const slot = slots[slotIndex];
|
|
@@ -133,90 +309,18 @@ export async function runAgentScheduler(input) {
|
|
|
133
309
|
goalModeRef: workItem.goal_mode_ref
|
|
134
310
|
});
|
|
135
311
|
workItem.running_session_id = generation.session_id;
|
|
136
|
-
await writeAgentSessionGeneration(input.root, generation);
|
|
137
|
-
const agent = buildAgentForGeneration(slot, generation, workItem);
|
|
138
312
|
const openedSlot = openWorkerSlotGeneration(slot, generation);
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
session_id: generation.session_id,
|
|
143
|
-
slot_id: slot.slot_id,
|
|
144
|
-
generation_index: generation.generation_index,
|
|
145
|
-
work_item_id: workItem.id
|
|
146
|
-
}, input.onSchedulerEvent);
|
|
147
|
-
const promise = Promise.resolve()
|
|
148
|
-
.then(() => input.launchSession({ agent, workItem, generation, slot: openedSlot, queue, state }))
|
|
149
|
-
.then((result) => ({
|
|
150
|
-
result,
|
|
151
|
-
session_id: generation.session_id,
|
|
152
|
-
slot_id: slot.slot_id,
|
|
153
|
-
generation_index: generation.generation_index,
|
|
154
|
-
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
155
|
-
}))
|
|
156
|
-
.catch((err) => ({
|
|
157
|
-
result: {
|
|
158
|
-
schema: 'sks.agent-result.v1',
|
|
159
|
-
mission_id: input.missionId,
|
|
160
|
-
agent_id: agent.id,
|
|
161
|
-
session_id: generation.session_id,
|
|
162
|
-
persona_id: agent.persona_id,
|
|
163
|
-
task_slice_id: workItem.id,
|
|
164
|
-
status: 'failed',
|
|
165
|
-
backend: 'fake',
|
|
166
|
-
summary: err instanceof Error ? err.message : String(err),
|
|
167
|
-
findings: [],
|
|
168
|
-
proposed_changes: [],
|
|
169
|
-
changed_files: [],
|
|
170
|
-
lease_compliance: { ok: true, violations: [] },
|
|
171
|
-
artifacts: [],
|
|
172
|
-
blockers: ['scheduler_launch_failed'],
|
|
173
|
-
confidence: 'failed',
|
|
174
|
-
handoff_notes: '',
|
|
175
|
-
unverified: [],
|
|
176
|
-
writes: [],
|
|
177
|
-
recursion_guard: { ok: true, violations: [] },
|
|
178
|
-
verification: { status: 'failed', checks: [] },
|
|
179
|
-
source_intelligence_refs: input.sourceIntelligenceRefs || null,
|
|
180
|
-
goal_mode_ref: input.goalModeRef || null
|
|
181
|
-
},
|
|
182
|
-
session_id: generation.session_id,
|
|
183
|
-
slot_id: slot.slot_id,
|
|
184
|
-
generation_index: generation.generation_index,
|
|
185
|
-
error: err instanceof Error ? err.message : String(err),
|
|
186
|
-
terminal_close_report_path: path.join(generation.artifact_dir, 'agent-terminal-close-report.json')
|
|
187
|
-
}));
|
|
188
|
-
active.set(generation.session_id, { slot_id: slot.slot_id, work_item_id: workItem.id, session_id: generation.session_id, promise });
|
|
189
|
-
await appendAgentWorkQueueEvent(input.root, 'work_item_dispatched', { work_item_id: workItem.id, session_id: generation.session_id, slot_id: slot.slot_id });
|
|
190
|
-
if (backfill) {
|
|
191
|
-
const refillLatencyMs = Math.max(0, Date.now() - backfill.closed_at_ms);
|
|
192
|
-
state.backfill_count += 1;
|
|
193
|
-
state.refill_latency_events_ms.push(refillLatencyMs);
|
|
194
|
-
state.refill_latency_p95_ms = percentile95(state.refill_latency_events_ms);
|
|
195
|
-
launchEvents.push({
|
|
196
|
-
event_type: 'backfill_event',
|
|
197
|
-
closed_session_id: backfill.closed_session_id,
|
|
198
|
-
new_session_id: generation.session_id,
|
|
199
|
-
slot_id: slot.slot_id,
|
|
200
|
-
active_count_before: backfill.active_count_before,
|
|
201
|
-
active_count_after: active.size,
|
|
202
|
-
refill_latency_ms: refillLatencyMs
|
|
203
|
-
});
|
|
204
|
-
backfill = null;
|
|
205
|
-
}
|
|
206
|
-
else {
|
|
207
|
-
launchEvents.push({
|
|
208
|
-
event_type: 'session_launched',
|
|
209
|
-
session_id: generation.session_id,
|
|
210
|
-
slot_id: slot.slot_id,
|
|
211
|
-
work_item_id: workItem.id,
|
|
212
|
-
active_count_after: active.size
|
|
213
|
-
});
|
|
214
|
-
}
|
|
215
|
-
if (input.refillDelayMs && input.refillDelayMs > 0)
|
|
216
|
-
await delay(input.refillDelayMs);
|
|
313
|
+
const agent = buildAgentForGeneration(slot, generation, workItem);
|
|
314
|
+
launches.push({ slotIndex, slot, openedSlot, generation, agent, workItem, provisionalSessionId });
|
|
315
|
+
reservedSlots.add(slotIndex);
|
|
217
316
|
}
|
|
218
|
-
|
|
219
|
-
|
|
317
|
+
return launches;
|
|
318
|
+
}
|
|
319
|
+
/**
 * Refresh the wall-clock and utilization figures on the scheduler `state`.
 *
 * Writes `state.wall_time_ms` (elapsed time since `schedulerStartedAt`, never
 * negative), raises `state.active_slot_time_ms` to at least
 * `completed_count * wall_time_ms`, and stores `state.scheduler_utilization`
 * as that busy time divided by `wall_time_ms * targetActiveSlots`, clamped to
 * at most 1 and rounded to three decimals.
 *
 * NOTE(review): busy time is estimated from the completed-item count, not
 * measured per session, so the ratio reaches 1 once `completed_count` is at
 * least `targetActiveSlots` — confirm this is the intended metric.
 */
function updateUtilizationMetrics() {
    const wallTimeMs = Math.max(0, Date.now() - schedulerStartedAt);
    state.wall_time_ms = wallTimeMs;
    const busyEstimateMs = state.completed_count * wallTimeMs;
    state.active_slot_time_ms = Math.max(state.active_slot_time_ms, busyEstimateMs);
    // Guard the division: capacity is at least 1 ms even before any time passes.
    const capacityMs = Math.max(1, wallTimeMs * targetActiveSlots);
    const ratio = Math.min(1, state.active_slot_time_ms / capacityMs);
    state.scheduler_utilization = Number(ratio.toFixed(3));
}
|
|
221
325
|
}
|
|
222
326
|
export function normalizeTargetActiveSlots(value, maxActiveSlots = MAX_AGENT_COUNT) {
|
|
@@ -261,7 +365,14 @@ function buildState(missionId, targetActiveSlots, queue, slots, active, opts) {
|
|
|
261
365
|
pending_queue_drained: pendingCount === 0,
|
|
262
366
|
all_slots_closed_after_drain: slots.length > 0 && slots.every((slot) => slot.status === 'closed'),
|
|
263
367
|
all_generations_closed: false,
|
|
264
|
-
blockers: [...(previous?.blockers || [])]
|
|
368
|
+
blockers: [...(previous?.blockers || [])],
|
|
369
|
+
batch_dispatch_count: previous?.batch_dispatch_count || 0,
|
|
370
|
+
largest_batch_size: previous?.largest_batch_size || 0,
|
|
371
|
+
first_batch_launch_span_ms: previous?.first_batch_launch_span_ms || 0,
|
|
372
|
+
average_batch_launch_span_ms: previous?.average_batch_launch_span_ms || 0,
|
|
373
|
+
scheduler_utilization: previous?.scheduler_utilization || 0,
|
|
374
|
+
active_slot_time_ms: previous?.active_slot_time_ms || 0,
|
|
375
|
+
wall_time_ms: previous?.wall_time_ms || 0
|
|
265
376
|
};
|
|
266
377
|
}
|
|
267
378
|
async function writeAll(root, currentState, slots, queue, active, event, onSchedulerEvent) {
|
|
@@ -289,6 +400,13 @@ async function writeAll(root, currentState, slots, queue, active, event, onSched
|
|
|
289
400
|
currentState.blocked = nextState.blocked;
|
|
290
401
|
currentState.pending_queue_drained = nextState.pending_queue_drained;
|
|
291
402
|
currentState.all_slots_closed_after_drain = nextState.all_slots_closed_after_drain;
|
|
403
|
+
currentState.batch_dispatch_count = nextState.batch_dispatch_count;
|
|
404
|
+
currentState.largest_batch_size = nextState.largest_batch_size;
|
|
405
|
+
currentState.first_batch_launch_span_ms = nextState.first_batch_launch_span_ms;
|
|
406
|
+
currentState.average_batch_launch_span_ms = nextState.average_batch_launch_span_ms;
|
|
407
|
+
currentState.scheduler_utilization = nextState.scheduler_utilization;
|
|
408
|
+
currentState.active_slot_time_ms = nextState.active_slot_time_ms;
|
|
409
|
+
currentState.wall_time_ms = nextState.wall_time_ms;
|
|
292
410
|
await writeAgentWorkQueue(root, queue);
|
|
293
411
|
await writeAgentWorkerSlots(root, slots);
|
|
294
412
|
await writeJsonAtomic(path.join(root, 'agent-scheduler-state.json'), currentState);
|
|
@@ -13,7 +13,7 @@ export const DEFAULT_AGENT_CONCURRENCY = 5;
|
|
|
13
13
|
// ceiling to up to 100 concurrent clone sessions. Only the naruto path opts into this
|
|
14
14
|
// cap; every other roster/scheduler caller keeps MAX_AGENT_COUNT as the default.
|
|
15
15
|
export const MAX_NARUTO_AGENT_COUNT = 100;
|
|
16
|
-
export const DEFAULT_NARUTO_CLONES =
|
|
16
|
+
export const DEFAULT_NARUTO_CLONES = 32;
|
|
17
17
|
export const AGENT_BACKENDS = ['fake', 'process', 'codex-sdk', 'zellij', 'ollama', 'local-llm'];
|
|
18
18
|
export function normalizeAgentBackend(input) {
|
|
19
19
|
const value = String(input || 'codex-sdk');
|