@ash-ai/server 0.0.19 → 0.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/sandbox-env.test.js +1 -0
- package/dist/__tests__/sandbox-env.test.js.map +1 -1
- package/dist/__tests__/sandbox-isolation.test.d.ts +2 -0
- package/dist/__tests__/sandbox-isolation.test.d.ts.map +1 -0
- package/dist/__tests__/sandbox-isolation.test.js +190 -0
- package/dist/__tests__/sandbox-isolation.test.js.map +1 -0
- package/dist/__tests__/session-create.test.d.ts +2 -0
- package/dist/__tests__/session-create.test.d.ts.map +1 -0
- package/dist/__tests__/session-create.test.js +83 -0
- package/dist/__tests__/session-create.test.js.map +1 -0
- package/dist/__tests__/tracing.test.d.ts +2 -0
- package/dist/__tests__/tracing.test.d.ts.map +1 -0
- package/dist/__tests__/tracing.test.js +69 -0
- package/dist/__tests__/tracing.test.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -2
- package/dist/index.js.map +1 -1
- package/dist/routes/agents.d.ts.map +1 -1
- package/dist/routes/agents.js +3 -0
- package/dist/routes/agents.js.map +1 -1
- package/dist/routes/sessions.d.ts.map +1 -1
- package/dist/routes/sessions.js +279 -180
- package/dist/routes/sessions.js.map +1 -1
- package/dist/telemetry/tracing.d.ts +13 -0
- package/dist/telemetry/tracing.d.ts.map +1 -0
- package/dist/telemetry/tracing.js +50 -0
- package/dist/telemetry/tracing.js.map +1 -0
- package/package.json +9 -3
package/dist/routes/sessions.js
CHANGED
|
@@ -5,10 +5,12 @@ import { SSE_WRITE_TIMEOUT_MS, timingEnabled, startTimer, logTiming } from '@ash
|
|
|
5
5
|
import { getAgent, insertSession, insertForkedSession, getSession, listSessions, updateSessionStatus, updateSessionSandbox, updateSessionConfig, touchSession, updateSessionRunner, insertMessage, listMessages, insertSessionEvent, insertSessionEvents, listSessionEvents } from '../db/index.js';
|
|
6
6
|
import { classifyBridgeMessage, classifyToStreamEvents } from '@ash-ai/shared';
|
|
7
7
|
import { VERSION } from '../version.js';
|
|
8
|
-
import { restoreSessionState, hasPersistedState, restoreStateFromCloud } from '@ash-ai/sandbox';
|
|
8
|
+
import { restoreSessionState, hasPersistedState, restoreStateFromCloud, restoreAgentFromCloud } from '@ash-ai/sandbox';
|
|
9
9
|
import { decryptCredential } from './credentials.js';
|
|
10
10
|
import { touchCredentialUsed } from '../db/index.js';
|
|
11
11
|
import { recordUsageFromMessage } from '../usage/extractor.js';
|
|
12
|
+
import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';
|
|
13
|
+
const tracer = trace.getTracer('ash-coordinator');
|
|
12
14
|
/** Structured log line for every resume — always on, not gated by ASH_DEBUG_TIMING. */
|
|
13
15
|
function logResume(path, sessionId, agentName, source) {
|
|
14
16
|
process.stderr.write(JSON.stringify({
|
|
@@ -97,73 +99,101 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
97
99
|
201: sessionResponse,
|
|
98
100
|
400: { $ref: 'ApiError#' },
|
|
99
101
|
404: { $ref: 'ApiError#' },
|
|
102
|
+
422: { $ref: 'ApiError#' },
|
|
100
103
|
500: { $ref: 'ApiError#' },
|
|
101
104
|
503: { $ref: 'ApiError#' },
|
|
102
105
|
},
|
|
103
106
|
},
|
|
104
107
|
}, async (req, reply) => {
|
|
105
108
|
const { agent, credentialId, extraEnv: bodyExtraEnv, startupScript, model, mcpServers, systemPrompt, permissionMode, allowedTools, disallowedTools, betas, subagents, initialAgent } = req.body;
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
109
|
+
return tracer.startActiveSpan('ash.session.create', { attributes: { 'ash.agent.name': agent } }, async (span) => {
|
|
110
|
+
try {
|
|
111
|
+
const agentRecord = await getAgent(agent, req.tenantId);
|
|
112
|
+
if (!agentRecord) {
|
|
113
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent not found' });
|
|
114
|
+
return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
|
|
115
|
+
}
|
|
116
|
+
// Validate agent directory exists on disk — auto-restore from cloud if missing
|
|
117
|
+
if (!existsSync(agentRecord.path)) {
|
|
118
|
+
const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
|
|
119
|
+
if (!restored) {
|
|
120
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent directory not found' });
|
|
121
|
+
return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${agent}" may need to be re-deployed.`, statusCode: 422 });
|
|
122
|
+
}
|
|
123
|
+
console.log(`[sessions] Restored agent "${agent}" from cloud storage`);
|
|
124
|
+
}
|
|
125
|
+
// Resolve credential to env vars if provided
|
|
126
|
+
let extraEnv;
|
|
127
|
+
if (credentialId) {
|
|
128
|
+
const cred = await decryptCredential(credentialId, req.tenantId);
|
|
129
|
+
if (!cred) {
|
|
130
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid credential' });
|
|
131
|
+
return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
|
|
132
|
+
}
|
|
133
|
+
const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
|
|
134
|
+
extraEnv = { [envKey]: cred.key };
|
|
135
|
+
touchCredentialUsed(credentialId).catch(() => { });
|
|
136
|
+
}
|
|
137
|
+
// Merge body-level extraEnv (overrides credential env on conflict)
|
|
138
|
+
if (bodyExtraEnv) {
|
|
139
|
+
extraEnv = { ...extraEnv, ...bodyExtraEnv };
|
|
140
|
+
}
|
|
141
|
+
// Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
|
|
142
|
+
if (permissionMode) {
|
|
143
|
+
extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
|
|
144
|
+
}
|
|
145
|
+
const sessionId = randomUUID();
|
|
146
|
+
span.setAttribute('ash.session.id', sessionId);
|
|
147
|
+
try {
|
|
148
|
+
span.addEvent('selectBackend.start');
|
|
149
|
+
const { backend, runnerId } = await coordinator.selectBackend();
|
|
150
|
+
span.addEvent('selectBackend.end');
|
|
151
|
+
span.addEvent('createSandbox.start');
|
|
152
|
+
const handle = await backend.createSandbox({
|
|
153
|
+
sessionId,
|
|
154
|
+
agentDir: agentRecord.path,
|
|
155
|
+
agentName: agentRecord.name,
|
|
156
|
+
sandboxId: sessionId,
|
|
157
|
+
extraEnv,
|
|
158
|
+
startupScript,
|
|
159
|
+
mcpServers,
|
|
160
|
+
systemPrompt,
|
|
161
|
+
onOomKill: () => {
|
|
162
|
+
updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
163
|
+
},
|
|
164
|
+
});
|
|
165
|
+
span.addEvent('createSandbox.end');
|
|
166
|
+
// Resolve effective model: explicit request > agent record > null (SDK default)
|
|
167
|
+
const effectiveModel = model || agentRecord.model || undefined;
|
|
168
|
+
if (effectiveModel)
|
|
169
|
+
span.setAttribute('ash.model', effectiveModel);
|
|
170
|
+
// Build session-level SDK config (persisted on session, injected into every query)
|
|
171
|
+
const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
|
|
172
|
+
? { allowedTools, disallowedTools, betas, subagents, initialAgent }
|
|
173
|
+
: null;
|
|
174
|
+
const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
|
|
175
|
+
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
176
|
+
await updateSessionRunner(sessionId, effectiveRunnerId);
|
|
177
|
+
await updateSessionStatus(sessionId, 'active');
|
|
178
|
+
// Record lifecycle event
|
|
179
|
+
insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
180
|
+
telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
|
|
181
|
+
return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
|
|
182
|
+
}
|
|
183
|
+
catch (err) {
|
|
184
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
185
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
186
|
+
span.recordException(err instanceof Error ? err : new Error(msg));
|
|
187
|
+
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
188
|
+
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
189
|
+
}
|
|
190
|
+
return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
|
|
191
|
+
}
|
|
116
192
|
}
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
touchCredentialUsed(credentialId).catch(() => { });
|
|
120
|
-
}
|
|
121
|
-
// Merge body-level extraEnv (overrides credential env on conflict)
|
|
122
|
-
if (bodyExtraEnv) {
|
|
123
|
-
extraEnv = { ...extraEnv, ...bodyExtraEnv };
|
|
124
|
-
}
|
|
125
|
-
// Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
|
|
126
|
-
if (permissionMode) {
|
|
127
|
-
extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
|
|
128
|
-
}
|
|
129
|
-
const sessionId = randomUUID();
|
|
130
|
-
try {
|
|
131
|
-
const { backend, runnerId } = await coordinator.selectBackend();
|
|
132
|
-
const handle = await backend.createSandbox({
|
|
133
|
-
sessionId,
|
|
134
|
-
agentDir: agentRecord.path,
|
|
135
|
-
agentName: agentRecord.name,
|
|
136
|
-
sandboxId: sessionId,
|
|
137
|
-
extraEnv,
|
|
138
|
-
startupScript,
|
|
139
|
-
mcpServers,
|
|
140
|
-
systemPrompt,
|
|
141
|
-
onOomKill: () => {
|
|
142
|
-
updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
143
|
-
},
|
|
144
|
-
});
|
|
145
|
-
// Resolve effective model: explicit request > agent record > null (SDK default)
|
|
146
|
-
const effectiveModel = model || agentRecord.model || undefined;
|
|
147
|
-
// Build session-level SDK config (persisted on session, injected into every query)
|
|
148
|
-
const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
|
|
149
|
-
? { allowedTools, disallowedTools, betas, subagents, initialAgent }
|
|
150
|
-
: null;
|
|
151
|
-
const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
|
|
152
|
-
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
153
|
-
await updateSessionRunner(sessionId, effectiveRunnerId);
|
|
154
|
-
await updateSessionStatus(sessionId, 'active');
|
|
155
|
-
// Record lifecycle event
|
|
156
|
-
insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
157
|
-
telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
|
|
158
|
-
return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
|
|
159
|
-
}
|
|
160
|
-
catch (err) {
|
|
161
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
162
|
-
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
163
|
-
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
193
|
+
finally {
|
|
194
|
+
span.end();
|
|
164
195
|
}
|
|
165
|
-
|
|
166
|
-
}
|
|
196
|
+
});
|
|
167
197
|
});
|
|
168
198
|
// List sessions (optional ?agent=name filter)
|
|
169
199
|
app.get('/api/sessions', {
|
|
@@ -357,17 +387,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
357
387
|
return reply.status(400).send({ error: `Session is ${session.status}`, statusCode: 400 });
|
|
358
388
|
}
|
|
359
389
|
const { content, includePartialMessages, model: messageModel, maxTurns, maxBudgetUsd, effort, thinking, outputFormat } = req.body;
|
|
390
|
+
const messageSpan = tracer.startSpan('ash.session.message', {
|
|
391
|
+
attributes: {
|
|
392
|
+
'ash.session.id': session.id,
|
|
393
|
+
'ash.agent.name': session.agentName,
|
|
394
|
+
'ash.sandbox.id': session.sandboxId,
|
|
395
|
+
},
|
|
396
|
+
});
|
|
397
|
+
const messageCtx = trace.setSpan(context.active(), messageSpan);
|
|
398
|
+
if (session.model)
|
|
399
|
+
messageSpan.setAttribute('ash.model', session.model);
|
|
360
400
|
let backend;
|
|
361
401
|
try {
|
|
362
402
|
backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
363
403
|
}
|
|
364
404
|
catch {
|
|
365
405
|
await updateSessionStatus(session.id, 'error');
|
|
406
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Runner not available' });
|
|
407
|
+
messageSpan.end();
|
|
366
408
|
return reply.status(500).send({ error: 'Runner not available', statusCode: 500 });
|
|
367
409
|
}
|
|
368
410
|
const sandbox = backend.getSandbox(session.sandboxId);
|
|
369
411
|
if (!sandbox) {
|
|
370
412
|
await updateSessionStatus(session.id, 'error');
|
|
413
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Sandbox not found' });
|
|
414
|
+
messageSpan.end();
|
|
371
415
|
return reply.status(500).send({ error: 'Sandbox not found', statusCode: 500 });
|
|
372
416
|
}
|
|
373
417
|
const lookupMs = elapsed?.() ?? 0;
|
|
@@ -387,6 +431,10 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
387
431
|
await writeSSE(reply.raw, `event: session_start\ndata: ${JSON.stringify({ sessionId: session.id, version: VERSION })}\n\n`);
|
|
388
432
|
let eventCount = 0;
|
|
389
433
|
let firstEventMs = 0;
|
|
434
|
+
// Inject trace context for bridge propagation
|
|
435
|
+
const carrier = {};
|
|
436
|
+
propagation.inject(messageCtx, carrier);
|
|
437
|
+
const traceContext = carrier['traceparent'];
|
|
390
438
|
try {
|
|
391
439
|
// Model precedence: per-message > session-level > agent default (.claude/settings.json)
|
|
392
440
|
const queryModel = messageModel || session.model || undefined;
|
|
@@ -410,6 +458,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
410
458
|
...(cfg?.betas && { betas: cfg.betas }),
|
|
411
459
|
...(cfg?.subagents && { subagents: cfg.subagents }),
|
|
412
460
|
...(cfg?.initialAgent && { initialAgent: cfg.initialAgent }),
|
|
461
|
+
// Distributed tracing context
|
|
462
|
+
...(traceContext && { traceContext }),
|
|
413
463
|
});
|
|
414
464
|
for await (const event of events) {
|
|
415
465
|
eventCount++;
|
|
@@ -462,6 +512,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
462
512
|
}
|
|
463
513
|
catch (err) {
|
|
464
514
|
const msg = err instanceof Error ? err.message : String(err);
|
|
515
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
516
|
+
messageSpan.recordException(err instanceof Error ? err : new Error(msg));
|
|
465
517
|
reply.raw.write(`event: error\ndata: ${JSON.stringify({ error: msg })}\n\n`);
|
|
466
518
|
// Signal stream completion — the session remains active (not ended)
|
|
467
519
|
reply.raw.write(`event: done\ndata: ${JSON.stringify({ sessionId: session.id })}\n\n`);
|
|
@@ -469,6 +521,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
469
521
|
finally {
|
|
470
522
|
// Mark waiting after message processing completes
|
|
471
523
|
backend.markWaiting(session.sandboxId);
|
|
524
|
+
messageSpan.end();
|
|
472
525
|
}
|
|
473
526
|
if (elapsed) {
|
|
474
527
|
logTiming({
|
|
@@ -614,23 +667,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
614
667
|
},
|
|
615
668
|
},
|
|
616
669
|
}, async (req, reply) => {
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
670
|
+
return tracer.startActiveSpan('ash.session.pause', async (span) => {
|
|
671
|
+
try {
|
|
672
|
+
const session = await getSession(req.params.id);
|
|
673
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
674
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
675
|
+
}
|
|
676
|
+
span.setAttribute('ash.session.id', session.id);
|
|
677
|
+
if (session.status !== 'active') {
|
|
678
|
+
return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
|
|
679
|
+
}
|
|
680
|
+
// Best-effort persist state before pausing
|
|
681
|
+
try {
|
|
682
|
+
const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
683
|
+
backend.persistState(session.sandboxId, session.id, session.agentName);
|
|
684
|
+
}
|
|
685
|
+
catch { /* runner may be gone */ }
|
|
686
|
+
await updateSessionStatus(session.id, 'paused');
|
|
687
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
688
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
|
|
689
|
+
return reply.send({ session: { ...session, status: 'paused' } });
|
|
690
|
+
}
|
|
691
|
+
finally {
|
|
692
|
+
span.end();
|
|
693
|
+
}
|
|
694
|
+
});
|
|
634
695
|
});
|
|
635
696
|
// Stop session — explicit user action (distinct from pause which is idle-based)
|
|
636
697
|
app.post('/api/sessions/:id/stop', {
|
|
@@ -644,24 +705,32 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
644
705
|
},
|
|
645
706
|
},
|
|
646
707
|
}, async (req, reply) => {
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
708
|
+
return tracer.startActiveSpan('ash.session.stop', async (span) => {
|
|
709
|
+
try {
|
|
710
|
+
const session = await getSession(req.params.id);
|
|
711
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
712
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
713
|
+
}
|
|
714
|
+
span.setAttribute('ash.session.id', session.id);
|
|
715
|
+
if (session.status !== 'active' && session.status !== 'starting') {
|
|
716
|
+
return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
|
|
717
|
+
}
|
|
718
|
+
// Persist state and destroy sandbox
|
|
719
|
+
try {
|
|
720
|
+
const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
721
|
+
backend.persistState(session.sandboxId, session.id, session.agentName);
|
|
722
|
+
await backend.destroySandbox(session.sandboxId);
|
|
723
|
+
}
|
|
724
|
+
catch { /* runner may be gone */ }
|
|
725
|
+
await updateSessionStatus(session.id, 'stopped');
|
|
726
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
727
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
|
|
728
|
+
return reply.send({ session: { ...session, status: 'stopped' } });
|
|
729
|
+
}
|
|
730
|
+
finally {
|
|
731
|
+
span.end();
|
|
732
|
+
}
|
|
733
|
+
});
|
|
665
734
|
});
|
|
666
735
|
// Fork session — create a new session branching from parent's state and messages
|
|
667
736
|
app.post('/api/sessions/:id/fork', {
|
|
@@ -671,6 +740,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
671
740
|
response: {
|
|
672
741
|
201: sessionResponse,
|
|
673
742
|
404: { $ref: 'ApiError#' },
|
|
743
|
+
422: { $ref: 'ApiError#' },
|
|
674
744
|
500: { $ref: 'ApiError#' },
|
|
675
745
|
503: { $ref: 'ApiError#' },
|
|
676
746
|
},
|
|
@@ -684,6 +754,13 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
684
754
|
if (!agent) {
|
|
685
755
|
return reply.status(404).send({ error: `Agent "${parentSession.agentName}" not found`, statusCode: 404 });
|
|
686
756
|
}
|
|
757
|
+
if (!existsSync(agent.path)) {
|
|
758
|
+
const restored = await restoreAgentFromCloud(agent.name, agent.path, req.tenantId);
|
|
759
|
+
if (!restored) {
|
|
760
|
+
return reply.status(422).send({ error: `Agent directory not found at "${agent.path}". The agent "${parentSession.agentName}" may need to be re-deployed.`, statusCode: 422 });
|
|
761
|
+
}
|
|
762
|
+
console.log(`[sessions] Restored agent "${parentSession.agentName}" from cloud storage`);
|
|
763
|
+
}
|
|
687
764
|
// Persist parent workspace state if sandbox is still live
|
|
688
765
|
try {
|
|
689
766
|
const parentBackend = await coordinator.getBackendForRunnerAsync(parentSession.runnerId);
|
|
@@ -733,101 +810,123 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
733
810
|
200: sessionResponse,
|
|
734
811
|
404: { $ref: 'ApiError#' },
|
|
735
812
|
410: { $ref: 'ApiError#' },
|
|
813
|
+
422: { $ref: 'ApiError#' },
|
|
736
814
|
500: { $ref: 'ApiError#' },
|
|
737
815
|
503: { $ref: 'ApiError#' },
|
|
738
816
|
},
|
|
739
817
|
},
|
|
740
818
|
}, async (req, reply) => {
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
759
|
-
if (oldBackend.isSandboxAlive(session.sandboxId)) {
|
|
760
|
-
oldBackend.recordWarmHit();
|
|
761
|
-
logResume('warm', session.id, session.agentName);
|
|
762
|
-
await updateSessionStatus(session.id, 'active');
|
|
763
|
-
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
764
|
-
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
|
|
765
|
-
return reply.send({ session: { ...session, status: 'active' } });
|
|
766
|
-
}
|
|
767
|
-
}
|
|
768
|
-
catch { /* runner gone — cold path */ }
|
|
769
|
-
// Cold path: pick any healthy runner
|
|
770
|
-
try {
|
|
771
|
-
const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
|
|
772
|
-
const workspaceExists = existsSync(oldWorkspaceDir);
|
|
773
|
-
let resumeSource = 'fresh';
|
|
774
|
-
if (!workspaceExists) {
|
|
775
|
-
if (hasPersistedState(dataDir, session.id, session.tenantId)) {
|
|
776
|
-
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
777
|
-
resumeSource = 'local';
|
|
819
|
+
return tracer.startActiveSpan('ash.session.resume', async (span) => {
|
|
820
|
+
try {
|
|
821
|
+
const session = await getSession(req.params.id);
|
|
822
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
823
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
824
|
+
}
|
|
825
|
+
span.setAttribute('ash.session.id', session.id);
|
|
826
|
+
if (session.status === 'ended') {
|
|
827
|
+
return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
|
|
828
|
+
}
|
|
829
|
+
if (session.status === 'active') {
|
|
830
|
+
return reply.send({ session });
|
|
831
|
+
}
|
|
832
|
+
// Resumable statuses: 'paused', 'stopped', 'error', 'starting'
|
|
833
|
+
const agentRecord = await getAgent(session.agentName, req.tenantId);
|
|
834
|
+
if (!agentRecord) {
|
|
835
|
+
return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
|
|
778
836
|
}
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
const restored = await
|
|
782
|
-
if (restored) {
|
|
783
|
-
|
|
784
|
-
resumeSource = 'cloud';
|
|
837
|
+
// Validate agent directory exists — auto-restore from cloud if missing
|
|
838
|
+
if (!existsSync(agentRecord.path)) {
|
|
839
|
+
const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
|
|
840
|
+
if (!restored) {
|
|
841
|
+
return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
|
|
785
842
|
}
|
|
843
|
+
console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
|
|
844
|
+
}
|
|
845
|
+
// Fast path: try the same runner if sandbox is still alive
|
|
846
|
+
try {
|
|
847
|
+
const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
848
|
+
if (oldBackend.isSandboxAlive(session.sandboxId)) {
|
|
849
|
+
oldBackend.recordWarmHit();
|
|
850
|
+
logResume('warm', session.id, session.agentName);
|
|
851
|
+
span.setAttribute('ash.resume.path', 'warm');
|
|
852
|
+
await updateSessionStatus(session.id, 'active');
|
|
853
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
854
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
|
|
855
|
+
return reply.send({ session: { ...session, status: 'active' } });
|
|
856
|
+
}
|
|
857
|
+
}
|
|
858
|
+
catch { /* runner gone — cold path */ }
|
|
859
|
+
// Cold path: pick any healthy runner
|
|
860
|
+
try {
|
|
861
|
+
const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
|
|
862
|
+
const workspaceExists = existsSync(oldWorkspaceDir);
|
|
863
|
+
let resumeSource = 'fresh';
|
|
864
|
+
if (!workspaceExists) {
|
|
865
|
+
if (hasPersistedState(dataDir, session.id, session.tenantId)) {
|
|
866
|
+
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
867
|
+
resumeSource = 'local';
|
|
868
|
+
}
|
|
869
|
+
else {
|
|
870
|
+
// Fall back to cloud storage
|
|
871
|
+
const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
|
|
872
|
+
if (restored) {
|
|
873
|
+
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
874
|
+
resumeSource = 'cloud';
|
|
875
|
+
}
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
else {
|
|
879
|
+
resumeSource = 'local';
|
|
880
|
+
}
|
|
881
|
+
span.setAttribute('ash.resume.path', 'cold');
|
|
882
|
+
span.setAttribute('ash.resume.source', resumeSource);
|
|
883
|
+
const workspaceAvailable = existsSync(oldWorkspaceDir);
|
|
884
|
+
const { backend, runnerId } = await coordinator.selectBackend();
|
|
885
|
+
const handle = await backend.createSandbox({
|
|
886
|
+
sessionId: session.id,
|
|
887
|
+
agentDir: agentRecord.path,
|
|
888
|
+
agentName: session.agentName,
|
|
889
|
+
sandboxId: session.id,
|
|
890
|
+
skipAgentCopy: workspaceAvailable,
|
|
891
|
+
onOomKill: () => {
|
|
892
|
+
updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
893
|
+
},
|
|
894
|
+
});
|
|
895
|
+
// Track resume source
|
|
896
|
+
switch (resumeSource) {
|
|
897
|
+
case 'local':
|
|
898
|
+
backend.recordColdLocalHit();
|
|
899
|
+
break;
|
|
900
|
+
case 'cloud':
|
|
901
|
+
backend.recordColdCloudHit();
|
|
902
|
+
break;
|
|
903
|
+
case 'fresh':
|
|
904
|
+
backend.recordColdFreshHit();
|
|
905
|
+
break;
|
|
906
|
+
}
|
|
907
|
+
logResume('cold', session.id, session.agentName, resumeSource);
|
|
908
|
+
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
909
|
+
await updateSessionSandbox(session.id, handle.sandboxId);
|
|
910
|
+
await updateSessionRunner(session.id, effectiveRunnerId);
|
|
911
|
+
await updateSessionStatus(session.id, 'active');
|
|
912
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
913
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
|
|
914
|
+
return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
|
|
915
|
+
}
|
|
916
|
+
catch (err) {
|
|
917
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
918
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
919
|
+
span.recordException(err instanceof Error ? err : new Error(msg));
|
|
920
|
+
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
921
|
+
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
922
|
+
}
|
|
923
|
+
return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
|
|
786
924
|
}
|
|
787
925
|
}
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
}
|
|
791
|
-
const workspaceAvailable = existsSync(oldWorkspaceDir);
|
|
792
|
-
const { backend, runnerId } = await coordinator.selectBackend();
|
|
793
|
-
const handle = await backend.createSandbox({
|
|
794
|
-
sessionId: session.id,
|
|
795
|
-
agentDir: agentRecord.path,
|
|
796
|
-
agentName: session.agentName,
|
|
797
|
-
sandboxId: session.id,
|
|
798
|
-
skipAgentCopy: workspaceAvailable,
|
|
799
|
-
onOomKill: () => {
|
|
800
|
-
updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
801
|
-
},
|
|
802
|
-
});
|
|
803
|
-
// Track resume source
|
|
804
|
-
switch (resumeSource) {
|
|
805
|
-
case 'local':
|
|
806
|
-
backend.recordColdLocalHit();
|
|
807
|
-
break;
|
|
808
|
-
case 'cloud':
|
|
809
|
-
backend.recordColdCloudHit();
|
|
810
|
-
break;
|
|
811
|
-
case 'fresh':
|
|
812
|
-
backend.recordColdFreshHit();
|
|
813
|
-
break;
|
|
814
|
-
}
|
|
815
|
-
logResume('cold', session.id, session.agentName, resumeSource);
|
|
816
|
-
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
817
|
-
await updateSessionSandbox(session.id, handle.sandboxId);
|
|
818
|
-
await updateSessionRunner(session.id, effectiveRunnerId);
|
|
819
|
-
await updateSessionStatus(session.id, 'active');
|
|
820
|
-
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
821
|
-
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
|
|
822
|
-
return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
|
|
823
|
-
}
|
|
824
|
-
catch (err) {
|
|
825
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
826
|
-
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
827
|
-
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
926
|
+
finally {
|
|
927
|
+
span.end();
|
|
828
928
|
}
|
|
829
|
-
|
|
830
|
-
}
|
|
929
|
+
});
|
|
831
930
|
});
|
|
832
931
|
// End session
|
|
833
932
|
app.delete('/api/sessions/:id', {
|