@ash-ai/server 0.0.23 → 0.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/sandbox-env.test.js +1 -0
- package/dist/__tests__/sandbox-env.test.js.map +1 -1
- package/dist/__tests__/tracing.test.d.ts +2 -0
- package/dist/__tests__/tracing.test.d.ts.map +1 -0
- package/dist/__tests__/tracing.test.js +69 -0
- package/dist/__tests__/tracing.test.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -2
- package/dist/index.js.map +1 -1
- package/dist/routes/agents.d.ts.map +1 -1
- package/dist/routes/agents.js +13 -6
- package/dist/routes/agents.js.map +1 -1
- package/dist/routes/api-keys.d.ts.map +1 -1
- package/dist/routes/api-keys.js +19 -3
- package/dist/routes/api-keys.js.map +1 -1
- package/dist/routes/runners.d.ts.map +1 -1
- package/dist/routes/runners.js +21 -3
- package/dist/routes/runners.js.map +1 -1
- package/dist/routes/sessions.d.ts.map +1 -1
- package/dist/routes/sessions.js +282 -203
- package/dist/routes/sessions.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +23 -6
- package/dist/server.js.map +1 -1
- package/dist/telemetry/tracing.d.ts +13 -0
- package/dist/telemetry/tracing.d.ts.map +1 -0
- package/dist/telemetry/tracing.js +50 -0
- package/dist/telemetry/tracing.js.map +1 -0
- package/package.json +20 -12
package/dist/routes/sessions.js
CHANGED
|
@@ -9,6 +9,8 @@ import { restoreSessionState, hasPersistedState, restoreStateFromCloud, restoreA
|
|
|
9
9
|
import { decryptCredential } from './credentials.js';
|
|
10
10
|
import { touchCredentialUsed } from '../db/index.js';
|
|
11
11
|
import { recordUsageFromMessage } from '../usage/extractor.js';
|
|
12
|
+
import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';
|
|
13
|
+
const tracer = trace.getTracer('ash-coordinator');
|
|
12
14
|
/** Structured log line for every resume — always on, not gated by ASH_DEBUG_TIMING. */
|
|
13
15
|
function logResume(path, sessionId, agentName, source) {
|
|
14
16
|
process.stderr.write(JSON.stringify({
|
|
@@ -60,16 +62,22 @@ export async function writeSSE(raw, frame) {
|
|
|
60
62
|
export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
61
63
|
// Create session — picks the best runner via coordinator
|
|
62
64
|
app.post('/api/sessions', {
|
|
65
|
+
config: {
|
|
66
|
+
rateLimit: {
|
|
67
|
+
max: 20,
|
|
68
|
+
timeWindow: '15 minutes',
|
|
69
|
+
},
|
|
70
|
+
},
|
|
63
71
|
schema: {
|
|
64
72
|
tags: ['sessions'],
|
|
65
73
|
body: {
|
|
66
74
|
type: 'object',
|
|
67
75
|
properties: {
|
|
68
|
-
agent: { type: 'string' },
|
|
69
|
-
credentialId: { type: 'string' },
|
|
76
|
+
agent: { type: 'string', minLength: 1, maxLength: 255 },
|
|
77
|
+
credentialId: { type: 'string', maxLength: 255 },
|
|
70
78
|
extraEnv: { type: 'object', additionalProperties: { type: 'string' } },
|
|
71
|
-
startupScript: { type: 'string' },
|
|
72
|
-
model: { type: 'string', description: 'Model override for this session. Overrides agent .claude/settings.json default.' },
|
|
79
|
+
startupScript: { type: 'string', maxLength: 100_000 },
|
|
80
|
+
model: { type: 'string', maxLength: 100, description: 'Model override for this session. Overrides agent .claude/settings.json default.' },
|
|
73
81
|
mcpServers: {
|
|
74
82
|
type: 'object',
|
|
75
83
|
description: 'Per-session MCP servers. Merged into agent .mcp.json (session overrides agent). Enables sidecar pattern.',
|
|
@@ -83,7 +91,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
83
91
|
},
|
|
84
92
|
},
|
|
85
93
|
},
|
|
86
|
-
systemPrompt: { type: 'string', description: 'System prompt override. Replaces agent CLAUDE.md for this session.' },
|
|
94
|
+
systemPrompt: { type: 'string', maxLength: 1_000_000, description: 'System prompt override. Replaces agent CLAUDE.md for this session.' },
|
|
87
95
|
permissionMode: { type: 'string', enum: ['bypassPermissions', 'permissionsByAgent', 'default'], description: 'Permission mode for the SDK inside the sandbox. Defaults to bypassPermissions (sandbox isolation is the security boundary).' },
|
|
88
96
|
allowedTools: { type: 'array', items: { type: 'string' }, description: 'Whitelist of allowed tool names for this session.' },
|
|
89
97
|
disallowedTools: { type: 'array', items: { type: 'string' }, description: 'Blacklist of disallowed tool names for this session.' },
|
|
@@ -104,75 +112,94 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
104
112
|
},
|
|
105
113
|
}, async (req, reply) => {
|
|
106
114
|
const { agent, credentialId, extraEnv: bodyExtraEnv, startupScript, model, mcpServers, systemPrompt, permissionMode, allowedTools, disallowedTools, betas, subagents, initialAgent } = req.body;
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
115
|
+
return tracer.startActiveSpan('ash.session.create', { attributes: { 'ash.agent.name': agent } }, async (span) => {
|
|
116
|
+
try {
|
|
117
|
+
const agentRecord = await getAgent(agent, req.tenantId);
|
|
118
|
+
if (!agentRecord) {
|
|
119
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent not found' });
|
|
120
|
+
return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
|
|
121
|
+
}
|
|
122
|
+
// Validate agent directory exists on disk — auto-restore from cloud if missing
|
|
123
|
+
if (!existsSync(agentRecord.path)) {
|
|
124
|
+
const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
|
|
125
|
+
if (!restored) {
|
|
126
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent directory not found' });
|
|
127
|
+
return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${agent}" may need to be re-deployed.`, statusCode: 422 });
|
|
128
|
+
}
|
|
129
|
+
console.log(`[sessions] Restored agent "${agent}" from cloud storage`);
|
|
130
|
+
}
|
|
131
|
+
// Resolve credential to env vars if provided
|
|
132
|
+
let extraEnv;
|
|
133
|
+
if (credentialId) {
|
|
134
|
+
const cred = await decryptCredential(credentialId, req.tenantId);
|
|
135
|
+
if (!cred) {
|
|
136
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid credential' });
|
|
137
|
+
return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
|
|
138
|
+
}
|
|
139
|
+
const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
|
|
140
|
+
extraEnv = { [envKey]: cred.key };
|
|
141
|
+
touchCredentialUsed(credentialId).catch(() => { });
|
|
142
|
+
}
|
|
143
|
+
// Merge body-level extraEnv (overrides credential env on conflict)
|
|
144
|
+
if (bodyExtraEnv) {
|
|
145
|
+
extraEnv = { ...extraEnv, ...bodyExtraEnv };
|
|
146
|
+
}
|
|
147
|
+
// Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
|
|
148
|
+
if (permissionMode) {
|
|
149
|
+
extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
|
|
150
|
+
}
|
|
151
|
+
const sessionId = randomUUID();
|
|
152
|
+
span.setAttribute('ash.session.id', sessionId);
|
|
153
|
+
try {
|
|
154
|
+
span.addEvent('selectBackend.start');
|
|
155
|
+
const { backend, runnerId } = await coordinator.selectBackend();
|
|
156
|
+
span.addEvent('selectBackend.end');
|
|
157
|
+
span.addEvent('createSandbox.start');
|
|
158
|
+
const handle = await backend.createSandbox({
|
|
159
|
+
sessionId,
|
|
160
|
+
agentDir: agentRecord.path,
|
|
161
|
+
agentName: agentRecord.name,
|
|
162
|
+
sandboxId: sessionId,
|
|
163
|
+
extraEnv,
|
|
164
|
+
startupScript,
|
|
165
|
+
mcpServers,
|
|
166
|
+
systemPrompt,
|
|
167
|
+
onOomKill: () => {
|
|
168
|
+
updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
169
|
+
},
|
|
170
|
+
});
|
|
171
|
+
span.addEvent('createSandbox.end');
|
|
172
|
+
// Resolve effective model: explicit request > agent record > null (SDK default)
|
|
173
|
+
const effectiveModel = model || agentRecord.model || undefined;
|
|
174
|
+
if (effectiveModel)
|
|
175
|
+
span.setAttribute('ash.model', effectiveModel);
|
|
176
|
+
// Build session-level SDK config (persisted on session, injected into every query)
|
|
177
|
+
const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
|
|
178
|
+
? { allowedTools, disallowedTools, betas, subagents, initialAgent }
|
|
179
|
+
: null;
|
|
180
|
+
const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
|
|
181
|
+
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
182
|
+
await updateSessionRunner(sessionId, effectiveRunnerId);
|
|
183
|
+
await updateSessionStatus(sessionId, 'active');
|
|
184
|
+
// Record lifecycle event
|
|
185
|
+
insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
186
|
+
telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
|
|
187
|
+
return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
|
|
188
|
+
}
|
|
189
|
+
catch (err) {
|
|
190
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
191
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
192
|
+
span.recordException(err instanceof Error ? err : new Error(msg));
|
|
193
|
+
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
194
|
+
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
195
|
+
}
|
|
196
|
+
return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
|
|
197
|
+
}
|
|
125
198
|
}
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
touchCredentialUsed(credentialId).catch(() => { });
|
|
129
|
-
}
|
|
130
|
-
// Merge body-level extraEnv (overrides credential env on conflict)
|
|
131
|
-
if (bodyExtraEnv) {
|
|
132
|
-
extraEnv = { ...extraEnv, ...bodyExtraEnv };
|
|
133
|
-
}
|
|
134
|
-
// Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
|
|
135
|
-
if (permissionMode) {
|
|
136
|
-
extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
|
|
137
|
-
}
|
|
138
|
-
const sessionId = randomUUID();
|
|
139
|
-
try {
|
|
140
|
-
const { backend, runnerId } = await coordinator.selectBackend();
|
|
141
|
-
const handle = await backend.createSandbox({
|
|
142
|
-
sessionId,
|
|
143
|
-
agentDir: agentRecord.path,
|
|
144
|
-
agentName: agentRecord.name,
|
|
145
|
-
sandboxId: sessionId,
|
|
146
|
-
extraEnv,
|
|
147
|
-
startupScript,
|
|
148
|
-
mcpServers,
|
|
149
|
-
systemPrompt,
|
|
150
|
-
onOomKill: () => {
|
|
151
|
-
updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
152
|
-
},
|
|
153
|
-
});
|
|
154
|
-
// Resolve effective model: explicit request > agent record > null (SDK default)
|
|
155
|
-
const effectiveModel = model || agentRecord.model || undefined;
|
|
156
|
-
// Build session-level SDK config (persisted on session, injected into every query)
|
|
157
|
-
const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
|
|
158
|
-
? { allowedTools, disallowedTools, betas, subagents, initialAgent }
|
|
159
|
-
: null;
|
|
160
|
-
const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
|
|
161
|
-
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
162
|
-
await updateSessionRunner(sessionId, effectiveRunnerId);
|
|
163
|
-
await updateSessionStatus(sessionId, 'active');
|
|
164
|
-
// Record lifecycle event
|
|
165
|
-
insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
166
|
-
telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
|
|
167
|
-
return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
|
|
168
|
-
}
|
|
169
|
-
catch (err) {
|
|
170
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
171
|
-
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
172
|
-
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
199
|
+
finally {
|
|
200
|
+
span.end();
|
|
173
201
|
}
|
|
174
|
-
|
|
175
|
-
}
|
|
202
|
+
});
|
|
176
203
|
});
|
|
177
204
|
// List sessions (optional ?agent=name filter)
|
|
178
205
|
app.get('/api/sessions', {
|
|
@@ -223,7 +250,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
223
250
|
body: {
|
|
224
251
|
type: 'object',
|
|
225
252
|
properties: {
|
|
226
|
-
model: { type: 'string', description: 'Model override for subsequent queries.' },
|
|
253
|
+
model: { type: 'string', maxLength: 100, description: 'Model override for subsequent queries.' },
|
|
227
254
|
allowedTools: { type: 'array', items: { type: 'string' }, description: 'Whitelist of allowed tool names.' },
|
|
228
255
|
disallowedTools: { type: 'array', items: { type: 'string' }, description: 'Blacklist of disallowed tool names.' },
|
|
229
256
|
betas: { type: 'array', items: { type: 'string' }, description: 'Beta feature flags.' },
|
|
@@ -334,9 +361,9 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
334
361
|
body: {
|
|
335
362
|
type: 'object',
|
|
336
363
|
properties: {
|
|
337
|
-
content: { type: 'string' },
|
|
364
|
+
content: { type: 'string', maxLength: 100_000 },
|
|
338
365
|
includePartialMessages: { type: 'boolean' },
|
|
339
|
-
model: { type: 'string', description: 'Model override for this query. Overrides session and agent defaults.' },
|
|
366
|
+
model: { type: 'string', maxLength: 100, description: 'Model override for this query. Overrides session and agent defaults.' },
|
|
340
367
|
maxTurns: { type: 'integer', minimum: 1, description: 'Maximum agentic turns for this query.' },
|
|
341
368
|
maxBudgetUsd: { type: 'number', minimum: 0, description: 'Maximum budget in USD for this query.' },
|
|
342
369
|
effort: { type: 'string', enum: ['low', 'medium', 'high', 'max'], description: 'Effort level for this query.' },
|
|
@@ -366,17 +393,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
366
393
|
return reply.status(400).send({ error: `Session is ${session.status}`, statusCode: 400 });
|
|
367
394
|
}
|
|
368
395
|
const { content, includePartialMessages, model: messageModel, maxTurns, maxBudgetUsd, effort, thinking, outputFormat } = req.body;
|
|
396
|
+
const messageSpan = tracer.startSpan('ash.session.message', {
|
|
397
|
+
attributes: {
|
|
398
|
+
'ash.session.id': session.id,
|
|
399
|
+
'ash.agent.name': session.agentName,
|
|
400
|
+
'ash.sandbox.id': session.sandboxId,
|
|
401
|
+
},
|
|
402
|
+
});
|
|
403
|
+
const messageCtx = trace.setSpan(context.active(), messageSpan);
|
|
404
|
+
if (session.model)
|
|
405
|
+
messageSpan.setAttribute('ash.model', session.model);
|
|
369
406
|
let backend;
|
|
370
407
|
try {
|
|
371
408
|
backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
372
409
|
}
|
|
373
410
|
catch {
|
|
374
411
|
await updateSessionStatus(session.id, 'error');
|
|
412
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Runner not available' });
|
|
413
|
+
messageSpan.end();
|
|
375
414
|
return reply.status(500).send({ error: 'Runner not available', statusCode: 500 });
|
|
376
415
|
}
|
|
377
416
|
const sandbox = backend.getSandbox(session.sandboxId);
|
|
378
417
|
if (!sandbox) {
|
|
379
418
|
await updateSessionStatus(session.id, 'error');
|
|
419
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Sandbox not found' });
|
|
420
|
+
messageSpan.end();
|
|
380
421
|
return reply.status(500).send({ error: 'Sandbox not found', statusCode: 500 });
|
|
381
422
|
}
|
|
382
423
|
const lookupMs = elapsed?.() ?? 0;
|
|
@@ -396,6 +437,10 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
396
437
|
await writeSSE(reply.raw, `event: session_start\ndata: ${JSON.stringify({ sessionId: session.id, version: VERSION })}\n\n`);
|
|
397
438
|
let eventCount = 0;
|
|
398
439
|
let firstEventMs = 0;
|
|
440
|
+
// Inject trace context for bridge propagation
|
|
441
|
+
const carrier = {};
|
|
442
|
+
propagation.inject(messageCtx, carrier);
|
|
443
|
+
const traceContext = carrier['traceparent'];
|
|
399
444
|
try {
|
|
400
445
|
// Model precedence: per-message > session-level > agent default (.claude/settings.json)
|
|
401
446
|
const queryModel = messageModel || session.model || undefined;
|
|
@@ -419,6 +464,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
419
464
|
...(cfg?.betas && { betas: cfg.betas }),
|
|
420
465
|
...(cfg?.subagents && { subagents: cfg.subagents }),
|
|
421
466
|
...(cfg?.initialAgent && { initialAgent: cfg.initialAgent }),
|
|
467
|
+
// Distributed tracing context
|
|
468
|
+
...(traceContext && { traceContext }),
|
|
422
469
|
});
|
|
423
470
|
for await (const event of events) {
|
|
424
471
|
eventCount++;
|
|
@@ -471,6 +518,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
471
518
|
}
|
|
472
519
|
catch (err) {
|
|
473
520
|
const msg = err instanceof Error ? err.message : String(err);
|
|
521
|
+
messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
522
|
+
messageSpan.recordException(err instanceof Error ? err : new Error(msg));
|
|
474
523
|
reply.raw.write(`event: error\ndata: ${JSON.stringify({ error: msg })}\n\n`);
|
|
475
524
|
// Signal stream completion — the session remains active (not ended)
|
|
476
525
|
reply.raw.write(`event: done\ndata: ${JSON.stringify({ sessionId: session.id })}\n\n`);
|
|
@@ -478,6 +527,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
478
527
|
finally {
|
|
479
528
|
// Mark waiting after message processing completes
|
|
480
529
|
backend.markWaiting(session.sandboxId);
|
|
530
|
+
messageSpan.end();
|
|
481
531
|
}
|
|
482
532
|
if (elapsed) {
|
|
483
533
|
logTiming({
|
|
@@ -623,23 +673,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
623
673
|
},
|
|
624
674
|
},
|
|
625
675
|
}, async (req, reply) => {
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
676
|
+
return tracer.startActiveSpan('ash.session.pause', async (span) => {
|
|
677
|
+
try {
|
|
678
|
+
const session = await getSession(req.params.id);
|
|
679
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
680
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
681
|
+
}
|
|
682
|
+
span.setAttribute('ash.session.id', session.id);
|
|
683
|
+
if (session.status !== 'active') {
|
|
684
|
+
return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
|
|
685
|
+
}
|
|
686
|
+
// Best-effort persist state before pausing
|
|
687
|
+
try {
|
|
688
|
+
const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
689
|
+
backend.persistState(session.sandboxId, session.id, session.agentName);
|
|
690
|
+
}
|
|
691
|
+
catch { /* runner may be gone */ }
|
|
692
|
+
await updateSessionStatus(session.id, 'paused');
|
|
693
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
694
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
|
|
695
|
+
return reply.send({ session: { ...session, status: 'paused' } });
|
|
696
|
+
}
|
|
697
|
+
finally {
|
|
698
|
+
span.end();
|
|
699
|
+
}
|
|
700
|
+
});
|
|
643
701
|
});
|
|
644
702
|
// Stop session — explicit user action (distinct from pause which is idle-based)
|
|
645
703
|
app.post('/api/sessions/:id/stop', {
|
|
@@ -653,24 +711,32 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
653
711
|
},
|
|
654
712
|
},
|
|
655
713
|
}, async (req, reply) => {
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
714
|
+
return tracer.startActiveSpan('ash.session.stop', async (span) => {
|
|
715
|
+
try {
|
|
716
|
+
const session = await getSession(req.params.id);
|
|
717
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
718
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
719
|
+
}
|
|
720
|
+
span.setAttribute('ash.session.id', session.id);
|
|
721
|
+
if (session.status !== 'active' && session.status !== 'starting') {
|
|
722
|
+
return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
|
|
723
|
+
}
|
|
724
|
+
// Persist state and destroy sandbox
|
|
725
|
+
try {
|
|
726
|
+
const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
727
|
+
backend.persistState(session.sandboxId, session.id, session.agentName);
|
|
728
|
+
await backend.destroySandbox(session.sandboxId);
|
|
729
|
+
}
|
|
730
|
+
catch { /* runner may be gone */ }
|
|
731
|
+
await updateSessionStatus(session.id, 'stopped');
|
|
732
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
733
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
|
|
734
|
+
return reply.send({ session: { ...session, status: 'stopped' } });
|
|
735
|
+
}
|
|
736
|
+
finally {
|
|
737
|
+
span.end();
|
|
738
|
+
}
|
|
739
|
+
});
|
|
674
740
|
});
|
|
675
741
|
// Fork session — create a new session branching from parent's state and messages
|
|
676
742
|
app.post('/api/sessions/:id/fork', {
|
|
@@ -756,104 +822,117 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
|
|
|
756
822
|
},
|
|
757
823
|
},
|
|
758
824
|
}, async (req, reply) => {
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
}
|
|
769
|
-
// Resumable statuses: 'paused', 'stopped', 'error', 'starting'
|
|
770
|
-
const agentRecord = await getAgent(session.agentName, req.tenantId);
|
|
771
|
-
if (!agentRecord) {
|
|
772
|
-
return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
|
|
773
|
-
}
|
|
774
|
-
// Validate agent directory exists — auto-restore from cloud if missing
|
|
775
|
-
if (!existsSync(agentRecord.path)) {
|
|
776
|
-
const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
|
|
777
|
-
if (!restored) {
|
|
778
|
-
return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
|
|
779
|
-
}
|
|
780
|
-
console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
|
|
781
|
-
}
|
|
782
|
-
// Fast path: try the same runner if sandbox is still alive
|
|
783
|
-
try {
|
|
784
|
-
const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
785
|
-
if (oldBackend.isSandboxAlive(session.sandboxId)) {
|
|
786
|
-
oldBackend.recordWarmHit();
|
|
787
|
-
logResume('warm', session.id, session.agentName);
|
|
788
|
-
await updateSessionStatus(session.id, 'active');
|
|
789
|
-
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
790
|
-
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
|
|
791
|
-
return reply.send({ session: { ...session, status: 'active' } });
|
|
792
|
-
}
|
|
793
|
-
}
|
|
794
|
-
catch { /* runner gone — cold path */ }
|
|
795
|
-
// Cold path: pick any healthy runner
|
|
796
|
-
try {
|
|
797
|
-
const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
|
|
798
|
-
const workspaceExists = existsSync(oldWorkspaceDir);
|
|
799
|
-
let resumeSource = 'fresh';
|
|
800
|
-
if (!workspaceExists) {
|
|
801
|
-
if (hasPersistedState(dataDir, session.id, session.tenantId)) {
|
|
802
|
-
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
803
|
-
resumeSource = 'local';
|
|
825
|
+
return tracer.startActiveSpan('ash.session.resume', async (span) => {
|
|
826
|
+
try {
|
|
827
|
+
const session = await getSession(req.params.id);
|
|
828
|
+
if (!session || session.tenantId !== req.tenantId) {
|
|
829
|
+
return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
|
|
830
|
+
}
|
|
831
|
+
span.setAttribute('ash.session.id', session.id);
|
|
832
|
+
if (session.status === 'ended') {
|
|
833
|
+
return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
|
|
804
834
|
}
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
835
|
+
if (session.status === 'active') {
|
|
836
|
+
return reply.send({ session });
|
|
837
|
+
}
|
|
838
|
+
// Resumable statuses: 'paused', 'stopped', 'error', 'starting'
|
|
839
|
+
const agentRecord = await getAgent(session.agentName, req.tenantId);
|
|
840
|
+
if (!agentRecord) {
|
|
841
|
+
return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
|
|
842
|
+
}
|
|
843
|
+
// Validate agent directory exists — auto-restore from cloud if missing
|
|
844
|
+
if (!existsSync(agentRecord.path)) {
|
|
845
|
+
const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
|
|
846
|
+
if (!restored) {
|
|
847
|
+
return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
|
|
811
848
|
}
|
|
849
|
+
console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
|
|
850
|
+
}
|
|
851
|
+
// Fast path: try the same runner if sandbox is still alive
|
|
852
|
+
try {
|
|
853
|
+
const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
|
|
854
|
+
if (oldBackend.isSandboxAlive(session.sandboxId)) {
|
|
855
|
+
oldBackend.recordWarmHit();
|
|
856
|
+
logResume('warm', session.id, session.agentName);
|
|
857
|
+
span.setAttribute('ash.resume.path', 'warm');
|
|
858
|
+
await updateSessionStatus(session.id, 'active');
|
|
859
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
860
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
|
|
861
|
+
return reply.send({ session: { ...session, status: 'active' } });
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
catch { /* runner gone — cold path */ }
|
|
865
|
+
// Cold path: pick any healthy runner
|
|
866
|
+
try {
|
|
867
|
+
const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
|
|
868
|
+
const workspaceExists = existsSync(oldWorkspaceDir);
|
|
869
|
+
let resumeSource = 'fresh';
|
|
870
|
+
if (!workspaceExists) {
|
|
871
|
+
if (hasPersistedState(dataDir, session.id, session.tenantId)) {
|
|
872
|
+
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
873
|
+
resumeSource = 'local';
|
|
874
|
+
}
|
|
875
|
+
else {
|
|
876
|
+
// Fall back to cloud storage
|
|
877
|
+
const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
|
|
878
|
+
if (restored) {
|
|
879
|
+
restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
|
|
880
|
+
resumeSource = 'cloud';
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
}
|
|
884
|
+
else {
|
|
885
|
+
resumeSource = 'local';
|
|
886
|
+
}
|
|
887
|
+
span.setAttribute('ash.resume.path', 'cold');
|
|
888
|
+
span.setAttribute('ash.resume.source', resumeSource);
|
|
889
|
+
const workspaceAvailable = existsSync(oldWorkspaceDir);
|
|
890
|
+
const { backend, runnerId } = await coordinator.selectBackend();
|
|
891
|
+
const handle = await backend.createSandbox({
|
|
892
|
+
sessionId: session.id,
|
|
893
|
+
agentDir: agentRecord.path,
|
|
894
|
+
agentName: session.agentName,
|
|
895
|
+
sandboxId: session.id,
|
|
896
|
+
skipAgentCopy: workspaceAvailable,
|
|
897
|
+
onOomKill: () => {
|
|
898
|
+
updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
899
|
+
},
|
|
900
|
+
});
|
|
901
|
+
// Track resume source
|
|
902
|
+
switch (resumeSource) {
|
|
903
|
+
case 'local':
|
|
904
|
+
backend.recordColdLocalHit();
|
|
905
|
+
break;
|
|
906
|
+
case 'cloud':
|
|
907
|
+
backend.recordColdCloudHit();
|
|
908
|
+
break;
|
|
909
|
+
case 'fresh':
|
|
910
|
+
backend.recordColdFreshHit();
|
|
911
|
+
break;
|
|
912
|
+
}
|
|
913
|
+
logResume('cold', session.id, session.agentName, resumeSource);
|
|
914
|
+
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
915
|
+
await updateSessionSandbox(session.id, handle.sandboxId);
|
|
916
|
+
await updateSessionRunner(session.id, effectiveRunnerId);
|
|
917
|
+
await updateSessionStatus(session.id, 'active');
|
|
918
|
+
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
919
|
+
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
|
|
920
|
+
return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
|
|
921
|
+
}
|
|
922
|
+
catch (err) {
|
|
923
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
924
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
|
|
925
|
+
span.recordException(err instanceof Error ? err : new Error(msg));
|
|
926
|
+
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
927
|
+
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
928
|
+
}
|
|
929
|
+
return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
|
|
812
930
|
}
|
|
813
931
|
}
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
}
|
|
817
|
-
const workspaceAvailable = existsSync(oldWorkspaceDir);
|
|
818
|
-
const { backend, runnerId } = await coordinator.selectBackend();
|
|
819
|
-
const handle = await backend.createSandbox({
|
|
820
|
-
sessionId: session.id,
|
|
821
|
-
agentDir: agentRecord.path,
|
|
822
|
-
agentName: session.agentName,
|
|
823
|
-
sandboxId: session.id,
|
|
824
|
-
skipAgentCopy: workspaceAvailable,
|
|
825
|
-
onOomKill: () => {
|
|
826
|
-
updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
|
|
827
|
-
},
|
|
828
|
-
});
|
|
829
|
-
// Track resume source
|
|
830
|
-
switch (resumeSource) {
|
|
831
|
-
case 'local':
|
|
832
|
-
backend.recordColdLocalHit();
|
|
833
|
-
break;
|
|
834
|
-
case 'cloud':
|
|
835
|
-
backend.recordColdCloudHit();
|
|
836
|
-
break;
|
|
837
|
-
case 'fresh':
|
|
838
|
-
backend.recordColdFreshHit();
|
|
839
|
-
break;
|
|
840
|
-
}
|
|
841
|
-
logResume('cold', session.id, session.agentName, resumeSource);
|
|
842
|
-
const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
|
|
843
|
-
await updateSessionSandbox(session.id, handle.sandboxId);
|
|
844
|
-
await updateSessionRunner(session.id, effectiveRunnerId);
|
|
845
|
-
await updateSessionStatus(session.id, 'active');
|
|
846
|
-
insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
|
|
847
|
-
telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
|
|
848
|
-
return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
|
|
849
|
-
}
|
|
850
|
-
catch (err) {
|
|
851
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
852
|
-
if (msg.includes('capacity reached') || msg.includes('No runners available')) {
|
|
853
|
-
return reply.status(503).send({ error: msg, statusCode: 503 });
|
|
932
|
+
finally {
|
|
933
|
+
span.end();
|
|
854
934
|
}
|
|
855
|
-
|
|
856
|
-
}
|
|
935
|
+
});
|
|
857
936
|
});
|
|
858
937
|
// End session
|
|
859
938
|
app.delete('/api/sessions/:id', {
|