@ash-ai/server 0.0.23 → 0.0.25

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,8 @@ import { restoreSessionState, hasPersistedState, restoreStateFromCloud, restoreA
9
9
  import { decryptCredential } from './credentials.js';
10
10
  import { touchCredentialUsed } from '../db/index.js';
11
11
  import { recordUsageFromMessage } from '../usage/extractor.js';
12
+ import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';
13
+ const tracer = trace.getTracer('ash-coordinator');
12
14
  /** Structured log line for every resume — always on, not gated by ASH_DEBUG_TIMING. */
13
15
  function logResume(path, sessionId, agentName, source) {
14
16
  process.stderr.write(JSON.stringify({
@@ -60,16 +62,22 @@ export async function writeSSE(raw, frame) {
60
62
  export function sessionRoutes(app, coordinator, dataDir, telemetry) {
61
63
  // Create session — picks the best runner via coordinator
62
64
  app.post('/api/sessions', {
65
+ config: {
66
+ rateLimit: {
67
+ max: 20,
68
+ timeWindow: '15 minutes',
69
+ },
70
+ },
63
71
  schema: {
64
72
  tags: ['sessions'],
65
73
  body: {
66
74
  type: 'object',
67
75
  properties: {
68
- agent: { type: 'string' },
69
- credentialId: { type: 'string' },
76
+ agent: { type: 'string', minLength: 1, maxLength: 255 },
77
+ credentialId: { type: 'string', maxLength: 255 },
70
78
  extraEnv: { type: 'object', additionalProperties: { type: 'string' } },
71
- startupScript: { type: 'string' },
72
- model: { type: 'string', description: 'Model override for this session. Overrides agent .claude/settings.json default.' },
79
+ startupScript: { type: 'string', maxLength: 100_000 },
80
+ model: { type: 'string', maxLength: 100, description: 'Model override for this session. Overrides agent .claude/settings.json default.' },
73
81
  mcpServers: {
74
82
  type: 'object',
75
83
  description: 'Per-session MCP servers. Merged into agent .mcp.json (session overrides agent). Enables sidecar pattern.',
@@ -83,7 +91,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
83
91
  },
84
92
  },
85
93
  },
86
- systemPrompt: { type: 'string', description: 'System prompt override. Replaces agent CLAUDE.md for this session.' },
94
+ systemPrompt: { type: 'string', maxLength: 1_000_000, description: 'System prompt override. Replaces agent CLAUDE.md for this session.' },
87
95
  permissionMode: { type: 'string', enum: ['bypassPermissions', 'permissionsByAgent', 'default'], description: 'Permission mode for the SDK inside the sandbox. Defaults to bypassPermissions (sandbox isolation is the security boundary).' },
88
96
  allowedTools: { type: 'array', items: { type: 'string' }, description: 'Whitelist of allowed tool names for this session.' },
89
97
  disallowedTools: { type: 'array', items: { type: 'string' }, description: 'Blacklist of disallowed tool names for this session.' },
@@ -104,75 +112,94 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
104
112
  },
105
113
  }, async (req, reply) => {
106
114
  const { agent, credentialId, extraEnv: bodyExtraEnv, startupScript, model, mcpServers, systemPrompt, permissionMode, allowedTools, disallowedTools, betas, subagents, initialAgent } = req.body;
107
- const agentRecord = await getAgent(agent, req.tenantId);
108
- if (!agentRecord) {
109
- return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
110
- }
111
- // Validate agent directory exists on disk — auto-restore from cloud if missing
112
- if (!existsSync(agentRecord.path)) {
113
- const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
114
- if (!restored) {
115
- return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${agent}" may need to be re-deployed.`, statusCode: 422 });
116
- }
117
- console.log(`[sessions] Restored agent "${agent}" from cloud storage`);
118
- }
119
- // Resolve credential to env vars if provided
120
- let extraEnv;
121
- if (credentialId) {
122
- const cred = await decryptCredential(credentialId, req.tenantId);
123
- if (!cred) {
124
- return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
115
+ return tracer.startActiveSpan('ash.session.create', { attributes: { 'ash.agent.name': agent } }, async (span) => {
116
+ try {
117
+ const agentRecord = await getAgent(agent, req.tenantId);
118
+ if (!agentRecord) {
119
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent not found' });
120
+ return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
121
+ }
122
+ // Validate agent directory exists on disk — auto-restore from cloud if missing
123
+ if (!existsSync(agentRecord.path)) {
124
+ const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
125
+ if (!restored) {
126
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent directory not found' });
127
+ return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${agent}" may need to be re-deployed.`, statusCode: 422 });
128
+ }
129
+ console.log(`[sessions] Restored agent "${agent}" from cloud storage`);
130
+ }
131
+ // Resolve credential to env vars if provided
132
+ let extraEnv;
133
+ if (credentialId) {
134
+ const cred = await decryptCredential(credentialId, req.tenantId);
135
+ if (!cred) {
136
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid credential' });
137
+ return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
138
+ }
139
+ const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
140
+ extraEnv = { [envKey]: cred.key };
141
+ touchCredentialUsed(credentialId).catch(() => { });
142
+ }
143
+ // Merge body-level extraEnv (overrides credential env on conflict)
144
+ if (bodyExtraEnv) {
145
+ extraEnv = { ...extraEnv, ...bodyExtraEnv };
146
+ }
147
+ // Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
148
+ if (permissionMode) {
149
+ extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
150
+ }
151
+ const sessionId = randomUUID();
152
+ span.setAttribute('ash.session.id', sessionId);
153
+ try {
154
+ span.addEvent('selectBackend.start');
155
+ const { backend, runnerId } = await coordinator.selectBackend();
156
+ span.addEvent('selectBackend.end');
157
+ span.addEvent('createSandbox.start');
158
+ const handle = await backend.createSandbox({
159
+ sessionId,
160
+ agentDir: agentRecord.path,
161
+ agentName: agentRecord.name,
162
+ sandboxId: sessionId,
163
+ extraEnv,
164
+ startupScript,
165
+ mcpServers,
166
+ systemPrompt,
167
+ onOomKill: () => {
168
+ updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
169
+ },
170
+ });
171
+ span.addEvent('createSandbox.end');
172
+ // Resolve effective model: explicit request > agent record > null (SDK default)
173
+ const effectiveModel = model || agentRecord.model || undefined;
174
+ if (effectiveModel)
175
+ span.setAttribute('ash.model', effectiveModel);
176
+ // Build session-level SDK config (persisted on session, injected into every query)
177
+ const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
178
+ ? { allowedTools, disallowedTools, betas, subagents, initialAgent }
179
+ : null;
180
+ const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
181
+ const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
182
+ await updateSessionRunner(sessionId, effectiveRunnerId);
183
+ await updateSessionStatus(sessionId, 'active');
184
+ // Record lifecycle event
185
+ insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
186
+ telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
187
+ return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
188
+ }
189
+ catch (err) {
190
+ const msg = err instanceof Error ? err.message : String(err);
191
+ span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
192
+ span.recordException(err instanceof Error ? err : new Error(msg));
193
+ if (msg.includes('capacity reached') || msg.includes('No runners available')) {
194
+ return reply.status(503).send({ error: msg, statusCode: 503 });
195
+ }
196
+ return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
197
+ }
125
198
  }
126
- const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
127
- extraEnv = { [envKey]: cred.key };
128
- touchCredentialUsed(credentialId).catch(() => { });
129
- }
130
- // Merge body-level extraEnv (overrides credential env on conflict)
131
- if (bodyExtraEnv) {
132
- extraEnv = { ...extraEnv, ...bodyExtraEnv };
133
- }
134
- // Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
135
- if (permissionMode) {
136
- extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
137
- }
138
- const sessionId = randomUUID();
139
- try {
140
- const { backend, runnerId } = await coordinator.selectBackend();
141
- const handle = await backend.createSandbox({
142
- sessionId,
143
- agentDir: agentRecord.path,
144
- agentName: agentRecord.name,
145
- sandboxId: sessionId,
146
- extraEnv,
147
- startupScript,
148
- mcpServers,
149
- systemPrompt,
150
- onOomKill: () => {
151
- updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
152
- },
153
- });
154
- // Resolve effective model: explicit request > agent record > null (SDK default)
155
- const effectiveModel = model || agentRecord.model || undefined;
156
- // Build session-level SDK config (persisted on session, injected into every query)
157
- const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
158
- ? { allowedTools, disallowedTools, betas, subagents, initialAgent }
159
- : null;
160
- const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
161
- const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
162
- await updateSessionRunner(sessionId, effectiveRunnerId);
163
- await updateSessionStatus(sessionId, 'active');
164
- // Record lifecycle event
165
- insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
166
- telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
167
- return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
168
- }
169
- catch (err) {
170
- const msg = err instanceof Error ? err.message : String(err);
171
- if (msg.includes('capacity reached') || msg.includes('No runners available')) {
172
- return reply.status(503).send({ error: msg, statusCode: 503 });
199
+ finally {
200
+ span.end();
173
201
  }
174
- return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
175
- }
202
+ });
176
203
  });
177
204
  // List sessions (optional ?agent=name filter)
178
205
  app.get('/api/sessions', {
@@ -223,7 +250,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
223
250
  body: {
224
251
  type: 'object',
225
252
  properties: {
226
- model: { type: 'string', description: 'Model override for subsequent queries.' },
253
+ model: { type: 'string', maxLength: 100, description: 'Model override for subsequent queries.' },
227
254
  allowedTools: { type: 'array', items: { type: 'string' }, description: 'Whitelist of allowed tool names.' },
228
255
  disallowedTools: { type: 'array', items: { type: 'string' }, description: 'Blacklist of disallowed tool names.' },
229
256
  betas: { type: 'array', items: { type: 'string' }, description: 'Beta feature flags.' },
@@ -334,9 +361,9 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
334
361
  body: {
335
362
  type: 'object',
336
363
  properties: {
337
- content: { type: 'string' },
364
+ content: { type: 'string', maxLength: 100_000 },
338
365
  includePartialMessages: { type: 'boolean' },
339
- model: { type: 'string', description: 'Model override for this query. Overrides session and agent defaults.' },
366
+ model: { type: 'string', maxLength: 100, description: 'Model override for this query. Overrides session and agent defaults.' },
340
367
  maxTurns: { type: 'integer', minimum: 1, description: 'Maximum agentic turns for this query.' },
341
368
  maxBudgetUsd: { type: 'number', minimum: 0, description: 'Maximum budget in USD for this query.' },
342
369
  effort: { type: 'string', enum: ['low', 'medium', 'high', 'max'], description: 'Effort level for this query.' },
@@ -366,17 +393,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
366
393
  return reply.status(400).send({ error: `Session is ${session.status}`, statusCode: 400 });
367
394
  }
368
395
  const { content, includePartialMessages, model: messageModel, maxTurns, maxBudgetUsd, effort, thinking, outputFormat } = req.body;
396
+ const messageSpan = tracer.startSpan('ash.session.message', {
397
+ attributes: {
398
+ 'ash.session.id': session.id,
399
+ 'ash.agent.name': session.agentName,
400
+ 'ash.sandbox.id': session.sandboxId,
401
+ },
402
+ });
403
+ const messageCtx = trace.setSpan(context.active(), messageSpan);
404
+ if (session.model)
405
+ messageSpan.setAttribute('ash.model', session.model);
369
406
  let backend;
370
407
  try {
371
408
  backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
372
409
  }
373
410
  catch {
374
411
  await updateSessionStatus(session.id, 'error');
412
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Runner not available' });
413
+ messageSpan.end();
375
414
  return reply.status(500).send({ error: 'Runner not available', statusCode: 500 });
376
415
  }
377
416
  const sandbox = backend.getSandbox(session.sandboxId);
378
417
  if (!sandbox) {
379
418
  await updateSessionStatus(session.id, 'error');
419
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Sandbox not found' });
420
+ messageSpan.end();
380
421
  return reply.status(500).send({ error: 'Sandbox not found', statusCode: 500 });
381
422
  }
382
423
  const lookupMs = elapsed?.() ?? 0;
@@ -396,6 +437,10 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
396
437
  await writeSSE(reply.raw, `event: session_start\ndata: ${JSON.stringify({ sessionId: session.id, version: VERSION })}\n\n`);
397
438
  let eventCount = 0;
398
439
  let firstEventMs = 0;
440
+ // Inject trace context for bridge propagation
441
+ const carrier = {};
442
+ propagation.inject(messageCtx, carrier);
443
+ const traceContext = carrier['traceparent'];
399
444
  try {
400
445
  // Model precedence: per-message > session-level > agent default (.claude/settings.json)
401
446
  const queryModel = messageModel || session.model || undefined;
@@ -419,6 +464,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
419
464
  ...(cfg?.betas && { betas: cfg.betas }),
420
465
  ...(cfg?.subagents && { subagents: cfg.subagents }),
421
466
  ...(cfg?.initialAgent && { initialAgent: cfg.initialAgent }),
467
+ // Distributed tracing context
468
+ ...(traceContext && { traceContext }),
422
469
  });
423
470
  for await (const event of events) {
424
471
  eventCount++;
@@ -471,6 +518,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
471
518
  }
472
519
  catch (err) {
473
520
  const msg = err instanceof Error ? err.message : String(err);
521
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
522
+ messageSpan.recordException(err instanceof Error ? err : new Error(msg));
474
523
  reply.raw.write(`event: error\ndata: ${JSON.stringify({ error: msg })}\n\n`);
475
524
  // Signal stream completion — the session remains active (not ended)
476
525
  reply.raw.write(`event: done\ndata: ${JSON.stringify({ sessionId: session.id })}\n\n`);
@@ -478,6 +527,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
478
527
  finally {
479
528
  // Mark waiting after message processing completes
480
529
  backend.markWaiting(session.sandboxId);
530
+ messageSpan.end();
481
531
  }
482
532
  if (elapsed) {
483
533
  logTiming({
@@ -623,23 +673,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
623
673
  },
624
674
  },
625
675
  }, async (req, reply) => {
626
- const session = await getSession(req.params.id);
627
- if (!session || session.tenantId !== req.tenantId) {
628
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
629
- }
630
- if (session.status !== 'active') {
631
- return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
632
- }
633
- // Best-effort persist state before pausing
634
- try {
635
- const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
636
- backend.persistState(session.sandboxId, session.id, session.agentName);
637
- }
638
- catch { /* runner may be gone */ }
639
- await updateSessionStatus(session.id, 'paused');
640
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
641
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
642
- return reply.send({ session: { ...session, status: 'paused' } });
676
+ return tracer.startActiveSpan('ash.session.pause', async (span) => {
677
+ try {
678
+ const session = await getSession(req.params.id);
679
+ if (!session || session.tenantId !== req.tenantId) {
680
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
681
+ }
682
+ span.setAttribute('ash.session.id', session.id);
683
+ if (session.status !== 'active') {
684
+ return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
685
+ }
686
+ // Best-effort persist state before pausing
687
+ try {
688
+ const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
689
+ backend.persistState(session.sandboxId, session.id, session.agentName);
690
+ }
691
+ catch { /* runner may be gone */ }
692
+ await updateSessionStatus(session.id, 'paused');
693
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
694
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
695
+ return reply.send({ session: { ...session, status: 'paused' } });
696
+ }
697
+ finally {
698
+ span.end();
699
+ }
700
+ });
643
701
  });
644
702
  // Stop session — explicit user action (distinct from pause which is idle-based)
645
703
  app.post('/api/sessions/:id/stop', {
@@ -653,24 +711,32 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
653
711
  },
654
712
  },
655
713
  }, async (req, reply) => {
656
- const session = await getSession(req.params.id);
657
- if (!session || session.tenantId !== req.tenantId) {
658
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
659
- }
660
- if (session.status !== 'active' && session.status !== 'starting') {
661
- return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
662
- }
663
- // Persist state and destroy sandbox
664
- try {
665
- const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
666
- backend.persistState(session.sandboxId, session.id, session.agentName);
667
- await backend.destroySandbox(session.sandboxId);
668
- }
669
- catch { /* runner may be gone */ }
670
- await updateSessionStatus(session.id, 'stopped');
671
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
672
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
673
- return reply.send({ session: { ...session, status: 'stopped' } });
714
+ return tracer.startActiveSpan('ash.session.stop', async (span) => {
715
+ try {
716
+ const session = await getSession(req.params.id);
717
+ if (!session || session.tenantId !== req.tenantId) {
718
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
719
+ }
720
+ span.setAttribute('ash.session.id', session.id);
721
+ if (session.status !== 'active' && session.status !== 'starting') {
722
+ return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
723
+ }
724
+ // Persist state and destroy sandbox
725
+ try {
726
+ const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
727
+ backend.persistState(session.sandboxId, session.id, session.agentName);
728
+ await backend.destroySandbox(session.sandboxId);
729
+ }
730
+ catch { /* runner may be gone */ }
731
+ await updateSessionStatus(session.id, 'stopped');
732
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
733
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
734
+ return reply.send({ session: { ...session, status: 'stopped' } });
735
+ }
736
+ finally {
737
+ span.end();
738
+ }
739
+ });
674
740
  });
675
741
  // Fork session — create a new session branching from parent's state and messages
676
742
  app.post('/api/sessions/:id/fork', {
@@ -756,104 +822,117 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
756
822
  },
757
823
  },
758
824
  }, async (req, reply) => {
759
- const session = await getSession(req.params.id);
760
- if (!session || session.tenantId !== req.tenantId) {
761
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
762
- }
763
- if (session.status === 'ended') {
764
- return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
765
- }
766
- if (session.status === 'active') {
767
- return reply.send({ session });
768
- }
769
- // Resumable statuses: 'paused', 'stopped', 'error', 'starting'
770
- const agentRecord = await getAgent(session.agentName, req.tenantId);
771
- if (!agentRecord) {
772
- return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
773
- }
774
- // Validate agent directory exists — auto-restore from cloud if missing
775
- if (!existsSync(agentRecord.path)) {
776
- const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
777
- if (!restored) {
778
- return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
779
- }
780
- console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
781
- }
782
- // Fast path: try the same runner if sandbox is still alive
783
- try {
784
- const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
785
- if (oldBackend.isSandboxAlive(session.sandboxId)) {
786
- oldBackend.recordWarmHit();
787
- logResume('warm', session.id, session.agentName);
788
- await updateSessionStatus(session.id, 'active');
789
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
790
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
791
- return reply.send({ session: { ...session, status: 'active' } });
792
- }
793
- }
794
- catch { /* runner gone — cold path */ }
795
- // Cold path: pick any healthy runner
796
- try {
797
- const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
798
- const workspaceExists = existsSync(oldWorkspaceDir);
799
- let resumeSource = 'fresh';
800
- if (!workspaceExists) {
801
- if (hasPersistedState(dataDir, session.id, session.tenantId)) {
802
- restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
803
- resumeSource = 'local';
825
+ return tracer.startActiveSpan('ash.session.resume', async (span) => {
826
+ try {
827
+ const session = await getSession(req.params.id);
828
+ if (!session || session.tenantId !== req.tenantId) {
829
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
830
+ }
831
+ span.setAttribute('ash.session.id', session.id);
832
+ if (session.status === 'ended') {
833
+ return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
804
834
  }
805
- else {
806
- // Fall back to cloud storage
807
- const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
808
- if (restored) {
809
- restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
810
- resumeSource = 'cloud';
835
+ if (session.status === 'active') {
836
+ return reply.send({ session });
837
+ }
838
+ // Resumable statuses: 'paused', 'stopped', 'error', 'starting'
839
+ const agentRecord = await getAgent(session.agentName, req.tenantId);
840
+ if (!agentRecord) {
841
+ return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
842
+ }
843
+ // Validate agent directory exists — auto-restore from cloud if missing
844
+ if (!existsSync(agentRecord.path)) {
845
+ const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
846
+ if (!restored) {
847
+ return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
811
848
  }
849
+ console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
850
+ }
851
+ // Fast path: try the same runner if sandbox is still alive
852
+ try {
853
+ const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
854
+ if (oldBackend.isSandboxAlive(session.sandboxId)) {
855
+ oldBackend.recordWarmHit();
856
+ logResume('warm', session.id, session.agentName);
857
+ span.setAttribute('ash.resume.path', 'warm');
858
+ await updateSessionStatus(session.id, 'active');
859
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
860
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
861
+ return reply.send({ session: { ...session, status: 'active' } });
862
+ }
863
+ }
864
+ catch { /* runner gone — cold path */ }
865
+ // Cold path: pick any healthy runner
866
+ try {
867
+ const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
868
+ const workspaceExists = existsSync(oldWorkspaceDir);
869
+ let resumeSource = 'fresh';
870
+ if (!workspaceExists) {
871
+ if (hasPersistedState(dataDir, session.id, session.tenantId)) {
872
+ restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
873
+ resumeSource = 'local';
874
+ }
875
+ else {
876
+ // Fall back to cloud storage
877
+ const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
878
+ if (restored) {
879
+ restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
880
+ resumeSource = 'cloud';
881
+ }
882
+ }
883
+ }
884
+ else {
885
+ resumeSource = 'local';
886
+ }
887
+ span.setAttribute('ash.resume.path', 'cold');
888
+ span.setAttribute('ash.resume.source', resumeSource);
889
+ const workspaceAvailable = existsSync(oldWorkspaceDir);
890
+ const { backend, runnerId } = await coordinator.selectBackend();
891
+ const handle = await backend.createSandbox({
892
+ sessionId: session.id,
893
+ agentDir: agentRecord.path,
894
+ agentName: session.agentName,
895
+ sandboxId: session.id,
896
+ skipAgentCopy: workspaceAvailable,
897
+ onOomKill: () => {
898
+ updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
899
+ },
900
+ });
901
+ // Track resume source
902
+ switch (resumeSource) {
903
+ case 'local':
904
+ backend.recordColdLocalHit();
905
+ break;
906
+ case 'cloud':
907
+ backend.recordColdCloudHit();
908
+ break;
909
+ case 'fresh':
910
+ backend.recordColdFreshHit();
911
+ break;
912
+ }
913
+ logResume('cold', session.id, session.agentName, resumeSource);
914
+ const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
915
+ await updateSessionSandbox(session.id, handle.sandboxId);
916
+ await updateSessionRunner(session.id, effectiveRunnerId);
917
+ await updateSessionStatus(session.id, 'active');
918
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
919
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
920
+ return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
921
+ }
922
+ catch (err) {
923
+ const msg = err instanceof Error ? err.message : String(err);
924
+ span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
925
+ span.recordException(err instanceof Error ? err : new Error(msg));
926
+ if (msg.includes('capacity reached') || msg.includes('No runners available')) {
927
+ return reply.status(503).send({ error: msg, statusCode: 503 });
928
+ }
929
+ return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
812
930
  }
813
931
  }
814
- else {
815
- resumeSource = 'local';
816
- }
817
- const workspaceAvailable = existsSync(oldWorkspaceDir);
818
- const { backend, runnerId } = await coordinator.selectBackend();
819
- const handle = await backend.createSandbox({
820
- sessionId: session.id,
821
- agentDir: agentRecord.path,
822
- agentName: session.agentName,
823
- sandboxId: session.id,
824
- skipAgentCopy: workspaceAvailable,
825
- onOomKill: () => {
826
- updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
827
- },
828
- });
829
- // Track resume source
830
- switch (resumeSource) {
831
- case 'local':
832
- backend.recordColdLocalHit();
833
- break;
834
- case 'cloud':
835
- backend.recordColdCloudHit();
836
- break;
837
- case 'fresh':
838
- backend.recordColdFreshHit();
839
- break;
840
- }
841
- logResume('cold', session.id, session.agentName, resumeSource);
842
- const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
843
- await updateSessionSandbox(session.id, handle.sandboxId);
844
- await updateSessionRunner(session.id, effectiveRunnerId);
845
- await updateSessionStatus(session.id, 'active');
846
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
847
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
848
- return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
849
- }
850
- catch (err) {
851
- const msg = err instanceof Error ? err.message : String(err);
852
- if (msg.includes('capacity reached') || msg.includes('No runners available')) {
853
- return reply.status(503).send({ error: msg, statusCode: 503 });
932
+ finally {
933
+ span.end();
854
934
  }
855
- return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
856
- }
935
+ });
857
936
  });
858
937
  // End session
859
938
  app.delete('/api/sessions/:id', {