@ash-ai/server 0.0.19 → 0.0.24

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -5,10 +5,12 @@ import { SSE_WRITE_TIMEOUT_MS, timingEnabled, startTimer, logTiming } from '@ash
5
5
  import { getAgent, insertSession, insertForkedSession, getSession, listSessions, updateSessionStatus, updateSessionSandbox, updateSessionConfig, touchSession, updateSessionRunner, insertMessage, listMessages, insertSessionEvent, insertSessionEvents, listSessionEvents } from '../db/index.js';
6
6
  import { classifyBridgeMessage, classifyToStreamEvents } from '@ash-ai/shared';
7
7
  import { VERSION } from '../version.js';
8
- import { restoreSessionState, hasPersistedState, restoreStateFromCloud } from '@ash-ai/sandbox';
8
+ import { restoreSessionState, hasPersistedState, restoreStateFromCloud, restoreAgentFromCloud } from '@ash-ai/sandbox';
9
9
  import { decryptCredential } from './credentials.js';
10
10
  import { touchCredentialUsed } from '../db/index.js';
11
11
  import { recordUsageFromMessage } from '../usage/extractor.js';
12
+ import { trace, context, propagation, SpanStatusCode } from '@opentelemetry/api';
13
+ const tracer = trace.getTracer('ash-coordinator');
12
14
  /** Structured log line for every resume — always on, not gated by ASH_DEBUG_TIMING. */
13
15
  function logResume(path, sessionId, agentName, source) {
14
16
  process.stderr.write(JSON.stringify({
@@ -97,73 +99,101 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
97
99
  201: sessionResponse,
98
100
  400: { $ref: 'ApiError#' },
99
101
  404: { $ref: 'ApiError#' },
102
+ 422: { $ref: 'ApiError#' },
100
103
  500: { $ref: 'ApiError#' },
101
104
  503: { $ref: 'ApiError#' },
102
105
  },
103
106
  },
104
107
  }, async (req, reply) => {
105
108
  const { agent, credentialId, extraEnv: bodyExtraEnv, startupScript, model, mcpServers, systemPrompt, permissionMode, allowedTools, disallowedTools, betas, subagents, initialAgent } = req.body;
106
- const agentRecord = await getAgent(agent, req.tenantId);
107
- if (!agentRecord) {
108
- return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
109
- }
110
- // Resolve credential to env vars if provided
111
- let extraEnv;
112
- if (credentialId) {
113
- const cred = await decryptCredential(credentialId, req.tenantId);
114
- if (!cred) {
115
- return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
109
+ return tracer.startActiveSpan('ash.session.create', { attributes: { 'ash.agent.name': agent } }, async (span) => {
110
+ try {
111
+ const agentRecord = await getAgent(agent, req.tenantId);
112
+ if (!agentRecord) {
113
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent not found' });
114
+ return reply.status(404).send({ error: `Agent "${agent}" not found`, statusCode: 404 });
115
+ }
116
+ // Validate agent directory exists on disk — auto-restore from cloud if missing
117
+ if (!existsSync(agentRecord.path)) {
118
+ const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
119
+ if (!restored) {
120
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Agent directory not found' });
121
+ return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${agent}" may need to be re-deployed.`, statusCode: 422 });
122
+ }
123
+ console.log(`[sessions] Restored agent "${agent}" from cloud storage`);
124
+ }
125
+ // Resolve credential to env vars if provided
126
+ let extraEnv;
127
+ if (credentialId) {
128
+ const cred = await decryptCredential(credentialId, req.tenantId);
129
+ if (!cred) {
130
+ span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid credential' });
131
+ return reply.status(400).send({ error: 'Invalid or inaccessible credential', statusCode: 400 });
132
+ }
133
+ const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
134
+ extraEnv = { [envKey]: cred.key };
135
+ touchCredentialUsed(credentialId).catch(() => { });
136
+ }
137
+ // Merge body-level extraEnv (overrides credential env on conflict)
138
+ if (bodyExtraEnv) {
139
+ extraEnv = { ...extraEnv, ...bodyExtraEnv };
140
+ }
141
+ // Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
142
+ if (permissionMode) {
143
+ extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
144
+ }
145
+ const sessionId = randomUUID();
146
+ span.setAttribute('ash.session.id', sessionId);
147
+ try {
148
+ span.addEvent('selectBackend.start');
149
+ const { backend, runnerId } = await coordinator.selectBackend();
150
+ span.addEvent('selectBackend.end');
151
+ span.addEvent('createSandbox.start');
152
+ const handle = await backend.createSandbox({
153
+ sessionId,
154
+ agentDir: agentRecord.path,
155
+ agentName: agentRecord.name,
156
+ sandboxId: sessionId,
157
+ extraEnv,
158
+ startupScript,
159
+ mcpServers,
160
+ systemPrompt,
161
+ onOomKill: () => {
162
+ updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
163
+ },
164
+ });
165
+ span.addEvent('createSandbox.end');
166
+ // Resolve effective model: explicit request > agent record > null (SDK default)
167
+ const effectiveModel = model || agentRecord.model || undefined;
168
+ if (effectiveModel)
169
+ span.setAttribute('ash.model', effectiveModel);
170
+ // Build session-level SDK config (persisted on session, injected into every query)
171
+ const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
172
+ ? { allowedTools, disallowedTools, betas, subagents, initialAgent }
173
+ : null;
174
+ const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
175
+ const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
176
+ await updateSessionRunner(sessionId, effectiveRunnerId);
177
+ await updateSessionStatus(sessionId, 'active');
178
+ // Record lifecycle event
179
+ insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
180
+ telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
181
+ return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
182
+ }
183
+ catch (err) {
184
+ const msg = err instanceof Error ? err.message : String(err);
185
+ span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
186
+ span.recordException(err instanceof Error ? err : new Error(msg));
187
+ if (msg.includes('capacity reached') || msg.includes('No runners available')) {
188
+ return reply.status(503).send({ error: msg, statusCode: 503 });
189
+ }
190
+ return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
191
+ }
116
192
  }
117
- const envKey = cred.type === 'anthropic' ? 'ANTHROPIC_API_KEY' : cred.type === 'openai' ? 'OPENAI_API_KEY' : 'ASH_CUSTOM_API_KEY';
118
- extraEnv = { [envKey]: cred.key };
119
- touchCredentialUsed(credentialId).catch(() => { });
120
- }
121
- // Merge body-level extraEnv (overrides credential env on conflict)
122
- if (bodyExtraEnv) {
123
- extraEnv = { ...extraEnv, ...bodyExtraEnv };
124
- }
125
- // Inject permission mode into sandbox env (bridge reads ASH_PERMISSION_MODE)
126
- if (permissionMode) {
127
- extraEnv = { ...extraEnv, ASH_PERMISSION_MODE: permissionMode };
128
- }
129
- const sessionId = randomUUID();
130
- try {
131
- const { backend, runnerId } = await coordinator.selectBackend();
132
- const handle = await backend.createSandbox({
133
- sessionId,
134
- agentDir: agentRecord.path,
135
- agentName: agentRecord.name,
136
- sandboxId: sessionId,
137
- extraEnv,
138
- startupScript,
139
- mcpServers,
140
- systemPrompt,
141
- onOomKill: () => {
142
- updateSessionStatus(sessionId, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
143
- },
144
- });
145
- // Resolve effective model: explicit request > agent record > null (SDK default)
146
- const effectiveModel = model || agentRecord.model || undefined;
147
- // Build session-level SDK config (persisted on session, injected into every query)
148
- const sessionConfig = (allowedTools || disallowedTools || betas || subagents || initialAgent)
149
- ? { allowedTools, disallowedTools, betas, subagents, initialAgent }
150
- : null;
151
- const session = await insertSession(sessionId, agentRecord.name, handle.sandboxId, req.tenantId, undefined, effectiveModel, sessionConfig);
152
- const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
153
- await updateSessionRunner(sessionId, effectiveRunnerId);
154
- await updateSessionStatus(sessionId, 'active');
155
- // Record lifecycle event
156
- insertSessionEvent(sessionId, 'lifecycle', JSON.stringify({ action: 'created', agentName: agentRecord.name, model: effectiveModel }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
157
- telemetry.emit({ sessionId, agentName: agentRecord.name, type: 'lifecycle', data: { status: 'active', action: 'created' } });
158
- return reply.status(201).send({ session: { ...session, status: 'active', runnerId: effectiveRunnerId } });
159
- }
160
- catch (err) {
161
- const msg = err instanceof Error ? err.message : String(err);
162
- if (msg.includes('capacity reached') || msg.includes('No runners available')) {
163
- return reply.status(503).send({ error: msg, statusCode: 503 });
193
+ finally {
194
+ span.end();
164
195
  }
165
- return reply.status(500).send({ error: `Failed to create session: ${msg}`, statusCode: 500 });
166
- }
196
+ });
167
197
  });
168
198
  // List sessions (optional ?agent=name filter)
169
199
  app.get('/api/sessions', {
@@ -357,17 +387,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
357
387
  return reply.status(400).send({ error: `Session is ${session.status}`, statusCode: 400 });
358
388
  }
359
389
  const { content, includePartialMessages, model: messageModel, maxTurns, maxBudgetUsd, effort, thinking, outputFormat } = req.body;
390
+ const messageSpan = tracer.startSpan('ash.session.message', {
391
+ attributes: {
392
+ 'ash.session.id': session.id,
393
+ 'ash.agent.name': session.agentName,
394
+ 'ash.sandbox.id': session.sandboxId,
395
+ },
396
+ });
397
+ const messageCtx = trace.setSpan(context.active(), messageSpan);
398
+ if (session.model)
399
+ messageSpan.setAttribute('ash.model', session.model);
360
400
  let backend;
361
401
  try {
362
402
  backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
363
403
  }
364
404
  catch {
365
405
  await updateSessionStatus(session.id, 'error');
406
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Runner not available' });
407
+ messageSpan.end();
366
408
  return reply.status(500).send({ error: 'Runner not available', statusCode: 500 });
367
409
  }
368
410
  const sandbox = backend.getSandbox(session.sandboxId);
369
411
  if (!sandbox) {
370
412
  await updateSessionStatus(session.id, 'error');
413
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: 'Sandbox not found' });
414
+ messageSpan.end();
371
415
  return reply.status(500).send({ error: 'Sandbox not found', statusCode: 500 });
372
416
  }
373
417
  const lookupMs = elapsed?.() ?? 0;
@@ -387,6 +431,10 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
387
431
  await writeSSE(reply.raw, `event: session_start\ndata: ${JSON.stringify({ sessionId: session.id, version: VERSION })}\n\n`);
388
432
  let eventCount = 0;
389
433
  let firstEventMs = 0;
434
+ // Inject trace context for bridge propagation
435
+ const carrier = {};
436
+ propagation.inject(messageCtx, carrier);
437
+ const traceContext = carrier['traceparent'];
390
438
  try {
391
439
  // Model precedence: per-message > session-level > agent default (.claude/settings.json)
392
440
  const queryModel = messageModel || session.model || undefined;
@@ -410,6 +458,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
410
458
  ...(cfg?.betas && { betas: cfg.betas }),
411
459
  ...(cfg?.subagents && { subagents: cfg.subagents }),
412
460
  ...(cfg?.initialAgent && { initialAgent: cfg.initialAgent }),
461
+ // Distributed tracing context
462
+ ...(traceContext && { traceContext }),
413
463
  });
414
464
  for await (const event of events) {
415
465
  eventCount++;
@@ -462,6 +512,8 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
462
512
  }
463
513
  catch (err) {
464
514
  const msg = err instanceof Error ? err.message : String(err);
515
+ messageSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
516
+ messageSpan.recordException(err instanceof Error ? err : new Error(msg));
465
517
  reply.raw.write(`event: error\ndata: ${JSON.stringify({ error: msg })}\n\n`);
466
518
  // Signal stream completion — the session remains active (not ended)
467
519
  reply.raw.write(`event: done\ndata: ${JSON.stringify({ sessionId: session.id })}\n\n`);
@@ -469,6 +521,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
469
521
  finally {
470
522
  // Mark waiting after message processing completes
471
523
  backend.markWaiting(session.sandboxId);
524
+ messageSpan.end();
472
525
  }
473
526
  if (elapsed) {
474
527
  logTiming({
@@ -614,23 +667,31 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
614
667
  },
615
668
  },
616
669
  }, async (req, reply) => {
617
- const session = await getSession(req.params.id);
618
- if (!session || session.tenantId !== req.tenantId) {
619
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
620
- }
621
- if (session.status !== 'active') {
622
- return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
623
- }
624
- // Best-effort persist state before pausing
625
- try {
626
- const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
627
- backend.persistState(session.sandboxId, session.id, session.agentName);
628
- }
629
- catch { /* runner may be gone */ }
630
- await updateSessionStatus(session.id, 'paused');
631
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
632
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
633
- return reply.send({ session: { ...session, status: 'paused' } });
670
+ return tracer.startActiveSpan('ash.session.pause', async (span) => {
671
+ try {
672
+ const session = await getSession(req.params.id);
673
+ if (!session || session.tenantId !== req.tenantId) {
674
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
675
+ }
676
+ span.setAttribute('ash.session.id', session.id);
677
+ if (session.status !== 'active') {
678
+ return reply.status(400).send({ error: `Cannot pause session with status "${session.status}"`, statusCode: 400 });
679
+ }
680
+ // Best-effort persist state before pausing
681
+ try {
682
+ const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
683
+ backend.persistState(session.sandboxId, session.id, session.agentName);
684
+ }
685
+ catch { /* runner may be gone */ }
686
+ await updateSessionStatus(session.id, 'paused');
687
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'paused' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
688
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'paused' } });
689
+ return reply.send({ session: { ...session, status: 'paused' } });
690
+ }
691
+ finally {
692
+ span.end();
693
+ }
694
+ });
634
695
  });
635
696
  // Stop session — explicit user action (distinct from pause which is idle-based)
636
697
  app.post('/api/sessions/:id/stop', {
@@ -644,24 +705,32 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
644
705
  },
645
706
  },
646
707
  }, async (req, reply) => {
647
- const session = await getSession(req.params.id);
648
- if (!session || session.tenantId !== req.tenantId) {
649
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
650
- }
651
- if (session.status !== 'active' && session.status !== 'starting') {
652
- return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
653
- }
654
- // Persist state and destroy sandbox
655
- try {
656
- const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
657
- backend.persistState(session.sandboxId, session.id, session.agentName);
658
- await backend.destroySandbox(session.sandboxId);
659
- }
660
- catch { /* runner may be gone */ }
661
- await updateSessionStatus(session.id, 'stopped');
662
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
663
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
664
- return reply.send({ session: { ...session, status: 'stopped' } });
708
+ return tracer.startActiveSpan('ash.session.stop', async (span) => {
709
+ try {
710
+ const session = await getSession(req.params.id);
711
+ if (!session || session.tenantId !== req.tenantId) {
712
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
713
+ }
714
+ span.setAttribute('ash.session.id', session.id);
715
+ if (session.status !== 'active' && session.status !== 'starting') {
716
+ return reply.status(400).send({ error: `Cannot stop session with status "${session.status}"`, statusCode: 400 });
717
+ }
718
+ // Persist state and destroy sandbox
719
+ try {
720
+ const backend = await coordinator.getBackendForRunnerAsync(session.runnerId);
721
+ backend.persistState(session.sandboxId, session.id, session.agentName);
722
+ await backend.destroySandbox(session.sandboxId);
723
+ }
724
+ catch { /* runner may be gone */ }
725
+ await updateSessionStatus(session.id, 'stopped');
726
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'stopped' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
727
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'stopped' } });
728
+ return reply.send({ session: { ...session, status: 'stopped' } });
729
+ }
730
+ finally {
731
+ span.end();
732
+ }
733
+ });
665
734
  });
666
735
  // Fork session — create a new session branching from parent's state and messages
667
736
  app.post('/api/sessions/:id/fork', {
@@ -671,6 +740,7 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
671
740
  response: {
672
741
  201: sessionResponse,
673
742
  404: { $ref: 'ApiError#' },
743
+ 422: { $ref: 'ApiError#' },
674
744
  500: { $ref: 'ApiError#' },
675
745
  503: { $ref: 'ApiError#' },
676
746
  },
@@ -684,6 +754,13 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
684
754
  if (!agent) {
685
755
  return reply.status(404).send({ error: `Agent "${parentSession.agentName}" not found`, statusCode: 404 });
686
756
  }
757
+ if (!existsSync(agent.path)) {
758
+ const restored = await restoreAgentFromCloud(agent.name, agent.path, req.tenantId);
759
+ if (!restored) {
760
+ return reply.status(422).send({ error: `Agent directory not found at "${agent.path}". The agent "${parentSession.agentName}" may need to be re-deployed.`, statusCode: 422 });
761
+ }
762
+ console.log(`[sessions] Restored agent "${parentSession.agentName}" from cloud storage`);
763
+ }
687
764
  // Persist parent workspace state if sandbox is still live
688
765
  try {
689
766
  const parentBackend = await coordinator.getBackendForRunnerAsync(parentSession.runnerId);
@@ -733,101 +810,123 @@ export function sessionRoutes(app, coordinator, dataDir, telemetry) {
733
810
  200: sessionResponse,
734
811
  404: { $ref: 'ApiError#' },
735
812
  410: { $ref: 'ApiError#' },
813
+ 422: { $ref: 'ApiError#' },
736
814
  500: { $ref: 'ApiError#' },
737
815
  503: { $ref: 'ApiError#' },
738
816
  },
739
817
  },
740
818
  }, async (req, reply) => {
741
- const session = await getSession(req.params.id);
742
- if (!session || session.tenantId !== req.tenantId) {
743
- return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
744
- }
745
- if (session.status === 'ended') {
746
- return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
747
- }
748
- if (session.status === 'active') {
749
- return reply.send({ session });
750
- }
751
- // Resumable statuses: 'paused', 'stopped', 'error', 'starting'
752
- const agentRecord = await getAgent(session.agentName, req.tenantId);
753
- if (!agentRecord) {
754
- return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
755
- }
756
- // Fast path: try the same runner if sandbox is still alive
757
- try {
758
- const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
759
- if (oldBackend.isSandboxAlive(session.sandboxId)) {
760
- oldBackend.recordWarmHit();
761
- logResume('warm', session.id, session.agentName);
762
- await updateSessionStatus(session.id, 'active');
763
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
764
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
765
- return reply.send({ session: { ...session, status: 'active' } });
766
- }
767
- }
768
- catch { /* runner gone — cold path */ }
769
- // Cold path: pick any healthy runner
770
- try {
771
- const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
772
- const workspaceExists = existsSync(oldWorkspaceDir);
773
- let resumeSource = 'fresh';
774
- if (!workspaceExists) {
775
- if (hasPersistedState(dataDir, session.id, session.tenantId)) {
776
- restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
777
- resumeSource = 'local';
819
+ return tracer.startActiveSpan('ash.session.resume', async (span) => {
820
+ try {
821
+ const session = await getSession(req.params.id);
822
+ if (!session || session.tenantId !== req.tenantId) {
823
+ return reply.status(404).send({ error: 'Session not found', statusCode: 404 });
824
+ }
825
+ span.setAttribute('ash.session.id', session.id);
826
+ if (session.status === 'ended') {
827
+ return reply.status(410).send({ error: 'Session has ended — create a new session', statusCode: 410 });
828
+ }
829
+ if (session.status === 'active') {
830
+ return reply.send({ session });
831
+ }
832
+ // Resumable statuses: 'paused', 'stopped', 'error', 'starting'
833
+ const agentRecord = await getAgent(session.agentName, req.tenantId);
834
+ if (!agentRecord) {
835
+ return reply.status(404).send({ error: `Agent "${session.agentName}" not found`, statusCode: 404 });
778
836
  }
779
- else {
780
- // Fall back to cloud storage
781
- const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
782
- if (restored) {
783
- restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
784
- resumeSource = 'cloud';
837
+ // Validate agent directory exists — auto-restore from cloud if missing
838
+ if (!existsSync(agentRecord.path)) {
839
+ const restored = await restoreAgentFromCloud(agentRecord.name, agentRecord.path, req.tenantId);
840
+ if (!restored) {
841
+ return reply.status(422).send({ error: `Agent directory not found at "${agentRecord.path}". The agent "${session.agentName}" may need to be re-deployed.`, statusCode: 422 });
785
842
  }
843
+ console.log(`[sessions] Restored agent "${session.agentName}" from cloud storage`);
844
+ }
845
+ // Fast path: try the same runner if sandbox is still alive
846
+ try {
847
+ const oldBackend = await coordinator.getBackendForRunnerAsync(session.runnerId);
848
+ if (oldBackend.isSandboxAlive(session.sandboxId)) {
849
+ oldBackend.recordWarmHit();
850
+ logResume('warm', session.id, session.agentName);
851
+ span.setAttribute('ash.resume.path', 'warm');
852
+ await updateSessionStatus(session.id, 'active');
853
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'warm' }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
854
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'warm' } });
855
+ return reply.send({ session: { ...session, status: 'active' } });
856
+ }
857
+ }
858
+ catch { /* runner gone — cold path */ }
859
+ // Cold path: pick any healthy runner
860
+ try {
861
+ const oldWorkspaceDir = join(dataDir, 'sandboxes', session.id, 'workspace');
862
+ const workspaceExists = existsSync(oldWorkspaceDir);
863
+ let resumeSource = 'fresh';
864
+ if (!workspaceExists) {
865
+ if (hasPersistedState(dataDir, session.id, session.tenantId)) {
866
+ restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
867
+ resumeSource = 'local';
868
+ }
869
+ else {
870
+ // Fall back to cloud storage
871
+ const restored = await restoreStateFromCloud(dataDir, session.id, session.tenantId);
872
+ if (restored) {
873
+ restoreSessionState(dataDir, session.id, oldWorkspaceDir, session.tenantId);
874
+ resumeSource = 'cloud';
875
+ }
876
+ }
877
+ }
878
+ else {
879
+ resumeSource = 'local';
880
+ }
881
+ span.setAttribute('ash.resume.path', 'cold');
882
+ span.setAttribute('ash.resume.source', resumeSource);
883
+ const workspaceAvailable = existsSync(oldWorkspaceDir);
884
+ const { backend, runnerId } = await coordinator.selectBackend();
885
+ const handle = await backend.createSandbox({
886
+ sessionId: session.id,
887
+ agentDir: agentRecord.path,
888
+ agentName: session.agentName,
889
+ sandboxId: session.id,
890
+ skipAgentCopy: workspaceAvailable,
891
+ onOomKill: () => {
892
+ updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
893
+ },
894
+ });
895
+ // Track resume source
896
+ switch (resumeSource) {
897
+ case 'local':
898
+ backend.recordColdLocalHit();
899
+ break;
900
+ case 'cloud':
901
+ backend.recordColdCloudHit();
902
+ break;
903
+ case 'fresh':
904
+ backend.recordColdFreshHit();
905
+ break;
906
+ }
907
+ logResume('cold', session.id, session.agentName, resumeSource);
908
+ const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
909
+ await updateSessionSandbox(session.id, handle.sandboxId);
910
+ await updateSessionRunner(session.id, effectiveRunnerId);
911
+ await updateSessionStatus(session.id, 'active');
912
+ insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
913
+ telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
914
+ return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
915
+ }
916
+ catch (err) {
917
+ const msg = err instanceof Error ? err.message : String(err);
918
+ span.setStatus({ code: SpanStatusCode.ERROR, message: msg });
919
+ span.recordException(err instanceof Error ? err : new Error(msg));
920
+ if (msg.includes('capacity reached') || msg.includes('No runners available')) {
921
+ return reply.status(503).send({ error: msg, statusCode: 503 });
922
+ }
923
+ return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
786
924
  }
787
925
  }
788
- else {
789
- resumeSource = 'local';
790
- }
791
- const workspaceAvailable = existsSync(oldWorkspaceDir);
792
- const { backend, runnerId } = await coordinator.selectBackend();
793
- const handle = await backend.createSandbox({
794
- sessionId: session.id,
795
- agentDir: agentRecord.path,
796
- agentName: session.agentName,
797
- sandboxId: session.id,
798
- skipAgentCopy: workspaceAvailable,
799
- onOomKill: () => {
800
- updateSessionStatus(session.id, 'paused').catch((err) => console.error(`Failed to update session status on OOM: ${err}`));
801
- },
802
- });
803
- // Track resume source
804
- switch (resumeSource) {
805
- case 'local':
806
- backend.recordColdLocalHit();
807
- break;
808
- case 'cloud':
809
- backend.recordColdCloudHit();
810
- break;
811
- case 'fresh':
812
- backend.recordColdFreshHit();
813
- break;
814
- }
815
- logResume('cold', session.id, session.agentName, resumeSource);
816
- const effectiveRunnerId = runnerId === '__local__' ? null : runnerId;
817
- await updateSessionSandbox(session.id, handle.sandboxId);
818
- await updateSessionRunner(session.id, effectiveRunnerId);
819
- await updateSessionStatus(session.id, 'active');
820
- insertSessionEvent(session.id, 'lifecycle', JSON.stringify({ action: 'resumed', path: 'cold', source: resumeSource }), req.tenantId).catch((err) => console.error(`Failed to persist lifecycle event: ${err}`));
821
- telemetry.emit({ sessionId: session.id, agentName: session.agentName, type: 'lifecycle', data: { status: 'active', action: 'resumed', path: 'cold', source: resumeSource } });
822
- return reply.send({ session: { ...session, sandboxId: handle.sandboxId, status: 'active', runnerId: effectiveRunnerId } });
823
- }
824
- catch (err) {
825
- const msg = err instanceof Error ? err.message : String(err);
826
- if (msg.includes('capacity reached') || msg.includes('No runners available')) {
827
- return reply.status(503).send({ error: msg, statusCode: 503 });
926
+ finally {
927
+ span.end();
828
928
  }
829
- return reply.status(500).send({ error: `Failed to resume session: ${msg}`, statusCode: 500 });
830
- }
929
+ });
831
930
  });
832
931
  // End session
833
932
  app.delete('/api/sessions/:id', {