pilotswarm-sdk 0.1.32 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import { approvePermissionForSession } from "./permissions.js";
10
10
  import { formatSessionOwnerLabel, getSessionOwnerKind, matchesSessionOwnerFilters } from "./session-owner-utils.js";
11
11
  import { cmsRetryBestEffort, cmsRetryCritical } from "./cms-retry.js";
12
12
  import { computeCronAtNextFire } from "./cron-at.js";
13
+ import { SpanStatusCode, trace as otelTrace } from "@opentelemetry/api";
13
14
  import os from "node:os";
14
15
  import fs from "node:fs";
15
16
  import { randomUUID } from "node:crypto";
@@ -328,6 +329,14 @@ function buildUsageSummaryUpsert(data) {
328
329
  ...(tokensCacheWriteIncrement != null ? { tokensCacheWriteIncrement } : {}),
329
330
  };
330
331
  }
332
/**
 * Report whether a tool-completion event payload describes a failed tool run.
 *
 * The payload is passed through `normalizeEventData` first; a falsy result
 * is treated as "not a failure".
 *
 * @param {unknown} data - Raw event payload.
 * @returns {boolean} `true` when the normalized payload has
 *   `resultType === "failure"`, or carries a string `error` or a string
 *   `errorMessage`; otherwise `false`.
 */
function isFailureToolCompletion(data) {
    const payload = normalizeEventData(data);
    if (!payload) {
        return false;
    }
    // Checks run in the same order as a short-circuiting `||` chain would.
    if (payload.resultType === "failure") {
        return true;
    }
    if (typeof payload.error === "string") {
        return true;
    }
    return typeof payload.errorMessage === "string";
}
331
340
  async function tryReadSnapshotSizeBytes(sessionStore, sessionId) {
332
341
  if (!sessionStore)
333
342
  return undefined;
@@ -526,6 +535,31 @@ workerNodeId) {
526
535
  // ── runTurn ──────────────────────────────────────────────
527
536
  runtime.registerActivity("runTurn", async (activityCtx, input) => {
528
537
  activityCtx.traceInfo(`[runTurn] session=${input.sessionId}`);
538
+ const turnTelemetry = {
539
+ tokensInput: 0,
540
+ tokensOutput: 0,
541
+ tokensCacheRead: 0,
542
+ tokensCacheWrite: 0,
543
+ toolCalls: 0,
544
+ toolErrors: 0,
545
+ toolNames: new Set(),
546
+ modelSummary: sessionManager.getModelSummary(),
547
+ };
548
+ const turnSpan = otelTrace.getTracer("pilotswarm-turns").startSpan("session.turn", {
549
+ attributes: {
550
+ "pilotswarm.session_id": input.sessionId,
551
+ "pilotswarm.turn_index": input.turnIndex ?? 0,
552
+ "pilotswarm.bootstrap": Boolean(input.bootstrap),
553
+ "pilotswarm.retry_count": input.retryCount ?? 0,
554
+ "pilotswarm.nesting_level": input.nestingLevel ?? 0,
555
+ "pilotswarm.has_parent_session": Boolean(input.parentSessionId),
556
+ ...(input.parentSessionId ? { "pilotswarm.parent_session_id": input.parentSessionId } : {}),
557
+ ...(input.requiredTool ? { "pilotswarm.required_tool": input.requiredTool } : {}),
558
+ ...(input.config.model ? { "pilotswarm.model": input.config.model } : {}),
559
+ ...(input.config.reasoningEffort ? { "pilotswarm.reasoning_effort": input.config.reasoningEffort } : {}),
560
+ ...(workerNodeId ? { "pilotswarm.worker_node_id": workerNodeId } : {}),
561
+ },
562
+ });
529
563
  const hostname = os.hostname();
530
564
  const MAX_SUB_AGENTS = 50;
531
565
  const MAX_NESTING_LEVEL = 2;
@@ -552,8 +586,9 @@ workerNodeId) {
552
586
  let inlineSdkClient = null;
553
587
  let inlineSdkClientPromise = null;
554
588
  let cancelPoll = null;
589
+ let finalTurnResult = null;
555
590
  try {
556
- return await sessionManager.withRunTurnLock(input.sessionId, "runTurn", async () => {
591
+ finalTurnResult = await sessionManager.withRunTurnLock(input.sessionId, "runTurn", async () => {
557
592
  let session = null;
558
593
  let effectivePrompt = input.prompt;
559
594
  try {
@@ -1238,9 +1273,27 @@ workerNodeId) {
1238
1273
  if (event.eventType === "assistant.usage") {
1239
1274
  const usageUpsert = buildUsageSummaryUpsert(event.data);
1240
1275
  if (usageUpsert) {
1276
+ turnTelemetry.tokensInput += usageUpsert.tokensInputIncrement ?? 0;
1277
+ turnTelemetry.tokensOutput += usageUpsert.tokensOutputIncrement ?? 0;
1278
+ turnTelemetry.tokensCacheRead += usageUpsert.tokensCacheReadIncrement ?? 0;
1279
+ turnTelemetry.tokensCacheWrite += usageUpsert.tokensCacheWriteIncrement ?? 0;
1241
1280
  void cmsRetryBestEffort(`runTurn.onEvent upsertSummary usage session=${input.sessionId}`, () => catalog.upsertSessionMetricSummary(input.sessionId, usageUpsert), (msg) => activityCtx.traceInfo(msg));
1242
1281
  }
1243
1282
  }
1283
+ else if (event.eventType === "tool.execution_start") {
1284
+ turnTelemetry.toolCalls += 1;
1285
+ const eventData = normalizeEventData(event.data);
1286
+ const toolName = typeof eventData?.toolName === "string"
1287
+ ? eventData.toolName
1288
+ : typeof eventData?.name === "string"
1289
+ ? eventData.name
1290
+ : undefined;
1291
+ if (toolName)
1292
+ turnTelemetry.toolNames.add(toolName);
1293
+ }
1294
+ else if (event.eventType === "tool.execution_complete" && isFailureToolCompletion(event.data)) {
1295
+ turnTelemetry.toolErrors += 1;
1296
+ }
1244
1297
  // Best-effort with one transient retry. trackEventWrite tracks
1245
1298
  // the wrapped promise so the post-turn barrier waits for the
1246
1299
  // retry to settle before emitting turn_completed.
@@ -1476,6 +1529,10 @@ workerNodeId) {
1476
1529
  }
1477
1530
  return result;
1478
1531
  }, { trace });
1532
+ if (!finalTurnResult) {
1533
+ throw new Error("runTurn completed without a turn result");
1534
+ }
1535
+ return finalTurnResult;
1479
1536
  }
1480
1537
  catch (err) {
1481
1538
  if (isSessionLockAcquireTimeoutError(err)) {
@@ -1493,11 +1550,36 @@ workerNodeId) {
1493
1550
  lastActiveAt: new Date(),
1494
1551
  }), (msg) => activityCtx.traceInfo(msg));
1495
1552
  }
1496
- return { type: "error", message };
1553
+ finalTurnResult = { type: "error", message };
1554
+ return finalTurnResult;
1497
1555
  }
1556
+ turnSpan.recordException(err);
1557
+ turnSpan.setStatus({ code: SpanStatusCode.ERROR, message: err?.message || String(err) });
1498
1558
  throw err;
1499
1559
  }
1500
1560
  finally {
1561
+ if (turnTelemetry.modelSummary) {
1562
+ turnSpan.setAttribute("pilotswarm.model_summary", turnTelemetry.modelSummary);
1563
+ }
1564
+ turnSpan.setAttribute("pilotswarm.tokens_input", turnTelemetry.tokensInput);
1565
+ turnSpan.setAttribute("pilotswarm.tokens_output", turnTelemetry.tokensOutput);
1566
+ turnSpan.setAttribute("pilotswarm.tokens_cache_read", turnTelemetry.tokensCacheRead);
1567
+ turnSpan.setAttribute("pilotswarm.tokens_cache_write", turnTelemetry.tokensCacheWrite);
1568
+ turnSpan.setAttribute("pilotswarm.tool_calls", turnTelemetry.toolCalls);
1569
+ turnSpan.setAttribute("pilotswarm.tool_errors", turnTelemetry.toolErrors);
1570
+ if (turnTelemetry.toolNames.size > 0) {
1571
+ turnSpan.setAttribute("pilotswarm.tool_names", Array.from(turnTelemetry.toolNames).sort().join(","));
1572
+ }
1573
+ if (finalTurnResult) {
1574
+ turnSpan.setAttribute("pilotswarm.turn_result", finalTurnResult.type);
1575
+ if (finalTurnResult.type === "error") {
1576
+ turnSpan.setStatus({
1577
+ code: SpanStatusCode.ERROR,
1578
+ message: finalTurnResult.message || "turn failed",
1579
+ });
1580
+ }
1581
+ }
1582
+ turnSpan.end();
1501
1583
  if (cancelPoll)
1502
1584
  clearInterval(cancelPoll);
1503
1585
  const clientToStop = inlineSdkClient;
@@ -1514,75 +1596,97 @@ workerNodeId) {
1514
1596
  const reason = input.reason ?? "unknown";
1515
1597
  const eventData = normalizeEventData(input.eventData);
1516
1598
  const trace = activityTrace(activityCtx, "dehydrateSession");
1599
+ const dehydrationSpan = otelTrace.getTracer("pilotswarm-lifecycle").startSpan("session.dehydration", {
1600
+ attributes: {
1601
+ "pilotswarm.session_id": input.sessionId,
1602
+ "pilotswarm.dehydration_reason": reason,
1603
+ "pilotswarm.worker_node_id": workerNodeId,
1604
+ },
1605
+ });
1517
1606
  trace(`session=${input.sessionId} start reason=${reason}`);
1518
1607
  try {
1519
- await sessionManager.dehydrate(input.sessionId, reason, { trace });
1520
- }
1521
- catch (err) {
1522
- const message = err?.message || String(err);
1523
- if (isMissingDehydrateSnapshotErrorMessage(message)) {
1524
- const sessionStoreAttemptCount = Number(err?.sessionStoreAttemptCount) || undefined;
1525
- const lossyHandoffData = {
1526
- ...(eventData ?? {}),
1527
- reason,
1528
- cause: "missing_local_session_state_during_dehydrate",
1529
- message: `Worker lost local Copilot session state before dehydrate completed for ${input.sessionId}. ` +
1530
- "The next turn will recreate a fresh Copilot session and continue with possible data loss.",
1531
- detail: "Local session files were unavailable during dehydrate, so the latest live Copilot state " +
1532
- "could not be durably archived.",
1533
- error: message,
1534
- recoveryMode: "fresh_session_replay",
1535
- nextStep: "recreate_copilot_session_on_next_turn",
1536
- ...(sessionStoreAttemptCount ? { sessionStoreAttemptCount } : {}),
1537
- ...(typeof err?.sessionStoreError === "string" ? { sessionStoreError: err.sessionStoreError } : {}),
1538
- };
1539
- trace(`session=${input.sessionId} lossy handoff reason=${reason} error=${message}`);
1540
- await recordLossyHandoffEvent(catalog, input.sessionId, workerNodeId, lossyHandoffData, (failureMessage) => activityCtx.traceInfo(`[dehydrateSession] ${failureMessage}`));
1541
- return { lossyHandoff: lossyHandoffData };
1608
+ try {
1609
+ await sessionManager.dehydrate(input.sessionId, reason, { trace });
1610
+ }
1611
+ catch (err) {
1612
+ const message = err?.message || String(err);
1613
+ if (isMissingDehydrateSnapshotErrorMessage(message)) {
1614
+ const sessionStoreAttemptCount = Number(err?.sessionStoreAttemptCount) || undefined;
1615
+ const lossyHandoffData = {
1616
+ ...(eventData ?? {}),
1617
+ reason,
1618
+ cause: "missing_local_session_state_during_dehydrate",
1619
+ message: `Worker lost local Copilot session state before dehydrate completed for ${input.sessionId}. ` +
1620
+ "The next turn will recreate a fresh Copilot session and continue with possible data loss.",
1621
+ detail: "Local session files were unavailable during dehydrate, so the latest live Copilot state " +
1622
+ "could not be durably archived.",
1623
+ error: message,
1624
+ recoveryMode: "fresh_session_replay",
1625
+ nextStep: "recreate_copilot_session_on_next_turn",
1626
+ ...(sessionStoreAttemptCount ? { sessionStoreAttemptCount } : {}),
1627
+ ...(typeof err?.sessionStoreError === "string" ? { sessionStoreError: err.sessionStoreError } : {}),
1628
+ };
1629
+ trace(`session=${input.sessionId} lossy handoff reason=${reason} error=${message}`);
1630
+ dehydrationSpan.setAttribute("pilotswarm.dehydration_result", "lossy_handoff");
1631
+ dehydrationSpan.setAttribute("pilotswarm.lossy_handoff", true);
1632
+ dehydrationSpan.setStatus({ code: SpanStatusCode.ERROR, message });
1633
+ await recordLossyHandoffEvent(catalog, input.sessionId, workerNodeId, lossyHandoffData, (failureMessage) => activityCtx.traceInfo(`[dehydrateSession] ${failureMessage}`));
1634
+ return { lossyHandoff: lossyHandoffData };
1635
+ }
1636
+ trace(`session=${input.sessionId} failed reason=${reason} error=${message}`);
1637
+ dehydrationSpan.setAttribute("pilotswarm.dehydration_result", "error");
1638
+ dehydrationSpan.recordException(err);
1639
+ dehydrationSpan.setStatus({ code: SpanStatusCode.ERROR, message });
1640
+ if (catalog) {
1641
+ const sessionStoreAttemptCount = Number(err?.sessionStoreAttemptCount) || undefined;
1642
+ await catalog.recordEvents(input.sessionId, [{
1643
+ eventType: "session.error",
1644
+ data: {
1645
+ ...(eventData ?? {}),
1646
+ reason,
1647
+ message,
1648
+ ...(sessionStoreAttemptCount ? { sessionStoreAttemptCount } : {}),
1649
+ ...(typeof err?.sessionStoreError === "string" ? { sessionStoreError: err.sessionStoreError } : {}),
1650
+ },
1651
+ }], workerNodeId).catch((catalogErr) => {
1652
+ activityCtx.traceInfo(`[dehydrateSession] CMS failure event write failed: ${catalogErr}`);
1653
+ });
1654
+ await catalog.updateSession(input.sessionId, {
1655
+ lastError: message,
1656
+ lastActiveAt: new Date(),
1657
+ }).catch((catalogErr) => {
1658
+ activityCtx.traceInfo(`[dehydrateSession] CMS lastError update failed: ${catalogErr}`);
1659
+ });
1660
+ }
1661
+ throw err;
1542
1662
  }
1543
- trace(`session=${input.sessionId} failed reason=${reason} error=${message}`);
1663
+ trace(`session=${input.sessionId} complete reason=${reason}`);
1664
+ dehydrationSpan.setAttribute("pilotswarm.dehydration_result", "completed");
1544
1665
  if (catalog) {
1545
- const sessionStoreAttemptCount = Number(err?.sessionStoreAttemptCount) || undefined;
1666
+ const snapshotSizeBytes = await tryReadSnapshotSizeBytes(_sessionStore, input.sessionId);
1667
+ if (snapshotSizeBytes != null) {
1668
+ dehydrationSpan.setAttribute("pilotswarm.snapshot_size_bytes", snapshotSizeBytes);
1669
+ }
1670
+ await catalog.upsertSessionMetricSummary(input.sessionId, {
1671
+ ...(snapshotSizeBytes != null ? { snapshotSizeBytes } : {}),
1672
+ dehydrationCountIncrement: 1,
1673
+ lastDehydratedAt: true,
1674
+ }).catch((err) => {
1675
+ activityCtx.traceInfo(`[dehydrateSession] CMS summary update failed: ${err}`);
1676
+ });
1546
1677
  await catalog.recordEvents(input.sessionId, [{
1547
- eventType: "session.error",
1678
+ eventType: "session.dehydrated",
1548
1679
  data: {
1549
- ...(eventData ?? {}),
1550
1680
  reason,
1551
- message,
1552
- ...(sessionStoreAttemptCount ? { sessionStoreAttemptCount } : {}),
1553
- ...(typeof err?.sessionStoreError === "string" ? { sessionStoreError: err.sessionStoreError } : {}),
1681
+ ...(eventData ?? {}),
1554
1682
  },
1555
- }], workerNodeId).catch((catalogErr) => {
1556
- activityCtx.traceInfo(`[dehydrateSession] CMS failure event write failed: ${catalogErr}`);
1557
- });
1558
- await catalog.updateSession(input.sessionId, {
1559
- lastError: message,
1560
- lastActiveAt: new Date(),
1561
- }).catch((catalogErr) => {
1562
- activityCtx.traceInfo(`[dehydrateSession] CMS lastError update failed: ${catalogErr}`);
1683
+ }], workerNodeId).catch((err) => {
1684
+ activityCtx.traceInfo(`[dehydrateSession] CMS success event write failed: ${err}`);
1563
1685
  });
1564
1686
  }
1565
- throw err;
1566
1687
  }
1567
- trace(`session=${input.sessionId} complete reason=${reason}`);
1568
- if (catalog) {
1569
- const snapshotSizeBytes = await tryReadSnapshotSizeBytes(_sessionStore, input.sessionId);
1570
- await catalog.upsertSessionMetricSummary(input.sessionId, {
1571
- ...(snapshotSizeBytes != null ? { snapshotSizeBytes } : {}),
1572
- dehydrationCountIncrement: 1,
1573
- lastDehydratedAt: true,
1574
- }).catch((err) => {
1575
- activityCtx.traceInfo(`[dehydrateSession] CMS summary update failed: ${err}`);
1576
- });
1577
- await catalog.recordEvents(input.sessionId, [{
1578
- eventType: "session.dehydrated",
1579
- data: {
1580
- reason,
1581
- ...(eventData ?? {}),
1582
- },
1583
- }], workerNodeId).catch((err) => {
1584
- activityCtx.traceInfo(`[dehydrateSession] CMS success event write failed: ${err}`);
1585
- });
1688
+ finally {
1689
+ dehydrationSpan.end();
1586
1690
  }
1587
1691
  });
1588
1692
  runtime.registerActivity("needsHydrationSession", async (activityCtx, input) => {
@@ -1601,26 +1705,41 @@ workerNodeId) {
1601
1705
  // ── hydrateSession ──────────────────────────────────────
1602
1706
  runtime.registerActivity("hydrateSession", async (activityCtx, input) => {
1603
1707
  const trace = activityTrace(activityCtx, "hydrateSession");
1708
+ const hydrationSpan = otelTrace.getTracer("pilotswarm-lifecycle").startSpan("session.hydration", {
1709
+ attributes: {
1710
+ "pilotswarm.session_id": input.sessionId,
1711
+ "pilotswarm.worker_node_id": workerNodeId,
1712
+ },
1713
+ });
1604
1714
  trace(`session=${input.sessionId} start`);
1605
1715
  try {
1606
- await sessionManager.hydrate(input.sessionId, { trace });
1607
- }
1608
- catch (error) {
1609
- trace(`session=${input.sessionId} failed error=${errorMessage(error)}`);
1610
- throw error;
1716
+ try {
1717
+ await sessionManager.hydrate(input.sessionId, { trace });
1718
+ }
1719
+ catch (error) {
1720
+ trace(`session=${input.sessionId} failed error=${errorMessage(error)}`);
1721
+ hydrationSpan.setAttribute("pilotswarm.hydration_result", "error");
1722
+ hydrationSpan.recordException(error);
1723
+ hydrationSpan.setStatus({ code: SpanStatusCode.ERROR, message: errorMessage(error) });
1724
+ throw error;
1725
+ }
1726
+ trace(`session=${input.sessionId} complete`);
1727
+ hydrationSpan.setAttribute("pilotswarm.hydration_result", "completed");
1728
+ if (catalog) {
1729
+ // Best-effort: metric summary and the session.hydrated event are
1730
+ // observability only. Never block hydrate on CMS hiccups.
1731
+ await cmsRetryBestEffort(`hydrateSession.upsertSummary session=${input.sessionId}`, () => catalog.upsertSessionMetricSummary(input.sessionId, {
1732
+ hydrationCountIncrement: 1,
1733
+ lastHydratedAt: true,
1734
+ }), (msg) => activityCtx.traceInfo(msg));
1735
+ await cmsRetryBestEffort(`hydrateSession.recordEvents session=${input.sessionId}`, () => catalog.recordEvents(input.sessionId, [{
1736
+ eventType: "session.hydrated",
1737
+ data: {},
1738
+ }], workerNodeId), (msg) => activityCtx.traceInfo(msg));
1739
+ }
1611
1740
  }
1612
- trace(`session=${input.sessionId} complete`);
1613
- if (catalog) {
1614
- // Best-effort: metric summary and the session.hydrated event are
1615
- // observability only. Never block hydrate on CMS hiccups.
1616
- await cmsRetryBestEffort(`hydrateSession.upsertSummary session=${input.sessionId}`, () => catalog.upsertSessionMetricSummary(input.sessionId, {
1617
- hydrationCountIncrement: 1,
1618
- lastHydratedAt: true,
1619
- }), (msg) => activityCtx.traceInfo(msg));
1620
- await cmsRetryBestEffort(`hydrateSession.recordEvents session=${input.sessionId}`, () => catalog.recordEvents(input.sessionId, [{
1621
- eventType: "session.hydrated",
1622
- data: {},
1623
- }], workerNodeId), (msg) => activityCtx.traceInfo(msg));
1741
+ finally {
1742
+ hydrationSpan.end();
1624
1743
  }
1625
1744
  });
1626
1745
  // ── destroySession ──────────────────────────────────────