@swarmclawai/swarmclaw 1.5.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -15,6 +15,46 @@ Docs: https://swarmclaw.ai/docs
15
15
  Website: https://swarmclaw.ai
16
16
  Extension tutorial: https://swarmclaw.ai/docs/extension-tutorial
17
17
 
18
+ ## Hosted Deploys
19
+
20
+ SwarmClaw now ships provider-ready deploy files at the repo root:
21
+
22
+ - `render.yaml` for Render Blueprint deploys from the public GHCR image
23
+ - `fly.toml` for Fly.io image-backed deploys
24
+ - `railway.json` for Railway-aligned health and restart defaults
25
+
26
+ The published image is:
27
+
28
+ ```text
29
+ ghcr.io/swarmclawai/swarmclaw:latest
30
+ ```
31
+
32
+ Hosted deployments should:
33
+
34
+ - mount persistent storage at `/app/data`
35
+ - manage secrets through the provider dashboard
36
+ - set `ACCESS_KEY` and `CREDENTIAL_SECRET`
37
+ - point health checks at `/api/healthz`
38
+
39
+ Full hosted deployment guides live at https://swarmclaw.ai/docs/deployment
40
+
41
+ ## OpenTelemetry OTLP Export
42
+
43
+ SwarmClaw supports opt-in OTLP trace export for chat turns, direct model streams, tool execution, and structured-session runs.
44
+
45
+ Minimal configuration:
46
+
47
+ ```bash
48
+ OTEL_ENABLED=true
49
+ OTEL_SERVICE_NAME=swarmclaw
50
+ OTEL_EXPORTER_OTLP_ENDPOINT=https://your-collector:4318
51
+ OTEL_EXPORTER_OTLP_HEADERS=Authorization=Bearer your-token
52
+ ```
53
+
54
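+ `OTEL_EXPORTER_OTLP_ENDPOINT` is treated as a base URL: `/v1/traces` is appended to it unless the value already ends with that path. If you need a trace-specific endpoint, set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` instead; it takes precedence over the base endpoint and is used as given, without the `/v1/traces` suffix being added.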
55
+
56
+ Operational docs: https://swarmclaw.ai/docs/observability
57
+
18
58
  ## Screenshots
19
59
 
20
60
  <table>
@@ -215,6 +255,17 @@ SwarmClaw agents can join [SwarmFeed](https://swarmfeed.ai) — a social network
215
255
 
216
256
  Read the docs at [swarmclaw.ai/docs/swarmfeed](https://swarmclaw.ai/docs/swarmfeed) and visit [swarmfeed.ai](https://swarmfeed.ai) for the platform itself.
217
257
 
258
+ ### v1.5.2 Highlights
259
+
260
+ - **Hosted deploy path for SwarmClaw itself**: added root-level `render.yaml`, `fly.toml`, and `railway.json` so the published `ghcr.io/swarmclawai/swarmclaw:latest` image is easier to run on always-on platforms.
261
+ - **Public health endpoint for hosted platforms**: added `/api/healthz` and exempted it from access-key auth so Render, Fly.io, and Railway can perform liveness checks without weakening the rest of the API surface.
262
+ - **OTLP/OpenTelemetry foundation**: SwarmClaw can now export traces for chat turns, direct model streams, protocol runs, and tool execution to any OTLP-compatible backend using environment variables only.
263
+ - **Docs and landing-page deploy refresh**: `swarmclaw.ai` now exposes the hosted deploy path and a dedicated observability guide instead of burying those operator workflows in general setup docs.
264
+
265
+ ### v1.5.1 Highlights
266
+
267
+ - **Standalone connector lifecycle**: connector start, stop, status, and repair now work correctly in standalone production builds (`npm start` / pm2) where the daemon runs in-process. Previously these operations silently failed because the controller assumed a daemon subprocess was always present. (Community contribution by [@borislavnnikolov](https://github.com/borislavnnikolov) — PR #35)
268
+
218
269
  ### v1.5.0 Highlights
219
270
 
220
271
  - **First-run activation refresh**: setup now includes a dedicated start-path step, broad starter shapes instead of niche presets, and draft agents generated directly from the chosen setup shape.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.5.0",
4
- "description": "Self-hosted AI runtime for OpenClaw, delegation, autonomy, runtime skills, crypto wallets, and chat platform connectors.",
3
+ "version": "1.5.2",
4
+ "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
5
5
  "license": "MIT",
6
6
  "publishConfig": {
7
7
  "access": "public",
@@ -74,7 +74,7 @@
74
74
  "test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
75
75
  "test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts",
76
76
  "test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
77
- "test:runtime": "tsx --test src/lib/server/knowledge-sources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/safe-parse-body.test.ts src/app/api/approvals/route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/logs/route.test.ts src/app/api/tts/route.test.ts",
77
+ "test:runtime": "tsx --test src/lib/server/knowledge-sources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/app/api/approvals/route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/tts/route.test.ts",
78
78
  "test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
79
79
  "test:e2e": "tsx .workbench/browser-e2e/run.ts",
80
80
  "test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
@@ -88,6 +88,9 @@
88
88
  "@langchain/langgraph": "^1.2.2",
89
89
  "@langchain/openai": "^1.2.8",
90
90
  "@modelcontextprotocol/sdk": "^1.27.1",
91
+ "@opentelemetry/api": "^1.9.1",
92
+ "@opentelemetry/exporter-trace-otlp-http": "^0.214.0",
93
+ "@opentelemetry/sdk-node": "^0.214.0",
91
94
  "@multiavatar/multiavatar": "^1.0.7",
92
95
  "@playwright/mcp": "^0.0.68",
93
96
  "@slack/bolt": "^4.6.0",
@@ -0,0 +1,14 @@
1
+ import assert from 'node:assert/strict'
2
+ import { test } from 'node:test'
3
+
4
+ import { GET } from '@/app/api/healthz/route'
5
+
6
+ test('GET /api/healthz returns an ok payload', async () => {
7
+ const response = await GET()
8
+ assert.equal(response.status, 200)
9
+
10
+ const payload = await response.json()
11
+ assert.equal(payload.ok, true)
12
+ assert.equal(payload.service, 'swarmclaw')
13
+ assert.equal(typeof payload.time, 'number')
14
+ })
@@ -0,0 +1,9 @@
1
+ import { NextResponse } from 'next/server'
2
+
3
+ export async function GET() {
4
+ return NextResponse.json({
5
+ ok: true,
6
+ service: 'swarmclaw',
7
+ time: Date.now(),
8
+ })
9
+ }
package/src/cli/index.js CHANGED
@@ -734,6 +734,13 @@ const COMMAND_GROUPS = [
734
734
  cmd('get', 'GET', '/system/status', 'Get system health summary (safe for external monitors)'),
735
735
  ],
736
736
  },
737
+ {
738
+ name: 'healthz',
739
+ description: 'Public liveness probe',
740
+ commands: [
741
+ cmd('get', 'GET', '/healthz', 'Get public health check payload'),
742
+ ],
743
+ },
737
744
  {
738
745
  name: 'usage',
739
746
  description: 'Usage and cost summary',
@@ -5,9 +5,11 @@ const TAG = 'instrumentation'
5
5
  export async function register() {
6
6
  if (process.env.NEXT_RUNTIME === 'nodejs') {
7
7
  const { log } = await import('@/lib/server/logger')
8
+ const { ensureOpenTelemetryStarted, shutdownOpenTelemetry } = await import('@/lib/server/observability/otel')
8
9
  const isWorkerOnly = process.env.SWARMCLAW_WORKER_ONLY === '1'
9
10
  const { initWsServer, closeWsServer } = await import('./lib/server/ws-hub')
10
11
  const { ensureDaemonStarted } = await import('@/lib/server/runtime/daemon-state')
12
+ await ensureOpenTelemetryStarted()
11
13
 
12
14
  // One-time migration: backfill allKnownPeerIds on existing connector sessions
13
15
  try {
@@ -44,6 +46,11 @@ export async function register() {
44
46
  } catch (err) {
45
47
  log.error(TAG, 'Failed to stop daemon during shutdown:', err)
46
48
  }
49
+ try {
50
+ await shutdownOpenTelemetry()
51
+ } catch (err) {
52
+ log.error(TAG, 'Failed to stop OpenTelemetry during shutdown:', err)
53
+ }
47
54
  if (!isWorkerOnly) {
48
55
  await closeWsServer()
49
56
  }
@@ -1,5 +1,6 @@
1
1
  import type { ExecuteChatTurnInput, ExecuteChatTurnResult } from './chat-execution-types'
2
2
  import { perf } from '@/lib/server/runtime/perf'
3
+ import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
3
4
  import { markProviderSuccess } from '@/lib/server/provider-health'
4
5
  import { executePreparedChatTurn } from '@/lib/server/chat-execution/chat-turn-stream-execution'
5
6
  import { finalizeChatTurn } from '@/lib/server/chat-execution/chat-turn-finalization'
@@ -50,84 +51,121 @@ export async function executeSessionChatTurn(input: ExecuteChatTurnInput): Promi
50
51
  sessionId,
51
52
  source = 'chat',
52
53
  } = input
53
- const endTurnPerf = perf.start('chat-execution', 'executeSessionChatTurn', { sessionId, source })
54
- const preparedTurn = await prepareChatTurn(input)
55
- if (preparedTurn.kind === 'blocked') {
56
- const result = await completeBlockedChatTurn(preparedTurn)
57
- endTurnPerf({
58
- durationMs: 0,
59
- toolEventCount: result.toolEvents.length,
60
- inputTokens: result.inputTokens || 0,
61
- outputTokens: result.outputTokens || 0,
62
- error: !!result.error,
63
- })
64
- return result
65
- }
54
+ return withServerSpan('swarmclaw.chat.turn', {
55
+ 'swarmclaw.session.id': sessionId,
56
+ 'swarmclaw.chat.source': source,
57
+ 'swarmclaw.chat.has_image': Boolean(input.imagePath || input.imageUrl),
58
+ 'swarmclaw.chat.attached_file_count': input.attachedFiles?.length || 0,
59
+ }, async (span) => {
60
+ const endTurnPerf = perf.start('chat-execution', 'executeSessionChatTurn', { sessionId, source })
61
+ const preparedTurn = await prepareChatTurn(input)
62
+ if (preparedTurn.kind === 'blocked') {
63
+ const result = await completeBlockedChatTurn(preparedTurn)
64
+ setSpanAttributes(span, {
65
+ 'swarmclaw.chat.blocked': true,
66
+ 'swarmclaw.chat.tool_event_count': result.toolEvents.length,
67
+ 'swarmclaw.chat.error': Boolean(result.error),
68
+ 'gen_ai.usage.input_tokens': result.inputTokens || 0,
69
+ 'gen_ai.usage.output_tokens': result.outputTokens || 0,
70
+ })
71
+ endTurnPerf({
72
+ durationMs: 0,
73
+ toolEventCount: result.toolEvents.length,
74
+ inputTokens: result.inputTokens || 0,
75
+ outputTokens: result.outputTokens || 0,
76
+ error: !!result.error,
77
+ })
78
+ return result
79
+ }
66
80
 
67
- const partialPersistence = createPartialAssistantPersistence({
68
- prepared: preparedTurn,
69
- onEvent: input.onEvent,
70
- })
81
+ setSpanAttributes(span, {
82
+ 'swarmclaw.chat.blocked': false,
83
+ 'swarmclaw.chat.agentic': preparedTurn.hasExtensions,
84
+ 'swarmclaw.chat.provider': preparedTurn.providerType,
85
+ 'gen_ai.request.model': preparedTurn.sessionForRun.model,
86
+ })
71
87
 
72
- const preflight = await runChatTurnPreflight({
73
- prepared: preparedTurn,
74
- emit: partialPersistence.emit,
75
- toolEvents: partialPersistence.getToolEvents(),
76
- })
88
+ const partialPersistence = createPartialAssistantPersistence({
89
+ prepared: preparedTurn,
90
+ onEvent: input.onEvent,
91
+ })
77
92
 
78
- if (preflight?.terminalResult) {
79
- if (preflight.terminalResult.text) input.onEvent?.({ t: 'd', text: preflight.terminalResult.text })
80
- partialPersistence.stop()
81
- await partialPersistence.awaitIdle()
82
- endTurnPerf({
83
- durationMs: 0,
84
- toolEventCount: preflight.terminalResult.toolEvents.length,
85
- inputTokens: preflight.terminalResult.inputTokens || 0,
86
- outputTokens: preflight.terminalResult.outputTokens || 0,
87
- error: !!preflight.terminalResult.error,
93
+ const preflight = await runChatTurnPreflight({
94
+ prepared: preparedTurn,
95
+ emit: partialPersistence.emit,
96
+ toolEvents: partialPersistence.getToolEvents(),
88
97
  })
89
- return preflight.terminalResult
90
- }
91
98
 
92
- let streamResult: Awaited<ReturnType<typeof executePreparedChatTurn>>
93
- try {
94
- streamResult = await executePreparedChatTurn({
99
+ if (preflight?.terminalResult) {
100
+ if (preflight.terminalResult.text) input.onEvent?.({ t: 'd', text: preflight.terminalResult.text })
101
+ partialPersistence.stop()
102
+ await partialPersistence.awaitIdle()
103
+ setSpanAttributes(span, {
104
+ 'swarmclaw.chat.preflight_terminal': true,
105
+ 'swarmclaw.chat.tool_event_count': preflight.terminalResult.toolEvents.length,
106
+ 'swarmclaw.chat.error': Boolean(preflight.terminalResult.error),
107
+ 'gen_ai.usage.input_tokens': preflight.terminalResult.inputTokens || 0,
108
+ 'gen_ai.usage.output_tokens': preflight.terminalResult.outputTokens || 0,
109
+ })
110
+ endTurnPerf({
111
+ durationMs: 0,
112
+ toolEventCount: preflight.terminalResult.toolEvents.length,
113
+ inputTokens: preflight.terminalResult.inputTokens || 0,
114
+ outputTokens: preflight.terminalResult.outputTokens || 0,
115
+ error: !!preflight.terminalResult.error,
116
+ })
117
+ return preflight.terminalResult
118
+ }
119
+
120
+ let streamResult: Awaited<ReturnType<typeof executePreparedChatTurn>>
121
+ try {
122
+ streamResult = await executePreparedChatTurn({
123
+ input,
124
+ prepared: preparedTurn,
125
+ partialPersistence,
126
+ preflightToolRoutingResult: preflight?.directMemoryResult || null,
127
+ })
128
+
129
+ await partialPersistence.awaitIdle()
130
+ } finally {
131
+ partialPersistence.stop()
132
+ }
133
+
134
+ if (!streamResult.errorMessage) {
135
+ markProviderSuccess(preparedTurn.providerType, preparedTurn.sessionForRun.credentialId)
136
+ }
137
+
138
+ const result = await finalizeChatTurn({
95
139
  input,
96
140
  prepared: preparedTurn,
97
141
  partialPersistence,
98
- preflightToolRoutingResult: preflight?.directMemoryResult || null,
142
+ fullResponse: streamResult.fullResponse,
143
+ errorMessage: streamResult.errorMessage,
144
+ initialToolRoutingResult: streamResult.toolRoutingResult,
145
+ responseCacheHit: streamResult.responseCacheHit,
146
+ directUsage: streamResult.directUsage,
147
+ durationMs: streamResult.durationMs,
148
+ knowledgeRetrievalTrace: streamResult.knowledgeRetrievalTrace || null,
149
+ emit: partialPersistence.emit,
99
150
  })
100
151
 
101
- await partialPersistence.awaitIdle()
102
- } finally {
103
- partialPersistence.stop()
104
- }
105
-
106
- if (!streamResult.errorMessage) {
107
- markProviderSuccess(preparedTurn.providerType, preparedTurn.sessionForRun.credentialId)
108
- }
109
-
110
- const result = await finalizeChatTurn({
111
- input,
112
- prepared: preparedTurn,
113
- partialPersistence,
114
- fullResponse: streamResult.fullResponse,
115
- errorMessage: streamResult.errorMessage,
116
- initialToolRoutingResult: streamResult.toolRoutingResult,
117
- responseCacheHit: streamResult.responseCacheHit,
118
- directUsage: streamResult.directUsage,
119
- durationMs: streamResult.durationMs,
120
- knowledgeRetrievalTrace: streamResult.knowledgeRetrievalTrace || null,
121
- emit: partialPersistence.emit,
122
- })
152
+ setSpanAttributes(span, {
153
+ 'swarmclaw.chat.cache_hit': streamResult.responseCacheHit,
154
+ 'swarmclaw.chat.tool_event_count': result.toolEvents.length,
155
+ 'swarmclaw.chat.error': Boolean(result.error),
156
+ 'swarmclaw.chat.estimated_cost': result.estimatedCost ?? 0,
157
+ 'swarmclaw.chat.has_retrieval_trace': Boolean(result.retrievalTrace),
158
+ 'gen_ai.usage.input_tokens': result.inputTokens || 0,
159
+ 'gen_ai.usage.output_tokens': result.outputTokens || 0,
160
+ })
161
+ endTurnPerf({
162
+ durationMs: streamResult.durationMs,
163
+ toolEventCount: result.toolEvents.length,
164
+ inputTokens: result.inputTokens || 0,
165
+ outputTokens: result.outputTokens || 0,
166
+ error: !!result.error,
167
+ })
123
168
 
124
- endTurnPerf({
125
- durationMs: streamResult.durationMs,
126
- toolEventCount: result.toolEvents.length,
127
- inputTokens: result.inputTokens || 0,
128
- outputTokens: result.outputTokens || 0,
129
- error: !!result.error,
169
+ return result
130
170
  })
131
-
132
- return result
133
171
  }
@@ -22,6 +22,7 @@ import {
22
22
  import { perf } from '@/lib/server/runtime/perf'
23
23
  import { getSessionMessages } from '@/lib/server/sessions/session-repository'
24
24
  import { notify } from '@/lib/server/ws-hub'
25
+ import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
25
26
  import { errorMessage as toErrorMessage } from '@/lib/shared-utils'
26
27
 
27
28
  import type { ExecuteChatTurnInput } from './chat-execution-types'
@@ -142,22 +143,34 @@ export async function executePreparedChatTurn(params: {
142
143
  )
143
144
 
144
145
  if (hasExtensions) {
145
- const result = await streamAgentChat({
146
- session: sessionForRun,
147
- message: effectiveMessage,
148
- imagePath: resolvedImagePath,
149
- imageUrl,
150
- attachedFiles,
151
- apiKey,
152
- systemPrompt,
153
- executionBrief,
154
- extraSystemContext: [executionBriefContextBlock].filter((value): value is string => typeof value === 'string' && value.trim().length > 0),
155
- write: (raw) => parseAndEmit(raw),
156
- history: heartbeatHistory ?? applyContextClearBoundary(getSessionMessages(sessionId)),
157
- signal: abortController.signal,
158
- source,
159
- classification,
160
- promptMode,
146
+ const result = await withServerSpan('swarmclaw.chat.agentic_stream', {
147
+ 'swarmclaw.session.id': sessionId,
148
+ 'swarmclaw.chat.source': source,
149
+ 'swarmclaw.chat.provider': providerType,
150
+ 'gen_ai.request.model': sessionForRun.model,
151
+ }, async (span) => {
152
+ const agenticResult = await streamAgentChat({
153
+ session: sessionForRun,
154
+ message: effectiveMessage,
155
+ imagePath: resolvedImagePath,
156
+ imageUrl,
157
+ attachedFiles,
158
+ apiKey,
159
+ systemPrompt,
160
+ executionBrief,
161
+ extraSystemContext: [executionBriefContextBlock].filter((value): value is string => typeof value === 'string' && value.trim().length > 0),
162
+ write: (raw) => parseAndEmit(raw),
163
+ history: heartbeatHistory ?? applyContextClearBoundary(getSessionMessages(sessionId)),
164
+ signal: abortController.signal,
165
+ source,
166
+ classification,
167
+ promptMode,
168
+ })
169
+ setSpanAttributes(span, {
170
+ 'swarmclaw.chat.tool_event_count': agenticResult.toolEvents.length,
171
+ 'swarmclaw.chat.has_retrieval_trace': Boolean(agenticResult.knowledgeRetrievalTrace),
172
+ })
173
+ return agenticResult
161
174
  })
162
175
  fullResponse = result.finalResponse || result.fullText
163
176
  knowledgeRetrievalTrace = result.knowledgeRetrievalTrace || null
@@ -232,7 +245,20 @@ export async function executePreparedChatTurn(params: {
232
245
  signal: abortController.signal,
233
246
  })
234
247
  try {
235
- fullResponse = await doStreamChat()
248
+ fullResponse = await withServerSpan('swarmclaw.chat.model_stream', {
249
+ 'swarmclaw.session.id': sessionId,
250
+ 'swarmclaw.chat.source': source,
251
+ 'swarmclaw.chat.provider': providerType,
252
+ 'gen_ai.request.model': sessionForRun.model,
253
+ }, async (span) => {
254
+ const response = await doStreamChat()
255
+ setSpanAttributes(span, {
256
+ 'gen_ai.usage.input_tokens': directUsage.inputTokens || 0,
257
+ 'gen_ai.usage.output_tokens': directUsage.outputTokens || 0,
258
+ 'swarmclaw.chat.response_cacheable': canUseResponseCache,
259
+ })
260
+ return response
261
+ })
236
262
  } catch (streamErr: unknown) {
237
263
  const streamErrMsg = toErrorMessage(streamErr)
238
264
  const streamStatus = (streamErr as Record<string, unknown>)?.status
@@ -243,7 +269,20 @@ export async function executePreparedChatTurn(params: {
243
269
  historyLen: directHistorySnapshot.length,
244
270
  })
245
271
  directHistorySnapshot = directHistorySnapshot.slice(-10)
246
- fullResponse = await doStreamChat()
272
+ fullResponse = await withServerSpan('swarmclaw.chat.model_stream.retry', {
273
+ 'swarmclaw.session.id': sessionId,
274
+ 'swarmclaw.chat.source': source,
275
+ 'swarmclaw.chat.provider': providerType,
276
+ 'gen_ai.request.model': sessionForRun.model,
277
+ 'swarmclaw.chat.retry_reason': 'context_overflow',
278
+ }, async (span) => {
279
+ const response = await doStreamChat()
280
+ setSpanAttributes(span, {
281
+ 'gen_ai.usage.input_tokens': directUsage.inputTokens || 0,
282
+ 'gen_ai.usage.output_tokens': directUsage.outputTokens || 0,
283
+ })
284
+ return response
285
+ })
247
286
  } else {
248
287
  throw streamErr
249
288
  }
@@ -523,6 +523,24 @@ export async function runDaemonHealthCheckViaAdmin(source: string): Promise<Daem
523
523
  }
524
524
 
525
525
  export async function listDaemonConnectorRuntime(): Promise<Record<string, DaemonConnectorRuntimeState>> {
526
+ // When the daemon is running in-process, read runtime state directly.
527
+ const inProcessStatus = getDaemonStatus()
528
+ if (inProcessStatus.running) {
529
+ const { listRunningConnectors, getConnectorStatus, isConnectorAuthenticated, hasConnectorCredentials, getConnectorQR, getConnectorPresence } =
530
+ await import('@/lib/server/connectors/connector-lifecycle')
531
+ const result: Record<string, DaemonConnectorRuntimeState> = {}
532
+ for (const { id } of listRunningConnectors()) {
533
+ result[id] = {
534
+ status: getConnectorStatus(id),
535
+ authenticated: isConnectorAuthenticated(id),
536
+ hasCredentials: hasConnectorCredentials(id),
537
+ qrDataUrl: getConnectorQR(id),
538
+ presence: getConnectorPresence(id),
539
+ }
540
+ }
541
+ return result
542
+ }
543
+
526
544
  const metadata = readDaemonAdminMetadata()
527
545
  if (!metadata || !isProcessRunning(metadata.pid)) return {}
528
546
  try {
@@ -534,6 +552,22 @@ export async function listDaemonConnectorRuntime(): Promise<Record<string, Daemo
534
552
  }
535
553
 
536
554
  export async function getDaemonConnectorRuntime(connectorId: string): Promise<DaemonConnectorRuntimeState | null> {
555
+ // When the daemon is running in-process, read runtime state directly from
556
+ // the connector lifecycle module instead of an unreachable subprocess HTTP API.
557
+ const inProcessStatus = getDaemonStatus()
558
+ if (inProcessStatus.running) {
559
+ const { getConnectorStatus, getConnectorQR, isConnectorAuthenticated, hasConnectorCredentials, getConnectorPresence } =
560
+ await import('@/lib/server/connectors/connector-lifecycle')
561
+ const status = getConnectorStatus(connectorId)
562
+ return {
563
+ status,
564
+ authenticated: isConnectorAuthenticated(connectorId),
565
+ hasCredentials: hasConnectorCredentials(connectorId),
566
+ qrDataUrl: getConnectorQR(connectorId),
567
+ presence: getConnectorPresence(connectorId),
568
+ }
569
+ }
570
+
537
571
  const metadata = readDaemonAdminMetadata()
538
572
  if (!metadata || !isProcessRunning(metadata.pid)) return null
539
573
  try {
@@ -555,6 +589,27 @@ export async function runDaemonConnectorAction(
555
589
  if (action !== 'stop') {
556
590
  await ensureDaemonProcessRunning(source, { manualStart: true })
557
591
  }
592
+
593
+ // When the daemon is running in-process (e.g. standalone production build),
594
+ // there is no subprocess admin server or daemon-admin.json. Execute the
595
+ // connector lifecycle action directly in the current process.
596
+ const inProcessStatus = getDaemonStatus()
597
+ if (inProcessStatus.running) {
598
+ try {
599
+ const { startConnector, stopConnector, repairConnector } = await import('@/lib/server/connectors/connector-lifecycle')
600
+ if (action === 'start') {
601
+ await startConnector(connectorId)
602
+ } else if (action === 'stop') {
603
+ await stopConnector(connectorId)
604
+ } else if (action === 'repair') {
605
+ await repairConnector(connectorId)
606
+ }
607
+ } catch (err: unknown) {
608
+ log.error(TAG, `In-process connector action "${action}" failed for ${connectorId}:`, errorMessage(err))
609
+ }
610
+ return null
611
+ }
612
+
558
613
  const metadata = readDaemonAdminMetadata()
559
614
  if (!metadata || !isProcessRunning(metadata.pid)) return null
560
615
  const result = await requestDaemon<{ connector: DaemonConnectorRuntimeState | null }>(
@@ -0,0 +1,62 @@
1
+ import assert from 'node:assert/strict'
2
+ import { describe, it } from 'node:test'
3
+
4
+ import {
5
+ parseOtelHeaders,
6
+ resolveOtelConfig,
7
+ resolveOtelTracesEndpoint,
8
+ } from '@/lib/server/observability/otel-config'
9
+
10
+ function env(overrides: Record<string, string>): NodeJS.ProcessEnv {
11
+ return {
12
+ NODE_ENV: 'test',
13
+ ...overrides,
14
+ }
15
+ }
16
+
17
+ describe('otel config', () => {
18
+ it('stays disabled unless OTEL_ENABLED is truthy', () => {
19
+ assert.equal(resolveOtelConfig(env({ OTEL_EXPORTER_OTLP_ENDPOINT: 'http://localhost:4318' })), null)
20
+ })
21
+
22
+ it('normalizes a base OTLP endpoint to the traces path', () => {
23
+ assert.equal(
24
+ resolveOtelTracesEndpoint(env({
25
+ OTEL_EXPORTER_OTLP_ENDPOINT: 'https://collector.example.com:4318',
26
+ })),
27
+ 'https://collector.example.com:4318/v1/traces',
28
+ )
29
+ })
30
+
31
+ it('prefers an explicit OTLP traces endpoint', () => {
32
+ assert.equal(
33
+ resolveOtelTracesEndpoint(env({
34
+ OTEL_EXPORTER_OTLP_ENDPOINT: 'https://collector.example.com:4318',
35
+ OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: 'https://collector.example.com/custom/traces',
36
+ })),
37
+ 'https://collector.example.com/custom/traces',
38
+ )
39
+ })
40
+
41
+ it('parses OTLP headers and applies the default service name', () => {
42
+ const config = resolveOtelConfig(env({
43
+ OTEL_ENABLED: 'true',
44
+ OTEL_EXPORTER_OTLP_ENDPOINT: 'https://collector.example.com:4318',
45
+ OTEL_EXPORTER_OTLP_HEADERS: 'Authorization=Bearer token, X-Team = swarm ',
46
+ }))
47
+
48
+ assert.ok(config)
49
+ assert.equal(config.serviceName, 'swarmclaw')
50
+ assert.deepEqual(config.headers, {
51
+ Authorization: 'Bearer token',
52
+ 'X-Team': 'swarm',
53
+ })
54
+ assert.equal(config.tracesEndpoint, 'https://collector.example.com:4318/v1/traces')
55
+ })
56
+
57
+ it('ignores malformed header entries', () => {
58
+ assert.deepEqual(parseOtelHeaders('good=value, broken, =oops, missing='), {
59
+ good: 'value',
60
+ })
61
+ })
62
+ })
@@ -0,0 +1,67 @@
1
+ export interface OTelConfig {
2
+ enabled: true
3
+ serviceName: string
4
+ tracesEndpoint: string
5
+ headers: Record<string, string>
6
+ }
7
+
8
+ function parseBooleanFlag(value: string | undefined): boolean {
9
+ if (typeof value !== 'string') return false
10
+ const normalized = value.trim().toLowerCase()
11
+ return normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'on'
12
+ }
13
+
14
+ function cleanEnvValue(value: string | undefined): string | null {
15
+ if (typeof value !== 'string') return null
16
+ const trimmed = value.trim()
17
+ return trimmed ? trimmed : null
18
+ }
19
+
20
+ export function resolveOtelTracesEndpoint(env: NodeJS.ProcessEnv = process.env): string | null {
21
+ const tracesEndpoint = cleanEnvValue(env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
22
+ if (tracesEndpoint) return tracesEndpoint.replace(/\/+$/, '')
23
+
24
+ const baseEndpoint = cleanEnvValue(env.OTEL_EXPORTER_OTLP_ENDPOINT)
25
+ if (!baseEndpoint) return null
26
+
27
+ const normalizedBase = baseEndpoint.replace(/\/+$/, '')
28
+ if (!normalizedBase) return null
29
+ if (normalizedBase.endsWith('/v1/traces')) return normalizedBase
30
+ return `${normalizedBase}/v1/traces`
31
+ }
32
+
33
+ export function parseOtelHeaders(value: string | undefined): Record<string, string> {
34
+ if (typeof value !== 'string') return {}
35
+ const entries = value
36
+ .split(',')
37
+ .map((entry) => entry.trim())
38
+ .filter(Boolean)
39
+
40
+ const headers: Record<string, string> = {}
41
+ for (const entry of entries) {
42
+ const separatorIndex = entry.indexOf('=')
43
+ if (separatorIndex <= 0) continue
44
+ const key = entry.slice(0, separatorIndex).trim()
45
+ const headerValue = entry.slice(separatorIndex + 1).trim()
46
+ if (!key || !headerValue) continue
47
+ headers[key] = headerValue
48
+ }
49
+ return headers
50
+ }
51
+
52
+ export function resolveOtelConfig(env: NodeJS.ProcessEnv = process.env): OTelConfig | null {
53
+ if (!parseBooleanFlag(env.OTEL_ENABLED)) return null
54
+
55
+ const tracesEndpoint = resolveOtelTracesEndpoint(env)
56
+ if (!tracesEndpoint) return null
57
+
58
+ const serviceName = cleanEnvValue(env.OTEL_SERVICE_NAME) || 'swarmclaw'
59
+ const headers = parseOtelHeaders(env.OTEL_EXPORTER_OTLP_TRACES_HEADERS || env.OTEL_EXPORTER_OTLP_HEADERS)
60
+
61
+ return {
62
+ enabled: true,
63
+ serviceName,
64
+ tracesEndpoint,
65
+ headers,
66
+ }
67
+ }
@@ -0,0 +1,52 @@
1
+ import {
2
+ trace,
3
+ SpanStatusCode,
4
+ type Attributes,
5
+ type AttributeValue,
6
+ type Span,
7
+ } from '@opentelemetry/api'
8
+ import { errorMessage } from '@/lib/shared-utils'
9
+
10
+ type SpanAttributeInput = Record<string, AttributeValue | null | undefined>
11
+
12
+ function sanitizeAttributes(attributes?: SpanAttributeInput): Attributes | undefined {
13
+ if (!attributes) return undefined
14
+ const cleaned: Attributes = {}
15
+ for (const [key, value] of Object.entries(attributes)) {
16
+ if (value === undefined || value === null) continue
17
+ cleaned[key] = value
18
+ }
19
+ return Object.keys(cleaned).length > 0 ? cleaned : undefined
20
+ }
21
+
22
+ export function setSpanAttributes(span: Span, attributes?: SpanAttributeInput): void {
23
+ const cleaned = sanitizeAttributes(attributes)
24
+ if (!cleaned) return
25
+ span.setAttributes(cleaned)
26
+ }
27
+
28
+ export function recordSpanError(span: Span, err: unknown): void {
29
+ span.recordException(err instanceof Error ? err : new Error(errorMessage(err)))
30
+ span.setStatus({
31
+ code: SpanStatusCode.ERROR,
32
+ message: errorMessage(err),
33
+ })
34
+ }
35
+
36
+ export async function withServerSpan<T>(
37
+ name: string,
38
+ attributes: SpanAttributeInput | undefined,
39
+ fn: (span: Span) => Promise<T> | T,
40
+ ): Promise<T> {
41
+ const tracer = trace.getTracer('swarmclaw.runtime')
42
+ return tracer.startActiveSpan(name, { attributes: sanitizeAttributes(attributes) }, async (span) => {
43
+ try {
44
+ return await fn(span)
45
+ } catch (err) {
46
+ recordSpanError(span, err)
47
+ throw err
48
+ } finally {
49
+ span.end()
50
+ }
51
+ })
52
+ }
@@ -0,0 +1,79 @@
1
+ import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
2
+ import { NodeSDK } from '@opentelemetry/sdk-node'
3
+ import { log } from '@/lib/server/logger'
4
+ import { hmrSingleton } from '@/lib/shared-utils'
5
+ import { resolveOtelConfig } from '@/lib/server/observability/otel-config'
6
+
7
/** Tag passed as the first argument to every log call in this module. */
const TAG = 'otel'

/** Lifecycle state of the tracing SDK, stored through `hmrSingleton`. */
interface OTelState {
  /** True once the SDK has been started successfully. */
  started: boolean
  /** Promise of a start attempt, or null when none is recorded. */
  startPromise: Promise<boolean> | null
  /** The running SDK instance, or null when tracing is not active. */
  sdk: NodeSDK | null
}

/** Builds the initial (not started) state. */
function createInitialOtelState(): OTelState {
  return {
    started: false,
    startPromise: null,
    sdk: null,
  }
}

const otelState = hmrSingleton<OTelState>('__swarmclaw_otel_state__', () => createInitialOtelState())
20
+
21
/**
 * Reports whether OTLP tracing is configured.
 *
 * @returns `true` when `resolveOtelConfig()` yields a configuration, `false`
 *   when it yields `null`.
 */
export function isOtelEnabled(): boolean {
  const resolved = resolveOtelConfig()
  return resolved !== null
}
24
+
25
/**
 * Starts the OTLP trace exporter and the Node SDK if tracing is configured
 * and not already running.
 *
 * Calls made while a start attempt is in flight share that attempt's promise.
 * A failed attempt is logged and reported as `false`; it is not cached, so a
 * later call tries again.
 *
 * @returns `true` when tracing is running after the call, `false` when
 *   tracing is not configured or initialization failed.
 */
export async function ensureOpenTelemetryStarted(): Promise<boolean> {
  const config = resolveOtelConfig()
  if (!config) return false
  if (otelState.started) return true
  if (otelState.startPromise) return otelState.startPromise

  const startPromise = (async (): Promise<boolean> => {
    try {
      // Keep an explicitly set service name; otherwise publish the resolved one.
      process.env.OTEL_SERVICE_NAME = process.env.OTEL_SERVICE_NAME || config.serviceName
      const exporter = new OTLPTraceExporter({
        url: config.tracesEndpoint,
        headers: Object.keys(config.headers).length > 0 ? config.headers : undefined,
      })
      const sdk = new NodeSDK({
        traceExporter: exporter,
      })
      sdk.start()
      otelState.sdk = sdk
      otelState.started = true
      log.info(TAG, 'OpenTelemetry OTLP tracing enabled', {
        serviceName: config.serviceName,
        tracesEndpoint: config.tracesEndpoint,
      })
      return true
    } catch (err) {
      otelState.sdk = null
      otelState.started = false
      log.error(TAG, 'Failed to initialize OpenTelemetry tracing', err)
      return false
    }
  })()

  // The async body above contains no `await`, so it has already run to
  // completion by the time this assignment happens. Clearing the in-flight
  // marker from a `finally` inside that body would therefore be overwritten
  // here, leaving a settled promise cached forever (and pinning a failed start
  // to `false`). Clear it after assignment instead, once the promise settles.
  otelState.startPromise = startPromise
  const clearInFlight = (): void => {
    // Only clear our own marker; a shutdown or newer attempt may have replaced it.
    if (otelState.startPromise === startPromise) otelState.startPromise = null
  }
  void startPromise.then(clearInFlight, clearInFlight)

  return startPromise
}
61
+
62
/**
 * Stops tracing and resets the shared state.
 *
 * The state is reset before the SDK shutdown is awaited. A shutdown failure is
 * logged as a warning and not rethrown.
 */
export async function shutdownOpenTelemetry(): Promise<void> {
  const activeSdk = otelState.sdk

  otelState.started = false
  otelState.startPromise = null
  if (!activeSdk) return

  otelState.sdk = null
  try {
    await activeSdk.shutdown()
  } catch (failure) {
    log.warn(TAG, 'Failed to flush OpenTelemetry tracing during shutdown', failure)
  }
}
@@ -39,6 +39,7 @@ import {
39
39
  syncProtocolParentFromChildRun,
40
40
  } from '@/lib/server/protocols/protocol-step-helpers'
41
41
  import { stepProtocolRun } from '@/lib/server/protocols/protocol-step-processors'
42
+ import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
42
43
 
43
44
  // ---- Singletons ----
44
45
 
@@ -308,79 +309,91 @@ export async function runProtocolRun(runId: string, deps?: ProtocolRunDeps): Pro
308
309
  return loadProtocolRunById(runId)
309
310
  }
310
311
  try {
311
- let run = loadProtocolRunById(runId)
312
- if (!run) return null
313
- if (run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed' || run.status === 'paused') return run
314
- run = persistRun({
315
- ...run,
316
- status: run.status === 'waiting' ? 'running' : run.status,
317
- waitingReason: null,
318
- pauseReason: null,
319
- lastError: null,
320
- startedAt: run.startedAt || now(deps),
321
- updatedAt: now(deps),
322
- })
323
- if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
312
+ return await withServerSpan('swarmclaw.protocol.run', {
313
+ 'swarmclaw.protocol.run_id': runId,
314
+ }, async (span) => {
315
+ let run = loadProtocolRunById(runId)
316
+ if (!run) return null
317
+ setSpanAttributes(span, {
318
+ 'swarmclaw.protocol.template_id': run.templateId,
319
+ 'swarmclaw.protocol.source_kind': run.sourceRef.kind,
320
+ 'swarmclaw.protocol.participant_count': run.participantAgentIds.length,
321
+ 'swarmclaw.protocol.status': run.status,
322
+ })
323
+ if (run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed' || run.status === 'paused') return run
324
+ run = persistRun({
325
+ ...run,
326
+ status: run.status === 'waiting' ? 'running' : run.status,
327
+ waitingReason: null,
328
+ pauseReason: null,
329
+ lastError: null,
330
+ startedAt: run.startedAt || now(deps),
331
+ updatedAt: now(deps),
332
+ })
333
+ if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
324
334
 
325
- const MAX_STEP_ITERATIONS = 500
326
- let stepIterations = 0
327
- while (run.status === 'running' || run.status === 'draft') {
328
- stepIterations++
329
- if (stepIterations > MAX_STEP_ITERATIONS) {
330
- run = persistRun({ ...run, status: 'failed', lastError: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}). Possible infinite loop in step graph.`, updatedAt: now(deps) })
331
- appendProtocolEvent(run.id, { type: 'failed', summary: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}).` }, deps)
332
- break
333
- }
334
- if (shouldYieldBetweenProtocolSteps(deps)) {
335
- // Yield between steps in the fire-and-forget runtime so I/O, HTTP responses,
336
- // and timers can run.
337
- await new Promise(r => setTimeout(r, 0))
338
- }
339
- const latest = loadProtocolRunById(run.id)
340
- if (!latest) return null
341
- if (latest.status === 'paused' || latest.status === 'cancelled' || latest.status === 'archived' || latest.status === 'completed') {
335
+ const MAX_STEP_ITERATIONS = 500
336
+ let stepIterations = 0
337
+ while (run.status === 'running' || run.status === 'draft') {
338
+ stepIterations++
339
+ if (stepIterations > MAX_STEP_ITERATIONS) {
340
+ run = persistRun({ ...run, status: 'failed', lastError: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}). Possible infinite loop in step graph.`, updatedAt: now(deps) })
341
+ appendProtocolEvent(run.id, { type: 'failed', summary: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}).` }, deps)
342
+ break
343
+ }
344
+ if (shouldYieldBetweenProtocolSteps(deps)) {
345
+ // Yield between steps in the fire-and-forget runtime so I/O, HTTP responses,
346
+ // and timers can run.
347
+ await new Promise(r => setTimeout(r, 0))
348
+ }
349
+ const latest = loadProtocolRunById(run.id)
350
+ if (!latest) return null
351
+ if (latest.status === 'paused' || latest.status === 'cancelled' || latest.status === 'archived' || latest.status === 'completed') {
352
+ run = latest
353
+ break
354
+ }
342
355
  run = latest
343
- break
344
- }
345
- run = latest
346
- renewProtocolLease(run.id)
356
+ renewProtocolLease(run.id)
347
357
 
348
- // DAG scheduler: compute step readiness before stepping
349
- const sched = computeStepReadiness(run.steps || [], run.entryStepId || null, run.stepState)
350
- if (sched.dagMode) {
351
- run = persistRun({
352
- ...run,
353
- stepState: sched.stepState,
354
- completedStepIds: sched.completedStepIds,
355
- runningStepIds: sched.runningStepIds,
356
- readyStepIds: sched.readyStepIds,
357
- failedStepIds: sched.failedStepIds,
358
- updatedAt: now(deps),
359
- })
360
- if (sched.readyStepIds.length === 0 && sched.runningStepIds.length === 0) {
361
- // No more work either all done or stuck
362
- const allSteps = run.steps || []
363
- const allCompleted = allSteps.every((s) => sched.stepState[s.id]?.status === 'completed')
364
- if (allCompleted) {
365
- run = completeProtocolRun(run, deps)
366
- } else {
367
- run = persistRun({ ...run, status: 'failed', lastError: 'DAG stuck: no ready steps and not all completed.', updatedAt: now(deps) })
368
- appendProtocolEvent(run.id, { type: 'failed', summary: 'DAG stuck: no ready steps and not all completed.' }, deps)
358
+ const sched = computeStepReadiness(run.steps || [], run.entryStepId || null, run.stepState)
359
+ if (sched.dagMode) {
360
+ run = persistRun({
361
+ ...run,
362
+ stepState: sched.stepState,
363
+ completedStepIds: sched.completedStepIds,
364
+ runningStepIds: sched.runningStepIds,
365
+ readyStepIds: sched.readyStepIds,
366
+ failedStepIds: sched.failedStepIds,
367
+ updatedAt: now(deps),
368
+ })
369
+ if (sched.readyStepIds.length === 0 && sched.runningStepIds.length === 0) {
370
+ const allSteps = run.steps || []
371
+ const allCompleted = allSteps.every((s) => sched.stepState[s.id]?.status === 'completed')
372
+ if (allCompleted) {
373
+ run = completeProtocolRun(run, deps)
374
+ } else {
375
+ run = persistRun({ ...run, status: 'failed', lastError: 'DAG stuck: no ready steps and not all completed.', updatedAt: now(deps) })
376
+ appendProtocolEvent(run.id, { type: 'failed', summary: 'DAG stuck: no ready steps and not all completed.' }, deps)
377
+ }
378
+ break
379
+ }
380
+ if (sched.readyStepIds.length > 0) {
381
+ const nextReadyId = sched.readyStepIds[0]
382
+ run = persistRun({ ...run, currentStepId: nextReadyId, updatedAt: now(deps) })
369
383
  }
370
- break
371
- }
372
- if (sched.readyStepIds.length > 0) {
373
- // Pick first ready step as currentStepId
374
- const nextReadyId = sched.readyStepIds[0]
375
- run = persistRun({ ...run, currentStepId: nextReadyId, updatedAt: now(deps) })
376
384
  }
377
- }
378
385
 
379
- run = await stepProtocolRun(run, deps)
380
- if (run.status === 'waiting' || run.status === 'paused' || run.status === 'failed' || run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed') break
381
- }
382
- if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
383
- return run
386
+ run = await stepProtocolRun(run, deps)
387
+ if (run.status === 'waiting' || run.status === 'paused' || run.status === 'failed' || run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed') break
388
+ }
389
+ setSpanAttributes(span, {
390
+ 'swarmclaw.protocol.step_iterations': stepIterations,
391
+ 'swarmclaw.protocol.status': run.status,
392
+ 'swarmclaw.protocol.current_step_id': run.currentStepId,
393
+ })
394
+ if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
395
+ return run
396
+ })
384
397
  } catch (err: unknown) {
385
398
  const failed = updateRun(runId, (current) => ({
386
399
  ...current,
@@ -58,6 +58,7 @@ import {
58
58
  isExternalExtensionId,
59
59
  splitCapabilityIds,
60
60
  } from '@/lib/capability-selection'
61
+ import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
61
62
 
62
63
  export type { ToolContext, SessionToolsResult }
63
64
  export { sweepOrphanedBrowsers, cleanupSessionBrowser, getActiveBrowserCount, hasActiveBrowser }
@@ -388,65 +389,80 @@ export async function buildSessionTools(cwd: string, enabledExtensions: string[]
388
389
  const schema = (candidate as unknown as { schema?: z.ZodTypeAny }).schema || z.object({}).passthrough()
389
390
  return tool(
390
391
  async (args) => {
391
- // Check abort before executing any tool — prevents wasted work after chat stop
392
- if (abortSignalRef.signal?.aborted) {
393
- throw new DOMException('Tool execution aborted', 'AbortError')
394
- }
395
- const normalizedArgs = normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)
396
- const hookSession = resolveCurrentSession() || buildFallbackHookSession()
397
- // Enforce file access policy before execution
398
- if (fileAccessPolicy) {
399
- const denial = enforceFileAccessPolicy(candidate.name, normalizedArgs, cwd, fileAccessPolicy)
400
- if (denial) return denial
401
- }
402
- let guardedArgs: Record<string, unknown> | null = normalizedArgs
403
- if (ctx?.beforeToolCall) {
404
- const guardResult = await ctx.beforeToolCall({
405
- session: hookSession,
406
- toolName: candidate.name,
407
- input: guardedArgs,
408
- runId: ctx.runId,
409
- })
410
- if (guardResult?.warning) {
411
- ctx.onToolCallWarning?.({
392
+ return withServerSpan('swarmclaw.tool.call', {
393
+ 'swarmclaw.tool.name': candidate.name,
394
+ 'swarmclaw.session.id': ctx?.sessionId || null,
395
+ 'swarmclaw.agent.id': ctx?.agentId || null,
396
+ 'swarmclaw.run.id': ctx?.runId || null,
397
+ }, async (span) => {
398
+ // Check abort before executing any tool — prevents wasted work after chat stop
399
+ if (abortSignalRef.signal?.aborted) {
400
+ setSpanAttributes(span, { 'swarmclaw.tool.aborted': true })
401
+ throw new DOMException('Tool execution aborted', 'AbortError')
402
+ }
403
+ const normalizedArgs = normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)
404
+ const hookSession = resolveCurrentSession() || buildFallbackHookSession()
405
+ if (fileAccessPolicy) {
406
+ const denial = enforceFileAccessPolicy(candidate.name, normalizedArgs, cwd, fileAccessPolicy)
407
+ if (denial) {
408
+ setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
409
+ return denial
410
+ }
411
+ }
412
+ let guardedArgs: Record<string, unknown> | null = normalizedArgs
413
+ if (ctx?.beforeToolCall) {
414
+ const guardResult = await ctx.beforeToolCall({
415
+ session: hookSession,
412
416
  toolName: candidate.name,
413
- message: guardResult.warning,
417
+ input: guardedArgs,
418
+ runId: ctx.runId,
414
419
  })
420
+ if (guardResult?.warning) {
421
+ ctx.onToolCallWarning?.({
422
+ toolName: candidate.name,
423
+ message: guardResult.warning,
424
+ })
425
+ }
426
+ if (typeof guardResult?.blockReason === 'string' && guardResult.blockReason.trim()) {
427
+ setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
428
+ throw new Error(guardResult.blockReason.trim())
429
+ }
430
+ if (guardResult && 'input' in guardResult) {
431
+ guardedArgs = guardResult.input === undefined ? guardedArgs : guardResult.input ?? null
432
+ }
415
433
  }
416
- if (typeof guardResult?.blockReason === 'string' && guardResult.blockReason.trim()) {
417
- throw new Error(guardResult.blockReason.trim())
434
+ const hookResult = await runCapabilityBeforeToolCall(
435
+ {
436
+ session: hookSession,
437
+ toolName: candidate.name,
438
+ input: guardedArgs,
439
+ runId: ctx?.runId || undefined,
440
+ },
441
+ { enabledIds: activeExtensions },
442
+ )
443
+ if (hookResult.warning) {
444
+ ctx?.onToolCallWarning?.({
445
+ toolName: candidate.name,
446
+ message: hookResult.warning,
447
+ })
418
448
  }
419
- if (guardResult && 'input' in guardResult) {
420
- guardedArgs = guardResult.input === undefined ? guardedArgs : guardResult.input ?? null
449
+ if (hookResult.blockReason) {
450
+ setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
451
+ throw new Error(hookResult.blockReason)
421
452
  }
422
- }
423
- const hookResult = await runCapabilityBeforeToolCall(
424
- {
425
- session: hookSession,
426
- toolName: candidate.name,
427
- input: guardedArgs,
428
- runId: ctx?.runId || undefined,
429
- },
430
- { enabledIds: activeExtensions },
431
- )
432
- if (hookResult.warning) {
433
- ctx?.onToolCallWarning?.({
434
- toolName: candidate.name,
435
- message: hookResult.warning,
453
+ const effectiveArgs = hookResult.input ?? guardedArgs
454
+ const result = await candidate.invoke(effectiveArgs ?? {})
455
+ const outputText = typeof result === 'string' ? result : JSON.stringify(result)
456
+ setSpanAttributes(span, {
457
+ 'swarmclaw.tool.output_bytes': Buffer.byteLength(outputText, 'utf-8'),
436
458
  })
437
- }
438
- if (hookResult.blockReason) {
439
- throw new Error(hookResult.blockReason)
440
- }
441
- const effectiveArgs = hookResult.input ?? guardedArgs
442
- const result = await candidate.invoke(effectiveArgs ?? {})
443
- const outputText = typeof result === 'string' ? result : JSON.stringify(result)
444
- await runCapabilityHook(
445
- 'afterToolExec',
446
- { session: hookSession, toolName: candidate.name, input: effectiveArgs, output: outputText },
447
- { enabledIds: activeExtensions },
448
- )
449
- return outputText
459
+ await runCapabilityHook(
460
+ 'afterToolExec',
461
+ { session: hookSession, toolName: candidate.name, input: effectiveArgs, output: outputText },
462
+ { enabledIds: activeExtensions },
463
+ )
464
+ return outputText
465
+ })
450
466
  },
451
467
  {
452
468
  name: candidate.name,
package/src/proxy.ts CHANGED
@@ -91,6 +91,7 @@ export function proxy(request: NextRequest) {
91
91
  if (
92
92
  !pathname.startsWith('/api/')
93
93
  || pathname === '/api/auth'
94
+ || pathname === '/api/healthz'
94
95
  || isWebhookTrigger
95
96
  || isConnectorWebhook
96
97
  ) {