@swarmclawai/swarmclaw 1.5.1 → 1.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -0
- package/package.json +6 -3
- package/src/app/api/healthz/route.test.ts +14 -0
- package/src/app/api/healthz/route.ts +9 -0
- package/src/cli/index.js +7 -0
- package/src/instrumentation.ts +7 -0
- package/src/lib/providers/copilot-cli.ts +30 -9
- package/src/lib/server/chat-execution/chat-execution.ts +106 -68
- package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +57 -18
- package/src/lib/server/observability/otel-config.test.ts +62 -0
- package/src/lib/server/observability/otel-config.ts +67 -0
- package/src/lib/server/observability/otel-tracing.ts +52 -0
- package/src/lib/server/observability/otel.ts +79 -0
- package/src/lib/server/protocols/protocol-run-lifecycle.ts +80 -67
- package/src/lib/server/session-tools/index.ts +69 -53
- package/src/proxy.ts +1 -0
package/README.md
CHANGED
|
@@ -15,6 +15,46 @@ Docs: https://swarmclaw.ai/docs
|
|
|
15
15
|
Website: https://swarmclaw.ai
|
|
16
16
|
Extension tutorial: https://swarmclaw.ai/docs/extension-tutorial
|
|
17
17
|
|
|
18
|
+
## Hosted Deploys
|
|
19
|
+
|
|
20
|
+
SwarmClaw now ships provider-ready deploy files at the repo root:
|
|
21
|
+
|
|
22
|
+
- `render.yaml` for Render Blueprint deploys from the public GHCR image
|
|
23
|
+
- `fly.toml` for Fly.io image-backed deploys
|
|
24
|
+
- `railway.json` for Railway-aligned health and restart defaults
|
|
25
|
+
|
|
26
|
+
The published image is:
|
|
27
|
+
|
|
28
|
+
```text
|
|
29
|
+
ghcr.io/swarmclawai/swarmclaw:latest
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Hosted deployments should:
|
|
33
|
+
|
|
34
|
+
- mount persistent storage at `/app/data`
|
|
35
|
+
- manage secrets through the provider dashboard
|
|
36
|
+
- set `ACCESS_KEY` and `CREDENTIAL_SECRET`
|
|
37
|
+
- point health checks at `/api/healthz`
|
|
38
|
+
|
|
39
|
+
Full hosted deployment guides live at https://swarmclaw.ai/docs/deployment
|
|
40
|
+
|
|
41
|
+
## OpenTelemetry OTLP Export
|
|
42
|
+
|
|
43
|
+
SwarmClaw supports opt-in OTLP trace export for chat turns, direct model streams, tool execution, and structured-session runs.
|
|
44
|
+
|
|
45
|
+
Minimal configuration:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
OTEL_ENABLED=true
|
|
49
|
+
OTEL_SERVICE_NAME=swarmclaw
|
|
50
|
+
OTEL_EXPORTER_OTLP_ENDPOINT=https://your-collector:4318
|
|
51
|
+
OTEL_EXPORTER_OTLP_HEADERS=Authorization=Bearer your-token
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
If you need a trace-specific endpoint, set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` directly instead.
|
|
55
|
+
|
|
56
|
+
Operational docs: https://swarmclaw.ai/docs/observability
|
|
57
|
+
|
|
18
58
|
## Screenshots
|
|
19
59
|
|
|
20
60
|
<table>
|
|
@@ -215,6 +255,17 @@ SwarmClaw agents can join [SwarmFeed](https://swarmfeed.ai) — a social network
|
|
|
215
255
|
|
|
216
256
|
Read the docs at [swarmclaw.ai/docs/swarmfeed](https://swarmclaw.ai/docs/swarmfeed) and visit [swarmfeed.ai](https://swarmfeed.ai) for the platform itself.
|
|
217
257
|
|
|
258
|
+
### v1.5.3 Highlights
|
|
259
|
+
|
|
260
|
+
- **Copilot CLI v1.x compatibility**: the `copilot-cli` provider now handles the current event format (`assistant.message_delta`, `assistant.message`, updated `result` payload) while keeping backward compatibility with the legacy format. Also fixes `--resume` flag syntax. (Community contribution by [@borislavnnikolov](https://github.com/borislavnnikolov) -- PR #36)
|
|
261
|
+
|
|
262
|
+
### v1.5.2 Highlights
|
|
263
|
+
|
|
264
|
+
- **Hosted deploy path for SwarmClaw itself**: added root-level `render.yaml`, `fly.toml`, and `railway.json` so the published `ghcr.io/swarmclawai/swarmclaw:latest` image is easier to run on always-on platforms.
|
|
265
|
+
- **Public health endpoint for hosted platforms**: added `/api/healthz` and exempted it from access-key auth so Render, Fly.io, and Railway can perform liveness checks without weakening the rest of the API surface.
|
|
266
|
+
- **OTLP/OpenTelemetry foundation**: SwarmClaw can now export traces for chat turns, direct model streams, protocol runs, and tool execution to any OTLP-compatible backend using environment variables only.
|
|
267
|
+
- **Docs and landing-page deploy refresh**: `swarmclaw.ai` now exposes the hosted deploy path and a dedicated observability guide instead of burying those operator workflows in general setup docs.
|
|
268
|
+
|
|
218
269
|
### v1.5.1 Highlights
|
|
219
270
|
|
|
220
271
|
- **Standalone connector lifecycle**: connector start, stop, status, and repair now work correctly in standalone production builds (`npm start` / pm2) where the daemon runs in-process. Previously these operations silently failed because the controller assumed a daemon subprocess was always present. (Community contribution by [@borislavnnikolov](https://github.com/borislavnnikolov) -- PR #35)
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.5.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.5.3",
|
|
4
|
+
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"publishConfig": {
|
|
7
7
|
"access": "public",
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
"test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
|
|
75
75
|
"test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts",
|
|
76
76
|
"test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
|
|
77
|
-
"test:runtime": "tsx --test src/lib/server/knowledge-sources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/safe-parse-body.test.ts src/app/api/approvals/route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/logs/route.test.ts src/app/api/tts/route.test.ts",
|
|
77
|
+
"test:runtime": "tsx --test src/lib/server/knowledge-sources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/app/api/approvals/route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/tts/route.test.ts",
|
|
78
78
|
"test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
|
|
79
79
|
"test:e2e": "tsx .workbench/browser-e2e/run.ts",
|
|
80
80
|
"test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
|
|
@@ -88,6 +88,9 @@
|
|
|
88
88
|
"@langchain/langgraph": "^1.2.2",
|
|
89
89
|
"@langchain/openai": "^1.2.8",
|
|
90
90
|
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
91
|
+
"@opentelemetry/api": "^1.9.1",
|
|
92
|
+
"@opentelemetry/exporter-trace-otlp-http": "^0.214.0",
|
|
93
|
+
"@opentelemetry/sdk-node": "^0.214.0",
|
|
91
94
|
"@multiavatar/multiavatar": "^1.0.7",
|
|
92
95
|
"@playwright/mcp": "^0.0.68",
|
|
93
96
|
"@slack/bolt": "^4.6.0",
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { test } from 'node:test'
|
|
3
|
+
|
|
4
|
+
import { GET } from '@/app/api/healthz/route'
|
|
5
|
+
|
|
6
|
+
// Contract check for the liveness route: HTTP 200 plus a JSON body carrying
// `ok`, the `service` name, and a numeric `time` field.
test('GET /api/healthz returns an ok payload', async () => {
  const res = await GET()
  assert.equal(res.status, 200)

  const body = await res.json()
  const { ok, service, time } = body
  assert.equal(ok, true)
  assert.equal(service, 'swarmclaw')
  assert.equal(typeof time, 'number')
})
|
package/src/cli/index.js
CHANGED
|
@@ -734,6 +734,13 @@ const COMMAND_GROUPS = [
|
|
|
734
734
|
cmd('get', 'GET', '/system/status', 'Get system health summary (safe for external monitors)'),
|
|
735
735
|
],
|
|
736
736
|
},
|
|
737
|
+
{
|
|
738
|
+
name: 'healthz',
|
|
739
|
+
description: 'Public liveness probe',
|
|
740
|
+
commands: [
|
|
741
|
+
cmd('get', 'GET', '/healthz', 'Get public health check payload'),
|
|
742
|
+
],
|
|
743
|
+
},
|
|
737
744
|
{
|
|
738
745
|
name: 'usage',
|
|
739
746
|
description: 'Usage and cost summary',
|
package/src/instrumentation.ts
CHANGED
|
@@ -5,9 +5,11 @@ const TAG = 'instrumentation'
|
|
|
5
5
|
export async function register() {
|
|
6
6
|
if (process.env.NEXT_RUNTIME === 'nodejs') {
|
|
7
7
|
const { log } = await import('@/lib/server/logger')
|
|
8
|
+
const { ensureOpenTelemetryStarted, shutdownOpenTelemetry } = await import('@/lib/server/observability/otel')
|
|
8
9
|
const isWorkerOnly = process.env.SWARMCLAW_WORKER_ONLY === '1'
|
|
9
10
|
const { initWsServer, closeWsServer } = await import('./lib/server/ws-hub')
|
|
10
11
|
const { ensureDaemonStarted } = await import('@/lib/server/runtime/daemon-state')
|
|
12
|
+
await ensureOpenTelemetryStarted()
|
|
11
13
|
|
|
12
14
|
// One-time migration: backfill allKnownPeerIds on existing connector sessions
|
|
13
15
|
try {
|
|
@@ -44,6 +46,11 @@ export async function register() {
|
|
|
44
46
|
} catch (err) {
|
|
45
47
|
log.error(TAG, 'Failed to stop daemon during shutdown:', err)
|
|
46
48
|
}
|
|
49
|
+
try {
|
|
50
|
+
await shutdownOpenTelemetry()
|
|
51
|
+
} catch (err) {
|
|
52
|
+
log.error(TAG, 'Failed to stop OpenTelemetry during shutdown:', err)
|
|
53
|
+
}
|
|
47
54
|
if (!isWorkerOnly) {
|
|
48
55
|
await closeWsServer()
|
|
49
56
|
}
|
|
@@ -46,7 +46,7 @@ export function streamCopilotCliChat({ session, message, imagePath, systemPrompt
|
|
|
46
46
|
const prompt = promptParts.join('\n\n')
|
|
47
47
|
|
|
48
48
|
const args = ['-p', prompt, '--output-format=json', '-s', '--yolo']
|
|
49
|
-
if (session.copilotSessionId) args.push(
|
|
49
|
+
if (session.copilotSessionId) args.push(`--resume=${session.copilotSessionId}`)
|
|
50
50
|
if (session.model) args.push('--model', session.model)
|
|
51
51
|
|
|
52
52
|
// System prompt: write temp AGENTS.override.md in a temp config dir
|
|
@@ -106,14 +106,35 @@ export function streamCopilotCliChat({ session, message, imagePath, systemPrompt
|
|
|
106
106
|
const ev = JSON.parse(line) as Record<string, unknown>
|
|
107
107
|
eventCount++
|
|
108
108
|
|
|
109
|
-
|
|
109
|
+
const data = ev.data as Record<string, unknown> | undefined
|
|
110
|
+
|
|
111
|
+
// Capture session ID — legacy 'init' event or modern 'result' event
|
|
110
112
|
if (ev.type === 'init' && typeof ev.session_id === 'string') {
|
|
111
113
|
session.copilotSessionId = ev.session_id
|
|
112
|
-
log.info('copilot-cli', `Got session_id: ${ev.session_id}`)
|
|
114
|
+
log.info('copilot-cli', `Got session_id (init): ${ev.session_id}`)
|
|
115
|
+
} else if (ev.type === 'result' && typeof ev.sessionId === 'string') {
|
|
116
|
+
session.copilotSessionId = ev.sessionId
|
|
117
|
+
log.info('copilot-cli', `Got session_id (result): ${ev.sessionId}`)
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Modern format: streaming delta — assistant.message_delta { data: { deltaContent } }
|
|
121
|
+
if (ev.type === 'assistant.message_delta' && typeof data?.deltaContent === 'string') {
|
|
122
|
+
fullResponse += data.deltaContent
|
|
123
|
+
write(`data: ${JSON.stringify({ t: 'd', text: data.deltaContent })}\n\n`)
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Modern format: full assistant message — assistant.message { data: { content } }
|
|
127
|
+
else if (ev.type === 'assistant.message' && typeof data?.content === 'string') {
|
|
128
|
+
// Only emit as final result if we haven't been streaming deltas
|
|
129
|
+
if (!fullResponse) {
|
|
130
|
+
fullResponse = data.content
|
|
131
|
+
write(`data: ${JSON.stringify({ t: 'r', text: data.content })}\n\n`)
|
|
132
|
+
}
|
|
133
|
+
log.debug('copilot-cli', `Assistant message (${data.content.length} chars)`)
|
|
113
134
|
}
|
|
114
135
|
|
|
115
|
-
//
|
|
116
|
-
if (ev.type === 'content_block_delta') {
|
|
136
|
+
// Legacy: streaming text deltas — content_block_delta { delta: { text } }
|
|
137
|
+
else if (ev.type === 'content_block_delta') {
|
|
117
138
|
const delta = ev.delta as Record<string, unknown> | undefined
|
|
118
139
|
if (typeof delta?.text === 'string') {
|
|
119
140
|
fullResponse += delta.text
|
|
@@ -121,19 +142,19 @@ export function streamCopilotCliChat({ session, message, imagePath, systemPrompt
|
|
|
121
142
|
}
|
|
122
143
|
}
|
|
123
144
|
|
|
124
|
-
//
|
|
145
|
+
// Legacy: agent message chunks (ACP format)
|
|
125
146
|
else if (ev.type === 'agent_message_chunk' && typeof ev.text === 'string') {
|
|
126
147
|
fullResponse += ev.text
|
|
127
148
|
write(`data: ${JSON.stringify({ t: 'd', text: ev.text })}\n\n`)
|
|
128
149
|
}
|
|
129
150
|
|
|
130
|
-
//
|
|
151
|
+
// Legacy: assistant message content
|
|
131
152
|
else if (ev.type === 'message' && ev.role === 'assistant' && typeof ev.content === 'string') {
|
|
132
153
|
fullResponse += ev.content
|
|
133
154
|
write(`data: ${JSON.stringify({ t: 'd', text: ev.content })}\n\n`)
|
|
134
155
|
}
|
|
135
156
|
|
|
136
|
-
//
|
|
157
|
+
// Legacy: completed item with agent_message
|
|
137
158
|
else if (ev.type === 'item.completed' && (ev.item as Record<string, unknown>)?.type === 'agent_message') {
|
|
138
159
|
const item = ev.item as Record<string, unknown>
|
|
139
160
|
if (typeof item.text === 'string') {
|
|
@@ -143,7 +164,7 @@ export function streamCopilotCliChat({ session, message, imagePath, systemPrompt
|
|
|
143
164
|
}
|
|
144
165
|
}
|
|
145
166
|
|
|
146
|
-
//
|
|
167
|
+
// Legacy: final result with string result field
|
|
147
168
|
else if (ev.type === 'result' && typeof ev.result === 'string') {
|
|
148
169
|
fullResponse = ev.result
|
|
149
170
|
write(`data: ${JSON.stringify({ t: 'r', text: ev.result })}\n\n`)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { ExecuteChatTurnInput, ExecuteChatTurnResult } from './chat-execution-types'
|
|
2
2
|
import { perf } from '@/lib/server/runtime/perf'
|
|
3
|
+
import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
|
|
3
4
|
import { markProviderSuccess } from '@/lib/server/provider-health'
|
|
4
5
|
import { executePreparedChatTurn } from '@/lib/server/chat-execution/chat-turn-stream-execution'
|
|
5
6
|
import { finalizeChatTurn } from '@/lib/server/chat-execution/chat-turn-finalization'
|
|
@@ -50,84 +51,121 @@ export async function executeSessionChatTurn(input: ExecuteChatTurnInput): Promi
|
|
|
50
51
|
sessionId,
|
|
51
52
|
source = 'chat',
|
|
52
53
|
} = input
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
54
|
+
return withServerSpan('swarmclaw.chat.turn', {
|
|
55
|
+
'swarmclaw.session.id': sessionId,
|
|
56
|
+
'swarmclaw.chat.source': source,
|
|
57
|
+
'swarmclaw.chat.has_image': Boolean(input.imagePath || input.imageUrl),
|
|
58
|
+
'swarmclaw.chat.attached_file_count': input.attachedFiles?.length || 0,
|
|
59
|
+
}, async (span) => {
|
|
60
|
+
const endTurnPerf = perf.start('chat-execution', 'executeSessionChatTurn', { sessionId, source })
|
|
61
|
+
const preparedTurn = await prepareChatTurn(input)
|
|
62
|
+
if (preparedTurn.kind === 'blocked') {
|
|
63
|
+
const result = await completeBlockedChatTurn(preparedTurn)
|
|
64
|
+
setSpanAttributes(span, {
|
|
65
|
+
'swarmclaw.chat.blocked': true,
|
|
66
|
+
'swarmclaw.chat.tool_event_count': result.toolEvents.length,
|
|
67
|
+
'swarmclaw.chat.error': Boolean(result.error),
|
|
68
|
+
'gen_ai.usage.input_tokens': result.inputTokens || 0,
|
|
69
|
+
'gen_ai.usage.output_tokens': result.outputTokens || 0,
|
|
70
|
+
})
|
|
71
|
+
endTurnPerf({
|
|
72
|
+
durationMs: 0,
|
|
73
|
+
toolEventCount: result.toolEvents.length,
|
|
74
|
+
inputTokens: result.inputTokens || 0,
|
|
75
|
+
outputTokens: result.outputTokens || 0,
|
|
76
|
+
error: !!result.error,
|
|
77
|
+
})
|
|
78
|
+
return result
|
|
79
|
+
}
|
|
66
80
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
81
|
+
setSpanAttributes(span, {
|
|
82
|
+
'swarmclaw.chat.blocked': false,
|
|
83
|
+
'swarmclaw.chat.agentic': preparedTurn.hasExtensions,
|
|
84
|
+
'swarmclaw.chat.provider': preparedTurn.providerType,
|
|
85
|
+
'gen_ai.request.model': preparedTurn.sessionForRun.model,
|
|
86
|
+
})
|
|
71
87
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
})
|
|
88
|
+
const partialPersistence = createPartialAssistantPersistence({
|
|
89
|
+
prepared: preparedTurn,
|
|
90
|
+
onEvent: input.onEvent,
|
|
91
|
+
})
|
|
77
92
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
endTurnPerf({
|
|
83
|
-
durationMs: 0,
|
|
84
|
-
toolEventCount: preflight.terminalResult.toolEvents.length,
|
|
85
|
-
inputTokens: preflight.terminalResult.inputTokens || 0,
|
|
86
|
-
outputTokens: preflight.terminalResult.outputTokens || 0,
|
|
87
|
-
error: !!preflight.terminalResult.error,
|
|
93
|
+
const preflight = await runChatTurnPreflight({
|
|
94
|
+
prepared: preparedTurn,
|
|
95
|
+
emit: partialPersistence.emit,
|
|
96
|
+
toolEvents: partialPersistence.getToolEvents(),
|
|
88
97
|
})
|
|
89
|
-
return preflight.terminalResult
|
|
90
|
-
}
|
|
91
98
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
99
|
+
if (preflight?.terminalResult) {
|
|
100
|
+
if (preflight.terminalResult.text) input.onEvent?.({ t: 'd', text: preflight.terminalResult.text })
|
|
101
|
+
partialPersistence.stop()
|
|
102
|
+
await partialPersistence.awaitIdle()
|
|
103
|
+
setSpanAttributes(span, {
|
|
104
|
+
'swarmclaw.chat.preflight_terminal': true,
|
|
105
|
+
'swarmclaw.chat.tool_event_count': preflight.terminalResult.toolEvents.length,
|
|
106
|
+
'swarmclaw.chat.error': Boolean(preflight.terminalResult.error),
|
|
107
|
+
'gen_ai.usage.input_tokens': preflight.terminalResult.inputTokens || 0,
|
|
108
|
+
'gen_ai.usage.output_tokens': preflight.terminalResult.outputTokens || 0,
|
|
109
|
+
})
|
|
110
|
+
endTurnPerf({
|
|
111
|
+
durationMs: 0,
|
|
112
|
+
toolEventCount: preflight.terminalResult.toolEvents.length,
|
|
113
|
+
inputTokens: preflight.terminalResult.inputTokens || 0,
|
|
114
|
+
outputTokens: preflight.terminalResult.outputTokens || 0,
|
|
115
|
+
error: !!preflight.terminalResult.error,
|
|
116
|
+
})
|
|
117
|
+
return preflight.terminalResult
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
let streamResult: Awaited<ReturnType<typeof executePreparedChatTurn>>
|
|
121
|
+
try {
|
|
122
|
+
streamResult = await executePreparedChatTurn({
|
|
123
|
+
input,
|
|
124
|
+
prepared: preparedTurn,
|
|
125
|
+
partialPersistence,
|
|
126
|
+
preflightToolRoutingResult: preflight?.directMemoryResult || null,
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
await partialPersistence.awaitIdle()
|
|
130
|
+
} finally {
|
|
131
|
+
partialPersistence.stop()
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
if (!streamResult.errorMessage) {
|
|
135
|
+
markProviderSuccess(preparedTurn.providerType, preparedTurn.sessionForRun.credentialId)
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
const result = await finalizeChatTurn({
|
|
95
139
|
input,
|
|
96
140
|
prepared: preparedTurn,
|
|
97
141
|
partialPersistence,
|
|
98
|
-
|
|
142
|
+
fullResponse: streamResult.fullResponse,
|
|
143
|
+
errorMessage: streamResult.errorMessage,
|
|
144
|
+
initialToolRoutingResult: streamResult.toolRoutingResult,
|
|
145
|
+
responseCacheHit: streamResult.responseCacheHit,
|
|
146
|
+
directUsage: streamResult.directUsage,
|
|
147
|
+
durationMs: streamResult.durationMs,
|
|
148
|
+
knowledgeRetrievalTrace: streamResult.knowledgeRetrievalTrace || null,
|
|
149
|
+
emit: partialPersistence.emit,
|
|
99
150
|
})
|
|
100
151
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
responseCacheHit: streamResult.responseCacheHit,
|
|
118
|
-
directUsage: streamResult.directUsage,
|
|
119
|
-
durationMs: streamResult.durationMs,
|
|
120
|
-
knowledgeRetrievalTrace: streamResult.knowledgeRetrievalTrace || null,
|
|
121
|
-
emit: partialPersistence.emit,
|
|
122
|
-
})
|
|
152
|
+
setSpanAttributes(span, {
|
|
153
|
+
'swarmclaw.chat.cache_hit': streamResult.responseCacheHit,
|
|
154
|
+
'swarmclaw.chat.tool_event_count': result.toolEvents.length,
|
|
155
|
+
'swarmclaw.chat.error': Boolean(result.error),
|
|
156
|
+
'swarmclaw.chat.estimated_cost': result.estimatedCost ?? 0,
|
|
157
|
+
'swarmclaw.chat.has_retrieval_trace': Boolean(result.retrievalTrace),
|
|
158
|
+
'gen_ai.usage.input_tokens': result.inputTokens || 0,
|
|
159
|
+
'gen_ai.usage.output_tokens': result.outputTokens || 0,
|
|
160
|
+
})
|
|
161
|
+
endTurnPerf({
|
|
162
|
+
durationMs: streamResult.durationMs,
|
|
163
|
+
toolEventCount: result.toolEvents.length,
|
|
164
|
+
inputTokens: result.inputTokens || 0,
|
|
165
|
+
outputTokens: result.outputTokens || 0,
|
|
166
|
+
error: !!result.error,
|
|
167
|
+
})
|
|
123
168
|
|
|
124
|
-
|
|
125
|
-
durationMs: streamResult.durationMs,
|
|
126
|
-
toolEventCount: result.toolEvents.length,
|
|
127
|
-
inputTokens: result.inputTokens || 0,
|
|
128
|
-
outputTokens: result.outputTokens || 0,
|
|
129
|
-
error: !!result.error,
|
|
169
|
+
return result
|
|
130
170
|
})
|
|
131
|
-
|
|
132
|
-
return result
|
|
133
171
|
}
|
|
@@ -22,6 +22,7 @@ import {
|
|
|
22
22
|
import { perf } from '@/lib/server/runtime/perf'
|
|
23
23
|
import { getSessionMessages } from '@/lib/server/sessions/session-repository'
|
|
24
24
|
import { notify } from '@/lib/server/ws-hub'
|
|
25
|
+
import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
|
|
25
26
|
import { errorMessage as toErrorMessage } from '@/lib/shared-utils'
|
|
26
27
|
|
|
27
28
|
import type { ExecuteChatTurnInput } from './chat-execution-types'
|
|
@@ -142,22 +143,34 @@ export async function executePreparedChatTurn(params: {
|
|
|
142
143
|
)
|
|
143
144
|
|
|
144
145
|
if (hasExtensions) {
|
|
145
|
-
const result = await
|
|
146
|
-
session:
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
146
|
+
const result = await withServerSpan('swarmclaw.chat.agentic_stream', {
|
|
147
|
+
'swarmclaw.session.id': sessionId,
|
|
148
|
+
'swarmclaw.chat.source': source,
|
|
149
|
+
'swarmclaw.chat.provider': providerType,
|
|
150
|
+
'gen_ai.request.model': sessionForRun.model,
|
|
151
|
+
}, async (span) => {
|
|
152
|
+
const agenticResult = await streamAgentChat({
|
|
153
|
+
session: sessionForRun,
|
|
154
|
+
message: effectiveMessage,
|
|
155
|
+
imagePath: resolvedImagePath,
|
|
156
|
+
imageUrl,
|
|
157
|
+
attachedFiles,
|
|
158
|
+
apiKey,
|
|
159
|
+
systemPrompt,
|
|
160
|
+
executionBrief,
|
|
161
|
+
extraSystemContext: [executionBriefContextBlock].filter((value): value is string => typeof value === 'string' && value.trim().length > 0),
|
|
162
|
+
write: (raw) => parseAndEmit(raw),
|
|
163
|
+
history: heartbeatHistory ?? applyContextClearBoundary(getSessionMessages(sessionId)),
|
|
164
|
+
signal: abortController.signal,
|
|
165
|
+
source,
|
|
166
|
+
classification,
|
|
167
|
+
promptMode,
|
|
168
|
+
})
|
|
169
|
+
setSpanAttributes(span, {
|
|
170
|
+
'swarmclaw.chat.tool_event_count': agenticResult.toolEvents.length,
|
|
171
|
+
'swarmclaw.chat.has_retrieval_trace': Boolean(agenticResult.knowledgeRetrievalTrace),
|
|
172
|
+
})
|
|
173
|
+
return agenticResult
|
|
161
174
|
})
|
|
162
175
|
fullResponse = result.finalResponse || result.fullText
|
|
163
176
|
knowledgeRetrievalTrace = result.knowledgeRetrievalTrace || null
|
|
@@ -232,7 +245,20 @@ export async function executePreparedChatTurn(params: {
|
|
|
232
245
|
signal: abortController.signal,
|
|
233
246
|
})
|
|
234
247
|
try {
|
|
235
|
-
fullResponse = await
|
|
248
|
+
fullResponse = await withServerSpan('swarmclaw.chat.model_stream', {
|
|
249
|
+
'swarmclaw.session.id': sessionId,
|
|
250
|
+
'swarmclaw.chat.source': source,
|
|
251
|
+
'swarmclaw.chat.provider': providerType,
|
|
252
|
+
'gen_ai.request.model': sessionForRun.model,
|
|
253
|
+
}, async (span) => {
|
|
254
|
+
const response = await doStreamChat()
|
|
255
|
+
setSpanAttributes(span, {
|
|
256
|
+
'gen_ai.usage.input_tokens': directUsage.inputTokens || 0,
|
|
257
|
+
'gen_ai.usage.output_tokens': directUsage.outputTokens || 0,
|
|
258
|
+
'swarmclaw.chat.response_cacheable': canUseResponseCache,
|
|
259
|
+
})
|
|
260
|
+
return response
|
|
261
|
+
})
|
|
236
262
|
} catch (streamErr: unknown) {
|
|
237
263
|
const streamErrMsg = toErrorMessage(streamErr)
|
|
238
264
|
const streamStatus = (streamErr as Record<string, unknown>)?.status
|
|
@@ -243,7 +269,20 @@ export async function executePreparedChatTurn(params: {
|
|
|
243
269
|
historyLen: directHistorySnapshot.length,
|
|
244
270
|
})
|
|
245
271
|
directHistorySnapshot = directHistorySnapshot.slice(-10)
|
|
246
|
-
fullResponse = await
|
|
272
|
+
fullResponse = await withServerSpan('swarmclaw.chat.model_stream.retry', {
|
|
273
|
+
'swarmclaw.session.id': sessionId,
|
|
274
|
+
'swarmclaw.chat.source': source,
|
|
275
|
+
'swarmclaw.chat.provider': providerType,
|
|
276
|
+
'gen_ai.request.model': sessionForRun.model,
|
|
277
|
+
'swarmclaw.chat.retry_reason': 'context_overflow',
|
|
278
|
+
}, async (span) => {
|
|
279
|
+
const response = await doStreamChat()
|
|
280
|
+
setSpanAttributes(span, {
|
|
281
|
+
'gen_ai.usage.input_tokens': directUsage.inputTokens || 0,
|
|
282
|
+
'gen_ai.usage.output_tokens': directUsage.outputTokens || 0,
|
|
283
|
+
})
|
|
284
|
+
return response
|
|
285
|
+
})
|
|
247
286
|
} else {
|
|
248
287
|
throw streamErr
|
|
249
288
|
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { describe, it } from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
parseOtelHeaders,
|
|
6
|
+
resolveOtelConfig,
|
|
7
|
+
resolveOtelTracesEndpoint,
|
|
8
|
+
} from '@/lib/server/observability/otel-config'
|
|
9
|
+
|
|
10
|
+
/**
 * Build a minimal fake process environment for a test case.
 *
 * @param overrides Variables to layer on top of the baseline; they win on key clashes.
 * @returns A fresh environment object containing `NODE_ENV: 'test'` plus the overrides.
 */
function env(overrides: Record<string, string>): NodeJS.ProcessEnv {
  const baseline: NodeJS.ProcessEnv = { NODE_ENV: 'test' }
  return { ...baseline, ...overrides }
}
|
|
16
|
+
|
|
17
|
+
// Exercises the env-driven OTLP configuration helpers: the enable switch,
// endpoint normalization and precedence, header parsing, and the default service name.
describe('otel config', () => {
  const BASE_ENDPOINT = 'https://collector.example.com:4318'

  it('stays disabled unless OTEL_ENABLED is truthy', () => {
    // An endpoint on its own must not turn tracing on.
    const config = resolveOtelConfig(env({ OTEL_EXPORTER_OTLP_ENDPOINT: 'http://localhost:4318' }))
    assert.equal(config, null)
  })

  it('normalizes a base OTLP endpoint to the traces path', () => {
    const resolved = resolveOtelTracesEndpoint(env({ OTEL_EXPORTER_OTLP_ENDPOINT: BASE_ENDPOINT }))
    assert.equal(resolved, 'https://collector.example.com:4318/v1/traces')
  })

  it('prefers an explicit OTLP traces endpoint', () => {
    const explicitEndpoint = 'https://collector.example.com/custom/traces'
    const resolved = resolveOtelTracesEndpoint(env({
      OTEL_EXPORTER_OTLP_ENDPOINT: BASE_ENDPOINT,
      OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: explicitEndpoint,
    }))
    assert.equal(resolved, explicitEndpoint)
  })

  it('parses OTLP headers and applies the default service name', () => {
    const config = resolveOtelConfig(env({
      OTEL_ENABLED: 'true',
      OTEL_EXPORTER_OTLP_ENDPOINT: BASE_ENDPOINT,
      OTEL_EXPORTER_OTLP_HEADERS: 'Authorization=Bearer token, X-Team = swarm ',
    }))

    // Narrow away `null` before inspecting the resolved fields.
    assert.ok(config)
    const { serviceName, headers, tracesEndpoint } = config
    assert.equal(serviceName, 'swarmclaw')
    assert.deepEqual(headers, {
      Authorization: 'Bearer token',
      'X-Team': 'swarm',
    })
    assert.equal(tracesEndpoint, 'https://collector.example.com:4318/v1/traces')
  })

  it('ignores malformed header entries', () => {
    const parsed = parseOtelHeaders('good=value, broken, =oops, missing=')
    assert.deepEqual(parsed, { good: 'value' })
  })
})
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
 * Resolved OpenTelemetry trace-export settings.
 *
 * Only produced when export is switched on and an endpoint is known;
 * `resolveOtelConfig` returns `null` otherwise.
 */
export interface OTelConfig {
  /** Always `true`: the disabled state is represented by `null`, not by a flag. */
  enabled: true
  /** Service name reported with traces (`OTEL_SERVICE_NAME`, defaulting to `swarmclaw`). */
  serviceName: string
  /** URL that trace data is exported to. */
  tracesEndpoint: string
  /** Extra headers parsed from the OTLP headers environment variables. */
  headers: Record<string, string>
}
|
|
7
|
+
|
|
8
|
+
/**
 * Interpret an environment value as an on/off switch.
 *
 * @param value Raw environment value, or undefined when unset.
 * @returns `true` only for `1`, `true`, `yes` or `on` (case-insensitive, surrounding
 *   whitespace ignored); `false` for anything else, including unset.
 */
function parseBooleanFlag(value: string | undefined): boolean {
  if (typeof value !== 'string') return false
  const truthySpellings = ['1', 'true', 'yes', 'on']
  return truthySpellings.includes(value.trim().toLowerCase())
}
|
|
13
|
+
|
|
14
|
+
/**
 * Normalize an environment value by trimming it.
 *
 * @param value Raw environment value, or undefined when unset.
 * @returns The trimmed string, or `null` when the value is unset, empty, or whitespace-only.
 */
function cleanEnvValue(value: string | undefined): string | null {
  const stripped = typeof value === 'string' ? value.trim() : ''
  return stripped.length > 0 ? stripped : null
}
|
|
19
|
+
|
|
20
|
+
/**
 * Work out the URL that traces should be exported to.
 *
 * Resolution order:
 * 1. `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, used as given apart from trailing slashes.
 * 2. `OTEL_EXPORTER_OTLP_ENDPOINT`, with `/v1/traces` appended unless it already ends with it.
 *
 * @param env Environment to read from; defaults to `process.env`.
 * @returns The endpoint without trailing slashes, or `null` when no usable endpoint is
 *   configured. A value made up only of slashes counts as unusable, so callers never
 *   receive an empty string.
 */
export function resolveOtelTracesEndpoint(env: NodeJS.ProcessEnv = process.env): string | null {
  const stripTrailingSlashes = (url: string): string => url.replace(/\/+$/, '')

  const explicitEndpoint = cleanEnvValue(env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
  if (explicitEndpoint) {
    // `|| null` keeps the contract honest: a slashes-only value strips down to '',
    // which must surface as "no endpoint" just like the base-endpoint branch below.
    return stripTrailingSlashes(explicitEndpoint) || null
  }

  const baseEndpoint = cleanEnvValue(env.OTEL_EXPORTER_OTLP_ENDPOINT)
  if (!baseEndpoint) return null

  const normalizedBase = stripTrailingSlashes(baseEndpoint)
  if (!normalizedBase) return null

  // Accept a base value that was already pointed at the traces path.
  return normalizedBase.endsWith('/v1/traces') ? normalizedBase : `${normalizedBase}/v1/traces`
}
|
|
32
|
+
|
|
33
|
+
/**
 * Parse an OTLP headers environment value (`key1=value1,key2=value2`) into a header map.
 *
 * Entries are comma-separated and split on the first `=`, so a value may itself contain
 * `=`. Keys and values are trimmed. Entries with no `=`, an empty key, or an empty value
 * are skipped. Values are percent-decoded, matching the W3C Baggage format the OTLP
 * exporter specification prescribes for these variables (`Bearer%20token` becomes
 * `Bearer token`); a value that is not valid percent-encoding is kept verbatim.
 *
 * @param value Raw environment value, or undefined when unset.
 * @returns Header name to value map; empty when nothing usable was supplied.
 */
export function parseOtelHeaders(value: string | undefined): Record<string, string> {
  if (typeof value !== 'string') return {}

  const headers: Record<string, string> = {}
  for (const rawEntry of value.split(',')) {
    const entry = rawEntry.trim()
    const separatorIndex = entry.indexOf('=')
    // `<= 0` rejects both "no separator" (-1) and "empty key" (0); blank entries land here too.
    if (separatorIndex <= 0) continue

    const key = entry.slice(0, separatorIndex).trim()
    const rawValue = entry.slice(separatorIndex + 1).trim()
    if (!key || !rawValue) continue

    let headerValue = rawValue
    try {
      headerValue = decodeURIComponent(rawValue)
    } catch {
      // Deliberate fallback: a stray '%' (e.g. "100%") is not valid percent-encoding,
      // so the literal text is sent rather than dropping the header.
    }
    headers[key] = headerValue
  }
  return headers
}
|
|
51
|
+
|
|
52
|
+
/**
 * Builds the tracing configuration from the environment.
 *
 * Tracing is configured only when `OTEL_ENABLED` is a truthy flag and a traces
 * endpoint can be resolved. The service name comes from `OTEL_SERVICE_NAME`
 * and falls back to `swarmclaw`. Headers come from
 * `OTEL_EXPORTER_OTLP_TRACES_HEADERS`, falling back to
 * `OTEL_EXPORTER_OTLP_HEADERS`.
 *
 * @param env Environment to read from; defaults to `process.env`.
 * @returns The resolved configuration, or `null` when tracing is disabled or
 *   no endpoint is configured.
 */
export function resolveOtelConfig(env: NodeJS.ProcessEnv = process.env): OTelConfig | null {
  if (!parseBooleanFlag(env.OTEL_ENABLED)) return null

  const tracesEndpoint = resolveOtelTracesEndpoint(env)
  if (!tracesEndpoint) return null

  const serviceName = cleanEnvValue(env.OTEL_SERVICE_NAME) || 'swarmclaw'

  // Run both header variables through cleanEnvValue, like every other variable
  // here, so a whitespace-only traces-specific value counts as unset and does
  // not mask the general headers variable.
  const rawHeaders =
    cleanEnvValue(env.OTEL_EXPORTER_OTLP_TRACES_HEADERS)
    ?? cleanEnvValue(env.OTEL_EXPORTER_OTLP_HEADERS)
    ?? undefined
  const headers = parseOtelHeaders(rawHeaders)

  return {
    enabled: true,
    serviceName,
    tracesEndpoint,
    headers,
  }
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import {
|
|
2
|
+
trace,
|
|
3
|
+
SpanStatusCode,
|
|
4
|
+
type Attributes,
|
|
5
|
+
type AttributeValue,
|
|
6
|
+
type Span,
|
|
7
|
+
} from '@opentelemetry/api'
|
|
8
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
9
|
+
|
|
10
|
+
/**
 * Attribute bag accepted by the span helpers in this module. Unlike the
 * OpenTelemetry `Attributes` type, values may also be `null`; such entries,
 * like `undefined` ones, are dropped before reaching the span.
 */
type SpanAttributeInput = Record<string, AttributeValue | null | undefined>
|
|
11
|
+
|
|
12
|
+
/**
 * Removes `null` and `undefined` entries from an attribute bag.
 *
 * @param attributes Attributes that may contain empty values.
 * @returns A new object holding only the populated entries, or `undefined`
 *   when no input was given or nothing is left after filtering.
 */
function sanitizeAttributes(attributes?: SpanAttributeInput): Attributes | undefined {
  if (!attributes) return undefined

  const populated: Attributes = {}
  for (const key of Object.keys(attributes)) {
    const value = attributes[key]
    if (value == null) continue
    populated[key] = value
  }

  if (Object.keys(populated).length === 0) return undefined
  return populated
}
|
|
21
|
+
|
|
22
|
+
/**
 * Sets attributes on a span, ignoring `null` and `undefined` values.
 *
 * @param span Span to update.
 * @param attributes Candidate attributes; the span is left untouched when none
 *   of them carries a value.
 */
export function setSpanAttributes(span: Span, attributes?: SpanAttributeInput): void {
  const usable = sanitizeAttributes(attributes)
  if (usable) {
    span.setAttributes(usable)
  }
}
|
|
27
|
+
|
|
28
|
+
/**
 * Marks a span as failed.
 *
 * Records the error as an exception on the span (wrapping non-`Error` values
 * in an `Error` built from their message) and sets the span status to `ERROR`
 * with that message.
 *
 * @param span Span to update.
 * @param err The thrown value; may be anything.
 */
export function recordSpanError(span: Span, err: unknown): void {
  const exception = err instanceof Error ? err : new Error(errorMessage(err))
  span.recordException(exception)

  const status = { code: SpanStatusCode.ERROR, message: errorMessage(err) }
  span.setStatus(status)
}
|
|
35
|
+
|
|
36
|
+
/**
 * Runs `fn` inside a new active span from the `swarmclaw.runtime` tracer.
 *
 * The span is always ended, whether `fn` returns or throws. When `fn` throws
 * (or its promise rejects), the error is recorded on the span and rethrown
 * unchanged.
 *
 * @param name Span name.
 * @param attributes Initial span attributes; `null`/`undefined` values are
 *   dropped.
 * @param fn Work to run; receives the span so it can add attributes.
 * @returns Whatever `fn` resolves to.
 */
export async function withServerSpan<T>(
  name: string,
  attributes: SpanAttributeInput | undefined,
  fn: (span: Span) => Promise<T> | T,
): Promise<T> {
  const tracer = trace.getTracer('swarmclaw.runtime')
  const spanOptions = { attributes: sanitizeAttributes(attributes) }

  const runInSpan = async (span: Span): Promise<T> => {
    try {
      return await fn(span)
    } catch (err) {
      recordSpanError(span, err)
      throw err
    } finally {
      span.end()
    }
  }

  return tracer.startActiveSpan(name, spanOptions, runInSpan)
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
|
|
2
|
+
import { NodeSDK } from '@opentelemetry/sdk-node'
|
|
3
|
+
import { log } from '@/lib/server/logger'
|
|
4
|
+
import { hmrSingleton } from '@/lib/shared-utils'
|
|
5
|
+
import { resolveOtelConfig } from '@/lib/server/observability/otel-config'
|
|
6
|
+
|
|
7
|
+
/** Log tag used for every message emitted by this module. */
const TAG = 'otel'

/** Mutable lifecycle state of the OpenTelemetry SDK for this process. */
interface OTelState {
  /** True once the SDK has been started successfully. */
  started: boolean
  /** Start attempt currently tracked, if any. */
  startPromise: Promise<boolean> | null
  /** The running SDK instance, kept so it can be shut down later. */
  sdk: NodeSDK | null
}

// NOTE(review): hmrSingleton presumably returns the same object across module
// reloads for a given key — confirm against '@/lib/shared-utils'.
const otelState = hmrSingleton<OTelState>('__swarmclaw_otel_state__', (): OTelState => {
  return {
    sdk: null,
    started: false,
    startPromise: null,
  }
})
|
|
20
|
+
|
|
21
|
+
/**
 * Reports whether the current environment yields a tracing configuration.
 *
 * @returns `true` when `resolveOtelConfig()` produces a configuration.
 */
export function isOtelEnabled(): boolean {
  const config = resolveOtelConfig()
  return config !== null
}
|
|
24
|
+
|
|
25
|
+
/**
 * Starts the OpenTelemetry Node SDK with an OTLP/HTTP trace exporter, once.
 *
 * Does nothing when tracing is not configured. Calls made after a successful
 * start return `true` immediately; calls made while a start attempt is still
 * tracked share that attempt. Initialization errors are logged, never thrown.
 * A failed attempt is not cached: a later call tries again.
 *
 * Side effect: sets `process.env.OTEL_SERVICE_NAME` when it is not already set.
 *
 * @returns `true` when the SDK is running, `false` when tracing is disabled or
 *   initialization failed.
 */
export async function ensureOpenTelemetryStarted(): Promise<boolean> {
  const config = resolveOtelConfig()
  if (!config) return false
  if (otelState.started) return true
  if (otelState.startPromise) return otelState.startPromise

  const startPromise: Promise<boolean> = (async () => {
    try {
      process.env.OTEL_SERVICE_NAME = process.env.OTEL_SERVICE_NAME || config.serviceName
      const exporter = new OTLPTraceExporter({
        url: config.tracesEndpoint,
        headers: Object.keys(config.headers).length > 0 ? config.headers : undefined,
      })
      const sdk = new NodeSDK({
        traceExporter: exporter,
      })
      sdk.start()
      otelState.sdk = sdk
      otelState.started = true
      log.info(TAG, 'OpenTelemetry OTLP tracing enabled', {
        serviceName: config.serviceName,
        tracesEndpoint: config.tracesEndpoint,
      })
      return true
    } catch (err) {
      otelState.sdk = null
      otelState.started = false
      log.error(TAG, 'Failed to initialize OpenTelemetry tracing', err)
      return false
    }
  })()

  otelState.startPromise = startPromise

  // The async body above contains no `await`, so it has already run to
  // completion by the time the call returns. Clearing the slot from inside it
  // (e.g. in a `finally`) would be overwritten by the assignment above, leaving
  // a settled promise cached forever and preventing any retry after a failure.
  // Clear it once the promise settles instead, and only if it is still the
  // tracked attempt (a shutdown followed by a new start may have replaced it).
  const clearInFlight = (): void => {
    if (otelState.startPromise === startPromise) otelState.startPromise = null
  }
  void startPromise.then(clearInFlight, clearInFlight)

  return startPromise
}
|
|
61
|
+
|
|
62
|
+
/**
 * Stops the OpenTelemetry SDK, if one is running, and resets the module state.
 *
 * State is reset before the SDK shutdown is awaited. A shutdown failure is
 * logged as a warning and not rethrown.
 */
export async function shutdownOpenTelemetry(): Promise<void> {
  const activeSdk = otelState.sdk

  // Reset unconditionally; this is a no-op for fields already in their idle state.
  otelState.sdk = null
  otelState.started = false
  otelState.startPromise = null

  if (!activeSdk) return

  try {
    await activeSdk.shutdown()
  } catch (err) {
    log.warn(TAG, 'Failed to flush OpenTelemetry tracing during shutdown', err)
  }
}
|
|
@@ -39,6 +39,7 @@ import {
|
|
|
39
39
|
syncProtocolParentFromChildRun,
|
|
40
40
|
} from '@/lib/server/protocols/protocol-step-helpers'
|
|
41
41
|
import { stepProtocolRun } from '@/lib/server/protocols/protocol-step-processors'
|
|
42
|
+
import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
|
|
42
43
|
|
|
43
44
|
// ---- Singletons ----
|
|
44
45
|
|
|
@@ -308,79 +309,91 @@ export async function runProtocolRun(runId: string, deps?: ProtocolRunDeps): Pro
|
|
|
308
309
|
return loadProtocolRunById(runId)
|
|
309
310
|
}
|
|
310
311
|
try {
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
312
|
+
return await withServerSpan('swarmclaw.protocol.run', {
|
|
313
|
+
'swarmclaw.protocol.run_id': runId,
|
|
314
|
+
}, async (span) => {
|
|
315
|
+
let run = loadProtocolRunById(runId)
|
|
316
|
+
if (!run) return null
|
|
317
|
+
setSpanAttributes(span, {
|
|
318
|
+
'swarmclaw.protocol.template_id': run.templateId,
|
|
319
|
+
'swarmclaw.protocol.source_kind': run.sourceRef.kind,
|
|
320
|
+
'swarmclaw.protocol.participant_count': run.participantAgentIds.length,
|
|
321
|
+
'swarmclaw.protocol.status': run.status,
|
|
322
|
+
})
|
|
323
|
+
if (run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed' || run.status === 'paused') return run
|
|
324
|
+
run = persistRun({
|
|
325
|
+
...run,
|
|
326
|
+
status: run.status === 'waiting' ? 'running' : run.status,
|
|
327
|
+
waitingReason: null,
|
|
328
|
+
pauseReason: null,
|
|
329
|
+
lastError: null,
|
|
330
|
+
startedAt: run.startedAt || now(deps),
|
|
331
|
+
updatedAt: now(deps),
|
|
332
|
+
})
|
|
333
|
+
if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
|
|
324
334
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
335
|
+
const MAX_STEP_ITERATIONS = 500
|
|
336
|
+
let stepIterations = 0
|
|
337
|
+
while (run.status === 'running' || run.status === 'draft') {
|
|
338
|
+
stepIterations++
|
|
339
|
+
if (stepIterations > MAX_STEP_ITERATIONS) {
|
|
340
|
+
run = persistRun({ ...run, status: 'failed', lastError: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}). Possible infinite loop in step graph.`, updatedAt: now(deps) })
|
|
341
|
+
appendProtocolEvent(run.id, { type: 'failed', summary: `Exceeded maximum step iterations (${MAX_STEP_ITERATIONS}).` }, deps)
|
|
342
|
+
break
|
|
343
|
+
}
|
|
344
|
+
if (shouldYieldBetweenProtocolSteps(deps)) {
|
|
345
|
+
// Yield between steps in the fire-and-forget runtime so I/O, HTTP responses,
|
|
346
|
+
// and timers can run.
|
|
347
|
+
await new Promise(r => setTimeout(r, 0))
|
|
348
|
+
}
|
|
349
|
+
const latest = loadProtocolRunById(run.id)
|
|
350
|
+
if (!latest) return null
|
|
351
|
+
if (latest.status === 'paused' || latest.status === 'cancelled' || latest.status === 'archived' || latest.status === 'completed') {
|
|
352
|
+
run = latest
|
|
353
|
+
break
|
|
354
|
+
}
|
|
342
355
|
run = latest
|
|
343
|
-
|
|
344
|
-
}
|
|
345
|
-
run = latest
|
|
346
|
-
renewProtocolLease(run.id)
|
|
356
|
+
renewProtocolLease(run.id)
|
|
347
357
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
358
|
+
const sched = computeStepReadiness(run.steps || [], run.entryStepId || null, run.stepState)
|
|
359
|
+
if (sched.dagMode) {
|
|
360
|
+
run = persistRun({
|
|
361
|
+
...run,
|
|
362
|
+
stepState: sched.stepState,
|
|
363
|
+
completedStepIds: sched.completedStepIds,
|
|
364
|
+
runningStepIds: sched.runningStepIds,
|
|
365
|
+
readyStepIds: sched.readyStepIds,
|
|
366
|
+
failedStepIds: sched.failedStepIds,
|
|
367
|
+
updatedAt: now(deps),
|
|
368
|
+
})
|
|
369
|
+
if (sched.readyStepIds.length === 0 && sched.runningStepIds.length === 0) {
|
|
370
|
+
const allSteps = run.steps || []
|
|
371
|
+
const allCompleted = allSteps.every((s) => sched.stepState[s.id]?.status === 'completed')
|
|
372
|
+
if (allCompleted) {
|
|
373
|
+
run = completeProtocolRun(run, deps)
|
|
374
|
+
} else {
|
|
375
|
+
run = persistRun({ ...run, status: 'failed', lastError: 'DAG stuck: no ready steps and not all completed.', updatedAt: now(deps) })
|
|
376
|
+
appendProtocolEvent(run.id, { type: 'failed', summary: 'DAG stuck: no ready steps and not all completed.' }, deps)
|
|
377
|
+
}
|
|
378
|
+
break
|
|
379
|
+
}
|
|
380
|
+
if (sched.readyStepIds.length > 0) {
|
|
381
|
+
const nextReadyId = sched.readyStepIds[0]
|
|
382
|
+
run = persistRun({ ...run, currentStepId: nextReadyId, updatedAt: now(deps) })
|
|
369
383
|
}
|
|
370
|
-
break
|
|
371
|
-
}
|
|
372
|
-
if (sched.readyStepIds.length > 0) {
|
|
373
|
-
// Pick first ready step as currentStepId
|
|
374
|
-
const nextReadyId = sched.readyStepIds[0]
|
|
375
|
-
run = persistRun({ ...run, currentStepId: nextReadyId, updatedAt: now(deps) })
|
|
376
384
|
}
|
|
377
|
-
}
|
|
378
385
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
386
|
+
run = await stepProtocolRun(run, deps)
|
|
387
|
+
if (run.status === 'waiting' || run.status === 'paused' || run.status === 'failed' || run.status === 'cancelled' || run.status === 'archived' || run.status === 'completed') break
|
|
388
|
+
}
|
|
389
|
+
setSpanAttributes(span, {
|
|
390
|
+
'swarmclaw.protocol.step_iterations': stepIterations,
|
|
391
|
+
'swarmclaw.protocol.status': run.status,
|
|
392
|
+
'swarmclaw.protocol.current_step_id': run.currentStepId,
|
|
393
|
+
})
|
|
394
|
+
if (run.parentRunId) syncProtocolParentFromChildRun(run, deps)
|
|
395
|
+
return run
|
|
396
|
+
})
|
|
384
397
|
} catch (err: unknown) {
|
|
385
398
|
const failed = updateRun(runId, (current) => ({
|
|
386
399
|
...current,
|
|
@@ -58,6 +58,7 @@ import {
|
|
|
58
58
|
isExternalExtensionId,
|
|
59
59
|
splitCapabilityIds,
|
|
60
60
|
} from '@/lib/capability-selection'
|
|
61
|
+
import { setSpanAttributes, withServerSpan } from '@/lib/server/observability/otel-tracing'
|
|
61
62
|
|
|
62
63
|
export type { ToolContext, SessionToolsResult }
|
|
63
64
|
export { sweepOrphanedBrowsers, cleanupSessionBrowser, getActiveBrowserCount, hasActiveBrowser }
|
|
@@ -388,65 +389,80 @@ export async function buildSessionTools(cwd: string, enabledExtensions: string[]
|
|
|
388
389
|
const schema = (candidate as unknown as { schema?: z.ZodTypeAny }).schema || z.object({}).passthrough()
|
|
389
390
|
return tool(
|
|
390
391
|
async (args) => {
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
392
|
+
return withServerSpan('swarmclaw.tool.call', {
|
|
393
|
+
'swarmclaw.tool.name': candidate.name,
|
|
394
|
+
'swarmclaw.session.id': ctx?.sessionId || null,
|
|
395
|
+
'swarmclaw.agent.id': ctx?.agentId || null,
|
|
396
|
+
'swarmclaw.run.id': ctx?.runId || null,
|
|
397
|
+
}, async (span) => {
|
|
398
|
+
// Check abort before executing any tool — prevents wasted work after chat stop
|
|
399
|
+
if (abortSignalRef.signal?.aborted) {
|
|
400
|
+
setSpanAttributes(span, { 'swarmclaw.tool.aborted': true })
|
|
401
|
+
throw new DOMException('Tool execution aborted', 'AbortError')
|
|
402
|
+
}
|
|
403
|
+
const normalizedArgs = normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)
|
|
404
|
+
const hookSession = resolveCurrentSession() || buildFallbackHookSession()
|
|
405
|
+
if (fileAccessPolicy) {
|
|
406
|
+
const denial = enforceFileAccessPolicy(candidate.name, normalizedArgs, cwd, fileAccessPolicy)
|
|
407
|
+
if (denial) {
|
|
408
|
+
setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
|
|
409
|
+
return denial
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
let guardedArgs: Record<string, unknown> | null = normalizedArgs
|
|
413
|
+
if (ctx?.beforeToolCall) {
|
|
414
|
+
const guardResult = await ctx.beforeToolCall({
|
|
415
|
+
session: hookSession,
|
|
412
416
|
toolName: candidate.name,
|
|
413
|
-
|
|
417
|
+
input: guardedArgs,
|
|
418
|
+
runId: ctx.runId,
|
|
414
419
|
})
|
|
420
|
+
if (guardResult?.warning) {
|
|
421
|
+
ctx.onToolCallWarning?.({
|
|
422
|
+
toolName: candidate.name,
|
|
423
|
+
message: guardResult.warning,
|
|
424
|
+
})
|
|
425
|
+
}
|
|
426
|
+
if (typeof guardResult?.blockReason === 'string' && guardResult.blockReason.trim()) {
|
|
427
|
+
setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
|
|
428
|
+
throw new Error(guardResult.blockReason.trim())
|
|
429
|
+
}
|
|
430
|
+
if (guardResult && 'input' in guardResult) {
|
|
431
|
+
guardedArgs = guardResult.input === undefined ? guardedArgs : guardResult.input ?? null
|
|
432
|
+
}
|
|
415
433
|
}
|
|
416
|
-
|
|
417
|
-
|
|
434
|
+
const hookResult = await runCapabilityBeforeToolCall(
|
|
435
|
+
{
|
|
436
|
+
session: hookSession,
|
|
437
|
+
toolName: candidate.name,
|
|
438
|
+
input: guardedArgs,
|
|
439
|
+
runId: ctx?.runId || undefined,
|
|
440
|
+
},
|
|
441
|
+
{ enabledIds: activeExtensions },
|
|
442
|
+
)
|
|
443
|
+
if (hookResult.warning) {
|
|
444
|
+
ctx?.onToolCallWarning?.({
|
|
445
|
+
toolName: candidate.name,
|
|
446
|
+
message: hookResult.warning,
|
|
447
|
+
})
|
|
418
448
|
}
|
|
419
|
-
if (
|
|
420
|
-
|
|
449
|
+
if (hookResult.blockReason) {
|
|
450
|
+
setSpanAttributes(span, { 'swarmclaw.tool.blocked': true })
|
|
451
|
+
throw new Error(hookResult.blockReason)
|
|
421
452
|
}
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
input: guardedArgs,
|
|
428
|
-
runId: ctx?.runId || undefined,
|
|
429
|
-
},
|
|
430
|
-
{ enabledIds: activeExtensions },
|
|
431
|
-
)
|
|
432
|
-
if (hookResult.warning) {
|
|
433
|
-
ctx?.onToolCallWarning?.({
|
|
434
|
-
toolName: candidate.name,
|
|
435
|
-
message: hookResult.warning,
|
|
453
|
+
const effectiveArgs = hookResult.input ?? guardedArgs
|
|
454
|
+
const result = await candidate.invoke(effectiveArgs ?? {})
|
|
455
|
+
const outputText = typeof result === 'string' ? result : JSON.stringify(result)
|
|
456
|
+
setSpanAttributes(span, {
|
|
457
|
+
'swarmclaw.tool.output_bytes': Buffer.byteLength(outputText, 'utf-8'),
|
|
436
458
|
})
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
await runCapabilityHook(
|
|
445
|
-
'afterToolExec',
|
|
446
|
-
{ session: hookSession, toolName: candidate.name, input: effectiveArgs, output: outputText },
|
|
447
|
-
{ enabledIds: activeExtensions },
|
|
448
|
-
)
|
|
449
|
-
return outputText
|
|
459
|
+
await runCapabilityHook(
|
|
460
|
+
'afterToolExec',
|
|
461
|
+
{ session: hookSession, toolName: candidate.name, input: effectiveArgs, output: outputText },
|
|
462
|
+
{ enabledIds: activeExtensions },
|
|
463
|
+
)
|
|
464
|
+
return outputText
|
|
465
|
+
})
|
|
450
466
|
},
|
|
451
467
|
{
|
|
452
468
|
name: candidate.name,
|