@openclaw/diagnostics-otel 2026.3.13 → 2026.5.2-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/api.ts +20 -0
- package/index.ts +4 -8
- package/openclaw.plugin.json +3 -0
- package/package.json +35 -12
- package/src/service.test.ts +2272 -64
- package/src/service.ts +1824 -193
- package/tsconfig.json +16 -0
package/src/service.ts
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import {
|
|
2
|
+
context as otelContextApi,
|
|
3
|
+
metrics,
|
|
4
|
+
trace,
|
|
5
|
+
SpanStatusCode,
|
|
6
|
+
TraceFlags,
|
|
7
|
+
} from "@opentelemetry/api";
|
|
8
|
+
import type { LogRecord, SeverityNumber } from "@opentelemetry/api-logs";
|
|
3
9
|
import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-proto";
|
|
4
10
|
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-proto";
|
|
5
11
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
|
|
@@ -10,16 +16,87 @@ import { NodeSDK } from "@opentelemetry/sdk-node";
|
|
|
10
16
|
import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
|
|
11
17
|
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
|
|
12
18
|
import type {
|
|
19
|
+
DiagnosticEventMetadata,
|
|
13
20
|
DiagnosticEventPayload,
|
|
21
|
+
DiagnosticTraceContext,
|
|
14
22
|
OpenClawPluginService,
|
|
15
|
-
} from "
|
|
23
|
+
} from "../api.js";
|
|
16
24
|
import {
|
|
17
|
-
|
|
25
|
+
isValidDiagnosticSpanId,
|
|
26
|
+
isValidDiagnosticTraceFlags,
|
|
27
|
+
isValidDiagnosticTraceId,
|
|
18
28
|
redactSensitiveText,
|
|
19
|
-
|
|
20
|
-
} from "openclaw/plugin-sdk/diagnostics-otel";
|
|
29
|
+
} from "../api.js";
|
|
21
30
|
|
|
22
31
|
const DEFAULT_SERVICE_NAME = "openclaw";
|
|
32
|
+
// Attribute keys that redactOtelAttributes() removes before attributes are exported.
const DROPPED_OTEL_ATTRIBUTE_KEYS = new Set([
  "openclaw.callId",
  "openclaw.parentSpanId",
  "openclaw.runId",
  "openclaw.sessionId",
  "openclaw.sessionKey",
  "openclaw.spanId",
  "openclaw.toolCallId",
  "openclaw.traceId",
]);

// Shape a value must have for lowCardinalityAttr() to pass it through unchanged.
const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u;

// Bounds applied to captured content attributes (see normalizeOtelContentValue).
const MAX_OTEL_CONTENT_ATTRIBUTE_CHARS = 4 * 1024;
const MAX_OTEL_CONTENT_ARRAY_ITEMS = 16;

// Bounds applied to exported log records and their attributes.
const MAX_OTEL_LOG_BODY_CHARS = 4 * 1024;
const MAX_OTEL_LOG_ATTRIBUTE_COUNT = 64;
const MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS = 4 * 1024;
// NOTE(review): presumably the minimum spacing between log export failure reports — confirm at the use site.
const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000;

// Raw event attribute keys are validated against the 64-char pattern before the
// "openclaw." prefix is added; the final key is validated against the 96-char pattern.
const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
// Keys that are never accepted as log attribute names.
const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);

// Names of the environment variables this module reads.
const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
const OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT";
const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT";
const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT";
const OTEL_SEMCONV_STABILITY_OPT_IN_ENV = "OTEL_SEMCONV_STABILITY_OPT_IN";

// Opt-in token looked up inside OTEL_SEMCONV_STABILITY_OPT_IN (see emitLatestGenAiSemconv).
const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental";

// Explicit histogram bucket boundaries passed as advice to the GenAI histograms.
const GEN_AI_TOKEN_USAGE_BUCKETS = [
  1,
  4,
  16,
  64,
  256,
  1024,
  4096,
  16384,
  65536,
  262144,
  1048576,
  4194304,
  16777216,
  67108864,
];
const GEN_AI_OPERATION_DURATION_BUCKETS = [
  0.01,
  0.02,
  0.04,
  0.08,
  0.16,
  0.32,
  0.64,
  1.28,
  2.56,
  5.12,
  10.24,
  20.48,
  40.96,
  81.92,
];
|
|
65
|
+
|
|
66
|
+
/** One switch per content category; a category's content is only exported when its flag is true. */
type OtelContentCapturePolicy = {
  inputMessages: boolean;
  outputMessages: boolean;
  toolInputs: boolean;
  toolOutputs: boolean;
  systemPrompt: boolean;
};

/** Members of DiagnosticEventPayload whose `type` tag matches `TType`. */
type DiagnosticEventOfType<TType extends string> = Extract<
  DiagnosticEventPayload,
  { type: TType }
>;

/** Outbound message delivery lifecycle events. */
type MessageDeliveryDiagnosticEvent = DiagnosticEventOfType<
  "message.delivery.started" | "message.delivery.completed" | "message.delivery.error"
>;

/** Terminal model call events (completed or failed). */
type ModelCallLifecycleDiagnosticEvent = DiagnosticEventOfType<
  "model.call.completed" | "model.call.error"
>;

/** Harness run lifecycle events. */
type HarnessRunDiagnosticEvent = DiagnosticEventOfType<
  "harness.run.started" | "harness.run.completed" | "harness.run.error"
>;

/** Exporter health events. */
type TelemetryExporterDiagnosticEvent = DiagnosticEventOfType<"telemetry.exporter">;

/** Policy with every content category disabled. */
const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = {
  inputMessages: false,
  outputMessages: false,
  toolInputs: false,
  toolOutputs: false,
  systemPrompt: false,
};
|
|
23
100
|
|
|
24
101
|
function normalizeEndpoint(endpoint?: string): string | undefined {
|
|
25
102
|
const trimmed = endpoint?.trim();
|
|
@@ -37,6 +114,18 @@ function resolveOtelUrl(endpoint: string | undefined, path: string): string | un
|
|
|
37
114
|
return `${endpoint}/${path}`;
|
|
38
115
|
}
|
|
39
116
|
|
|
117
|
+
/**
 * Resolves the export URL for one signal (traces, metrics or logs).
 *
 * Precedence: the configured signal endpoint, then the signal-specific
 * environment endpoint, then the shared `endpoint`. The chosen value is handed
 * to resolveOtelUrl() together with `path`.
 *
 * Each candidate is normalized on its own so that a blank configured endpoint
 * (e.g. `""` or whitespace) no longer hides the signal-specific environment
 * variable. Previously `signalEndpoint ?? signalEnvEndpoint` picked the blank
 * string because it is not nullish, and resolution then skipped straight to the
 * shared endpoint.
 * NOTE(review): this relies on normalizeEndpoint() returning undefined for blank
 * input, which its `string | undefined` return type suggests — confirm.
 *
 * @param params.signalEndpoint Endpoint configured for this signal, if any.
 * @param params.signalEnvEndpoint Endpoint from the signal-specific environment variable, if any.
 * @param params.endpoint Shared endpoint used when neither signal-specific value is usable.
 * @param params.path Signal path passed to resolveOtelUrl(), e.g. "v1/traces".
 * @returns Whatever resolveOtelUrl() returns for the chosen endpoint.
 */
function resolveSignalOtelUrl(params: {
  signalEndpoint?: string;
  signalEnvEndpoint?: string;
  endpoint?: string;
  path: string;
}): string | undefined {
  const signalSpecificEndpoint =
    normalizeEndpoint(params.signalEndpoint) ?? normalizeEndpoint(params.signalEnvEndpoint);
  return resolveOtelUrl(signalSpecificEndpoint ?? params.endpoint, params.path);
}
|
|
128
|
+
|
|
40
129
|
function resolveSampleRate(value: number | undefined): number | undefined {
|
|
41
130
|
if (typeof value !== "number" || !Number.isFinite(value)) {
|
|
42
131
|
return undefined;
|
|
@@ -61,79 +150,504 @@ function formatError(err: unknown): string {
|
|
|
61
150
|
}
|
|
62
151
|
}
|
|
63
152
|
|
|
153
|
+
/**
 * Maps a thrown value to a bounded category label.
 *
 * Error instances with a non-blank string `name` are labelled by that name
 * (falling back to "Error" when the name is rejected by lowCardinalityAttr);
 * everything else is labelled by its `typeof`. Any exception raised while
 * inspecting the value yields "unknown".
 */
function errorCategory(err: unknown): string {
  try {
    if (!(err instanceof Error)) {
      return lowCardinalityAttr(typeof err, "unknown");
    }
    const errorName = err.name;
    const hasUsableName = typeof errorName === "string" && errorName.trim() !== "";
    return hasUsableName
      ? lowCardinalityAttr(errorName, "Error")
      : lowCardinalityAttr(typeof err, "unknown");
  } catch {
    return "unknown";
  }
}
|
|
163
|
+
|
|
64
164
|
/**
 * Builds a copy of `attributes` for export: keys listed in
 * DROPPED_OTEL_ATTRIBUTE_KEYS are omitted, string values are passed through
 * redactSensitiveText(), and numbers/booleans are copied as-is.
 */
function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
  const sanitized: Record<string, string | number | boolean> = {};
  Object.entries(attributes).forEach(([name, raw]) => {
    if (!DROPPED_OTEL_ATTRIBUTE_KEYS.has(name)) {
      if (typeof raw === "string") {
        sanitized[name] = redactSensitiveText(raw);
      } else {
        sanitized[name] = raw;
      }
    }
  });
  return sanitized;
}
|
|
71
174
|
|
|
175
|
+
/**
 * Returns `value` (trimmed and redacted) when it matches
 * LOW_CARDINALITY_VALUE_RE; otherwise returns `fallback`.
 *
 * @param value Candidate attribute value; empty or missing values yield `fallback`.
 * @param fallback Label used when the value is absent or rejected (default "unknown").
 */
function lowCardinalityAttr(value: string | undefined, fallback = "unknown"): string {
  if (value) {
    const candidate = redactSensitiveText(value.trim());
    if (LOW_CARDINALITY_VALUE_RE.test(candidate)) {
      return candidate;
    }
  }
  return fallback;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Reports whether the comma-separated list `value` contains `optIn` as one of
 * its entries. Entries are compared after trimming surrounding whitespace; a
 * missing list never matches.
 */
function hasOtelSemconvOptIn(value: string | undefined, optIn: string): boolean {
  if (value == null) {
    return false;
  }
  for (const entry of value.split(",")) {
    if (entry.trim() === optIn) {
      return true;
    }
  }
  return false;
}
|
|
191
|
+
|
|
192
|
+
/**
 * True when the OTEL_SEMCONV_STABILITY_OPT_IN environment variable lists the
 * "gen_ai_latest_experimental" opt-in. The environment is read on every call.
 */
function emitLatestGenAiSemconv(): boolean {
  const optInList = process.env[OTEL_SEMCONV_STABILITY_OPT_IN_ENV];
  return hasOtelSemconvOptIn(optInList, GEN_AI_LATEST_EXPERIMENTAL_OPT_IN);
}
|
|
198
|
+
|
|
199
|
+
/**
 * Maps an API identifier to a `gen_ai.operation.name` value.
 *
 * Matching is case-insensitive and ignores surrounding whitespace:
 * - "completions" or anything ending in "-completions" -> "text_completion"
 * - "generate_content" or anything containing "generative-ai" -> "generate_content"
 * - everything else, including a missing or blank identifier -> "chat"
 */
function genAiOperationName(
  api: string | undefined,
): "chat" | "generate_content" | "text_completion" {
  const apiName = (api ?? "").trim().toLowerCase();

  const isTextCompletion = apiName === "completions" || apiName.endsWith("-completions");
  if (isTextCompletion) {
    return "text_completion";
  }

  const isGenerateContent = apiName === "generate_content" || apiName.includes("generative-ai");
  return isGenerateContent ? "generate_content" : "chat";
}
|
|
214
|
+
|
|
215
|
+
/** Returns `value` when it is a finite number greater than zero; otherwise undefined. */
function positiveFiniteNumber(value: number | undefined): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  if (!Number.isFinite(value) || !(value > 0)) {
    return undefined;
  }
  return value;
}
|
|
218
|
+
|
|
219
|
+
/**
 * Writes `attrs[key] = value` only when `value` is a finite number greater
 * than zero; otherwise leaves `attrs` untouched.
 */
function assignPositiveNumberAttr(
  attrs: Record<string, string | number | boolean>,
  key: string,
  value: number | undefined,
): void {
  const accepted = positiveFiniteNumber(value);
  if (accepted === undefined) {
    return;
  }
  attrs[key] = accepted;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Copies the model call's request size, response size and time-to-first-byte
 * onto `attrs` under the `openclaw.model_call.*` keys. Each measurement is only
 * written when it is a positive finite number.
 */
function assignModelCallSizeTimingAttrs(
  attrs: Record<string, string | number | boolean>,
  evt: {
    requestPayloadBytes?: number;
    responseStreamBytes?: number;
    timeToFirstByteMs?: number;
  },
): void {
  const measurements: Array<[string, number | undefined]> = [
    ["openclaw.model_call.request_bytes", evt.requestPayloadBytes],
    ["openclaw.model_call.response_bytes", evt.responseStreamBytes],
    ["openclaw.model_call.time_to_first_byte_ms", evt.timeToFirstByteMs],
  ];
  for (const [attributeKey, measurement] of measurements) {
    assignPositiveNumberAttr(attrs, attributeKey, measurement);
  }
}
|
|
246
|
+
|
|
247
|
+
/**
 * Sets the GenAI identity attributes on `attrs`:
 * - the provider, under "gen_ai.provider.name" when the latest experimental
 *   GenAI conventions are opted into, otherwise under "gen_ai.system";
 * - "gen_ai.request.model", only when a model is given;
 * - "gen_ai.operation.name", derived from `input.api`.
 *
 * Provider and model values go through lowCardinalityAttr().
 */
function assignGenAiSpanIdentityAttrs(
  attrs: Record<string, string | number | boolean>,
  input: { api?: string; model?: string; provider?: string },
): void {
  const providerKey = emitLatestGenAiSemconv() ? "gen_ai.provider.name" : "gen_ai.system";
  attrs[providerKey] = lowCardinalityAttr(input.provider);

  const { model } = input;
  if (model) {
    attrs["gen_ai.request.model"] = lowCardinalityAttr(model);
  }

  attrs["gen_ai.operation.name"] = genAiOperationName(input.api);
}
|
|
261
|
+
|
|
262
|
+
/**
 * Sets the GenAI attributes for a model call event. Currently this is exactly
 * the identity attribute set written by assignGenAiSpanIdentityAttrs().
 */
function assignGenAiModelCallAttrs(
  attrs: Record<string, string | number | boolean>,
  evt: { api?: string; model?: string; provider?: string },
): void {
  return assignGenAiSpanIdentityAttrs(attrs, evt);
}
|
|
268
|
+
|
|
269
|
+
/**
 * Adds an "openclaw.provider.request" span event carrying the upstream request
 * id hash. Nothing is added when the hash is missing, when lowCardinalityAttr()
 * rejects it (it then comes back as "unknown"), or when the span has no
 * `addEvent` method.
 */
function addUpstreamRequestIdSpanEvent(
  span: { addEvent?: (name: string, attributes?: Record<string, string>) => void },
  upstreamRequestIdHash: string | undefined,
): void {
  const safeHash = upstreamRequestIdHash ? lowCardinalityAttr(upstreamRequestIdHash) : "unknown";
  if (safeHash !== "unknown") {
    span.addEvent?.("openclaw.provider.request", {
      "openclaw.upstreamRequestIdHash": safeHash,
    });
  }
}
|
|
284
|
+
|
|
285
|
+
/**
 * Limits `value` to at most `maxChars` UTF-16 code units, appending
 * "...(truncated)" when anything was cut off. Shorter values are returned
 * unchanged.
 *
 * The cut never lands between the two halves of a surrogate pair: if the last
 * kept code unit would be a high surrogate whose low surrogate is being dropped,
 * the high surrogate is dropped as well. Otherwise the truncated text would end
 * in a lone surrogate, which is not well-formed Unicode and gets replaced or
 * mangled when the string is encoded as UTF-8.
 *
 * @param value Text to bound.
 * @param maxChars Maximum number of UTF-16 code units kept from `value`.
 * @returns `value` itself, or its bounded prefix followed by "...(truncated)".
 */
function clampOtelLogText(value: string, maxChars: number): string {
  if (value.length <= maxChars) {
    return value;
  }
  let end = maxChars;
  if (end > 0) {
    const lastKept = value.charCodeAt(end - 1);
    const firstDropped = value.charCodeAt(end);
    const splitsSurrogatePair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    if (splitsSurrogatePair) {
      end -= 1;
    }
  }
  return `${value.slice(0, end)}...(truncated)`;
}
|
|
288
|
+
|
|
289
|
+
/** Redacts `value` with redactSensitiveText(), then bounds the result to `maxChars`. */
function normalizeOtelLogString(value: string, maxChars: number): string {
  const redacted = redactSensitiveText(value);
  return clampOtelLogText(redacted, maxChars);
}
|
|
292
|
+
|
|
293
|
+
/**
 * Turns the `captureContent` configuration value into a capture policy.
 *
 * - `true` enables messages and tool input/output, but not the system prompt.
 * - A non-array object with `enabled: true` enables exactly the categories whose
 *   field is literally `true`.
 * - Anything else yields NO_CONTENT_CAPTURE.
 */
function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy {
  if (value === true) {
    return {
      inputMessages: true,
      outputMessages: true,
      toolInputs: true,
      toolOutputs: true,
      systemPrompt: false,
    };
  }

  const isConfigObject = typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isConfigObject) {
    return NO_CONTENT_CAPTURE;
  }

  const settings = value as Record<string, unknown>;
  if (settings.enabled !== true) {
    return NO_CONTENT_CAPTURE;
  }

  // Only a literal `true` switches a category on; truthy values do not count.
  const isOn = (field: string): boolean => settings[field] === true;
  return {
    inputMessages: isOn("inputMessages"),
    outputMessages: isOn("outputMessages"),
    toolInputs: isOn("toolInputs"),
    toolOutputs: isOn("toolOutputs"),
    systemPrompt: isOn("systemPrompt"),
  };
}
|
|
319
|
+
|
|
320
|
+
/** True when the OPENCLAW_OTEL_PRELOADED environment variable is exactly "1". */
function hasPreloadedOtelSdk(): boolean {
  const preloadedFlag = process.env[PRELOADED_OTEL_SDK_ENV];
  return preloadedFlag === "1";
}
|
|
323
|
+
|
|
324
|
+
/**
 * Converts captured content into a single bounded, redacted string.
 *
 * A string is normalized directly. For an array, the string entries among its
 * first MAX_OTEL_CONTENT_ARRAY_ITEMS elements are joined with newlines and then
 * normalized. Returns undefined for any other input, and for arrays that
 * contribute no string entries.
 */
function normalizeOtelContentValue(value: unknown): string | undefined {
  if (typeof value === "string") {
    return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
  }
  if (!Array.isArray(value)) {
    return undefined;
  }
  const textItems = value
    .slice(0, MAX_OTEL_CONTENT_ARRAY_ITEMS)
    .filter((item): item is string => typeof item === "string");
  return textItems.length > 0
    ? normalizeOtelLogString(textItems.join("\n"), MAX_OTEL_CONTENT_ATTRIBUTE_CHARS)
    : undefined;
}
|
|
341
|
+
|
|
342
|
+
/**
 * Stores the normalized form of `value` under `key`. Nothing is written when
 * normalization yields undefined or an empty string.
 */
function assignOtelContentAttribute(
  attributes: Record<string, string | number | boolean>,
  key: string,
  value: unknown,
): void {
  const content = normalizeOtelContentValue(value);
  if (!content) {
    return;
  }
  attributes[key] = content;
}
|
|
352
|
+
|
|
353
|
+
/**
 * Copies model content from `event` onto `attributes` for each category the
 * policy enables: input messages, output messages and the system prompt, in
 * that order. Event fields of disabled categories are not read.
 */
function assignOtelModelContentAttributes(
  attributes: Record<string, string | number | boolean>,
  event: Record<string, unknown>,
  policy: OtelContentCapturePolicy,
): void {
  const captures: Array<[enabled: boolean, attributeKey: string, eventField: string]> = [
    [policy.inputMessages, "openclaw.content.input_messages", "inputMessages"],
    [policy.outputMessages, "openclaw.content.output_messages", "outputMessages"],
    [policy.systemPrompt, "openclaw.content.system_prompt", "systemPrompt"],
  ];
  for (const [enabled, attributeKey, eventField] of captures) {
    if (enabled) {
      assignOtelContentAttribute(attributes, attributeKey, event[eventField]);
    }
  }
}
|
|
372
|
+
|
|
373
|
+
/**
 * Copies tool content from `event` onto `attributes` for each category the
 * policy enables: tool input first, then tool output. Event fields of disabled
 * categories are not read.
 */
function assignOtelToolContentAttributes(
  attributes: Record<string, string | number | boolean>,
  event: Record<string, unknown>,
  policy: OtelContentCapturePolicy,
): void {
  const captures: Array<[enabled: boolean, attributeKey: string, eventField: string]> = [
    [policy.toolInputs, "openclaw.content.tool_input", "toolInput"],
    [policy.toolOutputs, "openclaw.content.tool_output", "toolOutput"],
  ];
  for (const [enabled, attributeKey, eventField] of captures) {
    if (enabled) {
      assignOtelContentAttribute(attributes, attributeKey, event[eventField]);
    }
  }
}
|
|
385
|
+
|
|
386
|
+
/**
 * Adds one attribute to a log record's attribute bag, subject to these checks:
 *
 * - the bag must hold fewer than MAX_OTEL_LOG_ATTRIBUTE_COUNT entries;
 * - the key must not be a blocked name, must be unchanged by
 *   redactSensitiveText(), and must match OTEL_LOG_ATTRIBUTE_KEY_RE;
 * - string values are redacted and length-bounded, numbers must be finite,
 *   booleans are stored as-is; any other value is ignored.
 */
function assignOtelLogAttribute(
  attributes: Record<string, string | number | boolean>,
  key: string,
  value: string | number | boolean,
): void {
  const rejected =
    Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT ||
    BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key) ||
    redactSensitiveText(key) !== key ||
    !OTEL_LOG_ATTRIBUTE_KEY_RE.test(key);
  if (rejected) {
    return;
  }

  switch (typeof value) {
    case "string":
      attributes[key] = normalizeOtelLogString(value, MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS);
      break;
    case "number":
      // NaN and +/-Infinity are dropped rather than exported.
      if (Number.isFinite(value)) {
        attributes[key] = value;
      }
      break;
    case "boolean":
      attributes[key] = value;
      break;
    default:
      break;
  }
}
|
|
415
|
+
|
|
416
|
+
/**
 * Validates an untyped value as a diagnostic trace context.
 *
 * Returns undefined unless the value is a non-array object with a valid trace
 * id and every optional field that is present (span id, parent span id, trace
 * flags) is valid too. The result contains the trace id plus only those
 * optional fields that are set and non-empty.
 */
function normalizeTraceContext(value: unknown): DiagnosticTraceContext | undefined {
  const isObjectLike = typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isObjectLike) {
    return undefined;
  }

  const candidate = value as Partial<DiagnosticTraceContext>;
  if (!isValidDiagnosticTraceId(candidate.traceId)) {
    return undefined;
  }

  const hasInvalidOptionalField =
    (candidate.spanId !== undefined && !isValidDiagnosticSpanId(candidate.spanId)) ||
    (candidate.parentSpanId !== undefined && !isValidDiagnosticSpanId(candidate.parentSpanId)) ||
    (candidate.traceFlags !== undefined && !isValidDiagnosticTraceFlags(candidate.traceFlags));
  if (hasInvalidOptionalField) {
    return undefined;
  }

  const spanIdPart = candidate.spanId ? { spanId: candidate.spanId } : {};
  const parentSpanIdPart = candidate.parentSpanId ? { parentSpanId: candidate.parentSpanId } : {};
  const traceFlagsPart = candidate.traceFlags ? { traceFlags: candidate.traceFlags } : {};
  return {
    traceId: candidate.traceId,
    ...spanIdPart,
    ...parentSpanIdPart,
    ...traceFlagsPart,
  };
}
|
|
440
|
+
|
|
441
|
+
/**
 * Copies the own enumerable entries of `eventAttributes` into `attributes`,
 * prefixing each key with "openclaw.".
 *
 * A key is skipped when, after trimming, it is a blocked name, is altered by
 * redactSensitiveText(), or does not match OTEL_LOG_RAW_ATTRIBUTE_KEY_RE.
 * Copying stops once `attributes` holds MAX_OTEL_LOG_ATTRIBUTE_COUNT entries.
 * Value handling is delegated to assignOtelLogAttribute().
 */
function assignOtelLogEventAttributes(
  attributes: Record<string, string | number | boolean>,
  eventAttributes: Record<string, string | number | boolean> | undefined,
): void {
  if (!eventAttributes) {
    return;
  }
  for (const sourceKey of Object.keys(eventAttributes)) {
    if (Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT) {
      break;
    }
    const key = sourceKey.trim();
    const isAcceptableKey =
      !BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key) &&
      redactSensitiveText(key) === key &&
      OTEL_LOG_RAW_ATTRIBUTE_KEY_RE.test(key);
    if (isAcceptableKey) {
      // The value is looked up with the untrimmed key it is actually stored under.
      assignOtelLogAttribute(attributes, `openclaw.${key}`, eventAttributes[sourceKey]);
    }
  }
}
|
|
468
|
+
|
|
469
|
+
/**
 * Converts hexadecimal trace flags into OpenTelemetry TraceFlags, keeping only
 * the sampled bit. Missing or unparseable input yields TraceFlags.NONE.
 */
function traceFlagsToOtel(traceFlags: string | undefined): TraceFlags {
  const flagBits = Number.parseInt(traceFlags ?? "00", 16);
  const sampledBit = flagBits & TraceFlags.SAMPLED;
  if (sampledBit === 0) {
    return TraceFlags.NONE;
  }
  return TraceFlags.SAMPLED;
}
|
|
473
|
+
|
|
474
|
+
/**
 * Builds an OpenTelemetry context, derived from the currently active one, that
 * carries the given trace context as a remote span context. Returns undefined
 * when the trace context fails validation or has no span id.
 */
function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined) {
  const validated = normalizeTraceContext(traceContext);
  const spanId = validated?.spanId;
  if (!validated || !spanId) {
    return undefined;
  }
  const activeContext = otelContextApi.active();
  return trace.setSpanContext(activeContext, {
    traceId: validated.traceId,
    spanId,
    traceFlags: traceFlagsToOtel(validated.traceFlags),
    isRemote: true,
  });
}
|
|
486
|
+
|
|
487
|
+
/**
 * Returns the context for the event's trace, but only when the event metadata
 * is marked trusted; otherwise returns undefined.
 */
function contextForTrustedTraceContext(
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) {
  if (!metadata.trusted) {
    return undefined;
  }
  return contextForTraceContext(evt.trace);
}
|
|
493
|
+
|
|
494
|
+
/**
 * Writes the validated trace context onto `attributes` as `openclaw.traceId`,
 * plus `openclaw.spanId`, `openclaw.parentSpanId` and `openclaw.traceFlags`
 * for whichever of those fields are set. Does nothing when the trace context
 * fails validation.
 */
function addTraceAttributes(
  attributes: Record<string, string | number | boolean>,
  traceContext: DiagnosticTraceContext | undefined,
): void {
  const validated = normalizeTraceContext(traceContext);
  if (!validated) {
    return;
  }
  attributes["openclaw.traceId"] = validated.traceId;

  const optionalFields: Array<[string, string | undefined]> = [
    ["openclaw.spanId", validated.spanId],
    ["openclaw.parentSpanId", validated.parentSpanId],
    ["openclaw.traceFlags", validated.traceFlags],
  ];
  for (const [attributeKey, fieldValue] of optionalFields) {
    if (fieldValue) {
      attributes[attributeKey] = fieldValue;
    }
  }
}
|
|
513
|
+
|
|
72
514
|
export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
73
515
|
let sdk: NodeSDK | null = null;
|
|
74
516
|
let logProvider: LoggerProvider | null = null;
|
|
75
|
-
let stopLogTransport: (() => void) | null = null;
|
|
76
517
|
let unsubscribe: (() => void) | null = null;
|
|
518
|
+
let stopActiveTrustedSpans: (() => void) | null = null;
|
|
519
|
+
|
|
520
|
+
const stopStarted = async () => {
|
|
521
|
+
const currentUnsubscribe = unsubscribe;
|
|
522
|
+
const currentLogProvider = logProvider;
|
|
523
|
+
const currentSdk = sdk;
|
|
524
|
+
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
|
|
525
|
+
|
|
526
|
+
unsubscribe = null;
|
|
527
|
+
logProvider = null;
|
|
528
|
+
sdk = null;
|
|
529
|
+
stopActiveTrustedSpans = null;
|
|
530
|
+
|
|
531
|
+
currentUnsubscribe?.();
|
|
532
|
+
currentStopActiveTrustedSpans?.();
|
|
533
|
+
if (currentLogProvider) {
|
|
534
|
+
await currentLogProvider.shutdown().catch(() => undefined);
|
|
535
|
+
}
|
|
536
|
+
if (currentSdk) {
|
|
537
|
+
await currentSdk.shutdown().catch(() => undefined);
|
|
538
|
+
}
|
|
539
|
+
};
|
|
77
540
|
|
|
78
541
|
return {
|
|
79
542
|
id: "diagnostics-otel",
|
|
80
543
|
async start(ctx) {
|
|
544
|
+
await stopStarted();
|
|
545
|
+
|
|
81
546
|
const cfg = ctx.config.diagnostics;
|
|
82
547
|
const otel = cfg?.otel;
|
|
83
548
|
if (!cfg?.enabled || !otel?.enabled) {
|
|
84
549
|
return;
|
|
85
550
|
}
|
|
86
551
|
|
|
552
|
+
const emitExporterEvent = (
|
|
553
|
+
event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts">,
|
|
554
|
+
) => {
|
|
555
|
+
try {
|
|
556
|
+
ctx.internalDiagnostics?.emit({
|
|
557
|
+
type: "telemetry.exporter",
|
|
558
|
+
...event,
|
|
559
|
+
});
|
|
560
|
+
} catch {
|
|
561
|
+
// Exporter health must never affect the exporter lifecycle.
|
|
562
|
+
}
|
|
563
|
+
};
|
|
564
|
+
const emitForSignals = (
|
|
565
|
+
signals: TelemetryExporterDiagnosticEvent["signal"][],
|
|
566
|
+
event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts" | "signal">,
|
|
567
|
+
) => {
|
|
568
|
+
for (const signal of signals) {
|
|
569
|
+
emitExporterEvent({ signal, ...event });
|
|
570
|
+
}
|
|
571
|
+
};
|
|
572
|
+
const tracesEnabled = otel.traces !== false;
|
|
573
|
+
const metricsEnabled = otel.metrics !== false;
|
|
574
|
+
const logsEnabled = otel.logs === true;
|
|
575
|
+
const enabledSignals: TelemetryExporterDiagnosticEvent["signal"][] = [
|
|
576
|
+
...(tracesEnabled ? (["traces"] as const) : []),
|
|
577
|
+
...(metricsEnabled ? (["metrics"] as const) : []),
|
|
578
|
+
...(logsEnabled ? (["logs"] as const) : []),
|
|
579
|
+
];
|
|
580
|
+
if (enabledSignals.length === 0) {
|
|
581
|
+
return;
|
|
582
|
+
}
|
|
583
|
+
|
|
87
584
|
const protocol = otel.protocol ?? process.env.OTEL_EXPORTER_OTLP_PROTOCOL ?? "http/protobuf";
|
|
88
585
|
if (protocol !== "http/protobuf") {
|
|
586
|
+
emitForSignals(enabledSignals, {
|
|
587
|
+
exporter: "diagnostics-otel",
|
|
588
|
+
status: "failure",
|
|
589
|
+
reason: "unsupported_protocol",
|
|
590
|
+
});
|
|
89
591
|
ctx.logger.warn(`diagnostics-otel: unsupported protocol ${protocol}`);
|
|
90
592
|
return;
|
|
91
593
|
}
|
|
92
594
|
|
|
93
|
-
const endpoint = normalizeEndpoint(
|
|
595
|
+
const endpoint = normalizeEndpoint(
|
|
596
|
+
otel.endpoint ?? process.env[OTEL_EXPORTER_OTLP_ENDPOINT_ENV],
|
|
597
|
+
);
|
|
94
598
|
const headers = otel.headers ?? undefined;
|
|
95
599
|
const serviceName =
|
|
96
600
|
otel.serviceName?.trim() || process.env.OTEL_SERVICE_NAME || DEFAULT_SERVICE_NAME;
|
|
97
601
|
const sampleRate = resolveSampleRate(otel.sampleRate);
|
|
98
|
-
|
|
99
|
-
const
|
|
100
|
-
const metricsEnabled = otel.metrics !== false;
|
|
101
|
-
const logsEnabled = otel.logs === true;
|
|
102
|
-
if (!tracesEnabled && !metricsEnabled && !logsEnabled) {
|
|
103
|
-
return;
|
|
104
|
-
}
|
|
602
|
+
const contentCapturePolicy = resolveContentCapturePolicy(otel.captureContent);
|
|
603
|
+
const sdkPreloaded = hasPreloadedOtelSdk();
|
|
105
604
|
|
|
106
605
|
const resource = resourceFromAttributes({
|
|
107
606
|
[ATTR_SERVICE_NAME]: serviceName,
|
|
108
607
|
});
|
|
109
608
|
|
|
110
|
-
const
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
? {
|
|
132
|
-
: {}),
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
609
|
+
const logUrl = resolveSignalOtelUrl({
|
|
610
|
+
signalEndpoint: otel.logsEndpoint,
|
|
611
|
+
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV],
|
|
612
|
+
endpoint,
|
|
613
|
+
path: "v1/logs",
|
|
614
|
+
});
|
|
615
|
+
if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
|
|
616
|
+
const traceUrl = resolveSignalOtelUrl({
|
|
617
|
+
signalEndpoint: otel.tracesEndpoint,
|
|
618
|
+
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV],
|
|
619
|
+
endpoint,
|
|
620
|
+
path: "v1/traces",
|
|
621
|
+
});
|
|
622
|
+
const metricUrl = resolveSignalOtelUrl({
|
|
623
|
+
signalEndpoint: otel.metricsEndpoint,
|
|
624
|
+
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV],
|
|
625
|
+
endpoint,
|
|
626
|
+
path: "v1/metrics",
|
|
627
|
+
});
|
|
628
|
+
const traceExporter = tracesEnabled
|
|
629
|
+
? new OTLPTraceExporter({
|
|
630
|
+
...(traceUrl ? { url: traceUrl } : {}),
|
|
631
|
+
...(headers ? { headers } : {}),
|
|
632
|
+
})
|
|
633
|
+
: undefined;
|
|
634
|
+
|
|
635
|
+
const metricExporter = metricsEnabled
|
|
636
|
+
? new OTLPMetricExporter({
|
|
637
|
+
...(metricUrl ? { url: metricUrl } : {}),
|
|
638
|
+
...(headers ? { headers } : {}),
|
|
639
|
+
})
|
|
640
|
+
: undefined;
|
|
641
|
+
|
|
642
|
+
const metricReader = metricExporter
|
|
643
|
+
? new PeriodicExportingMetricReader({
|
|
644
|
+
exporter: metricExporter,
|
|
645
|
+
...(typeof otel.flushIntervalMs === "number"
|
|
646
|
+
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
|
|
647
|
+
: {}),
|
|
648
|
+
})
|
|
649
|
+
: undefined;
|
|
650
|
+
|
|
137
651
|
sdk = new NodeSDK({
|
|
138
652
|
resource,
|
|
139
653
|
...(traceExporter ? { traceExporter } : {}),
|
|
@@ -148,11 +662,26 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
148
662
|
});
|
|
149
663
|
|
|
150
664
|
try {
|
|
151
|
-
|
|
665
|
+
sdk.start();
|
|
152
666
|
} catch (err) {
|
|
667
|
+
emitForSignals(
|
|
668
|
+
[
|
|
669
|
+
...(tracesEnabled ? (["traces"] as const) : []),
|
|
670
|
+
...(metricsEnabled ? (["metrics"] as const) : []),
|
|
671
|
+
],
|
|
672
|
+
{
|
|
673
|
+
exporter: "diagnostics-otel",
|
|
674
|
+
status: "failure",
|
|
675
|
+
reason: "start_failed",
|
|
676
|
+
errorCategory: errorCategory(err),
|
|
677
|
+
},
|
|
678
|
+
);
|
|
679
|
+
await stopStarted();
|
|
153
680
|
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
|
|
154
681
|
throw err;
|
|
155
682
|
}
|
|
683
|
+
} else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) {
|
|
684
|
+
ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK");
|
|
156
685
|
}
|
|
157
686
|
|
|
158
687
|
const logSeverityMap: Record<string, SeverityNumber> = {
|
|
@@ -166,11 +695,46 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
166
695
|
|
|
167
696
|
const meter = metrics.getMeter("openclaw");
|
|
168
697
|
const tracer = trace.getTracer("openclaw");
|
|
698
|
+
const activeTrustedSpans = new Map<string, ReturnType<typeof tracer.startSpan>>();
|
|
699
|
+
const activeTrustedSpanAliases = new Map<string, ReturnType<typeof tracer.startSpan>>();
|
|
700
|
+
const pendingTrustedRunFinalizers = new Map<string, ReturnType<typeof setImmediate>>();
|
|
701
|
+
stopActiveTrustedSpans = () => {
|
|
702
|
+
const stopAt = Date.now();
|
|
703
|
+
for (const handle of pendingTrustedRunFinalizers.values()) {
|
|
704
|
+
clearImmediate(handle);
|
|
705
|
+
}
|
|
706
|
+
pendingTrustedRunFinalizers.clear();
|
|
707
|
+
for (const span of new Set([
|
|
708
|
+
...activeTrustedSpans.values(),
|
|
709
|
+
...activeTrustedSpanAliases.values(),
|
|
710
|
+
])) {
|
|
711
|
+
span.end(stopAt);
|
|
712
|
+
}
|
|
713
|
+
activeTrustedSpans.clear();
|
|
714
|
+
activeTrustedSpanAliases.clear();
|
|
715
|
+
};
|
|
169
716
|
|
|
170
717
|
const tokensCounter = meter.createCounter("openclaw.tokens", {
|
|
171
718
|
unit: "1",
|
|
172
719
|
description: "Token usage by type",
|
|
173
720
|
});
|
|
721
|
+
const genAiTokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
|
|
722
|
+
unit: "{token}",
|
|
723
|
+
description: "Number of input and output tokens used by GenAI client operations",
|
|
724
|
+
advice: {
|
|
725
|
+
explicitBucketBoundaries: GEN_AI_TOKEN_USAGE_BUCKETS,
|
|
726
|
+
},
|
|
727
|
+
});
|
|
728
|
+
const genAiOperationDurationHistogram = meter.createHistogram(
|
|
729
|
+
"gen_ai.client.operation.duration",
|
|
730
|
+
{
|
|
731
|
+
unit: "s",
|
|
732
|
+
description: "GenAI client operation duration",
|
|
733
|
+
advice: {
|
|
734
|
+
explicitBucketBoundaries: GEN_AI_OPERATION_DURATION_BUCKETS,
|
|
735
|
+
},
|
|
736
|
+
},
|
|
737
|
+
);
|
|
174
738
|
const costCounter = meter.createCounter("openclaw.cost.usd", {
|
|
175
739
|
unit: "1",
|
|
176
740
|
description: "Estimated model cost (USD)",
|
|
@@ -179,6 +743,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
179
743
|
unit: "ms",
|
|
180
744
|
description: "Agent run duration",
|
|
181
745
|
});
|
|
746
|
+
const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
|
|
747
|
+
unit: "ms",
|
|
748
|
+
description: "Agent harness lifecycle duration",
|
|
749
|
+
});
|
|
182
750
|
const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
|
|
183
751
|
unit: "1",
|
|
184
752
|
description: "Context window size and usage",
|
|
@@ -207,6 +775,20 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
207
775
|
unit: "ms",
|
|
208
776
|
description: "Message processing duration",
|
|
209
777
|
});
|
|
778
|
+
const messageDeliveryStartedCounter = meter.createCounter(
|
|
779
|
+
"openclaw.message.delivery.started",
|
|
780
|
+
{
|
|
781
|
+
unit: "1",
|
|
782
|
+
description: "Outbound message delivery attempts started",
|
|
783
|
+
},
|
|
784
|
+
);
|
|
785
|
+
const messageDeliveryDurationHistogram = meter.createHistogram(
|
|
786
|
+
"openclaw.message.delivery.duration_ms",
|
|
787
|
+
{
|
|
788
|
+
unit: "ms",
|
|
789
|
+
description: "Outbound message delivery duration",
|
|
790
|
+
},
|
|
791
|
+
);
|
|
210
792
|
const queueDepthHistogram = meter.createHistogram("openclaw.queue.depth", {
|
|
211
793
|
unit: "1",
|
|
212
794
|
description: "Queue depth on enqueue/dequeue",
|
|
@@ -239,8 +821,118 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
239
821
|
unit: "1",
|
|
240
822
|
description: "Run attempts",
|
|
241
823
|
});
|
|
824
|
+
// --- Tool loop detection ---------------------------------------------------
const toolLoopCounter = meter.createCounter("openclaw.tool.loop", {
  description: "Detected repetitive tool-call loop events",
  unit: "1",
});

// --- Model calls -----------------------------------------------------------
const modelCallDurationHistogram = meter.createHistogram("openclaw.model_call.duration_ms", {
  description: "Model call duration",
  unit: "ms",
});
const modelCallRequestBytesHistogram = meter.createHistogram("openclaw.model_call.request_bytes", {
  description: "UTF-8 byte size of sanitized model request payloads",
  unit: "By",
});
const modelCallResponseBytesHistogram = meter.createHistogram(
  "openclaw.model_call.response_bytes",
  {
    description: "UTF-8 byte size of streamed model response events",
    unit: "By",
  },
);
const modelCallTimeToFirstByteHistogram = meter.createHistogram(
  "openclaw.model_call.time_to_first_byte_ms",
  {
    description: "Elapsed time before the first streamed model response event",
    unit: "ms",
  },
);

// --- Tool and exec process durations ---------------------------------------
const toolExecutionDurationHistogram = meter.createHistogram(
  "openclaw.tool.execution.duration_ms",
  {
    description: "Tool execution duration",
    unit: "ms",
  },
);
const execProcessDurationHistogram = meter.createHistogram("openclaw.exec.duration_ms", {
  description: "Exec process duration",
  unit: "ms",
});

// --- Memory samples and pressure (byte-valued histograms use unit "By") ----
const memoryRssHistogram = meter.createHistogram("openclaw.memory.rss_bytes", {
  description: "Resident set size reported by diagnostic memory samples",
  unit: "By",
});
const memoryHeapUsedHistogram = meter.createHistogram("openclaw.memory.heap_used_bytes", {
  description: "Heap used bytes reported by diagnostic memory samples",
  unit: "By",
});
const memoryHeapTotalHistogram = meter.createHistogram("openclaw.memory.heap_total_bytes", {
  description: "Heap total bytes reported by diagnostic memory samples",
  unit: "By",
});
const memoryExternalHistogram = meter.createHistogram("openclaw.memory.external_bytes", {
  description: "External memory bytes reported by diagnostic memory samples",
  unit: "By",
});
const memoryArrayBuffersHistogram = meter.createHistogram("openclaw.memory.array_buffers_bytes", {
  description: "ArrayBuffer bytes reported by diagnostic memory samples",
  unit: "By",
});
const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
  description: "Diagnostic memory pressure events",
  unit: "1",
});

// --- Liveness warnings -----------------------------------------------------
const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", {
  description: "Diagnostic liveness warning events",
  unit: "1",
});
const livenessEventLoopDelayP99Histogram = meter.createHistogram(
  "openclaw.liveness.event_loop_delay_p99_ms",
  {
    description: "P99 event-loop delay reported by diagnostic liveness warnings",
    unit: "ms",
  },
);
const livenessEventLoopDelayMaxHistogram = meter.createHistogram(
  "openclaw.liveness.event_loop_delay_max_ms",
  {
    description: "Maximum event-loop delay reported by diagnostic liveness warnings",
    unit: "ms",
  },
);
const livenessEventLoopUtilizationHistogram = meter.createHistogram(
  "openclaw.liveness.event_loop_utilization",
  {
    description: "Event-loop utilization reported by diagnostic liveness warnings",
    unit: "1",
  },
);
const livenessCpuCoreRatioHistogram = meter.createHistogram("openclaw.liveness.cpu_core_ratio", {
  description: "CPU core ratio reported by diagnostic liveness warnings",
  unit: "1",
});

// --- Telemetry exporter self-observation -----------------------------------
const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
  description: "Diagnostic telemetry exporter lifecycle and failure events",
  unit: "1",
});
|
|
242
927
|
|
|
928
|
+
// Handler for "log.record" diagnostic events. It stays undefined unless the
// `if (logsEnabled)` branch below assigns an implementation.
let recordLogRecord:
  | undefined
  | ((
      evt: Extract<DiagnosticEventPayload, { type: "log.record" }>,
      metadata: DiagnosticEventMetadata,
    ) => void);
|
|
243
934
|
if (logsEnabled) {
|
|
935
|
+
let logRecordExportFailureLastReportedAt = Number.NEGATIVE_INFINITY;
|
|
244
936
|
const logExporter = new OTLPLogExporter({
|
|
245
937
|
...(logUrl ? { url: logUrl } : {}),
|
|
246
938
|
...(headers ? { headers } : {}),
|
|
@@ -256,142 +948,238 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
256
948
|
processors: [logProcessor],
|
|
257
949
|
});
|
|
258
950
|
const otelLogger = logProvider.getLogger("openclaw");
|
|
259
|
-
|
|
260
|
-
stopLogTransport = registerLogTransport((logObj) => {
|
|
951
|
+
// Turns a "log.record" diagnostic event into an OpenTelemetry log record and
// emits it through `otelLogger`. Any failure while building or emitting the
// record is reported via `emitExporterEvent`, and additionally written to the
// plugin logger at most once per LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS.
recordLogRecord = (evt, metadata) => {
  try {
    const levelName = evt.level || "INFO";
    // Level names missing from logSeverityMap fall back to severity number 9.
    const severity = logSeverityMap[levelName] ?? (9 as SeverityNumber);

    // Prototype-less object used as the attribute bag.
    const attrs = Object.create(null) as Record<string, string | number | boolean>;
    assignOtelLogAttribute(attrs, "openclaw.log.level", levelName);
    if (evt.loggerName) {
      assignOtelLogAttribute(attrs, "openclaw.logger", evt.loggerName);
    }
    if (evt.loggerParents?.length) {
      assignOtelLogAttribute(attrs, "openclaw.logger.parents", evt.loggerParents.join("."));
    }
    assignOtelLogEventAttributes(attrs, evt.attributes);
    if (evt.code?.line) {
      assignOtelLogAttribute(attrs, "code.lineno", evt.code.line);
    }
    if (evt.code?.functionName) {
      assignOtelLogAttribute(attrs, "code.function", evt.code.functionName);
    }
    // Trace identifiers are only attached for events flagged as trusted.
    if (metadata.trusted) {
      addTraceAttributes(attrs, evt.trace);
    }

    const record: LogRecord = {
      body: normalizeOtelLogString(evt.message || "log", MAX_OTEL_LOG_BODY_CHARS),
      severityText: levelName,
      severityNumber: severity,
      attributes: redactOtelAttributes(attrs),
      timestamp: evt.ts,
    };
    const traceContext = contextForTrustedTraceContext(evt, metadata);
    if (traceContext) {
      record.context = traceContext;
    }
    otelLogger.emit(record);
  } catch (err) {
    emitExporterEvent({
      exporter: "diagnostics-otel",
      signal: "logs",
      status: "failure",
      reason: "emit_failed",
      errorCategory: errorCategory(err),
    });
    // Rate-limit the error log line; the exporter event above is emitted every time.
    const nowMs = Date.now();
    const sinceLastReportMs = nowMs - logRecordExportFailureLastReportedAt;
    if (sinceLastReportMs >= LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS) {
      logRecordExportFailureLastReportedAt = nowMs;
      ctx.logger.error(`diagnostics-otel: log record export failed: ${formatError(err)}`);
    }
  }
};
|
|
366
1008
|
}
|
|
367
1009
|
|
|
368
1010
|
/**
 * Starts a span with redacted attributes and returns it un-ended.
 *
 * Start-time resolution, in priority order:
 *  1. `options.startTimeMs` when it is a number;
 *  2. `endTimeMs - durationMs` when `durationMs` is a non-negative number,
 *     where `endTimeMs` is `options.endTimeMs` or, failing that, `Date.now()`;
 *  3. otherwise no explicit start time is passed to the tracer.
 *
 * A null or missing `options.parentContext` is passed to the tracer as `undefined`.
 * The caller is responsible for calling `span.end(...)`.
 */
const spanWithDuration = (
  name: string,
  attributes: Record<string, string | number | boolean>,
  durationMs?: number,
  options: {
    parentContext?: ReturnType<typeof contextForTraceContext> | null;
    endTimeMs?: number;
    startTimeMs?: number;
  } = {},
) => {
  const endTimeMs = options.endTimeMs ?? Date.now();

  let startTime: number | undefined;
  if (typeof options.startTimeMs === "number") {
    startTime = options.startTimeMs;
  } else if (typeof durationMs === "number" && durationMs >= 0) {
    startTime = endTimeMs - durationMs;
  }

  const parentContext = options.parentContext ?? undefined;
  return tracer.startSpan(
    name,
    {
      attributes: redactOtelAttributes(attributes),
      ...(startTime !== undefined ? { startTime } : {}),
    },
    parentContext,
  );
};
|
|
1039
|
+
/**
 * Returns the normalized trace context carried by `evt`, but only when the
 * event metadata is flagged as trusted; otherwise returns `undefined`.
 */
const trustedTraceContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  if (!metadata.trusted) {
    return undefined;
  }
  return normalizeTraceContext(evt.trace);
};
|
|
1043
|
+
/**
 * Builds an OpenTelemetry context whose span context is that of the event's
 * still-tracked parent span.
 *
 * The parent is looked up by the trusted trace context's `parentSpanId`, first
 * in `activeTrustedSpans` and then in `activeTrustedSpanAliases`. Returns
 * `undefined` when there is no trusted parent id or no tracked span for it.
 */
const activeTrustedParentContext = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const parentId = trustedTraceContext(evt, metadata)?.parentSpanId;
  const parentSpan = parentId
    ? (activeTrustedSpans.get(parentId) ?? activeTrustedSpanAliases.get(parentId))
    : undefined;
  return parentSpan
    ? trace.setSpanContext(otelContextApi.active(), parentSpan.spanContext())
    : undefined;
};
|
|
1058
|
+
/**
 * Registers `span` in `activeTrustedSpans` under the event's trusted span id,
 * when the event has one, and returns the same span so calls can be chained.
 */
const trackTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
  span: ReturnType<typeof tracer.startSpan>,
) => {
  const trackedId = trustedTraceContext(evt, metadata)?.spanId;
  if (!trackedId) {
    return span;
  }
  activeTrustedSpans.set(trackedId, span);
  return span;
};
|
|
1069
|
+
/**
 * Looks up the span tracked under the event's trusted span id and, if found,
 * removes it from `activeTrustedSpans` before returning it.
 *
 * Returns `undefined` when the event has no trusted span id or nothing is
 * tracked under it.
 */
const takeTrackedTrustedSpan = (
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
) => {
  const trackedId = trustedTraceContext(evt, metadata)?.spanId;
  if (!trackedId) {
    return undefined;
  }
  const tracked = activeTrustedSpans.get(trackedId);
  if (tracked) {
    activeTrustedSpans.delete(trackedId);
  }
  return tracked;
};
|
|
1083
|
+
/**
 * Applies `attributes` to `span` after passing them through
 * `redactOtelAttributes`. The optional call means nothing happens (and the
 * attributes are not even redacted) if the span has no `setAttributes` method.
 */
const setSpanAttrs = (
  span: ReturnType<typeof tracer.startSpan>,
  attributes: Record<string, string | number | boolean>,
): void => {
  span.setAttributes?.(redactOtelAttributes(attributes));
};
|
|
1089
|
+
/**
 * Defers ending a tracked run span to a `setImmediate` callback.
 *
 * Any finalizer already pending for `spanId` is cancelled and replaced. When
 * the callback runs it drops its own pending entry, removes the span from
 * `activeTrustedSpans` / `activeTrustedSpanAliases` (only if those entries
 * still point at this exact span), and ends the span at `endTimeMs`.
 */
const scheduleTrackedRunSpanFinalize = (
  spanId: string,
  parentSpanId: string | undefined,
  span: ReturnType<typeof tracer.startSpan>,
  endTimeMs: number,
) => {
  const pending = pendingTrustedRunFinalizers.get(spanId);
  if (pending) {
    clearImmediate(pending);
  }

  const finalize = () => {
    pendingTrustedRunFinalizers.delete(spanId);
    // Identity checks keep us from evicting a newer span registered under the same id.
    if (activeTrustedSpans.get(spanId) === span) {
      activeTrustedSpans.delete(spanId);
    }
    if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) {
      activeTrustedSpanAliases.delete(parentSpanId);
    }
    span.end(endTimeMs);
  };

  pendingTrustedRunFinalizers.set(spanId, setImmediate(finalize));
};
|
|
1111
|
+
|
|
1112
|
+
/**
 * Copies the run's provider, model, channel and trigger onto `spanAttrs` as
 * `openclaw.*` attributes. Empty or missing values are skipped. The identity
 * fields accepted in `evt` (runId, sessionKey, sessionId) are not copied.
 */
const addRunAttrs = (
  spanAttrs: Record<string, string | number | boolean>,
  evt: {
    runId?: string;
    sessionKey?: string;
    sessionId?: string;
    provider?: string;
    model?: string;
    channel?: string;
    trigger?: string;
  },
) => {
  const candidates: ReadonlyArray<readonly [string, string | undefined]> = [
    ["openclaw.provider", evt.provider],
    ["openclaw.model", evt.model],
    ["openclaw.channel", evt.channel],
    ["openclaw.trigger", evt.trigger],
  ];
  for (const [attrName, attrValue] of candidates) {
    if (attrValue) {
      spanAttrs[attrName] = attrValue;
    }
  }
};
|
|
1137
|
+
|
|
1138
|
+
/**
 * Converts a tool-execution params summary into span attributes.
 *
 * Returns an empty object for a missing summary; otherwise always includes the
 * summary kind and, when the summary has a `length` property, that length too.
 */
const paramsSummaryAttrs = (
  summary: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>["paramsSummary"],
): Record<string, string | number> => {
  if (!summary) {
    return {};
  }
  const attrs: Record<string, string | number> = {
    "openclaw.tool.params.kind": summary.kind,
  };
  if ("length" in summary) {
    attrs["openclaw.tool.params.length"] = summary.length;
  }
  return attrs;
};
|
|
381
1152
|
|
|
382
|
-
const recordModelUsage = (
|
|
1153
|
+
const recordModelUsage = (
|
|
1154
|
+
evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
|
|
1155
|
+
metadata: DiagnosticEventMetadata,
|
|
1156
|
+
) => {
|
|
383
1157
|
const attrs = {
|
|
384
1158
|
"openclaw.channel": evt.channel ?? "unknown",
|
|
1159
|
+
"openclaw.agent": lowCardinalityAttr(evt.agentId),
|
|
385
1160
|
"openclaw.provider": evt.provider ?? "unknown",
|
|
386
1161
|
"openclaw.model": evt.model ?? "unknown",
|
|
387
1162
|
};
|
|
1163
|
+
const genAiAttrs: Record<string, string> = {
|
|
1164
|
+
"gen_ai.operation.name": "chat",
|
|
1165
|
+
"gen_ai.provider.name": lowCardinalityAttr(evt.provider),
|
|
1166
|
+
"gen_ai.request.model": lowCardinalityAttr(evt.model),
|
|
1167
|
+
};
|
|
388
1168
|
|
|
389
1169
|
const usage = evt.usage;
|
|
390
1170
|
if (usage.input) {
|
|
391
1171
|
tokensCounter.add(usage.input, { ...attrs, "openclaw.token": "input" });
|
|
1172
|
+
genAiTokenUsageHistogram.record(usage.input, {
|
|
1173
|
+
...genAiAttrs,
|
|
1174
|
+
"gen_ai.token.type": "input",
|
|
1175
|
+
});
|
|
392
1176
|
}
|
|
393
1177
|
if (usage.output) {
|
|
394
1178
|
tokensCounter.add(usage.output, { ...attrs, "openclaw.token": "output" });
|
|
1179
|
+
genAiTokenUsageHistogram.record(usage.output, {
|
|
1180
|
+
...genAiAttrs,
|
|
1181
|
+
"gen_ai.token.type": "output",
|
|
1182
|
+
});
|
|
395
1183
|
}
|
|
396
1184
|
if (usage.cacheRead) {
|
|
397
1185
|
tokensCounter.add(usage.cacheRead, { ...attrs, "openclaw.token": "cache_read" });
|
|
@@ -428,19 +1216,36 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
428
1216
|
if (!tracesEnabled) {
|
|
429
1217
|
return;
|
|
430
1218
|
}
|
|
1219
|
+
const genAiInputTokens =
|
|
1220
|
+
usage.promptTokens ??
|
|
1221
|
+
(usage.input ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0);
|
|
431
1222
|
const spanAttrs: Record<string, string | number> = {
|
|
432
1223
|
...attrs,
|
|
433
|
-
"openclaw.sessionKey": evt.sessionKey ?? "",
|
|
434
|
-
"openclaw.sessionId": evt.sessionId ?? "",
|
|
435
1224
|
"openclaw.tokens.input": usage.input ?? 0,
|
|
436
1225
|
"openclaw.tokens.output": usage.output ?? 0,
|
|
437
1226
|
"openclaw.tokens.cache_read": usage.cacheRead ?? 0,
|
|
438
1227
|
"openclaw.tokens.cache_write": usage.cacheWrite ?? 0,
|
|
439
1228
|
"openclaw.tokens.total": usage.total ?? 0,
|
|
440
1229
|
};
|
|
1230
|
+
assignGenAiSpanIdentityAttrs(spanAttrs, evt);
|
|
1231
|
+
assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.input_tokens", genAiInputTokens);
|
|
1232
|
+
assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.output_tokens", usage.output);
|
|
1233
|
+
assignPositiveNumberAttr(
|
|
1234
|
+
spanAttrs,
|
|
1235
|
+
"gen_ai.usage.cache_read.input_tokens",
|
|
1236
|
+
usage.cacheRead,
|
|
1237
|
+
);
|
|
1238
|
+
assignPositiveNumberAttr(
|
|
1239
|
+
spanAttrs,
|
|
1240
|
+
"gen_ai.usage.cache_creation.input_tokens",
|
|
1241
|
+
usage.cacheWrite,
|
|
1242
|
+
);
|
|
441
1243
|
|
|
442
|
-
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs
|
|
443
|
-
|
|
1244
|
+
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, {
|
|
1245
|
+
parentContext: activeTrustedParentContext(evt, metadata),
|
|
1246
|
+
endTimeMs: evt.ts,
|
|
1247
|
+
});
|
|
1248
|
+
span.end(evt.ts);
|
|
444
1249
|
};
|
|
445
1250
|
|
|
446
1251
|
const recordWebhookReceived = (
|
|
@@ -513,18 +1318,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
513
1318
|
}
|
|
514
1319
|
};
|
|
515
1320
|
|
|
516
|
-
const addSessionIdentityAttrs = (
|
|
517
|
-
spanAttrs: Record<string, string | number>,
|
|
518
|
-
evt: { sessionKey?: string; sessionId?: string },
|
|
519
|
-
) => {
|
|
520
|
-
if (evt.sessionKey) {
|
|
521
|
-
spanAttrs["openclaw.sessionKey"] = evt.sessionKey;
|
|
522
|
-
}
|
|
523
|
-
if (evt.sessionId) {
|
|
524
|
-
spanAttrs["openclaw.sessionId"] = evt.sessionId;
|
|
525
|
-
}
|
|
526
|
-
};
|
|
527
|
-
|
|
528
1321
|
const recordMessageProcessed = (
|
|
529
1322
|
evt: Extract<DiagnosticEventPayload, { type: "message.processed" }>,
|
|
530
1323
|
) => {
|
|
@@ -540,7 +1333,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
540
1333
|
return;
|
|
541
1334
|
}
|
|
542
1335
|
const spanAttrs: Record<string, string | number> = { ...attrs };
|
|
543
|
-
addSessionIdentityAttrs(spanAttrs, evt);
|
|
544
1336
|
if (evt.chatId !== undefined) {
|
|
545
1337
|
spanAttrs["openclaw.chatId"] = String(evt.chatId);
|
|
546
1338
|
}
|
|
@@ -557,27 +1349,108 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
557
1349
|
span.end();
|
|
558
1350
|
};
|
|
559
1351
|
|
|
560
|
-
const
|
|
561
|
-
evt:
|
|
562
|
-
) => {
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
};
|
|
1352
|
+
/** Base attributes (channel and delivery kind) shared by all message delivery telemetry. */
const messageDeliveryAttrs = (evt: MessageDeliveryDiagnosticEvent): Record<string, string> => {
  const { channel, deliveryKind } = evt;
  return {
    "openclaw.channel": channel,
    "openclaw.delivery.kind": deliveryKind,
  };
};
|
|
567
1358
|
|
|
568
|
-
const
|
|
569
|
-
evt: Extract<DiagnosticEventPayload, { type: "
|
|
1359
|
+
/** Counts one started outbound delivery, tagged with the shared delivery attributes. */
const recordMessageDeliveryStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "message.delivery.started" }>,
) => {
  const deliveryAttrs = messageDeliveryAttrs(evt);
  messageDeliveryStartedCounter.add(1, deliveryAttrs);
};
|
|
578
1364
|
|
|
579
|
-
const
|
|
580
|
-
evt: Extract<DiagnosticEventPayload, { type: "
|
|
1365
|
+
/**
 * Records a successful outbound delivery: always adds the duration to the
 * delivery histogram and, when tracing is enabled, emits an
 * "openclaw.message.delivery" span ending at the event timestamp that also
 * carries the delivery result count.
 */
const recordMessageDeliveryCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "message.delivery.completed" }>,
) => {
  const metricAttrs = {
    ...messageDeliveryAttrs(evt),
    "openclaw.outcome": "completed",
  };
  messageDeliveryDurationHistogram.record(evt.durationMs, metricAttrs);

  if (tracesEnabled) {
    const spanAttrs: Record<string, string | number | boolean> = {
      ...metricAttrs,
      "openclaw.delivery.result_count": evt.resultCount,
    };
    const deliverySpan = spanWithDuration("openclaw.message.delivery", spanAttrs, evt.durationMs, {
      endTimeMs: evt.ts,
    });
    deliverySpan.end(evt.ts);
  }
};
|
|
1387
|
+
|
|
1388
|
+
/**
 * Records a failed outbound delivery: always adds the duration to the delivery
 * histogram (tagged with outcome "error" and the error category) and, when
 * tracing is enabled, emits an "openclaw.message.delivery" span with ERROR
 * status whose message is the redacted error category.
 */
const recordMessageDeliveryError = (
  evt: Extract<DiagnosticEventPayload, { type: "message.delivery.error" }>,
) => {
  const errorAttrs = {
    ...messageDeliveryAttrs(evt),
    "openclaw.outcome": "error",
    "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
  };
  messageDeliveryDurationHistogram.record(evt.durationMs, errorAttrs);

  if (tracesEnabled) {
    const failedSpan = spanWithDuration("openclaw.message.delivery", errorAttrs, evt.durationMs, {
      endTimeMs: evt.ts,
    });
    failedSpan.setStatus({
      code: SpanStatusCode.ERROR,
      message: redactSensitiveText(evt.errorCategory),
    });
    failedSpan.end(evt.ts);
  }
};
|
|
1409
|
+
|
|
1410
|
+
/**
 * Opens (but does not end) an "openclaw.run" span for a trusted run-start
 * event and tracks it under the event's span id. Does nothing when tracing is
 * disabled or the event is untrusted.
 *
 * If the event names a parent span id that is not itself a tracked span, the
 * new run span is also registered as an alias for that parent id.
 */
const recordRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!(tracesEnabled && metadata.trusted)) {
    return;
  }

  const spanAttrs: Record<string, string | number | boolean> = {};
  addRunAttrs(spanAttrs, evt);

  const parentContext = activeTrustedParentContext(evt, metadata);
  const runSpan = spanWithDuration("openclaw.run", spanAttrs, undefined, {
    parentContext,
    startTimeMs: evt.ts,
  });
  trackTrustedSpan(evt, metadata, runSpan);

  const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
  const parentIsTracked = parentSpanId ? activeTrustedSpans.has(parentSpanId) : false;
  if (parentSpanId && !parentIsTracked) {
    activeTrustedSpanAliases.set(parentSpanId, runSpan);
  }
};
|
|
1432
|
+
|
|
1433
|
+
/** Counts a lane enqueue and samples the queue size, both tagged with the lane. */
const recordLaneEnqueue = (
  evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
) => {
  const laneAttrs = { "openclaw.lane": evt.lane };
  laneEnqueueCounter.add(1, laneAttrs);
  queueDepthHistogram.record(evt.queueSize, laneAttrs);
};
|
|
1440
|
+
|
|
1441
|
+
/**
 * Counts a lane dequeue and samples the queue size, both tagged with the lane.
 * The queue wait time is recorded too, but only when the event carries a
 * numeric `waitMs`.
 */
const recordLaneDequeue = (
  evt: Extract<DiagnosticEventPayload, { type: "queue.lane.dequeue" }>,
) => {
  const laneAttrs = { "openclaw.lane": evt.lane };
  laneDequeueCounter.add(1, laneAttrs);
  queueDepthHistogram.record(evt.queueSize, laneAttrs);

  const { waitMs } = evt;
  if (typeof waitMs === "number") {
    queueWaitHistogram.record(waitMs, laneAttrs);
  }
};
|
|
1451
|
+
|
|
1452
|
+
const recordSessionState = (
|
|
1453
|
+
evt: Extract<DiagnosticEventPayload, { type: "session.state" }>,
|
|
581
1454
|
) => {
|
|
582
1455
|
const attrs: Record<string, string> = { "openclaw.state": evt.state };
|
|
583
1456
|
if (evt.reason) {
|
|
@@ -598,7 +1471,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
598
1471
|
return;
|
|
599
1472
|
}
|
|
600
1473
|
const spanAttrs: Record<string, string | number> = { ...attrs };
|
|
601
|
-
addSessionIdentityAttrs(spanAttrs, evt);
|
|
602
1474
|
spanAttrs["openclaw.queueDepth"] = evt.queueDepth ?? 0;
|
|
603
1475
|
spanAttrs["openclaw.ageMs"] = evt.ageMs;
|
|
604
1476
|
const span = tracer.startSpan("openclaw.session.stuck", { attributes: spanAttrs });
|
|
@@ -610,17 +1482,705 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
610
1482
|
runAttemptCounter.add(1, { "openclaw.attempt": evt.attempt });
|
|
611
1483
|
};
|
|
612
1484
|
|
|
1485
|
+
/**
 * Builds the attribute set for a tool-loop event: tool name, loop level,
 * action, detector and count, plus the paired tool name when one is present.
 * Tool names are passed through `lowCardinalityAttr` with a "tool" fallback.
 */
const toolLoopAttrs = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>,
): Record<string, string | number> => {
  const attrs: Record<string, string | number> = {
    "openclaw.toolName": lowCardinalityAttr(evt.toolName, "tool"),
    "openclaw.loop.level": evt.level,
    "openclaw.loop.action": evt.action,
    "openclaw.loop.detector": evt.detector,
    "openclaw.loop.count": evt.count,
  };
  if (evt.pairedToolName) {
    attrs["openclaw.loop.paired_tool"] = lowCardinalityAttr(evt.pairedToolName, "tool");
  }
  return attrs;
};
|
|
1497
|
+
|
|
1498
|
+
/**
 * Counts a detected tool-call loop and, when tracing is enabled, emits a
 * zero-duration "openclaw.tool.loop" span at the event timestamp. The span is
 * marked ERROR when the loop level is "critical" or the action is "block".
 */
const recordToolLoop = (evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>) => {
  const loopAttrs = toolLoopAttrs(evt);
  toolLoopCounter.add(1, loopAttrs);
  if (!tracesEnabled) {
    return;
  }

  const loopSpan = spanWithDuration("openclaw.tool.loop", loopAttrs, 0, { endTimeMs: evt.ts });
  const isSevere = evt.level === "critical" || evt.action === "block";
  if (isSevere) {
    loopSpan.setStatus({
      code: SpanStatusCode.ERROR,
      message: `${evt.detector}:${evt.action}`,
    });
  }
  loopSpan.end(evt.ts);
};
|
|
1513
|
+
|
|
1514
|
+
/**
 * Records the five memory figures carried by a memory sample or pressure
 * event (RSS, heap used, heap total, external, array buffers) into their
 * histograms, all tagged with the same optional `attrs`.
 */
const recordMemoryUsageMetrics = (
  evt: Extract<
    DiagnosticEventPayload,
    { type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
  >,
  attrs: Record<string, string> = {},
) => {
  const { memory } = evt;
  const readings = [
    [memoryRssHistogram, memory.rssBytes],
    [memoryHeapUsedHistogram, memory.heapUsedBytes],
    [memoryHeapTotalHistogram, memory.heapTotalBytes],
    [memoryExternalHistogram, memory.externalBytes],
    [memoryArrayBuffersHistogram, memory.arrayBuffersBytes],
  ] as const;
  for (const [histogram, bytes] of readings) {
    histogram.record(bytes, attrs);
  }
};
|
|
1527
|
+
|
|
1528
|
+
/** Records a periodic memory sample into the memory histograms with no extra attributes. */
const recordMemorySample = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
): void => {
  recordMemoryUsageMetrics(evt);
};
|
|
1533
|
+
|
|
1534
|
+
/**
 * Records a memory pressure event.
 *
 * Metrics: bumps the pressure counter and records the memory histograms, all
 * tagged with the pressure level and reason. Traces (when enabled): emits a
 * zero-duration "openclaw.memory.pressure" span at the event timestamp that
 * carries the memory figures plus any threshold / RSS growth / window values
 * present on the event; the span is marked ERROR for level "critical".
 */
const recordMemoryPressure = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
) => {
  const pressureAttrs = {
    "openclaw.memory.level": evt.level,
    "openclaw.memory.reason": evt.reason,
  };
  memoryPressureCounter.add(1, pressureAttrs);
  recordMemoryUsageMetrics(evt, pressureAttrs);
  if (!tracesEnabled) {
    return;
  }

  const { memory } = evt;
  const spanAttrs: Record<string, string | number | boolean> = {
    ...pressureAttrs,
    "openclaw.memory.rss_bytes": memory.rssBytes,
    "openclaw.memory.heap_used_bytes": memory.heapUsedBytes,
    "openclaw.memory.heap_total_bytes": memory.heapTotalBytes,
    "openclaw.memory.external_bytes": memory.externalBytes,
    "openclaw.memory.array_buffers_bytes": memory.arrayBuffersBytes,
  };
  // Optional details are only attached when the event actually provides them.
  if (evt.thresholdBytes !== undefined) {
    spanAttrs["openclaw.memory.threshold_bytes"] = evt.thresholdBytes;
  }
  if (evt.rssGrowthBytes !== undefined) {
    spanAttrs["openclaw.memory.rss_growth_bytes"] = evt.rssGrowthBytes;
  }
  if (evt.windowMs !== undefined) {
    spanAttrs["openclaw.memory.window_ms"] = evt.windowMs;
  }

  const pressureSpan = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
    endTimeMs: evt.ts,
  });
  if (evt.level === "critical") {
    pressureSpan.setStatus({
      code: SpanStatusCode.ERROR,
      message: evt.reason,
    });
  }
  pressureSpan.end(evt.ts);
};
|
|
1572
|
+
|
|
1573
|
+
/**
 * Records the completion of an agent run.
 *
 * Metrics: the run duration is always recorded, tagged with outcome, provider,
 * model and (when present) channel.
 *
 * Traces (when enabled): if a span is still tracked under the event's trusted
 * span id, that span is updated and its end is deferred through
 * `scheduleTrackedRunSpanFinalize`. Otherwise a fresh "openclaw.run" span is
 * synthesized from the event duration and ended immediately at `evt.ts`.
 * Either way the span gets the run attributes and, for outcome "error", an
 * ERROR status (with the redacted error category as message when available).
 */
const recordRunCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const metricAttrs: Record<string, string | number> = {
    "openclaw.outcome": evt.outcome,
    "openclaw.provider": evt.provider ?? "unknown",
    "openclaw.model": evt.model ?? "unknown",
    ...(evt.channel ? { "openclaw.channel": evt.channel } : {}),
  };
  durationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }

  const spanAttrs: Record<string, string | number | boolean> = {
    "openclaw.outcome": evt.outcome,
  };
  addRunAttrs(spanAttrs, evt);
  if (evt.errorCategory) {
    spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
  }

  const trustedTrace = trustedTraceContext(evt, metadata);
  const trackedId = trustedTrace?.spanId;
  const trackedSpan = trackedId ? activeTrustedSpans.get(trackedId) : undefined;
  const runSpan =
    trackedSpan ??
    spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });

  setSpanAttrs(runSpan, spanAttrs);
  if (evt.outcome === "error") {
    runSpan.setStatus(
      evt.errorCategory
        ? { code: SpanStatusCode.ERROR, message: redactSensitiveText(evt.errorCategory) }
        : { code: SpanStatusCode.ERROR },
    );
  }

  if (trackedSpan && trackedId) {
    // A tracked span is ended by the deferred finalizer rather than here.
    scheduleTrackedRunSpanFinalize(trackedId, trustedTrace?.parentSpanId, trackedSpan, evt.ts);
    return;
  }
  runSpan.end(evt.ts);
};
|
|
1624
|
+
|
|
1625
|
+
/**
 * Builds the shared attribute set for harness run events.
 *
 * "openclaw.outcome" is omitted for "harness.run.started", forced to "error" for
 * "harness.run.error", and taken from `evt.outcome` otherwise. The channel
 * attribute is only included when the event has a channel.
 */
const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => {
  const outcomeAttr =
    evt.type !== "harness.run.started"
      ? { "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome }
      : {};
  return {
    "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
    "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
    ...outcomeAttr,
    "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
    "openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
    ...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}),
  };
};
|
|
1637
|
+
|
|
1638
|
+
/**
 * Handles "harness.run.started": when traces are enabled and the event metadata
 * is trusted, opens an "openclaw.harness.run" span starting at `evt.ts` and
 * registers it through `trackTrustedSpan`. Does nothing otherwise.
 */
const recordHarnessRunStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (tracesEnabled && metadata.trusted) {
    const startedSpan = spanWithDuration(
      "openclaw.harness.run",
      harnessRunMetricAttrs(evt),
      undefined,
      {
        parentContext: activeTrustedParentContext(evt, metadata),
        startTimeMs: evt.ts,
      },
    );
    trackTrustedSpan(evt, metadata, startedSpan);
  }
};
|
|
1654
|
+
|
|
1655
|
+
/**
 * Handles "harness.run.completed".
 *
 * Records the duration on `harnessDurationHistogram`, then (traces enabled only)
 * finishes the "openclaw.harness.run" span: the tracked trusted span is taken if
 * one exists, otherwise a new span is created. The span gets ERROR status when
 * the outcome is "error" and is ended at `evt.ts`.
 */
const recordHarnessRunCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
  metadata: DiagnosticEventMetadata,
) => {
  harnessDurationHistogram.record(evt.durationMs, harnessRunMetricAttrs(evt));
  if (!tracesEnabled) {
    return;
  }

  // Optional fields are appended after the shared attributes, in a fixed order.
  const spanAttrs: Record<string, string | number | boolean> = {
    ...harnessRunMetricAttrs(evt),
    ...(evt.resultClassification
      ? {
          "openclaw.harness.result_classification": lowCardinalityAttr(
            evt.resultClassification,
          ),
        }
      : {}),
    ...(typeof evt.yieldDetected === "boolean"
      ? { "openclaw.harness.yield_detected": evt.yieldDetected }
      : {}),
    ...(evt.itemLifecycle
      ? {
          "openclaw.harness.items.started": evt.itemLifecycle.startedCount,
          "openclaw.harness.items.completed": evt.itemLifecycle.completedCount,
          "openclaw.harness.items.active": evt.itemLifecycle.activeCount,
        }
      : {}),
  };

  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, spanAttrs);

  const failed = evt.outcome === "error";
  if (failed) {
    span.setStatus({ code: SpanStatusCode.ERROR, message: "error" });
  }
  span.end(evt.ts);
};
|
|
1694
|
+
|
|
1695
|
+
/**
 * Handles "harness.run.error".
 *
 * Records the duration (with phase and error category labels) on
 * `harnessDurationHistogram`. When traces are enabled, finishes the
 * "openclaw.harness.run" span with ERROR status, using the normalized error
 * category as the status message, and ends it at `evt.ts`.
 */
const recordHarnessRunError = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const errorType = lowCardinalityAttr(evt.errorCategory, "other");
  const metricAttrs = {
    ...harnessRunMetricAttrs(evt),
    "openclaw.harness.phase": evt.phase,
    "openclaw.errorCategory": errorType,
  };
  harnessDurationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }

  const spanAttrs: Record<string, string | number | boolean> = {
    ...metricAttrs,
    "error.type": errorType,
  };
  if (evt.cleanupFailed) {
    spanAttrs["openclaw.harness.cleanup_failed"] = true;
  }

  // Reuse the span opened at "harness.run.started" when it is still tracked.
  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, spanAttrs);
  span.setStatus({ code: SpanStatusCode.ERROR, message: errorType });
  span.end(evt.ts);
};
|
|
1727
|
+
|
|
1728
|
+
/**
 * Handles "context.assembled": when traces are enabled, emits a zero-duration
 * "openclaw.context.assembled" span ending at `evt.ts` that carries the context
 * size counters from the event. No metrics are recorded here.
 */
const recordContextAssembled = (
  evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }

  const contextAttrs: Record<string, string | number | boolean> = {
    "openclaw.context.message_count": evt.messageCount,
    "openclaw.context.history_text_chars": evt.historyTextChars,
    "openclaw.context.history_image_blocks": evt.historyImageBlocks,
    "openclaw.context.max_message_text_chars": evt.maxMessageTextChars,
    "openclaw.context.system_prompt_chars": evt.systemPromptChars,
    "openclaw.context.prompt_chars": evt.promptChars,
    "openclaw.context.prompt_images": evt.promptImages,
  };
  addRunAttrs(contextAttrs, evt);

  // Token budget fields are optional and only attached when defined.
  const { contextTokenBudget, reserveTokens } = evt;
  if (contextTokenBudget !== undefined) {
    contextAttrs["openclaw.context.token_budget"] = contextTokenBudget;
  }
  if (reserveTokens !== undefined) {
    contextAttrs["openclaw.context.reserve_tokens"] = reserveTokens;
  }

  const parentContext = activeTrustedParentContext(evt, metadata);
  const span = spanWithDuration("openclaw.context.assembled", contextAttrs, 0, {
    parentContext,
    endTimeMs: evt.ts,
  });
  span.end(evt.ts);
};
|
|
1757
|
+
|
|
1758
|
+
/**
 * Builds the metric attributes for model call lifecycle events: provider and
 * model are passed through as-is, while `api` and `transport` go through
 * `lowCardinalityAttr`.
 */
const modelCallMetricAttrs = (evt: ModelCallLifecycleDiagnosticEvent) => {
  const { provider, model, api, transport } = evt;
  return {
    "openclaw.provider": provider,
    "openclaw.model": model,
    "openclaw.api": lowCardinalityAttr(api),
    "openclaw.transport": lowCardinalityAttr(transport),
  };
};
|
|
1764
|
+
/**
 * Builds the `gen_ai.*` metric attributes for a model call event.
 *
 * @param evt Model call lifecycle event supplying api, provider and model.
 * @param errorType When truthy, added as the "error.type" attribute.
 */
const genAiModelCallMetricAttrs = (
  evt: ModelCallLifecycleDiagnosticEvent,
  errorType?: string,
) => {
  const errorAttr = errorType ? { "error.type": errorType } : {};
  return {
    "gen_ai.operation.name": genAiOperationName(evt.api),
    "gen_ai.provider.name": lowCardinalityAttr(evt.provider),
    "gen_ai.request.model": lowCardinalityAttr(evt.model),
    ...errorAttr,
  };
};
|
|
1773
|
+
/**
 * Records the request size, response size and time-to-first-byte histograms for
 * a finished model call. Each value is passed through `positiveFiniteNumber`
 * and only recorded when that returns a defined value.
 */
const recordModelCallSizeTimingMetrics = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" }>,
  attrs: ReturnType<typeof modelCallMetricAttrs>,
) => {
  // Shared "validate, then record" step for the three histograms below.
  const recordWhenDefined = (
    histogram: typeof modelCallRequestBytesHistogram,
    raw: Parameters<typeof positiveFiniteNumber>[0],
  ) => {
    const value = positiveFiniteNumber(raw);
    if (value !== undefined) {
      histogram.record(value, attrs);
    }
  };

  recordWhenDefined(modelCallRequestBytesHistogram, evt.requestPayloadBytes);
  recordWhenDefined(modelCallResponseBytesHistogram, evt.responseStreamBytes);
  recordWhenDefined(modelCallTimeToFirstByteHistogram, evt.timeToFirstByteMs);
};
|
|
1790
|
+
|
|
1791
|
+
/**
 * Handles "model.call.started": for trusted events with traces enabled, opens an
 * "openclaw.model.call" span starting at `evt.ts` and registers it through
 * `trackTrustedSpan`.
 */
const recordModelCallStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }
  if (!metadata.trusted) {
    return;
  }

  const callAttrs: Record<string, string | number | boolean> = {
    "openclaw.provider": evt.provider,
    "openclaw.model": evt.model,
  };
  assignGenAiModelCallAttrs(callAttrs, evt);
  // api/transport are optional on the event; skip them when absent.
  if (evt.api) {
    callAttrs["openclaw.api"] = evt.api;
  }
  if (evt.transport) {
    callAttrs["openclaw.transport"] = evt.transport;
  }

  const startedSpan = spanWithDuration("openclaw.model.call", callAttrs, undefined, {
    parentContext: activeTrustedParentContext(evt, metadata),
    startTimeMs: evt.ts,
  });
  trackTrustedSpan(evt, metadata, startedSpan);
};
|
|
1818
|
+
|
|
1819
|
+
/**
 * Handles "model.call.completed".
 *
 * Metrics: call duration, size/timing histograms, and the GenAI operation
 * duration (recorded as `evt.durationMs / 1000`). When traces are enabled, the
 * "openclaw.model.call" span (tracked or newly created) receives the call
 * attributes and the upstream request id event, then is ended at `evt.ts`.
 */
const recordModelCallCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const metricAttrs = modelCallMetricAttrs(evt);
  modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
  recordModelCallSizeTimingMetrics(evt, metricAttrs);
  const durationSeconds = evt.durationMs / 1000;
  genAiOperationDurationHistogram.record(durationSeconds, genAiModelCallMetricAttrs(evt));
  if (!tracesEnabled) {
    return;
  }

  const callAttrs: Record<string, string | number | boolean> = {
    "openclaw.provider": evt.provider,
    "openclaw.model": evt.model,
  };
  assignGenAiModelCallAttrs(callAttrs, evt);
  if (evt.api) {
    callAttrs["openclaw.api"] = evt.api;
  }
  if (evt.transport) {
    callAttrs["openclaw.transport"] = evt.transport;
  }
  assignModelCallSizeTimingAttrs(callAttrs, evt);
  // Content attributes are governed by the configured capture policy.
  assignOtelModelContentAttributes(
    callAttrs,
    evt as unknown as Record<string, unknown>,
    contentCapturePolicy,
  );

  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.model.call", callAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, callAttrs);
  addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
  span.end(evt.ts);
};
|
|
1860
|
+
|
|
1861
|
+
/**
 * Handles "model.call.error".
 *
 * Metrics mirror the completed path but add the normalized error category (and
 * failure kind when present). When traces are enabled, the "openclaw.model.call"
 * span is finished with ERROR status whose message is the redacted error
 * category, and ended at `evt.ts`.
 */
const recordModelCallError = (
  evt: Extract<DiagnosticEventPayload, { type: "model.call.error" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const errorType = lowCardinalityAttr(evt.errorCategory, "other");
  const failureKindAttr = evt.failureKind
    ? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
    : {};
  const metricAttrs = {
    ...modelCallMetricAttrs(evt),
    "openclaw.errorCategory": errorType,
    ...failureKindAttr,
  };
  modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
  recordModelCallSizeTimingMetrics(evt, metricAttrs);
  const durationSeconds = evt.durationMs / 1000;
  genAiOperationDurationHistogram.record(
    durationSeconds,
    genAiModelCallMetricAttrs(evt, errorType),
  );
  if (!tracesEnabled) {
    return;
  }

  const callAttrs: Record<string, string | number | boolean> = {
    "openclaw.provider": evt.provider,
    "openclaw.model": evt.model,
    "openclaw.errorCategory": errorType,
    "error.type": errorType,
  };
  if (evt.failureKind) {
    callAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
  }
  assignGenAiModelCallAttrs(callAttrs, evt);
  if (evt.api) {
    callAttrs["openclaw.api"] = evt.api;
  }
  if (evt.transport) {
    callAttrs["openclaw.transport"] = evt.transport;
  }
  assignModelCallSizeTimingAttrs(callAttrs, evt);
  assignOtelModelContentAttributes(
    callAttrs,
    evt as unknown as Record<string, unknown>,
    contentCapturePolicy,
  );

  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.model.call", callAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, callAttrs);
  addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
  const statusMessage = redactSensitiveText(evt.errorCategory);
  span.setStatus({ code: SpanStatusCode.ERROR, message: statusMessage });
  span.end(evt.ts);
};
|
|
1918
|
+
|
|
1919
|
+
/**
 * Builds the span attributes common to every tool execution event: the tool
 * name under both the openclaw and gen_ai keys, followed by whatever
 * `paramsSummaryAttrs` produces for the event's params summary.
 */
const toolExecutionBaseAttrs = (
  evt: Extract<
    DiagnosticEventPayload,
    {
      type:
        | "tool.execution.started"
        | "tool.execution.completed"
        | "tool.execution.error"
        | "tool.execution.blocked";
    }
  >,
): Record<string, string | number | boolean> => {
  const { toolName, paramsSummary } = evt;
  return {
    "openclaw.toolName": toolName,
    "gen_ai.tool.name": toolName,
    ...paramsSummaryAttrs(paramsSummary),
  };
};
|
|
1935
|
+
|
|
1936
|
+
/**
 * Handles "tool.execution.started": for trusted events with traces enabled,
 * opens an "openclaw.tool.execution" span starting at `evt.ts` and registers it
 * through `trackTrustedSpan`. Does nothing otherwise.
 */
const recordToolExecutionStarted = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (tracesEnabled && metadata.trusted) {
    const startedSpan = spanWithDuration(
      "openclaw.tool.execution",
      toolExecutionBaseAttrs(evt),
      undefined,
      {
        parentContext: activeTrustedParentContext(evt, metadata),
        startTimeMs: evt.ts,
      },
    );
    trackTrustedSpan(evt, metadata, startedSpan);
  }
};
|
|
1952
|
+
|
|
1953
|
+
/**
 * Handles "tool.execution.completed".
 *
 * Records the duration on `toolExecutionDurationHistogram`; when traces are
 * enabled, finishes the "openclaw.tool.execution" span (tracked or newly
 * created) with run and tool content attributes and ends it at `evt.ts`.
 */
const recordToolExecutionCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.completed" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const metricAttrs = {
    "openclaw.toolName": evt.toolName,
    ...paramsSummaryAttrs(evt.paramsSummary),
  };
  toolExecutionDurationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }

  // toolExecutionBaseAttrs returns a fresh object, so it is safe to extend in place.
  const toolAttrs: Record<string, string | number | boolean> = toolExecutionBaseAttrs(evt);
  addRunAttrs(toolAttrs, evt);
  assignOtelToolContentAttributes(
    toolAttrs,
    evt as unknown as Record<string, unknown>,
    contentCapturePolicy,
  );

  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.tool.execution", toolAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, toolAttrs);
  span.end(evt.ts);
};
|
|
1983
|
+
|
|
1984
|
+
/**
 * Handles "tool.execution.error".
 *
 * Records the duration (labelled with the normalized error category) on
 * `toolExecutionDurationHistogram`; when traces are enabled, finishes the
 * "openclaw.tool.execution" span with ERROR status, using the redacted error
 * category as the message, and ends it at `evt.ts`.
 */
const recordToolExecutionError = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.error" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const metricAttrs = {
    "openclaw.toolName": evt.toolName,
    "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
    ...paramsSummaryAttrs(evt.paramsSummary),
  };
  toolExecutionDurationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }

  const toolAttrs: Record<string, string | number | boolean> = {
    ...toolExecutionBaseAttrs(evt),
    "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
  };
  addRunAttrs(toolAttrs, evt);
  // The error code is optional; attach it only when the event reports one.
  if (evt.errorCode) {
    toolAttrs["openclaw.errorCode"] = lowCardinalityAttr(evt.errorCode, "other");
  }
  assignOtelToolContentAttributes(
    toolAttrs,
    evt as unknown as Record<string, unknown>,
    contentCapturePolicy,
  );

  const span =
    takeTrackedTrustedSpan(evt, metadata) ??
    spanWithDuration("openclaw.tool.execution", toolAttrs, evt.durationMs, {
      parentContext: activeTrustedParentContext(evt, metadata),
      endTimeMs: evt.ts,
    });
  setSpanAttrs(span, toolAttrs);
  const statusMessage = redactSensitiveText(evt.errorCategory);
  span.setStatus({ code: SpanStatusCode.ERROR, message: statusMessage });
  span.end(evt.ts);
};
|
|
2023
|
+
|
|
2024
|
+
/**
 * Handles "tool.execution.blocked": when traces are enabled, emits a
 * zero-duration "openclaw.tool.execution" span ending at `evt.ts`, marked with
 * outcome "blocked" and the normalized denial reason. No metrics are recorded.
 */
const recordToolExecutionBlocked = (
  evt: Extract<DiagnosticEventPayload, { type: "tool.execution.blocked" }>,
  metadata: DiagnosticEventMetadata,
) => {
  if (!tracesEnabled) {
    return;
  }

  const blockedAttrs: Record<string, string | number | boolean> = {
    ...toolExecutionBaseAttrs(evt),
    "openclaw.outcome": "blocked",
    "openclaw.deniedReason": lowCardinalityAttr(evt.deniedReason, "other"),
  };
  addRunAttrs(blockedAttrs, evt);

  const parentContext = activeTrustedParentContext(evt, metadata);
  const span = spanWithDuration("openclaw.tool.execution", blockedAttrs, 0, {
    parentContext,
    endTimeMs: evt.ts,
  });
  setSpanAttrs(span, blockedAttrs);
  span.end(evt.ts);
};
|
|
2044
|
+
|
|
2045
|
+
/**
 * Handles "exec.process.completed".
 *
 * Records the process duration on `execProcessDurationHistogram`; when traces
 * are enabled, emits an "openclaw.exec" span ending at `evt.ts` that carries the
 * command length and any exit code / signal / timeout details. The span gets
 * ERROR status when the outcome is "failed".
 */
const recordExecProcessCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "exec.process.completed" }>,
) => {
  const metricAttrs: Record<string, string | number> = {
    "openclaw.exec.target": evt.target,
    "openclaw.exec.mode": evt.mode,
    "openclaw.outcome": evt.outcome,
    ...(evt.failureKind ? { "openclaw.failureKind": evt.failureKind } : {}),
  };
  execProcessDurationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }

  // Exit details are optional; each is attached only when the event provides it.
  const execAttrs: Record<string, string | number | boolean> = {
    ...metricAttrs,
    "openclaw.exec.command_length": evt.commandLength,
    ...(typeof evt.exitCode === "number" ? { "openclaw.exec.exit_code": evt.exitCode } : {}),
    ...(evt.exitSignal
      ? { "openclaw.exec.exit_signal": lowCardinalityAttr(evt.exitSignal, "other") }
      : {}),
    ...(evt.timedOut !== undefined ? { "openclaw.exec.timed_out": evt.timedOut } : {}),
  };

  const span = spanWithDuration("openclaw.exec", execAttrs, evt.durationMs, {
    endTimeMs: evt.ts,
  });
  if (evt.outcome === "failed") {
    const statusMessage = evt.failureKind ? { message: evt.failureKind } : {};
    span.setStatus({ code: SpanStatusCode.ERROR, ...statusMessage });
  }
  span.end(evt.ts);
};
|
|
2086
|
+
|
|
613
2087
|
/**
 * Handles "diagnostic.heartbeat": records the queued count on
 * `queueDepthHistogram` under the fixed channel label "heartbeat".
 */
const recordHeartbeat = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.heartbeat" }>,
) => {
  const heartbeatAttrs = { "openclaw.channel": "heartbeat" };
  queueDepthHistogram.record(evt.queued, heartbeatAttrs);
};
|
|
618
2092
|
|
|
619
|
-
|
|
2093
|
+
/**
 * Handles "diagnostic.liveness.warning".
 *
 * Metrics: increments `livenessWarningCounter`, records queue depth under the
 * "liveness" channel label, and records each optional event-loop / CPU sample
 * that is defined. When traces are enabled, emits a zero-duration
 * "openclaw.liveness.warning" span ending at `evt.ts` with ERROR status whose
 * message is the reasons joined by ":".
 */
const recordLivenessWarning = (
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.liveness.warning" }>,
) => {
  const reason = evt.reasons.join(":");
  const attrs = {
    "openclaw.liveness.reason": lowCardinalityAttr(reason, "unknown"),
  };

  livenessWarningCounter.add(1, attrs);
  queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" });
  if (evt.eventLoopDelayP99Ms !== undefined) {
    livenessEventLoopDelayP99Histogram.record(evt.eventLoopDelayP99Ms, attrs);
  }
  if (evt.eventLoopDelayMaxMs !== undefined) {
    livenessEventLoopDelayMaxHistogram.record(evt.eventLoopDelayMaxMs, attrs);
  }
  if (evt.eventLoopUtilization !== undefined) {
    livenessEventLoopUtilizationHistogram.record(evt.eventLoopUtilization, attrs);
  }
  if (evt.cpuCoreRatio !== undefined) {
    livenessCpuCoreRatioHistogram.record(evt.cpuCoreRatio, attrs);
  }
  if (!tracesEnabled) {
    return;
  }

  // Required counters first, then optional samples appended in a fixed order.
  const warningAttrs: Record<string, string | number> = {
    ...attrs,
    "openclaw.liveness.active": evt.active,
    "openclaw.liveness.waiting": evt.waiting,
    "openclaw.liveness.queued": evt.queued,
    "openclaw.liveness.interval_ms": evt.intervalMs,
  };
  if (evt.eventLoopDelayP99Ms !== undefined) {
    warningAttrs["openclaw.liveness.event_loop_delay_p99_ms"] = evt.eventLoopDelayP99Ms;
  }
  if (evt.eventLoopDelayMaxMs !== undefined) {
    warningAttrs["openclaw.liveness.event_loop_delay_max_ms"] = evt.eventLoopDelayMaxMs;
  }
  if (evt.eventLoopUtilization !== undefined) {
    warningAttrs["openclaw.liveness.event_loop_utilization"] = evt.eventLoopUtilization;
  }
  if (evt.cpuUserMs !== undefined) {
    warningAttrs["openclaw.liveness.cpu_user_ms"] = evt.cpuUserMs;
  }
  if (evt.cpuSystemMs !== undefined) {
    warningAttrs["openclaw.liveness.cpu_system_ms"] = evt.cpuSystemMs;
  }
  if (evt.cpuTotalMs !== undefined) {
    warningAttrs["openclaw.liveness.cpu_total_ms"] = evt.cpuTotalMs;
  }
  if (evt.cpuCoreRatio !== undefined) {
    warningAttrs["openclaw.liveness.cpu_core_ratio"] = evt.cpuCoreRatio;
  }

  const span = spanWithDuration("openclaw.liveness.warning", warningAttrs, 0, {
    endTimeMs: evt.ts,
  });
  span.setStatus({ code: SpanStatusCode.ERROR, message: reason });
  span.end(evt.ts);
};
|
|
2154
|
+
|
|
2155
|
+
/**
 * Handles "telemetry.exporter": for trusted events only, increments
 * `telemetryExporterCounter` labelled with exporter, signal and status, plus the
 * reason and normalized error category when the event provides them.
 */
const recordTelemetryExporter = (
  evt: TelemetryExporterDiagnosticEvent,
  metadata: DiagnosticEventMetadata,
) => {
  if (!metadata.trusted) {
    return;
  }

  const reasonAttr = evt.reason ? { "openclaw.reason": evt.reason } : {};
  const exporterAttrs = {
    "openclaw.exporter": lowCardinalityAttr(evt.exporter, "unknown"),
    "openclaw.signal": evt.signal,
    "openclaw.status": evt.status,
    ...reasonAttr,
    ...(evt.errorCategory
      ? { "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other") }
      : {}),
  };
  telemetryExporterCounter.add(1, exporterAttrs);
};
|
|
2172
|
+
|
|
2173
|
+
const subscribe = ctx.internalDiagnostics?.onEvent;
|
|
2174
|
+
if (!subscribe) {
|
|
2175
|
+
ctx.logger.error("diagnostics-otel: internal diagnostics capability unavailable");
|
|
2176
|
+
return;
|
|
2177
|
+
}
|
|
2178
|
+
|
|
2179
|
+
unsubscribe = subscribe((evt: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => {
|
|
620
2180
|
try {
|
|
621
2181
|
switch (evt.type) {
|
|
622
2182
|
case "model.usage":
|
|
623
|
-
recordModelUsage(evt);
|
|
2183
|
+
recordModelUsage(evt, metadata);
|
|
624
2184
|
return;
|
|
625
2185
|
case "webhook.received":
|
|
626
2186
|
recordWebhookReceived(evt);
|
|
@@ -637,6 +2197,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
637
2197
|
case "message.processed":
|
|
638
2198
|
recordMessageProcessed(evt);
|
|
639
2199
|
return;
|
|
2200
|
+
case "message.delivery.started":
|
|
2201
|
+
recordMessageDeliveryStarted(evt);
|
|
2202
|
+
return;
|
|
2203
|
+
case "message.delivery.completed":
|
|
2204
|
+
recordMessageDeliveryCompleted(evt);
|
|
2205
|
+
return;
|
|
2206
|
+
case "message.delivery.error":
|
|
2207
|
+
recordMessageDeliveryError(evt);
|
|
2208
|
+
return;
|
|
640
2209
|
case "queue.lane.enqueue":
|
|
641
2210
|
recordLaneEnqueue(evt);
|
|
642
2211
|
return;
|
|
@@ -646,15 +2215,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
646
2215
|
case "session.state":
|
|
647
2216
|
recordSessionState(evt);
|
|
648
2217
|
return;
|
|
2218
|
+
case "session.long_running":
|
|
2219
|
+
case "session.stalled":
|
|
2220
|
+
return;
|
|
649
2221
|
case "session.stuck":
|
|
650
2222
|
recordSessionStuck(evt);
|
|
651
2223
|
return;
|
|
652
2224
|
case "run.attempt":
|
|
653
2225
|
recordRunAttempt(evt);
|
|
654
2226
|
return;
|
|
2227
|
+
case "run.progress":
|
|
2228
|
+
return;
|
|
655
2229
|
case "diagnostic.heartbeat":
|
|
656
2230
|
recordHeartbeat(evt);
|
|
657
2231
|
return;
|
|
2232
|
+
case "diagnostic.liveness.warning":
|
|
2233
|
+
recordLivenessWarning(evt);
|
|
2234
|
+
return;
|
|
2235
|
+
case "run.started":
|
|
2236
|
+
recordRunStarted(evt, metadata);
|
|
2237
|
+
return;
|
|
2238
|
+
case "run.completed":
|
|
2239
|
+
recordRunCompleted(evt, metadata);
|
|
2240
|
+
return;
|
|
2241
|
+
case "harness.run.started":
|
|
2242
|
+
recordHarnessRunStarted(evt, metadata);
|
|
2243
|
+
return;
|
|
2244
|
+
case "harness.run.completed":
|
|
2245
|
+
recordHarnessRunCompleted(evt, metadata);
|
|
2246
|
+
return;
|
|
2247
|
+
case "harness.run.error":
|
|
2248
|
+
recordHarnessRunError(evt, metadata);
|
|
2249
|
+
return;
|
|
2250
|
+
case "context.assembled":
|
|
2251
|
+
recordContextAssembled(evt, metadata);
|
|
2252
|
+
return;
|
|
2253
|
+
case "model.call.started":
|
|
2254
|
+
recordModelCallStarted(evt, metadata);
|
|
2255
|
+
return;
|
|
2256
|
+
case "model.call.completed":
|
|
2257
|
+
recordModelCallCompleted(evt, metadata);
|
|
2258
|
+
return;
|
|
2259
|
+
case "model.call.error":
|
|
2260
|
+
recordModelCallError(evt, metadata);
|
|
2261
|
+
return;
|
|
2262
|
+
case "tool.execution.started":
|
|
2263
|
+
recordToolExecutionStarted(evt, metadata);
|
|
2264
|
+
return;
|
|
2265
|
+
case "tool.execution.completed":
|
|
2266
|
+
recordToolExecutionCompleted(evt, metadata);
|
|
2267
|
+
return;
|
|
2268
|
+
case "tool.execution.error":
|
|
2269
|
+
recordToolExecutionError(evt, metadata);
|
|
2270
|
+
return;
|
|
2271
|
+
case "tool.execution.blocked":
|
|
2272
|
+
recordToolExecutionBlocked(evt, metadata);
|
|
2273
|
+
return;
|
|
2274
|
+
case "exec.process.completed":
|
|
2275
|
+
recordExecProcessCompleted(evt);
|
|
2276
|
+
return;
|
|
2277
|
+
case "log.record":
|
|
2278
|
+
recordLogRecord?.(evt, metadata);
|
|
2279
|
+
return;
|
|
2280
|
+
case "tool.loop":
|
|
2281
|
+
recordToolLoop(evt);
|
|
2282
|
+
return;
|
|
2283
|
+
case "diagnostic.memory.sample":
|
|
2284
|
+
recordMemorySample(evt);
|
|
2285
|
+
return;
|
|
2286
|
+
case "diagnostic.memory.pressure":
|
|
2287
|
+
recordMemoryPressure(evt);
|
|
2288
|
+
return;
|
|
2289
|
+
case "telemetry.exporter":
|
|
2290
|
+
recordTelemetryExporter(evt, metadata);
|
|
2291
|
+
return;
|
|
2292
|
+
case "payload.large":
|
|
2293
|
+
return;
|
|
658
2294
|
}
|
|
659
2295
|
} catch (err) {
|
|
660
2296
|
ctx.logger.error(
|
|
@@ -663,23 +2299,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|
|
663
2299
|
}
|
|
664
2300
|
});
|
|
665
2301
|
|
|
2302
|
+
emitForSignals(enabledSignals, {
|
|
2303
|
+
exporter: "diagnostics-otel",
|
|
2304
|
+
status: "started",
|
|
2305
|
+
reason: "configured",
|
|
2306
|
+
});
|
|
2307
|
+
|
|
666
2308
|
if (logsEnabled) {
|
|
667
2309
|
ctx.logger.info("diagnostics-otel: logs exporter enabled (OTLP/Protobuf)");
|
|
668
2310
|
}
|
|
669
2311
|
},
|
|
670
2312
|
async stop() {
|
|
671
|
-
|
|
672
|
-
unsubscribe = null;
|
|
673
|
-
stopLogTransport?.();
|
|
674
|
-
stopLogTransport = null;
|
|
675
|
-
if (logProvider) {
|
|
676
|
-
await logProvider.shutdown().catch(() => undefined);
|
|
677
|
-
logProvider = null;
|
|
678
|
-
}
|
|
679
|
-
if (sdk) {
|
|
680
|
-
await sdk.shutdown().catch(() => undefined);
|
|
681
|
-
sdk = null;
|
|
682
|
-
}
|
|
2313
|
+
await stopStarted();
|
|
683
2314
|
},
|
|
684
2315
|
} satisfies OpenClawPluginService;
|
|
685
2316
|
}
|