@openclaw/diagnostics-otel 2026.3.13 → 2026.5.2-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/service.ts CHANGED
@@ -1,5 +1,11 @@
1
- import { metrics, trace, SpanStatusCode } from "@opentelemetry/api";
2
- import type { SeverityNumber } from "@opentelemetry/api-logs";
1
+ import {
2
+ context as otelContextApi,
3
+ metrics,
4
+ trace,
5
+ SpanStatusCode,
6
+ TraceFlags,
7
+ } from "@opentelemetry/api";
8
+ import type { LogRecord, SeverityNumber } from "@opentelemetry/api-logs";
3
9
  import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-proto";
4
10
  import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-proto";
5
11
  import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
@@ -10,16 +16,87 @@ import { NodeSDK } from "@opentelemetry/sdk-node";
10
16
  import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
11
17
  import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
12
18
  import type {
19
+ DiagnosticEventMetadata,
13
20
  DiagnosticEventPayload,
21
+ DiagnosticTraceContext,
14
22
  OpenClawPluginService,
15
- } from "openclaw/plugin-sdk/diagnostics-otel";
23
+ } from "../api.js";
16
24
  import {
17
- onDiagnosticEvent,
25
+ isValidDiagnosticSpanId,
26
+ isValidDiagnosticTraceFlags,
27
+ isValidDiagnosticTraceId,
18
28
  redactSensitiveText,
19
- registerLogTransport,
20
- } from "openclaw/plugin-sdk/diagnostics-otel";
29
+ } from "../api.js";
21
30
 
22
31
  const DEFAULT_SERVICE_NAME = "openclaw";
32
+ const DROPPED_OTEL_ATTRIBUTE_KEYS = new Set([
33
+ "openclaw.callId",
34
+ "openclaw.parentSpanId",
35
+ "openclaw.runId",
36
+ "openclaw.sessionId",
37
+ "openclaw.sessionKey",
38
+ "openclaw.spanId",
39
+ "openclaw.toolCallId",
40
+ "openclaw.traceId",
41
+ ]);
42
+ const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u;
43
+ const MAX_OTEL_CONTENT_ATTRIBUTE_CHARS = 4 * 1024;
44
+ const MAX_OTEL_CONTENT_ARRAY_ITEMS = 16;
45
+ const MAX_OTEL_LOG_BODY_CHARS = 4 * 1024;
46
+ const MAX_OTEL_LOG_ATTRIBUTE_COUNT = 64;
47
+ const MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS = 4 * 1024;
48
+ const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000;
49
+ const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
50
+ const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
51
+ const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
52
+ const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
53
+ const OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
54
+ const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT";
55
+ const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT";
56
+ const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT";
57
+ const OTEL_SEMCONV_STABILITY_OPT_IN_ENV = "OTEL_SEMCONV_STABILITY_OPT_IN";
58
+ const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental";
59
+ const GEN_AI_TOKEN_USAGE_BUCKETS = [
60
+ 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864,
61
+ ];
62
+ const GEN_AI_OPERATION_DURATION_BUCKETS = [
63
+ 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92,
64
+ ];
65
+
66
+ type OtelContentCapturePolicy = {
67
+ inputMessages: boolean;
68
+ outputMessages: boolean;
69
+ toolInputs: boolean;
70
+ toolOutputs: boolean;
71
+ systemPrompt: boolean;
72
+ };
73
+
74
+ type MessageDeliveryDiagnosticEvent = Extract<
75
+ DiagnosticEventPayload,
76
+ {
77
+ type: "message.delivery.started" | "message.delivery.completed" | "message.delivery.error";
78
+ }
79
+ >;
80
+ type ModelCallLifecycleDiagnosticEvent = Extract<
81
+ DiagnosticEventPayload,
82
+ { type: "model.call.completed" | "model.call.error" }
83
+ >;
84
+ type HarnessRunDiagnosticEvent = Extract<
85
+ DiagnosticEventPayload,
86
+ { type: "harness.run.started" | "harness.run.completed" | "harness.run.error" }
87
+ >;
88
+ type TelemetryExporterDiagnosticEvent = Extract<
89
+ DiagnosticEventPayload,
90
+ { type: "telemetry.exporter" }
91
+ >;
92
+
93
+ const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = {
94
+ inputMessages: false,
95
+ outputMessages: false,
96
+ toolInputs: false,
97
+ toolOutputs: false,
98
+ systemPrompt: false,
99
+ };
23
100
 
24
101
  function normalizeEndpoint(endpoint?: string): string | undefined {
25
102
  const trimmed = endpoint?.trim();
@@ -37,6 +114,18 @@ function resolveOtelUrl(endpoint: string | undefined, path: string): string | un
37
114
  return `${endpoint}/${path}`;
38
115
  }
39
116
 
117
+ function resolveSignalOtelUrl(params: {
118
+ signalEndpoint?: string;
119
+ signalEnvEndpoint?: string;
120
+ endpoint?: string;
121
+ path: string;
122
+ }): string | undefined {
123
+ return resolveOtelUrl(
124
+ normalizeEndpoint(params.signalEndpoint ?? params.signalEnvEndpoint) ?? params.endpoint,
125
+ params.path,
126
+ );
127
+ }
128
+
40
129
  function resolveSampleRate(value: number | undefined): number | undefined {
41
130
  if (typeof value !== "number" || !Number.isFinite(value)) {
42
131
  return undefined;
@@ -61,79 +150,504 @@ function formatError(err: unknown): string {
61
150
  }
62
151
  }
63
152
 
153
+ function errorCategory(err: unknown): string {
154
+ try {
155
+ if (err instanceof Error && typeof err.name === "string" && err.name.trim()) {
156
+ return lowCardinalityAttr(err.name, "Error");
157
+ }
158
+ return lowCardinalityAttr(typeof err, "unknown");
159
+ } catch {
160
+ return "unknown";
161
+ }
162
+ }
163
+
64
164
  function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
65
165
  const redactedAttributes: Record<string, string | number | boolean> = {};
66
166
  for (const [key, value] of Object.entries(attributes)) {
167
+ if (DROPPED_OTEL_ATTRIBUTE_KEYS.has(key)) {
168
+ continue;
169
+ }
67
170
  redactedAttributes[key] = typeof value === "string" ? redactSensitiveText(value) : value;
68
171
  }
69
172
  return redactedAttributes;
70
173
  }
71
174
 
175
+ function lowCardinalityAttr(value: string | undefined, fallback = "unknown"): string {
176
+ if (!value) {
177
+ return fallback;
178
+ }
179
+ const redacted = redactSensitiveText(value.trim());
180
+ return LOW_CARDINALITY_VALUE_RE.test(redacted) ? redacted : fallback;
181
+ }
182
+
183
+ function hasOtelSemconvOptIn(value: string | undefined, optIn: string): boolean {
184
+ return (
185
+ value
186
+ ?.split(",")
187
+ .map((part) => part.trim())
188
+ .includes(optIn) ?? false
189
+ );
190
+ }
191
+
192
+ function emitLatestGenAiSemconv(): boolean {
193
+ return hasOtelSemconvOptIn(
194
+ process.env[OTEL_SEMCONV_STABILITY_OPT_IN_ENV],
195
+ GEN_AI_LATEST_EXPERIMENTAL_OPT_IN,
196
+ );
197
+ }
198
+
199
+ function genAiOperationName(
200
+ api: string | undefined,
201
+ ): "chat" | "generate_content" | "text_completion" {
202
+ const normalized = api?.trim().toLowerCase();
203
+ if (!normalized) {
204
+ return "chat";
205
+ }
206
+ if (normalized === "completions" || normalized.endsWith("-completions")) {
207
+ return "text_completion";
208
+ }
209
+ if (normalized === "generate_content" || normalized.includes("generative-ai")) {
210
+ return "generate_content";
211
+ }
212
+ return "chat";
213
+ }
214
+
215
+ function positiveFiniteNumber(value: number | undefined): number | undefined {
216
+ return typeof value === "number" && Number.isFinite(value) && value > 0 ? value : undefined;
217
+ }
218
+
219
+ function assignPositiveNumberAttr(
220
+ attrs: Record<string, string | number | boolean>,
221
+ key: string,
222
+ value: number | undefined,
223
+ ): void {
224
+ const normalized = positiveFiniteNumber(value);
225
+ if (normalized !== undefined) {
226
+ attrs[key] = normalized;
227
+ }
228
+ }
229
+
230
+ function assignModelCallSizeTimingAttrs(
231
+ attrs: Record<string, string | number | boolean>,
232
+ evt: {
233
+ requestPayloadBytes?: number;
234
+ responseStreamBytes?: number;
235
+ timeToFirstByteMs?: number;
236
+ },
237
+ ): void {
238
+ assignPositiveNumberAttr(attrs, "openclaw.model_call.request_bytes", evt.requestPayloadBytes);
239
+ assignPositiveNumberAttr(attrs, "openclaw.model_call.response_bytes", evt.responseStreamBytes);
240
+ assignPositiveNumberAttr(
241
+ attrs,
242
+ "openclaw.model_call.time_to_first_byte_ms",
243
+ evt.timeToFirstByteMs,
244
+ );
245
+ }
246
+
247
+ function assignGenAiSpanIdentityAttrs(
248
+ attrs: Record<string, string | number | boolean>,
249
+ input: { api?: string; model?: string; provider?: string },
250
+ ): void {
251
+ if (emitLatestGenAiSemconv()) {
252
+ attrs["gen_ai.provider.name"] = lowCardinalityAttr(input.provider);
253
+ } else {
254
+ attrs["gen_ai.system"] = lowCardinalityAttr(input.provider);
255
+ }
256
+ if (input.model) {
257
+ attrs["gen_ai.request.model"] = lowCardinalityAttr(input.model);
258
+ }
259
+ attrs["gen_ai.operation.name"] = genAiOperationName(input.api);
260
+ }
261
+
262
+ function assignGenAiModelCallAttrs(
263
+ attrs: Record<string, string | number | boolean>,
264
+ evt: { api?: string; model?: string; provider?: string },
265
+ ): void {
266
+ assignGenAiSpanIdentityAttrs(attrs, evt);
267
+ }
268
+
269
+ function addUpstreamRequestIdSpanEvent(
270
+ span: { addEvent?: (name: string, attributes?: Record<string, string>) => void },
271
+ upstreamRequestIdHash: string | undefined,
272
+ ): void {
273
+ if (!upstreamRequestIdHash) {
274
+ return;
275
+ }
276
+ const boundedHash = lowCardinalityAttr(upstreamRequestIdHash);
277
+ if (boundedHash === "unknown") {
278
+ return;
279
+ }
280
+ span.addEvent?.("openclaw.provider.request", {
281
+ "openclaw.upstreamRequestIdHash": boundedHash,
282
+ });
283
+ }
284
+
285
+ function clampOtelLogText(value: string, maxChars: number): string {
286
+ return value.length > maxChars ? `${value.slice(0, maxChars)}...(truncated)` : value;
287
+ }
288
+
289
+ function normalizeOtelLogString(value: string, maxChars: number): string {
290
+ return clampOtelLogText(redactSensitiveText(value), maxChars);
291
+ }
292
+
293
+ function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy {
294
+ if (value === true) {
295
+ return {
296
+ inputMessages: true,
297
+ outputMessages: true,
298
+ toolInputs: true,
299
+ toolOutputs: true,
300
+ systemPrompt: false,
301
+ };
302
+ }
303
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
304
+ return NO_CONTENT_CAPTURE;
305
+ }
306
+
307
+ const config = value as Record<string, unknown>;
308
+ if (config.enabled !== true) {
309
+ return NO_CONTENT_CAPTURE;
310
+ }
311
+ return {
312
+ inputMessages: config.inputMessages === true,
313
+ outputMessages: config.outputMessages === true,
314
+ toolInputs: config.toolInputs === true,
315
+ toolOutputs: config.toolOutputs === true,
316
+ systemPrompt: config.systemPrompt === true,
317
+ };
318
+ }
319
+
320
+ function hasPreloadedOtelSdk(): boolean {
321
+ return process.env[PRELOADED_OTEL_SDK_ENV] === "1";
322
+ }
323
+
324
+ function normalizeOtelContentValue(value: unknown): string | undefined {
325
+ if (typeof value === "string") {
326
+ return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
327
+ }
328
+ if (Array.isArray(value)) {
329
+ const items: string[] = [];
330
+ for (const item of value.slice(0, MAX_OTEL_CONTENT_ARRAY_ITEMS)) {
331
+ if (typeof item === "string") {
332
+ items.push(item);
333
+ }
334
+ }
335
+ if (items.length > 0) {
336
+ return normalizeOtelLogString(items.join("\n"), MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
337
+ }
338
+ }
339
+ return undefined;
340
+ }
341
+
342
+ function assignOtelContentAttribute(
343
+ attributes: Record<string, string | number | boolean>,
344
+ key: string,
345
+ value: unknown,
346
+ ): void {
347
+ const normalized = normalizeOtelContentValue(value);
348
+ if (normalized) {
349
+ attributes[key] = normalized;
350
+ }
351
+ }
352
+
353
+ function assignOtelModelContentAttributes(
354
+ attributes: Record<string, string | number | boolean>,
355
+ event: Record<string, unknown>,
356
+ policy: OtelContentCapturePolicy,
357
+ ): void {
358
+ if (policy.inputMessages) {
359
+ assignOtelContentAttribute(attributes, "openclaw.content.input_messages", event.inputMessages);
360
+ }
361
+ if (policy.outputMessages) {
362
+ assignOtelContentAttribute(
363
+ attributes,
364
+ "openclaw.content.output_messages",
365
+ event.outputMessages,
366
+ );
367
+ }
368
+ if (policy.systemPrompt) {
369
+ assignOtelContentAttribute(attributes, "openclaw.content.system_prompt", event.systemPrompt);
370
+ }
371
+ }
372
+
373
+ function assignOtelToolContentAttributes(
374
+ attributes: Record<string, string | number | boolean>,
375
+ event: Record<string, unknown>,
376
+ policy: OtelContentCapturePolicy,
377
+ ): void {
378
+ if (policy.toolInputs) {
379
+ assignOtelContentAttribute(attributes, "openclaw.content.tool_input", event.toolInput);
380
+ }
381
+ if (policy.toolOutputs) {
382
+ assignOtelContentAttribute(attributes, "openclaw.content.tool_output", event.toolOutput);
383
+ }
384
+ }
385
+
386
+ function assignOtelLogAttribute(
387
+ attributes: Record<string, string | number | boolean>,
388
+ key: string,
389
+ value: string | number | boolean,
390
+ ): void {
391
+ if (Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT) {
392
+ return;
393
+ }
394
+ if (BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key)) {
395
+ return;
396
+ }
397
+ if (redactSensitiveText(key) !== key) {
398
+ return;
399
+ }
400
+ if (!OTEL_LOG_ATTRIBUTE_KEY_RE.test(key)) {
401
+ return;
402
+ }
403
+ if (typeof value === "string") {
404
+ attributes[key] = normalizeOtelLogString(value, MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS);
405
+ return;
406
+ }
407
+ if (typeof value === "number" && Number.isFinite(value)) {
408
+ attributes[key] = value;
409
+ return;
410
+ }
411
+ if (typeof value === "boolean") {
412
+ attributes[key] = value;
413
+ }
414
+ }
415
+
416
+ function normalizeTraceContext(value: unknown): DiagnosticTraceContext | undefined {
417
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
418
+ return undefined;
419
+ }
420
+ const candidate = value as Partial<DiagnosticTraceContext>;
421
+ if (!isValidDiagnosticTraceId(candidate.traceId)) {
422
+ return undefined;
423
+ }
424
+ if (candidate.spanId !== undefined && !isValidDiagnosticSpanId(candidate.spanId)) {
425
+ return undefined;
426
+ }
427
+ if (candidate.parentSpanId !== undefined && !isValidDiagnosticSpanId(candidate.parentSpanId)) {
428
+ return undefined;
429
+ }
430
+ if (candidate.traceFlags !== undefined && !isValidDiagnosticTraceFlags(candidate.traceFlags)) {
431
+ return undefined;
432
+ }
433
+ return {
434
+ traceId: candidate.traceId,
435
+ ...(candidate.spanId ? { spanId: candidate.spanId } : {}),
436
+ ...(candidate.parentSpanId ? { parentSpanId: candidate.parentSpanId } : {}),
437
+ ...(candidate.traceFlags ? { traceFlags: candidate.traceFlags } : {}),
438
+ };
439
+ }
440
+
441
+ function assignOtelLogEventAttributes(
442
+ attributes: Record<string, string | number | boolean>,
443
+ eventAttributes: Record<string, string | number | boolean> | undefined,
444
+ ): void {
445
+ if (!eventAttributes) {
446
+ return;
447
+ }
448
+ for (const rawKey in eventAttributes) {
449
+ if (Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT) {
450
+ break;
451
+ }
452
+ if (!Object.hasOwn(eventAttributes, rawKey)) {
453
+ continue;
454
+ }
455
+ const key = rawKey.trim();
456
+ if (BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key)) {
457
+ continue;
458
+ }
459
+ if (redactSensitiveText(key) !== key) {
460
+ continue;
461
+ }
462
+ if (!OTEL_LOG_RAW_ATTRIBUTE_KEY_RE.test(key)) {
463
+ continue;
464
+ }
465
+ assignOtelLogAttribute(attributes, `openclaw.${key}`, eventAttributes[rawKey]);
466
+ }
467
+ }
468
+
469
+ function traceFlagsToOtel(traceFlags: string | undefined): TraceFlags {
470
+ const parsed = Number.parseInt(traceFlags ?? "00", 16);
471
+ return (parsed & TraceFlags.SAMPLED) !== 0 ? TraceFlags.SAMPLED : TraceFlags.NONE;
472
+ }
473
+
474
+ function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined) {
475
+ const normalized = normalizeTraceContext(traceContext);
476
+ if (!normalized?.spanId) {
477
+ return undefined;
478
+ }
479
+ return trace.setSpanContext(otelContextApi.active(), {
480
+ traceId: normalized.traceId,
481
+ spanId: normalized.spanId,
482
+ traceFlags: traceFlagsToOtel(normalized.traceFlags),
483
+ isRemote: true,
484
+ });
485
+ }
486
+
487
+ function contextForTrustedTraceContext(
488
+ evt: DiagnosticEventPayload,
489
+ metadata: DiagnosticEventMetadata,
490
+ ) {
491
+ return metadata.trusted ? contextForTraceContext(evt.trace) : undefined;
492
+ }
493
+
494
+ function addTraceAttributes(
495
+ attributes: Record<string, string | number | boolean>,
496
+ traceContext: DiagnosticTraceContext | undefined,
497
+ ): void {
498
+ const normalized = normalizeTraceContext(traceContext);
499
+ if (!normalized) {
500
+ return;
501
+ }
502
+ attributes["openclaw.traceId"] = normalized.traceId;
503
+ if (normalized.spanId) {
504
+ attributes["openclaw.spanId"] = normalized.spanId;
505
+ }
506
+ if (normalized.parentSpanId) {
507
+ attributes["openclaw.parentSpanId"] = normalized.parentSpanId;
508
+ }
509
+ if (normalized.traceFlags) {
510
+ attributes["openclaw.traceFlags"] = normalized.traceFlags;
511
+ }
512
+ }
513
+
72
514
  export function createDiagnosticsOtelService(): OpenClawPluginService {
73
515
  let sdk: NodeSDK | null = null;
74
516
  let logProvider: LoggerProvider | null = null;
75
- let stopLogTransport: (() => void) | null = null;
76
517
  let unsubscribe: (() => void) | null = null;
518
+ let stopActiveTrustedSpans: (() => void) | null = null;
519
+
520
+ const stopStarted = async () => {
521
+ const currentUnsubscribe = unsubscribe;
522
+ const currentLogProvider = logProvider;
523
+ const currentSdk = sdk;
524
+ const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
525
+
526
+ unsubscribe = null;
527
+ logProvider = null;
528
+ sdk = null;
529
+ stopActiveTrustedSpans = null;
530
+
531
+ currentUnsubscribe?.();
532
+ currentStopActiveTrustedSpans?.();
533
+ if (currentLogProvider) {
534
+ await currentLogProvider.shutdown().catch(() => undefined);
535
+ }
536
+ if (currentSdk) {
537
+ await currentSdk.shutdown().catch(() => undefined);
538
+ }
539
+ };
77
540
 
78
541
  return {
79
542
  id: "diagnostics-otel",
80
543
  async start(ctx) {
544
+ await stopStarted();
545
+
81
546
  const cfg = ctx.config.diagnostics;
82
547
  const otel = cfg?.otel;
83
548
  if (!cfg?.enabled || !otel?.enabled) {
84
549
  return;
85
550
  }
86
551
 
552
+ const emitExporterEvent = (
553
+ event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts">,
554
+ ) => {
555
+ try {
556
+ ctx.internalDiagnostics?.emit({
557
+ type: "telemetry.exporter",
558
+ ...event,
559
+ });
560
+ } catch {
561
+ // Exporter health must never affect the exporter lifecycle.
562
+ }
563
+ };
564
+ const emitForSignals = (
565
+ signals: TelemetryExporterDiagnosticEvent["signal"][],
566
+ event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts" | "signal">,
567
+ ) => {
568
+ for (const signal of signals) {
569
+ emitExporterEvent({ signal, ...event });
570
+ }
571
+ };
572
+ const tracesEnabled = otel.traces !== false;
573
+ const metricsEnabled = otel.metrics !== false;
574
+ const logsEnabled = otel.logs === true;
575
+ const enabledSignals: TelemetryExporterDiagnosticEvent["signal"][] = [
576
+ ...(tracesEnabled ? (["traces"] as const) : []),
577
+ ...(metricsEnabled ? (["metrics"] as const) : []),
578
+ ...(logsEnabled ? (["logs"] as const) : []),
579
+ ];
580
+ if (enabledSignals.length === 0) {
581
+ return;
582
+ }
583
+
87
584
  const protocol = otel.protocol ?? process.env.OTEL_EXPORTER_OTLP_PROTOCOL ?? "http/protobuf";
88
585
  if (protocol !== "http/protobuf") {
586
+ emitForSignals(enabledSignals, {
587
+ exporter: "diagnostics-otel",
588
+ status: "failure",
589
+ reason: "unsupported_protocol",
590
+ });
89
591
  ctx.logger.warn(`diagnostics-otel: unsupported protocol ${protocol}`);
90
592
  return;
91
593
  }
92
594
 
93
- const endpoint = normalizeEndpoint(otel.endpoint ?? process.env.OTEL_EXPORTER_OTLP_ENDPOINT);
595
+ const endpoint = normalizeEndpoint(
596
+ otel.endpoint ?? process.env[OTEL_EXPORTER_OTLP_ENDPOINT_ENV],
597
+ );
94
598
  const headers = otel.headers ?? undefined;
95
599
  const serviceName =
96
600
  otel.serviceName?.trim() || process.env.OTEL_SERVICE_NAME || DEFAULT_SERVICE_NAME;
97
601
  const sampleRate = resolveSampleRate(otel.sampleRate);
98
-
99
- const tracesEnabled = otel.traces !== false;
100
- const metricsEnabled = otel.metrics !== false;
101
- const logsEnabled = otel.logs === true;
102
- if (!tracesEnabled && !metricsEnabled && !logsEnabled) {
103
- return;
104
- }
602
+ const contentCapturePolicy = resolveContentCapturePolicy(otel.captureContent);
603
+ const sdkPreloaded = hasPreloadedOtelSdk();
105
604
 
106
605
  const resource = resourceFromAttributes({
107
606
  [ATTR_SERVICE_NAME]: serviceName,
108
607
  });
109
608
 
110
- const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
111
- const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
112
- const logUrl = resolveOtelUrl(endpoint, "v1/logs");
113
- const traceExporter = tracesEnabled
114
- ? new OTLPTraceExporter({
115
- ...(traceUrl ? { url: traceUrl } : {}),
116
- ...(headers ? { headers } : {}),
117
- })
118
- : undefined;
119
-
120
- const metricExporter = metricsEnabled
121
- ? new OTLPMetricExporter({
122
- ...(metricUrl ? { url: metricUrl } : {}),
123
- ...(headers ? { headers } : {}),
124
- })
125
- : undefined;
126
-
127
- const metricReader = metricExporter
128
- ? new PeriodicExportingMetricReader({
129
- exporter: metricExporter,
130
- ...(typeof otel.flushIntervalMs === "number"
131
- ? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
132
- : {}),
133
- })
134
- : undefined;
135
-
136
- if (tracesEnabled || metricsEnabled) {
609
+ const logUrl = resolveSignalOtelUrl({
610
+ signalEndpoint: otel.logsEndpoint,
611
+ signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV],
612
+ endpoint,
613
+ path: "v1/logs",
614
+ });
615
+ if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
616
+ const traceUrl = resolveSignalOtelUrl({
617
+ signalEndpoint: otel.tracesEndpoint,
618
+ signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV],
619
+ endpoint,
620
+ path: "v1/traces",
621
+ });
622
+ const metricUrl = resolveSignalOtelUrl({
623
+ signalEndpoint: otel.metricsEndpoint,
624
+ signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV],
625
+ endpoint,
626
+ path: "v1/metrics",
627
+ });
628
+ const traceExporter = tracesEnabled
629
+ ? new OTLPTraceExporter({
630
+ ...(traceUrl ? { url: traceUrl } : {}),
631
+ ...(headers ? { headers } : {}),
632
+ })
633
+ : undefined;
634
+
635
+ const metricExporter = metricsEnabled
636
+ ? new OTLPMetricExporter({
637
+ ...(metricUrl ? { url: metricUrl } : {}),
638
+ ...(headers ? { headers } : {}),
639
+ })
640
+ : undefined;
641
+
642
+ const metricReader = metricExporter
643
+ ? new PeriodicExportingMetricReader({
644
+ exporter: metricExporter,
645
+ ...(typeof otel.flushIntervalMs === "number"
646
+ ? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
647
+ : {}),
648
+ })
649
+ : undefined;
650
+
137
651
  sdk = new NodeSDK({
138
652
  resource,
139
653
  ...(traceExporter ? { traceExporter } : {}),
@@ -148,11 +662,26 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
148
662
  });
149
663
 
150
664
  try {
151
- await sdk.start();
665
+ sdk.start();
152
666
  } catch (err) {
667
+ emitForSignals(
668
+ [
669
+ ...(tracesEnabled ? (["traces"] as const) : []),
670
+ ...(metricsEnabled ? (["metrics"] as const) : []),
671
+ ],
672
+ {
673
+ exporter: "diagnostics-otel",
674
+ status: "failure",
675
+ reason: "start_failed",
676
+ errorCategory: errorCategory(err),
677
+ },
678
+ );
679
+ await stopStarted();
153
680
  ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
154
681
  throw err;
155
682
  }
683
+ } else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) {
684
+ ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK");
156
685
  }
157
686
 
158
687
  const logSeverityMap: Record<string, SeverityNumber> = {
@@ -166,11 +695,46 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
166
695
 
167
696
  const meter = metrics.getMeter("openclaw");
168
697
  const tracer = trace.getTracer("openclaw");
698
+ const activeTrustedSpans = new Map<string, ReturnType<typeof tracer.startSpan>>();
699
+ const activeTrustedSpanAliases = new Map<string, ReturnType<typeof tracer.startSpan>>();
700
+ const pendingTrustedRunFinalizers = new Map<string, ReturnType<typeof setImmediate>>();
701
+ stopActiveTrustedSpans = () => {
702
+ const stopAt = Date.now();
703
+ for (const handle of pendingTrustedRunFinalizers.values()) {
704
+ clearImmediate(handle);
705
+ }
706
+ pendingTrustedRunFinalizers.clear();
707
+ for (const span of new Set([
708
+ ...activeTrustedSpans.values(),
709
+ ...activeTrustedSpanAliases.values(),
710
+ ])) {
711
+ span.end(stopAt);
712
+ }
713
+ activeTrustedSpans.clear();
714
+ activeTrustedSpanAliases.clear();
715
+ };
169
716
 
170
717
  const tokensCounter = meter.createCounter("openclaw.tokens", {
171
718
  unit: "1",
172
719
  description: "Token usage by type",
173
720
  });
721
+ const genAiTokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
722
+ unit: "{token}",
723
+ description: "Number of input and output tokens used by GenAI client operations",
724
+ advice: {
725
+ explicitBucketBoundaries: GEN_AI_TOKEN_USAGE_BUCKETS,
726
+ },
727
+ });
728
+ const genAiOperationDurationHistogram = meter.createHistogram(
729
+ "gen_ai.client.operation.duration",
730
+ {
731
+ unit: "s",
732
+ description: "GenAI client operation duration",
733
+ advice: {
734
+ explicitBucketBoundaries: GEN_AI_OPERATION_DURATION_BUCKETS,
735
+ },
736
+ },
737
+ );
174
738
  const costCounter = meter.createCounter("openclaw.cost.usd", {
175
739
  unit: "1",
176
740
  description: "Estimated model cost (USD)",
@@ -179,6 +743,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
179
743
  unit: "ms",
180
744
  description: "Agent run duration",
181
745
  });
746
+ const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
747
+ unit: "ms",
748
+ description: "Agent harness lifecycle duration",
749
+ });
182
750
  const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
183
751
  unit: "1",
184
752
  description: "Context window size and usage",
@@ -207,6 +775,20 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
207
775
  unit: "ms",
208
776
  description: "Message processing duration",
209
777
  });
778
+ const messageDeliveryStartedCounter = meter.createCounter(
779
+ "openclaw.message.delivery.started",
780
+ {
781
+ unit: "1",
782
+ description: "Outbound message delivery attempts started",
783
+ },
784
+ );
785
+ const messageDeliveryDurationHistogram = meter.createHistogram(
786
+ "openclaw.message.delivery.duration_ms",
787
+ {
788
+ unit: "ms",
789
+ description: "Outbound message delivery duration",
790
+ },
791
+ );
210
792
  const queueDepthHistogram = meter.createHistogram("openclaw.queue.depth", {
211
793
  unit: "1",
212
794
  description: "Queue depth on enqueue/dequeue",
@@ -239,8 +821,118 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
239
821
  unit: "1",
240
822
  description: "Run attempts",
241
823
  });
824
+ const toolLoopCounter = meter.createCounter("openclaw.tool.loop", {
825
+ unit: "1",
826
+ description: "Detected repetitive tool-call loop events",
827
+ });
828
+ const modelCallDurationHistogram = meter.createHistogram("openclaw.model_call.duration_ms", {
829
+ unit: "ms",
830
+ description: "Model call duration",
831
+ });
832
+ const modelCallRequestBytesHistogram = meter.createHistogram(
833
+ "openclaw.model_call.request_bytes",
834
+ {
835
+ unit: "By",
836
+ description: "UTF-8 byte size of sanitized model request payloads",
837
+ },
838
+ );
839
+ const modelCallResponseBytesHistogram = meter.createHistogram(
840
+ "openclaw.model_call.response_bytes",
841
+ {
842
+ unit: "By",
843
+ description: "UTF-8 byte size of streamed model response events",
844
+ },
845
+ );
846
+ const modelCallTimeToFirstByteHistogram = meter.createHistogram(
847
+ "openclaw.model_call.time_to_first_byte_ms",
848
+ {
849
+ unit: "ms",
850
+ description: "Elapsed time before the first streamed model response event",
851
+ },
852
+ );
853
+ const toolExecutionDurationHistogram = meter.createHistogram(
854
+ "openclaw.tool.execution.duration_ms",
855
+ {
856
+ unit: "ms",
857
+ description: "Tool execution duration",
858
+ },
859
+ );
860
+ const execProcessDurationHistogram = meter.createHistogram("openclaw.exec.duration_ms", {
861
+ unit: "ms",
862
+ description: "Exec process duration",
863
+ });
864
+ const memoryRssHistogram = meter.createHistogram("openclaw.memory.rss_bytes", {
865
+ unit: "By",
866
+ description: "Resident set size reported by diagnostic memory samples",
867
+ });
868
+ const memoryHeapUsedHistogram = meter.createHistogram("openclaw.memory.heap_used_bytes", {
869
+ unit: "By",
870
+ description: "Heap used bytes reported by diagnostic memory samples",
871
+ });
872
+ const memoryHeapTotalHistogram = meter.createHistogram("openclaw.memory.heap_total_bytes", {
873
+ unit: "By",
874
+ description: "Heap total bytes reported by diagnostic memory samples",
875
+ });
876
+ const memoryExternalHistogram = meter.createHistogram("openclaw.memory.external_bytes", {
877
+ unit: "By",
878
+ description: "External memory bytes reported by diagnostic memory samples",
879
+ });
880
+ const memoryArrayBuffersHistogram = meter.createHistogram(
881
+ "openclaw.memory.array_buffers_bytes",
882
+ {
883
+ unit: "By",
884
+ description: "ArrayBuffer bytes reported by diagnostic memory samples",
885
+ },
886
+ );
887
+ const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
888
+ unit: "1",
889
+ description: "Diagnostic memory pressure events",
890
+ });
891
+ const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", {
892
+ unit: "1",
893
+ description: "Diagnostic liveness warning events",
894
+ });
895
+ const livenessEventLoopDelayP99Histogram = meter.createHistogram(
896
+ "openclaw.liveness.event_loop_delay_p99_ms",
897
+ {
898
+ unit: "ms",
899
+ description: "P99 event-loop delay reported by diagnostic liveness warnings",
900
+ },
901
+ );
902
+ const livenessEventLoopDelayMaxHistogram = meter.createHistogram(
903
+ "openclaw.liveness.event_loop_delay_max_ms",
904
+ {
905
+ unit: "ms",
906
+ description: "Maximum event-loop delay reported by diagnostic liveness warnings",
907
+ },
908
+ );
909
+ const livenessEventLoopUtilizationHistogram = meter.createHistogram(
910
+ "openclaw.liveness.event_loop_utilization",
911
+ {
912
+ unit: "1",
913
+ description: "Event-loop utilization reported by diagnostic liveness warnings",
914
+ },
915
+ );
916
+ const livenessCpuCoreRatioHistogram = meter.createHistogram(
917
+ "openclaw.liveness.cpu_core_ratio",
918
+ {
919
+ unit: "1",
920
+ description: "CPU core ratio reported by diagnostic liveness warnings",
921
+ },
922
+ );
923
+ const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
924
+ unit: "1",
925
+ description: "Diagnostic telemetry exporter lifecycle and failure events",
926
+ });
242
927
 
928
+ let recordLogRecord:
929
+ | ((
930
+ evt: Extract<DiagnosticEventPayload, { type: "log.record" }>,
931
+ metadata: DiagnosticEventMetadata,
932
+ ) => void)
933
+ | undefined;
243
934
  if (logsEnabled) {
935
+ let logRecordExportFailureLastReportedAt = Number.NEGATIVE_INFINITY;
244
936
  const logExporter = new OTLPLogExporter({
245
937
  ...(logUrl ? { url: logUrl } : {}),
246
938
  ...(headers ? { headers } : {}),
@@ -256,142 +948,238 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
256
948
  processors: [logProcessor],
257
949
  });
258
950
  const otelLogger = logProvider.getLogger("openclaw");
259
-
260
- stopLogTransport = registerLogTransport((logObj) => {
951
+ recordLogRecord = (evt, metadata) => {
261
952
  try {
262
- const safeStringify = (value: unknown) => {
263
- try {
264
- return JSON.stringify(value);
265
- } catch {
266
- return String(value);
267
- }
268
- };
269
- const meta = (logObj as Record<string, unknown>)._meta as
270
- | {
271
- logLevelName?: string;
272
- date?: Date;
273
- name?: string;
274
- parentNames?: string[];
275
- path?: {
276
- filePath?: string;
277
- fileLine?: string;
278
- fileColumn?: string;
279
- filePathWithLine?: string;
280
- method?: string;
281
- };
282
- }
283
- | undefined;
284
- const logLevelName = meta?.logLevelName ?? "INFO";
953
+ const logLevelName = evt.level || "INFO";
285
954
  const severityNumber = logSeverityMap[logLevelName] ?? (9 as SeverityNumber);
286
-
287
- const numericArgs = Object.entries(logObj)
288
- .filter(([key]) => /^\d+$/.test(key))
289
- .toSorted((a, b) => Number(a[0]) - Number(b[0]))
290
- .map(([, value]) => value);
291
-
292
- let bindings: Record<string, unknown> | undefined;
293
- if (typeof numericArgs[0] === "string" && numericArgs[0].trim().startsWith("{")) {
294
- try {
295
- const parsed = JSON.parse(numericArgs[0]);
296
- if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
297
- bindings = parsed as Record<string, unknown>;
298
- numericArgs.shift();
299
- }
300
- } catch {
301
- // ignore malformed json bindings
302
- }
303
- }
304
-
305
- let message = "";
306
- if (numericArgs.length > 0 && typeof numericArgs[numericArgs.length - 1] === "string") {
307
- message = String(numericArgs.pop());
308
- } else if (numericArgs.length === 1) {
309
- message = safeStringify(numericArgs[0]);
310
- numericArgs.length = 0;
955
+ const attributes = Object.create(null) as Record<string, string | number | boolean>;
956
+ assignOtelLogAttribute(attributes, "openclaw.log.level", logLevelName);
957
+ if (evt.loggerName) {
958
+ assignOtelLogAttribute(attributes, "openclaw.logger", evt.loggerName);
311
959
  }
312
- if (!message) {
313
- message = "log";
960
+ if (evt.loggerParents?.length) {
961
+ assignOtelLogAttribute(
962
+ attributes,
963
+ "openclaw.logger.parents",
964
+ evt.loggerParents.join("."),
965
+ );
314
966
  }
315
-
316
- const attributes: Record<string, string | number | boolean> = {
317
- "openclaw.log.level": logLevelName,
318
- };
319
- if (meta?.name) {
320
- attributes["openclaw.logger"] = meta.name;
321
- }
322
- if (meta?.parentNames?.length) {
323
- attributes["openclaw.logger.parents"] = meta.parentNames.join(".");
324
- }
325
- if (bindings) {
326
- for (const [key, value] of Object.entries(bindings)) {
327
- if (
328
- typeof value === "string" ||
329
- typeof value === "number" ||
330
- typeof value === "boolean"
331
- ) {
332
- attributes[`openclaw.${key}`] = value;
333
- } else if (value != null) {
334
- attributes[`openclaw.${key}`] = safeStringify(value);
335
- }
336
- }
337
- }
338
- if (numericArgs.length > 0) {
339
- attributes["openclaw.log.args"] = safeStringify(numericArgs);
340
- }
341
- if (meta?.path?.filePath) {
342
- attributes["code.filepath"] = meta.path.filePath;
343
- }
344
- if (meta?.path?.fileLine) {
345
- attributes["code.lineno"] = Number(meta.path.fileLine);
967
+ assignOtelLogEventAttributes(attributes, evt.attributes);
968
+ if (evt.code?.line) {
969
+ assignOtelLogAttribute(attributes, "code.lineno", evt.code.line);
346
970
  }
347
- if (meta?.path?.method) {
348
- attributes["code.function"] = meta.path.method;
971
+ if (evt.code?.functionName) {
972
+ assignOtelLogAttribute(attributes, "code.function", evt.code.functionName);
349
973
  }
350
- if (meta?.path?.filePathWithLine) {
351
- attributes["openclaw.code.location"] = meta.path.filePathWithLine;
974
+ if (metadata.trusted) {
975
+ addTraceAttributes(attributes, evt.trace);
352
976
  }
353
977
 
354
- // OTLP can leave the host boundary, so redact string fields before export.
355
- otelLogger.emit({
356
- body: redactSensitiveText(message),
978
+ const logRecord: LogRecord = {
979
+ body: normalizeOtelLogString(evt.message || "log", MAX_OTEL_LOG_BODY_CHARS),
357
980
  severityText: logLevelName,
358
981
  severityNumber,
359
982
  attributes: redactOtelAttributes(attributes),
360
- timestamp: meta?.date ?? new Date(),
361
- });
983
+ timestamp: evt.ts,
984
+ };
985
+ const logContext = contextForTrustedTraceContext(evt, metadata);
986
+ if (logContext) {
987
+ logRecord.context = logContext;
988
+ }
989
+ otelLogger.emit(logRecord);
362
990
  } catch (err) {
363
- ctx.logger.error(`diagnostics-otel: log transport failed: ${formatError(err)}`);
991
+ emitExporterEvent({
992
+ exporter: "diagnostics-otel",
993
+ signal: "logs",
994
+ status: "failure",
995
+ reason: "emit_failed",
996
+ errorCategory: errorCategory(err),
997
+ });
998
+ const now = Date.now();
999
+ if (
1000
+ now - logRecordExportFailureLastReportedAt >=
1001
+ LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS
1002
+ ) {
1003
+ logRecordExportFailureLastReportedAt = now;
1004
+ ctx.logger.error(`diagnostics-otel: log record export failed: ${formatError(err)}`);
1005
+ }
364
1006
  }
365
- });
1007
+ };
366
1008
  }
367
1009
 
368
1010
  const spanWithDuration = (
369
1011
  name: string,
370
- attributes: Record<string, string | number>,
1012
+ attributes: Record<string, string | number | boolean>,
371
1013
  durationMs?: number,
1014
+ options: {
1015
+ parentContext?: ReturnType<typeof contextForTraceContext> | null;
1016
+ endTimeMs?: number;
1017
+ startTimeMs?: number;
1018
+ } = {},
372
1019
  ) => {
1020
+ const endTimeMs = options.endTimeMs ?? Date.now();
373
1021
  const startTime =
374
- typeof durationMs === "number" ? Date.now() - Math.max(0, durationMs) : undefined;
375
- const span = tracer.startSpan(name, {
376
- attributes,
377
- ...(startTime ? { startTime } : {}),
378
- });
1022
+ typeof options.startTimeMs === "number"
1023
+ ? options.startTimeMs
1024
+ : typeof durationMs === "number" && durationMs >= 0
1025
+ ? endTimeMs - durationMs
1026
+ : undefined;
1027
+ const parentContext =
1028
+ "parentContext" in options ? (options.parentContext ?? undefined) : undefined;
1029
+ const span = tracer.startSpan(
1030
+ name,
1031
+ {
1032
+ attributes: redactOtelAttributes(attributes),
1033
+ ...(startTime !== undefined ? { startTime } : {}),
1034
+ },
1035
+ parentContext,
1036
+ );
1037
+ return span;
1038
+ };
1039
+ const trustedTraceContext = (
1040
+ evt: DiagnosticEventPayload,
1041
+ metadata: DiagnosticEventMetadata,
1042
+ ) => (metadata.trusted ? normalizeTraceContext(evt.trace) : undefined);
1043
+ const activeTrustedParentContext = (
1044
+ evt: DiagnosticEventPayload,
1045
+ metadata: DiagnosticEventMetadata,
1046
+ ) => {
1047
+ const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
1048
+ if (!parentSpanId) {
1049
+ return undefined;
1050
+ }
1051
+ const activeParentSpan =
1052
+ activeTrustedSpans.get(parentSpanId) ?? activeTrustedSpanAliases.get(parentSpanId);
1053
+ if (!activeParentSpan) {
1054
+ return undefined;
1055
+ }
1056
+ return trace.setSpanContext(otelContextApi.active(), activeParentSpan.spanContext());
1057
+ };
1058
+ const trackTrustedSpan = (
1059
+ evt: DiagnosticEventPayload,
1060
+ metadata: DiagnosticEventMetadata,
1061
+ span: ReturnType<typeof tracer.startSpan>,
1062
+ ) => {
1063
+ const spanId = trustedTraceContext(evt, metadata)?.spanId;
1064
+ if (spanId) {
1065
+ activeTrustedSpans.set(spanId, span);
1066
+ }
1067
+ return span;
1068
+ };
1069
+ const takeTrackedTrustedSpan = (
1070
+ evt: DiagnosticEventPayload,
1071
+ metadata: DiagnosticEventMetadata,
1072
+ ) => {
1073
+ const spanId = trustedTraceContext(evt, metadata)?.spanId;
1074
+ if (!spanId) {
1075
+ return undefined;
1076
+ }
1077
+ const span = activeTrustedSpans.get(spanId);
1078
+ if (span) {
1079
+ activeTrustedSpans.delete(spanId);
1080
+ }
379
1081
  return span;
380
1082
  };
1083
+ const setSpanAttrs = (
1084
+ span: ReturnType<typeof tracer.startSpan>,
1085
+ attributes: Record<string, string | number | boolean>,
1086
+ ) => {
1087
+ span.setAttributes?.(redactOtelAttributes(attributes));
1088
+ };
1089
+ const scheduleTrackedRunSpanFinalize = (
1090
+ spanId: string,
1091
+ parentSpanId: string | undefined,
1092
+ span: ReturnType<typeof tracer.startSpan>,
1093
+ endTimeMs: number,
1094
+ ) => {
1095
+ const existingHandle = pendingTrustedRunFinalizers.get(spanId);
1096
+ if (existingHandle) {
1097
+ clearImmediate(existingHandle);
1098
+ }
1099
+ const handle = setImmediate(() => {
1100
+ pendingTrustedRunFinalizers.delete(spanId);
1101
+ if (activeTrustedSpans.get(spanId) === span) {
1102
+ activeTrustedSpans.delete(spanId);
1103
+ }
1104
+ if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) {
1105
+ activeTrustedSpanAliases.delete(parentSpanId);
1106
+ }
1107
+ span.end(endTimeMs);
1108
+ });
1109
+ pendingTrustedRunFinalizers.set(spanId, handle);
1110
+ };
1111
+
1112
+ const addRunAttrs = (
1113
+ spanAttrs: Record<string, string | number | boolean>,
1114
+ evt: {
1115
+ runId?: string;
1116
+ sessionKey?: string;
1117
+ sessionId?: string;
1118
+ provider?: string;
1119
+ model?: string;
1120
+ channel?: string;
1121
+ trigger?: string;
1122
+ },
1123
+ ) => {
1124
+ if (evt.provider) {
1125
+ spanAttrs["openclaw.provider"] = evt.provider;
1126
+ }
1127
+ if (evt.model) {
1128
+ spanAttrs["openclaw.model"] = evt.model;
1129
+ }
1130
+ if (evt.channel) {
1131
+ spanAttrs["openclaw.channel"] = evt.channel;
1132
+ }
1133
+ if (evt.trigger) {
1134
+ spanAttrs["openclaw.trigger"] = evt.trigger;
1135
+ }
1136
+ };
1137
+
1138
+ const paramsSummaryAttrs = (
1139
+ summary: Extract<
1140
+ DiagnosticEventPayload,
1141
+ { type: "tool.execution.started" }
1142
+ >["paramsSummary"],
1143
+ ): Record<string, string | number> => {
1144
+ if (!summary) {
1145
+ return {};
1146
+ }
1147
+ return {
1148
+ "openclaw.tool.params.kind": summary.kind,
1149
+ ...("length" in summary ? { "openclaw.tool.params.length": summary.length } : {}),
1150
+ };
1151
+ };
381
1152
 
382
- const recordModelUsage = (evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>) => {
1153
+ const recordModelUsage = (
1154
+ evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
1155
+ metadata: DiagnosticEventMetadata,
1156
+ ) => {
383
1157
  const attrs = {
384
1158
  "openclaw.channel": evt.channel ?? "unknown",
1159
+ "openclaw.agent": lowCardinalityAttr(evt.agentId),
385
1160
  "openclaw.provider": evt.provider ?? "unknown",
386
1161
  "openclaw.model": evt.model ?? "unknown",
387
1162
  };
1163
+ const genAiAttrs: Record<string, string> = {
1164
+ "gen_ai.operation.name": "chat",
1165
+ "gen_ai.provider.name": lowCardinalityAttr(evt.provider),
1166
+ "gen_ai.request.model": lowCardinalityAttr(evt.model),
1167
+ };
388
1168
 
389
1169
  const usage = evt.usage;
390
1170
  if (usage.input) {
391
1171
  tokensCounter.add(usage.input, { ...attrs, "openclaw.token": "input" });
1172
+ genAiTokenUsageHistogram.record(usage.input, {
1173
+ ...genAiAttrs,
1174
+ "gen_ai.token.type": "input",
1175
+ });
392
1176
  }
393
1177
  if (usage.output) {
394
1178
  tokensCounter.add(usage.output, { ...attrs, "openclaw.token": "output" });
1179
+ genAiTokenUsageHistogram.record(usage.output, {
1180
+ ...genAiAttrs,
1181
+ "gen_ai.token.type": "output",
1182
+ });
395
1183
  }
396
1184
  if (usage.cacheRead) {
397
1185
  tokensCounter.add(usage.cacheRead, { ...attrs, "openclaw.token": "cache_read" });
@@ -428,19 +1216,36 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
428
1216
  if (!tracesEnabled) {
429
1217
  return;
430
1218
  }
1219
+ const genAiInputTokens =
1220
+ usage.promptTokens ??
1221
+ (usage.input ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0);
431
1222
  const spanAttrs: Record<string, string | number> = {
432
1223
  ...attrs,
433
- "openclaw.sessionKey": evt.sessionKey ?? "",
434
- "openclaw.sessionId": evt.sessionId ?? "",
435
1224
  "openclaw.tokens.input": usage.input ?? 0,
436
1225
  "openclaw.tokens.output": usage.output ?? 0,
437
1226
  "openclaw.tokens.cache_read": usage.cacheRead ?? 0,
438
1227
  "openclaw.tokens.cache_write": usage.cacheWrite ?? 0,
439
1228
  "openclaw.tokens.total": usage.total ?? 0,
440
1229
  };
1230
+ assignGenAiSpanIdentityAttrs(spanAttrs, evt);
1231
+ assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.input_tokens", genAiInputTokens);
1232
+ assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.output_tokens", usage.output);
1233
+ assignPositiveNumberAttr(
1234
+ spanAttrs,
1235
+ "gen_ai.usage.cache_read.input_tokens",
1236
+ usage.cacheRead,
1237
+ );
1238
+ assignPositiveNumberAttr(
1239
+ spanAttrs,
1240
+ "gen_ai.usage.cache_creation.input_tokens",
1241
+ usage.cacheWrite,
1242
+ );
441
1243
 
442
- const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs);
443
- span.end();
1244
+ const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, {
1245
+ parentContext: activeTrustedParentContext(evt, metadata),
1246
+ endTimeMs: evt.ts,
1247
+ });
1248
+ span.end(evt.ts);
444
1249
  };
445
1250
 
446
1251
  const recordWebhookReceived = (
@@ -513,18 +1318,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
513
1318
  }
514
1319
  };
515
1320
 
516
- const addSessionIdentityAttrs = (
517
- spanAttrs: Record<string, string | number>,
518
- evt: { sessionKey?: string; sessionId?: string },
519
- ) => {
520
- if (evt.sessionKey) {
521
- spanAttrs["openclaw.sessionKey"] = evt.sessionKey;
522
- }
523
- if (evt.sessionId) {
524
- spanAttrs["openclaw.sessionId"] = evt.sessionId;
525
- }
526
- };
527
-
528
1321
  const recordMessageProcessed = (
529
1322
  evt: Extract<DiagnosticEventPayload, { type: "message.processed" }>,
530
1323
  ) => {
@@ -540,7 +1333,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
540
1333
  return;
541
1334
  }
542
1335
  const spanAttrs: Record<string, string | number> = { ...attrs };
543
- addSessionIdentityAttrs(spanAttrs, evt);
544
1336
  if (evt.chatId !== undefined) {
545
1337
  spanAttrs["openclaw.chatId"] = String(evt.chatId);
546
1338
  }
@@ -557,27 +1349,108 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
557
1349
  span.end();
558
1350
  };
559
1351
 
560
- const recordLaneEnqueue = (
561
- evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
562
- ) => {
563
- const attrs = { "openclaw.lane": evt.lane };
564
- laneEnqueueCounter.add(1, attrs);
565
- queueDepthHistogram.record(evt.queueSize, attrs);
566
- };
1352
+ const messageDeliveryAttrs = (
1353
+ evt: MessageDeliveryDiagnosticEvent,
1354
+ ): Record<string, string> => ({
1355
+ "openclaw.channel": evt.channel,
1356
+ "openclaw.delivery.kind": evt.deliveryKind,
1357
+ });
567
1358
 
568
- const recordLaneDequeue = (
569
- evt: Extract<DiagnosticEventPayload, { type: "queue.lane.dequeue" }>,
1359
+ const recordMessageDeliveryStarted = (
1360
+ evt: Extract<DiagnosticEventPayload, { type: "message.delivery.started" }>,
570
1361
  ) => {
571
- const attrs = { "openclaw.lane": evt.lane };
572
- laneDequeueCounter.add(1, attrs);
573
- queueDepthHistogram.record(evt.queueSize, attrs);
574
- if (typeof evt.waitMs === "number") {
575
- queueWaitHistogram.record(evt.waitMs, attrs);
576
- }
1362
+ messageDeliveryStartedCounter.add(1, messageDeliveryAttrs(evt));
577
1363
  };
578
1364
 
579
- const recordSessionState = (
580
- evt: Extract<DiagnosticEventPayload, { type: "session.state" }>,
1365
+ const recordMessageDeliveryCompleted = (
1366
+ evt: Extract<DiagnosticEventPayload, { type: "message.delivery.completed" }>,
1367
+ ) => {
1368
+ const attrs = {
1369
+ ...messageDeliveryAttrs(evt),
1370
+ "openclaw.outcome": "completed",
1371
+ };
1372
+ messageDeliveryDurationHistogram.record(evt.durationMs, attrs);
1373
+ if (!tracesEnabled) {
1374
+ return;
1375
+ }
1376
+ const span = spanWithDuration(
1377
+ "openclaw.message.delivery",
1378
+ {
1379
+ ...attrs,
1380
+ "openclaw.delivery.result_count": evt.resultCount,
1381
+ },
1382
+ evt.durationMs,
1383
+ { endTimeMs: evt.ts },
1384
+ );
1385
+ span.end(evt.ts);
1386
+ };
1387
+
1388
+ const recordMessageDeliveryError = (
1389
+ evt: Extract<DiagnosticEventPayload, { type: "message.delivery.error" }>,
1390
+ ) => {
1391
+ const attrs = {
1392
+ ...messageDeliveryAttrs(evt),
1393
+ "openclaw.outcome": "error",
1394
+ "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
1395
+ };
1396
+ messageDeliveryDurationHistogram.record(evt.durationMs, attrs);
1397
+ if (!tracesEnabled) {
1398
+ return;
1399
+ }
1400
+ const span = spanWithDuration("openclaw.message.delivery", attrs, evt.durationMs, {
1401
+ endTimeMs: evt.ts,
1402
+ });
1403
+ span.setStatus({
1404
+ code: SpanStatusCode.ERROR,
1405
+ message: redactSensitiveText(evt.errorCategory),
1406
+ });
1407
+ span.end(evt.ts);
1408
+ };
1409
+
1410
+ const recordRunStarted = (
1411
+ evt: Extract<DiagnosticEventPayload, { type: "run.started" }>,
1412
+ metadata: DiagnosticEventMetadata,
1413
+ ) => {
1414
+ if (!tracesEnabled || !metadata.trusted) {
1415
+ return;
1416
+ }
1417
+ const spanAttrs: Record<string, string | number | boolean> = {};
1418
+ addRunAttrs(spanAttrs, evt);
1419
+ const span = trackTrustedSpan(
1420
+ evt,
1421
+ metadata,
1422
+ spanWithDuration("openclaw.run", spanAttrs, undefined, {
1423
+ parentContext: activeTrustedParentContext(evt, metadata),
1424
+ startTimeMs: evt.ts,
1425
+ }),
1426
+ );
1427
+ const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
1428
+ if (parentSpanId && !activeTrustedSpans.has(parentSpanId)) {
1429
+ activeTrustedSpanAliases.set(parentSpanId, span);
1430
+ }
1431
+ };
1432
+
1433
+ const recordLaneEnqueue = (
1434
+ evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
1435
+ ) => {
1436
+ const attrs = { "openclaw.lane": evt.lane };
1437
+ laneEnqueueCounter.add(1, attrs);
1438
+ queueDepthHistogram.record(evt.queueSize, attrs);
1439
+ };
1440
+
1441
+ const recordLaneDequeue = (
1442
+ evt: Extract<DiagnosticEventPayload, { type: "queue.lane.dequeue" }>,
1443
+ ) => {
1444
+ const attrs = { "openclaw.lane": evt.lane };
1445
+ laneDequeueCounter.add(1, attrs);
1446
+ queueDepthHistogram.record(evt.queueSize, attrs);
1447
+ if (typeof evt.waitMs === "number") {
1448
+ queueWaitHistogram.record(evt.waitMs, attrs);
1449
+ }
1450
+ };
1451
+
1452
+ const recordSessionState = (
1453
+ evt: Extract<DiagnosticEventPayload, { type: "session.state" }>,
581
1454
  ) => {
582
1455
  const attrs: Record<string, string> = { "openclaw.state": evt.state };
583
1456
  if (evt.reason) {
@@ -598,7 +1471,6 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
598
1471
  return;
599
1472
  }
600
1473
  const spanAttrs: Record<string, string | number> = { ...attrs };
601
- addSessionIdentityAttrs(spanAttrs, evt);
602
1474
  spanAttrs["openclaw.queueDepth"] = evt.queueDepth ?? 0;
603
1475
  spanAttrs["openclaw.ageMs"] = evt.ageMs;
604
1476
  const span = tracer.startSpan("openclaw.session.stuck", { attributes: spanAttrs });
@@ -610,17 +1482,705 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
610
1482
  runAttemptCounter.add(1, { "openclaw.attempt": evt.attempt });
611
1483
  };
612
1484
 
1485
+ const toolLoopAttrs = (
1486
+ evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>,
1487
+ ): Record<string, string | number> => ({
1488
+ "openclaw.toolName": lowCardinalityAttr(evt.toolName, "tool"),
1489
+ "openclaw.loop.level": evt.level,
1490
+ "openclaw.loop.action": evt.action,
1491
+ "openclaw.loop.detector": evt.detector,
1492
+ "openclaw.loop.count": evt.count,
1493
+ ...(evt.pairedToolName
1494
+ ? { "openclaw.loop.paired_tool": lowCardinalityAttr(evt.pairedToolName, "tool") }
1495
+ : {}),
1496
+ });
1497
+
1498
+ const recordToolLoop = (evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>) => {
1499
+ const attrs = toolLoopAttrs(evt);
1500
+ toolLoopCounter.add(1, attrs);
1501
+ if (!tracesEnabled) {
1502
+ return;
1503
+ }
1504
+ const span = spanWithDuration("openclaw.tool.loop", attrs, 0, { endTimeMs: evt.ts });
1505
+ if (evt.level === "critical" || evt.action === "block") {
1506
+ span.setStatus({
1507
+ code: SpanStatusCode.ERROR,
1508
+ message: `${evt.detector}:${evt.action}`,
1509
+ });
1510
+ }
1511
+ span.end(evt.ts);
1512
+ };
1513
+
1514
+ const recordMemoryUsageMetrics = (
1515
+ evt: Extract<
1516
+ DiagnosticEventPayload,
1517
+ { type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
1518
+ >,
1519
+ attrs: Record<string, string> = {},
1520
+ ) => {
1521
+ memoryRssHistogram.record(evt.memory.rssBytes, attrs);
1522
+ memoryHeapUsedHistogram.record(evt.memory.heapUsedBytes, attrs);
1523
+ memoryHeapTotalHistogram.record(evt.memory.heapTotalBytes, attrs);
1524
+ memoryExternalHistogram.record(evt.memory.externalBytes, attrs);
1525
+ memoryArrayBuffersHistogram.record(evt.memory.arrayBuffersBytes, attrs);
1526
+ };
1527
+
1528
+ const recordMemorySample = (
1529
+ evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
1530
+ ) => {
1531
+ recordMemoryUsageMetrics(evt);
1532
+ };
1533
+
1534
+ const recordMemoryPressure = (
1535
+ evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
1536
+ ) => {
1537
+ const attrs = {
1538
+ "openclaw.memory.level": evt.level,
1539
+ "openclaw.memory.reason": evt.reason,
1540
+ };
1541
+ memoryPressureCounter.add(1, attrs);
1542
+ recordMemoryUsageMetrics(evt, attrs);
1543
+ if (!tracesEnabled) {
1544
+ return;
1545
+ }
1546
+ const spanAttrs: Record<string, string | number | boolean> = {
1547
+ ...attrs,
1548
+ "openclaw.memory.rss_bytes": evt.memory.rssBytes,
1549
+ "openclaw.memory.heap_used_bytes": evt.memory.heapUsedBytes,
1550
+ "openclaw.memory.heap_total_bytes": evt.memory.heapTotalBytes,
1551
+ "openclaw.memory.external_bytes": evt.memory.externalBytes,
1552
+ "openclaw.memory.array_buffers_bytes": evt.memory.arrayBuffersBytes,
1553
+ ...(evt.thresholdBytes !== undefined
1554
+ ? { "openclaw.memory.threshold_bytes": evt.thresholdBytes }
1555
+ : {}),
1556
+ ...(evt.rssGrowthBytes !== undefined
1557
+ ? { "openclaw.memory.rss_growth_bytes": evt.rssGrowthBytes }
1558
+ : {}),
1559
+ ...(evt.windowMs !== undefined ? { "openclaw.memory.window_ms": evt.windowMs } : {}),
1560
+ };
1561
+ const span = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
1562
+ endTimeMs: evt.ts,
1563
+ });
1564
+ if (evt.level === "critical") {
1565
+ span.setStatus({
1566
+ code: SpanStatusCode.ERROR,
1567
+ message: evt.reason,
1568
+ });
1569
+ }
1570
+ span.end(evt.ts);
1571
+ };
1572
+
1573
+ const recordRunCompleted = (
1574
+ evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
1575
+ metadata: DiagnosticEventMetadata,
1576
+ ) => {
1577
+ const attrs: Record<string, string | number> = {
1578
+ "openclaw.outcome": evt.outcome,
1579
+ "openclaw.provider": evt.provider ?? "unknown",
1580
+ "openclaw.model": evt.model ?? "unknown",
1581
+ };
1582
+ if (evt.channel) {
1583
+ attrs["openclaw.channel"] = evt.channel;
1584
+ }
1585
+ durationHistogram.record(evt.durationMs, attrs);
1586
+ if (!tracesEnabled) {
1587
+ return;
1588
+ }
1589
+ const spanAttrs: Record<string, string | number | boolean> = {
1590
+ "openclaw.outcome": evt.outcome,
1591
+ };
1592
+ addRunAttrs(spanAttrs, evt);
1593
+ if (evt.errorCategory) {
1594
+ spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
1595
+ }
1596
+ const trustedTrace = trustedTraceContext(evt, metadata);
1597
+ const trackedSpan = trustedTrace?.spanId
1598
+ ? activeTrustedSpans.get(trustedTrace.spanId)
1599
+ : undefined;
1600
+ const span =
1601
+ trackedSpan ??
1602
+ spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
1603
+ parentContext: activeTrustedParentContext(evt, metadata),
1604
+ endTimeMs: evt.ts,
1605
+ });
1606
+ setSpanAttrs(span, spanAttrs);
1607
+ if (evt.outcome === "error") {
1608
+ span.setStatus({
1609
+ code: SpanStatusCode.ERROR,
1610
+ ...(evt.errorCategory ? { message: redactSensitiveText(evt.errorCategory) } : {}),
1611
+ });
1612
+ }
1613
+ if (trackedSpan && trustedTrace?.spanId) {
1614
+ scheduleTrackedRunSpanFinalize(
1615
+ trustedTrace.spanId,
1616
+ trustedTrace.parentSpanId,
1617
+ trackedSpan,
1618
+ evt.ts,
1619
+ );
1620
+ return;
1621
+ }
1622
+ span.end(evt.ts);
1623
+ };
1624
+
1625
+ const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => ({
1626
+ "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
1627
+ "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
1628
+ ...(evt.type === "harness.run.started"
1629
+ ? {}
1630
+ : {
1631
+ "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
1632
+ }),
1633
+ "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
1634
+ "openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
1635
+ ...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}),
1636
+ });
1637
+
1638
+ const recordHarnessRunStarted = (
1639
+ evt: Extract<DiagnosticEventPayload, { type: "harness.run.started" }>,
1640
+ metadata: DiagnosticEventMetadata,
1641
+ ) => {
1642
+ if (!tracesEnabled || !metadata.trusted) {
1643
+ return;
1644
+ }
1645
+ trackTrustedSpan(
1646
+ evt,
1647
+ metadata,
1648
+ spanWithDuration("openclaw.harness.run", harnessRunMetricAttrs(evt), undefined, {
1649
+ parentContext: activeTrustedParentContext(evt, metadata),
1650
+ startTimeMs: evt.ts,
1651
+ }),
1652
+ );
1653
+ };
1654
+
1655
+ const recordHarnessRunCompleted = (
1656
+ evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
1657
+ metadata: DiagnosticEventMetadata,
1658
+ ) => {
1659
+ harnessDurationHistogram.record(evt.durationMs, harnessRunMetricAttrs(evt));
1660
+ if (!tracesEnabled) {
1661
+ return;
1662
+ }
1663
+ const spanAttrs: Record<string, string | number | boolean> = {
1664
+ ...harnessRunMetricAttrs(evt),
1665
+ };
1666
+ if (evt.resultClassification) {
1667
+ spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr(
1668
+ evt.resultClassification,
1669
+ );
1670
+ }
1671
+ if (typeof evt.yieldDetected === "boolean") {
1672
+ spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected;
1673
+ }
1674
+ if (evt.itemLifecycle) {
1675
+ spanAttrs["openclaw.harness.items.started"] = evt.itemLifecycle.startedCount;
1676
+ spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
1677
+ spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
1678
+ }
1679
+ const span =
1680
+ takeTrackedTrustedSpan(evt, metadata) ??
1681
+ spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
1682
+ parentContext: activeTrustedParentContext(evt, metadata),
1683
+ endTimeMs: evt.ts,
1684
+ });
1685
+ setSpanAttrs(span, spanAttrs);
1686
+ if (evt.outcome === "error") {
1687
+ span.setStatus({
1688
+ code: SpanStatusCode.ERROR,
1689
+ message: "error",
1690
+ });
1691
+ }
1692
+ span.end(evt.ts);
1693
+ };
1694
+
1695
+ const recordHarnessRunError = (
1696
+ evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
1697
+ metadata: DiagnosticEventMetadata,
1698
+ ) => {
1699
+ const errorType = lowCardinalityAttr(evt.errorCategory, "other");
1700
+ const attrs = {
1701
+ ...harnessRunMetricAttrs(evt),
1702
+ "openclaw.harness.phase": evt.phase,
1703
+ "openclaw.errorCategory": errorType,
1704
+ };
1705
+ harnessDurationHistogram.record(evt.durationMs, attrs);
1706
+ if (!tracesEnabled) {
1707
+ return;
1708
+ }
1709
+ const spanAttrs: Record<string, string | number | boolean> = {
1710
+ ...attrs,
1711
+ "error.type": errorType,
1712
+ ...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}),
1713
+ };
1714
+ const span =
1715
+ takeTrackedTrustedSpan(evt, metadata) ??
1716
+ spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
1717
+ parentContext: activeTrustedParentContext(evt, metadata),
1718
+ endTimeMs: evt.ts,
1719
+ });
1720
+ setSpanAttrs(span, spanAttrs);
1721
+ span.setStatus({
1722
+ code: SpanStatusCode.ERROR,
1723
+ message: errorType,
1724
+ });
1725
+ span.end(evt.ts);
1726
+ };
1727
+
1728
+ const recordContextAssembled = (
1729
+ evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
1730
+ metadata: DiagnosticEventMetadata,
1731
+ ) => {
1732
+ if (!tracesEnabled) {
1733
+ return;
1734
+ }
1735
+ const spanAttrs: Record<string, string | number | boolean> = {
1736
+ "openclaw.context.message_count": evt.messageCount,
1737
+ "openclaw.context.history_text_chars": evt.historyTextChars,
1738
+ "openclaw.context.history_image_blocks": evt.historyImageBlocks,
1739
+ "openclaw.context.max_message_text_chars": evt.maxMessageTextChars,
1740
+ "openclaw.context.system_prompt_chars": evt.systemPromptChars,
1741
+ "openclaw.context.prompt_chars": evt.promptChars,
1742
+ "openclaw.context.prompt_images": evt.promptImages,
1743
+ };
1744
+ addRunAttrs(spanAttrs, evt);
1745
+ if (evt.contextTokenBudget !== undefined) {
1746
+ spanAttrs["openclaw.context.token_budget"] = evt.contextTokenBudget;
1747
+ }
1748
+ if (evt.reserveTokens !== undefined) {
1749
+ spanAttrs["openclaw.context.reserve_tokens"] = evt.reserveTokens;
1750
+ }
1751
+ const span = spanWithDuration("openclaw.context.assembled", spanAttrs, 0, {
1752
+ parentContext: activeTrustedParentContext(evt, metadata),
1753
+ endTimeMs: evt.ts,
1754
+ });
1755
+ span.end(evt.ts);
1756
+ };
1757
+
1758
+ const modelCallMetricAttrs = (evt: ModelCallLifecycleDiagnosticEvent) => ({
1759
+ "openclaw.provider": evt.provider,
1760
+ "openclaw.model": evt.model,
1761
+ "openclaw.api": lowCardinalityAttr(evt.api),
1762
+ "openclaw.transport": lowCardinalityAttr(evt.transport),
1763
+ });
1764
+ const genAiModelCallMetricAttrs = (
1765
+ evt: ModelCallLifecycleDiagnosticEvent,
1766
+ errorType?: string,
1767
+ ) => ({
1768
+ "gen_ai.operation.name": genAiOperationName(evt.api),
1769
+ "gen_ai.provider.name": lowCardinalityAttr(evt.provider),
1770
+ "gen_ai.request.model": lowCardinalityAttr(evt.model),
1771
+ ...(errorType ? { "error.type": errorType } : {}),
1772
+ });
1773
+ const recordModelCallSizeTimingMetrics = (
1774
+ evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" }>,
1775
+ attrs: ReturnType<typeof modelCallMetricAttrs>,
1776
+ ) => {
1777
+ const requestPayloadBytes = positiveFiniteNumber(evt.requestPayloadBytes);
1778
+ if (requestPayloadBytes !== undefined) {
1779
+ modelCallRequestBytesHistogram.record(requestPayloadBytes, attrs);
1780
+ }
1781
+ const responseStreamBytes = positiveFiniteNumber(evt.responseStreamBytes);
1782
+ if (responseStreamBytes !== undefined) {
1783
+ modelCallResponseBytesHistogram.record(responseStreamBytes, attrs);
1784
+ }
1785
+ const timeToFirstByteMs = positiveFiniteNumber(evt.timeToFirstByteMs);
1786
+ if (timeToFirstByteMs !== undefined) {
1787
+ modelCallTimeToFirstByteHistogram.record(timeToFirstByteMs, attrs);
1788
+ }
1789
+ };
1790
+
1791
+ const recordModelCallStarted = (
1792
+ evt: Extract<DiagnosticEventPayload, { type: "model.call.started" }>,
1793
+ metadata: DiagnosticEventMetadata,
1794
+ ) => {
1795
+ if (!tracesEnabled || !metadata.trusted) {
1796
+ return;
1797
+ }
1798
+ const spanAttrs: Record<string, string | number | boolean> = {
1799
+ "openclaw.provider": evt.provider,
1800
+ "openclaw.model": evt.model,
1801
+ };
1802
+ assignGenAiModelCallAttrs(spanAttrs, evt);
1803
+ if (evt.api) {
1804
+ spanAttrs["openclaw.api"] = evt.api;
1805
+ }
1806
+ if (evt.transport) {
1807
+ spanAttrs["openclaw.transport"] = evt.transport;
1808
+ }
1809
+ trackTrustedSpan(
1810
+ evt,
1811
+ metadata,
1812
+ spanWithDuration("openclaw.model.call", spanAttrs, undefined, {
1813
+ parentContext: activeTrustedParentContext(evt, metadata),
1814
+ startTimeMs: evt.ts,
1815
+ }),
1816
+ );
1817
+ };
1818
+
1819
+ const recordModelCallCompleted = (
1820
+ evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
1821
+ metadata: DiagnosticEventMetadata,
1822
+ ) => {
1823
+ const metricAttrs = modelCallMetricAttrs(evt);
1824
+ modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
1825
+ recordModelCallSizeTimingMetrics(evt, metricAttrs);
1826
+ genAiOperationDurationHistogram.record(
1827
+ evt.durationMs / 1000,
1828
+ genAiModelCallMetricAttrs(evt),
1829
+ );
1830
+ if (!tracesEnabled) {
1831
+ return;
1832
+ }
1833
+ const spanAttrs: Record<string, string | number | boolean> = {
1834
+ "openclaw.provider": evt.provider,
1835
+ "openclaw.model": evt.model,
1836
+ };
1837
+ assignGenAiModelCallAttrs(spanAttrs, evt);
1838
+ if (evt.api) {
1839
+ spanAttrs["openclaw.api"] = evt.api;
1840
+ }
1841
+ if (evt.transport) {
1842
+ spanAttrs["openclaw.transport"] = evt.transport;
1843
+ }
1844
+ assignModelCallSizeTimingAttrs(spanAttrs, evt);
1845
+ assignOtelModelContentAttributes(
1846
+ spanAttrs,
1847
+ evt as unknown as Record<string, unknown>,
1848
+ contentCapturePolicy,
1849
+ );
1850
+ const span =
1851
+ takeTrackedTrustedSpan(evt, metadata) ??
1852
+ spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
1853
+ parentContext: activeTrustedParentContext(evt, metadata),
1854
+ endTimeMs: evt.ts,
1855
+ });
1856
+ setSpanAttrs(span, spanAttrs);
1857
+ addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
1858
+ span.end(evt.ts);
1859
+ };
1860
+
1861
+ const recordModelCallError = (
1862
+ evt: Extract<DiagnosticEventPayload, { type: "model.call.error" }>,
1863
+ metadata: DiagnosticEventMetadata,
1864
+ ) => {
1865
+ const errorType = lowCardinalityAttr(evt.errorCategory, "other");
1866
+ const metricAttrs = {
1867
+ ...modelCallMetricAttrs(evt),
1868
+ "openclaw.errorCategory": errorType,
1869
+ ...(evt.failureKind
1870
+ ? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
1871
+ : {}),
1872
+ };
1873
+ modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
1874
+ recordModelCallSizeTimingMetrics(evt, metricAttrs);
1875
+ genAiOperationDurationHistogram.record(
1876
+ evt.durationMs / 1000,
1877
+ genAiModelCallMetricAttrs(evt, errorType),
1878
+ );
1879
+ if (!tracesEnabled) {
1880
+ return;
1881
+ }
1882
+ const spanAttrs: Record<string, string | number | boolean> = {
1883
+ "openclaw.provider": evt.provider,
1884
+ "openclaw.model": evt.model,
1885
+ "openclaw.errorCategory": errorType,
1886
+ "error.type": errorType,
1887
+ };
1888
+ if (evt.failureKind) {
1889
+ spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
1890
+ }
1891
+ assignGenAiModelCallAttrs(spanAttrs, evt);
1892
+ if (evt.api) {
1893
+ spanAttrs["openclaw.api"] = evt.api;
1894
+ }
1895
+ if (evt.transport) {
1896
+ spanAttrs["openclaw.transport"] = evt.transport;
1897
+ }
1898
+ assignModelCallSizeTimingAttrs(spanAttrs, evt);
1899
+ assignOtelModelContentAttributes(
1900
+ spanAttrs,
1901
+ evt as unknown as Record<string, unknown>,
1902
+ contentCapturePolicy,
1903
+ );
1904
+ const span =
1905
+ takeTrackedTrustedSpan(evt, metadata) ??
1906
+ spanWithDuration("openclaw.model.call", spanAttrs, evt.durationMs, {
1907
+ parentContext: activeTrustedParentContext(evt, metadata),
1908
+ endTimeMs: evt.ts,
1909
+ });
1910
+ setSpanAttrs(span, spanAttrs);
1911
+ addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
1912
+ span.setStatus({
1913
+ code: SpanStatusCode.ERROR,
1914
+ message: redactSensitiveText(evt.errorCategory),
1915
+ });
1916
+ span.end(evt.ts);
1917
+ };
1918
+
1919
+ const toolExecutionBaseAttrs = (
1920
+ evt: Extract<
1921
+ DiagnosticEventPayload,
1922
+ {
1923
+ type:
1924
+ | "tool.execution.started"
1925
+ | "tool.execution.completed"
1926
+ | "tool.execution.error"
1927
+ | "tool.execution.blocked";
1928
+ }
1929
+ >,
1930
+ ): Record<string, string | number | boolean> => ({
1931
+ "openclaw.toolName": evt.toolName,
1932
+ "gen_ai.tool.name": evt.toolName,
1933
+ ...paramsSummaryAttrs(evt.paramsSummary),
1934
+ });
1935
+
1936
+ const recordToolExecutionStarted = (
1937
+ evt: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>,
1938
+ metadata: DiagnosticEventMetadata,
1939
+ ) => {
1940
+ if (!tracesEnabled || !metadata.trusted) {
1941
+ return;
1942
+ }
1943
+ trackTrustedSpan(
1944
+ evt,
1945
+ metadata,
1946
+ spanWithDuration("openclaw.tool.execution", toolExecutionBaseAttrs(evt), undefined, {
1947
+ parentContext: activeTrustedParentContext(evt, metadata),
1948
+ startTimeMs: evt.ts,
1949
+ }),
1950
+ );
1951
+ };
1952
+
1953
+ const recordToolExecutionCompleted = (
1954
+ evt: Extract<DiagnosticEventPayload, { type: "tool.execution.completed" }>,
1955
+ metadata: DiagnosticEventMetadata,
1956
+ ) => {
1957
+ const attrs = {
1958
+ "openclaw.toolName": evt.toolName,
1959
+ ...paramsSummaryAttrs(evt.paramsSummary),
1960
+ };
1961
+ toolExecutionDurationHistogram.record(evt.durationMs, attrs);
1962
+ if (!tracesEnabled) {
1963
+ return;
1964
+ }
1965
+ const spanAttrs: Record<string, string | number | boolean> = {
1966
+ ...toolExecutionBaseAttrs(evt),
1967
+ };
1968
+ addRunAttrs(spanAttrs, evt);
1969
+ assignOtelToolContentAttributes(
1970
+ spanAttrs,
1971
+ evt as unknown as Record<string, unknown>,
1972
+ contentCapturePolicy,
1973
+ );
1974
+ const span =
1975
+ takeTrackedTrustedSpan(evt, metadata) ??
1976
+ spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
1977
+ parentContext: activeTrustedParentContext(evt, metadata),
1978
+ endTimeMs: evt.ts,
1979
+ });
1980
+ setSpanAttrs(span, spanAttrs);
1981
+ span.end(evt.ts);
1982
+ };
1983
+
1984
+ const recordToolExecutionError = (
1985
+ evt: Extract<DiagnosticEventPayload, { type: "tool.execution.error" }>,
1986
+ metadata: DiagnosticEventMetadata,
1987
+ ) => {
1988
+ const attrs = {
1989
+ "openclaw.toolName": evt.toolName,
1990
+ "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
1991
+ ...paramsSummaryAttrs(evt.paramsSummary),
1992
+ };
1993
+ toolExecutionDurationHistogram.record(evt.durationMs, attrs);
1994
+ if (!tracesEnabled) {
1995
+ return;
1996
+ }
1997
+ const spanAttrs: Record<string, string | number | boolean> = {
1998
+ ...toolExecutionBaseAttrs(evt),
1999
+ "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
2000
+ };
2001
+ addRunAttrs(spanAttrs, evt);
2002
+ if (evt.errorCode) {
2003
+ spanAttrs["openclaw.errorCode"] = lowCardinalityAttr(evt.errorCode, "other");
2004
+ }
2005
+ assignOtelToolContentAttributes(
2006
+ spanAttrs,
2007
+ evt as unknown as Record<string, unknown>,
2008
+ contentCapturePolicy,
2009
+ );
2010
+ const span =
2011
+ takeTrackedTrustedSpan(evt, metadata) ??
2012
+ spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
2013
+ parentContext: activeTrustedParentContext(evt, metadata),
2014
+ endTimeMs: evt.ts,
2015
+ });
2016
+ setSpanAttrs(span, spanAttrs);
2017
+ span.setStatus({
2018
+ code: SpanStatusCode.ERROR,
2019
+ message: redactSensitiveText(evt.errorCategory),
2020
+ });
2021
+ span.end(evt.ts);
2022
+ };
2023
+
2024
+ const recordToolExecutionBlocked = (
2025
+ evt: Extract<DiagnosticEventPayload, { type: "tool.execution.blocked" }>,
2026
+ metadata: DiagnosticEventMetadata,
2027
+ ) => {
2028
+ if (!tracesEnabled) {
2029
+ return;
2030
+ }
2031
+ const spanAttrs: Record<string, string | number | boolean> = {
2032
+ ...toolExecutionBaseAttrs(evt),
2033
+ "openclaw.outcome": "blocked",
2034
+ "openclaw.deniedReason": lowCardinalityAttr(evt.deniedReason, "other"),
2035
+ };
2036
+ addRunAttrs(spanAttrs, evt);
2037
+ const span = spanWithDuration("openclaw.tool.execution", spanAttrs, 0, {
2038
+ parentContext: activeTrustedParentContext(evt, metadata),
2039
+ endTimeMs: evt.ts,
2040
+ });
2041
+ setSpanAttrs(span, spanAttrs);
2042
+ span.end(evt.ts);
2043
+ };
2044
+
2045
+ const recordExecProcessCompleted = (
2046
+ evt: Extract<DiagnosticEventPayload, { type: "exec.process.completed" }>,
2047
+ ) => {
2048
+ const attrs: Record<string, string | number> = {
2049
+ "openclaw.exec.target": evt.target,
2050
+ "openclaw.exec.mode": evt.mode,
2051
+ "openclaw.outcome": evt.outcome,
2052
+ };
2053
+ if (evt.failureKind) {
2054
+ attrs["openclaw.failureKind"] = evt.failureKind;
2055
+ }
2056
+ execProcessDurationHistogram.record(evt.durationMs, attrs);
2057
+ if (!tracesEnabled) {
2058
+ return;
2059
+ }
2060
+
2061
+ const spanAttrs: Record<string, string | number | boolean> = {
2062
+ ...attrs,
2063
+ "openclaw.exec.command_length": evt.commandLength,
2064
+ };
2065
+ if (typeof evt.exitCode === "number") {
2066
+ spanAttrs["openclaw.exec.exit_code"] = evt.exitCode;
2067
+ }
2068
+ if (evt.exitSignal) {
2069
+ spanAttrs["openclaw.exec.exit_signal"] = lowCardinalityAttr(evt.exitSignal, "other");
2070
+ }
2071
+ if (evt.timedOut !== undefined) {
2072
+ spanAttrs["openclaw.exec.timed_out"] = evt.timedOut;
2073
+ }
2074
+
2075
+ const span = spanWithDuration("openclaw.exec", spanAttrs, evt.durationMs, {
2076
+ endTimeMs: evt.ts,
2077
+ });
2078
+ if (evt.outcome === "failed") {
2079
+ span.setStatus({
2080
+ code: SpanStatusCode.ERROR,
2081
+ ...(evt.failureKind ? { message: evt.failureKind } : {}),
2082
+ });
2083
+ }
2084
+ span.end(evt.ts);
2085
+ };
2086
+
613
2087
  const recordHeartbeat = (
614
2088
  evt: Extract<DiagnosticEventPayload, { type: "diagnostic.heartbeat" }>,
615
2089
  ) => {
616
2090
  queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" });
617
2091
  };
618
2092
 
619
- unsubscribe = onDiagnosticEvent((evt: DiagnosticEventPayload) => {
2093
+ const recordLivenessWarning = (
2094
+ evt: Extract<DiagnosticEventPayload, { type: "diagnostic.liveness.warning" }>,
2095
+ ) => {
2096
+ const reason = evt.reasons.join(":");
2097
+ const attrs = {
2098
+ "openclaw.liveness.reason": lowCardinalityAttr(reason, "unknown"),
2099
+ };
2100
+ livenessWarningCounter.add(1, attrs);
2101
+ queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" });
2102
+ if (evt.eventLoopDelayP99Ms !== undefined) {
2103
+ livenessEventLoopDelayP99Histogram.record(evt.eventLoopDelayP99Ms, attrs);
2104
+ }
2105
+ if (evt.eventLoopDelayMaxMs !== undefined) {
2106
+ livenessEventLoopDelayMaxHistogram.record(evt.eventLoopDelayMaxMs, attrs);
2107
+ }
2108
+ if (evt.eventLoopUtilization !== undefined) {
2109
+ livenessEventLoopUtilizationHistogram.record(evt.eventLoopUtilization, attrs);
2110
+ }
2111
+ if (evt.cpuCoreRatio !== undefined) {
2112
+ livenessCpuCoreRatioHistogram.record(evt.cpuCoreRatio, attrs);
2113
+ }
2114
+ if (!tracesEnabled) {
2115
+ return;
2116
+ }
2117
+ const spanAttrs: Record<string, string | number> = {
2118
+ ...attrs,
2119
+ "openclaw.liveness.active": evt.active,
2120
+ "openclaw.liveness.waiting": evt.waiting,
2121
+ "openclaw.liveness.queued": evt.queued,
2122
+ "openclaw.liveness.interval_ms": evt.intervalMs,
2123
+ ...(evt.eventLoopDelayP99Ms !== undefined
2124
+ ? { "openclaw.liveness.event_loop_delay_p99_ms": evt.eventLoopDelayP99Ms }
2125
+ : {}),
2126
+ ...(evt.eventLoopDelayMaxMs !== undefined
2127
+ ? { "openclaw.liveness.event_loop_delay_max_ms": evt.eventLoopDelayMaxMs }
2128
+ : {}),
2129
+ ...(evt.eventLoopUtilization !== undefined
2130
+ ? { "openclaw.liveness.event_loop_utilization": evt.eventLoopUtilization }
2131
+ : {}),
2132
+ ...(evt.cpuUserMs !== undefined
2133
+ ? { "openclaw.liveness.cpu_user_ms": evt.cpuUserMs }
2134
+ : {}),
2135
+ ...(evt.cpuSystemMs !== undefined
2136
+ ? { "openclaw.liveness.cpu_system_ms": evt.cpuSystemMs }
2137
+ : {}),
2138
+ ...(evt.cpuTotalMs !== undefined
2139
+ ? { "openclaw.liveness.cpu_total_ms": evt.cpuTotalMs }
2140
+ : {}),
2141
+ ...(evt.cpuCoreRatio !== undefined
2142
+ ? { "openclaw.liveness.cpu_core_ratio": evt.cpuCoreRatio }
2143
+ : {}),
2144
+ };
2145
+ const span = spanWithDuration("openclaw.liveness.warning", spanAttrs, 0, {
2146
+ endTimeMs: evt.ts,
2147
+ });
2148
+ span.setStatus({
2149
+ code: SpanStatusCode.ERROR,
2150
+ message: reason,
2151
+ });
2152
+ span.end(evt.ts);
2153
+ };
2154
+
2155
+ const recordTelemetryExporter = (
2156
+ evt: TelemetryExporterDiagnosticEvent,
2157
+ metadata: DiagnosticEventMetadata,
2158
+ ) => {
2159
+ if (!metadata.trusted) {
2160
+ return;
2161
+ }
2162
+ telemetryExporterCounter.add(1, {
2163
+ "openclaw.exporter": lowCardinalityAttr(evt.exporter, "unknown"),
2164
+ "openclaw.signal": evt.signal,
2165
+ "openclaw.status": evt.status,
2166
+ ...(evt.reason ? { "openclaw.reason": evt.reason } : {}),
2167
+ ...(evt.errorCategory
2168
+ ? { "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other") }
2169
+ : {}),
2170
+ });
2171
+ };
2172
+
2173
+ const subscribe = ctx.internalDiagnostics?.onEvent;
2174
+ if (!subscribe) {
2175
+ ctx.logger.error("diagnostics-otel: internal diagnostics capability unavailable");
2176
+ return;
2177
+ }
2178
+
2179
+ unsubscribe = subscribe((evt: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => {
620
2180
  try {
621
2181
  switch (evt.type) {
622
2182
  case "model.usage":
623
- recordModelUsage(evt);
2183
+ recordModelUsage(evt, metadata);
624
2184
  return;
625
2185
  case "webhook.received":
626
2186
  recordWebhookReceived(evt);
@@ -637,6 +2197,15 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
637
2197
  case "message.processed":
638
2198
  recordMessageProcessed(evt);
639
2199
  return;
2200
+ case "message.delivery.started":
2201
+ recordMessageDeliveryStarted(evt);
2202
+ return;
2203
+ case "message.delivery.completed":
2204
+ recordMessageDeliveryCompleted(evt);
2205
+ return;
2206
+ case "message.delivery.error":
2207
+ recordMessageDeliveryError(evt);
2208
+ return;
640
2209
  case "queue.lane.enqueue":
641
2210
  recordLaneEnqueue(evt);
642
2211
  return;
@@ -646,15 +2215,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
646
2215
  case "session.state":
647
2216
  recordSessionState(evt);
648
2217
  return;
2218
+ case "session.long_running":
2219
+ case "session.stalled":
2220
+ return;
649
2221
  case "session.stuck":
650
2222
  recordSessionStuck(evt);
651
2223
  return;
652
2224
  case "run.attempt":
653
2225
  recordRunAttempt(evt);
654
2226
  return;
2227
+ case "run.progress":
2228
+ return;
655
2229
  case "diagnostic.heartbeat":
656
2230
  recordHeartbeat(evt);
657
2231
  return;
2232
+ case "diagnostic.liveness.warning":
2233
+ recordLivenessWarning(evt);
2234
+ return;
2235
+ case "run.started":
2236
+ recordRunStarted(evt, metadata);
2237
+ return;
2238
+ case "run.completed":
2239
+ recordRunCompleted(evt, metadata);
2240
+ return;
2241
+ case "harness.run.started":
2242
+ recordHarnessRunStarted(evt, metadata);
2243
+ return;
2244
+ case "harness.run.completed":
2245
+ recordHarnessRunCompleted(evt, metadata);
2246
+ return;
2247
+ case "harness.run.error":
2248
+ recordHarnessRunError(evt, metadata);
2249
+ return;
2250
+ case "context.assembled":
2251
+ recordContextAssembled(evt, metadata);
2252
+ return;
2253
+ case "model.call.started":
2254
+ recordModelCallStarted(evt, metadata);
2255
+ return;
2256
+ case "model.call.completed":
2257
+ recordModelCallCompleted(evt, metadata);
2258
+ return;
2259
+ case "model.call.error":
2260
+ recordModelCallError(evt, metadata);
2261
+ return;
2262
+ case "tool.execution.started":
2263
+ recordToolExecutionStarted(evt, metadata);
2264
+ return;
2265
+ case "tool.execution.completed":
2266
+ recordToolExecutionCompleted(evt, metadata);
2267
+ return;
2268
+ case "tool.execution.error":
2269
+ recordToolExecutionError(evt, metadata);
2270
+ return;
2271
+ case "tool.execution.blocked":
2272
+ recordToolExecutionBlocked(evt, metadata);
2273
+ return;
2274
+ case "exec.process.completed":
2275
+ recordExecProcessCompleted(evt);
2276
+ return;
2277
+ case "log.record":
2278
+ recordLogRecord?.(evt, metadata);
2279
+ return;
2280
+ case "tool.loop":
2281
+ recordToolLoop(evt);
2282
+ return;
2283
+ case "diagnostic.memory.sample":
2284
+ recordMemorySample(evt);
2285
+ return;
2286
+ case "diagnostic.memory.pressure":
2287
+ recordMemoryPressure(evt);
2288
+ return;
2289
+ case "telemetry.exporter":
2290
+ recordTelemetryExporter(evt, metadata);
2291
+ return;
2292
+ case "payload.large":
2293
+ return;
658
2294
  }
659
2295
  } catch (err) {
660
2296
  ctx.logger.error(
@@ -663,23 +2299,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
663
2299
  }
664
2300
  });
665
2301
 
2302
+ emitForSignals(enabledSignals, {
2303
+ exporter: "diagnostics-otel",
2304
+ status: "started",
2305
+ reason: "configured",
2306
+ });
2307
+
666
2308
  if (logsEnabled) {
667
2309
  ctx.logger.info("diagnostics-otel: logs exporter enabled (OTLP/Protobuf)");
668
2310
  }
669
2311
  },
670
2312
  async stop() {
671
- unsubscribe?.();
672
- unsubscribe = null;
673
- stopLogTransport?.();
674
- stopLogTransport = null;
675
- if (logProvider) {
676
- await logProvider.shutdown().catch(() => undefined);
677
- logProvider = null;
678
- }
679
- if (sdk) {
680
- await sdk.shutdown().catch(() => undefined);
681
- sdk = null;
682
- }
2313
+ await stopStarted();
683
2314
  },
684
2315
  } satisfies OpenClawPluginService;
685
2316
  }