@aigentsphere/openclaw-otel-observability 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.github/workflows/ci.yml +52 -0
  2. package/.github/workflows/docs.yml +25 -0
  3. package/LICENSE +15 -0
  4. package/README.md +300 -0
  5. package/collector/README.md +186 -0
  6. package/collector/otel-collector-config.yaml +230 -0
  7. package/docker-compose.yaml +32 -0
  8. package/docs/architecture.md +319 -0
  9. package/docs/backends/dynatrace.md +168 -0
  10. package/docs/backends/generic-otlp.md +166 -0
  11. package/docs/backends/grafana.md +167 -0
  12. package/docs/backends/index.md +49 -0
  13. package/docs/backends/otel-collector.md +210 -0
  14. package/docs/configuration.md +276 -0
  15. package/docs/development.md +198 -0
  16. package/docs/getting-started.md +295 -0
  17. package/docs/index.md +139 -0
  18. package/docs/limitations.md +95 -0
  19. package/docs/security/detection.md +274 -0
  20. package/docs/security/tetragon.md +454 -0
  21. package/docs/telemetry/metrics.md +283 -0
  22. package/docs/telemetry/tokens.md +188 -0
  23. package/docs/telemetry/traces.md +165 -0
  24. package/dynatrace/security-slo-dql.md +263 -0
  25. package/index.ts +191 -0
  26. package/instrumentation/preload.mjs +59 -0
  27. package/mkdocs.yml +90 -0
  28. package/openclaw.plugin.json +99 -0
  29. package/package.json +49 -0
  30. package/src/config.ts +72 -0
  31. package/src/diagnostics.ts +214 -0
  32. package/src/hooks.ts +575 -0
  33. package/src/openllmetry.ts +27 -0
  34. package/src/security.ts +396 -0
  35. package/src/telemetry.ts +282 -0
  36. package/tetragon-policies/01-process-exec.yaml +20 -0
  37. package/tetragon-policies/02-sensitive-files.yaml +86 -0
  38. package/tetragon-policies/04-privilege-escalation.yaml +25 -0
  39. package/tetragon-policies/05-dangerous-commands.yaml +97 -0
  40. package/tetragon-policies/06-kernel-modules.yaml +27 -0
  41. package/tetragon-policies/07-prompt-injection-shell.yaml +73 -0
  42. package/tetragon-policies/README.md +143 -0
  43. package/tsconfig.json +17 -0
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@aigentsphere/openclaw-otel-observability",
3
+ "version": "0.2.1",
4
+ "description": "OpenTelemetry observability plugin for OpenClaw — traces, metrics, and logs for your AI agent using OpenLLMetry",
5
+ "type": "module",
6
+ "openclaw": {
7
+ "extensions": ["./index.ts"]
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "https://github.com/aigentsphere/openclaw-observability-plugin.git"
12
+ },
13
+ "publishConfig": {
14
+ "registry": "https://registry.npmjs.org",
15
+ "access": "public"
16
+ },
17
+ "scripts": {
18
+ "typecheck": "tsc --noEmit",
19
+ "lint": "eslint src/ index.ts"
20
+ },
21
+ "keywords": [
22
+ "openclaw",
23
+ "opentelemetry",
24
+ "observability",
25
+ "llm",
26
+ "tracing",
27
+ "metrics",
28
+ "dynatrace",
29
+ "openllmetry"
30
+ ],
31
+ "author": "Henrik Rexed",
32
+ "license": "Apache-2.0",
33
+ "dependencies": {
34
+ "@traceloop/node-server-sdk": "^0.22.6",
35
+ "@opentelemetry/api": "^1.9.0",
36
+ "@opentelemetry/sdk-node": "^0.203.0",
37
+ "@opentelemetry/sdk-trace-node": "^2.0.1",
38
+ "@opentelemetry/sdk-metrics": "^2.0.1",
39
+ "@opentelemetry/exporter-trace-otlp-http": "^0.203.0",
40
+ "@opentelemetry/exporter-trace-otlp-grpc": "^0.203.0",
41
+ "@opentelemetry/exporter-metrics-otlp-http": "^0.203.0",
42
+ "@opentelemetry/exporter-metrics-otlp-grpc": "^0.203.0",
43
+ "@opentelemetry/resources": "^2.0.1",
44
+ "@opentelemetry/semantic-conventions": "^1.30.0"
45
+ },
46
+ "devDependencies": {
47
+ "typescript": "^5.7.0"
48
+ }
49
+ }
package/src/config.ts ADDED
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Configuration types and defaults for the OTel Observability plugin.
3
+ */
4
+
5
/**
 * User-facing configuration of the OTel Observability plugin.
 * Every field is always present after {@link parseConfig}; missing or invalid
 * input falls back to the matching entry in `DEFAULTS`.
 */
export interface OtelObservabilityConfig {
  /** OTLP endpoint URL */
  endpoint: string;
  /** OTLP export protocol: 'http' (OTLP/HTTP) or 'grpc' (OTLP/gRPC) */
  protocol: "http" | "grpc";
  /** OpenTelemetry service name */
  serviceName: string;
  /** Custom headers for OTLP export (e.g., Authorization for Dynatrace) */
  headers: Record<string, string>;
  /** Enable trace export */
  traces: boolean;
  /** Enable metrics export */
  metrics: boolean;
  /** Enable log export */
  logs: boolean;
  /** Capture prompt/completion content in spans (disable for privacy) */
  captureContent: boolean;
  /** Metrics export interval in milliseconds (values below 1000 are rejected) */
  metricsIntervalMs: number;
  /** Additional OTel resource attributes */
  resourceAttributes: Record<string, string>;
}

/** Fallback values used for every field that is missing or invalid in the raw config. */
const DEFAULTS: OtelObservabilityConfig = {
  endpoint: "http://localhost:4318",
  protocol: "http",
  serviceName: "openclaw-gateway",
  headers: {},
  traces: true,
  metrics: true,
  logs: true,
  captureContent: false,
  metricsIntervalMs: 30_000,
  resourceAttributes: {},
};

/** True for non-null, non-array objects. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Build a fresh string-to-string record from an untrusted value.
 *
 * String entries are kept, number/boolean entries are stringified, and every
 * other entry (objects, arrays, null, undefined, ...) is dropped so the result
 * really matches `Record<string, string>`. When `value` is not a plain object
 * a copy of `fallback` is returned; the fallback itself is never handed out.
 */
function toStringRecord(
  value: unknown,
  fallback: Record<string, string>
): Record<string, string> {
  if (!isPlainRecord(value)) {
    return { ...fallback };
  }
  const pairs: Array<[string, string]> = [];
  for (const [key, entry] of Object.entries(value)) {
    if (typeof entry === "string") {
      pairs.push([key, entry]);
    } else if (typeof entry === "number" || typeof entry === "boolean") {
      pairs.push([key, String(entry)]);
    }
  }
  return Object.fromEntries(pairs);
}

/** Return `value` when it is a boolean, otherwise `fallback`. */
function booleanOr(value: unknown, fallback: boolean): boolean {
  return typeof value === "boolean" ? value : fallback;
}

/**
 * Normalize an untrusted plugin configuration into a complete config object.
 *
 * @param raw - Raw configuration value; anything that is not a plain object is
 *   treated as an empty configuration.
 * @returns A config in which every field is populated. `headers` and
 *   `resourceAttributes` are always fresh objects, so mutating the result
 *   never leaks into the defaults or into other parsed configs.
 */
export function parseConfig(raw: unknown): OtelObservabilityConfig {
  const obj = isPlainRecord(raw) ? raw : {};

  // Only finite values of at least one second are accepted; NaN, Infinity and
  // sub-second intervals fall back to the default.
  const interval = obj.metricsIntervalMs;
  const metricsIntervalMs =
    typeof interval === "number" && Number.isFinite(interval) && interval >= 1000
      ? interval
      : DEFAULTS.metricsIntervalMs;

  return {
    endpoint: typeof obj.endpoint === "string" ? obj.endpoint : DEFAULTS.endpoint,
    // Anything other than the literal "grpc" selects the default protocol.
    protocol: obj.protocol === "grpc" ? "grpc" : DEFAULTS.protocol,
    serviceName:
      typeof obj.serviceName === "string" ? obj.serviceName : DEFAULTS.serviceName,
    headers: toStringRecord(obj.headers, DEFAULTS.headers),
    traces: booleanOr(obj.traces, DEFAULTS.traces),
    metrics: booleanOr(obj.metrics, DEFAULTS.metrics),
    logs: booleanOr(obj.logs, DEFAULTS.logs),
    captureContent: booleanOr(obj.captureContent, DEFAULTS.captureContent),
    metricsIntervalMs,
    resourceAttributes: toStringRecord(obj.resourceAttributes, DEFAULTS.resourceAttributes),
  };
}
package/src/diagnostics.ts ADDED
@@ -0,0 +1,214 @@
1
+ /**
2
+ * Diagnostic events integration — subscribes to OpenClaw's internal diagnostic
3
+ * events to get accurate cost/token data, then enriches our connected traces.
4
+ *
5
+ * This combines the best of both approaches:
6
+ * - Our plugin: Connected traces (request → agent turn → tools)
7
+ * - Official diagnostics: Accurate cost, token counts, context limits
8
+ */
9
+
10
+ import type { Span } from "@opentelemetry/api";
11
+ import type { TelemetryRuntime } from "./telemetry.js";
12
+
13
// Binding to the OpenClaw plugin SDK's diagnostic-event subscription, resolved
// lazily by loadSdk(). It stays null until the SDK has been imported AND
// exposes `onDiagnosticEvent` as a function.
let onDiagnosticEvent: ((listener: (evt: any) => void) => () => void) | null = null;
// Memoized load: every caller, including ones that arrive while the import is
// still in flight, receives this same promise.
let sdkLoadPromise: Promise<void> | null = null;

/**
 * Import `openclaw/plugin-sdk` at most once and capture its
 * `onDiagnosticEvent` export.
 *
 * The returned promise never rejects: when the import fails the binding simply
 * stays null. Because the promise is cached, concurrent callers all wait for
 * the same import instead of returning before it has finished.
 *
 * @returns A promise that settles once the single load attempt has completed.
 */
function loadSdk(): Promise<void> {
  if (sdkLoadPromise === null) {
    sdkLoadPromise = (async () => {
      try {
        // Dynamic import to avoid build issues if SDK not available
        // @ts-ignore - openclaw/plugin-sdk types not available at build time
        const sdk = (await import("openclaw/plugin-sdk")) as any;
        // Only accept a real function; a missing export must not be stored as
        // `undefined`, which would slip past `!== null` checks.
        onDiagnosticEvent =
          typeof sdk?.onDiagnosticEvent === "function" ? sdk.onDiagnosticEvent : null;
      } catch {
        // SDK not available — will use fallback token extraction
      }
    })();
  }
  return sdkLoadPromise;
}
29
+
30
/** Token counts carried by a usage record; every field is optional. */
interface PendingTokenUsage {
  input?: number;
  output?: number;
  cacheRead?: number;
  cacheWrite?: number;
  total?: number;
}

/** Context-window figures carried by a usage record. */
interface PendingContextWindow {
  limit?: number;
  used?: number;
}

/** Usage data captured from a diagnostic event, waiting to be attached to a span. */
interface PendingUsageData {
  costUsd?: number;
  usage: PendingTokenUsage;
  context?: PendingContextWindow;
  durationMs?: number;
  provider?: string;
  model?: string;
}

/** Pending usage data from diagnostic events, keyed by sessionKey. */
const pendingUsageMap: Map<string, PendingUsageData> = new Map();

/** Active agent span per sessionKey (populated outside this module, by hooks.ts). */
export const activeAgentSpans: Map<string, Span> = new Map();
54
+
55
/**
 * Subscribe to OpenClaw diagnostic events and turn each `model.usage` event
 * into metrics and span attributes.
 *
 * For every `model.usage` event the listener:
 * - enriches the active agent span of the event's session, or — when no span
 *   is registered in `activeAgentSpans` — stores the usage in `pendingUsageMap`
 *   so it can be picked up later via `getPendingUsage`;
 * - records token, cost, duration and request-count metrics.
 *
 * @param telemetry - Runtime providing `counters`, `histograms` and `meter`.
 * @param logger - Logger; `debug` and `info` are both treated as optional.
 * @returns The unsubscribe function, or a no-op when the SDK does not provide
 *   `onDiagnosticEvent`.
 */
export async function registerDiagnosticsListener(
  telemetry: TelemetryRuntime,
  logger: any
): Promise<() => void> {
  // Load the SDK if not already loaded
  await loadSdk();

  if (typeof onDiagnosticEvent !== "function") {
    logger.debug?.("[otel] onDiagnosticEvent not available — using fallback token extraction");
    return () => {};
  }

  const { counters, histograms } = telemetry;

  // Created once per registration instead of once per event.
  const costCounter = telemetry.meter.createCounter("openclaw.llm.cost.usd", {
    description: "Estimated LLM cost in USD",
    unit: "usd",
  });

  // Counters only take positive finite numbers; anything else is skipped.
  const isPositiveNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value) && value > 0;

  const unsubscribe = onDiagnosticEvent((evt: any) => {
    // Tolerate null/undefined events instead of throwing inside the host's emitter.
    if (evt?.type !== "model.usage") return;

    const sessionKey = evt.sessionKey || "unknown";
    const usage = evt.usage || {};
    const costUsd = evt.costUsd;
    const model = evt.model || "unknown";
    const provider = evt.provider || "unknown";

    // Enrich the live agent span when there is one; otherwise keep the data
    // for later attachment. A stale pending entry is dropped in the first case.
    const agentSpan = activeAgentSpans.get(sessionKey);
    if (agentSpan) {
      enrichSpanWithUsage(agentSpan, evt);
      pendingUsageMap.delete(sessionKey);
    } else {
      pendingUsageMap.set(sessionKey, {
        costUsd,
        usage,
        context: evt.context,
        durationMs: evt.durationMs,
        provider,
        model,
      });
    }

    // Record metrics immediately (don't wait for span)
    const metricAttrs = {
      "gen_ai.response.model": model,
      "openclaw.provider": provider,
    };

    if (isPositiveNumber(usage.input)) {
      counters.tokensPrompt.add(usage.input, metricAttrs);
    }
    if (isPositiveNumber(usage.output)) {
      counters.tokensCompletion.add(usage.output, metricAttrs);
    }
    if (isPositiveNumber(usage.cacheRead)) {
      counters.tokensPrompt.add(usage.cacheRead, { ...metricAttrs, "token.type": "cache_read" });
    }
    if (isPositiveNumber(usage.cacheWrite)) {
      counters.tokensPrompt.add(usage.cacheWrite, { ...metricAttrs, "token.type": "cache_write" });
    }
    if (isPositiveNumber(usage.total)) {
      counters.tokensTotal.add(usage.total, metricAttrs);
    }

    // Record cost metric
    if (isPositiveNumber(costUsd)) {
      costCounter.add(costUsd, metricAttrs);
    }

    // Record LLM duration (negative or non-finite durations are ignored)
    const durationMs = evt.durationMs;
    if (typeof durationMs === "number" && Number.isFinite(durationMs) && durationMs >= 0) {
      histograms.llmDuration.record(durationMs, metricAttrs);
    }

    counters.llmRequests.add(1, metricAttrs);

    // Format the cost defensively: calling toFixed on a non-number would throw.
    const costText = typeof costUsd === "number" ? costUsd.toFixed(4) : "?";
    logger.debug?.(`[otel] model.usage: session=${sessionKey}, model=${model}, cost=$${costText}, tokens=${usage.total || "?"}`);
  });

  // Optional call, consistent with debug: a logger without `info` must not
  // throw after the subscription exists, which would lose the unsubscribe handle.
  logger.info?.("[otel] Subscribed to OpenClaw diagnostic events (model.usage, etc.)");
  return unsubscribe;
}
142
+
143
/**
 * Take the pending usage data of a session: the entry is returned and removed
 * from `pendingUsageMap` in one step, so a second call yields `undefined`.
 * Per the original note, this is meant for the agent_end hook.
 *
 * @param sessionKey - Session whose pending usage should be consumed.
 * @returns The stored usage data, or `undefined` when none is stored.
 */
export function getPendingUsage(sessionKey: string): PendingUsageData | undefined {
  const stashed = pendingUsageMap.get(sessionKey);
  if (!stashed) {
    return stashed;
  }
  pendingUsageMap.delete(sessionKey);
  return stashed;
}
154
+
155
/**
 * Copy usage data onto a span as attributes.
 *
 * Numeric fields are written whenever they are not `undefined` (so `0` is
 * kept); provider and model are written only when they are truthy.
 *
 * @param span - Span that receives the attributes.
 * @param data - Usage data; a missing `usage` object is treated as empty.
 */
export function enrichSpanWithUsage(span: Span, data: PendingUsageData): void {
  const tokens = data.usage || {};

  // Attribute name/value pairs in the order they are written to the span:
  // GenAI token usage first, then the custom cost and context-window attributes.
  const numericAttributes: Array<[string, number | undefined]> = [
    ["gen_ai.usage.input_tokens", tokens.input],
    ["gen_ai.usage.output_tokens", tokens.output],
    ["gen_ai.usage.total_tokens", tokens.total],
    ["gen_ai.usage.cache_read_tokens", tokens.cacheRead],
    ["gen_ai.usage.cache_write_tokens", tokens.cacheWrite],
    ["openclaw.llm.cost_usd", data.costUsd],
    ["openclaw.context.limit", data.context?.limit],
    ["openclaw.context.used", data.context?.used],
  ];

  for (const [attribute, value] of numericAttributes) {
    if (value !== undefined) {
      span.setAttribute(attribute, value);
    }
  }

  // Provider/model: empty strings are skipped as well.
  const { provider, model } = data;
  if (provider) {
    span.setAttribute("gen_ai.system", provider);
  }
  if (model) {
    span.setAttribute("gen_ai.response.model", model);
  }
}
199
+
200
/**
 * Check if diagnostic events are available.
 * Note: Only accurate after registerDiagnosticsListener() has been called.
 *
 * The check is for a callable binding rather than `!== null`, so an SDK that
 * loads without exporting `onDiagnosticEvent` (leaving the binding
 * `undefined`) is correctly reported as unsupported.
 *
 * @returns `true` when `onDiagnosticEvent` is currently a function.
 */
export function hasDiagnosticsSupport(): boolean {
  return typeof onDiagnosticEvent === "function";
}
207
+
208
/**
 * Async check for diagnostics support (loads SDK if needed).
 *
 * The check is for a callable binding rather than `!== null`, so an SDK that
 * loads without exporting `onDiagnosticEvent` (leaving the binding
 * `undefined`) is correctly reported as unsupported.
 *
 * @returns `true` when, after the load attempt, `onDiagnosticEvent` is a function.
 */
export async function checkDiagnosticsSupport(): Promise<boolean> {
  await loadSdk();
  return typeof onDiagnosticEvent === "function";
}