@aigentsphere/openclaw-otel-observability 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +52 -0
- package/.github/workflows/docs.yml +25 -0
- package/LICENSE +15 -0
- package/README.md +300 -0
- package/collector/README.md +186 -0
- package/collector/otel-collector-config.yaml +230 -0
- package/docker-compose.yaml +32 -0
- package/docs/architecture.md +319 -0
- package/docs/backends/dynatrace.md +168 -0
- package/docs/backends/generic-otlp.md +166 -0
- package/docs/backends/grafana.md +167 -0
- package/docs/backends/index.md +49 -0
- package/docs/backends/otel-collector.md +210 -0
- package/docs/configuration.md +276 -0
- package/docs/development.md +198 -0
- package/docs/getting-started.md +295 -0
- package/docs/index.md +139 -0
- package/docs/limitations.md +95 -0
- package/docs/security/detection.md +274 -0
- package/docs/security/tetragon.md +454 -0
- package/docs/telemetry/metrics.md +283 -0
- package/docs/telemetry/tokens.md +188 -0
- package/docs/telemetry/traces.md +165 -0
- package/dynatrace/security-slo-dql.md +263 -0
- package/index.ts +191 -0
- package/instrumentation/preload.mjs +59 -0
- package/mkdocs.yml +90 -0
- package/openclaw.plugin.json +99 -0
- package/package.json +49 -0
- package/src/config.ts +72 -0
- package/src/diagnostics.ts +214 -0
- package/src/hooks.ts +575 -0
- package/src/openllmetry.ts +27 -0
- package/src/security.ts +396 -0
- package/src/telemetry.ts +282 -0
- package/tetragon-policies/01-process-exec.yaml +20 -0
- package/tetragon-policies/02-sensitive-files.yaml +86 -0
- package/tetragon-policies/04-privilege-escalation.yaml +25 -0
- package/tetragon-policies/05-dangerous-commands.yaml +97 -0
- package/tetragon-policies/06-kernel-modules.yaml +27 -0
- package/tetragon-policies/07-prompt-injection-shell.yaml +73 -0
- package/tetragon-policies/README.md +143 -0
- package/tsconfig.json +17 -0
package/package.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aigentsphere/openclaw-otel-observability",
|
|
3
|
+
"version": "0.2.1",
|
|
4
|
+
"description": "OpenTelemetry observability plugin for OpenClaw — traces, metrics, and logs for your AI agent using OpenLLMetry",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"openclaw": {
|
|
7
|
+
"extensions": ["./index.ts"]
|
|
8
|
+
},
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "https://github.com/aigentsphere/openclaw-observability-plugin.git"
|
|
12
|
+
},
|
|
13
|
+
"publishConfig": {
|
|
14
|
+
"registry": "https://registry.npmjs.org",
|
|
15
|
+
"access": "public"
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"typecheck": "tsc --noEmit",
|
|
19
|
+
"lint": "eslint src/ index.ts"
|
|
20
|
+
},
|
|
21
|
+
"keywords": [
|
|
22
|
+
"openclaw",
|
|
23
|
+
"opentelemetry",
|
|
24
|
+
"observability",
|
|
25
|
+
"llm",
|
|
26
|
+
"tracing",
|
|
27
|
+
"metrics",
|
|
28
|
+
"dynatrace",
|
|
29
|
+
"openllmetry"
|
|
30
|
+
],
|
|
31
|
+
"author": "Henrik Rexed",
|
|
32
|
+
"license": "Apache-2.0",
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"@traceloop/node-server-sdk": "^0.22.6",
|
|
35
|
+
"@opentelemetry/api": "^1.9.0",
|
|
36
|
+
"@opentelemetry/sdk-node": "^0.203.0",
|
|
37
|
+
"@opentelemetry/sdk-trace-node": "^2.0.1",
|
|
38
|
+
"@opentelemetry/sdk-metrics": "^2.0.1",
|
|
39
|
+
"@opentelemetry/exporter-trace-otlp-http": "^0.203.0",
|
|
40
|
+
"@opentelemetry/exporter-trace-otlp-grpc": "^0.203.0",
|
|
41
|
+
"@opentelemetry/exporter-metrics-otlp-http": "^0.203.0",
|
|
42
|
+
"@opentelemetry/exporter-metrics-otlp-grpc": "^0.203.0",
|
|
43
|
+
"@opentelemetry/resources": "^2.0.1",
|
|
44
|
+
"@opentelemetry/semantic-conventions": "^1.30.0"
|
|
45
|
+
},
|
|
46
|
+
"devDependencies": {
|
|
47
|
+
"typescript": "^5.7.0"
|
|
48
|
+
}
|
|
49
|
+
}
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Configuration types and defaults for the OTel Observability plugin.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export interface OtelObservabilityConfig {
  /** OTLP endpoint URL */
  endpoint: string;
  /** OTLP export protocol: 'http' (OTLP/HTTP) or 'grpc' (OTLP/gRPC) */
  protocol: "http" | "grpc";
  /** OpenTelemetry service name */
  serviceName: string;
  /** Custom headers for OTLP export (e.g., Authorization for Dynatrace) */
  headers: Record<string, string>;
  /** Enable trace export */
  traces: boolean;
  /** Enable metrics export */
  metrics: boolean;
  /** Enable log export */
  logs: boolean;
  /** Capture prompt/completion content in spans (disable for privacy) */
  captureContent: boolean;
  /** Metrics export interval in milliseconds */
  metricsIntervalMs: number;
  /** Additional OTel resource attributes */
  resourceAttributes: Record<string, string>;
}

/** Values used for every option that is missing or invalid in the raw config. */
const DEFAULTS: OtelObservabilityConfig = {
  endpoint: "http://localhost:4318",
  protocol: "http",
  serviceName: "openclaw-gateway",
  headers: {},
  traces: true,
  metrics: true,
  logs: true,
  captureContent: false,
  metricsIntervalMs: 30_000,
  resourceAttributes: {},
};

/**
 * Copy a plain object into a fresh string-to-string record.
 *
 * String values are kept as-is; numbers and booleans are converted with
 * `String()`; every other value (null, undefined, objects, arrays, functions)
 * is dropped so the result really matches `Record<string, string>`.
 *
 * @returns the sanitized copy, or `undefined` when `value` is not a plain
 *   (non-array) object.
 */
function toStringRecord(value: unknown): Record<string, string> | undefined {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  const result: Record<string, string> = {};
  for (const [key, entry] of Object.entries(value)) {
    if (typeof entry === "string") {
      result[key] = entry;
    } else if (typeof entry === "number" || typeof entry === "boolean") {
      result[key] = String(entry);
    }
  }
  return result;
}

/**
 * Build a complete plugin configuration from untrusted input.
 *
 * Each option is taken from `raw` only when it has the expected type;
 * otherwise the corresponding default is used. `protocol` is "grpc" only when
 * explicitly set to that string. `metricsIntervalMs` must be a finite number
 * of at least 1000.
 *
 * The returned `headers` and `resourceAttributes` are always fresh objects, so
 * mutating one parsed config never affects the defaults, the caller's input,
 * or another parsed config.
 *
 * @param raw - Arbitrary value, typically the plugin's config section.
 * @returns A fully populated configuration object.
 */
export function parseConfig(raw: unknown): OtelObservabilityConfig {
  const obj: Record<string, unknown> =
    raw && typeof raw === "object" && !Array.isArray(raw)
      ? (raw as Record<string, unknown>)
      : {};

  const interval = obj.metricsIntervalMs;

  return {
    endpoint: typeof obj.endpoint === "string" ? obj.endpoint : DEFAULTS.endpoint,
    protocol: obj.protocol === "grpc" ? "grpc" : DEFAULTS.protocol,
    serviceName:
      typeof obj.serviceName === "string" ? obj.serviceName : DEFAULTS.serviceName,
    // Copy rather than share: DEFAULTS is module-level state.
    headers: toStringRecord(obj.headers) ?? { ...DEFAULTS.headers },
    traces: typeof obj.traces === "boolean" ? obj.traces : DEFAULTS.traces,
    metrics: typeof obj.metrics === "boolean" ? obj.metrics : DEFAULTS.metrics,
    logs: typeof obj.logs === "boolean" ? obj.logs : DEFAULTS.logs,
    captureContent:
      typeof obj.captureContent === "boolean" ? obj.captureContent : DEFAULTS.captureContent,
    // Number.isFinite rejects NaN and ±Infinity, which are not usable intervals.
    metricsIntervalMs:
      typeof interval === "number" && Number.isFinite(interval) && interval >= 1000
        ? interval
        : DEFAULTS.metricsIntervalMs,
    resourceAttributes:
      toStringRecord(obj.resourceAttributes) ?? { ...DEFAULTS.resourceAttributes },
  };
}
|
|
package/src/diagnostics.ts
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnostic events integration — subscribes to OpenClaw's internal diagnostic
|
|
3
|
+
* events to get accurate cost/token data, then enriches our connected traces.
|
|
4
|
+
*
|
|
5
|
+
* This combines the best of both approaches:
|
|
6
|
+
* - Our plugin: Connected traces (request → agent turn → tools)
|
|
7
|
+
* - Official diagnostics: Accurate cost, token counts, context limits
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { Span } from "@opentelemetry/api";
|
|
11
|
+
import type { TelemetryRuntime } from "./telemetry.js";
|
|
12
|
+
|
|
13
|
+
// Import from OpenClaw plugin SDK (loaded lazily)
let onDiagnosticEvent: ((listener: (evt: any) => void) => () => void) | null = null;
/** In-flight or completed SDK load; cached so the import is attempted at most once. */
let sdkLoadPromise: Promise<void> | null = null;

/**
 * Lazily import the OpenClaw plugin SDK and capture its `onDiagnosticEvent`
 * export.
 *
 * The import is attempted at most once. Every caller receives the same
 * promise, so a caller that arrives while the import is still in flight waits
 * for it instead of observing a not-yet-populated `onDiagnosticEvent`.
 *
 * The promise never rejects: if the SDK cannot be imported, or it does not
 * export a function named `onDiagnosticEvent`, `onDiagnosticEvent` stays
 * `null`.
 */
function loadSdk(): Promise<void> {
  if (sdkLoadPromise === null) {
    sdkLoadPromise = (async () => {
      try {
        // Dynamic import to avoid build issues if SDK not available
        // @ts-ignore - openclaw/plugin-sdk types not available at build time
        const sdk = (await import("openclaw/plugin-sdk")) as any;
        // Only accept a real function; a missing export must not be stored as
        // `undefined`, which would pass the `!== null` availability checks.
        if (typeof sdk?.onDiagnosticEvent === "function") {
          onDiagnosticEvent = sdk.onDiagnosticEvent;
        }
      } catch {
        // SDK not available — will use fallback token extraction
      }
    })();
  }
  return sdkLoadPromise;
}
|
|
29
|
+
|
|
30
|
+
/** Token counts carried by a usage record; every field is optional. */
interface PendingTokenUsage {
  input?: number;
  output?: number;
  cacheRead?: number;
  cacheWrite?: number;
  total?: number;
}

/** Context-window figures carried by a usage record. */
interface PendingContextWindow {
  limit?: number;
  used?: number;
}

/** Usage data held until it can be attached to a span. */
interface PendingUsageData {
  costUsd?: number;
  usage: PendingTokenUsage;
  context?: PendingContextWindow;
  durationMs?: number;
  provider?: string;
  model?: string;
}
|
|
48
|
+
|
|
49
|
+
/** Map of sessionKey → most recent pending usage data from a model.usage diagnostic event (events without a sessionKey are stored under "unknown") */
|
|
50
|
+
const pendingUsageMap = new Map<string, PendingUsageData>();
|
|
51
|
+
|
|
52
|
+
/** Map of sessionKey → active agent span (set by hooks.ts) */
|
|
53
|
+
export const activeAgentSpans = new Map<string, Span>();
|
|
54
|
+
|
|
55
|
+
/**
 * Register a diagnostic event listener that captures `model.usage` events.
 *
 * For every `model.usage` event the listener:
 *  - stores the usage data under the event's session key (or "unknown") so it
 *    can be attached to a span later,
 *  - records token, cost, duration and request-count metrics immediately,
 *  - enriches the session's active agent span, if one is registered in
 *    `activeAgentSpans`, and then discards the stored entry.
 *
 * Events of any other type are ignored.
 *
 * @param telemetry - Runtime providing the meter, counters and histograms.
 * @param logger - Host logger; `info` is required, `debug` is optional.
 * @returns The unsubscribe function, or a no-op function when the SDK's
 *   `onDiagnosticEvent` is not available.
 */
export async function registerDiagnosticsListener(
  telemetry: TelemetryRuntime,
  logger: any
): Promise<() => void> {
  // Load the SDK if not already loaded
  await loadSdk();

  if (!onDiagnosticEvent) {
    logger.debug?.("[otel] onDiagnosticEvent not available — using fallback token extraction");
    return () => {};
  }

  const { counters, histograms } = telemetry;

  // Create the cost instrument once instead of on every event.
  const costCounter = telemetry.meter.createCounter("openclaw.llm.cost.usd", {
    description: "Estimated LLM cost in USD",
    unit: "usd",
  });

  // Counters only accept non-negative increments and the event payload is
  // untyped, so accept finite positive numbers only.
  const isPositiveNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value) && value > 0;

  const unsubscribe = onDiagnosticEvent((evt: any) => {
    if (!evt || evt.type !== "model.usage") return;

    const sessionKey = evt.sessionKey || "unknown";
    const usage = evt.usage || {};
    const costUsd = evt.costUsd;
    const model = evt.model || "unknown";
    const provider = evt.provider || "unknown";

    // Store for later attachment to agent span
    pendingUsageMap.set(sessionKey, {
      costUsd,
      usage,
      context: evt.context,
      durationMs: evt.durationMs,
      provider,
      model,
    });

    // Record metrics immediately (don't wait for span)
    const metricAttrs = {
      "gen_ai.response.model": model,
      "openclaw.provider": provider,
    };

    if (isPositiveNumber(usage.input)) {
      counters.tokensPrompt.add(usage.input, metricAttrs);
    }
    if (isPositiveNumber(usage.output)) {
      counters.tokensCompletion.add(usage.output, metricAttrs);
    }
    if (isPositiveNumber(usage.cacheRead)) {
      counters.tokensPrompt.add(usage.cacheRead, { ...metricAttrs, "token.type": "cache_read" });
    }
    if (isPositiveNumber(usage.cacheWrite)) {
      counters.tokensPrompt.add(usage.cacheWrite, { ...metricAttrs, "token.type": "cache_write" });
    }
    if (isPositiveNumber(usage.total)) {
      counters.tokensTotal.add(usage.total, metricAttrs);
    }

    // Record cost metric
    if (isPositiveNumber(costUsd)) {
      costCounter.add(costUsd, metricAttrs);
    }

    // Record LLM duration
    if (typeof evt.durationMs === "number") {
      histograms.llmDuration.record(evt.durationMs, metricAttrs);
    }

    counters.llmRequests.add(1, metricAttrs);

    // If we have an active agent span for this session, enrich it now
    const agentSpan = activeAgentSpans.get(sessionKey);
    if (agentSpan) {
      enrichSpanWithUsage(agentSpan, evt);
      pendingUsageMap.delete(sessionKey);
    }

    // Guard toFixed: a non-numeric cost must not throw inside the listener.
    const costLabel = typeof costUsd === "number" ? costUsd.toFixed(4) : "?";
    // `??` keeps a legitimate total of 0 visible in the log.
    const tokensLabel = usage.total ?? "?";
    logger.debug?.(`[otel] model.usage: session=${sessionKey}, model=${model}, cost=$${costLabel}, tokens=${tokensLabel}`);
  });

  logger.info("[otel] Subscribed to OpenClaw diagnostic events (model.usage, etc.)");
  return unsubscribe;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Take the pending usage data stored for a session.
 *
 * The entry is removed from the pending map as it is returned, so a second
 * call for the same session yields `undefined` until new data is stored.
 *
 * @param sessionKey - Session whose pending usage data is requested.
 * @returns The stored usage data, or `undefined` when none is pending.
 */
export function getPendingUsage(sessionKey: string): PendingUsageData | undefined {
  const pending = pendingUsageMap.get(sessionKey);
  if (!pending) {
    return pending;
  }
  pendingUsageMap.delete(sessionKey);
  return pending;
}
|
|
154
|
+
|
|
155
|
+
/**
 * Copy usage data onto a span as attributes.
 *
 * Numeric fields (token counts, cost, context window) are written whenever
 * they are not `undefined`, so a value of 0 is still recorded. Provider and
 * model are written only when they are truthy (non-empty).
 *
 * @param span - Span to annotate.
 * @param data - Usage data; a missing `usage` object is treated as empty.
 */
export function enrichSpanWithUsage(span: Span, data: PendingUsageData): void {
  const tokens = data.usage || {};

  // Attributes written whenever a value is present (0 included), in order:
  // GenAI semantic-convention token counts, then the custom cost attribute
  // (not in GenAI semconv yet), then the context window.
  const numericAttrs: Array<[string, number | undefined]> = [
    ["gen_ai.usage.input_tokens", tokens.input],
    ["gen_ai.usage.output_tokens", tokens.output],
    ["gen_ai.usage.total_tokens", tokens.total],
    ["gen_ai.usage.cache_read_tokens", tokens.cacheRead],
    ["gen_ai.usage.cache_write_tokens", tokens.cacheWrite],
    ["openclaw.llm.cost_usd", data.costUsd],
    ["openclaw.context.limit", data.context?.limit],
    ["openclaw.context.used", data.context?.used],
  ];
  for (const [attrName, attrValue] of numericAttrs) {
    if (attrValue !== undefined) {
      span.setAttribute(attrName, attrValue);
    }
  }

  // Provider/model: skipped when empty.
  const labelAttrs: Array<[string, string | undefined]> = [
    ["gen_ai.system", data.provider],
    ["gen_ai.response.model", data.model],
  ];
  for (const [attrName, attrValue] of labelAttrs) {
    if (attrValue) {
      span.setAttribute(attrName, attrValue);
    }
  }
}
|
|
199
|
+
|
|
200
|
+
/**
 * Check if diagnostic events are available.
 *
 * Synchronous: it reports only the current state and never triggers an SDK
 * load, so it is accurate only after registerDiagnosticsListener() or
 * checkDiagnosticsSupport() has completed.
 *
 * @returns `true` only when a callable `onDiagnosticEvent` has been captured
 *   from the SDK.
 */
export function hasDiagnosticsSupport(): boolean {
  // A function check (rather than `!== null`) also rejects `undefined`, which
  // is what an SDK without this export would have left in the variable.
  return typeof onDiagnosticEvent === "function";
}
|
|
207
|
+
|
|
208
|
+
/**
 * Async check for diagnostics support (loads SDK if needed).
 *
 * @returns A promise resolving to `true` only when, after the SDK load
 *   attempt, a callable `onDiagnosticEvent` is available.
 */
export async function checkDiagnosticsSupport(): Promise<boolean> {
  await loadSdk();
  // A function check (rather than `!== null`) also rejects `undefined`, which
  // is what an SDK without this export would have left in the variable.
  return typeof onDiagnosticEvent === "function";
}
|