@musashishao/agent-kit 1.6.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/.shared/ui-ux-pro-max/data/charts.csv +26 -0
- package/.agent/.shared/ui-ux-pro-max/data/colors.csv +97 -0
- package/.agent/.shared/ui-ux-pro-max/data/icons.csv +101 -0
- package/.agent/.shared/ui-ux-pro-max/data/landing.csv +31 -0
- package/.agent/.shared/ui-ux-pro-max/data/products.csv +97 -0
- package/.agent/.shared/ui-ux-pro-max/data/prompts.csv +24 -0
- package/.agent/.shared/ui-ux-pro-max/data/react-performance.csv +45 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/flutter.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nextjs.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/react-native.csv +52 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/react.csv +54 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/shadcn.csv +61 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/svelte.csv +54 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/swiftui.csv +51 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/vue.csv +50 -0
- package/.agent/.shared/ui-ux-pro-max/data/styles.csv +59 -0
- package/.agent/.shared/ui-ux-pro-max/data/typography.csv +58 -0
- package/.agent/.shared/ui-ux-pro-max/data/ui-reasoning.csv +101 -0
- package/.agent/.shared/ui-ux-pro-max/data/ux-guidelines.csv +100 -0
- package/.agent/.shared/ui-ux-pro-max/data/web-interface.csv +31 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/core.py +258 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/design_system.py +487 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/search.py +76 -0
- package/.agent/adr/ADR-TEMPLATE.md +57 -0
- package/.agent/adr/README.md +30 -0
- package/.agent/agents/backend-specialist.md +1 -1
- package/.agent/agents/devops-engineer.md +1 -1
- package/.agent/agents/performance-optimizer.md +1 -1
- package/.agent/agents/security-auditor.md +1 -1
- package/.agent/dashboard/index.html +169 -0
- package/.agent/rules/REFERENCE.md +14 -0
- package/.agent/skills/ai-incident-management/SKILL.md +517 -0
- package/.agent/skills/ai-security-guardrails/SKILL.md +405 -0
- package/.agent/skills/ai-security-guardrails/owasp-llm-top10.md +160 -0
- package/.agent/skills/ai-security-guardrails/scripts/prompt_injection_scanner.py +230 -0
- package/.agent/skills/compliance-for-ai/SKILL.md +411 -0
- package/.agent/skills/observability-patterns/SKILL.md +484 -0
- package/.agent/skills/observability-patterns/scripts/otel_validator.py +330 -0
- package/.agent/skills/opentelemetry-expert/SKILL.md +738 -0
- package/.agent/skills/opentelemetry-expert/scripts/trace_analyzer.py +351 -0
- package/.agent/skills/privacy-preserving-dev/SKILL.md +442 -0
- package/.agent/skills/privacy-preserving-dev/scripts/pii_scanner.py +285 -0
- package/.agent/workflows/autofix.md +4 -1
- package/.agent/workflows/brainstorm.md +1 -1
- package/.agent/workflows/context.md +3 -1
- package/.agent/workflows/create.md +1 -1
- package/.agent/workflows/dashboard.md +4 -1
- package/.agent/workflows/debug.md +1 -1
- package/.agent/workflows/deploy.md +1 -1
- package/.agent/workflows/enhance.md +1 -1
- package/.agent/workflows/next.md +4 -1
- package/.agent/workflows/orchestrate.md +1 -1
- package/.agent/workflows/plan.md +1 -1
- package/.agent/workflows/preview.md +1 -1
- package/.agent/workflows/quality.md +1 -1
- package/.agent/workflows/spec.md +1 -1
- package/.agent/workflows/status.md +1 -1
- package/.agent/workflows/test.md +1 -1
- package/.agent/workflows/ui-ux-pro-max.md +1 -1
- package/package.json +4 -1

@@ -0,0 +1,738 @@
---
name: opentelemetry-expert
description: Deep-dive OpenTelemetry SDK (Node.js, Python). Custom instrumentation for LLM calls, context propagation, exporter configurations, sampling strategies.
allowed-tools: Read, Write, Bash, Glob, Grep
skills:
  - observability-patterns
---

# OpenTelemetry Expert

> Master-level OpenTelemetry instrumentation for AI applications.

## 🔧 Runtime Scripts

**Execute for automated analysis:**

| Script | Purpose | Usage |
|--------|---------|-------|
| `scripts/trace_analyzer.py` | Analyze trace spans for performance issues | `python scripts/trace_analyzer.py <trace_file.json>` |

---

## 1. SDK Setup

### Node.js (TypeScript)

```typescript
// otel.ts - Initialize before all imports
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'ai-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    'deployment.environment': process.env.NODE_ENV || 'development',
  }),

  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',
  }),

  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter({
      url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/metrics',
    }),
    exportIntervalMillis: 60000,
  }),

  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-fs': { enabled: false },
    }),
  ],
});

sdk.start();

// Graceful shutdown
process.on('SIGTERM', () => {
  sdk.shutdown().then(() => process.exit(0));
});
```
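
Because `getNodeAutoInstrumentations` patches modules as they load, the SDK must run before any other import. A minimal sketch of an entry point (`./server` and `startServer` are hypothetical placeholders):

```typescript
// index.ts - load instrumentation before anything else
import './otel'; // must come first so auto-instrumentations can patch modules

import { startServer } from './server'; // hypothetical application module

startServer();
```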

### Python

```python
# otel_setup.py
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
import os

def setup_opentelemetry():
    resource = Resource.create({
        SERVICE_NAME: "ai-service",
        SERVICE_VERSION: "1.0.0",
        "deployment.environment": os.getenv("ENVIRONMENT", "development"),
    })

    # Tracing
    trace_provider = TracerProvider(resource=resource)
    trace_provider.add_span_processor(
        BatchSpanProcessor(
            OTLPSpanExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318/v1/traces"))
        )
    )
    trace.set_tracer_provider(trace_provider)

    # Metrics
    metric_reader = PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318/v1/metrics")),
        export_interval_millis=60000,
    )
    metrics.set_meter_provider(MeterProvider(resource=resource, metric_readers=[metric_reader]))

    return trace.get_tracer("ai-service"), metrics.get_meter("ai-service")

tracer, meter = setup_opentelemetry()
```

---

## 2. Custom LLM Instrumentation

### OpenAI Wrapper (TypeScript)

```typescript
import { trace, SpanKind, SpanStatusCode } from '@opentelemetry/api';
import OpenAI from 'openai';

const tracer = trace.getTracer('openai-instrumentation');

interface TracedOpenAIOptions {
  client: OpenAI;
  capturePrompts?: boolean; // Set false in production for privacy
}

export function createTracedOpenAI({ client, capturePrompts = false }: TracedOpenAIOptions) {
  return {
    async createCompletion(params: OpenAI.ChatCompletionCreateParamsNonStreaming) {
      const spanName = `openai.chat.${params.model}`;

      return tracer.startActiveSpan(spanName, {
        kind: SpanKind.CLIENT,
        attributes: {
          'llm.vendor': 'openai',
          'llm.request.model': params.model,
          'llm.request.max_tokens': params.max_tokens ?? -1,
          'llm.request.temperature': params.temperature ?? 1, // ?? so an explicit 0 is reported
        },
      }, async (span) => {
        const startTime = performance.now();

        try {
          // Optionally capture prompt (be careful with PII)
          if (capturePrompts) {
            span.setAttribute('llm.request.messages', JSON.stringify(params.messages).slice(0, 1000));
          }

          const response = await client.chat.completions.create(params);

          // Record response attributes
          span.setAttributes({
            'llm.response.model': response.model,
            'llm.response.id': response.id,
            'llm.usage.prompt_tokens': response.usage?.prompt_tokens ?? 0,
            'llm.usage.completion_tokens': response.usage?.completion_tokens ?? 0,
            'llm.usage.total_tokens': response.usage?.total_tokens ?? 0,
            'llm.response.finish_reason': response.choices[0]?.finish_reason || 'unknown',
            'llm.latency_ms': performance.now() - startTime,
          });

          span.setStatus({ code: SpanStatusCode.OK });
          return response;

        } catch (error) {
          span.recordException(error as Error);
          span.setStatus({
            code: SpanStatusCode.ERROR,
            message: error instanceof Error ? error.message : 'Unknown error',
          });

          // Add error details
          if (error instanceof OpenAI.APIError) {
            span.setAttributes({
              'error.type': error.type || 'api_error',
              'error.code': error.code || 'unknown',
              'error.status': error.status,
            });
          }

          throw error;
        } finally {
          span.end();
        }
      });
    },

    // Streaming support
    async *createStreamingCompletion(params: OpenAI.ChatCompletionCreateParamsNonStreaming) {
      const spanName = `openai.chat.stream.${params.model}`;

      const span = tracer.startSpan(spanName, {
        kind: SpanKind.CLIENT,
        attributes: {
          'llm.vendor': 'openai',
          'llm.request.model': params.model,
          'llm.streaming': true,
        },
      });

      const startTime = performance.now();
      let chunkCount = 0;

      try {
        const stream = await client.chat.completions.create({
          ...params,
          stream: true,
        });

        for await (const chunk of stream) {
          chunkCount++; // Chunk count only approximates token count
          yield chunk;
        }

        span.setAttributes({
          'llm.usage.estimated_tokens': chunkCount,
          'llm.latency_ms': performance.now() - startTime,
        });
        span.setStatus({ code: SpanStatusCode.OK });

      } catch (error) {
        span.recordException(error as Error);
        span.setStatus({ code: SpanStatusCode.ERROR });
        throw error;
      } finally {
        span.end();
      }
    },
  };
}
```
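
A sketch of how the wrapper above might be used; the model name and messages are illustrative:

```typescript
const openai = createTracedOpenAI({ client: new OpenAI(), capturePrompts: false });

const response = await openai.createCompletion({
  model: 'gpt-4o-mini', // illustrative model name
  messages: [{ role: 'user', content: 'Summarize OpenTelemetry in one sentence.' }],
});
// The resulting span carries model, token usage, latency, and finish-reason attributes.
```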

### Anthropic Wrapper (Python)

```python
from opentelemetry import trace
from opentelemetry.trace import SpanKind, Status, StatusCode
import anthropic
from functools import wraps
import time

tracer = trace.get_tracer("anthropic-instrumentation")

def traced_anthropic_call(capture_prompts: bool = False):
    """Decorator to add tracing to Anthropic API calls."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            model = kwargs.get("model", "claude-3")

            with tracer.start_as_current_span(
                f"anthropic.messages.{model}",
                kind=SpanKind.CLIENT,
                attributes={
                    "llm.vendor": "anthropic",
                    "llm.request.model": model,
                    "llm.request.max_tokens": kwargs.get("max_tokens", 1024),
                }
            ) as span:
                start_time = time.perf_counter()

                try:
                    if capture_prompts:
                        messages = kwargs.get("messages", [])
                        span.set_attribute("llm.request.messages", str(messages)[:1000])

                    response = func(*args, **kwargs)

                    # Record usage
                    if hasattr(response, "usage"):
                        span.set_attributes({
                            "llm.usage.input_tokens": response.usage.input_tokens,
                            "llm.usage.output_tokens": response.usage.output_tokens,
                        })

                    span.set_attributes({
                        "llm.response.stop_reason": response.stop_reason,
                        "llm.latency_ms": (time.perf_counter() - start_time) * 1000,
                    })

                    span.set_status(Status(StatusCode.OK))
                    return response

                except anthropic.APIError as e:
                    span.record_exception(e)
                    span.set_status(Status(StatusCode.ERROR, str(e)))
                    span.set_attributes({
                        "error.type": type(e).__name__,
                        "error.message": str(e),
                    })
                    raise

        return wrapper
    return decorator


# Usage
client = anthropic.Anthropic()

@traced_anthropic_call(capture_prompts=False)
def call_claude(messages: list, model: str = "claude-3-sonnet-20240229", max_tokens: int = 1024):
    return client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=messages,
    )
```

---

## 3. Agent Loop Instrumentation

### Multi-Step Agent Tracing

```typescript
import { trace } from '@opentelemetry/api';

const tracer = trace.getTracer('agent-loop');

interface AgentStep {
  thought: string;
  action?: { tool: string; input: any };
  observation?: string;
}

// think, parseAction, executeTool, and extractAnswer are the agent's own
// helpers - supply your implementations; only the tracing shape matters here.
export async function runTracedAgent(
  query: string,
  maxSteps: number = 10
): Promise<string> {

  return tracer.startActiveSpan('agent.run', {
    attributes: {
      'agent.query': query.slice(0, 500),
      'agent.max_steps': maxSteps,
    },
  }, async (rootSpan) => {
    const steps: AgentStep[] = [];
    let stepCount = 0;
    let finalAnswer: string | null = null;

    try {
      while (!finalAnswer && stepCount < maxSteps) {
        stepCount++;

        // Create child span for each step
        const step = await tracer.startActiveSpan(`agent.step.${stepCount}`, {
          attributes: { 'agent.step.number': stepCount },
        }, async (stepSpan) => {

          // Think phase
          const thought = await tracer.startActiveSpan('agent.think', async (thinkSpan) => {
            const result = await think(query, steps);
            thinkSpan.setAttribute('agent.thought.length', result.length);
            thinkSpan.end();
            return result;
          });

          // Parse action
          const action = parseAction(thought);

          if (action) {
            // Tool execution
            const observation = await tracer.startActiveSpan('agent.tool', {
              attributes: {
                'agent.tool.name': action.tool,
                'agent.tool.input_size': JSON.stringify(action.input).length,
              },
            }, async (toolSpan) => {
              try {
                const result = await executeTool(action);
                toolSpan.setAttribute('agent.tool.success', true);
                toolSpan.setAttribute('agent.tool.output_size', result.length);
                toolSpan.end();
                return result;
              } catch (error) {
                toolSpan.recordException(error as Error);
                toolSpan.setAttribute('agent.tool.success', false);
                toolSpan.end();
                throw error;
              }
            });

            stepSpan.setAttribute('agent.step.has_tool', true);
            stepSpan.end();
            return { thought, action, observation };
          } else {
            // Final answer
            finalAnswer = extractAnswer(thought);
            stepSpan.setAttribute('agent.step.is_final', true);
            stepSpan.end();
            return { thought };
          }
        });

        steps.push(step);
      }

      // Record final metrics
      rootSpan.setAttributes({
        'agent.total_steps': stepCount,
        'agent.completed': finalAnswer !== null,
        'agent.answer_length': finalAnswer?.length || 0,
      });

      return finalAnswer || 'No answer found';

    } catch (error) {
      rootSpan.recordException(error as Error);
      throw error;
    } finally {
      rootSpan.end();
    }
  });
}
```

---

## 4. Context Propagation

### Across HTTP Boundaries

```typescript
import { propagation, context, trace, SpanKind } from '@opentelemetry/api';

// Client side: Inject context into headers
async function callService(data: any): Promise<Response> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };

  // Inject current trace context (adds traceparent/tracestate headers)
  propagation.inject(context.active(), headers);

  return fetch('https://api.service.com/endpoint', {
    method: 'POST',
    headers,
    body: JSON.stringify(data),
  });
}

// Server side: Extract context from headers
function handleRequest(req: Request): void {
  // The default getter expects a plain object, so convert the Headers instance
  const extractedContext = propagation.extract(context.active(), Object.fromEntries(req.headers));

  // Run handler with extracted context
  context.with(extractedContext, () => {
    // startActiveSpan makes this span the parent of spans created in the handler
    trace.getTracer('server').startActiveSpan('handle_request', { kind: SpanKind.SERVER }, (span) => {
      try {
        processRequest(req);
      } finally {
        span.end();
      }
    });
  });
}
```

### Across Message Queues

```typescript
// Producer: Attach context to message (kafka is your client instance, e.g. a kafkajs producer)
async function produceMessage(topic: string, message: any): Promise<void> {
  const carrier: Record<string, string> = {};
  propagation.inject(context.active(), carrier);

  await kafka.produce({
    topic,
    messages: [{
      value: JSON.stringify(message),
      headers: carrier, // Context travels in message headers
    }],
  });
}

// Consumer: Extract context from message
async function consumeMessage(message: KafkaMessage): Promise<void> {
  const extractedContext = propagation.extract(
    context.active(),
    Object.fromEntries(
      Object.entries(message.headers || {}).map(([k, v]) => [k, v?.toString()])
    )
  );

  await context.with(extractedContext, async () => {
    // Process with correct parent context
    await processMessage(JSON.parse(message.value.toString()));
  });
}
```
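
The consumer above restores the context but starts no span of its own, so the processing work itself stays invisible in the trace. A minimal sketch that wraps it in an explicit `CONSUMER` span (reusing the `processMessage` placeholder):

```typescript
import { trace, context, SpanKind, Context } from '@opentelemetry/api';

async function processInConsumerSpan(extractedContext: Context, payload: unknown): Promise<void> {
  await context.with(extractedContext, () =>
    trace.getTracer('consumer').startActiveSpan(
      'process_message',
      { kind: SpanKind.CONSUMER },
      async (span) => {
        try {
          await processMessage(payload);
        } finally {
          span.end();
        }
      }
    )
  );
}
```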

---

## 5. Exporter Configurations

### Multi-Backend Export

```typescript
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
import { ConsoleSpanExporter, SimpleSpanProcessor, BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';

// Development: Console + Jaeger (each exporter is wrapped in a span processor)
const devProcessors = [
  new SimpleSpanProcessor(new ConsoleSpanExporter()),
  new BatchSpanProcessor(new JaegerExporter({
    endpoint: 'http://localhost:14268/api/traces',
  })),
];

// Production: OTLP to collector
const prodProcessor = new BatchSpanProcessor(
  new OTLPTraceExporter({
    url: process.env.OTEL_COLLECTOR_URL,
    headers: {
      'Authorization': `Bearer ${process.env.OTEL_AUTH_TOKEN}`,
    },
  }),
  {
    maxQueueSize: 2048,
    maxExportBatchSize: 512,
    scheduledDelayMillis: 5000,
  }
);

// Conditional setup
const isProduction = process.env.NODE_ENV === 'production';
const spanProcessors = isProduction ? [prodProcessor] : devProcessors;
```
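
A sketch of wiring the chosen processors into the SDK. Recent `@opentelemetry/sdk-node` releases accept a `spanProcessors` array in the `NodeSDK` constructor (older ones take a single `spanProcessor`):

```typescript
import { NodeSDK } from '@opentelemetry/sdk-node';

const sdk = new NodeSDK({
  spanProcessors, // the array selected above; replaces the plain traceExporter from Section 1
});
sdk.start();
```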

### Environment-Based Configuration

```bash
# .env
OTEL_SERVICE_NAME=ai-service
OTEL_EXPORTER_OTLP_ENDPOINT=https://otel-collector.company.com
OTEL_EXPORTER_OTLP_HEADERS=Authorization=Bearer abc123
OTEL_TRACES_SAMPLER=parentbased_traceidratio
OTEL_TRACES_SAMPLER_ARG=0.1
OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production,service.version=1.2.3
```
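
These are standard OpenTelemetry SDK environment variables, so the SDK reads them at startup with no code changes; options passed explicitly in code generally take precedence.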

---

## 6. Sampling Strategies

### Adaptive Sampling

```typescript
import {
  Sampler,
  SamplingDecision,
  SamplingResult,
  ParentBasedSampler,
} from '@opentelemetry/sdk-trace-base';
import { Context, SpanKind, Attributes } from '@opentelemetry/api';

class AIAwareSampler implements Sampler {
  private baseRatio: number;

  constructor(baseRatio: number = 0.1) {
    this.baseRatio = baseRatio;
  }

  shouldSample(
    context: Context,
    traceId: string,
    spanName: string,
    spanKind: SpanKind,
    attributes: Attributes
  ): SamplingResult {

    // Always sample errors
    if (attributes['error'] === true) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Always sample high-cost operations
    if (spanName.includes('gpt-4') || spanName.includes('claude-3-opus')) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Always sample operations expected to be slow (based on previous metrics)
    const estimatedLatency = attributes['estimated_latency_ms'] as number;
    if (estimatedLatency && estimatedLatency > 5000) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Dynamic sampling based on load
    const currentLoad = getSystemLoad(); // Your implementation
    const adjustedRatio = currentLoad > 0.8 ? this.baseRatio / 2 : this.baseRatio;

    // Hash-based decision for consistency
    const hash = parseInt(traceId.slice(-8), 16) / 0xffffffff;
    return {
      decision: hash < adjustedRatio
        ? SamplingDecision.RECORD_AND_SAMPLED
        : SamplingDecision.NOT_RECORD
    };
  }

  toString(): string {
    return `AIAwareSampler{ratio=${this.baseRatio}}`;
  }
}

// Use parent-based to respect remote decisions
const sampler = new ParentBasedSampler({
  root: new AIAwareSampler(0.1),
});
```
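
To activate the sampler, hand it to the SDK at startup; `NodeSDK` accepts a `sampler` option. A sketch:

```typescript
import { NodeSDK } from '@opentelemetry/sdk-node';

const sdk = new NodeSDK({
  sampler, // the ParentBasedSampler built above
  // ...trace and metric exporters as in Section 1
});
sdk.start();
```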

---

## 7. Custom Metrics

### AI-Specific Metrics

```typescript
import { metrics } from '@opentelemetry/api';

const meter = metrics.getMeter('ai-metrics');

// Token usage counter
const tokenCounter = meter.createCounter('llm.tokens', {
  description: 'Total tokens used',
  unit: '{token}',
});

// Latency histogram with AI-specific buckets
const latencyHistogram = meter.createHistogram('llm.latency', {
  description: 'LLM response latency',
  unit: 'ms',
  advice: {
    explicitBucketBoundaries: [100, 250, 500, 1000, 2500, 5000, 10000, 30000],
  },
});

// Cost gauge (current daily spend)
const costGauge = meter.createObservableGauge('llm.cost.daily', {
  description: 'Estimated daily LLM cost',
  unit: 'cents',
});
costGauge.addCallback((result) => {
  result.observe(calculateDailyCost()); // Your implementation (see the sketch after this block)
});

// Agent success rate (register an addCallback, as with costGauge, to report it)
const agentSuccessRate = meter.createObservableGauge('agent.success_rate', {
  description: 'Agent task success rate over last hour',
  unit: '%',
});

// Quality score distribution
const qualityHistogram = meter.createHistogram('llm.quality_score', {
  description: 'Response quality scores',
  unit: '{score}',
  advice: {
    explicitBucketBoundaries: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
  },
});

interface LLMResult {
  model: string;
  success: boolean;
  totalTokens: number;
  inputTokens: number;
  outputTokens: number;
  latencyMs: number;
  qualityScore?: number;
}

// Record metrics
function recordLLMCall(result: LLMResult): void {
  const attributes = {
    model: result.model,
    success: result.success,
  };

  tokenCounter.add(result.totalTokens, { ...attributes, type: 'total' });
  tokenCounter.add(result.inputTokens, { ...attributes, type: 'input' });
  tokenCounter.add(result.outputTokens, { ...attributes, type: 'output' });

  latencyHistogram.record(result.latencyMs, attributes);

  if (result.qualityScore) {
    qualityHistogram.record(result.qualityScore, attributes);
  }
}
```
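
A minimal sketch of the `calculateDailyCost` helper referenced above. The price table and its numbers are hypothetical placeholders, not real provider rates:

```typescript
// Hypothetical per-1K-token prices in cents - substitute your provider's real rates
const PRICES: Record<string, { input: number; output: number }> = {
  'gpt-4o': { input: 0.25, output: 1.0 },
  'claude-3-sonnet': { input: 0.3, output: 1.5 },
};

// Usage records accumulated since midnight (populate from recordLLMCall or similar)
const usageToday: Array<{ model: string; inputTokens: number; outputTokens: number }> = [];

function calculateDailyCost(): number {
  return usageToday.reduce((cents, u) => {
    const price = PRICES[u.model];
    if (!price) return cents; // unknown model: skip rather than guess
    return cents + (u.inputTokens / 1000) * price.input + (u.outputTokens / 1000) * price.output;
  }, 0);
}
```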

---

## 8. Troubleshooting

### Debug Mode

```bash
# Enable debug logging
export OTEL_LOG_LEVEL=debug

# Verify spans are being created
export OTEL_TRACES_EXPORTER=console

# Verify metrics
export OTEL_METRICS_EXPORTER=console
```

### Common Issues

| Issue | Cause | Solution |
|-------|-------|----------|
| No traces | Missing SDK init | Import `otel.ts` before everything else |
| Missing child spans | Wrong context | Use `startActiveSpan` |
| High memory | Too many attributes | Limit attribute sizes |
| Missing attributes | Set after span ends | Set before `end()` |
| Dropped spans | Export queue overflows | Increase `maxQueueSize` or lower `scheduledDelayMillis` |
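
When in doubt, bypass the full pipeline and confirm spans flow at all. A minimal sanity check, assuming the 1.x-style SDK APIs used throughout this document (newer releases move processor registration into the provider constructor):

```typescript
// sanity-check.ts - prints the span to stdout if the pipeline works at all
import { NodeTracerProvider, SimpleSpanProcessor, ConsoleSpanExporter } from '@opentelemetry/sdk-trace-node';

const provider = new NodeTracerProvider();
provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter()));
provider.register();

const span = provider.getTracer('sanity-check').startSpan('test-span');
span.setAttribute('check', true);
span.end(); // ConsoleSpanExporter prints immediately; silence means init is broken
```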

---

## 9. Checklist

### SDK Setup
- [ ] Resource configured with service info
- [ ] Trace exporter configured
- [ ] Metric exporter configured
- [ ] Graceful shutdown handled

### Instrumentation
- [ ] All LLM calls have spans
- [ ] AI-specific attributes set
- [ ] Errors recorded with context
- [ ] Token usage metered

### Production
- [ ] Sampling configured
- [ ] Export batching tuned
- [ ] Sensitive data not logged
- [ ] Cardinality controlled

---

> **Remember:** Good instrumentation is invisible when things work and invaluable when they don't.