@musashishao/agent-kit 1.6.1 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/.agent/.shared/ui-ux-pro-max/data/charts.csv +26 -0
  2. package/.agent/.shared/ui-ux-pro-max/data/colors.csv +97 -0
  3. package/.agent/.shared/ui-ux-pro-max/data/icons.csv +101 -0
  4. package/.agent/.shared/ui-ux-pro-max/data/landing.csv +31 -0
  5. package/.agent/.shared/ui-ux-pro-max/data/products.csv +97 -0
  6. package/.agent/.shared/ui-ux-pro-max/data/prompts.csv +24 -0
  7. package/.agent/.shared/ui-ux-pro-max/data/react-performance.csv +45 -0
  8. package/.agent/.shared/ui-ux-pro-max/data/stacks/flutter.csv +53 -0
  9. package/.agent/.shared/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -0
  10. package/.agent/.shared/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -0
  11. package/.agent/.shared/ui-ux-pro-max/data/stacks/nextjs.csv +53 -0
  12. package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -0
  13. package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -0
  14. package/.agent/.shared/ui-ux-pro-max/data/stacks/react-native.csv +52 -0
  15. package/.agent/.shared/ui-ux-pro-max/data/stacks/react.csv +54 -0
  16. package/.agent/.shared/ui-ux-pro-max/data/stacks/shadcn.csv +61 -0
  17. package/.agent/.shared/ui-ux-pro-max/data/stacks/svelte.csv +54 -0
  18. package/.agent/.shared/ui-ux-pro-max/data/stacks/swiftui.csv +51 -0
  19. package/.agent/.shared/ui-ux-pro-max/data/stacks/vue.csv +50 -0
  20. package/.agent/.shared/ui-ux-pro-max/data/styles.csv +59 -0
  21. package/.agent/.shared/ui-ux-pro-max/data/typography.csv +58 -0
  22. package/.agent/.shared/ui-ux-pro-max/data/ui-reasoning.csv +101 -0
  23. package/.agent/.shared/ui-ux-pro-max/data/ux-guidelines.csv +100 -0
  24. package/.agent/.shared/ui-ux-pro-max/data/web-interface.csv +31 -0
  25. package/.agent/.shared/ui-ux-pro-max/scripts/core.py +258 -0
  26. package/.agent/.shared/ui-ux-pro-max/scripts/design_system.py +487 -0
  27. package/.agent/.shared/ui-ux-pro-max/scripts/search.py +76 -0
  28. package/.agent/adr/ADR-TEMPLATE.md +57 -0
  29. package/.agent/adr/README.md +30 -0
  30. package/.agent/agents/backend-specialist.md +1 -1
  31. package/.agent/agents/devops-engineer.md +1 -1
  32. package/.agent/agents/performance-optimizer.md +1 -1
  33. package/.agent/agents/security-auditor.md +1 -1
  34. package/.agent/dashboard/index.html +169 -0
  35. package/.agent/rules/REFERENCE.md +14 -0
  36. package/.agent/skills/ai-incident-management/SKILL.md +517 -0
  37. package/.agent/skills/ai-security-guardrails/SKILL.md +405 -0
  38. package/.agent/skills/ai-security-guardrails/owasp-llm-top10.md +160 -0
  39. package/.agent/skills/ai-security-guardrails/scripts/prompt_injection_scanner.py +230 -0
  40. package/.agent/skills/compliance-for-ai/SKILL.md +411 -0
  41. package/.agent/skills/observability-patterns/SKILL.md +484 -0
  42. package/.agent/skills/observability-patterns/scripts/otel_validator.py +330 -0
  43. package/.agent/skills/opentelemetry-expert/SKILL.md +738 -0
  44. package/.agent/skills/opentelemetry-expert/scripts/trace_analyzer.py +351 -0
  45. package/.agent/skills/privacy-preserving-dev/SKILL.md +442 -0
  46. package/.agent/skills/privacy-preserving-dev/scripts/pii_scanner.py +285 -0
  47. package/.agent/workflows/autofix.md +4 -1
  48. package/.agent/workflows/brainstorm.md +1 -1
  49. package/.agent/workflows/context.md +3 -1
  50. package/.agent/workflows/create.md +1 -1
  51. package/.agent/workflows/dashboard.md +4 -1
  52. package/.agent/workflows/debug.md +1 -1
  53. package/.agent/workflows/deploy.md +1 -1
  54. package/.agent/workflows/enhance.md +1 -1
  55. package/.agent/workflows/next.md +4 -1
  56. package/.agent/workflows/orchestrate.md +1 -1
  57. package/.agent/workflows/plan.md +1 -1
  58. package/.agent/workflows/preview.md +1 -1
  59. package/.agent/workflows/quality.md +1 -1
  60. package/.agent/workflows/spec.md +1 -1
  61. package/.agent/workflows/status.md +1 -1
  62. package/.agent/workflows/test.md +1 -1
  63. package/.agent/workflows/ui-ux-pro-max.md +1 -1
  64. package/package.json +4 -1
package/.agent/skills/opentelemetry-expert/SKILL.md (new file)
@@ -0,0 +1,738 @@
---
name: opentelemetry-expert
description: Deep-dive OpenTelemetry SDK (Node.js, Python). Custom instrumentation for LLM calls, context propagation, exporter configurations, sampling strategies.
allowed-tools: Read, Write, Bash, Glob, Grep
skills:
- observability-patterns
---

# OpenTelemetry Expert

> Master-level OpenTelemetry instrumentation for AI applications.

## 🔧 Runtime Scripts

**Execute for automated analysis:**

| Script | Purpose | Usage |
|--------|---------|-------|
| `scripts/trace_analyzer.py` | Analyze trace spans for performance issues | `python scripts/trace_analyzer.py <trace_file.json>` |

---

## 1. SDK Setup

### Node.js (TypeScript)

```typescript
// otel.ts - load this module before any other imports so
// auto-instrumentation can patch libraries as they are required
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'ai-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    'deployment.environment': process.env.NODE_ENV || 'development',
  }),

  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',
  }),

  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter({
      url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/metrics',
    }),
    exportIntervalMillis: 60000,
  }),

  instrumentations: [
    getNodeAutoInstrumentations({
      // fs instrumentation is very noisy; disable it
      '@opentelemetry/instrumentation-fs': { enabled: false },
    }),
  ],
});

sdk.start();

// Graceful shutdown: flush pending telemetry before the process exits
process.on('SIGTERM', () => {
  sdk.shutdown().then(() => process.exit(0));
});
```

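Load order matters: if application modules are imported before the SDK starts, auto-instrumentation cannot patch them. A minimal entrypoint sketch (file and framework names are illustrative):

```typescript
// index.ts - the side-effect import must come first
import './otel'; // starts the NodeSDK defined above

import express from 'express'; // now patched by auto-instrumentation

const app = express();
app.get('/health', (_req, res) => res.send('ok'));
app.listen(3000);
```

Alternatively, `node --require ./otel.js app.js` achieves the same ordering without touching application code.
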
### Python

```python
# otel_setup.py
import os

from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

def setup_opentelemetry():
    resource = Resource.create({
        SERVICE_NAME: "ai-service",
        SERVICE_VERSION: "1.0.0",
        "deployment.environment": os.getenv("ENVIRONMENT", "development"),
    })

    # Tracing
    trace_provider = TracerProvider(resource=resource)
    trace_provider.add_span_processor(
        BatchSpanProcessor(
            OTLPSpanExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318/v1/traces"))
        )
    )
    trace.set_tracer_provider(trace_provider)

    # Metrics
    metric_reader = PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318/v1/metrics")),
        export_interval_millis=60000,
    )
    metrics.set_meter_provider(MeterProvider(resource=resource, metric_readers=[metric_reader]))

    return trace.get_tracer("ai-service"), metrics.get_meter("ai-service")

tracer, meter = setup_opentelemetry()
```

---

## 2. Custom LLM Instrumentation

### OpenAI Wrapper (TypeScript)

```typescript
import { trace, SpanKind, SpanStatusCode } from '@opentelemetry/api';
import OpenAI from 'openai';

const tracer = trace.getTracer('openai-instrumentation');

interface TracedOpenAIOptions {
  client: OpenAI;
  capturePrompts?: boolean; // Set false in production for privacy
}

export function createTracedOpenAI({ client, capturePrompts = false }: TracedOpenAIOptions) {
  return {
    async createCompletion(params: OpenAI.ChatCompletionCreateParams) {
      const spanName = `openai.chat.${params.model}`;

      return tracer.startActiveSpan(spanName, {
        kind: SpanKind.CLIENT,
        attributes: {
          'llm.vendor': 'openai',
          'llm.request.model': params.model,
          'llm.request.max_tokens': params.max_tokens ?? -1,
          'llm.request.temperature': params.temperature ?? 1, // ?? so an explicit 0 is preserved
        },
      }, async (span) => {
        const startTime = performance.now();

        try {
          // Optionally capture prompt (be careful with PII)
          if (capturePrompts) {
            span.setAttribute('llm.request.messages', JSON.stringify(params.messages).slice(0, 1000));
          }

          const response = await client.chat.completions.create(params);

          // Record response attributes
          span.setAttributes({
            'llm.response.model': response.model,
            'llm.response.id': response.id,
            'llm.usage.prompt_tokens': response.usage?.prompt_tokens || 0,
            'llm.usage.completion_tokens': response.usage?.completion_tokens || 0,
            'llm.usage.total_tokens': response.usage?.total_tokens || 0,
            'llm.response.finish_reason': response.choices[0]?.finish_reason || 'unknown',
            'llm.latency_ms': performance.now() - startTime,
          });

          span.setStatus({ code: SpanStatusCode.OK });
          return response;

        } catch (error) {
          span.recordException(error as Error);
          span.setStatus({
            code: SpanStatusCode.ERROR,
            message: error instanceof Error ? error.message : 'Unknown error',
          });

          // Add error details
          if (error instanceof OpenAI.APIError) {
            span.setAttributes({
              'error.type': error.type || 'api_error',
              'error.code': error.code || 'unknown',
              'error.status': error.status,
            });
          }

          throw error;
        } finally {
          span.end();
        }
      });
    },

    // Streaming support
    async *createStreamingCompletion(params: OpenAI.ChatCompletionCreateParams) {
      const spanName = `openai.chat.stream.${params.model}`;

      const span = tracer.startSpan(spanName, {
        kind: SpanKind.CLIENT,
        attributes: {
          'llm.vendor': 'openai',
          'llm.request.model': params.model,
          'llm.streaming': true,
        },
      });

      const startTime = performance.now();
      let chunkCount = 0;

      try {
        const stream = await client.chat.completions.create({
          ...params,
          stream: true,
        });

        for await (const chunk of stream) {
          chunkCount++; // Chunk count is only a rough proxy for token count
          yield chunk;
        }

        span.setAttributes({
          'llm.usage.estimated_tokens': chunkCount,
          'llm.latency_ms': performance.now() - startTime,
        });
        span.setStatus({ code: SpanStatusCode.OK });

      } catch (error) {
        span.recordException(error as Error);
        span.setStatus({ code: SpanStatusCode.ERROR });
        throw error;
      } finally {
        span.end();
      }
    },
  };
}
```

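A usage sketch for the wrapper above; the model name and prompt are placeholders:

```typescript
import OpenAI from 'openai';

const openai = createTracedOpenAI({
  client: new OpenAI(), // reads OPENAI_API_KEY from the environment
  capturePrompts: false, // keep raw prompts out of spans in production
});

const response = await openai.createCompletion({
  model: 'gpt-4o-mini', // placeholder model name
  messages: [{ role: 'user', content: 'Say hello.' }],
});
console.log(response); // the span has already been exported at this point
```
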
### Anthropic Wrapper (Python)

```python
import time
from functools import wraps

import anthropic
from opentelemetry import trace
from opentelemetry.trace import SpanKind, Status, StatusCode

tracer = trace.get_tracer("anthropic-instrumentation")

def traced_anthropic_call(capture_prompts: bool = False):
    """Decorator to add tracing to Anthropic API calls."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            model = kwargs.get("model", "claude-3")

            with tracer.start_as_current_span(
                f"anthropic.messages.{model}",
                kind=SpanKind.CLIENT,
                attributes={
                    "llm.vendor": "anthropic",
                    "llm.request.model": model,
                    "llm.request.max_tokens": kwargs.get("max_tokens", 1024),
                },
            ) as span:
                start_time = time.perf_counter()

                try:
                    if capture_prompts:
                        messages = kwargs.get("messages", [])
                        span.set_attribute("llm.request.messages", str(messages)[:1000])

                    response = func(*args, **kwargs)

                    # Record usage
                    if hasattr(response, "usage"):
                        span.set_attributes({
                            "llm.usage.input_tokens": response.usage.input_tokens,
                            "llm.usage.output_tokens": response.usage.output_tokens,
                        })

                    span.set_attributes({
                        "llm.response.stop_reason": response.stop_reason,
                        "llm.latency_ms": (time.perf_counter() - start_time) * 1000,
                    })

                    span.set_status(Status(StatusCode.OK))
                    return response

                except anthropic.APIError as e:
                    span.record_exception(e)
                    span.set_status(Status(StatusCode.ERROR, str(e)))
                    span.set_attributes({
                        "error.type": type(e).__name__,
                        "error.message": str(e),
                    })
                    raise

        return wrapper
    return decorator


# Usage
client = anthropic.Anthropic()

@traced_anthropic_call(capture_prompts=False)
def call_claude(messages: list, model: str = "claude-3-sonnet-20240229", max_tokens: int = 1024):
    return client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=messages,
    )
```

---

## 3. Agent Loop Instrumentation

### Multi-Step Agent Tracing

```typescript
import { trace } from '@opentelemetry/api';

const tracer = trace.getTracer('agent-loop');

interface AgentStep {
  thought: string;
  action?: { tool: string; input: any };
  observation?: string;
}

// The concrete agent implementation the traced loop delegates to
export interface AgentRuntime {
  think(query: string, steps: AgentStep[]): Promise<string>;
  parseAction(thought: string): { tool: string; input: any } | null;
  executeTool(action: { tool: string; input: any }): Promise<string>;
  extractAnswer(thought: string): string;
}

export async function runTracedAgent(
  agent: AgentRuntime,
  query: string,
  maxSteps: number = 10
): Promise<string> {

  return tracer.startActiveSpan('agent.run', {
    attributes: {
      'agent.query': query.slice(0, 500),
      'agent.max_steps': maxSteps,
    },
  }, async (rootSpan) => {
    const steps: AgentStep[] = [];
    let stepCount = 0;
    let finalAnswer: string | null = null;

    try {
      while (!finalAnswer && stepCount < maxSteps) {
        stepCount++;

        // Create child span for each step
        const step = await tracer.startActiveSpan(`agent.step.${stepCount}`, {
          attributes: { 'agent.step.number': stepCount },
        }, async (stepSpan) => {

          // Think phase
          const thought = await tracer.startActiveSpan('agent.think', async (thinkSpan) => {
            const result = await agent.think(query, steps);
            thinkSpan.setAttribute('agent.thought.length', result.length);
            thinkSpan.end();
            return result;
          });

          // Parse action
          const action = agent.parseAction(thought);

          if (action) {
            // Tool execution
            const observation = await tracer.startActiveSpan('agent.tool', {
              attributes: {
                'agent.tool.name': action.tool,
                'agent.tool.input_size': JSON.stringify(action.input).length,
              },
            }, async (toolSpan) => {
              try {
                const result = await agent.executeTool(action);
                toolSpan.setAttribute('agent.tool.success', true);
                toolSpan.setAttribute('agent.tool.output_size', result.length);
                toolSpan.end();
                return result;
              } catch (error) {
                toolSpan.recordException(error as Error);
                toolSpan.setAttribute('agent.tool.success', false);
                toolSpan.end();
                throw error;
              }
            });

            stepSpan.setAttribute('agent.step.has_tool', true);
            stepSpan.end();
            return { thought, action, observation };
          } else {
            // Final answer
            finalAnswer = agent.extractAnswer(thought);
            stepSpan.setAttribute('agent.step.is_final', true);
            stepSpan.end();
            return { thought };
          }
        });

        steps.push(step);
      }

      // Record final metrics
      rootSpan.setAttributes({
        'agent.total_steps': stepCount,
        'agent.completed': finalAnswer !== null,
        'agent.answer_length': finalAnswer?.length || 0,
      });

      return finalAnswer || 'No answer found';

    } catch (error) {
      rootSpan.recordException(error as Error);
      throw error;
    } finally {
      rootSpan.end();
    }
  });
}
```

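To make the loop concrete, here is a stub runtime wired through `runTracedAgent`; the stub is purely illustrative of the `AgentRuntime` shape above:

```typescript
const echoAgent: AgentRuntime = {
  think: async (query) => `FINAL: ${query}`, // stub: answers immediately, no tools
  parseAction: () => null,
  executeTool: async () => '',
  extractAnswer: (thought) => thought.replace('FINAL: ', ''),
};

const answer = await runTracedAgent(echoAgent, 'What is OpenTelemetry?');
// Yields one agent.run span with a single agent.step.1 -> agent.think child chain
```
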
---

## 4. Context Propagation

### Across HTTP Boundaries

```typescript
import { propagation, context, trace } from '@opentelemetry/api';

// Client side: Inject context into headers
async function callService(data: any): Promise<Response> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };

  // Inject current trace context
  propagation.inject(context.active(), headers);

  return fetch('https://api.service.com/endpoint', {
    method: 'POST',
    headers,
    body: JSON.stringify(data),
  });
}

// Server side: Extract context from headers
function handleRequest(req: Request): void {
  // The default getter expects a plain record, so convert the Headers object
  const extractedContext = propagation.extract(
    context.active(),
    Object.fromEntries(req.headers.entries())
  );

  // Start the server span as a child of the extracted remote context
  const span = trace.getTracer('server').startSpan('handle_request', undefined, extractedContext);

  // Make the span active so spans created in the handler become its children
  context.with(trace.setSpan(extractedContext, span), () => {
    try {
      processRequest(req);
    } finally {
      span.end();
    }
  });
}
```

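With the default W3C Trace Context propagator, `inject` writes a `traceparent` entry into the carrier; a sketch of what the client's headers end up containing (the IDs are illustrative):

```typescript
// Shape of the injected carrier under the default W3C propagator:
const exampleCarrier = {
  'Content-Type': 'application/json',
  // version - trace id (32 hex) - parent span id (16 hex) - flags ('01' = sampled)
  traceparent: '00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01',
};
```
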
### Across Message Queues

```typescript
import { propagation, context } from '@opentelemetry/api';
import type { Producer, KafkaMessage } from 'kafkajs';

declare const producer: Producer; // kafkajs producer, created elsewhere

// Producer: Attach context to message
async function produceMessage(topic: string, message: any): Promise<void> {
  const carrier: Record<string, string> = {};
  propagation.inject(context.active(), carrier);

  await producer.send({
    topic,
    messages: [{
      value: JSON.stringify(message),
      headers: carrier, // Trace context travels in message headers
    }],
  });
}

// Consumer: Extract context from message
async function consumeMessage(message: KafkaMessage): Promise<void> {
  const extractedContext = propagation.extract(
    context.active(),
    // kafkajs header values are Buffers; stringify them for the getter
    Object.fromEntries(
      Object.entries(message.headers || {}).map(([k, v]) => [k, v?.toString()])
    )
  );

  await context.with(extractedContext, async () => {
    // Process with the correct parent context
    await processMessage(JSON.parse(message.value!.toString()));
  });
}
```

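A possible refinement (not in the original): start an explicit CONSUMER-kind span inside the restored context, so the processing work itself shows up in the trace rather than only its children:

```typescript
import { trace, SpanKind } from '@opentelemetry/api';

// Inside consumeMessage, replace the context.with(...) body with:
await context.with(extractedContext, () =>
  trace.getTracer('consumer').startActiveSpan(
    'queue.process',
    { kind: SpanKind.CONSUMER },
    async (span) => {
      try {
        await processMessage(JSON.parse(message.value!.toString()));
      } finally {
        span.end();
      }
    }
  )
);
```
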
---

## 5. Exporter Configurations

### Multi-Backend Export

```typescript
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
import { ConsoleSpanExporter, SimpleSpanProcessor, BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';

// Development: Console + Jaeger
const devProcessors = [
  new SimpleSpanProcessor(new ConsoleSpanExporter()),
  new BatchSpanProcessor(new JaegerExporter({
    endpoint: 'http://localhost:14268/api/traces',
  })),
];

// Production: OTLP to collector, with tuned batching
const prodProcessor = new BatchSpanProcessor(
  new OTLPTraceExporter({
    url: process.env.OTEL_COLLECTOR_URL,
    headers: {
      'Authorization': `Bearer ${process.env.OTEL_AUTH_TOKEN}`,
    },
  }),
  {
    maxQueueSize: 2048,
    maxExportBatchSize: 512,
    scheduledDelayMillis: 5000,
  }
);

// Conditional setup: register the whole list with the tracer provider
const isProduction = process.env.NODE_ENV === 'production';
const spanProcessors = isProduction ? [prodProcessor] : devProcessors;
```

### Environment-Based Configuration

```bash
# .env
OTEL_SERVICE_NAME=ai-service
OTEL_EXPORTER_OTLP_ENDPOINT=https://otel-collector.company.com
OTEL_EXPORTER_OTLP_HEADERS=Authorization=Bearer abc123
OTEL_TRACES_SAMPLER=parentbased_traceidratio
OTEL_TRACES_SAMPLER_ARG=0.1
OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production,service.version=1.2.3
```

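Recent Node SDK versions can build exporters and samplers from these standard `OTEL_*` variables, so the setup code can shrink to almost nothing; a sketch, assuming such a version and that the `.env` above is loaded into the process environment:

```typescript
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';

// Endpoint, auth headers, sampler, and resource attributes are all
// picked up from the OTEL_* environment variables shown above.
const sdk = new NodeSDK({
  instrumentations: [getNodeAutoInstrumentations()],
});
sdk.start();
```
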
---

## 6. Sampling Strategies

### Adaptive Sampling

```typescript
import {
  Sampler,
  SamplingDecision,
  SamplingResult,
  ParentBasedSampler,
} from '@opentelemetry/sdk-trace-base';
import { Context, SpanKind, Attributes, Link } from '@opentelemetry/api';

class AIAwareSampler implements Sampler {
  private baseRatio: number;

  constructor(baseRatio: number = 0.1) {
    this.baseRatio = baseRatio;
  }

  shouldSample(
    context: Context,
    traceId: string,
    spanName: string,
    spanKind: SpanKind,
    attributes: Attributes,
    links: Link[]
  ): SamplingResult {

    // Always sample errors
    if (attributes['error'] === true) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Always sample high-cost operations
    if (spanName.includes('gpt-4') || spanName.includes('claude-3-opus')) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Always sample operations expected to be slow (based on previous metrics)
    const estimatedLatency = attributes['estimated_latency_ms'] as number;
    if (estimatedLatency && estimatedLatency > 5000) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLED };
    }

    // Dynamic sampling based on load
    const currentLoad = getSystemLoad(); // Your implementation
    const adjustedRatio = currentLoad > 0.8 ? this.baseRatio / 2 : this.baseRatio;

    // Hash-based decision so the same trace ID always gets the same answer
    const hash = parseInt(traceId.slice(-8), 16) / 0xffffffff;
    return {
      decision: hash < adjustedRatio
        ? SamplingDecision.RECORD_AND_SAMPLED
        : SamplingDecision.NOT_RECORD
    };
  }

  toString(): string {
    return `AIAwareSampler{ratio=${this.baseRatio}}`;
  }
}

// Use parent-based to respect remote sampling decisions
const sampler = new ParentBasedSampler({
  root: new AIAwareSampler(0.1),
});
```

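Wiring the sampler into the SDK from section 1 is a single field in the NodeSDK configuration; a sketch:

```typescript
import { NodeSDK } from '@opentelemetry/sdk-node';

const sdk = new NodeSDK({
  sampler, // the ParentBasedSampler wrapping AIAwareSampler above
  // ...exporter and instrumentation config as in section 1
});
sdk.start();
```
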
---

## 7. Custom Metrics

### AI-Specific Metrics

```typescript
import { metrics } from '@opentelemetry/api';

const meter = metrics.getMeter('ai-metrics');

// Shape of the result object recorded below
interface LLMResult {
  model: string;
  success: boolean;
  totalTokens: number;
  inputTokens: number;
  outputTokens: number;
  latencyMs: number;
  qualityScore?: number;
}

// Token usage counter
const tokenCounter = meter.createCounter('llm.tokens', {
  description: 'Total tokens used',
  unit: '{token}',
});

// Latency histogram with AI-specific buckets
const latencyHistogram = meter.createHistogram('llm.latency', {
  description: 'LLM response latency',
  unit: 'ms',
  advice: {
    explicitBucketBoundaries: [100, 250, 500, 1000, 2500, 5000, 10000, 30000],
  },
});

// Cost gauge (current daily spend)
const costGauge = meter.createObservableGauge('llm.cost.daily', {
  description: 'Estimated daily LLM cost',
  unit: 'cents',
});
costGauge.addCallback((result) => {
  result.observe(calculateDailyCost()); // Your implementation
});

// Agent success rate
const agentSuccessRate = meter.createObservableGauge('agent.success_rate', {
  description: 'Agent task success rate over last hour',
  unit: '%',
});
agentSuccessRate.addCallback((result) => {
  result.observe(getHourlySuccessRate()); // Your implementation
});

// Quality score distribution
const qualityHistogram = meter.createHistogram('llm.quality_score', {
  description: 'Response quality scores',
  unit: '{score}',
  advice: {
    explicitBucketBoundaries: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
  },
});

// Record metrics for a completed call
function recordLLMCall(result: LLMResult): void {
  const attributes = {
    model: result.model,
    success: result.success,
  };

  tokenCounter.add(result.totalTokens, { ...attributes, type: 'total' });
  tokenCounter.add(result.inputTokens, { ...attributes, type: 'input' });
  tokenCounter.add(result.outputTokens, { ...attributes, type: 'output' });

  latencyHistogram.record(result.latencyMs, attributes);

  if (result.qualityScore) {
    qualityHistogram.record(result.qualityScore, attributes);
  }
}
```

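A call-site sketch feeding the recorder, e.g. from the OpenAI wrapper in section 2; all values are illustrative:

```typescript
recordLLMCall({
  model: 'gpt-4o-mini', // illustrative
  success: true,
  totalTokens: 1250,
  inputTokens: 1000,
  outputTokens: 250,
  latencyMs: 840,
  qualityScore: 0.92, // optional, only if responses are scored
});
```
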
---

## 8. Troubleshooting

### Debug Mode

```bash
# Enable debug logging
export OTEL_LOG_LEVEL=debug

# Verify spans are being created
export OTEL_TRACES_EXPORTER=console

# Verify metrics
export OTEL_METRICS_EXPORTER=console
```

### Common Issues

| Issue | Cause | Solution |
|-------|-------|----------|
| No traces | Missing SDK init | Import `otel.ts` before anything else |
| Missing child spans | Wrong context | Use `startActiveSpan` |
| High memory | Too many attributes | Limit attribute sizes |
| Missing attributes | Set after span ends | Set before `end()` |
| Dropped spans | Export queue overflowing | Increase `maxQueueSize` or shorten `scheduledDelayMillis` |

---

## 9. Checklist

### SDK Setup
- [ ] Resource configured with service info
- [ ] Trace exporter configured
- [ ] Metric exporter configured
- [ ] Graceful shutdown handled

### Instrumentation
- [ ] All LLM calls have spans
- [ ] AI-specific attributes set
- [ ] Errors recorded with context
- [ ] Token usage metered

### Production
- [ ] Sampling configured
- [ ] Export batching tuned
- [ ] Sensitive data not logged
- [ ] Cardinality controlled

---

> **Remember:** Good instrumentation is invisible when things work and invaluable when they don't.