autotel 2.26.1 → 2.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +126 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +198 -2
- package/dist/index.d.ts +198 -2
- package/dist/index.js +117 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/gen-ai-events.test.ts +135 -0
- package/src/gen-ai-events.ts +199 -0
- package/src/gen-ai-metrics.test.ts +96 -0
- package/src/gen-ai-metrics.ts +128 -0
- package/src/index.ts +26 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import type { TraceContext } from './trace-context';
|
|
3
|
+
import {
|
|
4
|
+
recordPromptSent,
|
|
5
|
+
recordResponseReceived,
|
|
6
|
+
recordRetry,
|
|
7
|
+
recordStreamFirstToken,
|
|
8
|
+
recordToolCall,
|
|
9
|
+
} from './gen-ai-events';
|
|
10
|
+
|
|
11
|
+
type CapturedEvent = { name: string; attrs?: Record<string, unknown> };
|
|
12
|
+
|
|
13
|
+
function captureCtx(): {
|
|
14
|
+
ctx: TraceContext;
|
|
15
|
+
events: CapturedEvent[];
|
|
16
|
+
} {
|
|
17
|
+
const events: CapturedEvent[] = [];
|
|
18
|
+
const ctx = {
|
|
19
|
+
addEvent: (name: string, attrs?: Record<string, unknown>) => {
|
|
20
|
+
events.push({ name, attrs });
|
|
21
|
+
},
|
|
22
|
+
setAttribute: () => {},
|
|
23
|
+
setAttributes: () => {},
|
|
24
|
+
setStatus: () => {},
|
|
25
|
+
recordException: () => {},
|
|
26
|
+
addLink: () => {},
|
|
27
|
+
addLinks: () => {},
|
|
28
|
+
updateName: () => {},
|
|
29
|
+
isRecording: () => true,
|
|
30
|
+
end: () => {},
|
|
31
|
+
} as unknown as TraceContext;
|
|
32
|
+
return { ctx, events };
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
describe('GenAI span event helpers', () => {
|
|
36
|
+
it('recordPromptSent emits gen_ai.prompt.sent with canonical attrs', () => {
|
|
37
|
+
const { ctx, events } = captureCtx();
|
|
38
|
+
recordPromptSent(ctx, {
|
|
39
|
+
model: 'gpt-4o',
|
|
40
|
+
promptTokens: 1200,
|
|
41
|
+
messageCount: 3,
|
|
42
|
+
operation: 'chat',
|
|
43
|
+
});
|
|
44
|
+
expect(events).toHaveLength(1);
|
|
45
|
+
expect(events[0]).toEqual({
|
|
46
|
+
name: 'gen_ai.prompt.sent',
|
|
47
|
+
attrs: {
|
|
48
|
+
'gen_ai.request.model': 'gpt-4o',
|
|
49
|
+
'gen_ai.usage.input_tokens': 1200,
|
|
50
|
+
'gen_ai.request.message_count': 3,
|
|
51
|
+
'gen_ai.operation.name': 'chat',
|
|
52
|
+
},
|
|
53
|
+
});
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
it('recordPromptSent omits unset fields rather than writing undefined', () => {
|
|
57
|
+
const { ctx, events } = captureCtx();
|
|
58
|
+
recordPromptSent(ctx);
|
|
59
|
+
expect(events[0]?.attrs).toEqual({});
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
it('recordResponseReceived joins finish reasons into a CSV for attribute compat', () => {
|
|
63
|
+
const { ctx, events } = captureCtx();
|
|
64
|
+
recordResponseReceived(ctx, {
|
|
65
|
+
model: 'gpt-4o-2024-11-20',
|
|
66
|
+
promptTokens: 1200,
|
|
67
|
+
completionTokens: 400,
|
|
68
|
+
totalTokens: 1600,
|
|
69
|
+
finishReasons: ['stop', 'tool_calls'],
|
|
70
|
+
});
|
|
71
|
+
expect(events[0]).toEqual({
|
|
72
|
+
name: 'gen_ai.response.received',
|
|
73
|
+
attrs: {
|
|
74
|
+
'gen_ai.response.model': 'gpt-4o-2024-11-20',
|
|
75
|
+
'gen_ai.usage.input_tokens': 1200,
|
|
76
|
+
'gen_ai.usage.output_tokens': 400,
|
|
77
|
+
'gen_ai.usage.total_tokens': 1600,
|
|
78
|
+
'gen_ai.response.finish_reasons': 'stop,tool_calls',
|
|
79
|
+
},
|
|
80
|
+
});
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
it('recordResponseReceived omits finish_reasons when empty', () => {
|
|
84
|
+
const { ctx, events } = captureCtx();
|
|
85
|
+
recordResponseReceived(ctx, { model: 'claude-sonnet-4-6' });
|
|
86
|
+
expect(events[0]?.attrs).not.toHaveProperty(
|
|
87
|
+
'gen_ai.response.finish_reasons',
|
|
88
|
+
);
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('recordRetry captures attempt, reason, delay, and status code', () => {
|
|
92
|
+
const { ctx, events } = captureCtx();
|
|
93
|
+
recordRetry(ctx, {
|
|
94
|
+
attempt: 2,
|
|
95
|
+
reason: 'rate_limit',
|
|
96
|
+
delayMs: 1000,
|
|
97
|
+
statusCode: 429,
|
|
98
|
+
});
|
|
99
|
+
expect(events[0]).toEqual({
|
|
100
|
+
name: 'gen_ai.retry',
|
|
101
|
+
attrs: {
|
|
102
|
+
'retry.attempt': 2,
|
|
103
|
+
'retry.reason': 'rate_limit',
|
|
104
|
+
'retry.delay_ms': 1000,
|
|
105
|
+
'http.response.status_code': 429,
|
|
106
|
+
},
|
|
107
|
+
});
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
it('recordToolCall writes canonical gen_ai.tool.* keys', () => {
|
|
111
|
+
const { ctx, events } = captureCtx();
|
|
112
|
+
recordToolCall(ctx, {
|
|
113
|
+
toolName: 'search_traces',
|
|
114
|
+
toolCallId: 'call-123',
|
|
115
|
+
arguments: '{"serviceName":"api"}',
|
|
116
|
+
});
|
|
117
|
+
expect(events[0]).toEqual({
|
|
118
|
+
name: 'gen_ai.tool.call',
|
|
119
|
+
attrs: {
|
|
120
|
+
'gen_ai.tool.name': 'search_traces',
|
|
121
|
+
'gen_ai.tool.call.id': 'call-123',
|
|
122
|
+
'gen_ai.tool.arguments': '{"serviceName":"api"}',
|
|
123
|
+
},
|
|
124
|
+
});
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
it('recordStreamFirstToken is the bare marker for TTFT', () => {
|
|
128
|
+
const { ctx, events } = captureCtx();
|
|
129
|
+
recordStreamFirstToken(ctx, { tokensSoFar: 1 });
|
|
130
|
+
expect(events[0]).toEqual({
|
|
131
|
+
name: 'gen_ai.stream.first_token',
|
|
132
|
+
attrs: { 'gen_ai.stream.tokens_so_far': 1 },
|
|
133
|
+
});
|
|
134
|
+
});
|
|
135
|
+
});
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Span event helpers for LLM lifecycle, aligned with the OpenTelemetry
|
|
3
|
+
* GenAI semantic conventions.
|
|
4
|
+
*
|
|
5
|
+
* Span events are timestamped points within a span — they render as dots
|
|
6
|
+
* on the trace timeline in Jaeger / Tempo / Langfuse / Arize. Use them
|
|
7
|
+
* to mark lifecycle moments the span attributes alone can't express:
|
|
8
|
+
*
|
|
9
|
+
* - When the prompt was sent (vs. when the first token arrived)
|
|
10
|
+
* - When each retry attempt started, and why
|
|
11
|
+
* - When a streaming response produced its first token (TTFT)
|
|
12
|
+
* - When a tool was invoked
|
|
13
|
+
*
|
|
14
|
+
* Every helper pins the event name + attribute keys to the published
|
|
15
|
+
* spec so downstream tooling (autotel-mcp, Langfuse, vendor UIs) can
|
|
16
|
+
* render them consistently.
|
|
17
|
+
*
|
|
18
|
+
* @example
|
|
19
|
+
* ```typescript
|
|
20
|
+
* import { trace, recordPromptSent, recordResponseReceived, recordRetry } from 'autotel';
|
|
21
|
+
*
|
|
22
|
+
* export const chat = trace('chat', ctx => async (prompt: string) => {
|
|
23
|
+
* recordPromptSent(ctx, { model: 'gpt-4o', messageCount: 1 });
|
|
24
|
+
*
|
|
25
|
+
* for (let attempt = 1; attempt <= 3; attempt++) {
|
|
26
|
+
* try {
|
|
27
|
+
* const res = await openai.chat.completions.create({...});
|
|
28
|
+
* recordResponseReceived(ctx, {
|
|
29
|
+
* model: res.model,
|
|
30
|
+
* promptTokens: res.usage?.prompt_tokens,
|
|
31
|
+
* completionTokens: res.usage?.completion_tokens,
|
|
32
|
+
* finishReasons: res.choices.map(c => c.finish_reason),
|
|
33
|
+
* });
|
|
34
|
+
* return res;
|
|
35
|
+
* } catch (err) {
|
|
36
|
+
* recordRetry(ctx, { attempt, reason: 'rate_limit', delayMs: 500 });
|
|
37
|
+
* await sleep(500 * attempt);
|
|
38
|
+
* }
|
|
39
|
+
* }
|
|
40
|
+
* });
|
|
41
|
+
* ```
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
import type { TraceContext } from './trace-context';
|
|
45
|
+
|
|
46
|
+
/** Primitive attribute bag accepted by `TraceContext.addEvent`. */
type EventAttrs = Record<string, string | number | boolean>;

/** Attributes expected on a `gen_ai.prompt.sent` event. */
export interface PromptSentEvent {
  /** Model the caller intends to invoke (may differ from response model). */
  model?: string;
  /** Estimated input token count, when known before the call. */
  promptTokens?: number;
  /** Number of messages in a chat request (system + user + assistant). */
  messageCount?: number;
  /** Free-form operation kind — `chat` / `completion` / `embedding`. */
  operation?: string;
}

/** Attributes expected on a `gen_ai.response.received` event. */
export interface ResponseReceivedEvent {
  /** Model the provider actually served (may be more specific than requested). */
  model?: string;
  /** Input tokens consumed, e.g. the provider's `usage.prompt_tokens`. */
  promptTokens?: number;
  /** Output tokens generated, e.g. the provider's `usage.completion_tokens`. */
  completionTokens?: number;
  /** Combined input + output token count, when the provider reports it. */
  totalTokens?: number;
  /** `stop`, `length`, `content_filter`, `tool_calls`, etc. */
  finishReasons?: string[];
}

/** Attributes expected on a `gen_ai.retry` event. */
export interface RetryEvent {
  /** Attempt counter for the retry being recorded (callers typically start at 1). */
  attempt: number;
  /** `rate_limit` | `timeout` | `provider_error` | custom label. */
  reason?: string;
  /** How long we'll wait before the next attempt. */
  delayMs?: number;
  /** HTTP status that triggered the retry, when applicable. */
  statusCode?: number;
}

/** Attributes expected on a `gen_ai.tool.call` event. */
export interface ToolCallEvent {
  /** Name of the tool / function being invoked. */
  toolName: string;
  /** Call identifier so responses can be correlated back to calls. */
  toolCallId?: string;
  /** Pre-serialised tool arguments; omit if sensitive. */
  arguments?: string;
}

/** Attributes expected on a `gen_ai.stream.first_token` event. */
export interface StreamFirstTokenEvent {
  /** Tokens streamed so far, if the caller tracks that. */
  tokensSoFar?: number;
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Record that a prompt was dispatched to the provider. Typically called
|
|
99
|
+
* before `await provider.chat.completions.create(...)`.
|
|
100
|
+
*/
|
|
101
|
+
export function recordPromptSent(
|
|
102
|
+
ctx: TraceContext,
|
|
103
|
+
event: PromptSentEvent = {},
|
|
104
|
+
): void {
|
|
105
|
+
ctx.addEvent('gen_ai.prompt.sent', buildPromptSentAttrs(event));
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Record a successful provider response. Call after the response arrives
|
|
110
|
+
* (for non-streaming) or after the stream completes.
|
|
111
|
+
*/
|
|
112
|
+
export function recordResponseReceived(
|
|
113
|
+
ctx: TraceContext,
|
|
114
|
+
event: ResponseReceivedEvent = {},
|
|
115
|
+
): void {
|
|
116
|
+
ctx.addEvent('gen_ai.response.received', buildResponseAttrs(event));
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Record a retry attempt on an LLM call. Call *before* sleeping for
|
|
121
|
+
* `delayMs` so the event timestamp accurately marks when the retry
|
|
122
|
+
* decision was made.
|
|
123
|
+
*/
|
|
124
|
+
export function recordRetry(ctx: TraceContext, event: RetryEvent): void {
|
|
125
|
+
ctx.addEvent('gen_ai.retry', buildRetryAttrs(event));
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Record a tool / function call made in the course of an agent step.
|
|
130
|
+
* Emits an event rather than a child span because many frameworks fire
|
|
131
|
+
* several tool calls within a single provider response.
|
|
132
|
+
*/
|
|
133
|
+
export function recordToolCall(ctx: TraceContext, event: ToolCallEvent): void {
|
|
134
|
+
ctx.addEvent('gen_ai.tool.call', buildToolCallAttrs(event));
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Record the time-to-first-token for a streaming response. Pair with
|
|
139
|
+
* `recordResponseReceived` at the end so the span carries both the TTFT
|
|
140
|
+
* marker and the final usage numbers.
|
|
141
|
+
*/
|
|
142
|
+
export function recordStreamFirstToken(
|
|
143
|
+
ctx: TraceContext,
|
|
144
|
+
event: StreamFirstTokenEvent = {},
|
|
145
|
+
): void {
|
|
146
|
+
ctx.addEvent('gen_ai.stream.first_token', buildStreamFirstTokenAttrs(event));
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// ---- Attribute builders -------------------------------------------------
|
|
150
|
+
|
|
151
|
+
function buildPromptSentAttrs(event: PromptSentEvent): EventAttrs {
|
|
152
|
+
const attrs: EventAttrs = {};
|
|
153
|
+
if (event.model) attrs['gen_ai.request.model'] = event.model;
|
|
154
|
+
if (event.promptTokens !== undefined)
|
|
155
|
+
attrs['gen_ai.usage.input_tokens'] = event.promptTokens;
|
|
156
|
+
if (event.messageCount !== undefined)
|
|
157
|
+
attrs['gen_ai.request.message_count'] = event.messageCount;
|
|
158
|
+
if (event.operation) attrs['gen_ai.operation.name'] = event.operation;
|
|
159
|
+
return attrs;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
function buildResponseAttrs(event: ResponseReceivedEvent): EventAttrs {
|
|
163
|
+
const attrs: EventAttrs = {};
|
|
164
|
+
if (event.model) attrs['gen_ai.response.model'] = event.model;
|
|
165
|
+
if (event.promptTokens !== undefined)
|
|
166
|
+
attrs['gen_ai.usage.input_tokens'] = event.promptTokens;
|
|
167
|
+
if (event.completionTokens !== undefined)
|
|
168
|
+
attrs['gen_ai.usage.output_tokens'] = event.completionTokens;
|
|
169
|
+
if (event.totalTokens !== undefined)
|
|
170
|
+
attrs['gen_ai.usage.total_tokens'] = event.totalTokens;
|
|
171
|
+
if (event.finishReasons && event.finishReasons.length > 0) {
|
|
172
|
+
// Arrays aren't primitive AttributeValues on this context, so join.
|
|
173
|
+
attrs['gen_ai.response.finish_reasons'] = event.finishReasons.join(',');
|
|
174
|
+
}
|
|
175
|
+
return attrs;
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
function buildRetryAttrs(event: RetryEvent): EventAttrs {
|
|
179
|
+
const attrs: EventAttrs = { 'retry.attempt': event.attempt };
|
|
180
|
+
if (event.reason) attrs['retry.reason'] = event.reason;
|
|
181
|
+
if (event.delayMs !== undefined) attrs['retry.delay_ms'] = event.delayMs;
|
|
182
|
+
if (event.statusCode !== undefined)
|
|
183
|
+
attrs['http.response.status_code'] = event.statusCode;
|
|
184
|
+
return attrs;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
function buildToolCallAttrs(event: ToolCallEvent): EventAttrs {
|
|
188
|
+
const attrs: EventAttrs = { 'gen_ai.tool.name': event.toolName };
|
|
189
|
+
if (event.toolCallId) attrs['gen_ai.tool.call.id'] = event.toolCallId;
|
|
190
|
+
if (event.arguments) attrs['gen_ai.tool.arguments'] = event.arguments;
|
|
191
|
+
return attrs;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
function buildStreamFirstTokenAttrs(event: StreamFirstTokenEvent): EventAttrs {
|
|
195
|
+
const attrs: EventAttrs = {};
|
|
196
|
+
if (event.tokensSoFar !== undefined)
|
|
197
|
+
attrs['gen_ai.stream.tokens_so_far'] = event.tokensSoFar;
|
|
198
|
+
return attrs;
|
|
199
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import { AggregationType } from '@opentelemetry/sdk-metrics';
|
|
3
|
+
import {
|
|
4
|
+
GEN_AI_COST_USD_BUCKETS,
|
|
5
|
+
GEN_AI_DURATION_BUCKETS_SECONDS,
|
|
6
|
+
GEN_AI_TOKEN_USAGE_BUCKETS,
|
|
7
|
+
genAiMetricViews,
|
|
8
|
+
llmHistogramAdvice,
|
|
9
|
+
} from './gen-ai-metrics';
|
|
10
|
+
|
|
11
|
+
describe('gen-ai-metrics', () => {
|
|
12
|
+
it('bucket arrays are strictly ascending (required by Prometheus + OTel)', () => {
|
|
13
|
+
for (const buckets of [
|
|
14
|
+
GEN_AI_DURATION_BUCKETS_SECONDS,
|
|
15
|
+
GEN_AI_TOKEN_USAGE_BUCKETS,
|
|
16
|
+
GEN_AI_COST_USD_BUCKETS,
|
|
17
|
+
]) {
|
|
18
|
+
for (let i = 1; i < buckets.length; i++) {
|
|
19
|
+
expect(
|
|
20
|
+
buckets[i]! > buckets[i - 1]!,
|
|
21
|
+
`index ${i} not ascending: ${buckets[i - 1]} → ${buckets[i]}`,
|
|
22
|
+
).toBe(true);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it('duration buckets cover tail through 5 minutes for reasoning models', () => {
|
|
28
|
+
expect(GEN_AI_DURATION_BUCKETS_SECONDS[0]).toBeLessThanOrEqual(0.05);
|
|
29
|
+
expect(
|
|
30
|
+
GEN_AI_DURATION_BUCKETS_SECONDS[
|
|
31
|
+
GEN_AI_DURATION_BUCKETS_SECONDS.length - 1
|
|
32
|
+
],
|
|
33
|
+
).toBeGreaterThanOrEqual(300);
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('token buckets cover up to a million-token context window', () => {
|
|
37
|
+
expect(
|
|
38
|
+
GEN_AI_TOKEN_USAGE_BUCKETS[GEN_AI_TOKEN_USAGE_BUCKETS.length - 1],
|
|
39
|
+
).toBeGreaterThanOrEqual(1_000_000);
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
it('cost buckets resolve sub-cent spend', () => {
|
|
43
|
+
expect(GEN_AI_COST_USD_BUCKETS[0]).toBeLessThan(0.001);
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
it('bucket arrays are frozen — consumers cannot mutate shared state', () => {
|
|
47
|
+
expect(() => {
|
|
48
|
+
(GEN_AI_DURATION_BUCKETS_SECONDS as number[]).push(999);
|
|
49
|
+
}).toThrow();
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
it('llmHistogramAdvice returns explicitBucketBoundaries advice shape', () => {
|
|
53
|
+
const advice = llmHistogramAdvice('duration');
|
|
54
|
+
expect(advice.advice.explicitBucketBoundaries).toEqual([
|
|
55
|
+
...GEN_AI_DURATION_BUCKETS_SECONDS,
|
|
56
|
+
]);
|
|
57
|
+
// The returned array is a fresh copy so callers can mutate without
|
|
58
|
+
// affecting the shared constant.
|
|
59
|
+
advice.advice.explicitBucketBoundaries.push(0);
|
|
60
|
+
expect([...GEN_AI_DURATION_BUCKETS_SECONDS]).not.toContain(0);
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
it('genAiMetricViews targets the OTel GenAI instrument names with the right buckets', () => {
|
|
64
|
+
const views = genAiMetricViews();
|
|
65
|
+
expect(views).toHaveLength(3);
|
|
66
|
+
|
|
67
|
+
const byInstrument = Object.fromEntries(
|
|
68
|
+
views.map((v) => [v.instrumentName, v]),
|
|
69
|
+
);
|
|
70
|
+
expect(
|
|
71
|
+
byInstrument['gen_ai.client.operation.duration']?.aggregation,
|
|
72
|
+
).toEqual({
|
|
73
|
+
type: AggregationType.EXPLICIT_BUCKET_HISTOGRAM,
|
|
74
|
+
options: { boundaries: [...GEN_AI_DURATION_BUCKETS_SECONDS] },
|
|
75
|
+
});
|
|
76
|
+
expect(byInstrument['gen_ai.client.token.usage']?.aggregation).toEqual({
|
|
77
|
+
type: AggregationType.EXPLICIT_BUCKET_HISTOGRAM,
|
|
78
|
+
options: { boundaries: [...GEN_AI_TOKEN_USAGE_BUCKETS] },
|
|
79
|
+
});
|
|
80
|
+
expect(byInstrument['gen_ai.client.cost.usd']?.aggregation).toEqual({
|
|
81
|
+
type: AggregationType.EXPLICIT_BUCKET_HISTOGRAM,
|
|
82
|
+
options: { boundaries: [...GEN_AI_COST_USD_BUCKETS] },
|
|
83
|
+
});
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
it('genAiMetricViews accepts extra instruments', () => {
|
|
87
|
+
const views = genAiMetricViews([
|
|
88
|
+
{ instrumentName: 'custom.llm.prompt_tokens', kind: 'tokens' },
|
|
89
|
+
]);
|
|
90
|
+
expect(views).toHaveLength(4);
|
|
91
|
+
const custom = views.find(
|
|
92
|
+
(v) => v.instrumentName === 'custom.llm.prompt_tokens',
|
|
93
|
+
);
|
|
94
|
+
expect(custom).toBeDefined();
|
|
95
|
+
});
|
|
96
|
+
});
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-tuned histogram buckets.
|
|
3
|
+
*
|
|
4
|
+
* Default OpenTelemetry histogram buckets target HTTP latency (0ms–10s)
|
|
5
|
+
* and small counter values. LLM workloads have very different shapes:
|
|
6
|
+
*
|
|
7
|
+
* - **Duration**: single-token prompts can be fast (50ms), long
|
|
8
|
+
* generations and reasoning models can run for minutes. Default buckets
|
|
9
|
+
* crush everything above 10s into one bucket.
|
|
10
|
+
* - **Token usage**: heavily right-skewed. A single request can range
|
|
11
|
+
* from tens of tokens to the million-token context windows.
|
|
12
|
+
* - **Cost (USD)**: per-request values are tiny (fractions of a cent),
|
|
13
|
+
* so linear buckets waste resolution at the low end.
|
|
14
|
+
*
|
|
15
|
+
* This module exposes empirically-chosen bucket arrays and a View helper
|
|
16
|
+
* so users can apply them to their `MeterProvider` without knowing the
|
|
17
|
+
* exact instrument names emitted by OpenAI/Anthropic/Traceloop plugins.
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { NodeSDK } from '@opentelemetry/sdk-node';
|
|
22
|
+
* import { genAiMetricViews } from 'autotel';
|
|
23
|
+
*
|
|
24
|
+
* const sdk = new NodeSDK({
|
|
25
|
+
* serviceName: 'my-agent',
|
|
26
|
+
* views: [...genAiMetricViews()],
|
|
27
|
+
* });
|
|
28
|
+
* sdk.start();
|
|
29
|
+
* ```
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
import { AggregationType, type ViewOptions } from '@opentelemetry/sdk-metrics';
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Duration buckets for LLM operations, in **seconds**. Covers fast
|
|
36
|
+
* completions (50ms) through long-running reasoning jobs (5 min).
|
|
37
|
+
*
|
|
38
|
+
* Aligns with the OTel GenAI semantic conventions' published advice for
|
|
39
|
+
* `gen_ai.client.operation.duration`.
|
|
40
|
+
*/
|
|
41
|
+
export const GEN_AI_DURATION_BUCKETS_SECONDS: readonly number[] = Object.freeze(
|
|
42
|
+
[0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 20, 30, 60, 120, 300],
|
|
43
|
+
);
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Token-count buckets for prompt, completion, and total token histograms.
|
|
47
|
+
* Ranges from tiny prompts to million-token context windows.
|
|
48
|
+
*
|
|
49
|
+
* Aligns with the OTel GenAI semantic conventions' published advice for
|
|
50
|
+
* `gen_ai.client.token.usage`.
|
|
51
|
+
*/
|
|
52
|
+
export const GEN_AI_TOKEN_USAGE_BUCKETS: readonly number[] = Object.freeze([
|
|
53
|
+
1, 4, 16, 64, 256, 1_024, 4_096, 16_384, 65_536, 262_144, 1_048_576,
|
|
54
|
+
4_194_304,
|
|
55
|
+
]);
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* USD cost buckets. Sub-cent resolution at the low end (fractions of a
|
|
59
|
+
* cent per small call) up to tens of dollars (batch jobs, Opus/o1 runs).
|
|
60
|
+
*/
|
|
61
|
+
export const GEN_AI_COST_USD_BUCKETS: readonly number[] = Object.freeze([
|
|
62
|
+
0.000_01, 0.000_1, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50,
|
|
63
|
+
]);
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Instrument-level advice object for `createHistogram(name, advice)`.
|
|
67
|
+
* Use when you control the instrument creation (e.g. custom business
|
|
68
|
+
* LLM metrics); `genAiMetricViews()` is better when the metric comes
|
|
69
|
+
* from a third-party plugin.
|
|
70
|
+
*/
|
|
71
|
+
export function llmHistogramAdvice(kind: 'duration' | 'tokens' | 'cost'): {
|
|
72
|
+
advice: { explicitBucketBoundaries: number[] };
|
|
73
|
+
} {
|
|
74
|
+
const boundaries =
|
|
75
|
+
kind === 'duration'
|
|
76
|
+
? GEN_AI_DURATION_BUCKETS_SECONDS
|
|
77
|
+
: kind === 'tokens'
|
|
78
|
+
? GEN_AI_TOKEN_USAGE_BUCKETS
|
|
79
|
+
: GEN_AI_COST_USD_BUCKETS;
|
|
80
|
+
return { advice: { explicitBucketBoundaries: [...boundaries] } };
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Returns `View`s that re-bucket the standard OTel GenAI histograms. Pass
|
|
85
|
+
* the result to your `MeterProvider`'s `views` option.
|
|
86
|
+
*
|
|
87
|
+
* Matches instrument names emitted by:
|
|
88
|
+
* - OpenTelemetry GenAI autoinstrumentation
|
|
89
|
+
* - OpenInference / OpenLLMetry (traceloop)
|
|
90
|
+
* - Arize Phoenix, LangSmith, etc. that follow the OTel spec
|
|
91
|
+
*
|
|
92
|
+
* Add more instrument patterns via the `extra` argument if you emit
|
|
93
|
+
* custom LLM metrics.
|
|
94
|
+
*/
|
|
95
|
+
export function genAiMetricViews(
|
|
96
|
+
extra: {
|
|
97
|
+
instrumentName: string;
|
|
98
|
+
kind: 'duration' | 'tokens' | 'cost';
|
|
99
|
+
}[] = [],
|
|
100
|
+
): ViewOptions[] {
|
|
101
|
+
const defaults: Array<{
|
|
102
|
+
instrumentName: string;
|
|
103
|
+
kind: 'duration' | 'tokens' | 'cost';
|
|
104
|
+
}> = [
|
|
105
|
+
{ instrumentName: 'gen_ai.client.operation.duration', kind: 'duration' },
|
|
106
|
+
{ instrumentName: 'gen_ai.client.token.usage', kind: 'tokens' },
|
|
107
|
+
// Autotel-emitted cost metric. No-op if you don't emit it.
|
|
108
|
+
{ instrumentName: 'gen_ai.client.cost.usd', kind: 'cost' },
|
|
109
|
+
];
|
|
110
|
+
|
|
111
|
+
return [...defaults, ...extra].map(
|
|
112
|
+
({ instrumentName, kind }) =>
|
|
113
|
+
({
|
|
114
|
+
instrumentName,
|
|
115
|
+
aggregation: {
|
|
116
|
+
type: AggregationType.EXPLICIT_BUCKET_HISTOGRAM,
|
|
117
|
+
options: {
|
|
118
|
+
boundaries:
|
|
119
|
+
kind === 'duration'
|
|
120
|
+
? [...GEN_AI_DURATION_BUCKETS_SECONDS]
|
|
121
|
+
: kind === 'tokens'
|
|
122
|
+
? [...GEN_AI_TOKEN_USAGE_BUCKETS]
|
|
123
|
+
: [...GEN_AI_COST_USD_BUCKETS],
|
|
124
|
+
},
|
|
125
|
+
},
|
|
126
|
+
}) satisfies ViewOptions,
|
|
127
|
+
);
|
|
128
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -187,6 +187,32 @@ export {
|
|
|
187
187
|
createObservableGauge,
|
|
188
188
|
} from './metric-helpers';
|
|
189
189
|
|
|
190
|
+
// LLM-tuned histogram buckets — pass genAiMetricViews() to your
|
|
191
|
+
// MeterProvider so gen_ai.* histograms have useful resolution.
|
|
192
|
+
export {
|
|
193
|
+
GEN_AI_DURATION_BUCKETS_SECONDS,
|
|
194
|
+
GEN_AI_TOKEN_USAGE_BUCKETS,
|
|
195
|
+
GEN_AI_COST_USD_BUCKETS,
|
|
196
|
+
genAiMetricViews,
|
|
197
|
+
llmHistogramAdvice,
|
|
198
|
+
} from './gen-ai-metrics';
|
|
199
|
+
|
|
200
|
+
// OTel GenAI span event helpers — record prompt-sent / response-received
|
|
201
|
+
// / retry / tool-call / stream-first-token as timestamped events aligned
|
|
202
|
+
// with the published GenAI semantic conventions.
|
|
203
|
+
export {
|
|
204
|
+
recordPromptSent,
|
|
205
|
+
recordResponseReceived,
|
|
206
|
+
recordRetry,
|
|
207
|
+
recordToolCall,
|
|
208
|
+
recordStreamFirstToken,
|
|
209
|
+
type PromptSentEvent,
|
|
210
|
+
type ResponseReceivedEvent,
|
|
211
|
+
type RetryEvent,
|
|
212
|
+
type ToolCallEvent,
|
|
213
|
+
type StreamFirstTokenEvent,
|
|
214
|
+
} from './gen-ai-events';
|
|
215
|
+
|
|
190
216
|
// Tracer helpers for custom spans
|
|
191
217
|
export {
|
|
192
218
|
getTracer,
|