RubyGems - riffer - Versions diffs - 0.32.0 → 0.33.0 - Mend

riffer 0.32.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

checksums.yaml +4 -4
data/.release-please-manifest.json +1 -1
data/.ruby-version +1 -1
data/CHANGELOG.md +34 -0
data/README.md +13 -11
data/docs/01_OVERVIEW.md +2 -0
data/docs/04_AGENT_LIFECYCLE.md +15 -13
data/docs/08_MESSAGES.md +39 -5
data/docs/09_STREAM_EVENTS.md +14 -0
data/docs/10_CONFIGURATION.md +73 -4
data/docs/13_SKILLS.md +66 -4
data/docs/14_MCP.md +2 -1
data/docs/16_TRACING.md +250 -0
data/docs/17_METRICS.md +123 -0
data/docs/providers/07_CUSTOM_PROVIDERS.md +44 -0
data/lib/riffer/agent/response.rb +11 -2
data/lib/riffer/agent/run.rb +136 -35
data/lib/riffer/agent.rb +5 -5
data/lib/riffer/config.rb +231 -15
data/lib/riffer/guardrail.rb +8 -0
data/lib/riffer/guardrails/runner.rb +33 -0
data/lib/riffer/helpers/boolean.rb +22 -0
data/lib/riffer/mcp/authenticated_tool.rb +14 -20
data/lib/riffer/mcp/registration.rb +4 -4
data/lib/riffer/mcp/tool.rb +23 -0
data/lib/riffer/mcp/tool_factory.rb +14 -22
data/lib/riffer/messages/assistant.rb +15 -3
data/lib/riffer/messages/base.rb +2 -1
data/lib/riffer/metrics/instruments.rb +25 -0
data/lib/riffer/metrics/null.rb +14 -0
data/lib/riffer/metrics/otel.rb +79 -0
data/lib/riffer/metrics.rb +93 -0
data/lib/riffer/providers/amazon_bedrock.rb +57 -21
data/lib/riffer/providers/anthropic.rb +59 -24
data/lib/riffer/providers/azure_open_ai.rb +7 -0
data/lib/riffer/providers/base.rb +247 -15
data/lib/riffer/providers/finish_reason.rb +27 -0
data/lib/riffer/providers/gemini.rb +59 -11
data/lib/riffer/providers/mock.rb +30 -9
data/lib/riffer/providers/open_ai.rb +78 -24
data/lib/riffer/providers/open_router.rb +56 -16
data/lib/riffer/providers/repository.rb +9 -0
data/lib/riffer/providers/token_usage.rb +27 -11
data/lib/riffer/skills/activate_tool.rb +12 -2
data/lib/riffer/skills/adapter.rb +15 -0
data/lib/riffer/skills/context.rb +78 -11
data/lib/riffer/skills/frontmatter.rb +13 -5
data/lib/riffer/skills/markdown_adapter.rb +1 -1
data/lib/riffer/skills/xml_adapter.rb +1 -1
data/lib/riffer/stream_events/finish_reason_done.rb +34 -0
data/lib/riffer/tools/runtime.rb +99 -3
data/lib/riffer/tracing/capture.rb +92 -0
data/lib/riffer/tracing/null.rb +61 -0
data/lib/riffer/tracing/otel.rb +131 -0
data/lib/riffer/tracing/stream_recorder.rb +51 -0
data/lib/riffer/tracing.rb +78 -0
data/lib/riffer/version.rb +1 -1
data/sig/_private/opentelemetry.rbs +22 -0
data/sig/generated/riffer/agent/response.rbs +9 -2
data/sig/generated/riffer/agent/run.rbs +28 -8
data/sig/generated/riffer/config.rbs +162 -16
data/sig/generated/riffer/guardrail.rbs +6 -0
data/sig/generated/riffer/guardrails/runner.rbs +14 -0
data/sig/generated/riffer/helpers/boolean.rbs +11 -0
data/sig/generated/riffer/mcp/authenticated_tool.rbs +6 -8
data/sig/generated/riffer/mcp/registration.rbs +4 -4
data/sig/generated/riffer/mcp/tool.rbs +19 -0
data/sig/generated/riffer/mcp/tool_factory.rbs +8 -7
data/sig/generated/riffer/messages/assistant.rbs +10 -4
data/sig/generated/riffer/metrics/instruments.rbs +13 -0
data/sig/generated/riffer/metrics/null.rbs +10 -0
data/sig/generated/riffer/metrics/otel.rbs +47 -0
data/sig/generated/riffer/metrics.rbs +71 -0
data/sig/generated/riffer/providers/amazon_bedrock.rbs +35 -14
data/sig/generated/riffer/providers/anthropic.rbs +41 -20
data/sig/generated/riffer/providers/azure_open_ai.rbs +5 -0
data/sig/generated/riffer/providers/base.rbs +78 -2
data/sig/generated/riffer/providers/finish_reason.rbs +19 -0
data/sig/generated/riffer/providers/gemini.rbs +25 -2
data/sig/generated/riffer/providers/mock.rbs +16 -5
data/sig/generated/riffer/providers/open_ai.rbs +44 -22
data/sig/generated/riffer/providers/open_router.rbs +31 -12
data/sig/generated/riffer/providers/repository.rbs +7 -0
data/sig/generated/riffer/providers/token_usage.rbs +20 -10
data/sig/generated/riffer/skills/activate_tool.rbs +7 -1
data/sig/generated/riffer/skills/adapter.rbs +10 -0
data/sig/generated/riffer/skills/context.rbs +52 -4
data/sig/generated/riffer/skills/frontmatter.rbs +10 -3
data/sig/generated/riffer/stream_events/finish_reason_done.rbs +21 -0
data/sig/generated/riffer/tools/runtime.rbs +35 -0
data/sig/generated/riffer/tracing/capture.rbs +46 -0
data/sig/generated/riffer/tracing/null.rbs +46 -0
data/sig/generated/riffer/tracing/otel.rbs +83 -0
data/sig/generated/riffer/tracing/stream_recorder.rbs +31 -0
data/sig/generated/riffer/tracing.rbs +52 -0
data/sig/manual/riffer/helpers/boolean.rbs +5 -0
data/sig/manual/riffer/metrics/null.rbs +5 -0
data/sig/manual/riffer/metrics.rbs +5 -0
data/sig/manual/riffer/providers.rbs +9 -0
data/sig/manual/riffer/tracing/capture.rbs +5 -0
data/sig/manual/riffer/tracing/null.rbs +5 -0
data/sig/manual/riffer/tracing.rbs +5 -0
metadata +40 -4

data/docs/16_TRACING.md ADDED Viewed

@@ -0,0 +1,250 @@
+# Tracing
+Riffer instruments its agent loop with [OpenTelemetry](https://opentelemetry.io/) spans, following the [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/). The emitted span shape — names, attributes, and hierarchy — is a public, versioned contract you can build dashboards, alerts, and cost reporting against. This page is the reference for that contract.
+Riffer only _emits_ spans. The host application owns the OpenTelemetry SDK, the exporter, sampling, and service naming — the standard OTEL split. Without a host SDK, every span is a silent no-op and Riffer carries no OpenTelemetry gem dependency.
+## Enabling tracing
+Add the OpenTelemetry SDK to your host application and configure an exporter. Riffer detects the API at runtime and starts emitting through whatever provider the SDK configures — there is nothing to switch on in Riffer beyond the SDK being present.
+```ruby
+# Gemfile
+gem "opentelemetry-sdk"
+```
+```ruby
+require "opentelemetry/sdk"
+OpenTelemetry::SDK.configure do |c|
+  c.service_name = "my-agent-host"
+end
+```
+To see Riffer's spans on stdout while developing locally, wire in the console exporter:
+```ruby
+require "opentelemetry/sdk"
+OpenTelemetry::SDK.configure do |c|
+  c.service_name = "my-agent-host"
+  c.add_span_processor(
+    OpenTelemetry::SDK::Trace::Export::SimpleSpanProcessor.new(
+      OpenTelemetry::SDK::Trace::Export::ConsoleSpanExporter.new
+    )
+  )
+end
+```
+Any backend that implements the OpenTelemetry Traces API ingests Riffer's spans with no second pipeline — including the `datadog` gem (`require "datadog/opentelemetry"`), which routes them through an existing tracer so they nest inside the host's live trace. For real exporter and collector setup (OTLP, sampling, resource attributes), see the [OpenTelemetry Ruby docs](https://opentelemetry.io/docs/languages/ruby/).
+The three tracing knobs — the `enabled` kill switch, opt-in message-content capture, and an explicit tracer provider for tests — live in [Configuration — Tracing](10_CONFIGURATION.md#tracing).
+Spans are emitted under the instrumentation scope named `riffer`, versioned with the Riffer gem version. That scope version is the runtime signal for which release produced a span; see [Stability](#stability).
+## Spans
+Riffer emits four span types. A single agent run produces one `invoke_agent` span wrapping one `chat` span per model call, one `execute_tool` span per tool call, and one `execute_guardrail` span per guardrail execution, interleaved in execution order:
+```
+invoke_agent {agent}             INTERNAL
+├─ execute_guardrail {name}      INTERNAL   (one per before-phase guardrail)
+├─ chat {model}                  CLIENT     (one per LLM call)
+├─ execute_tool {tool}           INTERNAL   (one per tool call)
+│   └─ (host spans nest here via around_tool_call / tool internals)
+├─ execute_guardrail {name}      INTERNAL   (one per after-phase guardrail, after each response)
+├─ chat {model}
+└─ …
+```
+The `execute_tool` span opens _outside_ Riffer's `around_tool_call` hook, so any spans a host emits from that hook — or from inside the tool itself — nest beneath it. See [Advanced Tools](07_TOOL_ADVANCED.md) for the hook.
+### Reading the attribute tables
+Every attribute a span can carry is listed below, including the conditional ones — you can't query a key you don't know exists. The **Present** column tells you when to expect each:
+- **`Always`** — emitted on every span of that type.
+- **`On <something happened>`** (e.g. `On a tripwire`, `On failure`) — _path-conditional_: presence is itself a signal. If `riffer.tripwire.phase` is set, a guardrail tripped. Filter on these with confidence.
+- **`When the provider reports it`** / **`When the caller set it`** — _best-effort_: may be absent even on a perfectly healthy span, because it depends on what the upstream provider returned or what options the caller passed. Guard or coalesce these in queries.
+The contract promise is: **when present**, a key carries the documented meaning and type. It is _not_ a promise that every key appears on every span.
+## `invoke_agent {agent}` — the run span
+`INTERNAL`. One per call to `Agent#generate` or `Agent#stream`. The span name suffix is the agent's identifier (e.g. `invoke_agent weather-agent`).
+| Attribute                                  | Type   | Present                                              |
+| ------------------------------------------ | ------ | ---------------------------------------------------- |
+| `gen_ai.operation.name`                    | string | Always (`"invoke_agent"`)                            |
+| `gen_ai.agent.name`                        | string | Always — the agent's identifier                      |
+| `gen_ai.provider.name`                     | string | Always — see [provider names](#provider-names)       |
+| `gen_ai.request.model`                     | string | Always — the agent's configured model                |
+| `riffer.steps`                             | int    | Always — number of LLM calls in the run              |
+| `gen_ai.usage.input_tokens`                | int    | When the run made an LLM call that reported usage    |
+| `gen_ai.usage.output_tokens`               | int    | When the run made an LLM call that reported usage    |
+| `gen_ai.usage.cache_read.input_tokens`     | int    | When the provider reported cache reads               |
+| `gen_ai.usage.cache_creation.input_tokens` | int    | When the provider reported cache writes              |
+| `riffer.cost`                              | float  | When every call in the run was priced                |
+| `riffer.interrupt.reason`                  | string | On interrupt (e.g. approval needed, max steps)       |
+| `riffer.tripwire.guardrail`                | string | On a guardrail tripwire, when the guardrail is named |
+| `riffer.tripwire.reason`                   | string | On a guardrail tripwire                              |
+| `riffer.tripwire.phase`                    | string | On a guardrail tripwire (`"before"` / `"after"`)     |
+| `error.type`                               | string | On an unhandled exception                            |
+The `riffer.tripwire.*` attributes are the run-level summary of the guardrail that halted the run; `riffer.tripwire.guardrail` carries the same name value as the blocking [`execute_guardrail`](#execute_guardrail-name--the-guardrail-span) span's `riffer.guardrail.name`, so the two join on a single key.
+Usage on this span is the run total, aggregated across every step. See [Token usage and cost](#token-usage-and-cost) for the double-counting trap this creates.
+## `chat {model}` — the LLM call span
+`CLIENT`. One per model call, in both `generate` and `stream`. The span name suffix is the model (e.g. `chat gpt-4`), or just `chat` when no model is set.
+| Attribute                                  | Type     | Present                                                                       |
+| ------------------------------------------ | -------- | ----------------------------------------------------------------------------- |
+| `gen_ai.operation.name`                    | string   | Always (`"chat"`)                                                             |
+| `gen_ai.provider.name`                     | string   | Always — see [provider names](#provider-names)                                |
+| `gen_ai.request.model`                     | string   | When a model is set                                                           |
+| `gen_ai.request.temperature`               | float    | When the caller set it                                                        |
+| `gen_ai.request.max_tokens`                | int      | When the caller set `max_tokens` or `max_output_tokens`                       |
+| `gen_ai.request.top_p`                     | float    | When the caller set it                                                        |
+| `gen_ai.request.top_k`                     | int      | When the caller set it                                                        |
+| `gen_ai.request.frequency_penalty`         | float    | When the caller set it                                                        |
+| `gen_ai.request.presence_penalty`          | float    | When the caller set it                                                        |
+| `gen_ai.request.seed`                      | int      | When the caller set it                                                        |
+| `gen_ai.request.stop_sequences`            | string[] | When the caller set it                                                        |
+| `gen_ai.usage.input_tokens`                | int      | When the provider reported usage                                              |
+| `gen_ai.usage.output_tokens`               | int      | When the provider reported usage                                              |
+| `gen_ai.usage.cache_read.input_tokens`     | int      | When the provider reported cache reads                                        |
+| `gen_ai.usage.cache_creation.input_tokens` | int      | When the provider reported cache writes                                       |
+| `riffer.cost`                              | float    | When the call's model was priced                                              |
+| `gen_ai.response.finish_reasons`           | string[] | When the provider reported a finish reason                                    |
+| `riffer.finish_reason.raw`                 | string   | When the raw value differs from the normalized one                            |
+| `gen_ai.input.messages`                    | string   | When `capture_messages` is on (JSON; see [capture](#message-content-capture)) |
+| `gen_ai.system_instructions`               | string   | When `capture_messages` is on and a system prompt exists                      |
+| `gen_ai.output.messages`                   | string   | When `capture_messages` is on (JSON)                                          |
+| `error.type`                               | string   | On an unhandled exception                                                     |
+`gen_ai.response.finish_reasons` is an array of exactly one normalized value, from the fixed vocabulary `stop`, `length`, `tool_calls`, `content_filter`, `error`, `other`. When the provider's raw wire value carries more nuance than the normalized one, the raw string is preserved on `riffer.finish_reason.raw`.
+## `execute_tool {tool}` — the tool call span
+`INTERNAL`. One per tool call dispatched by the runtime. The span name suffix is the tool's name (e.g. `execute_tool get_weather`).
+| Attribute                    | Type   | Present                                                                 |
+| ---------------------------- | ------ | ----------------------------------------------------------------------- |
+| `gen_ai.operation.name`      | string | Always (`"execute_tool"`)                                               |
+| `gen_ai.tool.name`           | string | Always                                                                  |
+| `gen_ai.tool.call.id`        | string | Always — the originating tool-call id                                   |
+| `error.type`                 | string | On a tool error (see below)                                             |
+| `gen_ai.tool.call.arguments` | string | When `capture_messages` is on (see [capture](#message-content-capture)) |
+| `gen_ai.tool.call.result`    | string | When `capture_messages` is on                                           |
+A tool failure comes in two shapes, distinguished by span status:
+- **Handled error** — the tool returned an error response. `error.type` carries the category and the **span status stays unset** (the run continues). The framework's categories are `unknown_tool`, `validation_error`, `timeout_error`, and `execution_error`; a custom tool may set its own via `Riffer::Tools::Response.error(type:)`.
+- **Unhandled exception** — the dispatch raised. `error.type` is the exception class name and the **span status is `ERROR`**, with the exception recorded.
+This status convention is the same on `chat` and `invoke_agent`: an unhandled exception sets `error.type` to the class name and marks the span `ERROR`; everything else leaves the status unset.
+## `execute_guardrail {name}` — the guardrail span
+`INTERNAL`. One per guardrail execution; a guardrail registered for both phases runs — and emits a span — once in each. The span name suffix is the guardrail's name (e.g. `execute_guardrail profanity_filter`), from `Riffer::Guardrail#name` — the converted class name by default, overridable to relabel the span. This is the one Riffer span with **no `gen_ai.operation.name`**. A guardrail is not a GenAI semantic-convention operation, so the span stays entirely in Riffer's own namespace rather than squatting an invented value on the standardized key.
+| Attribute                 | Type   | Present                                                     |
+| ------------------------- | ------ | ----------------------------------------------------------- |
+| `riffer.guardrail.name`   | string | Always — the guardrail's name                               |
+| `riffer.guardrail.phase`  | string | Always (`"before"` / `"after"`)                             |
+| `riffer.guardrail.action` | string | On a returned result (`"pass"` / `"transform"` / `"block"`) |
+| `riffer.tripwire.reason`  | string | On a block — the block reason                               |
+| `error.type`              | string | On an unhandled exception                                   |
+`riffer.guardrail.*` holds the facts true of any execution — name, phase, action. A reason exists only on a block, so it reuses the run-level `riffer.tripwire.reason` key: one query finds the reason on both the per-guardrail span and the enclosing `invoke_agent` summary.
+A block is a **handled outcome**: `riffer.guardrail.action` is `block` and the **span status stays unset** — the same convention `execute_tool` uses for a returned error response. Only a guardrail that **raises** sets `error.type` to the exception class name and marks the **span status `ERROR`** (with the exception recorded); on a raise no result is produced, so `riffer.guardrail.action` is absent.
+## Example trace
+A `generate` run where the model calls one tool, then answers — with one `before` guardrail and one `after` guardrail, using the OpenAI provider with `gpt-4`. The `after` guardrail runs once per model response, so it appears after each `chat`:
+```
+invoke_agent weather-agent          INTERNAL
+  gen_ai.agent.name      = weather-agent
+  gen_ai.provider.name   = openai
+  gen_ai.request.model   = gpt-4
+  riffer.steps           = 2
+  gen_ai.usage.input_tokens  = 1240
+  gen_ai.usage.output_tokens = 86
+  riffer.cost                = 0.0423
+├─ execute_guardrail input_filter   INTERNAL
+│    riffer.guardrail.name   = input_filter
+│    riffer.guardrail.phase  = before
+│    riffer.guardrail.action = pass
+├─ chat gpt-4                       CLIENT
+│    gen_ai.request.model            = gpt-4
+│    gen_ai.response.finish_reasons  = ["tool_calls"]
+│    gen_ai.usage.input_tokens       = 612
+│    gen_ai.usage.output_tokens      = 48
+│    riffer.cost                     = 0.0212
+├─ execute_guardrail output_filter  INTERNAL
+│    riffer.guardrail.name   = output_filter
+│    riffer.guardrail.phase  = after
+│    riffer.guardrail.action = pass
+├─ execute_tool get_weather         INTERNAL
+│    gen_ai.tool.name     = get_weather
+│    gen_ai.tool.call.id  = tc_42
+├─ chat gpt-4                       CLIENT
+│    gen_ai.request.model            = gpt-4
+│    gen_ai.response.finish_reasons  = ["stop"]
+│    gen_ai.usage.input_tokens       = 628
+│    gen_ai.usage.output_tokens      = 38
+│    riffer.cost                     = 0.0211
+└─ execute_guardrail output_filter  INTERNAL
+     riffer.guardrail.name   = output_filter
+     riffer.guardrail.phase  = after
+     riffer.guardrail.action = pass
+```
+## Token usage and cost
+`gen_ai.usage.input_tokens` is the **total** prompt tokens for the call, **cache-inclusive**, per the GenAI semantic conventions. `gen_ai.usage.cache_read.input_tokens` and `gen_ai.usage.cache_creation.input_tokens` are **subsets of that total** — the portion served from, or written to, the provider's prompt cache. They are _not_ additional tokens; do not add them on top of `input_tokens`.
+```
+input_tokens                 = 1000
+cache_read.input_tokens      =  800   → 800 of the 1000 were cache hits
+                                        (≈ 200 billed as new input)
+```
+Riffer normalizes this across providers, so the number may differ from a provider's native API field. Anthropic's raw `input_tokens` _excludes_ the cache buckets — Riffer folds them in. OpenAI's already includes them. Either way the span value means the same thing.
+**Don't double-count across spans.** Usage on a `chat` span is per-call; usage on the enclosing `invoke_agent` span is the run total already summed across every `chat`. Aggregate one level or the other, never both.
+### Cost
+`riffer.cost` is the modeled cost of one call (on a `chat` span) or a whole run (on the `invoke_agent` span). It lives in Riffer's own namespace because the GenAI semantic conventions define no cost attribute by design — Riffer never squats `gen_ai.*` for it. The attribute appears only when you have configured pricing for the model in use: Riffer ships no price table and never guesses, so an unpriced model simply carries no `riffer.cost`. See [Configuration — Pricing](10_CONFIGURATION.md#pricing) for the rates.
+The value is **unitless on the wire** — Riffer attaches no currency. It is the sum of the per-token rates you configured, in whatever currency you expressed them, so a `riffer.cost` of `0.0123` means 0.0123 of that unit. The raw float is emitted unrounded; round for display in your backend, not before.
+**Run cost is all-or-nothing.** The `riffer.cost` on an `invoke_agent` span is the sum of its per-call costs, present only when **every** call in the run was priced. A single unpriced call makes the run-level `riffer.cost` absent — a nil cost is absorbing in the sum (any nil term makes the total nil), so Riffer reports no run total rather than a partial one that would silently under-report spend. The priced `chat` spans still each carry their own `riffer.cost`; sum those yourself if a partial total is what you want.
+## Message content capture
+The prompt and completion content attributes — `gen_ai.input.messages`, `gen_ai.output.messages`, `gen_ai.system_instructions` on `chat`, and `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` on `execute_tool` — are **off by default** and gated behind `config.tracing.capture_messages`. Message content routinely carries sensitive data (including PHI); leave capture off unless your trace backend is an appropriate destination for it.
+When enabled, content is serialized as GenAI-semconv JSON strings. File attachments serialize as metadata-only stubs (media type and name, never bytes). Riffer applies no size limit of its own — cap oversized attributes with the OTEL SDK's attribute length limits. See [Configuration — Tracing](10_CONFIGURATION.md#tracing) for the knob.
+## Provider names
+`gen_ai.provider.name` carries a GenAI-semconv well-known value where one exists: `openai`, `anthropic`, `aws.bedrock`, `azure.ai.openai`, `gcp.gemini`, `openrouter`. A custom provider that doesn't override the value defaults to the snake_cased form of its class name, so enabling tracing never breaks an otherwise-working provider.
+## Stability
+The span and attribute shape is a public, versioned contract, in two tiers:
+- **`gen_ai.*`** tracks the OpenTelemetry GenAI semantic conventions, pinned to schema version `1.37.0`. That convention is still "Development" status upstream and its attribute names may change; Riffer absorbs such renames deliberately in a release, never silently, with a CHANGELOG entry.
+- **`riffer.*`** is Riffer-owned (`riffer.steps`, `riffer.cost`, `riffer.interrupt.reason`, `riffer.tripwire.*`, `riffer.guardrail.*`, `riffer.finish_reason.raw`) and changes only through a normal version bump and CHANGELOG entry.
+The semantic-convention schema version is a documented pin rather than a span attribute — the OpenTelemetry Ruby API can't attach a schema URL to a tracer. The runtime version signal is the instrumentation scope: every span carries scope name `riffer` at the gem version that emitted it. Pin the Riffer version your dashboards depend on, and watch the CHANGELOG for tracing entries before upgrading.
+## Avoid double instrumentation
+Riffer instruments the agent loop natively. Running a provider-level GenAI instrumentation gem (for example an OpenTelemetry contrib instrumentation for the underlying Anthropic or OpenAI client) _alongside_ Riffer duplicates the `chat` spans and double-counts token usage. Run one or the other, not both — disable the provider-level instrumentation when Riffer's loop spans are active.

data/docs/17_METRICS.md ADDED Viewed

@@ -0,0 +1,123 @@
+# Metrics
+Riffer can record [OpenTelemetry](https://opentelemetry.io/) metric instruments alongside its [spans](16_TRACING.md), following the [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/). Metric names, instrument types, units, and attributes are a public, versioned contract you can build dashboards and alerts against. This page is the reference for that contract.
+As with tracing, Riffer only _records_ instruments. The host application owns the OpenTelemetry SDK, the metric reader, the exporter, and the aggregation — the standard OTEL split. Without a host SDK, every measurement is a silent no-op and Riffer carries no OpenTelemetry gem dependency.
+> **OpenTelemetry metrics for Ruby is still pre-1.0.** The metrics API and SDK ship as experimental gems (`opentelemetry-metrics-api`, `opentelemetry-metrics-sdk`), separate from the stable 1.x traces API. Riffer guards against an incompatible API version and falls back to a no-op outside the supported range, but expect the host-side wiring below to evolve with those gems.
+## Enabling metrics
+Add the OpenTelemetry metrics SDK to your host application and register a metric reader with an exporter. Riffer detects the metrics API at runtime and starts recording through whatever meter provider the SDK configures.
+```ruby
+# Gemfile
+gem "opentelemetry-metrics-sdk"
+```
+```ruby
+require "opentelemetry-metrics-sdk"
+OpenTelemetry::SDK.configure do |c|
+  c.service_name = "my-agent-host"
+end
+```
+The metrics SDK is **separate** from the traces SDK (`opentelemetry-sdk`); add it explicitly even if you already trace. Any backend implementing the OpenTelemetry Metrics API ingests Riffer's instruments. For real reader and exporter setup (OTLP, periodic export, Views), see the [OpenTelemetry Ruby docs](https://opentelemetry.io/docs/languages/ruby/).
+The two metrics knobs — the `enabled` kill switch and an explicit meter provider for tests — live in [Configuration — Metrics](10_CONFIGURATION.md#metrics). They are **independent** of the tracing knobs: you can run tracing while metrics are off, or the reverse.
+Instruments are recorded under the instrumentation scope named `riffer`, versioned with the Riffer gem version — the runtime signal for which release produced a measurement; see [Stability](#stability).
+### Bucket boundaries
+Histogram bucket boundaries are a **host-side** concern. The OpenTelemetry metrics API does not let an instrumenting library attach bucket boundaries at instrument creation, so Riffer does not set them — the SDK's default buckets apply unless you override them. To match the GenAI semantic conventions' recommended boundaries (or your own), register a [View](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#view) on the meter provider that targets the instrument by name and sets explicit bucket boundaries.
+The convention recommends boundaries scaled to each instrument, so register one View per histogram — the token-count buckets below are for `gen_ai.client.token.usage`; `gen_ai.client.operation.duration` wants its own latency-scaled set, and `riffer.gen_ai.cost` a USD-scaled one.
+```ruby
+require "opentelemetry-metrics-sdk"
+OpenTelemetry::SDK.configure do |c|
+  c.service_name = "my-agent-host"
+end
+# The GenAI semconv's recommended token-count boundaries. Register the View
+# before Riffer records its first measurement.
+OpenTelemetry.meter_provider.add_view(
+  "gen_ai.client.token.usage",
+  aggregation: OpenTelemetry::SDK::Metrics::Aggregation::ExplicitBucketHistogram.new(
+    boundaries: [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
+  )
+)
+```
+## Instruments
+Each instrument is documented here as a row carrying its name, instrument type, unit, and attribute set.
+### `gen_ai.client.operation.duration`
+Histogram, unit `s`. The latency of a single GenAI operation, measured around the same wrapped call as the matching [span](16_TRACING.md), on both the success and error paths, and timed with a monotonic clock. Recording is independent of tracing — the metric is recorded even with `config.tracing.enabled = false`. Tell the three operations apart by `gen_ai.operation.name`.
+| `gen_ai.operation.name` | Recorded around                                    | Attributes                                                                                                            |
+| ----------------------- | -------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `chat`                  | each provider call (`generate_text`/`stream_text`) | `gen_ai.operation.name`, `gen_ai.provider.name`, `gen_ai.request.model` (when set), `error.type` (on error)           |
+| `invoke_agent`          | each agent run (`generate`/`stream`)               | `gen_ai.operation.name`, `gen_ai.provider.name`, `gen_ai.request.model`, `gen_ai.agent.name`, `error.type` (on error) |
+| `execute_tool`          | each tool call                                     | `gen_ai.operation.name`, `gen_ai.tool.name`, `error.type` (on error)                                                  |
+`error.type` carries the exception class for a raised error; for `execute_tool` it carries the handled error category (e.g. `validation_error`, `timeout_error`) when a tool returns an error result instead of raising — matching the span. `gen_ai.response.model` is not recorded yet; it will land once it is also captured on the chat span.
+> **Streamed operations are consumption-paced.** A streamed `chat` or `invoke_agent` records its duration when the stream drains, so the value includes the time your consumer takes to iterate the events, not just provider latency. The matching span behaves the same way.
+> **`gen_ai.tool.name` cardinality.** One time series exists per distinct tool name. With a large or dynamic tool set (for example MCP-discovered tools) the number of series can grow without bound — drop the attribute with a [View](https://opentelemetry.io/docs/specs/otel/metrics/sdk/#view) if it strains your backend.
+### `gen_ai.client.token.usage`
+Histogram, unit `{token}`. Token volume for a single `chat` call, recorded from the normalized token usage after the provider responds. Each call emits **two data points** — one `input`, one `output` — distinguished by `gen_ai.token.type`. Recording is independent of tracing (it fires with `config.tracing.enabled = false`); for a streamed call it fires when the stream drains. `gen_ai.response.model` is not recorded yet, for the same reason as `operation.duration`.
+| `gen_ai.token.type` | Value                                                 | Attributes                                                                                                              |
+| ------------------- | ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| `input`             | total prompt tokens for the call, **cache-inclusive** | `gen_ai.operation.name` (always `chat`), `gen_ai.provider.name`, `gen_ai.token.type`, `gen_ai.request.model` (when set) |
+| `output`            | tokens generated, including reasoning/thinking tokens | same                                                                                                                    |
+> **Per-call only.** Token usage is never recorded at the run (`invoke_agent`) level. Metrics pre-aggregate, so emitting both the per-call points and a run total would double-count — sum the per-call points in your backend if you want a run total. This is the metric-side counterpart of the span-level [double-count trap](16_TRACING.md#token-usage-and-cost).
+> **Cache buckets stay on spans.** The semconv `gen_ai.token.type` defines only `input` and `output`, so the prompt-cache subsets (`cache_read` / `cache_creation`) live on [spans](16_TRACING.md#token-usage-and-cost), not this metric. The `input` value is the cache-inclusive total, matching the span's `gen_ai.usage.input_tokens`.
+A call that reports no usage records no data points, and a failed call has nothing to count — so this metric carries no `error.type` (the semconv marks it not applicable here, unlike `operation.duration`).
+### `riffer.gen_ai.cost`
+Histogram, unit `USD`. The cost of a single `chat` call, recorded from the [cost](16_TRACING.md#token-usage-and-cost) on the normalized token usage after the provider responds — the same source as the cost span attribute, a different sink. This instrument is Riffer-owned (`riffer.*`, not `gen_ai.*`) so it won't collide if the semantic conventions later define a cost instrument; see [Stability](#stability). Recording is independent of tracing (it fires with `config.tracing.enabled = false`); for a streamed call it fires when the stream drains.
+| Value            | Attributes                                                                                         |
+| ---------------- | -------------------------------------------------------------------------------------------------- |
+| cost of the call | `gen_ai.operation.name` (always `chat`), `gen_ai.provider.name`, `gen_ai.request.model` (when set) |
+Pricing is **consumer-configured** — no price table ships with the gem (see [Configuration — Pricing](10_CONFIGURATION.md#pricing)). A call whose model has no configured price records **no** data point, so this metric covers only priced calls; `operation.duration` and `token.usage` still record. A priced call that computes to `0.0` does record a zero data point — only an absent price means there is nothing to measure.
+> **Per-call only.** Cost is never recorded at the run (`invoke_agent`) level, for the same reason as token usage: metrics pre-aggregate, so emitting both per-call points and a run total would double-count. Sum the per-call points in your backend for a run total.
+## Stability
+The instrument shape is a public, versioned contract, in two tiers — mirroring the [tracing contract](16_TRACING.md#stability):
+- **`gen_ai.*`** tracks the OpenTelemetry GenAI semantic conventions, pinned to schema version `1.37.0`. That convention is still "Development" status upstream and its names may change; Riffer absorbs such renames deliberately in a release, never silently, with a CHANGELOG entry.
+- **`riffer.*`** is Riffer-owned and changes only through a normal version bump and CHANGELOG entry. Riffer-owned metrics live here so they won't collide if semconv later defines an equivalent.
+The semantic-convention schema version is a documented pin rather than an instrument attribute — the OpenTelemetry Ruby API can't attach a schema URL to a meter. The runtime version signal is the instrumentation scope: every measurement carries scope name `riffer` at the gem version that recorded it. Pin the Riffer version your dashboards depend on, and watch the CHANGELOG for metrics entries before upgrading.
+## Avoid double instrumentation
+Riffer records its agent loop's metrics natively. Running a provider-level GenAI instrumentation gem (for example an OpenTelemetry contrib instrumentation for the underlying Anthropic or OpenAI client) _alongside_ Riffer records the same `gen_ai.client.*` metrics twice. Because metrics pre-aggregate, that duplication is **silent** — it inflates counts and distorts distributions in your dashboards without any obvious per-event trace to inspect.
+Record one or the other, not both. Since the metrics kill switch is independent of tracing, the usual resolution is to keep Riffer's spans and turn _Riffer's metrics_ off, letting the provider-level instrumentation own the metrics:
+```ruby
+Riffer.configure do |config|
+  config.metrics.enabled = false
+end
+```
+— or disable the provider-level metric instrumentation and let Riffer own them.

data/docs/providers/07_CUSTOM_PROVIDERS.md CHANGED Viewed

@@ -239,6 +239,50 @@ Riffer::StreamEvents::TokenUsageDone.new(
     output_tokens: 50
   )
 )
+# Finish reason (emit at end of stream)
+Riffer::StreamEvents::FinishReasonDone.new(
+  finish_reason: :stop,
+  raw_finish_reason: "done"
+)
+```
+## Token Usage Semantics
+`Riffer::Providers::TokenUsage` is a normalized contract — map your provider's raw usage into the bucket meanings defined in [Messages — Token Usage Semantics](../08_MESSAGES.md#token-usage-semantics) rather than passing fields through untouched.
+## Finish Reasons
+`Riffer::Providers::FinishReason` is the same kind of normalized contract — map your provider's raw finish/stop value into the vocabulary defined in [Messages — Finish Reasons](../08_MESSAGES.md#finish-reasons) (`:stop`, `:length`, `:tool_calls`, `:content_filter`, `:error`, `:other`), keeping the raw wire value alongside:
+```ruby
+def extract_finish_reason(response)
+  raw = response.stop_reason
+  return nil unless raw
+  Riffer::Providers::FinishReason.new(
+    reason: {"done" => :stop, "max_len" => :length}.fetch(raw, :other),
+    raw: raw
+  )
+end
+```
+The hook is optional — the base class defaults to `nil` (no finish reason reported). Map unmapped values to `:other`; never raise on a novel wire value.
+For streaming, emit a `FinishReasonDone` event near the end of `execute_stream`:
+```ruby
+yielder << Riffer::StreamEvents::FinishReasonDone.new(finish_reason: :stop, raw_finish_reason: "done")
+```
+## Trace Provider Name
+LLM-call and agent-run spans stamp `gen_ai.provider.name` from the `semconv_provider_name` class method. The default is your snake_cased class name; override it when a [GenAI semconv well-known value](https://opentelemetry.io/docs/specs/semconv/gen-ai/) exists for your provider:
+```ruby
+def self.semconv_provider_name
+  "my_provider"
+end
 ```
 ## Error Handling

data/lib/riffer/agent/response.rb CHANGED Viewed

@@ -28,6 +28,13 @@ class Riffer::Agent::Response
   # The parsed structured output, if structured output was configured.
   attr_reader :structured_output #: Hash[Symbol, untyped]?
+  # The aggregate token usage across this run's LLM calls, if any was reported.
+  attr_reader :token_usage #: Riffer::Providers::TokenUsage?
+  # The number of LLM calls made during this run (0 when a before-guardrail
+  # blocks before any call). Distinct from the session's cumulative step count.
+  attr_reader :steps #: Integer
   # The full message history from the agent conversation.
   attr_reader :messages #: Array[Riffer::Messages::Base]
@@ -36,8 +43,8 @@ class Riffer::Agent::Response
   attr_reader :healed_tool_call_ids #: Array[String]
   #--
-  #: (String, ?tripwire: Riffer::Guardrails::Tripwire?, ?modifications: Array[Riffer::Guardrails::Modification], ?interrupted: bool, ?interrupt_reason: (String | Symbol)?, ?structured_output: Hash[Symbol, untyped]?, ?messages: Array[Riffer::Messages::Base], ?healed_tool_call_ids: Array[String]) -> void
-  def initialize(content, tripwire: nil, modifications: [], interrupted: false, interrupt_reason: nil, structured_output: nil, messages: [], healed_tool_call_ids: [])
+  #: (String, ?tripwire: Riffer::Guardrails::Tripwire?, ?modifications: Array[Riffer::Guardrails::Modification], ?interrupted: bool, ?interrupt_reason: (String | Symbol)?, ?structured_output: Hash[Symbol, untyped]?, ?messages: Array[Riffer::Messages::Base], ?healed_tool_call_ids: Array[String], ?token_usage: Riffer::Providers::TokenUsage?, ?steps: Integer) -> void
+  def initialize(content, tripwire: nil, modifications: [], interrupted: false, interrupt_reason: nil, structured_output: nil, messages: [], healed_tool_call_ids: [], token_usage: nil, steps: 0)
     @content = content
     @tripwire = tripwire
     @modifications = modifications
@@ -46,6 +53,8 @@ class Riffer::Agent::Response
     @structured_output = structured_output
     @messages = messages
     @healed_tool_call_ids = healed_tool_call_ids
+    @token_usage = token_usage
+    @steps = steps
   end
   # Returns true if the response was blocked by a guardrail.