PyPI - forgesight-eval - Versions diffs - 0.1.1__tar.gz - Mend

forgesight-eval 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

forgesight_eval-0.1.1/.gitignore +38 -0
forgesight_eval-0.1.1/PKG-INFO +88 -0
forgesight_eval-0.1.1/README.md +63 -0
forgesight_eval-0.1.1/pyproject.toml +41 -0
forgesight_eval-0.1.1/src/forgesight_eval/__init__.py +18 -0
forgesight_eval-0.1.1/src/forgesight_eval/_config.py +73 -0
forgesight_eval-0.1.1/src/forgesight_eval/api.py +179 -0
forgesight_eval-0.1.1/src/forgesight_eval/model.py +28 -0
forgesight_eval-0.1.1/src/forgesight_eval/py.typed +0 -0
forgesight_eval-0.1.1/tests/test_eval.py +244 -0

forgesight_eval-0.1.1/.gitignore ADDED Viewed

@@ -0,0 +1,38 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.eggs/
+build/
+dist/
+*.so
+# venv / tooling
+.venv/
+venv/
+.uv/
+uv.lock
+# test / type / lint caches
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+coverage.xml
+htmlcov/
+# secrets / local env (never commit)
+.env
+.env.*
+# editor / OS
+.DS_Store
+.idea/
+.vscode/
+# local-only session working state (per the workspace pipeline)
+.claude/state/
+# local-only launch planning (not part of the published repo)
+/launch/

forgesight_eval-0.1.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,88 @@
+Metadata-Version: 2.4
+Name: forgesight-eval
+Version: 0.1.1
+Summary: ForgeSight eval — run-correlated eval scores + human feedback on the telemetry pipeline.
+Project-URL: Homepage, https://github.com/Scaffoldic/forgesight
+Project-URL: Repository, https://github.com/Scaffoldic/forgesight
+Project-URL: Issues, https://github.com/Scaffoldic/forgesight/issues
+Project-URL: Changelog, https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md
+Author: kjoshi
+License-Expression: Apache-2.0
+Keywords: ai-agents,evaluation,feedback,forgesight,llm-as-judge,observability
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: System :: Monitoring
+Classifier: Typing :: Typed
+Requires-Python: >=3.11
+Requires-Dist: forgesight-core
+Description-Content-Type: text/markdown
+# forgesight-eval
+Run-correlated **eval scores and human feedback** for [ForgeSight](https://github.com/Scaffoldic/forgesight).
+Quality joins cost and structure on the *same* `run_id`, on the *same* pipeline, in the *same*
+backends — Langfuse scores, Phoenix evaluations, any OTLP sink — for two function calls.
+```bash
+pip install forgesight-eval
+```
+```python
+from forgesight import telemetry
+from forgesight_eval import record_evaluation, record_feedback
+with telemetry.agent_run("rag-answerer", metadata={"prompt_version": "v7"}) as run:
+    answer = await answer_question(...)
+    # an automated eval (LLM-as-judge / Ragas / DeepEval) → one call, correlated to the live run
+    record_evaluation("faithfulness", score=0.91, label="pass",
+                      explanation="All claims grounded.", evaluator="ragas")
+# hours later, from a webhook — post-hoc human feedback, by run_id:
+def on_thumbs_down(run_id: str, comment: str) -> None:
+    record_feedback("user_satisfaction", run_id=run_id, label="thumbs_down",
+                    score=0.0, comment=comment, source="human")
+```
+## How it works
+- **`record_evaluation`** attaches to the **current** run (ambient context) — a real-time eval
+  nests as a child span under the run's open trace. **`record_feedback`** attaches to a **past**
+  run by `run_id` — a standalone record carrying the id so the backend re-associates it.
+- Both emit the OTel **`gen_ai.evaluation.*`** attributes (`name` / `score.value` / `score.label`
+  / `explanation`) plus `forgesight.evaluation.*` extensions (`source`, `realtime`, `evaluator`),
+  and publish an `EVALUATION_RECORDED` lifecycle event. Backends that speak the convention
+  (Langfuse, Phoenix) display them as scores/evaluations with **no per-backend code** (P4).
+- **Non-blocking** (P6): both build a record and enqueue — no network I/O on the agent or the
+  webhook. **Secure by default** (P7): `explanation` / `comment` text is gated by
+  `capture_explanation` *and* the global content-capture switch.
+## Configuration
+```yaml
+modules:
+  eval:
+    enabled: true              # master switch (default false — install ≠ active)
+    emit_as: "span"            # span | event
+    capture_explanation: true  # still gated by the global content-capture switch (P7)
+    score_schema:              # optional — validates score/label at the call site
+      faithfulness:      { type: "numeric", min: 0.0, max: 1.0 }
+      user_satisfaction: { type: "categorical", labels: ["thumbs_up", "thumbs_down"] }
+```
+At least one of `score` / `label` must be set. Schema'd dimensions are validated (numeric range,
+categorical membership); un-schema'd dimensions are accepted unvalidated (open set).
+## Out of scope
+Running the evaluations (judges/metrics live in the agent or a library — this is the transport),
+a dashboard, and score storage/aggregation (the backend's job).
+## License
+Apache-2.0

forgesight_eval-0.1.1/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# forgesight-eval
+Run-correlated **eval scores and human feedback** for [ForgeSight](https://github.com/Scaffoldic/forgesight).
+Quality joins cost and structure on the *same* `run_id`, on the *same* pipeline, in the *same*
+backends — Langfuse scores, Phoenix evaluations, any OTLP sink — for two function calls.
+```bash
+pip install forgesight-eval
+```
+```python
+from forgesight import telemetry
+from forgesight_eval import record_evaluation, record_feedback
+with telemetry.agent_run("rag-answerer", metadata={"prompt_version": "v7"}) as run:
+    answer = await answer_question(...)
+    # an automated eval (LLM-as-judge / Ragas / DeepEval) → one call, correlated to the live run
+    record_evaluation("faithfulness", score=0.91, label="pass",
+                      explanation="All claims grounded.", evaluator="ragas")
+# hours later, from a webhook — post-hoc human feedback, by run_id:
+def on_thumbs_down(run_id: str, comment: str) -> None:
+    record_feedback("user_satisfaction", run_id=run_id, label="thumbs_down",
+                    score=0.0, comment=comment, source="human")
+```
+## How it works
+- **`record_evaluation`** attaches to the **current** run (ambient context) — a real-time eval
+  nests as a child span under the run's open trace. **`record_feedback`** attaches to a **past**
+  run by `run_id` — a standalone record carrying the id so the backend re-associates it.
+- Both emit the OTel **`gen_ai.evaluation.*`** attributes (`name` / `score.value` / `score.label`
+  / `explanation`) plus `forgesight.evaluation.*` extensions (`source`, `realtime`, `evaluator`),
+  and publish an `EVALUATION_RECORDED` lifecycle event. Backends that speak the convention
+  (Langfuse, Phoenix) display them as scores/evaluations with **no per-backend code** (P4).
+- **Non-blocking** (P6): both build a record and enqueue — no network I/O on the agent or the
+  webhook. **Secure by default** (P7): `explanation` / `comment` text is gated by
+  `capture_explanation` *and* the global content-capture switch.
+## Configuration
+```yaml
+modules:
+  eval:
+    enabled: true              # master switch (default false — install ≠ active)
+    emit_as: "span"            # span | event
+    capture_explanation: true  # still gated by the global content-capture switch (P7)
+    score_schema:              # optional — validates score/label at the call site
+      faithfulness:      { type: "numeric", min: 0.0, max: 1.0 }
+      user_satisfaction: { type: "categorical", labels: ["thumbs_up", "thumbs_down"] }
+```
+At least one of `score` / `label` must be set. Schema'd dimensions are validated (numeric range,
+categorical membership); un-schema'd dimensions are accepted unvalidated (open set).
+## Out of scope
+Running the evaluations (judges/metrics live in the agent or a library — this is the transport),
+a dashboard, and score storage/aggregation (the backend's job).
+## License
+Apache-2.0

forgesight_eval-0.1.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,41 @@
+[project]
+name = "forgesight-eval"
+version = "0.1.1"
+description = "ForgeSight eval — run-correlated eval scores + human feedback on the telemetry pipeline."
+readme = "README.md"
+requires-python = ">=3.11"
+license = "Apache-2.0"
+authors = [{ name = "kjoshi" }]
+keywords = ["observability", "evaluation", "feedback", "llm-as-judge", "ai-agents", "forgesight"]
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Topic :: System :: Monitoring",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Typing :: Typed",
+]
+dependencies = ["forgesight-core"]
+[project.entry-points."forgesight.modules"]
+eval = "forgesight_eval:install"
+[project.urls]
+Homepage = "https://github.com/Scaffoldic/forgesight"
+Repository = "https://github.com/Scaffoldic/forgesight"
+Issues = "https://github.com/Scaffoldic/forgesight/issues"
+Changelog = "https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/forgesight_eval"]
+[tool.uv.sources]
+forgesight-core = { workspace = true }

forgesight_eval-0.1.1/src/forgesight_eval/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""ForgeSight eval — run-correlated eval scores + human feedback on the same pipeline."""
+from __future__ import annotations
+from ._config import EvalConfig, install
+from .api import record_evaluation, record_feedback
+from .model import EvaluationResult
+__version__ = "0.1.0"
+__all__ = [
+    "EvalConfig",
+    "EvaluationResult",
+    "__version__",
+    "install",
+    "record_evaluation",
+    "record_feedback",
+]

forgesight_eval-0.1.1/src/forgesight_eval/_config.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Module config for eval emission: enabled switch, emit_as, explanation gate, score schema.
+``enabled`` defaults **false** — installing the package emits nothing until switched on
+(P2). ``install`` (the ``forgesight.modules`` entry point) sets it from ``modules.eval.*``;
+otherwise it lazy-loads from the SDK's layered settings + ``FORGESIGHT_EVAL_*`` env on first
+use. The optional ``score_schema`` validates score/label at the call site (fail-fast).
+"""
+from __future__ import annotations
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from typing import Any
+from forgesight_core.config import load_settings
+@dataclass
+class EvalConfig:
+    """Eval-module settings: master switch, emission mode, explanation gate, score schema."""
+    # Master switch; off by default, and nothing is emitted while it is off.
+    enabled: bool = False
+    emit_as: str = "span"  # "span" | "event"
+    # Module-level gate for capturing free-text explanation / comment.
+    capture_explanation: bool = True
+    # Validation rules keyed by evaluation name; names without an entry are not validated.
+    score_schema: dict[str, dict[str, Any]] = field(default_factory=dict)
+_config: EvalConfig | None = None
+def _as_bool(value: object, default: bool) -> bool:
+    if value is None:
+        return default
+    if isinstance(value, bool):
+        return value
+    return str(value).strip().lower() in ("1", "true", "yes", "on")
+def _build(block: Mapping[str, Any]) -> EvalConfig:
+    emit_as = str(os.environ.get("FORGESIGHT_EVAL_EMIT_AS") or block.get("emit_as") or "span")
+    if emit_as not in ("span", "event"):
+        raise ValueError(f"modules.eval.emit_as must be span|event, got {emit_as!r}")
+    enabled_env = os.environ.get("FORGESIGHT_EVAL_ENABLED")
+    enabled = _as_bool(enabled_env, _as_bool(block.get("enabled"), False))
+    schema = block.get("score_schema")
+    return EvalConfig(
+        enabled=enabled,
+        emit_as=emit_as,
+        capture_explanation=_as_bool(block.get("capture_explanation"), True),
+        score_schema=dict(schema) if isinstance(schema, Mapping) else {},
+    )
+def install(config: Mapping[str, Any] | None = None) -> bool:
+    """The ``forgesight.modules`` entry point: set the eval module config. Returns ``enabled``."""
+    global _config
+    _config = _build(config or {})
+    return _config.enabled
+def get_config() -> EvalConfig:
+    """Return the active config, lazily loading from ``modules.eval`` settings if unset."""
+    global _config
+    if _config is None:
+        block = load_settings().get("modules")
+        eval_block = block.get("eval") if isinstance(block, Mapping) else None
+        _config = _build(eval_block if isinstance(eval_block, Mapping) else {})
+    return _config
+def reset_config() -> None:
+    """Drop the cached config so the next ``get_config`` call rebuilds it (tests / re-configure)."""
+    global _config
+    _config = None

forgesight_eval-0.1.1/src/forgesight_eval/api.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""``record_evaluation`` / ``record_feedback`` — run-correlated quality on the same pipeline.
+``record_evaluation`` attaches to the **current** run (ambient context) — a real-time eval
+nests as a child span under the run's still-open trace. ``record_feedback`` attaches to a
+**past** run by ``run_id`` — a standalone record carrying the id so the backend re-associates
+it. Both build an :class:`EvaluationResult`, emit it as a record with the OTel
+``gen_ai.evaluation.*`` attributes (so it streams to Langfuse scores / Phoenix evaluations /
+any OTLP sink, P4) and publish an ``EVALUATION_RECORDED`` event. Non-blocking (P6).
+"""
+from __future__ import annotations
+import time
+from collections.abc import Mapping
+from forgesight_api import EventType, Kind, LifecycleEvent, Record, RunStatus, new_trace_id
+from forgesight_core import current_context, current_run_scope, get_runtime, new_span_id
+from ._config import get_config
+from .model import EvaluationResult
+# OTel GenAI evaluation attributes (otel-semantic-conventions §4.3).
+GEN_AI_EVALUATION_NAME = "gen_ai.evaluation.name"
+GEN_AI_EVALUATION_SCORE_VALUE = "gen_ai.evaluation.score.value"
+GEN_AI_EVALUATION_SCORE_LABEL = "gen_ai.evaluation.score.label"
+GEN_AI_EVALUATION_EXPLANATION = "gen_ai.evaluation.explanation"
+# namespaced extensions (OTel defines none of these)
+FORGESIGHT_EVAL_SOURCE = "forgesight.evaluation.source"
+FORGESIGHT_EVAL_REALTIME = "forgesight.evaluation.realtime"
+FORGESIGHT_EVAL_EVALUATOR = "forgesight.evaluation.evaluator"
+# run correlation key; set on every evaluation record's attributes
+FORGESIGHT_RUN_ID = "forgesight.run.id"
+def record_evaluation(
+    name: str,
+    *,
+    score: float | None = None,
+    label: str | None = None,
+    explanation: str | None = None,
+    evaluator: str | None = None,
+    run_id: str | None = None,
+    metadata: Mapping[str, object] | None = None,
+) -> None:
+    """Attach an eval (auto) to the current run, or to ``run_id`` if given. No-op if disabled."""
+    resolved_run_id = run_id or _current_run_id()
+    if resolved_run_id is None:
+        raise RuntimeError(
+            "record_evaluation has no run_id and no current run; "
+            "call it inside a run or pass run_id"
+        )
+    _validate(name, score, label)
+    _emit(
+        EvaluationResult(
+            name=name,
+            run_id=resolved_run_id,
+            score=score,
+            label=label,
+            explanation=explanation,
+            evaluator=evaluator,
+            source="auto",
+            realtime=run_id is None,
+            metadata=dict(metadata or {}),
+        )
+    )
+def record_feedback(
+    name: str,
+    *,
+    run_id: str,
+    score: float | None = None,
+    label: str | None = None,
+    comment: str | None = None,
+    source: str = "human",
+    metadata: Mapping[str, object] | None = None,
+) -> None:
+    """Attach post-hoc feedback to a past run by ``run_id``. No-op if disabled."""
+    _validate(name, score, label)
+    _emit(
+        EvaluationResult(
+            name=name,
+            run_id=run_id,
+            score=score,
+            label=label,
+            explanation=comment,
+            evaluator=None,
+            source=source,
+            realtime=False,
+            metadata=dict(metadata or {}),
+        )
+    )
+def _current_run_id() -> str | None:
+    run = current_run_scope()
+    if run is not None and run.run_id:
+        return run.run_id
+    context = current_context()
+    return context.run_id if context is not None else None
+def _validate(name: str, score: float | None, label: str | None) -> None:
+    if score is None and label is None:
+        raise ValueError(f"evaluation {name!r} must set at least one of score / label")
+    schema = get_config().score_schema.get(name)
+    if not isinstance(schema, Mapping):
+        return  # unschema'd dimension — open set, accepted unvalidated
+    kind = schema.get("type")
+    if kind == "numeric" and score is not None:
+        low, high = schema.get("min"), schema.get("max")
+        if (low is not None and score < low) or (high is not None and score > high):
+            raise ValueError(f"score {score} for {name!r} outside [{low}, {high}]")
+    if kind == "categorical" and label is not None:
+        labels = schema.get("labels") or ()
+        if label not in labels:
+            raise ValueError(f"label {label!r} for {name!r} not in {list(labels)}")
+def _emit(result: EvaluationResult) -> None:
+    config = get_config()
+    if not config.enabled:
+        return  # module installed but not switched on (P2)
+    runtime = get_runtime()
+    context = current_context()
+    if result.realtime and context is not None:
+        trace_id, parent = context.trace_id, context.current_span_id
+    else:
+        trace_id, parent = new_trace_id(), None  # post-hoc: standalone, re-associated by run_id
+    now = time.time_ns()
+    record = Record(
+        kind=Kind.STEP,
+        run_id=result.run_id,
+        trace_id=trace_id,
+        span_id=new_span_id(),
+        parent_span_id=parent,
+        name=f"evaluation {result.name}",
+        status=RunStatus.OK,
+        start_unix_nanos=now,
+        end_unix_nanos=now,
+        attributes=_attributes(result, config.capture_explanation),
+    )
+    runtime.emit_record(record)
+    runtime.emit_event(
+        LifecycleEvent(
+            type=EventType.EVALUATION_RECORDED,
+            run_id=result.run_id,
+            unix_nanos=now,
+            record=record,
+            trace_id=trace_id,
+            span_id=record.span_id,
+        )
+    )
+def _attributes(result: EvaluationResult, capture_explanation: bool) -> dict[str, object]:
+    attrs: dict[str, object] = dict(result.metadata)
+    attrs[GEN_AI_EVALUATION_NAME] = result.name
+    attrs[FORGESIGHT_RUN_ID] = result.run_id
+    attrs[FORGESIGHT_EVAL_SOURCE] = result.source
+    attrs[FORGESIGHT_EVAL_REALTIME] = result.realtime
+    if result.score is not None:
+        attrs[GEN_AI_EVALUATION_SCORE_VALUE] = result.score
+    if result.label is not None:
+        attrs[GEN_AI_EVALUATION_SCORE_LABEL] = result.label
+    if result.evaluator is not None:
+        attrs[FORGESIGHT_EVAL_EVALUATOR] = result.evaluator
+    # explanation/comment is free text → gated by capture_explanation AND the global
+    # content-capture switch (P7); dropped if either is off.
+    if result.explanation is not None and capture_explanation and _content_capture_on():
+        attrs[GEN_AI_EVALUATION_EXPLANATION] = result.explanation
+    return attrs
+def _content_capture_on() -> bool:
+    try:
+        return bool(get_runtime().config.capture_content)
+    except Exception:  # pragma: no cover - runtime always present in practice
+        return False

forgesight_eval-0.1.1/src/forgesight_eval/model.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""``EvaluationResult`` — the run-correlated quality signal (feat-021).
+The same value type for an automated eval (LLM-as-judge / Ragas / DeepEval) and a post-hoc
+human thumbs-up/down. ``realtime`` distinguishes "attached during the run" from "arrived
+later by ``run_id``"; ``source`` distinguishes ``auto`` from ``human``. At least one of
+``score`` / ``label`` must be set (enforced at the call site). Experimental within 0.x.
+"""
+from __future__ import annotations
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from types import MappingProxyType
+# Shared read-only empty mapping used as the default for ``EvaluationResult.metadata``.
+_EMPTY: Mapping[str, object] = MappingProxyType({})
+@dataclass(frozen=True, slots=True)
+class EvaluationResult:
+    """Frozen value object for one eval score or feedback item tied to a run id."""
+    name: str  # evaluation dimension, e.g. "faithfulness"
+    run_id: str  # id of the run this result is attached to
+    score: float | None = None
+    label: str | None = None
+    explanation: str | None = None  # free text (eval explanation or feedback comment)
+    evaluator: str | None = None
+    source: str = "auto"  # "auto" (eval) | "human" (feedback)
+    realtime: bool = True  # True if attached during the run, else post-hoc
+    metadata: Mapping[str, object] = field(default_factory=lambda: _EMPTY)

forgesight_eval-0.1.1/src/forgesight_eval/py.typed ADDED Viewed

File without changes

forgesight_eval-0.1.1/tests/test_eval.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""Tests for eval/feedback: ambient vs by-id, gen_ai.evaluation.* mapping, schema, gating."""
+from __future__ import annotations
+from collections.abc import Iterator
+import pytest
+from forgesight_api import EventType, LifecycleEvent
+from forgesight_core import InMemoryExporter, configure, reset_runtime, telemetry
+from forgesight_eval import EvaluationResult, install, record_evaluation, record_feedback
+from forgesight_eval._config import reset_config
+class _Listener:
+    """Test listener that stores every lifecycle event it receives, in arrival order."""
+    def __init__(self) -> None:
+        self.events: list[LifecycleEvent] = []
+    def on_event(self, event: LifecycleEvent) -> None:
+        self.events.append(event)
+@pytest.fixture
+def sink() -> Iterator[InMemoryExporter]:
+    """Yield an in-memory exporter with the eval module enabled and content capture on.
+    The listener is hung on the exporter as ``.listener`` so tests can reach its events;
+    runtime and eval config are reset on teardown.
+    """
+    exporter = InMemoryExporter()
+    listener = _Listener()
+    install({"enabled": True})
+    configure(exporters=[exporter], listeners=[listener], sync_export=True, capture_content=True)
+    exporter.listener = listener  # type: ignore[attr-defined]
+    try:
+        yield exporter
+    finally:
+        reset_runtime()
+        reset_config()
+def _eval_records(sink: InMemoryExporter) -> list:
+    """Return the exported records whose name starts with ``"evaluation "``."""
+    return [r for r in sink.records if r.name.startswith("evaluation ")]
+# --- real-time eval -----------------------------------------------------------
+def test_record_evaluation_attaches_to_current_run(sink: InMemoryExporter) -> None:
+    """An eval recorded inside a run shares its run/trace ids and carries all attributes."""
+    with telemetry.agent_run("rag") as run:
+        record_evaluation(
+            "faithfulness", score=0.91, label="pass", explanation="grounded", evaluator="ragas"
+        )
+        run_id = run.run_id
+        trace_id = run.trace_id
+    [rec] = _eval_records(sink)
+    assert rec.run_id == run_id
+    assert rec.trace_id == trace_id  # nested in the run's trace
+    assert rec.parent_span_id is not None
+    assert rec.attributes["gen_ai.evaluation.name"] == "faithfulness"
+    assert rec.attributes["gen_ai.evaluation.score.value"] == 0.91
+    assert rec.attributes["gen_ai.evaluation.score.label"] == "pass"
+    assert rec.attributes["gen_ai.evaluation.explanation"] == "grounded"
+    assert rec.attributes["forgesight.evaluation.source"] == "auto"
+    assert rec.attributes["forgesight.evaluation.realtime"] is True
+    assert rec.attributes["forgesight.evaluation.evaluator"] == "ragas"
+def test_evaluation_emits_event(sink: InMemoryExporter) -> None:
+    """One eval produces exactly one ``EVALUATION_RECORDED`` event that carries its record."""
+    with telemetry.agent_run("rag"):
+        record_evaluation("relevance", score=0.7)
+    events = [e for e in sink.listener.events if e.type is EventType.EVALUATION_RECORDED]  # type: ignore[attr-defined]
+    assert len(events) == 1
+    assert events[0].record is not None
+def test_evaluation_carries_metadata(sink: InMemoryExporter) -> None:
+    """Caller-supplied metadata keys appear verbatim in the record attributes."""
+    with telemetry.agent_run("rag"):
+        record_evaluation("faithfulness", score=0.5, metadata={"judge_model": "claude-sonnet-4-5"})
+    [rec] = _eval_records(sink)
+    assert rec.attributes["judge_model"] == "claude-sonnet-4-5"
+def test_evaluation_explicit_run_id_is_not_realtime(sink: InMemoryExporter) -> None:
+    """Passing ``run_id`` outside a run yields a standalone, non-realtime record."""
+    record_evaluation("faithfulness", score=0.8, run_id="01J9Z3K7P8QF2R5V6W7X8Y9Z0A")
+    [rec] = _eval_records(sink)
+    assert rec.run_id == "01J9Z3K7P8QF2R5V6W7X8Y9Z0A"
+    assert rec.attributes["forgesight.evaluation.realtime"] is False
+    assert rec.parent_span_id is None  # standalone, no live trace
+# --- post-hoc feedback --------------------------------------------------------
+def test_record_feedback_by_run_id(sink: InMemoryExporter) -> None:
+    """Feedback by ``run_id`` is tagged human / non-realtime and keeps its comment."""
+    record_feedback(
+        "user_satisfaction",
+        run_id="01J9Z3K7P8QF2R5V6W7X8Y9Z0A",
+        label="thumbs_down",
+        score=0.0,
+        comment="hallucinated the date",
+    )
+    [rec] = _eval_records(sink)
+    assert rec.run_id == "01J9Z3K7P8QF2R5V6W7X8Y9Z0A"
+    assert rec.attributes["gen_ai.evaluation.score.label"] == "thumbs_down"
+    assert rec.attributes["forgesight.evaluation.source"] == "human"
+    assert rec.attributes["forgesight.evaluation.realtime"] is False
+    assert rec.attributes["gen_ai.evaluation.explanation"] == "hallucinated the date"
+# --- validation ---------------------------------------------------------------
+def test_evaluation_requires_score_or_label(sink: InMemoryExporter) -> None:
+    """An eval with neither ``score`` nor ``label`` raises ``ValueError``."""
+    with telemetry.agent_run("r"), pytest.raises(ValueError, match="at least one of score"):
+        record_evaluation("faithfulness")
+def test_evaluation_outside_run_without_id_raises(sink: InMemoryExporter) -> None:
+    """With no ambient run and no ``run_id``, ``record_evaluation`` raises ``RuntimeError``."""
+    with pytest.raises(RuntimeError, match="no current run"):
+        record_evaluation("faithfulness", score=0.5)
+def test_score_schema_numeric_range() -> None:
+    """A numeric schema rejects an out-of-range score and accepts an in-range one."""
+    install(
+        {
+            "enabled": True,
+            "score_schema": {"faithfulness": {"type": "numeric", "min": 0.0, "max": 1.0}},
+        }
+    )
+    configure(exporters=[InMemoryExporter()], sync_export=True)
+    try:
+        with pytest.raises(ValueError, match="outside"):
+            record_evaluation("faithfulness", score=1.5, run_id="r")
+        record_evaluation("faithfulness", score=0.5, run_id="r")  # in range ⇒ ok
+    finally:
+        reset_runtime()
+        reset_config()
+def test_score_schema_categorical() -> None:
+    """A categorical schema rejects an unknown label and accepts a listed one."""
+    install(
+        {
+            "enabled": True,
+            "score_schema": {
+                "user_satisfaction": {"type": "categorical", "labels": ["thumbs_up", "thumbs_down"]}
+            },
+        }
+    )
+    configure(exporters=[InMemoryExporter()], sync_export=True)
+    try:
+        with pytest.raises(ValueError, match="not in"):
+            record_feedback("user_satisfaction", run_id="r", label="meh")
+        record_feedback("user_satisfaction", run_id="r", label="thumbs_up")  # valid label
+    finally:
+        reset_runtime()
+        reset_config()
+def test_unschemad_dimension_accepted() -> None:
+    """A dimension absent from ``score_schema`` is recorded without range validation."""
+    install(
+        {"enabled": True, "score_schema": {"faithfulness": {"type": "numeric", "min": 0, "max": 1}}}
+    )
+    configure(exporters=[InMemoryExporter()], sync_export=True)
+    try:
+        record_evaluation("novel_metric", score=42.0, run_id="r")  # not in schema ⇒ unvalidated
+    finally:
+        reset_runtime()
+        reset_config()
+# --- enable switch + privacy --------------------------------------------------
+def test_disabled_module_is_noop() -> None:
+    """With ``enabled`` false, a valid eval call exports no records."""
+    install({"enabled": False})
+    exporter = InMemoryExporter()
+    configure(exporters=[exporter], sync_export=True)
+    try:
+        record_evaluation("faithfulness", score=0.9, run_id="r")
+        assert exporter.records == []  # installed but not switched on ⇒ nothing emitted
+    finally:
+        reset_runtime()
+        reset_config()
+def test_explanation_dropped_when_content_capture_off() -> None:
+    """The explanation attribute is omitted when the runtime's ``capture_content`` is off."""
+    install({"enabled": True})
+    exporter = InMemoryExporter()
+    configure(exporters=[exporter], sync_export=True, capture_content=False)  # P7: content off
+    try:
+        record_evaluation("faithfulness", score=0.9, explanation="contains PII", run_id="r")
+        [rec] = [r for r in exporter.records if r.name.startswith("evaluation ")]
+        assert "gen_ai.evaluation.explanation" not in rec.attributes  # text dropped
+    finally:
+        reset_runtime()
+        reset_config()
+def test_explanation_dropped_when_capture_explanation_off() -> None:
+    """The explanation is omitted when ``capture_explanation`` is off, even with content on."""
+    install({"enabled": True, "capture_explanation": False})
+    exporter = InMemoryExporter()
+    configure(exporters=[exporter], sync_export=True, capture_content=True)
+    try:
+        record_evaluation("faithfulness", score=0.9, explanation="hidden", run_id="r")
+        [rec] = [r for r in exporter.records if r.name.startswith("evaluation ")]
+        assert "gen_ai.evaluation.explanation" not in rec.attributes
+    finally:
+        reset_runtime()
+        reset_config()
+# --- model + config -----------------------------------------------------------
+def test_evaluation_result_defaults() -> None:
+    """``EvaluationResult`` defaults to source ``auto``, realtime, and empty metadata."""
+    result = EvaluationResult(name="x", run_id="r", score=1.0)
+    assert result.source == "auto"
+    assert result.realtime is True
+    assert result.metadata == {}
+def test_install_returns_enabled() -> None:
+    """``install`` returns the ``enabled`` value it was configured with."""
+    try:
+        assert install({"enabled": True}) is True
+        assert install({"enabled": False}) is False
+    finally:
+        reset_config()
+def test_install_rejects_bad_emit_as() -> None:
+    """An ``emit_as`` other than ``span`` / ``event`` makes ``install`` raise ``ValueError``."""
+    try:
+        with pytest.raises(ValueError, match="emit_as"):
+            install({"emit_as": "telegram"})
+    finally:
+        reset_config()
+def test_config_lazy_loads_from_settings() -> None:
+    reset_config()
+    from forgesight_eval._config import get_config
+    assert get_config().enabled is False  # no config ⇒ default disabled
+    reset_config()
+# --- fan-out to two exporters -------------------------------------------------
+def test_evaluation_fans_out_to_two_exporters() -> None:
+    """With two exporters configured, each one receives the evaluation record."""
+    install({"enabled": True})
+    a, b = InMemoryExporter(), InMemoryExporter()
+    configure(exporters=[a, b], sync_export=True)
+    try:
+        with telemetry.agent_run("r"):
+            record_evaluation("faithfulness", score=0.9)
+        assert any(r.name.startswith("evaluation ") for r in a.records)
+        assert any(r.name.startswith("evaluation ") for r in b.records)
+    finally:
+        reset_runtime()
+        reset_config()