forgesight-eval 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ *.so
9
+
10
+ # venv / tooling
11
+ .venv/
12
+ venv/
13
+ .uv/
14
+ uv.lock
15
+
16
+ # test / type / lint caches
17
+ .pytest_cache/
18
+ .mypy_cache/
19
+ .ruff_cache/
20
+ .coverage
21
+ .coverage.*
22
+ coverage.xml
23
+ htmlcov/
24
+
25
+ # secrets / local env (never commit)
26
+ .env
27
+ .env.*
28
+
29
+ # editor / OS
30
+ .DS_Store
31
+ .idea/
32
+ .vscode/
33
+
34
+ # local-only session working state (per the workspace pipeline)
35
+ .claude/state/
36
+
37
+ # local-only launch planning (not part of the published repo)
38
+ /launch/
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: forgesight-eval
3
+ Version: 0.1.1
4
+ Summary: ForgeSight eval — run-correlated eval scores + human feedback on the telemetry pipeline.
5
+ Project-URL: Homepage, https://github.com/Scaffoldic/forgesight
6
+ Project-URL: Repository, https://github.com/Scaffoldic/forgesight
7
+ Project-URL: Issues, https://github.com/Scaffoldic/forgesight/issues
8
+ Project-URL: Changelog, https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md
9
+ Author: kjoshi
10
+ License-Expression: Apache-2.0
11
+ Keywords: ai-agents,evaluation,feedback,forgesight,llm-as-judge,observability
12
+ Classifier: Development Status :: 2 - Pre-Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Information Technology
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: System :: Monitoring
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.11
23
+ Requires-Dist: forgesight-core
24
+ Description-Content-Type: text/markdown
25
+
26
+ # forgesight-eval
27
+
28
+ Run-correlated **eval scores and human feedback** for [ForgeSight](https://github.com/Scaffoldic/forgesight).
29
+ Quality joins cost and structure on the *same* `run_id`, on the *same* pipeline, in the *same*
30
+ backends — Langfuse scores, Phoenix evaluations, any OTLP sink — for two function calls.
31
+
32
+ ```bash
33
+ pip install forgesight-eval
34
+ ```
35
+
36
+ ```python
37
+ from forgesight import telemetry
38
+ from forgesight_eval import record_evaluation, record_feedback
39
+
40
+ with telemetry.agent_run("rag-answerer", metadata={"prompt_version": "v7"}) as run:
41
+ answer = await answer_question(...)
42
+ # an automated eval (LLM-as-judge / Ragas / DeepEval) → one call, correlated to the live run
43
+ record_evaluation("faithfulness", score=0.91, label="pass",
44
+ explanation="All claims grounded.", evaluator="ragas")
45
+
46
+ # hours later, from a webhook — post-hoc human feedback, by run_id:
47
+ def on_thumbs_down(run_id: str, comment: str) -> None:
48
+ record_feedback("user_satisfaction", run_id=run_id, label="thumbs_down",
49
+ score=0.0, comment=comment, source="human")
50
+ ```
51
+
52
+ ## How it works
53
+
54
+ - **`record_evaluation`** attaches to the **current** run (ambient context) — a real-time eval
55
+ nests as a child span under the run's open trace. **`record_feedback`** attaches to a **past**
56
+ run by `run_id` — a standalone record carrying the id so the backend re-associates it.
57
+ - Both emit the OTel **`gen_ai.evaluation.*`** attributes (`name` / `score.value` / `score.label`
58
+ / `explanation`) plus `forgesight.evaluation.*` extensions (`source`, `realtime`, `evaluator`),
59
+ and publish an `EVALUATION_RECORDED` lifecycle event. Backends that speak the convention
60
+ (Langfuse, Phoenix) display them as scores/evaluations with **no per-backend code** (P4).
61
+ - **Non-blocking** (P6): both build a record and enqueue — no network I/O on the agent or the
62
+ webhook. **Secure by default** (P7): `explanation` / `comment` text is gated by
63
+ `capture_explanation` *and* the global content-capture switch.
64
+
65
+ ## Configuration
66
+
67
+ ```yaml
68
+ modules:
69
+ eval:
70
+ enabled: true # master switch (default false — install ≠ active)
71
+ emit_as: "span" # span | event
72
+ capture_explanation: true # still gated by the global content-capture switch (P7)
73
+ score_schema: # optional — validates score/label at the call site
74
+ faithfulness: { type: "numeric", min: 0.0, max: 1.0 }
75
+ user_satisfaction: { type: "categorical", labels: ["thumbs_up", "thumbs_down"] }
76
+ ```
77
+
78
+ At least one of `score` / `label` must be set. Schema'd dimensions are validated (numeric range,
79
+ categorical membership); un-schema'd dimensions are accepted unvalidated (open set).
80
+
81
+ ## Out of scope
82
+
83
+ Running the evaluations (judges/metrics live in the agent or a library — this is the transport),
84
+ a dashboard, and score storage/aggregation (the backend's job).
85
+
86
+ ## License
87
+
88
+ Apache-2.0
@@ -0,0 +1,63 @@
1
+ # forgesight-eval
2
+
3
+ Run-correlated **eval scores and human feedback** for [ForgeSight](https://github.com/Scaffoldic/forgesight).
4
+ Quality joins cost and structure on the *same* `run_id`, on the *same* pipeline, in the *same*
5
+ backends — Langfuse scores, Phoenix evaluations, any OTLP sink — for two function calls.
6
+
7
+ ```bash
8
+ pip install forgesight-eval
9
+ ```
10
+
11
+ ```python
12
+ from forgesight import telemetry
13
+ from forgesight_eval import record_evaluation, record_feedback
14
+
15
+ with telemetry.agent_run("rag-answerer", metadata={"prompt_version": "v7"}) as run:
16
+ answer = await answer_question(...)
17
+ # an automated eval (LLM-as-judge / Ragas / DeepEval) → one call, correlated to the live run
18
+ record_evaluation("faithfulness", score=0.91, label="pass",
19
+ explanation="All claims grounded.", evaluator="ragas")
20
+
21
+ # hours later, from a webhook — post-hoc human feedback, by run_id:
22
+ def on_thumbs_down(run_id: str, comment: str) -> None:
23
+ record_feedback("user_satisfaction", run_id=run_id, label="thumbs_down",
24
+ score=0.0, comment=comment, source="human")
25
+ ```
26
+
27
+ ## How it works
28
+
29
+ - **`record_evaluation`** attaches to the **current** run (ambient context) — a real-time eval
30
+ nests as a child span under the run's open trace. **`record_feedback`** attaches to a **past**
31
+ run by `run_id` — a standalone record carrying the id so the backend re-associates it.
32
+ - Both emit the OTel **`gen_ai.evaluation.*`** attributes (`name` / `score.value` / `score.label`
33
+ / `explanation`) plus `forgesight.evaluation.*` extensions (`source`, `realtime`, `evaluator`),
34
+ and publish an `EVALUATION_RECORDED` lifecycle event. Backends that speak the convention
35
+ (Langfuse, Phoenix) display them as scores/evaluations with **no per-backend code** (P4).
36
+ - **Non-blocking** (P6): both build a record and enqueue — no network I/O on the agent or the
37
+ webhook. **Secure by default** (P7): `explanation` / `comment` text is gated by
38
+ `capture_explanation` *and* the global content-capture switch.
39
+
40
+ ## Configuration
41
+
42
+ ```yaml
43
+ modules:
44
+ eval:
45
+ enabled: true # master switch (default false — install ≠ active)
46
+ emit_as: "span" # span | event
47
+ capture_explanation: true # still gated by the global content-capture switch (P7)
48
+ score_schema: # optional — validates score/label at the call site
49
+ faithfulness: { type: "numeric", min: 0.0, max: 1.0 }
50
+ user_satisfaction: { type: "categorical", labels: ["thumbs_up", "thumbs_down"] }
51
+ ```
52
+
53
+ At least one of `score` / `label` must be set. Schema'd dimensions are validated (numeric range,
54
+ categorical membership); un-schema'd dimensions are accepted unvalidated (open set).
55
+
56
+ ## Out of scope
57
+
58
+ Running the evaluations (judges/metrics live in the agent or a library — this is the transport),
59
+ a dashboard, and score storage/aggregation (the backend's job).
60
+
61
+ ## License
62
+
63
+ Apache-2.0
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "forgesight-eval"
3
+ version = "0.1.1"
4
+ description = "ForgeSight eval — run-correlated eval scores + human feedback on the telemetry pipeline."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = "Apache-2.0"
8
+ authors = [{ name = "kjoshi" }]
9
+ keywords = ["observability", "evaluation", "feedback", "llm-as-judge", "ai-agents", "forgesight"]
10
+ classifiers = [
11
+ "Development Status :: 2 - Pre-Alpha",
12
+ "Intended Audience :: Developers",
13
+ "Intended Audience :: Information Technology",
14
+ "Topic :: System :: Monitoring",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ "License :: OSI Approved :: Apache Software License",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Typing :: Typed",
21
+ ]
22
+ dependencies = ["forgesight-core"]
23
+
24
+ [project.entry-points."forgesight.modules"]
25
+ eval = "forgesight_eval:install"
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/Scaffoldic/forgesight"
29
+ Repository = "https://github.com/Scaffoldic/forgesight"
30
+ Issues = "https://github.com/Scaffoldic/forgesight/issues"
31
+ Changelog = "https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md"
32
+
33
+ [build-system]
34
+ requires = ["hatchling"]
35
+ build-backend = "hatchling.build"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/forgesight_eval"]
39
+
40
+ [tool.uv.sources]
41
+ forgesight-core = { workspace = true }
@@ -0,0 +1,18 @@
1
+ """ForgeSight eval — run-correlated eval scores + human feedback on the same pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ._config import EvalConfig, install
6
+ from .api import record_evaluation, record_feedback
7
+ from .model import EvaluationResult
8
+
9
# Must match ``version`` in pyproject.toml / PKG-INFO; this release is 0.1.1.
__version__ = "0.1.1"
10
+
11
+ __all__ = [
12
+ "EvalConfig",
13
+ "EvaluationResult",
14
+ "__version__",
15
+ "install",
16
+ "record_evaluation",
17
+ "record_feedback",
18
+ ]
@@ -0,0 +1,73 @@
1
+ """Module config for eval emission: enabled switch, emit_as, explanation gate, score schema.
2
+
3
+ ``enabled`` defaults **false** — installing the package emits nothing until switched on
4
+ (P2). ``install`` (the ``forgesight.modules`` entry point) sets it from ``modules.eval.*``;
5
+ otherwise it lazy-loads from the SDK's layered settings + ``FORGESIGHT_EVAL_*`` env on first
6
+ use. The optional ``score_schema`` validates score/label at the call site (fail-fast).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from collections.abc import Mapping
13
+ from dataclasses import dataclass, field
14
+ from typing import Any
15
+
16
+ from forgesight_core.config import load_settings
17
+
18
+
19
@dataclass
class EvalConfig:
    """Settings of the eval module, normally filled in from a ``modules.eval`` block.

    Each default below is the value used when the corresponding key is absent.
    """

    # Master switch. Off by default, so installing the package alone emits nothing.
    enabled: bool = False
    # Emission mode; the only accepted values are "span" and "event".
    emit_as: str = "span"
    # Whether explanation / comment text may be attached to emitted attributes.
    capture_explanation: bool = True
    # Optional per-dimension validation rules, keyed by evaluation name.
    score_schema: dict[str, dict[str, Any]] = field(default_factory=dict)
25
+
26
+
27
+ _config: EvalConfig | None = None
28
+
29
+
30
def _as_bool(value: object, default: bool) -> bool:
    """Coerce a loosely typed setting to ``bool``.

    ``None`` yields ``default`` and real booleans pass through unchanged. Any other value
    is stringified, trimmed and lower-cased; only "1", "true", "yes" and "on" are true.
    """
    if isinstance(value, bool):
        return value
    if value is None:
        return default
    normalized = str(value).strip().lower()
    return normalized in {"1", "true", "yes", "on"}
36
+
37
+
38
def _build(block: Mapping[str, Any]) -> EvalConfig:
    """Build an :class:`EvalConfig` from a ``modules.eval`` mapping plus env overrides.

    ``FORGESIGHT_EVAL_EMIT_AS`` and ``FORGESIGHT_EVAL_ENABLED`` take precedence over the
    matching keys of ``block``. A variable that is set but empty is treated as unset for
    both of them, so it falls through to ``block`` instead of overriding it.

    Args:
        block: The ``modules.eval`` settings mapping (may be empty).

    Returns:
        The resolved configuration.

    Raises:
        ValueError: If the resolved ``emit_as`` is neither ``"span"`` nor ``"event"``.
    """
    emit_as = str(os.environ.get("FORGESIGHT_EVAL_EMIT_AS") or block.get("emit_as") or "span")
    if emit_as not in ("span", "event"):
        raise ValueError(f"modules.eval.emit_as must be span|event, got {emit_as!r}")
    # Map an empty/blank value (e.g. ``FORGESIGHT_EVAL_ENABLED=``) to "not set". Passed
    # through as-is, _as_bool would read "" as false and silently override
    # ``enabled: true`` from the settings block.
    enabled_env = (os.environ.get("FORGESIGHT_EVAL_ENABLED") or "").strip() or None
    enabled = _as_bool(enabled_env, _as_bool(block.get("enabled"), False))
    schema = block.get("score_schema")
    return EvalConfig(
        enabled=enabled,
        emit_as=emit_as,
        capture_explanation=_as_bool(block.get("capture_explanation"), True),
        # Copy the outer mapping; anything that is not a mapping means "no schema".
        score_schema=dict(schema) if isinstance(schema, Mapping) else {},
    )
51
+
52
+
53
def install(config: Mapping[str, Any] | None = None) -> bool:
    """Entry point registered under ``forgesight.modules``.

    Builds the module config from ``config`` (an empty mapping when omitted or falsy),
    stores it as the active config, and returns its ``enabled`` flag.
    """
    global _config
    built = _build(config if config else {})
    _config = built
    return built.enabled
58
+
59
+
60
def get_config() -> EvalConfig:
    """Return the active config, building it from settings on first use.

    When nothing has been installed yet, the ``modules`` → ``eval`` block of
    ``load_settings()`` is used; a missing or non-mapping block counts as empty.
    The result is cached for later calls.
    """
    global _config
    if _config is not None:
        return _config
    modules = load_settings().get("modules")
    eval_block = modules.get("eval") if isinstance(modules, Mapping) else None
    if not isinstance(eval_block, Mapping):
        eval_block = {}
    _config = _build(eval_block)
    return _config
68
+
69
+
70
def reset_config() -> None:
    """Forget the cached config.

    Meant for tests and for re-configuring: it only clears the module-level cache.
    """
    global _config
    _config = None
@@ -0,0 +1,179 @@
1
+ """``record_evaluation`` / ``record_feedback`` — run-correlated quality on the same pipeline.
2
+
3
+ ``record_evaluation`` attaches to the **current** run (ambient context) — a real-time eval
4
+ nests as a child span under the run's still-open trace. ``record_feedback`` attaches to a
5
+ **past** run by ``run_id`` — a standalone record carrying the id so the backend re-associates
6
+ it. Both build an :class:`EvaluationResult`, emit it as a record with the OTel
7
+ ``gen_ai.evaluation.*`` attributes (so it streams to Langfuse scores / Phoenix evaluations /
8
+ any OTLP sink, P4) and publish an ``EVALUATION_RECORDED`` event. Non-blocking (P6).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import time
14
+ from collections.abc import Mapping
15
+
16
+ from forgesight_api import EventType, Kind, LifecycleEvent, Record, RunStatus, new_trace_id
17
+ from forgesight_core import current_context, current_run_scope, get_runtime, new_span_id
18
+
19
+ from ._config import get_config
20
+ from .model import EvaluationResult
21
+
22
# Attribute keys taken from the OTel GenAI evaluation conventions
# (otel-semantic-conventions §4.3).
GEN_AI_EVALUATION_NAME = "gen_ai.evaluation.name"
GEN_AI_EVALUATION_SCORE_VALUE = "gen_ai.evaluation.score.value"
GEN_AI_EVALUATION_SCORE_LABEL = "gen_ai.evaluation.score.label"
GEN_AI_EVALUATION_EXPLANATION = "gen_ai.evaluation.explanation"

# ForgeSight-namespaced extension keys; OTel defines none of these.
FORGESIGHT_EVAL_SOURCE = "forgesight.evaluation.source"
FORGESIGHT_EVAL_REALTIME = "forgesight.evaluation.realtime"
FORGESIGHT_EVAL_EVALUATOR = "forgesight.evaluation.evaluator"
FORGESIGHT_RUN_ID = "forgesight.run.id"
32
+
33
+
34
def record_evaluation(
    name: str,
    *,
    score: float | None = None,
    label: str | None = None,
    explanation: str | None = None,
    evaluator: str | None = None,
    run_id: str | None = None,
    metadata: Mapping[str, object] | None = None,
) -> None:
    """Attach an automated eval to the current run, or to ``run_id`` if given.

    Nothing is emitted while the module is disabled; the run lookup and the score/label
    validation below still run first.

    Args:
        name: The evaluation dimension (for example ``"faithfulness"``).
        score: Numeric score, if any.
        label: Categorical label, if any. At least one of ``score`` / ``label`` is required.
        explanation: Optional free-text rationale.
        evaluator: Optional name of whatever produced the eval.
        run_id: Target run. When omitted (or empty) the ambient run is used and the result
            is flagged as real-time.
        metadata: Extra attributes copied onto the emitted record.

    Raises:
        RuntimeError: If no ``run_id`` is given and there is no current run.
        ValueError: If validation of ``score`` / ``label`` fails.
    """
    # Normalise once so both decisions below agree: an empty string falls back to the
    # ambient run (as ``or`` already did) and is therefore also treated as real-time,
    # instead of being resolved to the live run yet flagged as post-hoc.
    explicit_run_id = run_id or None
    resolved_run_id = explicit_run_id or _current_run_id()
    if resolved_run_id is None:
        raise RuntimeError(
            "record_evaluation has no run_id and no current run; "
            "call it inside a run or pass run_id"
        )
    _validate(name, score, label)
    _emit(
        EvaluationResult(
            name=name,
            run_id=resolved_run_id,
            score=score,
            label=label,
            explanation=explanation,
            evaluator=evaluator,
            source="auto",
            realtime=explicit_run_id is None,
            metadata=dict(metadata or {}),
        )
    )
65
+
66
+
67
def record_feedback(
    name: str,
    *,
    run_id: str,
    score: float | None = None,
    label: str | None = None,
    comment: str | None = None,
    source: str = "human",
    metadata: Mapping[str, object] | None = None,
) -> None:
    """Attach post-hoc feedback to a past run identified by ``run_id``.

    The feedback is validated like any evaluation, then emitted as a non-real-time result
    with ``comment`` carried in the explanation slot and no evaluator. Nothing is emitted
    while the module is disabled.

    Raises:
        ValueError: If validation of ``score`` / ``label`` fails.
    """
    _validate(name, score, label)
    extra = dict(metadata) if metadata else {}
    feedback = EvaluationResult(
        name=name,
        run_id=run_id,
        score=score,
        label=label,
        explanation=comment,
        evaluator=None,
        source=source,
        realtime=False,
        metadata=extra,
    )
    _emit(feedback)
92
+
93
+
94
def _current_run_id() -> str | None:
    """Look up the ambient run id.

    The current run scope wins when it has a non-empty ``run_id``; otherwise the current
    context's ``run_id`` is returned, or ``None`` when there is no context.
    """
    scope = current_run_scope()
    if scope is not None and scope.run_id:
        return scope.run_id
    ctx = current_context()
    if ctx is None:
        return None
    return ctx.run_id
100
+
101
+
102
def _validate(name: str, score: float | None, label: str | None) -> None:
    """Fail fast on an unusable evaluation before anything is emitted.

    At least one of ``score`` / ``label`` must be set. If ``name`` has an entry in the
    configured ``score_schema``, a ``numeric`` entry checks ``score`` against its optional
    ``min`` / ``max`` and a ``categorical`` entry checks ``label`` against its ``labels``.
    Dimensions without a schema entry are accepted as-is.

    Raises:
        ValueError: If neither value is set, the score is outside the configured range
            (including NaN when a bound is configured), or the label is not allowed.
    """
    if score is None and label is None:
        raise ValueError(f"evaluation {name!r} must set at least one of score / label")
    schema = get_config().score_schema.get(name)
    if not isinstance(schema, Mapping):
        return  # unschema'd dimension — open set, accepted unvalidated
    kind = schema.get("type")
    if kind == "numeric" and score is not None:
        low, high = schema.get("min"), schema.get("max")
        # Phrased as "not inside the bound" rather than "outside it": every ordering
        # comparison with NaN is false, so ``score < low`` / ``score > high`` would let
        # NaN through, whereas ``not score >= low`` / ``not score <= high`` reject it.
        if (low is not None and not score >= low) or (high is not None and not score <= high):
            raise ValueError(f"score {score} for {name!r} outside [{low}, {high}]")
    if kind == "categorical" and label is not None:
        labels = schema.get("labels") or ()
        if label not in labels:
            raise ValueError(f"label {label!r} for {name!r} not in {list(labels)}")
117
+
118
+
119
def _emit(result: EvaluationResult) -> None:
    """Turn ``result`` into a record plus an ``EVALUATION_RECORDED`` event.

    Returns without emitting when the module is disabled. A real-time result with a live
    context joins that context's trace as a child of its current span; everything else
    gets a fresh trace id and no parent. Start and end timestamps are the same instant.

    NOTE(review): ``config.emit_as`` is not consulted here; a record and an event are
    always both emitted. Confirm that is intended.
    """
    config = get_config()
    if not config.enabled:
        return  # module installed but not switched on (P2)
    runtime = get_runtime()
    context = current_context()
    if context is None or not result.realtime:
        # post-hoc: standalone, re-associated by run_id
        trace_id = new_trace_id()
        parent = None
    else:
        trace_id = context.trace_id
        parent = context.current_span_id

    now = time.time_ns()
    record = Record(
        kind=Kind.STEP,
        run_id=result.run_id,
        trace_id=trace_id,
        span_id=new_span_id(),
        parent_span_id=parent,
        name=f"evaluation {result.name}",
        status=RunStatus.OK,
        start_unix_nanos=now,
        end_unix_nanos=now,
        attributes=_attributes(result, config.capture_explanation),
    )
    runtime.emit_record(record)

    recorded = LifecycleEvent(
        type=EventType.EVALUATION_RECORDED,
        run_id=result.run_id,
        unix_nanos=now,
        record=record,
        trace_id=trace_id,
        span_id=record.span_id,
    )
    runtime.emit_event(recorded)
154
+
155
+
156
def _attributes(result: EvaluationResult, capture_explanation: bool) -> dict[str, object]:
    """Build the attribute dict for an evaluation record.

    Starts from a copy of ``result.metadata``, then sets the name, run id, source and
    real-time keys (overwriting same-named metadata entries). Score, label and evaluator
    are added only when present. The explanation is added only when it is present,
    ``capture_explanation`` is true, and the global content-capture switch is on (P7).
    """
    attrs: dict[str, object] = {
        **result.metadata,
        GEN_AI_EVALUATION_NAME: result.name,
        FORGESIGHT_RUN_ID: result.run_id,
        FORGESIGHT_EVAL_SOURCE: result.source,
        FORGESIGHT_EVAL_REALTIME: result.realtime,
    }
    optional = (
        (GEN_AI_EVALUATION_SCORE_VALUE, result.score),
        (GEN_AI_EVALUATION_SCORE_LABEL, result.label),
        (FORGESIGHT_EVAL_EVALUATOR, result.evaluator),
    )
    for key, value in optional:
        if value is not None:
            attrs[key] = value

    # Free text is the sensitive part: drop it unless both gates allow it. The global
    # switch is only queried once the cheaper local checks have passed.
    text = result.explanation
    if text is None or not capture_explanation:
        return attrs
    if _content_capture_on():
        attrs[GEN_AI_EVALUATION_EXPLANATION] = text
    return attrs
173
+
174
+
175
def _content_capture_on() -> bool:
    """Report whether the runtime's ``capture_content`` setting is truthy.

    Any exception raised while reading it is treated as "off".
    """
    try:
        enabled = bool(get_runtime().config.capture_content)
    except Exception:  # pragma: no cover - runtime always present in practice
        enabled = False
    return enabled
@@ -0,0 +1,28 @@
1
+ """``EvaluationResult`` — the run-correlated quality signal (feat-021).
2
+
3
+ The same value type for an automated eval (LLM-as-judge / Ragas / DeepEval) and a post-hoc
4
+ human thumbs-up/down. ``realtime`` distinguishes "attached during the run" from "arrived
5
+ later by ``run_id``"; ``source`` distinguishes ``auto`` from ``human``. At least one of
6
+ ``score`` / ``label`` must be set (enforced at the call site). Experimental within 0.x.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Mapping
12
+ from dataclasses import dataclass, field
13
+ from types import MappingProxyType
14
+
15
+ _EMPTY: Mapping[str, object] = MappingProxyType({})
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class EvaluationResult:
20
+ name: str
21
+ run_id: str
22
+ score: float | None = None
23
+ label: str | None = None
24
+ explanation: str | None = None
25
+ evaluator: str | None = None
26
+ source: str = "auto" # "auto" (eval) | "human" (feedback)
27
+ realtime: bool = True # True if attached during the run, else post-hoc
28
+ metadata: Mapping[str, object] = field(default_factory=lambda: _EMPTY)
File without changes
@@ -0,0 +1,244 @@
1
+ """Tests for eval/feedback: ambient vs by-id, gen_ai.evaluation.* mapping, schema, gating."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterator
6
+
7
+ import pytest
8
+
9
+ from forgesight_api import EventType, LifecycleEvent
10
+ from forgesight_core import InMemoryExporter, configure, reset_runtime, telemetry
11
+ from forgesight_eval import EvaluationResult, install, record_evaluation, record_feedback
12
+ from forgesight_eval._config import reset_config
13
+
14
+
15
class _Listener:
    """Lifecycle listener that keeps every event it receives, in arrival order."""

    def __init__(self) -> None:
        self.events: list[LifecycleEvent] = []

    def on_event(self, event: LifecycleEvent) -> None:
        """Store ``event`` so a test can inspect it afterwards."""
        self.events += [event]
21
+
22
+
23
@pytest.fixture
def sink() -> Iterator[InMemoryExporter]:
    """Yield an in-memory exporter wired into an enabled eval module.

    The runtime is configured with synchronous export and content capture on. The
    recording listener is hung on the exporter as ``.listener`` so tests can reach the
    lifecycle events. Runtime and module config are reset on teardown.
    """
    recorder = _Listener()
    exporter = InMemoryExporter()
    install({"enabled": True})
    configure(exporters=[exporter], listeners=[recorder], sync_export=True, capture_content=True)
    exporter.listener = recorder  # type: ignore[attr-defined]
    try:
        yield exporter
    finally:
        reset_runtime()
        reset_config()
35
+
36
+
37
def _eval_records(sink: InMemoryExporter) -> list:
    """Return the exported records whose name starts with ``"evaluation "``."""
    prefix = "evaluation "
    return list(filter(lambda record: record.name.startswith(prefix), sink.records))
39
+
40
+
41
+ # --- real-time eval -----------------------------------------------------------
42
+ def test_record_evaluation_attaches_to_current_run(sink: InMemoryExporter) -> None:
43
+ with telemetry.agent_run("rag") as run:
44
+ record_evaluation(
45
+ "faithfulness", score=0.91, label="pass", explanation="grounded", evaluator="ragas"
46
+ )
47
+ run_id = run.run_id
48
+ trace_id = run.trace_id
49
+ [rec] = _eval_records(sink)
50
+ assert rec.run_id == run_id
51
+ assert rec.trace_id == trace_id # nested in the run's trace
52
+ assert rec.parent_span_id is not None
53
+ assert rec.attributes["gen_ai.evaluation.name"] == "faithfulness"
54
+ assert rec.attributes["gen_ai.evaluation.score.value"] == 0.91
55
+ assert rec.attributes["gen_ai.evaluation.score.label"] == "pass"
56
+ assert rec.attributes["gen_ai.evaluation.explanation"] == "grounded"
57
+ assert rec.attributes["forgesight.evaluation.source"] == "auto"
58
+ assert rec.attributes["forgesight.evaluation.realtime"] is True
59
+ assert rec.attributes["forgesight.evaluation.evaluator"] == "ragas"
60
+
61
+
62
+ def test_evaluation_emits_event(sink: InMemoryExporter) -> None:
63
+ with telemetry.agent_run("rag"):
64
+ record_evaluation("relevance", score=0.7)
65
+ events = [e for e in sink.listener.events if e.type is EventType.EVALUATION_RECORDED] # type: ignore[attr-defined]
66
+ assert len(events) == 1
67
+ assert events[0].record is not None
68
+
69
+
70
+ def test_evaluation_carries_metadata(sink: InMemoryExporter) -> None:
71
+ with telemetry.agent_run("rag"):
72
+ record_evaluation("faithfulness", score=0.5, metadata={"judge_model": "claude-sonnet-4-5"})
73
+ [rec] = _eval_records(sink)
74
+ assert rec.attributes["judge_model"] == "claude-sonnet-4-5"
75
+
76
+
77
+ def test_evaluation_explicit_run_id_is_not_realtime(sink: InMemoryExporter) -> None:
78
+ record_evaluation("faithfulness", score=0.8, run_id="01J9Z3K7P8QF2R5V6W7X8Y9Z0A")
79
+ [rec] = _eval_records(sink)
80
+ assert rec.run_id == "01J9Z3K7P8QF2R5V6W7X8Y9Z0A"
81
+ assert rec.attributes["forgesight.evaluation.realtime"] is False
82
+ assert rec.parent_span_id is None # standalone, no live trace
83
+
84
+
85
+ # --- post-hoc feedback --------------------------------------------------------
86
+ def test_record_feedback_by_run_id(sink: InMemoryExporter) -> None:
87
+ record_feedback(
88
+ "user_satisfaction",
89
+ run_id="01J9Z3K7P8QF2R5V6W7X8Y9Z0A",
90
+ label="thumbs_down",
91
+ score=0.0,
92
+ comment="hallucinated the date",
93
+ )
94
+ [rec] = _eval_records(sink)
95
+ assert rec.run_id == "01J9Z3K7P8QF2R5V6W7X8Y9Z0A"
96
+ assert rec.attributes["gen_ai.evaluation.score.label"] == "thumbs_down"
97
+ assert rec.attributes["forgesight.evaluation.source"] == "human"
98
+ assert rec.attributes["forgesight.evaluation.realtime"] is False
99
+ assert rec.attributes["gen_ai.evaluation.explanation"] == "hallucinated the date"
100
+
101
+
102
+ # --- validation ---------------------------------------------------------------
103
+ def test_evaluation_requires_score_or_label(sink: InMemoryExporter) -> None:
104
+ with telemetry.agent_run("r"), pytest.raises(ValueError, match="at least one of score"):
105
+ record_evaluation("faithfulness")
106
+
107
+
108
+ def test_evaluation_outside_run_without_id_raises(sink: InMemoryExporter) -> None:
109
+ with pytest.raises(RuntimeError, match="no current run"):
110
+ record_evaluation("faithfulness", score=0.5)
111
+
112
+
113
+ def test_score_schema_numeric_range() -> None:
114
+ install(
115
+ {
116
+ "enabled": True,
117
+ "score_schema": {"faithfulness": {"type": "numeric", "min": 0.0, "max": 1.0}},
118
+ }
119
+ )
120
+ configure(exporters=[InMemoryExporter()], sync_export=True)
121
+ try:
122
+ with pytest.raises(ValueError, match="outside"):
123
+ record_evaluation("faithfulness", score=1.5, run_id="r")
124
+ record_evaluation("faithfulness", score=0.5, run_id="r") # in range ⇒ ok
125
+ finally:
126
+ reset_runtime()
127
+ reset_config()
128
+
129
+
130
+ def test_score_schema_categorical() -> None:
131
+ install(
132
+ {
133
+ "enabled": True,
134
+ "score_schema": {
135
+ "user_satisfaction": {"type": "categorical", "labels": ["thumbs_up", "thumbs_down"]}
136
+ },
137
+ }
138
+ )
139
+ configure(exporters=[InMemoryExporter()], sync_export=True)
140
+ try:
141
+ with pytest.raises(ValueError, match="not in"):
142
+ record_feedback("user_satisfaction", run_id="r", label="meh")
143
+ record_feedback("user_satisfaction", run_id="r", label="thumbs_up") # valid label
144
+ finally:
145
+ reset_runtime()
146
+ reset_config()
147
+
148
+
149
+ def test_unschemad_dimension_accepted() -> None:
150
+ install(
151
+ {"enabled": True, "score_schema": {"faithfulness": {"type": "numeric", "min": 0, "max": 1}}}
152
+ )
153
+ configure(exporters=[InMemoryExporter()], sync_export=True)
154
+ try:
155
+ record_evaluation("novel_metric", score=42.0, run_id="r") # not in schema ⇒ unvalidated
156
+ finally:
157
+ reset_runtime()
158
+ reset_config()
159
+
160
+
161
+ # --- enable switch + privacy --------------------------------------------------
162
+ def test_disabled_module_is_noop() -> None:
163
+ install({"enabled": False})
164
+ exporter = InMemoryExporter()
165
+ configure(exporters=[exporter], sync_export=True)
166
+ try:
167
+ record_evaluation("faithfulness", score=0.9, run_id="r")
168
+ assert exporter.records == [] # installed but not switched on ⇒ nothing emitted
169
+ finally:
170
+ reset_runtime()
171
+ reset_config()
172
+
173
+
174
+ def test_explanation_dropped_when_content_capture_off() -> None:
175
+ install({"enabled": True})
176
+ exporter = InMemoryExporter()
177
+ configure(exporters=[exporter], sync_export=True, capture_content=False) # P7: content off
178
+ try:
179
+ record_evaluation("faithfulness", score=0.9, explanation="contains PII", run_id="r")
180
+ [rec] = [r for r in exporter.records if r.name.startswith("evaluation ")]
181
+ assert "gen_ai.evaluation.explanation" not in rec.attributes # text dropped
182
+ finally:
183
+ reset_runtime()
184
+ reset_config()
185
+
186
+
187
+ def test_explanation_dropped_when_capture_explanation_off() -> None:
188
+ install({"enabled": True, "capture_explanation": False})
189
+ exporter = InMemoryExporter()
190
+ configure(exporters=[exporter], sync_export=True, capture_content=True)
191
+ try:
192
+ record_evaluation("faithfulness", score=0.9, explanation="hidden", run_id="r")
193
+ [rec] = [r for r in exporter.records if r.name.startswith("evaluation ")]
194
+ assert "gen_ai.evaluation.explanation" not in rec.attributes
195
+ finally:
196
+ reset_runtime()
197
+ reset_config()
198
+
199
+
200
+ # --- model + config -----------------------------------------------------------
201
+ def test_evaluation_result_defaults() -> None:
202
+ result = EvaluationResult(name="x", run_id="r", score=1.0)
203
+ assert result.source == "auto"
204
+ assert result.realtime is True
205
+ assert result.metadata == {}
206
+
207
+
208
+ def test_install_returns_enabled() -> None:
209
+ try:
210
+ assert install({"enabled": True}) is True
211
+ assert install({"enabled": False}) is False
212
+ finally:
213
+ reset_config()
214
+
215
+
216
+ def test_install_rejects_bad_emit_as() -> None:
217
+ try:
218
+ with pytest.raises(ValueError, match="emit_as"):
219
+ install({"emit_as": "telegram"})
220
+ finally:
221
+ reset_config()
222
+
223
+
224
def test_config_lazy_loads_from_settings(monkeypatch: pytest.MonkeyPatch) -> None:
    """With nothing installed, the lazily built config must come back disabled."""
    from forgesight_eval._config import get_config

    # The config builder lets FORGESIGHT_EVAL_* override the settings block, so a variable
    # exported on the developer/CI machine must not leak into this assertion.
    monkeypatch.delenv("FORGESIGHT_EVAL_ENABLED", raising=False)
    monkeypatch.delenv("FORGESIGHT_EVAL_EMIT_AS", raising=False)
    # NOTE(review): this still assumes the layered settings found by the SDK contain no
    # ``modules.eval.enabled: true`` — confirm for the test environment.
    reset_config()
    try:
        assert get_config().enabled is False  # no config ⇒ default disabled
    finally:
        reset_config()  # never leave a cached config behind, even when the assert fails
230
+
231
+
232
+ # --- fan-out to two exporters -------------------------------------------------
233
+ def test_evaluation_fans_out_to_two_exporters() -> None:
234
+ install({"enabled": True})
235
+ a, b = InMemoryExporter(), InMemoryExporter()
236
+ configure(exporters=[a, b], sync_export=True)
237
+ try:
238
+ with telemetry.agent_run("r"):
239
+ record_evaluation("faithfulness", score=0.9)
240
+ assert any(r.name.startswith("evaluation ") for r in a.records)
241
+ assert any(r.name.startswith("evaluation ") for r in b.records)
242
+ finally:
243
+ reset_runtime()
244
+ reset_config()