evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
evalgate_sdk/logger.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Structured logger with levels, child loggers, and request/response helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import Callable
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
LogLevel = Literal["trace", "debug", "info", "warn", "error"]

# Numeric value used for the SDK's "trace" level; sits below logging.DEBUG.
_TRACE_LEVEL = 5

# SDK level name -> stdlib ``logging`` numeric level.
_LEVEL_MAP: dict[str, int] = {
    "trace": _TRACE_LEVEL,
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warn": logging.WARNING,
    "error": logging.ERROR,
}


class Logger:
    """SDK logger with structured output, child loggers, and request tracing.

    Messages go either to a user-supplied ``handler(level, message, data)``
    callable or, when no handler is given, to a stdlib logger named
    ``evalgate_sdk.<prefix>`` that writes to stderr.
    """

    def __init__(
        self,
        *,
        level: LogLevel = "info",
        prefix: str = "evalai",
        handler: Callable[[str, str, Any], None] | None = None,
    ) -> None:
        """Create a logger.

        Args:
            level: Minimum level that will be emitted.
            prefix: Suffix of the underlying stdlib logger name.
            handler: Optional callable receiving ``(level, message, data)``;
                when set, the stdlib logger is bypassed entirely.
        """
        self._level = level
        self._prefix = prefix
        self._handler = handler

        # The stdlib renders unnamed levels as "Level 5"; register a readable
        # name once, without overriding a name another library already set.
        if logging.getLevelName(_TRACE_LEVEL) == f"Level {_TRACE_LEVEL}":
            logging.addLevelName(_TRACE_LEVEL, "TRACE")

        self._py_logger = logging.getLogger(f"evalgate_sdk.{prefix}")
        self._py_logger.setLevel(_LEVEL_MAP.get(level, logging.INFO))
        # Loggers are process-wide singletons keyed by name: attach the stderr
        # handler only the first time so repeated construction does not
        # duplicate output.
        if not self._py_logger.handlers:
            h = logging.StreamHandler(sys.stderr)
            h.setFormatter(logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s"))
            self._py_logger.addHandler(h)

    def set_level(self, level: LogLevel) -> None:
        """Change the minimum level for this logger and its stdlib logger."""
        self._level = level
        self._py_logger.setLevel(_LEVEL_MAP.get(level, logging.INFO))

    def is_level_enabled(self, level: LogLevel) -> bool:
        """Return True when *level* is at or above the configured level."""
        return _LEVEL_MAP.get(level, 0) >= _LEVEL_MAP.get(self._level, 0)

    def _emit(self, level: LogLevel, message: str, data: Any = None) -> None:
        """Deliver one record to the custom handler or the stdlib logger."""
        if self._handler:
            self._handler(level, message, data)
            return
        extra = f" {data}" if data is not None else ""
        py_level = _LEVEL_MAP.get(level, logging.INFO)
        self._py_logger.log(py_level, "%s%s", message, extra)

    def trace(self, message: str, data: Any = None) -> None:
        """Log *message* at trace level."""
        if self.is_level_enabled("trace"):
            self._emit("trace", message, data)

    def debug(self, message: str, data: Any = None) -> None:
        """Log *message* at debug level."""
        if self.is_level_enabled("debug"):
            self._emit("debug", message, data)

    def info(self, message: str, data: Any = None) -> None:
        """Log *message* at info level."""
        if self.is_level_enabled("info"):
            self._emit("info", message, data)

    def warn(self, message: str, data: Any = None) -> None:
        """Log *message* at warn level."""
        if self.is_level_enabled("warn"):
            self._emit("warn", message, data)

    def error(self, message: str, data: Any = None) -> None:
        """Log *message* at error level."""
        if self.is_level_enabled("error"):
            self._emit("error", message, data)

    def log_request(self, method: str, url: str, data: Any = None) -> None:
        """Log an outgoing request at debug level."""
        self.debug(f"→ {method} {url}", data)

    def log_response(self, method: str, url: str, status: int, duration_ms: float, data: Any = None) -> None:
        """Log a received response, with status and duration, at debug level."""
        self.debug(f"← {method} {url} {status} ({duration_ms:.0f}ms)", data)

    def child(self, prefix: str) -> Logger:
        """Return a new logger whose prefix is ``<parent>:<prefix>``.

        The child copies the parent's current level and handler; later
        changes to the parent are not propagated.
        """
        return Logger(
            level=self._level,
            prefix=f"{self._prefix}:{prefix}",
            handler=self._handler,
        )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class RequestLogger:
    """Times a request/response pair and forwards both to a :class:`Logger`.

    Only one start timestamp is kept, so each ``on_response`` is measured
    against the most recent ``on_request``.
    """

    def __init__(self, logger: Logger) -> None:
        # Monotonic timestamp of the last request; 0 until one is observed.
        self._start: float = 0
        self._logger = logger

    def on_request(self, method: str, url: str, body: Any = None) -> None:
        """Record the start time and log the outgoing request."""
        self._start = time.monotonic()
        self._logger.log_request(method, url, body)

    def on_response(self, method: str, url: str, status: int, body: Any = None) -> None:
        """Log the response together with the elapsed time in milliseconds."""
        seconds_elapsed = time.monotonic() - self._start
        duration_ms = seconds_elapsed * 1000
        self._logger.log_response(method, url, status, duration_ms, body)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
_global_logger: Logger | None = None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def create_logger(level: LogLevel = "info", **kwargs: Any) -> Logger:
    """Build a new :class:`Logger`; extra keyword arguments are forwarded to it."""
    options: dict[str, Any] = dict(kwargs, level=level)
    return Logger(**options)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_logger() -> Logger:
    """Return the process-wide logger, creating a default one on first use."""
    global _global_logger
    if _global_logger is not None:
        return _global_logger
    _global_logger = create_logger()
    return _global_logger
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def set_logger(logger: Logger) -> None:
    """Replace the process-wide logger returned by :func:`get_logger`."""
    global _global_logger
    _global_logger = logger
|
evalgate_sdk/matchers.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Pytest plugin — provides ``to_pass_gate`` assertion for regression results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def to_pass_gate(result: Any) -> bool:
    """Report whether an eval result passes the gate.

    An object's ``passed`` attribute takes precedence; otherwise a dict's
    ``"passed"`` key is used. Anything else counts as not passing.

    Usage in pytest::

        from evalgate_sdk.matchers import to_pass_gate

        result = await openai_chat_eval(...)
        assert to_pass_gate(result)
    """
    if hasattr(result, "passed"):
        verdict = result.passed
    elif isinstance(result, dict):
        verdict = result.get("passed", False)
    else:
        verdict = False
    return bool(verdict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class GateAssertionError(AssertionError):
    """Raised when a gate assertion fails with diagnostic info.

    The message reports ``passed_count``, ``total`` and ``score`` taken from
    the result. Results may be objects (attributes) or dicts (keys); a field
    that is missing is shown as ``?``.

    Attributes:
        result: The eval result that failed the gate.
    """

    def __init__(self, result: Any) -> None:
        self.result = result
        score = self._diagnostic(result, "score")
        total = self._diagnostic(result, "total")
        passed = self._diagnostic(result, "passed_count")
        super().__init__(f"Gate assertion failed: {passed}/{total} passed (score={score})")

    @staticmethod
    def _diagnostic(result: Any, name: str) -> Any:
        """Read *name* from a dict key or an attribute, defaulting to ``"?"``."""
        # Dict results are accepted by the gate check, so their keys must be
        # consulted here too; plain getattr() would never find them.
        if isinstance(result, dict):
            return result.get(name, "?")
        return getattr(result, name, "?")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def assert_passes_gate(result: Any, message: str = "") -> None:
    """Raise unless *result* passes the gate.

    Args:
        result: The eval result to check (must have a ``passed`` attribute or key).
        message: Optional custom error message to display on failure.

    Raises:
        AssertionError: With *message*, when the gate fails and a message was given.
        GateAssertionError: With diagnostics from *result*, when no message was given.
    """
    if to_pass_gate(result):
        return
    if not message:
        raise GateAssertionError(result)
    raise AssertionError(message)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ── Pytest plugin ────────────────────────────────────────────────────
|
|
52
|
+
|
|
53
|
+
# pytest is optional: the fixture below is only defined when it can be imported.
try:
    import pytest

    @pytest.fixture
    def gate_result():
        """Fixture that provides a gate assertion helper."""
        # Hands the test the assert_passes_gate callable itself, not a result.
        return assert_passes_gate

except ImportError:
    # pytest is not installed; the module stays importable without the fixture.
    pass
|
evalgate_sdk/otel.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""OpenTelemetry exporter for WorkflowTracer spans (T6).
|
|
2
|
+
|
|
3
|
+
Port of the TypeScript SDK's ``otel.ts``.
|
|
4
|
+
Converts workflow tracer data into OTLP-compatible JSON payloads.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import random
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _generate_trace_id() -> str:
|
|
19
|
+
"""Generate a 32-hex-char trace ID."""
|
|
20
|
+
return f"{random.getrandbits(128):032x}"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _generate_span_id() -> str:
|
|
24
|
+
"""Generate a 16-hex-char span ID."""
|
|
25
|
+
return f"{random.getrandbits(64):016x}"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _ms_to_ns(ms: float) -> int:
|
|
29
|
+
"""Convert milliseconds to nanoseconds."""
|
|
30
|
+
return int(ms * 1_000_000)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class OTelAttribute:
    """A single key/value attribute, serialisable to OTLP JSON form."""

    key: str
    value: Any

    def to_dict(self) -> dict[str, Any]:
        """Return ``{"key": ..., "value": {<typed field>: ...}}``.

        bool is tested before int because ``bool`` is a subclass of ``int``.
        Ints are emitted as strings; any non-numeric value is stringified.
        """
        raw = self.value
        if isinstance(raw, bool):
            encoded: dict[str, Any] = {"boolValue": raw}
        elif isinstance(raw, int):
            encoded = {"intValue": str(raw)}
        elif isinstance(raw, float):
            encoded = {"doubleValue": raw}
        else:
            encoded = {"stringValue": str(raw)}
        return {"key": self.key, "value": encoded}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class OTelEvent:
    """A named, timestamped span event with optional attributes."""

    name: str
    time_unix_nano: int = 0
    attributes: list[OTelAttribute] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialise to OTLP JSON form; the timestamp is emitted as a string."""
        encoded_attributes = []
        for attribute in self.attributes:
            encoded_attributes.append(attribute.to_dict())
        return {
            "name": self.name,
            "timeUnixNano": str(self.time_unix_nano),
            "attributes": encoded_attributes,
        }
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class OTelSpan:
    """One span of a trace, serialisable to OTLP JSON form."""

    trace_id: str
    span_id: str
    name: str
    start_time_unix_nano: int
    end_time_unix_nano: int
    parent_span_id: str | None = None
    kind: int = 1  # SPAN_KIND_INTERNAL
    status_code: int = 1  # STATUS_CODE_OK
    status_message: str = ""
    attributes: list[OTelAttribute] = field(default_factory=list)
    events: list[OTelEvent] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the span.

        Timestamps are emitted as strings. ``parentSpanId`` is added only
        when a non-empty parent ID is set.
        """
        status = {"code": self.status_code, "message": self.status_message}
        payload: dict[str, Any] = {
            "traceId": self.trace_id,
            "spanId": self.span_id,
            "name": self.name,
            "kind": self.kind,
            "startTimeUnixNano": str(self.start_time_unix_nano),
            "endTimeUnixNano": str(self.end_time_unix_nano),
            "status": status,
            "attributes": [attribute.to_dict() for attribute in self.attributes],
            "events": [event.to_dict() for event in self.events],
        }
        if not self.parent_span_id:
            return payload
        payload["parentSpanId"] = self.parent_span_id
        return payload
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass
class OTelExportPayload:
    """OTLP JSON export payload wrapping a list of resource-span dicts."""

    resource_spans: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the payload under the top-level ``resourceSpans`` key."""
        return dict(resourceSpans=self.resource_spans)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class OTelExporterOptions:
    """Configuration accepted by :class:`OTelExporter`."""

    # Collector base URL. When empty, OTelExporter falls back to the
    # OTEL_EXPORTER_OTLP_ENDPOINT environment variable, then to
    # http://localhost:4318.
    endpoint: str = ""
    # Emitted as the "service.name" resource attribute and as the
    # "evalgate.service" span attribute.
    service_name: str = "evalgate-sdk"
    # Extra HTTP headers merged into the export request.
    headers: dict[str, str] = field(default_factory=dict)
    # Request timeout in milliseconds (the exporter divides by 1000).
    timeout_ms: int = 10_000
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class OTelExporter:
    """Exports evaluation data as OpenTelemetry spans.

    Builds OTLP-style JSON payloads from tracer spans or run results and can
    POST them to ``<endpoint>/v1/traces``.
    """

    def __init__(self, options: OTelExporterOptions | None = None) -> None:
        """Resolve endpoint, service name, headers and timeout from *options*."""
        opts = options or OTelExporterOptions()
        # Explicit option first, then the OTLP environment variable, then localhost.
        self._endpoint = opts.endpoint or os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
        self._service_name = opts.service_name
        self._headers = opts.headers
        self._timeout = opts.timeout_ms / 1000.0  # milliseconds -> seconds

    @staticmethod
    def _span_field(raw: Any, name: str, default: Any = None) -> Any:
        """Read *name* from a raw span given either as a dict or as an object."""
        if isinstance(raw, dict):
            return raw.get(name, default)
        return getattr(raw, name, default)

    def export_from_tracer(self, tracer: Any) -> OTelExportPayload:
        """Convert a WorkflowTracer into an OTLP payload.

        *tracer* should expose ``.spans`` (list of dicts or objects with name,
        start_time, end_time, metadata, parent_span_id, etc.). Times are read
        as milliseconds. A span without an end time is exported with
        ``end == start``; a missing start time is treated as 0.
        """
        trace_id = _generate_trace_id()
        spans: list[OTelSpan] = []

        for raw in getattr(tracer, "spans", None) or []:
            start_ms = self._span_field(raw, "start_time") or 0
            end_ms = self._span_field(raw, "end_time")
            if end_ms is None:
                # Fall back to the start time *in milliseconds*; reusing the
                # already-converted nanosecond value would scale it twice.
                end_ms = start_ms

            attrs: list[OTelAttribute] = []
            metadata = self._span_field(raw, "metadata")
            if isinstance(metadata, dict):
                attrs.extend(OTelAttribute(key=f"evalgate.{k}", value=v) for k, v in metadata.items())
            attrs.append(OTelAttribute(key="evalgate.service", value=self._service_name))

            spans.append(
                OTelSpan(
                    trace_id=trace_id,
                    # NOTE(review): span IDs are generated here while the parent ID
                    # is copied verbatim from the tracer, so parent links only
                    # resolve if the tracer's IDs are not needed downstream —
                    # confirm against WorkflowTracer.
                    span_id=_generate_span_id(),
                    name=self._span_field(raw, "name", "unknown"),
                    start_time_unix_nano=_ms_to_ns(start_ms),
                    end_time_unix_nano=_ms_to_ns(end_ms),
                    parent_span_id=self._span_field(raw, "parent_span_id"),
                    attributes=attrs,
                )
            )

        return self._build_payload(spans)

    def export_run_result(
        self,
        run_id: str,
        results: list[dict[str, Any]],
        start_time_ms: float | None = None,
        end_time_ms: float | None = None,
    ) -> OTelExportPayload:
        """Convert evaluation run results into an OTLP payload.

        Emits one root span named ``evalgate.run.<run_id>`` plus one child
        span per result. Missing (or ``None``) timestamps default to the
        current time and a missing duration to 0.
        """
        trace_id = _generate_trace_id()
        now_ms = time.time() * 1000
        root_start = _ms_to_ns(start_time_ms or now_ms)
        root_end = _ms_to_ns(end_time_ms or now_ms)

        root_span_id = _generate_span_id()
        spans: list[OTelSpan] = [
            OTelSpan(
                trace_id=trace_id,
                span_id=root_span_id,
                name=f"evalgate.run.{run_id}",
                start_time_unix_nano=root_start,
                end_time_unix_nano=root_end,
                attributes=[
                    OTelAttribute(key="evalgate.run_id", value=run_id),
                    OTelAttribute(key="evalgate.service", value=self._service_name),
                ],
            )
        ]

        for r in results:
            # Keys present with a None value must not reach the arithmetic below.
            spec_start_ms = r.get("start_time_ms")
            if spec_start_ms is None:
                spec_start_ms = now_ms
            duration_ms = r.get("duration_ms") or 0
            s_start = _ms_to_ns(spec_start_ms)
            s_end = s_start + _ms_to_ns(duration_ms)

            error = r.get("error")
            status = 1 if r.get("passed") else 2  # OK or ERROR
            attrs = [
                OTelAttribute(key="evalgate.test_id", value=r.get("test_id", "")),
                OTelAttribute(key="evalgate.test_name", value=r.get("test_name", "")),
                OTelAttribute(key="evalgate.score", value=r.get("score", 0)),
                OTelAttribute(key="evalgate.passed", value=r.get("passed", False)),
            ]
            spans.append(
                OTelSpan(
                    trace_id=trace_id,
                    span_id=_generate_span_id(),
                    parent_span_id=root_span_id,
                    name=f"evalgate.spec.{r.get('test_name', 'unknown')}",
                    start_time_unix_nano=s_start,
                    end_time_unix_nano=s_end,
                    status_code=status,
                    # The status message is serialised to JSON, so keep it a string.
                    status_message="" if error is None else str(error),
                    attributes=attrs,
                )
            )

        return self._build_payload(spans)

    def _build_payload(self, spans: list[OTelSpan]) -> OTelExportPayload:
        """Wrap *spans* in a single resource/scope envelope."""
        resource = {
            "resource": {
                "attributes": [
                    OTelAttribute(key="service.name", value=self._service_name).to_dict(),
                ],
            },
            "scopeSpans": [
                {
                    "scope": {"name": "evalgate-sdk"},
                    "spans": [s.to_dict() for s in spans],
                }
            ],
        }
        return OTelExportPayload(resource_spans=[resource])

    async def send(self, payload: OTelExportPayload) -> bool:
        """POST the OTLP payload to the collector endpoint.

        Returns True when the response status is below 400. Returns False on
        connection/timeout errors instead of raising.
        """
        url = f"{self._endpoint.rstrip('/')}/v1/traces"
        try:
            async with httpx.AsyncClient(timeout=self._timeout) as client:
                resp = await client.post(
                    url,
                    json=payload.to_dict(),
                    headers={"Content-Type": "application/json", **self._headers},
                )
                return resp.status_code < 400
        except (httpx.ConnectError, httpx.TimeoutException, OSError):
            return False
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def create_otel_exporter(options: OTelExporterOptions | None = None) -> OTelExporter:
    """Build an :class:`OTelExporter` from optional *options*."""
    exporter = OTelExporter(options)
    return exporter
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Cursor-based pagination helpers — iterators, auto-pagination, and encoding."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import json
|
|
7
|
+
from collections.abc import AsyncIterator, Callable
|
|
8
|
+
from typing import Any, Generic, TypeVar
|
|
9
|
+
|
|
10
|
+
T = TypeVar("T")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def encode_cursor(data: Any) -> str:
    """Serialise *data* to JSON and return it as a URL-safe base64 string."""
    serialized = json.dumps(data)
    token = base64.urlsafe_b64encode(serialized.encode())
    return token.decode()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def decode_cursor(cursor: str) -> Any:
    """Decode a base64 cursor string back to its original value.

    Cursors whose trailing ``=`` padding was stripped (common once a cursor
    has travelled through a URL) are accepted as well.

    Raises:
        ValueError: If the cursor is not valid base64 or not valid JSON
            (``binascii.Error`` and ``json.JSONDecodeError`` are subclasses).
    """
    # The strict decoder rejects unpadded input; restore padding to a multiple of 4.
    padded = cursor + "=" * (-len(cursor) % 4)
    return json.loads(base64.urlsafe_b64decode(padded.encode()).decode())
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PaginatedResponse(Generic[T]):
    """One page of results together with its pagination metadata.

    Attributes:
        data: Items on this page.
        has_more: Whether further pages are available.
        total: Total item count, or ``None`` when not supplied.
    """

    def __init__(self, data: list[T], has_more: bool, total: int | None = None) -> None:
        self.data, self.has_more, self.total = data, has_more, total
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PaginatedIterator(Generic[T]):
    """Async iterator that fetches one page per step, advancing an offset.

    ``fetch_fn(offset, limit)`` is awaited for each page and may return a
    dict (``data`` plus ``has_more``/``hasMore``), a ``PaginatedResponse``,
    a list, or any other iterable of items.

    Usage::

        pages = PaginatedIterator(fetch_fn, limit=20)
        async for page in pages:
            for item in page:
                print(item)

        # or collect everything
        all_items = await pages.to_list()
    """

    def __init__(
        self,
        fetch_fn: Callable[[int, int], Any],
        limit: int = 20,
    ) -> None:
        self._fetch_fn = fetch_fn
        self._limit = limit
        self._offset = 0
        self._exhausted = False

    def __aiter__(self) -> PaginatedIterator[T]:
        return self

    def _unpack(self, fetched: Any) -> tuple[Any, Any]:
        """Split a fetch result into ``(items, has_more)``."""
        if isinstance(fetched, dict):
            items = fetched.get("data", [])
            more = fetched.get("has_more", fetched.get("hasMore", False))
            return items, more
        if isinstance(fetched, PaginatedResponse):
            return fetched.data, fetched.has_more
        # Bare sequences carry no flag: a full page implies more may follow.
        items = fetched if isinstance(fetched, list) else list(fetched)
        return items, len(items) >= self._limit

    async def __anext__(self) -> list[T]:
        if self._exhausted:
            raise StopAsyncIteration

        fetched = await self._fetch_fn(self._offset, self._limit)
        page, more = self._unpack(fetched)

        # An empty page always ends iteration, whatever has_more claims.
        if not page:
            self._exhausted = True
            raise StopAsyncIteration

        self._offset += len(page)
        if not more:
            self._exhausted = True

        return page

    async def to_list(self) -> list[T]:
        """Collect all remaining pages into a single flat list."""
        collected: list[T] = []
        async for page in self:
            collected.extend(page)
        return collected

    def reset(self) -> None:
        """Rewind to the first page so the iterator can be reused."""
        self._offset = 0
        self._exhausted = False
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def create_paginated_iterator(
    fetch_fn: Callable[[int, int], Any],
    limit: int = 20,
) -> PaginatedIterator[Any]:
    """Wrap *fetch_fn* in a :class:`PaginatedIterator` with the given page size."""
    iterator: PaginatedIterator[Any] = PaginatedIterator(fetch_fn, limit)
    return iterator
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def auto_paginate(
    fetch_fn: Callable[[int, int], Any],
    limit: int = 20,
) -> AsyncIterator[Any]:
    """Yield items one by one, fetching further pages as needed."""
    async for batch in PaginatedIterator(fetch_fn, limit):
        for entry in batch:
            yield entry
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def create_pagination_meta(
    items: list[Any],
    limit: int,
    offset: int,
    total: int | None = None,
) -> dict[str, Any]:
    """Create pagination metadata for an API response.

    Args:
        items: Items on the current page.
        limit: Requested page size.
        offset: Index of the first item on this page.
        total: Total number of items, or ``None`` when unknown.

    Returns:
        Dict with ``limit``, ``offset``, ``count``, ``total`` and ``has_more``.
        When *total* is known, ``has_more`` is true while items remain beyond
        this page; when unknown, a full page is taken to mean more may follow.
    """
    count = len(items)
    # Compare against None explicitly: total == 0 is a known total, not "unknown".
    if total is not None:
        has_more = offset + count < total
    else:
        has_more = count >= limit
    return {
        "limit": limit,
        "offset": offset,
        "count": count,
        "total": total,
        "has_more": has_more,
    }
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def parse_pagination_params(
    params: dict[str, Any] | None = None,
    default_limit: int = 20,
    max_limit: int = 100,
) -> dict[str, int]:
    """Parse and clamp pagination params.

    Args:
        params: Mapping that may contain ``limit`` and ``offset`` (ints or
            numeric strings). Missing or ``None`` values use the defaults.
        default_limit: Limit used when none is supplied.
        max_limit: Upper bound applied to a supplied limit.

    Returns:
        ``{"limit": ..., "offset": ...}`` with the limit clamped to
        ``1..max_limit`` and the offset clamped to be non-negative.

    Raises:
        ValueError: If a supplied value cannot be converted to ``int``.
    """
    if params is None:
        return {"limit": default_limit, "offset": 0}

    raw_limit = params.get("limit")
    raw_offset = params.get("offset")
    # An explicit null is treated like an absent key rather than crashing int().
    limit = default_limit if raw_limit is None else int(raw_limit)
    offset = 0 if raw_offset is None else int(raw_offset)

    # Zero/negative limits are raised to 1; max_limit is applied last so it always wins.
    limit = min(max(limit, 1), max_limit)
    offset = max(offset, 0)
    return {"limit": limit, "offset": offset}
|
evalgate_sdk/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Pytest plugin for EvalGate assertions (T9).
|
|
2
|
+
|
|
3
|
+
Provides custom pytest assertions for eval results and a plugin
|
|
4
|
+
that can be auto-registered via pyproject.toml entry points.
|
|
5
|
+
|
|
6
|
+
Usage::
|
|
7
|
+
|
|
8
|
+
from evalgate_sdk.pytest_plugin import assert_passes_gate, assert_score_above
|
|
9
|
+
|
|
10
|
+
def test_chatbot_quality(eval_result):
|
|
11
|
+
assert_passes_gate(eval_result)
|
|
12
|
+
assert_score_above(eval_result, 90.0)
|
|
13
|
+
|
|
14
|
+
Note: pytest is imported lazily so the SDK can be used without pytest installed.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import TYPE_CHECKING, Any
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
import pytest as _pytest_type # noqa: F401
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_pytest() -> Any:
|
|
26
|
+
"""Lazily import pytest, raising a clear error if not installed."""
|
|
27
|
+
try:
|
|
28
|
+
import pytest
|
|
29
|
+
|
|
30
|
+
return pytest
|
|
31
|
+
except ImportError as exc:
|
|
32
|
+
raise ImportError(
|
|
33
|
+
"pytest is required for evalgate_sdk.pytest_plugin. Install with: pip install pytest"
|
|
34
|
+
) from exc
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def assert_passes_gate(result: Any, message: str = "") -> None:
    """Assert that an eval result passes the quality gate.

    *result* should have a ``passed`` attribute (or key) that is truthy.

    Thin wrapper: the check itself lives in
    :func:`evalgate_sdk.matchers.assert_passes_gate`, imported at call time.
    """
    from evalgate_sdk import matchers as _matchers

    _matchers.assert_passes_gate(result, message)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def assert_score_above(result: Any, threshold: float, message: str = "") -> None:
    """Fail the test when the result has no score or its score is below *threshold*.

    A score equal to *threshold* passes. *message*, when given, replaces the
    default failure text.
    """
    pytest = _get_pytest()
    score = _get_field(result, "score")
    if score is None:
        reason = "Result has no 'score' field"
    elif score < threshold:
        reason = f"Score {score} is below threshold {threshold}"
    else:
        return
    pytest.fail(message or reason)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def assert_score_between(result: Any, min_score: float, max_score: float, message: str = "") -> None:
    """Fail the test unless the result's score lies within [min_score, max_score].

    Both bounds are inclusive; a result without a score fails. *message*,
    when given, replaces the default failure text.
    """
    pytest = _get_pytest()
    score = _get_field(result, "score")
    if score is None:
        reason = "Result has no 'score' field"
    elif score < min_score or score > max_score:
        reason = f"Score {score} not in [{min_score}, {max_score}]"
    else:
        return
    pytest.fail(message or reason)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def assert_no_errors(result: Any, message: str = "") -> None:
    """Fail the test when the result has a truthy ``error`` or status ``"error"``.

    The ``error`` field is checked first. *message*, when given, replaces the
    default failure text.
    """
    pytest = _get_pytest()
    error = _get_field(result, "error")
    status = _get_field(result, "status")
    if error:
        reason = f"Eval result has error: {error}"
    elif status == "error":
        reason = "Eval result status is 'error'"
    else:
        return
    pytest.fail(message or reason)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def assert_all_assertions_passed(result: Any, message: str = "") -> None:
    """Fail the test at the first sub-assertion whose ``passed`` field is falsy.

    A result with no ``assertions`` passes. The failure text names the
    sub-assertion by ``assertion_type``, then ``name``, then its index.
    """
    pytest = _get_pytest()
    for index, sub in enumerate(_get_field(result, "assertions") or []):
        if _get_field(sub, "passed"):
            continue
        label = _get_field(sub, "assertion_type") or _get_field(sub, "name") or f"assertion-{index}"
        detail = _get_field(sub, "message") or "failed"
        pytest.fail(message or f"Sub-assertion '{label}' failed: {detail}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _get_field(obj: Any, field: str) -> Any:
|
|
93
|
+
"""Get a field from a dict or dataclass/object."""
|
|
94
|
+
if isinstance(obj, dict):
|
|
95
|
+
return obj.get(field)
|
|
96
|
+
return getattr(obj, field, None)
|