autobots-devtools-shared-lib 0.5.3__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/PKG-INFO +3 -1
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/pyproject.toml +8 -2
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/__init__.py +33 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/__init__.py +3 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/deterministic.py +174 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/golden.py +161 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/llm_judge.py +196 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/registry.py +77 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/__init__.py +3 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/cost_tracker.py +139 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/loader.py +59 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/runner.py +161 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/workspace.py +51 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/__init__.py +3 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/eval_case.py +96 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/result.py +104 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/__init__.py +3 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/fixtures.py +111 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/plugin.py +90 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/reporting.py +68 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/scoring/__init__.py +3 -0
- autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/scoring/langfuse_scorer.py +60 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/README.md +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_config.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_constants.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_loader.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/logging_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/otel_fastapi.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/trace_metadata.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/trace_propagation.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/tracing.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/README.md +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/app.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/config.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/models.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/README.md +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/cache_backed.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/db_repository.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/factory.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/in_memory.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/redis_store.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/store.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/context_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/format_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/fserver_client_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_builtin_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_pipeline_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/context_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/format_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/fserver_client_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_builtin_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_http_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_pipeline_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_config_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_meta.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/base_agent.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/batch.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/invocation_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/middleware.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/config/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/config/dynagent_settings.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/llm/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/llm/llm.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/models/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/models/state.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/services/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/services/structured_converter.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/state_tools.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/tool_registry.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/default_ui.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/ui_utils.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/utils/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/utils/schema_directive_resolver.py +0 -0
- {autobots_devtools_shared_lib-0.5.3 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: autobots-devtools-shared-lib
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Shared library functions to be used for all autobots projects
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Pralhad
|
|
@@ -18,6 +18,7 @@ Requires-Dist: langchain (>=1.0.0)
|
|
|
18
18
|
Requires-Dist: langchain-anthropic (>=1.4.0)
|
|
19
19
|
Requires-Dist: langchain-google-genai (>=4.2.0)
|
|
20
20
|
Requires-Dist: langfuse (>=3.12.1)
|
|
21
|
+
Requires-Dist: openevals (>=0.1.0)
|
|
21
22
|
Requires-Dist: opentelemetry-api (>=1.30.0,<2.0.0)
|
|
22
23
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http (>=1.30.0,<2.0.0)
|
|
23
24
|
Requires-Dist: opentelemetry-instrumentation-fastapi (>=0.49b0)
|
|
@@ -26,6 +27,7 @@ Requires-Dist: pydantic-settings (>=2.10.1)
|
|
|
26
27
|
Requires-Dist: python-dotenv (>=1.1.1)
|
|
27
28
|
Requires-Dist: pyyaml (>=6.0.3)
|
|
28
29
|
Requires-Dist: referencing (>=0.37.0)
|
|
30
|
+
Requires-Dist: tiktoken (>=0.7.0)
|
|
29
31
|
Requires-Dist: uvicorn[standard] (>=0.32.0)
|
|
30
32
|
Description-Content-Type: text/markdown
|
|
31
33
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "autobots-devtools-shared-lib"
|
|
3
|
-
version = "0.5.3"
|
|
3
|
+
version = "0.6.0"
|
|
4
4
|
description = "Shared library functions to be used for all autobots projects"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -25,6 +25,8 @@ dependencies = [
|
|
|
25
25
|
"opentelemetry-sdk>=1.30.0,<2.0.0",
|
|
26
26
|
"opentelemetry-exporter-otlp-proto-http>=1.30.0,<2.0.0",
|
|
27
27
|
"opentelemetry-instrumentation-fastapi>=0.49b0",
|
|
28
|
+
"openevals>=0.1.0",
|
|
29
|
+
"tiktoken>=0.7.0",
|
|
28
30
|
]
|
|
29
31
|
dev = [
|
|
30
32
|
"pre-commit>=4.5.1",
|
|
@@ -33,6 +35,7 @@ dev = [
|
|
|
33
35
|
"pytest-asyncio>=1.3.0",
|
|
34
36
|
"pytest-cov>=7.0.0",
|
|
35
37
|
"ruff>=0.14.14",
|
|
38
|
+
"pytest-xdist>=3.0.0",
|
|
36
39
|
]
|
|
37
40
|
|
|
38
41
|
[tool.poetry]
|
|
@@ -44,6 +47,9 @@ packages = [
|
|
|
44
47
|
requires = ["poetry-core"]
|
|
45
48
|
build-backend = "poetry.core.masonry.api"
|
|
46
49
|
|
|
50
|
+
[project.entry-points."pytest11"]
|
|
51
|
+
dynagent_eval = "autobots_devtools_shared_lib.eval.pytest_plugin.plugin"
|
|
52
|
+
|
|
47
53
|
[tool.ruff]
|
|
48
54
|
line-length = 100
|
|
49
55
|
target-version = "py312"
|
|
@@ -106,7 +112,7 @@ markers = [
|
|
|
106
112
|
pythonVersion = "3.12"
|
|
107
113
|
typeCheckingMode = "basic"
|
|
108
114
|
include = ["src", "tests"]
|
|
109
|
-
exclude = ["**/__pycache__", ".venv"]
|
|
115
|
+
exclude = ["**/__pycache__", ".venv", "**/eval/core/cost_tracker.py", "**/eval/assertions/llm_judge.py"]
|
|
110
116
|
venvPath = ".."
|
|
111
117
|
venv = ".venv"
|
|
112
118
|
extraPaths = ["src"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# ABOUTME: Public API for the dynagent eval framework.
|
|
2
|
+
# ABOUTME: Import from here for a stable surface — loader, models, and result types.
|
|
3
|
+
|
|
4
|
+
from autobots_devtools_shared_lib.eval.assertions.registry import register_assertion
|
|
5
|
+
from autobots_devtools_shared_lib.eval.core.loader import EvalConfigError, load_eval_cases
|
|
6
|
+
from autobots_devtools_shared_lib.eval.models.eval_case import (
|
|
7
|
+
EvalCase,
|
|
8
|
+
SetupConfig,
|
|
9
|
+
WorkspaceFile,
|
|
10
|
+
)
|
|
11
|
+
from autobots_devtools_shared_lib.eval.models.result import (
|
|
12
|
+
AgentOutput,
|
|
13
|
+
AssertionResult,
|
|
14
|
+
CostDelta,
|
|
15
|
+
EvalCostSnapshot,
|
|
16
|
+
EvalResult,
|
|
17
|
+
TurnResult,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"AgentOutput",
|
|
22
|
+
"AssertionResult",
|
|
23
|
+
"CostDelta",
|
|
24
|
+
"EvalCase",
|
|
25
|
+
"EvalConfigError",
|
|
26
|
+
"EvalCostSnapshot",
|
|
27
|
+
"EvalResult",
|
|
28
|
+
"SetupConfig",
|
|
29
|
+
"TurnResult",
|
|
30
|
+
"WorkspaceFile",
|
|
31
|
+
"load_eval_cases",
|
|
32
|
+
"register_assertion",
|
|
33
|
+
]
|
autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/deterministic.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# ABOUTME: Deterministic assertion functions wrapping OpenEvals and built-in checks.
|
|
2
|
+
# ABOUTME: Each function takes AgentOutput + config and returns AssertionResult.
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import re
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import jsonschema as js
|
|
12
|
+
|
|
13
|
+
from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _last_ai_content(agent_output: AgentOutput) -> str:
    """Return the text of the most recent AI message that carries any.

    Messages are scanned newest-first. A message counts as an AI message when
    it exposes ``type == "ai"``. String content is returned unchanged.

    List content is assumed to follow the LangChain content-block convention
    (plain strings and ``{"type": "text", "text": ...}`` dicts) -- TODO confirm
    against the providers in use. Its text parts are concatenated rather than
    returning the Python ``repr`` of the list, which the JSON and regex
    assertions cannot work with. A list without any text part is skipped and
    the scan continues with earlier messages.

    Args:
        agent_output: Object exposing a ``messages`` sequence.

    Returns:
        The extracted text, or ``""`` when no AI message carries text.
    """
    for msg in reversed(agent_output.messages):
        if getattr(msg, "type", None) != "ai":
            continue
        content = msg.content
        if not content:
            continue
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: list[str] = []
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and block.get("type") == "text":
                    parts.append(str(block.get("text", "")))
            text = "".join(parts)
            if text:
                return text
            # Only non-text blocks (e.g. tool calls): keep looking further back.
            continue
        # Unknown content shape: fall back to the previous str() behaviour.
        return str(content)
    return ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _all_tool_names(agent_output: AgentOutput) -> list[str]:
    """Collect the names of every tool call, in message order.

    Each message's ``tool_calls`` attribute (if present and non-empty) is
    walked. Dict entries contribute their ``"name"`` value (``""`` when the
    key is absent); other entries contribute their ``name`` attribute when
    they have one and are skipped otherwise.
    """

    def _names_of(message: Any):
        # A missing or empty ``tool_calls`` yields nothing.
        for call in getattr(message, "tool_calls", None) or ():
            if isinstance(call, dict):
                yield call.get("name", "")
            elif hasattr(call, "name"):
                yield call.name

    collected: list[str] = []
    for message in agent_output.messages:
        collected.extend(_names_of(message))
    return collected
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def contains(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Case-insensitive substring check against the last AI response.

    ``config`` is stringified and lower-cased, then looked up in the
    lower-cased response text.
    """
    response_text = _last_ai_content(agent_output).lower()
    needle = str(config).lower()
    if needle in response_text:
        outcome, note = True, "Found in response"
    else:
        outcome, note = False, "Not found in response"
    return AssertionResult(passed=outcome, name=f"contains:{config}", detail=note)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def regex(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check whether the last AI response matches a regular expression.

    ``config`` is stringified and used as the pattern for ``re.search``, so a
    match anywhere in the response passes.

    Args:
        agent_output: Agent run whose last AI message is inspected.
        config: The regex pattern (converted with ``str``).

    Returns:
        A passing result on a match and a failing one otherwise. An invalid
        pattern yields a failing result describing the error instead of
        raising, in line with how the JSON assertions report bad input.
    """
    text = _last_ai_content(agent_output)
    pattern = str(config)
    try:
        match = re.search(pattern, text) is not None
    except re.error as exc:
        # A malformed pattern is a test-authoring error; report it as a
        # failed assertion rather than letting it escape.
        return AssertionResult(
            passed=False,
            name=f"regex:{pattern}",
            detail=f"Invalid pattern: {exc}",
        )
    return AssertionResult(
        passed=match,
        name=f"regex:{pattern}",
        detail=f"{'Matched' if match else 'No match'} for pattern",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def exact_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Compare the last AI response with the expected string.

    Both sides are stripped of leading/trailing whitespace before the
    comparison. The detail shows at most the first 100 characters of the
    expected value.
    """
    response_text = _last_ai_content(agent_output)
    wanted = str(config)
    return AssertionResult(
        passed=response_text.strip() == wanted.strip(),
        name="exact_match",
        detail=f"Expected: {wanted[:100]}",
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def json_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check whether the last AI response, parsed as JSON, equals the expectation.

    Args:
        agent_output: Agent run whose last AI message is parsed as JSON.
        config: The expected value. A string is parsed as JSON text; any other
            value (dict, list, number, bool, ``None``) is taken as already
            parsed and compared as-is. Previously only dicts were used
            directly, so a list expectation was routed through ``str()`` and
            usually failed to parse.

    Returns:
        A passing result when both values compare equal, a failing one when
        they differ, and a failing "Parse error" result when either side is
        not valid JSON.
    """
    text = _last_ai_content(agent_output)
    try:
        actual = json.loads(text)
        expected = json.loads(config) if isinstance(config, str) else config
    except (json.JSONDecodeError, TypeError) as e:
        return AssertionResult(passed=False, name="json_match", detail=f"Parse error: {e}")

    passed = actual == expected
    return AssertionResult(
        passed=passed,
        name="json_match",
        detail="JSON matches" if passed else "JSON does not match",
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def schema_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Validate the last AI response (parsed as JSON) against a JSON schema.

    Args:
        agent_output: Agent run whose last AI message is parsed as JSON.
        config: Either the schema itself as a dict, or a value that is
            stringified and treated as the path of a JSON schema file.

    Returns:
        A passing result when validation succeeds. Failing results are
        returned (not raised) when the response violates the schema, when the
        schema itself is invalid, when the response or schema file is not
        valid JSON, or when the schema file cannot be read.
    """
    result_name = "response_matches_schema"
    text = _last_ai_content(agent_output)
    try:
        # If config is already a dict, use it directly; otherwise treat as file path
        if isinstance(config, dict):
            schema = config
        else:
            schema_path = Path(str(config))
            # Explicit encoding so the result does not depend on the locale.
            schema = json.loads(schema_path.read_text(encoding="utf-8"))
        data = json.loads(text)
        js.validate(instance=data, schema=schema)
    except js.ValidationError as e:
        return AssertionResult(
            passed=False,
            name=result_name,
            detail=f"Schema validation failed: {e.message}",
        )
    except js.SchemaError as e:
        # The schema is malformed: a configuration problem, reported as a
        # failed assertion rather than an unhandled exception.
        return AssertionResult(
            passed=False,
            name=result_name,
            detail=f"Invalid schema: {e.message}",
        )
    except (json.JSONDecodeError, FileNotFoundError, OSError) as e:
        return AssertionResult(
            passed=False,
            name=result_name,
            detail=f"Error: {e}",
        )
    return AssertionResult(passed=True, name=result_name, detail="Valid")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def tool_called(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Assert that the tool named by ``config`` appears among the tool calls.

    On failure the detail lists every tool name that was called.
    """
    wanted = str(config)
    invoked = _all_tool_names(agent_output)
    label = f"tool_called:{wanted}"
    if wanted in invoked:
        return AssertionResult(passed=True, name=label, detail="Found")
    return AssertionResult(passed=False, name=label, detail=f"Tools called: {invoked}")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def tool_sequence(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check that the expected tools were called in the given order.

    The expected names must appear among the called tools as a subsequence:
    in order, but not necessarily contiguously.

    Args:
        agent_output: Agent run whose tool calls are inspected.
        config: A list of steps. Each step is either a dict with a ``"tool"``
            key or a plain tool-name string.

    Returns:
        A passing result when the subsequence is found. A failing result is
        returned when ``config`` is not a list or contains a step that names
        no tool. Such steps used to be dropped silently, which let a list of
        plain strings pass vacuously, and a dict without ``"tool"`` raised
        ``KeyError``.
    """
    if not isinstance(config, list):
        return AssertionResult(passed=False, name="tool_sequence", detail="Config must be a list")

    expected_names: list[str] = []
    for step in config:
        if isinstance(step, str):
            expected_names.append(step)
        elif isinstance(step, dict) and "tool" in step:
            expected_names.append(step["tool"])
        else:
            return AssertionResult(
                passed=False,
                name="tool_sequence",
                detail=f"Invalid step (expected a tool name or a dict with a 'tool' key): {step!r}",
            )

    called = _all_tool_names(agent_output)

    # Subsequence test: ``name in remaining`` consumes the iterator up to and
    # including the first hit, so each expected name must occur after the
    # previous one.
    remaining = iter(called)
    passed = all(name in remaining for name in expected_names)

    return AssertionResult(
        passed=passed,
        name="tool_sequence",
        detail=f"Expected: {expected_names}, Called: {called}",
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def no_extra_tools(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Assert that every called tool belongs to the allowed set.

    ``config`` is either a list of allowed names or a single value that is
    stringified into a one-element allow-list.
    """
    if isinstance(config, list):
        permitted = set(config)
    else:
        permitted = {str(config)}

    invoked = set(_all_tool_names(agent_output))
    extra = invoked - permitted
    if extra:
        return AssertionResult(
            passed=False,
            name="no_extra_tools",
            detail=f"Extra tools: {extra}",
        )
    return AssertionResult(passed=True, name="no_extra_tools", detail="No extra tools")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def tools_unordered(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Assert that every expected tool was called, regardless of order.

    ``config`` is either a list of expected names or a single value that is
    stringified into a one-element expectation.
    """
    if isinstance(config, list):
        wanted = set(config)
    else:
        wanted = {str(config)}

    invoked = set(_all_tool_names(agent_output))
    missing = wanted - invoked
    if missing:
        return AssertionResult(
            passed=False,
            name="tools_unordered",
            detail=f"Missing: {missing}",
        )
    return AssertionResult(passed=True, name="tools_unordered", detail="All tools called")
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# ABOUTME: Golden match assertion for comparing agent structured_response against reference files.
|
|
2
|
+
# ABOUTME: Supports exact diff mode and structural-only comparison with field ignoring.
|
|
3
|
+
"""Golden match assertion: compare agent output against reference files."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class JsonDiff:
    """Accumulated differences between a reference and an actual JSON value."""

    # Entries present in the reference but absent from the actual value.
    missing: list[str] = field(default_factory=list)
    # Entries present in the actual value but absent from the reference.
    unexpected: list[str] = field(default_factory=list)
    # Entries present on both sides with differing values.
    changed: list[str] = field(default_factory=list)

    @property
    def has_differences(self) -> bool:
        """True when at least one of the three buckets is non-empty."""
        return any((self.missing, self.unexpected, self.changed))

    def to_detail(self) -> str:
        """Render all entries, one per line: missing, then unexpected, then changed."""
        rendered = (
            [f"Missing from actual: {entry}" for entry in self.missing]
            + [f"Unexpected in actual: {entry}" for entry in self.unexpected]
            + [f"Changed: {entry}" for entry in self.changed]
        )
        return "\n".join(rendered)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _diff_json(reference: Any, actual: Any, path: str = "") -> JsonDiff:
    """Recursively diff two JSON-like values.

    Two dicts are compared key by key and two lists index by index; any other
    pair is compared with ``!=`` and recorded as a change when it differs.
    ``path`` is the dotted/indexed location used to label each entry.
    """
    result = JsonDiff()

    def _absorb(nested: JsonDiff) -> None:
        # Fold a child diff into the one being built for this level.
        result.missing.extend(nested.missing)
        result.unexpected.extend(nested.unexpected)
        result.changed.extend(nested.changed)

    def _child(key: Any) -> Any:
        return f"{path}.{key}" if path else key

    both_dicts = isinstance(reference, dict) and isinstance(actual, dict)
    both_lists = isinstance(reference, list) and isinstance(actual, list)

    if both_dicts:
        for key, ref_value in reference.items():
            where = _child(key)
            if key in actual:
                _absorb(_diff_json(ref_value, actual[key], where))
            else:
                result.missing.append(f"{where}: {json.dumps(ref_value)}")
        for key, act_value in actual.items():
            if key not in reference:
                result.unexpected.append(f"{_child(key)}: {json.dumps(act_value)}")

    elif both_lists:
        ref_len, act_len = len(reference), len(actual)
        for index in range(max(ref_len, act_len)):
            where = f"{path}[{index}]"
            if index >= act_len:
                result.missing.append(f"{where}: {json.dumps(reference[index])}")
            elif index >= ref_len:
                result.unexpected.append(f"{where}: {json.dumps(actual[index])}")
            else:
                _absorb(_diff_json(reference[index], actual[index], where))

    elif reference != actual:
        result.changed.append(f"{path}: {json.dumps(reference)} → {json.dumps(actual)}")

    return result
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _deep_structural_compare(
    reference: Any,
    actual: Any,
    path: str = "",
    ignore_fields: list[str] | None = None,
) -> list[str]:
    """Compare shape only: same keys, same array lengths, same leaf types.

    Leaf values themselves are never compared, only their types. Dict keys
    listed in ``ignore_fields`` are skipped on both sides at every depth.
    Returns a list of human-readable issues; an empty list means the
    structures match.
    """
    skipped = set(ignore_fields or [])

    if isinstance(reference, dict) and isinstance(actual, dict):
        problems: list[str] = []
        for key, ref_value in reference.items():
            if key in skipped:
                continue
            where = f"{path}.{key}" if path else key
            if key in actual:
                problems += _deep_structural_compare(ref_value, actual[key], where, ignore_fields)
            else:
                problems.append(f"Missing key: {where}")
        for key in actual:
            if key in skipped or key in reference:
                continue
            where = f"{path}.{key}" if path else key
            problems.append(f"Unexpected key: {where}")
        return problems

    if isinstance(reference, list) and isinstance(actual, list):
        problems = []
        if len(reference) != len(actual):
            problems.append(
                f"Array length mismatch at {path or 'root'}: "
                f"expected {len(reference)}, got {len(actual)}"
            )
        # zip stops at the shorter list, so only the shared prefix is compared.
        for index, (ref_item, act_item) in enumerate(zip(reference, actual)):
            problems += _deep_structural_compare(
                ref_item, act_item, f"{path}[{index}]", ignore_fields
            )
        return problems

    if type(reference) is not type(actual):
        return [
            f"Type mismatch at {path or 'root'}: "
            f"expected {type(reference).__name__}, got {type(actual).__name__}"
        ]
    return []
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def golden_match(output: AgentOutput, config: Any) -> AssertionResult:
    """Compare ``output.structured_response`` against a golden reference file.

    Args:
        output: Agent run whose ``structured_response`` is compared.
        config: Dict with a required ``"reference"`` path to a JSON file, an
            optional ``"mode"`` (``"exact"``, the default, or ``"structural"``)
            and an optional ``"ignore_fields"`` list that is used in
            structural mode only.

    Returns:
        A passing result when the response matches the reference in the chosen
        mode. Failing results are returned (not raised) for a malformed
        config, a missing, unreadable or non-JSON reference file, any
        detected difference, and an unknown mode.
    """
    if not isinstance(config, dict) or "reference" not in config:
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail="Config must be a dict with a 'reference' path",
        )

    ref_path = Path(config["reference"])
    mode: str = config.get("mode", "exact")
    ignore_fields: list[str] = config.get("ignore_fields", [])

    # Read first and handle failure, instead of checking existence up front:
    # this also covers unreadable paths and corrupt reference files.
    try:
        reference = json.loads(ref_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail=f"Reference file not found: {ref_path}. Run with --update-golden to create.",
        )
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail=f"Could not load reference file {ref_path}: {exc}",
        )

    actual = output.structured_response

    if mode == "exact":
        diff = _diff_json(reference, actual)
        if diff.has_differences:
            return AssertionResult(
                passed=False,
                name="golden_match",
                detail=f"Reference: {ref_path}\n\n{diff.to_detail()}",
            )
        return AssertionResult(passed=True, name="golden_match", detail="Exact match")

    if mode == "structural":
        issues = _deep_structural_compare(reference, actual, ignore_fields=ignore_fields)
        if issues:
            return AssertionResult(
                passed=False,
                name="golden_match",
                detail="Structural mismatch:\n" + "\n".join(f"  {i}" for i in issues),
            )
        return AssertionResult(passed=True, name="golden_match", detail="Structural match")

    return AssertionResult(
        passed=False,
        name="golden_match",
        detail=f"Unknown mode: {mode}. Use 'exact' or 'structural'.",
    )
|
autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/llm_judge.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# ABOUTME: LLM-as-judge assertion functions wrapping OpenEvals.
|
|
2
|
+
# ABOUTME: Evaluates free-text agent responses against criteria using an LLM judge.
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from openevals.llm import create_llm_as_judge # pyright: ignore[reportMissingImports]
|
|
10
|
+
|
|
11
|
+
from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Default judge model — cheap and fast for evaluation
|
|
16
|
+
_DEFAULT_JUDGE_MODEL = "google_genai/gemini-2.0-flash"
|
|
17
|
+
|
|
18
|
+
_LLM_JUDGE_PROMPT = """You are evaluating an AI agent's response.
|
|
19
|
+
|
|
20
|
+
Criteria: {criteria}
|
|
21
|
+
|
|
22
|
+
Agent response:
|
|
23
|
+
{outputs}
|
|
24
|
+
|
|
25
|
+
Rate how well the response meets the criteria on a scale from 0.0 to 1.0."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _last_ai_content(agent_output: AgentOutput) -> str:
    """Return the text of the most recent AI message that carries any.

    Messages are scanned newest-first. A message counts as an AI message when
    it exposes ``type == "ai"``. String content is returned unchanged.

    List content is assumed to follow the LangChain content-block convention
    (plain strings and ``{"type": "text", "text": ...}`` dicts) -- TODO confirm
    against the providers in use. Its text parts are concatenated rather than
    returning the Python ``repr`` of the list, which would otherwise be what
    the judge prompt receives. A list without any text part is skipped and the
    scan continues with earlier messages.

    Args:
        agent_output: Object exposing a ``messages`` sequence.

    Returns:
        The extracted text, or ``""`` when no AI message carries text.
    """
    for msg in reversed(agent_output.messages):
        if getattr(msg, "type", None) != "ai":
            continue
        content = msg.content
        if not content:
            continue
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: list[str] = []
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and block.get("type") == "text":
                    parts.append(str(block.get("text", "")))
            text = "".join(parts)
            if text:
                return text
            # Only non-text blocks (e.g. tool calls): keep looking further back.
            continue
        # Unknown content shape: fall back to the previous str() behaviour.
        return str(content)
    return ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def llm_judge(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Evaluate the agent's last AI message against criteria using an LLM judge.

    Config can be:
    - str: criteria string (threshold defaults to 0.5, default judge model)
    - dict: {"criteria": str, "threshold": float, "model": str (optional)}

    Returns:
        An AssertionResult named "llm_judge". It passes when the judge's score
        is greater than or equal to the threshold. An unsupported config type,
        missing criteria, a non-numeric threshold, or any exception raised
        while judging produces ``passed=False`` with the reason in ``detail``.
    """
    if isinstance(config, str):
        criteria = config
        threshold = 0.5
        model = _DEFAULT_JUDGE_MODEL
    elif isinstance(config, dict):
        criteria = config.get("criteria", "")
        threshold = config.get("threshold", 0.5)
        model = config.get("model", _DEFAULT_JUDGE_MODEL)
    else:
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail=f"Invalid config type: {type(config).__name__}",
        )

    if not criteria:
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail="No criteria specified",
        )

    # Validate the threshold before the judge is invoked. Otherwise a bad value
    # (for example a non-numeric string) only surfaces after the LLM round-trip
    # and is reported as a misleading "Judge error: TypeError".
    try:
        min_score = float(threshold)
    except (TypeError, ValueError, OverflowError):
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail=f"Invalid threshold: {threshold!r}",
        )

    agent_text = _last_ai_content(agent_output)

    try:
        evaluator = create_llm_as_judge(
            prompt=_LLM_JUDGE_PROMPT,
            model=model,
            continuous=True,
            feedback_key="score",
        )

        result = evaluator(
            outputs=agent_text,
            criteria=criteria,
        )

        # The evaluator may hand back a list of results; use the first entry then.
        first = result[0] if isinstance(result, list) else result
        # Results are read either as a mapping or as an attribute-style object.
        if isinstance(first, dict):
            score = float(first["score"])
            comment = first.get("comment") or ""
        else:
            score = float(first.score)
            comment = first.comment or ""

        return AssertionResult(
            passed=score >= min_score,
            name="llm_judge",
            detail=f"Score: {score:.2f} (threshold: {threshold}). {comment}",
        )

    except Exception as e:
        # Deliberate best-effort boundary: a judge failure becomes a failed
        # assertion instead of propagating to the caller.
        logger.warning("LLM judge failed: %s", e)
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail=f"Judge error: {type(e).__name__}: {e}",
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
_TRAJECTORY_PROMPT = """You are evaluating an AI agent's tool usage trajectory.
|
|
102
|
+
|
|
103
|
+
Criteria: {criteria}
|
|
104
|
+
|
|
105
|
+
Full conversation (messages and tool calls):
|
|
106
|
+
{outputs}
|
|
107
|
+
|
|
108
|
+
Rate how well the agent's tool usage meets the criteria on a scale from 0.0 to 1.0.
|
|
109
|
+
Consider: Were tools used efficiently? Were there redundant calls? Was the sequence logical?"""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _format_trajectory(agent_output: AgentOutput) -> str:
    """Format the full message history, including tool calls, for the judge.

    Each message in ``agent_output.messages`` is rendered according to its
    ``type`` attribute:

    - "human": ``[User]: <content>``
    - "ai": ``[Agent]: <content>``, followed by one ``-> Tool call: name(args)``
      line per entry in ``tool_calls`` (dict entries, or objects exposing a
      ``name`` attribute)
    - "tool": ``[Tool result (<name>)]: <content>``, with the content limited
      to 200 characters and "..." appended only when it was actually cut

    Messages of any other type are omitted. Lines are joined with newlines.
    """
    max_result_chars = 200
    lines: list[str] = []
    for msg in agent_output.messages:
        msg_type = getattr(msg, "type", "unknown")
        # Read content defensively, the same way ``type`` is read, so a message
        # object without a ``content`` attribute does not abort the formatting.
        raw_content = getattr(msg, "content", None)
        content = str(raw_content) if raw_content else ""

        if msg_type == "human":
            lines.append(f"[User]: {content}")
        elif msg_type == "ai":
            lines.append(f"[Agent]: {content}")
            for tc in getattr(msg, "tool_calls", None) or ():
                if isinstance(tc, dict):
                    lines.append(f" -> Tool call: {tc.get('name', '?')}({tc.get('args', {})})")
                elif hasattr(tc, "name"):
                    lines.append(f" -> Tool call: {tc.name}({getattr(tc, 'args', {})})")
        elif msg_type == "tool":
            tool_name = getattr(msg, "name", "?")
            # Mark the result as truncated only when something was cut, so the
            # judge is not told that short, complete results are incomplete.
            if len(content) > max_result_chars:
                shown = content[:max_result_chars] + "..."
            else:
                shown = content
            lines.append(f"[Tool result ({tool_name})]: {shown}")
    return "\n".join(lines)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def trajectory_quality(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Evaluate the quality of an agent's tool usage trajectory with an LLM judge.

    Config can be:
    - str: criteria string (threshold defaults to 0.5, default judge model)
    - dict: {"criteria": str, "threshold": float (default 0.5), "model": str (optional)}

    Returns:
        An AssertionResult named "trajectory_quality". It passes when the
        judge's score is greater than or equal to the threshold. An unsupported
        config type, missing criteria, a non-numeric threshold, or any exception
        raised while judging produces ``passed=False`` with the reason in
        ``detail``.
    """
    if isinstance(config, str):
        criteria = config
        threshold = 0.5
        model = _DEFAULT_JUDGE_MODEL
    elif isinstance(config, dict):
        criteria = config.get("criteria", "")
        threshold = config.get("threshold", 0.5)
        model = config.get("model", _DEFAULT_JUDGE_MODEL)
    else:
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail=f"Invalid config type: {type(config).__name__}",
        )

    if not criteria:
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail="No criteria specified",
        )

    # Validate the threshold before the judge is invoked. Otherwise a bad value
    # (for example a non-numeric string) only surfaces after the LLM round-trip
    # and is reported as a misleading "Judge error: TypeError".
    try:
        min_score = float(threshold)
    except (TypeError, ValueError, OverflowError):
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail=f"Invalid threshold: {threshold!r}",
        )

    trajectory_text = _format_trajectory(agent_output)

    try:
        evaluator = create_llm_as_judge(
            prompt=_TRAJECTORY_PROMPT,
            model=model,
            continuous=True,
            feedback_key="score",
        )

        result = evaluator(
            outputs=trajectory_text,
            criteria=criteria,
        )

        # The evaluator may hand back a list of results; use the first entry then.
        first = result[0] if isinstance(result, list) else result
        # Results are read either as a mapping or as an attribute-style object.
        if isinstance(first, dict):
            score = float(first["score"])
            comment = first.get("comment") or ""
        else:
            score = float(first.score)
            comment = first.comment or ""

        return AssertionResult(
            passed=score >= min_score,
            name="trajectory_quality",
            detail=f"Score: {score:.2f} (threshold: {threshold}). {comment}",
        )

    except Exception as e:
        # Deliberate best-effort boundary: a judge failure becomes a failed
        # assertion instead of propagating to the caller.
        logger.warning("Trajectory quality judge failed: %s", e)
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail=f"Judge error: {type(e).__name__}: {e}",
        )
|