autobots-devtools-shared-lib 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/PKG-INFO +3 -1
  2. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/pyproject.toml +8 -2
  3. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/utils/schema_directive_resolver.py +2 -1
  4. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/__init__.py +33 -0
  5. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/__init__.py +3 -0
  6. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/deterministic.py +174 -0
  7. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/golden.py +161 -0
  8. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/llm_judge.py +196 -0
  9. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/assertions/registry.py +77 -0
  10. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/__init__.py +3 -0
  11. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/cost_tracker.py +139 -0
  12. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/loader.py +59 -0
  13. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/runner.py +161 -0
  14. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/core/workspace.py +51 -0
  15. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/__init__.py +3 -0
  16. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/eval_case.py +96 -0
  17. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/models/result.py +104 -0
  18. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/__init__.py +3 -0
  19. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/fixtures.py +111 -0
  20. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/plugin.py +90 -0
  21. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/pytest_plugin/reporting.py +68 -0
  22. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/scoring/__init__.py +3 -0
  23. autobots_devtools_shared_lib-0.6.0/src/autobots_devtools_shared_lib/eval/scoring/langfuse_scorer.py +60 -0
  24. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/README.md +0 -0
  25. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/__init__.py +0 -0
  26. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/__init__.py +0 -0
  27. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/__init__.py +0 -0
  28. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_config.py +0 -0
  29. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_constants.py +0 -0
  30. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/config/jenkins_loader.py +0 -0
  31. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/__init__.py +0 -0
  32. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/logging_utils.py +0 -0
  33. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/otel_fastapi.py +0 -0
  34. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/trace_metadata.py +0 -0
  35. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/trace_propagation.py +0 -0
  36. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/observability/tracing.py +0 -0
  37. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/__init__.py +0 -0
  38. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/README.md +0 -0
  39. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/__init__.py +0 -0
  40. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/app.py +0 -0
  41. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/config.py +0 -0
  42. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/models.py +0 -0
  43. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/__init__.py +0 -0
  44. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/README.md +0 -0
  45. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/__init__.py +0 -0
  46. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/cache_backed.py +0 -0
  47. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/db_repository.py +0 -0
  48. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/factory.py +0 -0
  49. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/in_memory.py +0 -0
  50. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/redis_store.py +0 -0
  51. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/services/context/store.py +0 -0
  52. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/__init__.py +0 -0
  53. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/context_tools.py +0 -0
  54. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/format_tools.py +0 -0
  55. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/fserver_client_tools.py +0 -0
  56. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_builtin_tools.py +0 -0
  57. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_pipeline_tools.py +0 -0
  58. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/__init__.py +0 -0
  59. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/context_utils.py +0 -0
  60. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/format_utils.py +0 -0
  61. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/fserver_client_utils.py +0 -0
  62. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_builtin_utils.py +0 -0
  63. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_http_utils.py +0 -0
  64. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_pipeline_utils.py +0 -0
  65. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/__init__.py +0 -0
  66. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/__init__.py +0 -0
  67. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_config_utils.py +0 -0
  68. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_meta.py +0 -0
  69. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/base_agent.py +0 -0
  70. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/batch.py +0 -0
  71. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/invocation_utils.py +0 -0
  72. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/agents/middleware.py +0 -0
  73. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/config/__init__.py +0 -0
  74. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/config/dynagent_settings.py +0 -0
  75. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/llm/__init__.py +0 -0
  76. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/llm/llm.py +0 -0
  77. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/models/__init__.py +0 -0
  78. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/models/state.py +0 -0
  79. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/services/__init__.py +0 -0
  80. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/services/structured_converter.py +0 -0
  81. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/__init__.py +0 -0
  82. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/state_tools.py +0 -0
  83. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/tools/tool_registry.py +0 -0
  84. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/__init__.py +0 -0
  85. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/default_ui.py +0 -0
  86. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/ui/ui_utils.py +0 -0
  87. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/dynagent/utils/__init__.py +0 -0
  88. {autobots_devtools_shared_lib-0.5.2 → autobots_devtools_shared_lib-0.6.0}/src/autobots_devtools_shared_lib/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: autobots-devtools-shared-lib
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: Shared library functions to be used for all autobots projects
5
5
  License: MIT
6
6
  Author: Pralhad
@@ -18,6 +18,7 @@ Requires-Dist: langchain (>=1.0.0)
18
18
  Requires-Dist: langchain-anthropic (>=1.4.0)
19
19
  Requires-Dist: langchain-google-genai (>=4.2.0)
20
20
  Requires-Dist: langfuse (>=3.12.1)
21
+ Requires-Dist: openevals (>=0.1.0)
21
22
  Requires-Dist: opentelemetry-api (>=1.30.0,<2.0.0)
22
23
  Requires-Dist: opentelemetry-exporter-otlp-proto-http (>=1.30.0,<2.0.0)
23
24
  Requires-Dist: opentelemetry-instrumentation-fastapi (>=0.49b0)
@@ -26,6 +27,7 @@ Requires-Dist: pydantic-settings (>=2.10.1)
26
27
  Requires-Dist: python-dotenv (>=1.1.1)
27
28
  Requires-Dist: pyyaml (>=6.0.3)
28
29
  Requires-Dist: referencing (>=0.37.0)
30
+ Requires-Dist: tiktoken (>=0.7.0)
29
31
  Requires-Dist: uvicorn[standard] (>=0.32.0)
30
32
  Description-Content-Type: text/markdown
31
33
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "autobots-devtools-shared-lib"
3
- version = "0.5.2"
3
+ version = "0.6.0"
4
4
  description = "Shared library functions to be used for all autobots projects"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -25,6 +25,8 @@ dependencies = [
25
25
  "opentelemetry-sdk>=1.30.0,<2.0.0",
26
26
  "opentelemetry-exporter-otlp-proto-http>=1.30.0,<2.0.0",
27
27
  "opentelemetry-instrumentation-fastapi>=0.49b0",
28
+ "openevals>=0.1.0",
29
+ "tiktoken>=0.7.0",
28
30
  ]
29
31
  dev = [
30
32
  "pre-commit>=4.5.1",
@@ -33,6 +35,7 @@ dev = [
33
35
  "pytest-asyncio>=1.3.0",
34
36
  "pytest-cov>=7.0.0",
35
37
  "ruff>=0.14.14",
38
+ "pytest-xdist>=3.0.0",
36
39
  ]
37
40
 
38
41
  [tool.poetry]
@@ -44,6 +47,9 @@ packages = [
44
47
  requires = ["poetry-core"]
45
48
  build-backend = "poetry.core.masonry.api"
46
49
 
50
+ [project.entry-points."pytest11"]
51
+ dynagent_eval = "autobots_devtools_shared_lib.eval.pytest_plugin.plugin"
52
+
47
53
  [tool.ruff]
48
54
  line-length = 100
49
55
  target-version = "py312"
@@ -106,7 +112,7 @@ markers = [
106
112
  pythonVersion = "3.12"
107
113
  typeCheckingMode = "basic"
108
114
  include = ["src", "tests"]
109
- exclude = ["**/__pycache__", ".venv"]
115
+ exclude = ["**/__pycache__", ".venv", "**/eval/core/cost_tracker.py", "**/eval/assertions/llm_judge.py"]
110
116
  venvPath = ".."
111
117
  venv = ".venv"
112
118
  extraPaths = ["src"]
@@ -91,9 +91,10 @@ def _retrieve_from_path(base_dir: Path):
91
91
 
92
92
  def _retrieve(uri: str) -> Resource:
93
93
  from urllib.parse import unquote, urlparse
94
+ from urllib.request import url2pathname
94
95
 
95
96
  parsed = urlparse(uri)
96
- path = Path(unquote(parsed.path))
97
+ path = Path(url2pathname(unquote(parsed.path)))
97
98
  if not path.is_absolute():
98
99
  path = base_dir / path
99
100
  contents = json.loads(path.read_text())
@@ -0,0 +1,33 @@
1
+ # ABOUTME: Public API for the dynagent eval framework.
2
+ # ABOUTME: Import from here for a stable surface — loader, models, and result types.
3
+
4
+ from autobots_devtools_shared_lib.eval.assertions.registry import register_assertion
5
+ from autobots_devtools_shared_lib.eval.core.loader import EvalConfigError, load_eval_cases
6
+ from autobots_devtools_shared_lib.eval.models.eval_case import (
7
+ EvalCase,
8
+ SetupConfig,
9
+ WorkspaceFile,
10
+ )
11
+ from autobots_devtools_shared_lib.eval.models.result import (
12
+ AgentOutput,
13
+ AssertionResult,
14
+ CostDelta,
15
+ EvalCostSnapshot,
16
+ EvalResult,
17
+ TurnResult,
18
+ )
19
+
20
+ __all__ = [
21
+ "AgentOutput",
22
+ "AssertionResult",
23
+ "CostDelta",
24
+ "EvalCase",
25
+ "EvalConfigError",
26
+ "EvalCostSnapshot",
27
+ "EvalResult",
28
+ "SetupConfig",
29
+ "TurnResult",
30
+ "WorkspaceFile",
31
+ "load_eval_cases",
32
+ "register_assertion",
33
+ ]
@@ -0,0 +1,3 @@
1
+ # ABOUTME: Package init for the eval assertions module.
2
+ # ABOUTME: Contains assertion registry and evaluator functions.
3
+ from __future__ import annotations
@@ -0,0 +1,174 @@
1
+ # ABOUTME: Deterministic assertion functions wrapping OpenEvals and built-in checks.
2
+ # ABOUTME: Each function takes AgentOutput + config and returns AssertionResult.
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import re
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import jsonschema as js
12
+
13
+ from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
14
+
15
+
16
+ def _last_ai_content(agent_output: AgentOutput) -> str:
17
+ """Extract text content from the last AI message."""
18
+ for msg in reversed(agent_output.messages):
19
+ if hasattr(msg, "type") and msg.type == "ai" and msg.content:
20
+ return str(msg.content)
21
+ return ""
22
+
23
+
24
+ def _all_tool_names(agent_output: AgentOutput) -> list[str]:
25
+ """Extract all tool names called across all messages."""
26
+ names: list[str] = []
27
+ for msg in agent_output.messages:
28
+ tool_calls = getattr(msg, "tool_calls", None)
29
+ if tool_calls:
30
+ for tc in tool_calls:
31
+ if isinstance(tc, dict):
32
+ names.append(tc.get("name", ""))
33
+ elif hasattr(tc, "name"):
34
+ names.append(tc.name)
35
+ return names
36
+
37
+
38
def contains(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Pass when ``str(config)`` occurs, ignoring case, in the last AI message."""
    haystack = _last_ai_content(agent_output).lower()
    needle = str(config).lower()
    is_present = needle in haystack
    outcome = "Found" if is_present else "Not found"
    return AssertionResult(
        passed=is_present,
        name=f"contains:{config}",
        detail=f"{outcome} in response",
    )
48
+
49
+
50
def regex(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check whether the last AI message matches a regular expression.

    Args:
        agent_output: Agent run whose last AI message is searched.
        config: The pattern; it is converted with ``str()`` and applied with
            ``re.search`` (a match anywhere in the text counts).

    Returns:
        A passing result when the pattern matches. A failing result when it does
        not match or when the pattern itself is not a valid regular expression.
    """
    text = _last_ai_content(agent_output)
    pattern = str(config)
    try:
        match = re.search(pattern, text) is not None
    except re.error as e:
        # A malformed pattern is a bad eval case, not a crash: report it as a
        # failed assertion, the same way the JSON assertions report parse errors.
        return AssertionResult(
            passed=False,
            name=f"regex:{pattern}",
            detail=f"Invalid regex pattern: {e}",
        )
    return AssertionResult(
        passed=match,
        name=f"regex:{pattern}",
        detail=f"{'Matched' if match else 'No match'} for pattern",
    )
60
+
61
+
62
def exact_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Pass when the last AI message equals ``str(config)`` once both are stripped.

    Leading and trailing whitespace is ignored on both sides. The reported detail
    shows at most the first 100 characters of the expected text.
    """
    actual = _last_ai_content(agent_output)
    wanted = str(config)
    is_equal = actual.strip() == wanted.strip()
    preview = wanted[:100]
    return AssertionResult(passed=is_equal, name="exact_match", detail=f"Expected: {preview}")
72
+
73
+
74
def json_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check whether the last AI message, parsed as JSON, equals the expected value.

    Args:
        agent_output: Agent run whose last AI message is parsed with ``json.loads``.
        config: The expected value. A string is parsed as JSON text; any other
            value (dict, list, number, bool, ``None``) is compared as-is.

    Returns:
        A passing result when the parsed response equals the expected value, a
        failing result when it differs or when either side cannot be parsed.
    """
    text = _last_ai_content(agent_output)
    try:
        actual = json.loads(text)
        # Only strings need decoding. Stringifying an already-decoded value would
        # produce a Python repr (``['a']``, ``True``, ``None``) that is not JSON.
        expected = json.loads(config) if isinstance(config, str) else config
    except (json.JSONDecodeError, TypeError) as e:
        return AssertionResult(passed=False, name="json_match", detail=f"Parse error: {e}")

    passed = actual == expected
    return AssertionResult(
        passed=passed,
        name="json_match",
        detail="JSON matches" if passed else "JSON does not match",
    )
88
+
89
+
90
def schema_match(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Validate the last AI message, parsed as JSON, against a JSON schema.

    Args:
        agent_output: Agent run whose last AI message is parsed and validated.
        config: Either the schema itself (a dict) or a path to a JSON file
            containing the schema.

    Returns:
        A passing result when the response validates. A failing result when
        validation fails, the schema itself is invalid, the schema file cannot
        be read, or either JSON document cannot be parsed.
    """
    text = _last_ai_content(agent_output)
    try:
        # If config is already a dict, use it directly; otherwise treat as file path
        if isinstance(config, dict):
            schema = config
        else:
            schema_path = Path(str(config))
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default encoding.
            schema = json.loads(schema_path.read_text(encoding="utf-8"))
        data = json.loads(text)
        js.validate(instance=data, schema=schema)
        return AssertionResult(passed=True, name="response_matches_schema", detail="Valid")
    except js.ValidationError as e:
        return AssertionResult(
            passed=False,
            name="response_matches_schema",
            detail=f"Schema validation failed: {e.message}",
        )
    except js.SchemaError as e:
        # The schema is malformed: a problem with the eval case, reported as a
        # failed assertion rather than an unhandled exception.
        return AssertionResult(
            passed=False,
            name="response_matches_schema",
            detail=f"Invalid schema: {e.message}",
        )
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        return AssertionResult(
            passed=False,
            name="response_matches_schema",
            detail=f"Error: {e}",
        )
115
+
116
+
117
def tool_called(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Pass when the tool named ``str(config)`` appears among the recorded tool calls.

    On failure the detail lists every tool name that was called.
    """
    wanted = str(config)
    invoked = _all_tool_names(agent_output)
    if wanted in invoked:
        was_called, detail = True, "Found"
    else:
        was_called, detail = False, f"Tools called: {invoked}"
    return AssertionResult(passed=was_called, name=f"tool_called:{wanted}", detail=detail)
127
+
128
+
129
def tool_sequence(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Check that the expected tools were called in the given order.

    The expected names must appear among the called tools as a subsequence: in
    order, but not necessarily back to back.

    Args:
        agent_output: Agent run whose tool calls are inspected.
        config: A list of steps. Each step is either a dict with a ``"tool"``
            key or a plain tool-name string.

    Returns:
        A passing result when every expected tool is found in order. A failing
        result otherwise, or when ``config`` is not a list or contains a step
        that does not name a tool.
    """
    if not isinstance(config, list):
        return AssertionResult(passed=False, name="tool_sequence", detail="Config must be a list")

    expected_names: list[str] = []
    for step in config:
        if isinstance(step, dict) and "tool" in step:
            expected_names.append(step["tool"])
        elif isinstance(step, str):
            # Accept bare names too; dropping them would leave nothing to check
            # and make the assertion pass vacuously.
            expected_names.append(step)
        else:
            return AssertionResult(
                passed=False,
                name="tool_sequence",
                detail=f"Invalid step (expected a tool name or a dict with 'tool'): {step!r}",
            )

    called = _all_tool_names(agent_output)

    # Subsequence check: each membership test consumes the shared iterator up to
    # and including the match, so later names can only match later calls.
    remaining = iter(called)
    passed = all(name in remaining for name in expected_names)

    return AssertionResult(
        passed=passed,
        name="tool_sequence",
        detail=f"Expected: {expected_names}, Called: {called}",
    )
149
+
150
+
151
def no_extra_tools(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Pass when every called tool belongs to the allowed set.

    ``config`` is the allowed set: a list of names, or a single value that is
    converted with ``str()``. On failure the detail lists the offending tools.
    """
    if isinstance(config, list):
        permitted = set(config)
    else:
        permitted = {str(config)}
    disallowed = set(_all_tool_names(agent_output)).difference(permitted)
    detail = f"Extra tools: {disallowed}" if disallowed else "No extra tools"
    return AssertionResult(passed=not disallowed, name="no_extra_tools", detail=detail)
162
+
163
+
164
def tools_unordered(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Pass when every expected tool was called at least once, in any order.

    ``config`` is the expected set: a list of names, or a single value that is
    converted with ``str()``. On failure the detail lists the tools never called.
    """
    if isinstance(config, list):
        required = set(config)
    else:
        required = {str(config)}
    not_called = required.difference(_all_tool_names(agent_output))
    detail = f"Missing: {not_called}" if not_called else "All tools called"
    return AssertionResult(passed=not not_called, name="tools_unordered", detail=detail)
@@ -0,0 +1,161 @@
1
+ # ABOUTME: Golden match assertion for comparing agent structured_response against reference files.
2
+ # ABOUTME: Supports exact diff mode and structural-only comparison with field ignoring.
3
+ """Golden match assertion: compare agent output against reference files."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
13
+
14
+
15
@dataclass
class JsonDiff:
    """Differences found between a reference JSON value and an actual one.

    Each list holds human-readable entries: ``missing`` for items present only in
    the reference, ``unexpected`` for items present only in the actual value, and
    ``changed`` for values that differ.
    """

    missing: list[str] = field(default_factory=list)
    unexpected: list[str] = field(default_factory=list)
    changed: list[str] = field(default_factory=list)

    @property
    def has_differences(self) -> bool:
        """True when at least one of the three lists is non-empty."""
        return any((self.missing, self.unexpected, self.changed))

    def to_detail(self) -> str:
        """Render every entry on its own line: missing, then unexpected, then changed."""
        sections = (
            ("Missing from actual", self.missing),
            ("Unexpected in actual", self.unexpected),
            ("Changed", self.changed),
        )
        rendered: list[str] = []
        for label, entries in sections:
            rendered.extend(f"{label}: {entry}" for entry in entries)
        return "\n".join(rendered)
32
+
33
+
34
def _diff_json(reference: Any, actual: Any, path: str = "") -> JsonDiff:
    """Recursively diff two JSON-like structures.

    Dicts are compared key by key and lists index by index; anything else is
    compared as a leaf value. Paths use ``a.b`` for dict keys and ``a[0]`` for
    list indices.

    Args:
        reference: The expected value.
        actual: The value being checked.
        path: Path of the current node, empty at the root.

    Returns:
        A ``JsonDiff`` listing items missing from ``actual``, items unexpected in
        ``actual``, and leaf values that changed.
    """
    diff = JsonDiff()

    def _absorb(child: JsonDiff) -> None:
        # Fold a sub-diff into the diff of the current node.
        diff.missing.extend(child.missing)
        diff.unexpected.extend(child.unexpected)
        diff.changed.extend(child.changed)

    if isinstance(reference, dict) and isinstance(actual, dict):
        for key, ref_value in reference.items():
            child_path = f"{path}.{key}" if path else key
            if key in actual:
                _absorb(_diff_json(ref_value, actual[key], child_path))
            else:
                diff.missing.append(f"{child_path}: {json.dumps(ref_value)}")
        for key, actual_value in actual.items():
            if key not in reference:
                child_path = f"{path}.{key}" if path else key
                diff.unexpected.append(f"{child_path}: {json.dumps(actual_value)}")

    elif isinstance(reference, list) and isinstance(actual, list):
        shared = min(len(reference), len(actual))
        for i, (ref_item, actual_item) in enumerate(zip(reference, actual)):
            _absorb(_diff_json(ref_item, actual_item, f"{path}[{i}]"))
        # At most one of the two tails below is non-empty.
        for i, ref_item in enumerate(reference[shared:], start=shared):
            diff.missing.append(f"{path}[{i}]: {json.dumps(ref_item)}")
        for i, actual_item in enumerate(actual[shared:], start=shared):
            diff.unexpected.append(f"{path}[{i}]: {json.dumps(actual_item)}")

    # bool is a subclass of int in Python, so ``True == 1`` and ``False == 0``.
    # JSON booleans and numbers are distinct, so a bool/non-bool pair is a change
    # even when ``!=`` says the values are equal.
    elif isinstance(reference, bool) != isinstance(actual, bool) or reference != actual:
        diff.changed.append(f"{path}: {json.dumps(reference)} → {json.dumps(actual)}")

    return diff
70
+
71
+
72
+ def _deep_structural_compare(
73
+ reference: Any,
74
+ actual: Any,
75
+ path: str = "",
76
+ ignore_fields: list[str] | None = None,
77
+ ) -> list[str]:
78
+ """Compare structure only: same keys, same types, same array lengths. Ignores string values."""
79
+ ignore = set(ignore_fields or [])
80
+ issues: list[str] = []
81
+
82
+ if isinstance(reference, dict) and isinstance(actual, dict):
83
+ for key in reference:
84
+ if key in ignore:
85
+ continue
86
+ child_path = f"{path}.{key}" if path else key
87
+ if key not in actual:
88
+ issues.append(f"Missing key: {child_path}")
89
+ else:
90
+ issues.extend(
91
+ _deep_structural_compare(reference[key], actual[key], child_path, ignore_fields)
92
+ )
93
+ for key in actual:
94
+ if key in ignore:
95
+ continue
96
+ child_path = f"{path}.{key}" if path else key
97
+ if key not in reference:
98
+ issues.append(f"Unexpected key: {child_path}")
99
+
100
+ elif isinstance(reference, list) and isinstance(actual, list):
101
+ if len(reference) != len(actual):
102
+ issues.append(
103
+ f"Array length mismatch at {path or 'root'}: "
104
+ f"expected {len(reference)}, got {len(actual)}"
105
+ )
106
+ for i in range(min(len(reference), len(actual))):
107
+ child_path = f"{path}[{i}]"
108
+ issues.extend(
109
+ _deep_structural_compare(reference[i], actual[i], child_path, ignore_fields)
110
+ )
111
+
112
+ elif type(reference) is not type(actual):
113
+ issues.append(
114
+ f"Type mismatch at {path or 'root'}: "
115
+ f"expected {type(reference).__name__}, got {type(actual).__name__}"
116
+ )
117
+
118
+ return issues
119
+
120
+
121
def golden_match(output: AgentOutput, config: Any) -> AssertionResult:
    """Compare the agent's ``structured_response`` against a golden reference file.

    Args:
        output: Agent run whose ``structured_response`` is compared.
        config: A dict with:
            ``reference``: path to the JSON reference file (required);
            ``mode``: ``"exact"`` (default) for a full diff, or ``"structural"``
            to compare keys, types and array lengths only;
            ``ignore_fields``: keys skipped in structural mode (optional).

    Returns:
        A passing result on a match. A failing result when the structures differ,
        the config is unusable, the reference file is missing or unreadable, or
        the mode is unknown.
    """
    # A malformed eval case should produce a failed assertion, not an exception.
    if not isinstance(config, dict) or "reference" not in config:
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail="Config must be a mapping with a 'reference' key",
        )

    ref_path = Path(config["reference"])
    mode: str = config.get("mode", "exact")
    # ``or []`` also covers an explicit null value for the key.
    ignore_fields: list[str] = config.get("ignore_fields") or []

    if not ref_path.exists():
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail=f"Reference file not found: {ref_path}. Run with --update-golden to create.",
        )

    try:
        reference = json.loads(ref_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return AssertionResult(
            passed=False,
            name="golden_match",
            detail=f"Could not load reference file {ref_path}: {e}",
        )
    actual = output.structured_response

    if mode == "exact":
        diff = _diff_json(reference, actual)
        if diff.has_differences:
            return AssertionResult(
                passed=False,
                name="golden_match",
                detail=f"Reference: {ref_path}\n\n{diff.to_detail()}",
            )
        return AssertionResult(passed=True, name="golden_match", detail="Exact match")

    if mode == "structural":
        issues = _deep_structural_compare(reference, actual, ignore_fields=ignore_fields)
        if issues:
            return AssertionResult(
                passed=False,
                name="golden_match",
                detail="Structural mismatch:\n" + "\n".join(f" {i}" for i in issues),
            )
        return AssertionResult(passed=True, name="golden_match", detail="Structural match")

    return AssertionResult(
        passed=False,
        name="golden_match",
        detail=f"Unknown mode: {mode}. Use 'exact' or 'structural'.",
    )
@@ -0,0 +1,196 @@
1
+ # ABOUTME: LLM-as-judge assertion functions wrapping OpenEvals.
2
+ # ABOUTME: Evaluates free-text agent responses against criteria using an LLM judge.
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ from typing import Any
8
+
9
+ from openevals.llm import create_llm_as_judge # pyright: ignore[reportMissingImports]
10
+
11
+ from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Default judge model — cheap and fast for evaluation
16
+ _DEFAULT_JUDGE_MODEL = "google_genai/gemini-2.0-flash"
17
+
18
+ _LLM_JUDGE_PROMPT = """You are evaluating an AI agent's response.
19
+
20
+ Criteria: {criteria}
21
+
22
+ Agent response:
23
+ {outputs}
24
+
25
+ Rate how well the response meets the criteria on a scale from 0.0 to 1.0."""
26
+
27
+
28
+ def _last_ai_content(agent_output: AgentOutput) -> str:
29
+ """Extract text content from the last AI message."""
30
+ for msg in reversed(agent_output.messages):
31
+ if hasattr(msg, "type") and msg.type == "ai" and msg.content:
32
+ return str(msg.content)
33
+ return ""
34
+
35
+
36
def llm_judge(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Score the last AI message against criteria with an LLM judge.

    ``config`` is either a criteria string (threshold 0.5, default judge model) or
    a dict with ``"criteria"``, an optional ``"threshold"`` (default 0.5) and an
    optional ``"model"``. The assertion passes when the judge's score is at least
    the threshold. Any exception raised while judging is logged and reported as a
    failed result.
    """
    if isinstance(config, str):
        criteria, threshold, model = config, 0.5, _DEFAULT_JUDGE_MODEL
    elif isinstance(config, dict):
        criteria = config.get("criteria", "")
        threshold = config.get("threshold", 0.5)
        model = config.get("model", _DEFAULT_JUDGE_MODEL)
    else:
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail=f"Invalid config type: {type(config).__name__}",
        )

    if not criteria:
        return AssertionResult(passed=False, name="llm_judge", detail="No criteria specified")

    response_text = _last_ai_content(agent_output)

    try:
        judge = create_llm_as_judge(
            prompt=_LLM_JUDGE_PROMPT,
            model=model,
            continuous=True,
            feedback_key="score",
        )
        verdicts = judge(outputs=response_text, criteria=criteria)

        # The judge may hand back a list of results; only the first one is used.
        verdict = verdicts[0] if isinstance(verdicts, list) else verdicts
        if isinstance(verdict, dict):
            score = float(verdict["score"])
            comment = verdict.get("comment") or ""
        else:
            score = float(verdict.score)
            comment = verdict.comment or ""

        return AssertionResult(
            passed=score >= threshold,
            name="llm_judge",
            detail=f"Score: {score:.2f} (threshold: {threshold}). {comment}",
        )
    except Exception as e:
        logger.warning("LLM judge failed: %s", e)
        return AssertionResult(
            passed=False,
            name="llm_judge",
            detail=f"Judge error: {type(e).__name__}: {e}",
        )
99
+
100
+
101
+ _TRAJECTORY_PROMPT = """You are evaluating an AI agent's tool usage trajectory.
102
+
103
+ Criteria: {criteria}
104
+
105
+ Full conversation (messages and tool calls):
106
+ {outputs}
107
+
108
+ Rate how well the agent's tool usage meets the criteria on a scale from 0.0 to 1.0.
109
+ Consider: Were tools used efficiently? Were there redundant calls? Was the sequence logical?"""
110
+
111
+
112
+ def _format_trajectory(agent_output: AgentOutput) -> str:
113
+ """Format the full message history including tool calls for the judge."""
114
+ lines: list[str] = []
115
+ for msg in agent_output.messages:
116
+ msg_type = getattr(msg, "type", "unknown")
117
+ content = str(msg.content) if msg.content else ""
118
+
119
+ if msg_type == "human":
120
+ lines.append(f"[User]: {content}")
121
+ elif msg_type == "ai":
122
+ lines.append(f"[Agent]: {content}")
123
+ tool_calls = getattr(msg, "tool_calls", None)
124
+ if tool_calls:
125
+ for tc in tool_calls:
126
+ if isinstance(tc, dict):
127
+ lines.append(f" -> Tool call: {tc.get('name', '?')}({tc.get('args', {})})")
128
+ elif hasattr(tc, "name"):
129
+ lines.append(f" -> Tool call: {tc.name}({getattr(tc, 'args', {})})")
130
+ elif msg_type == "tool":
131
+ tool_name = getattr(msg, "name", "?")
132
+ lines.append(f"[Tool result ({tool_name})]: {content[:200]}...")
133
+ return "\n".join(lines)
134
+
135
+
136
def trajectory_quality(agent_output: AgentOutput, config: Any) -> AssertionResult:
    """Score the agent's tool-usage trajectory against criteria with an LLM judge.

    ``config`` is either a criteria string (threshold 0.5, default judge model) or
    a dict with ``"criteria"``, an optional ``"threshold"`` (default 0.5) and an
    optional ``"model"``. The full formatted conversation is sent to the judge.
    The assertion passes when the score is at least the threshold. Any exception
    raised while judging is logged and reported as a failed result.
    """
    if isinstance(config, str):
        criteria, threshold, model = config, 0.5, _DEFAULT_JUDGE_MODEL
    elif isinstance(config, dict):
        criteria = config.get("criteria", "")
        threshold = config.get("threshold", 0.5)
        model = config.get("model", _DEFAULT_JUDGE_MODEL)
    else:
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail=f"Invalid config type: {type(config).__name__}",
        )

    if not criteria:
        return AssertionResult(
            passed=False, name="trajectory_quality", detail="No criteria specified"
        )

    conversation = _format_trajectory(agent_output)

    try:
        judge = create_llm_as_judge(
            prompt=_TRAJECTORY_PROMPT,
            model=model,
            continuous=True,
            feedback_key="score",
        )
        verdicts = judge(outputs=conversation, criteria=criteria)

        # The judge may hand back a list of results; only the first one is used.
        verdict = verdicts[0] if isinstance(verdicts, list) else verdicts
        if isinstance(verdict, dict):
            score = float(verdict["score"])
            comment = verdict.get("comment") or ""
        else:
            score = float(verdict.score)
            comment = verdict.comment or ""

        return AssertionResult(
            passed=score >= threshold,
            name="trajectory_quality",
            detail=f"Score: {score:.2f} (threshold: {threshold}). {comment}",
        )
    except Exception as e:
        logger.warning("Trajectory quality judge failed: %s", e)
        return AssertionResult(
            passed=False,
            name="trajectory_quality",
            detail=f"Judge error: {type(e).__name__}: {e}",
        )