autobots-devtools-shared-lib 0.6.1__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/PKG-INFO +1 -1
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/pyproject.toml +1 -1
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/assertions/golden.py +22 -5
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/assertions/registry.py +2 -0
- autobots_devtools_shared_lib-0.7.0/src/autobots_devtools_shared_lib/eval/assertions/written_file.py +233 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/core/runner.py +2 -0
- autobots_devtools_shared_lib-0.7.0/src/autobots_devtools_shared_lib/eval/core/workspace.py +129 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/pytest_plugin/fixtures.py +2 -3
- autobots_devtools_shared_lib-0.6.1/src/autobots_devtools_shared_lib/eval/core/workspace.py +0 -51
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/README.md +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/config/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/config/jenkins_config.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/config/jenkins_constants.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/config/jenkins_loader.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/logging_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/otel_fastapi.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/trace_metadata.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/trace_propagation.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/observability/tracing.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/README.md +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/app.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/config.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/servers/fileserver/models.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/README.md +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/cache_backed.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/db_repository.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/factory.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/in_memory.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/redis_store.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/services/context/store.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/context_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/format_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/fserver_client_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_builtin_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/tools/jenkins_pipeline_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/context_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/format_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/fserver_client_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_builtin_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_http_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/common/utils/jenkins_pipeline_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_config_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/agent_meta.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/base_agent.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/batch.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/invocation_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/agents/middleware.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/config/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/config/dynagent_settings.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/llm/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/llm/llm.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/models/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/models/state.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/services/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/services/structured_converter.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/tools/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/tools/state_tools.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/tools/tool_registry.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/ui/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/ui/default_ui.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/ui/ui_utils.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/utils/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/dynagent/utils/schema_directive_resolver.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/assertions/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/assertions/deterministic.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/assertions/llm_judge.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/core/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/core/cost_tracker.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/core/loader.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/models/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/models/eval_case.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/models/result.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/pytest_plugin/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/pytest_plugin/plugin.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/pytest_plugin/reporting.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/scoring/__init__.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/eval/scoring/langfuse_scorer.py +0 -0
- {autobots_devtools_shared_lib-0.6.1 → autobots_devtools_shared_lib-0.7.0}/src/autobots_devtools_shared_lib/py.typed +0 -0
|
@@ -31,21 +31,38 @@ class JsonDiff:
|
|
|
31
31
|
return "\n".join(lines)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def _diff_json(
|
|
35
|
-
|
|
34
|
+
def _diff_json(
|
|
35
|
+
reference: Any,
|
|
36
|
+
actual: Any,
|
|
37
|
+
path: str = "",
|
|
38
|
+
ignore_fields: list[str] | None = None,
|
|
39
|
+
) -> JsonDiff:
|
|
40
|
+
"""Recursive deep diff between two JSON-like structures.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
reference: Expected JSON value.
|
|
44
|
+
actual: Actual JSON value from agent output.
|
|
45
|
+
path: Dot-path prefix used in diff messages (internal).
|
|
46
|
+
ignore_fields: Key names to skip at any level of the dict tree.
|
|
47
|
+
"""
|
|
48
|
+
ignore = set(ignore_fields or [])
|
|
36
49
|
diff = JsonDiff()
|
|
37
50
|
|
|
38
51
|
if isinstance(reference, dict) and isinstance(actual, dict):
|
|
39
52
|
for key in reference:
|
|
53
|
+
if key in ignore:
|
|
54
|
+
continue
|
|
40
55
|
child_path = f"{path}.{key}" if path else key
|
|
41
56
|
if key not in actual:
|
|
42
57
|
diff.missing.append(f"{child_path}: {json.dumps(reference[key])}")
|
|
43
58
|
else:
|
|
44
|
-
child = _diff_json(reference[key], actual[key], child_path)
|
|
59
|
+
child = _diff_json(reference[key], actual[key], child_path, ignore_fields)
|
|
45
60
|
diff.missing.extend(child.missing)
|
|
46
61
|
diff.unexpected.extend(child.unexpected)
|
|
47
62
|
diff.changed.extend(child.changed)
|
|
48
63
|
for key in actual:
|
|
64
|
+
if key in ignore:
|
|
65
|
+
continue
|
|
49
66
|
child_path = f"{path}.{key}" if path else key
|
|
50
67
|
if key not in reference:
|
|
51
68
|
diff.unexpected.append(f"{child_path}: {json.dumps(actual[key])}")
|
|
@@ -58,7 +75,7 @@ def _diff_json(reference: Any, actual: Any, path: str = "") -> JsonDiff:
|
|
|
58
75
|
elif i >= len(reference):
|
|
59
76
|
diff.unexpected.append(f"{child_path}: {json.dumps(actual[i])}")
|
|
60
77
|
else:
|
|
61
|
-
child = _diff_json(reference[i], actual[i], child_path)
|
|
78
|
+
child = _diff_json(reference[i], actual[i], child_path, ignore_fields)
|
|
62
79
|
diff.missing.extend(child.missing)
|
|
63
80
|
diff.unexpected.extend(child.unexpected)
|
|
64
81
|
diff.changed.extend(child.changed)
|
|
@@ -135,7 +152,7 @@ def golden_match(output: AgentOutput, config: Any) -> AssertionResult:
|
|
|
135
152
|
actual = output.structured_response
|
|
136
153
|
|
|
137
154
|
if mode == "exact":
|
|
138
|
-
diff = _diff_json(reference, actual)
|
|
155
|
+
diff = _diff_json(reference, actual, ignore_fields=ignore_fields)
|
|
139
156
|
if diff.has_differences:
|
|
140
157
|
return AssertionResult(
|
|
141
158
|
passed=False,
|
|
@@ -39,6 +39,7 @@ def _register_builtins() -> None:
|
|
|
39
39
|
llm_judge,
|
|
40
40
|
trajectory_quality,
|
|
41
41
|
)
|
|
42
|
+
from autobots_devtools_shared_lib.eval.assertions.written_file import written_file_matches
|
|
42
43
|
|
|
43
44
|
_REGISTRY.update(
|
|
44
45
|
cast(
|
|
@@ -56,6 +57,7 @@ def _register_builtins() -> None:
|
|
|
56
57
|
"llm_judge": llm_judge,
|
|
57
58
|
"trajectory_quality": trajectory_quality,
|
|
58
59
|
"golden_match": golden_match,
|
|
60
|
+
"written_file_matches": written_file_matches,
|
|
59
61
|
},
|
|
60
62
|
)
|
|
61
63
|
)
|
autobots_devtools_shared_lib-0.7.0/src/autobots_devtools_shared_lib/eval/assertions/written_file.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# ABOUTME: Assertions for files written to the file server workspace by agents.
|
|
2
|
+
# ABOUTME: Complements golden_match (structured_response) for agents that output via file tools.
|
|
3
|
+
"""written_file_matches assertion evaluator."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import jsonschema as js
|
|
13
|
+
|
|
14
|
+
from autobots_devtools_shared_lib.common.utils.fserver_client_utils import read_file as _read_file
|
|
15
|
+
from autobots_devtools_shared_lib.eval.assertions.golden import _deep_structural_compare, _diff_json
|
|
16
|
+
from autobots_devtools_shared_lib.eval.core.workspace import resolve_workspace_context
|
|
17
|
+
from autobots_devtools_shared_lib.eval.models.result import AgentOutput, AssertionResult
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# Mode handlers
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
FileModeHandler = Any # (content: str, actual: Any, config: dict, name: str) -> AssertionResult
|
|
24
|
+
|
|
25
|
+
_MODE_REGISTRY: dict[str, FileModeHandler] = {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _register_mode(name: str, fn: FileModeHandler) -> None:
|
|
29
|
+
_MODE_REGISTRY[name] = fn
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_mode(mode: str, assertion_name: str) -> FileModeHandler | AssertionResult:
|
|
33
|
+
if mode not in _MODE_REGISTRY:
|
|
34
|
+
available = ", ".join(sorted(_MODE_REGISTRY.keys()))
|
|
35
|
+
return AssertionResult(
|
|
36
|
+
passed=False,
|
|
37
|
+
name=assertion_name,
|
|
38
|
+
detail=f"Unknown mode: '{mode}'. Available: {available}",
|
|
39
|
+
)
|
|
40
|
+
return _MODE_REGISTRY[mode]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Helpers
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _strip_code_fences(text: str) -> str:
|
|
49
|
+
match = re.search(r"```(?:\w+)?\s*\n?(.*?)\n?```", text, re.DOTALL)
|
|
50
|
+
return match.group(1).strip() if match else text.strip()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _read_workspace_file(file_name: str, raw_state: dict[str, Any]) -> str:
|
|
54
|
+
workspace_context = resolve_workspace_context(raw_state)
|
|
55
|
+
content = _read_file(file_name, workspace_context)
|
|
56
|
+
if content.startswith("Error"):
|
|
57
|
+
raise RuntimeError(f"File server read failed for '{file_name}': {content}")
|
|
58
|
+
return content
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _load_json(content: str, assertion_name: str) -> tuple[Any, AssertionResult | None]:
|
|
62
|
+
try:
|
|
63
|
+
return json.loads(_strip_code_fences(content)), None
|
|
64
|
+
except json.JSONDecodeError as e:
|
|
65
|
+
return None, AssertionResult(
|
|
66
|
+
passed=False, name=assertion_name, detail=f"JSON parse error: {e}"
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _load_reference(
|
|
71
|
+
config: dict[str, Any], assertion_name: str
|
|
72
|
+
) -> tuple[Any, AssertionResult | None]:
|
|
73
|
+
ref_path_str = config.get("reference")
|
|
74
|
+
if not ref_path_str:
|
|
75
|
+
return None, AssertionResult(
|
|
76
|
+
passed=False,
|
|
77
|
+
name=assertion_name,
|
|
78
|
+
detail=f"Mode '{config.get('mode')}' requires 'reference'",
|
|
79
|
+
)
|
|
80
|
+
ref_path = Path(ref_path_str)
|
|
81
|
+
if not ref_path.exists():
|
|
82
|
+
return None, AssertionResult(
|
|
83
|
+
passed=False, name=assertion_name, detail=f"Reference not found: {ref_path}"
|
|
84
|
+
)
|
|
85
|
+
return json.loads(ref_path.read_text()), None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# Built-in mode implementations
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _mode_contains(
|
|
94
|
+
content: str, _actual: Any, config: dict[str, Any], name: str
|
|
95
|
+
) -> AssertionResult:
|
|
96
|
+
value = str(config.get("value", ""))
|
|
97
|
+
found = value.lower() in content.lower()
|
|
98
|
+
return AssertionResult(
|
|
99
|
+
passed=found,
|
|
100
|
+
name=name,
|
|
101
|
+
detail=f"{'Found' if found else 'Not found'}: {value!r}",
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _mode_schema(_content: str, actual: Any, config: dict[str, Any], name: str) -> AssertionResult:
|
|
106
|
+
schema_source = config.get("schema")
|
|
107
|
+
if schema_source is None:
|
|
108
|
+
return AssertionResult(
|
|
109
|
+
passed=False, name=name, detail="Mode 'schema' requires 'schema' key"
|
|
110
|
+
)
|
|
111
|
+
try:
|
|
112
|
+
schema: dict[str, Any] = (
|
|
113
|
+
json.loads(Path(str(schema_source)).read_text())
|
|
114
|
+
if isinstance(schema_source, str)
|
|
115
|
+
else schema_source
|
|
116
|
+
)
|
|
117
|
+
js.validate(instance=actual, schema=schema)
|
|
118
|
+
return AssertionResult(passed=True, name=name, detail="Schema valid")
|
|
119
|
+
except js.ValidationError as e:
|
|
120
|
+
return AssertionResult(passed=False, name=name, detail=f"Schema invalid: {e.message}")
|
|
121
|
+
except (FileNotFoundError, OSError) as e:
|
|
122
|
+
return AssertionResult(passed=False, name=name, detail=f"Schema load error: {e}")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _mode_exact(_content: str, actual: Any, config: dict[str, Any], name: str) -> AssertionResult:
|
|
126
|
+
reference, err = _load_reference(config, name)
|
|
127
|
+
if err:
|
|
128
|
+
return err
|
|
129
|
+
ignore_fields: list[str] = config.get("ignore_fields", [])
|
|
130
|
+
diff = _diff_json(reference, actual, ignore_fields=ignore_fields)
|
|
131
|
+
if diff.has_differences:
|
|
132
|
+
return AssertionResult(passed=False, name=name, detail=diff.to_detail())
|
|
133
|
+
return AssertionResult(passed=True, name=name, detail="Exact match")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _mode_structural(
|
|
137
|
+
_content: str, actual: Any, config: dict[str, Any], name: str
|
|
138
|
+
) -> AssertionResult:
|
|
139
|
+
reference, err = _load_reference(config, name)
|
|
140
|
+
if err:
|
|
141
|
+
return err
|
|
142
|
+
ignore_fields: list[str] = config.get("ignore_fields", [])
|
|
143
|
+
issues = _deep_structural_compare(reference, actual, ignore_fields=ignore_fields)
|
|
144
|
+
if issues:
|
|
145
|
+
return AssertionResult(
|
|
146
|
+
passed=False,
|
|
147
|
+
name=name,
|
|
148
|
+
detail="Structural mismatch:\n" + "\n".join(f" {i}" for i in issues),
|
|
149
|
+
)
|
|
150
|
+
return AssertionResult(passed=True, name=name, detail="Structural match")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
_register_mode("contains", _mode_contains)
|
|
154
|
+
_register_mode("schema", _mode_schema)
|
|
155
|
+
_register_mode("exact", _mode_exact)
|
|
156
|
+
_register_mode("structural", _mode_structural)
|
|
157
|
+
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
# Core dispatch
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
_JSON_MODES = {"schema", "exact", "structural"}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _single_file_match(
|
|
166
|
+
path: str, config: dict[str, Any], agent_output: AgentOutput
|
|
167
|
+
) -> AssertionResult:
|
|
168
|
+
assertion_name = f"written_file_matches:{path}"
|
|
169
|
+
mode = config.get("mode", "schema")
|
|
170
|
+
|
|
171
|
+
handler = _resolve_mode(mode, assertion_name)
|
|
172
|
+
if isinstance(handler, AssertionResult):
|
|
173
|
+
return handler
|
|
174
|
+
|
|
175
|
+
try:
|
|
176
|
+
content = _read_workspace_file(path, agent_output.raw_state)
|
|
177
|
+
except RuntimeError as e:
|
|
178
|
+
return AssertionResult(passed=False, name=assertion_name, detail=str(e))
|
|
179
|
+
|
|
180
|
+
# Modes that need parsed JSON get it up front
|
|
181
|
+
actual: Any = None
|
|
182
|
+
if mode in _JSON_MODES:
|
|
183
|
+
actual, err = _load_json(content, assertion_name)
|
|
184
|
+
if err:
|
|
185
|
+
return err
|
|
186
|
+
|
|
187
|
+
return handler(content, actual, config, assertion_name)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
# Public entry point
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def written_file_matches(agent_output: AgentOutput, config: Any) -> AssertionResult:
|
|
196
|
+
"""Assert one or more workspace files match expected content/structure.
|
|
197
|
+
|
|
198
|
+
Config can be a single dict or a list of dicts. All entries must pass.
|
|
199
|
+
|
|
200
|
+
YAML config keys (per entry):
|
|
201
|
+
path (str): Workspace-relative file path (required).
|
|
202
|
+
mode (str): schema | exact | structural | contains (default: schema).
|
|
203
|
+
schema (str): Path to JSON schema file (mode=schema).
|
|
204
|
+
reference (str): Path to golden reference file (mode=exact|structural).
|
|
205
|
+
ignore_fields (list[str]): Keys to skip at any dict level (mode=exact|structural).
|
|
206
|
+
value (str): Substring to search for (mode=contains).
|
|
207
|
+
"""
|
|
208
|
+
if isinstance(config, list):
|
|
209
|
+
entries = config
|
|
210
|
+
elif isinstance(config, dict):
|
|
211
|
+
entries = [config]
|
|
212
|
+
else:
|
|
213
|
+
return AssertionResult(
|
|
214
|
+
passed=False, name="written_file_matches", detail="Config must be a dict or list"
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
results = [_single_file_match(entry.get("path", ""), entry, agent_output) for entry in entries]
|
|
218
|
+
failures = [r for r in results if not r.passed]
|
|
219
|
+
if failures:
|
|
220
|
+
if len(entries) == 1:
|
|
221
|
+
return failures[0]
|
|
222
|
+
return AssertionResult(
|
|
223
|
+
passed=False,
|
|
224
|
+
name="written_file_matches",
|
|
225
|
+
detail="\n".join(f"{r.name}: {r.detail}" for r in failures),
|
|
226
|
+
)
|
|
227
|
+
if len(entries) == 1:
|
|
228
|
+
return results[0]
|
|
229
|
+
return AssertionResult(
|
|
230
|
+
passed=True,
|
|
231
|
+
name="written_file_matches",
|
|
232
|
+
detail=f"All {len(results)} files matched",
|
|
233
|
+
)
|
|
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
8
8
|
|
|
9
9
|
from autobots_devtools_shared_lib.dynagent.agents.invocation_utils import ainvoke_agent
|
|
10
10
|
from autobots_devtools_shared_lib.eval.assertions.registry import resolve_assertion
|
|
11
|
+
from autobots_devtools_shared_lib.eval.core.workspace import resolve_eval_state_schema
|
|
11
12
|
from autobots_devtools_shared_lib.eval.models.result import (
|
|
12
13
|
AgentOutput,
|
|
13
14
|
AssertionResult,
|
|
@@ -111,6 +112,7 @@ async def run_linear_eval(
|
|
|
111
112
|
config=config,
|
|
112
113
|
enable_tracing=trace_metadata is not None,
|
|
113
114
|
trace_metadata=trace_metadata,
|
|
115
|
+
state_schema=resolve_eval_state_schema(),
|
|
114
116
|
)
|
|
115
117
|
|
|
116
118
|
agent_output = _build_agent_output(result)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# ABOUTME: Workspace staging for eval runs + WorkspaceContextProvider interface.
|
|
2
|
+
# ABOUTME: Consumers register a provider so shared-lib never hard-codes workspace path formation.
|
|
3
|
+
"""Workspace file staging and pluggable workspace context provider."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Protocol
|
|
10
|
+
|
|
11
|
+
from autobots_devtools_shared_lib.common.utils.fserver_client_utils import write_file
|
|
12
|
+
from autobots_devtools_shared_lib.dynagent.models.state import Dynagent
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from langchain.agents import AgentState
|
|
16
|
+
|
|
17
|
+
from autobots_devtools_shared_lib.eval.models.eval_case import SetupConfig
|
|
18
|
+
|
|
19
|
+
_state_schema: type[Any] = Dynagent
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def register_eval_state_schema(schema: type[Any]) -> None:
|
|
23
|
+
"""Register the LangGraph state schema for eval agent invocations.
|
|
24
|
+
|
|
25
|
+
Call once in conftest.py when your agents use a custom state class (e.g. MerState)
|
|
26
|
+
that extends Dynagent with domain-specific fields. Without this, ainvoke_agent
|
|
27
|
+
defaults to Dynagent and drops extra state fields (e.g. jira_number, repo_name).
|
|
28
|
+
|
|
29
|
+
Example (MER consumer)::
|
|
30
|
+
|
|
31
|
+
from autobots_devtools_shared_lib.eval.core.workspace import register_eval_state_schema
|
|
32
|
+
from autobots_agents_mer.common.models.state import MerState
|
|
33
|
+
|
|
34
|
+
register_eval_state_schema(MerState)
|
|
35
|
+
"""
|
|
36
|
+
global _state_schema
|
|
37
|
+
_state_schema = schema
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def resolve_eval_state_schema() -> type[AgentState]:
|
|
41
|
+
"""Return the registered state schema for eval agent invocations."""
|
|
42
|
+
return _state_schema
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class WorkspaceContextProvider(Protocol):
|
|
46
|
+
"""Protocol for building file-server workspace context from agent state.
|
|
47
|
+
|
|
48
|
+
Implement this in your consumer conftest.py and register via
|
|
49
|
+
register_workspace_context_provider(). Path formation is intentionally
|
|
50
|
+
kept out of shared-lib — each consumer app may have a different convention.
|
|
51
|
+
|
|
52
|
+
Example (MER consumer)::
|
|
53
|
+
|
|
54
|
+
class MerWorkspaceContextProvider:
|
|
55
|
+
def get_workspace_context(self, state: dict) -> str:
|
|
56
|
+
ws = get_workspace_context(state) # MER util
|
|
57
|
+
return json.dumps(ws)
|
|
58
|
+
|
|
59
|
+
register_workspace_context_provider(MerWorkspaceContextProvider())
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
def get_workspace_context(self, state: dict[str, Any]) -> str:
|
|
63
|
+
"""Return workspace_context JSON string for fserver_client_utils calls.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
state: Agent state dict (e.g. user_name, repo_name, jira_number).
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
JSON string, e.g. '{"workspace_base_path": "alice/fbp-core-MER-99999"}'.
|
|
70
|
+
"""
|
|
71
|
+
...
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
_provider: WorkspaceContextProvider | None = None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def register_workspace_context_provider(provider: WorkspaceContextProvider) -> None:
|
|
78
|
+
"""Register the workspace context provider.
|
|
79
|
+
|
|
80
|
+
Call once at eval startup, typically from conftest.py, before any evals run.
|
|
81
|
+
"""
|
|
82
|
+
global _provider
|
|
83
|
+
_provider = provider
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def resolve_workspace_context(state: dict[str, Any]) -> str:
|
|
87
|
+
"""Return workspace_context JSON via the registered provider.
|
|
88
|
+
|
|
89
|
+
Raises:
|
|
90
|
+
RuntimeError: If no provider has been registered.
|
|
91
|
+
"""
|
|
92
|
+
if _provider is None:
|
|
93
|
+
raise RuntimeError(
|
|
94
|
+
"No WorkspaceContextProvider registered. "
|
|
95
|
+
"Call register_workspace_context_provider() in your conftest.py before running evals."
|
|
96
|
+
)
|
|
97
|
+
return _provider.get_workspace_context(state)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def setup_workspace(config: SetupConfig, state: dict[str, Any] | None = None) -> None:
    """Stage fixture files into the file server workspace before agent invocation.

    Each entry of ``config.workspace_files`` is read from disk (its ``src`` is
    resolved against the ``APP_ROOT_PATH`` environment variable) and written
    to ``dest`` through ``write_file`` using the provider-supplied workspace
    context.

    Args:
        config: Setup configuration with workspace_files to stage.
        state: EvalCase state dict used to resolve workspace context via the
            provider. ``None`` is treated as an empty dict.

    Raises:
        FileNotFoundError: If a source fixture is missing or is not a regular file.
        RuntimeError: If the file server returns an error, or if files need
            staging and no provider is registered.
    """
    workspace_files = config.workspace_files
    if not workspace_files:
        # Nothing to stage, so evals without fixtures do not need a
        # registered WorkspaceContextProvider.
        return

    app_root_path = os.getenv("APP_ROOT_PATH", "")
    workspace_context = resolve_workspace_context(state or {})

    for wf in workspace_files:
        src = Path(app_root_path, wf.src)
        # is_file() (not exists()) so a directory is reported as a missing
        # fixture instead of failing later inside read_text().
        if not src.is_file():
            raise FileNotFoundError(
                f"Fixture file not found: {src}. "
                "Ensure the file exists in the eval fixtures directory."
            )
        content = src.read_text(encoding="utf-8")
        result = write_file(wf.dest, content, workspace_context)
        # This code treats a result string starting with "Error" as a failure.
        if result.startswith("Error"):
            raise RuntimeError(f"File server failed to stage '{wf.src}' → '{wf.dest}': {result}")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def teardown_workspace(_workspace_path: str) -> None:
    """Do nothing; retained only so existing callers keep working.

    The file server manages its own storage, so there is no local workspace
    directory to remove. The argument is accepted and ignored.
    """
    return None
|
|
@@ -51,7 +51,6 @@ def make_dynagent_eval(
|
|
|
51
51
|
|
|
52
52
|
async def _eval(eval_case: EvalCase) -> EvalResult:
|
|
53
53
|
session_id = str(uuid.uuid4())
|
|
54
|
-
workspace_path = "/Users/shruthi/Projects/workspace/khushboo-2802394_infosys/fbp-core-genai-sanity-MER-9999"
|
|
55
54
|
|
|
56
55
|
config: RunnableConfig = {
|
|
57
56
|
"configurable": {
|
|
@@ -67,7 +66,7 @@ def make_dynagent_eval(
|
|
|
67
66
|
|
|
68
67
|
try:
|
|
69
68
|
# Stage workspace files
|
|
70
|
-
setup_workspace(eval_case.setup,
|
|
69
|
+
setup_workspace(eval_case.setup, state=eval_case.state or None)
|
|
71
70
|
|
|
72
71
|
# Run the eval
|
|
73
72
|
if eval_case.mode == "linear":
|
|
@@ -104,7 +103,7 @@ def make_dynagent_eval(
|
|
|
104
103
|
post_scores(session_id, result)
|
|
105
104
|
|
|
106
105
|
finally:
|
|
107
|
-
teardown_workspace(
|
|
106
|
+
teardown_workspace("")
|
|
108
107
|
|
|
109
108
|
return result
|
|
110
109
|
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
# ABOUTME: Workspace file staging for eval runs.
|
|
2
|
-
# ABOUTME: Copies fixture files into workspace directory before agent invocation.
|
|
3
|
-
"""Workspace file staging for eval runs."""
|
|
4
|
-
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
import shutil
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from typing import TYPE_CHECKING
|
|
10
|
-
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from autobots_devtools_shared_lib.eval.models.eval_case import SetupConfig
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def setup_workspace(config: SetupConfig, workspace_path: str) -> None:
    """Create workspace directory and stage fixture files.

    Each entry's ``src`` is resolved against the ``APP_ROOT_PATH`` environment
    variable and copied (with metadata, via ``shutil.copy2``) to
    ``workspace_path / dest``; missing parent directories are created.

    Args:
        config: Setup configuration with workspace_files to stage.
        workspace_path: Target workspace directory path.

    Raises:
        FileNotFoundError: If a source fixture is missing or is not a regular file.
    """
    import os

    app_root_path = os.getenv("APP_ROOT_PATH", "")
    workspace = Path(workspace_path)
    workspace.mkdir(parents=True, exist_ok=True)

    for wf in config.workspace_files:
        src = Path(app_root_path, wf.src)
        # is_file() (not exists()) so a directory is reported as a missing
        # fixture instead of failing inside shutil.copy2 with another OSError.
        if not src.is_file():
            raise FileNotFoundError(
                f"Fixture file not found: {src}. "
                "Ensure the file exists in the eval fixtures directory."
            )
        dest = workspace / wf.dest
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def teardown_workspace(workspace_path: str) -> None:
    """Remove workspace directory and all contents.

    A missing directory is not an error. An empty ``workspace_path`` is
    ignored: ``Path("")`` resolves to the current directory, so removing it
    would wipe the caller's working directory.

    Args:
        workspace_path: Workspace directory to remove.
    """
    if not workspace_path:
        # Guard: Path("") == Path("."), which always exists.
        return

    workspace = Path(workspace_path)
    if workspace.exists():
        shutil.rmtree(workspace)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|