tactus-0.31.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +49 -0
- tactus/adapters/__init__.py +9 -0
- tactus/adapters/broker_log.py +76 -0
- tactus/adapters/cli_hitl.py +189 -0
- tactus/adapters/cli_log.py +223 -0
- tactus/adapters/cost_collector_log.py +56 -0
- tactus/adapters/file_storage.py +367 -0
- tactus/adapters/http_callback_log.py +109 -0
- tactus/adapters/ide_log.py +71 -0
- tactus/adapters/lua_tools.py +336 -0
- tactus/adapters/mcp.py +289 -0
- tactus/adapters/mcp_manager.py +196 -0
- tactus/adapters/memory.py +53 -0
- tactus/adapters/plugins.py +419 -0
- tactus/backends/http_backend.py +58 -0
- tactus/backends/model_backend.py +35 -0
- tactus/backends/pytorch_backend.py +110 -0
- tactus/broker/__init__.py +12 -0
- tactus/broker/client.py +247 -0
- tactus/broker/protocol.py +183 -0
- tactus/broker/server.py +1123 -0
- tactus/broker/stdio.py +12 -0
- tactus/cli/__init__.py +7 -0
- tactus/cli/app.py +2245 -0
- tactus/cli/commands/__init__.py +0 -0
- tactus/core/__init__.py +32 -0
- tactus/core/config_manager.py +790 -0
- tactus/core/dependencies/__init__.py +14 -0
- tactus/core/dependencies/registry.py +180 -0
- tactus/core/dsl_stubs.py +2117 -0
- tactus/core/exceptions.py +66 -0
- tactus/core/execution_context.py +480 -0
- tactus/core/lua_sandbox.py +508 -0
- tactus/core/message_history_manager.py +236 -0
- tactus/core/mocking.py +286 -0
- tactus/core/output_validator.py +291 -0
- tactus/core/registry.py +499 -0
- tactus/core/runtime.py +2907 -0
- tactus/core/template_resolver.py +142 -0
- tactus/core/yaml_parser.py +301 -0
- tactus/docker/Dockerfile +61 -0
- tactus/docker/entrypoint.sh +69 -0
- tactus/dspy/__init__.py +39 -0
- tactus/dspy/agent.py +1144 -0
- tactus/dspy/broker_lm.py +181 -0
- tactus/dspy/config.py +212 -0
- tactus/dspy/history.py +196 -0
- tactus/dspy/module.py +405 -0
- tactus/dspy/prediction.py +318 -0
- tactus/dspy/signature.py +185 -0
- tactus/formatting/__init__.py +7 -0
- tactus/formatting/formatter.py +437 -0
- tactus/ide/__init__.py +9 -0
- tactus/ide/coding_assistant.py +343 -0
- tactus/ide/server.py +2223 -0
- tactus/primitives/__init__.py +49 -0
- tactus/primitives/control.py +168 -0
- tactus/primitives/file.py +229 -0
- tactus/primitives/handles.py +378 -0
- tactus/primitives/host.py +94 -0
- tactus/primitives/human.py +342 -0
- tactus/primitives/json.py +189 -0
- tactus/primitives/log.py +187 -0
- tactus/primitives/message_history.py +157 -0
- tactus/primitives/model.py +163 -0
- tactus/primitives/procedure.py +564 -0
- tactus/primitives/procedure_callable.py +318 -0
- tactus/primitives/retry.py +155 -0
- tactus/primitives/session.py +152 -0
- tactus/primitives/state.py +182 -0
- tactus/primitives/step.py +209 -0
- tactus/primitives/system.py +93 -0
- tactus/primitives/tool.py +375 -0
- tactus/primitives/tool_handle.py +279 -0
- tactus/primitives/toolset.py +229 -0
- tactus/protocols/__init__.py +38 -0
- tactus/protocols/chat_recorder.py +81 -0
- tactus/protocols/config.py +97 -0
- tactus/protocols/cost.py +31 -0
- tactus/protocols/hitl.py +71 -0
- tactus/protocols/log_handler.py +27 -0
- tactus/protocols/models.py +355 -0
- tactus/protocols/result.py +33 -0
- tactus/protocols/storage.py +90 -0
- tactus/providers/__init__.py +13 -0
- tactus/providers/base.py +92 -0
- tactus/providers/bedrock.py +117 -0
- tactus/providers/google.py +105 -0
- tactus/providers/openai.py +98 -0
- tactus/sandbox/__init__.py +63 -0
- tactus/sandbox/config.py +171 -0
- tactus/sandbox/container_runner.py +1099 -0
- tactus/sandbox/docker_manager.py +433 -0
- tactus/sandbox/entrypoint.py +227 -0
- tactus/sandbox/protocol.py +213 -0
- tactus/stdlib/__init__.py +10 -0
- tactus/stdlib/io/__init__.py +13 -0
- tactus/stdlib/io/csv.py +88 -0
- tactus/stdlib/io/excel.py +136 -0
- tactus/stdlib/io/file.py +90 -0
- tactus/stdlib/io/fs.py +154 -0
- tactus/stdlib/io/hdf5.py +121 -0
- tactus/stdlib/io/json.py +109 -0
- tactus/stdlib/io/parquet.py +83 -0
- tactus/stdlib/io/tsv.py +88 -0
- tactus/stdlib/loader.py +274 -0
- tactus/stdlib/tac/tactus/tools/done.tac +33 -0
- tactus/stdlib/tac/tactus/tools/log.tac +50 -0
- tactus/testing/README.md +273 -0
- tactus/testing/__init__.py +61 -0
- tactus/testing/behave_integration.py +380 -0
- tactus/testing/context.py +486 -0
- tactus/testing/eval_models.py +114 -0
- tactus/testing/evaluation_runner.py +222 -0
- tactus/testing/evaluators.py +634 -0
- tactus/testing/events.py +94 -0
- tactus/testing/gherkin_parser.py +134 -0
- tactus/testing/mock_agent.py +315 -0
- tactus/testing/mock_dependencies.py +234 -0
- tactus/testing/mock_hitl.py +171 -0
- tactus/testing/mock_registry.py +168 -0
- tactus/testing/mock_tools.py +133 -0
- tactus/testing/models.py +115 -0
- tactus/testing/pydantic_eval_runner.py +508 -0
- tactus/testing/steps/__init__.py +13 -0
- tactus/testing/steps/builtin.py +902 -0
- tactus/testing/steps/custom.py +69 -0
- tactus/testing/steps/registry.py +68 -0
- tactus/testing/test_runner.py +489 -0
- tactus/tracing/__init__.py +5 -0
- tactus/tracing/trace_manager.py +417 -0
- tactus/utils/__init__.py +1 -0
- tactus/utils/cost_calculator.py +72 -0
- tactus/utils/model_pricing.py +132 -0
- tactus/utils/safe_file_library.py +502 -0
- tactus/utils/safe_libraries.py +234 -0
- tactus/validation/LuaLexerBase.py +66 -0
- tactus/validation/LuaParserBase.py +23 -0
- tactus/validation/README.md +224 -0
- tactus/validation/__init__.py +7 -0
- tactus/validation/error_listener.py +21 -0
- tactus/validation/generated/LuaLexer.interp +231 -0
- tactus/validation/generated/LuaLexer.py +5548 -0
- tactus/validation/generated/LuaLexer.tokens +124 -0
- tactus/validation/generated/LuaLexerBase.py +66 -0
- tactus/validation/generated/LuaParser.interp +173 -0
- tactus/validation/generated/LuaParser.py +6439 -0
- tactus/validation/generated/LuaParser.tokens +124 -0
- tactus/validation/generated/LuaParserBase.py +23 -0
- tactus/validation/generated/LuaParserVisitor.py +118 -0
- tactus/validation/generated/__init__.py +7 -0
- tactus/validation/grammar/LuaLexer.g4 +123 -0
- tactus/validation/grammar/LuaParser.g4 +178 -0
- tactus/validation/semantic_visitor.py +817 -0
- tactus/validation/validator.py +157 -0
- tactus-0.31.0.dist-info/METADATA +1809 -0
- tactus-0.31.0.dist-info/RECORD +160 -0
- tactus-0.31.0.dist-info/WHEEL +4 -0
- tactus-0.31.0.dist-info/entry_points.txt +2 -0
- tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
tactus/testing/evaluators.py
@@ -0,0 +1,634 @@
"""
Evaluator mapping for Pydantic Evals integration.

This module maps Tactus evaluator configurations to Pydantic Evals
evaluator instances, including both built-in and custom evaluators.
"""

import logging
from dataclasses import dataclass
from typing import Optional

from .eval_models import EvaluatorConfig
from typing import Any

logger = logging.getLogger(__name__)

# Check if pydantic_evals is available
try:
    from pydantic_evals.evaluators import (
        Evaluator,
        EvaluatorContext,
        Contains,
        EqualsExpected,
        IsInstance,
        LLMJudge,
    )

    PYDANTIC_EVALS_AVAILABLE = True
except ImportError:
    PYDANTIC_EVALS_AVAILABLE = False

    # Create dummy base class for type hints
    class Evaluator:
        pass

    class EvaluatorContext:
        pass


@dataclass
class TraceAwareEvaluator:
    """
    Mixin class for evaluators that inspect execution traces.

    Provides helper methods to extract trace from context metadata or output.
    Subclasses should also inherit from Evaluator and implement evaluate().
    """

    def get_trace(self, ctx: EvaluatorContext) -> dict:
        """
        Extract trace from context.

        Trace can be in:
        1. ctx.metadata['trace'] - if passed via Case metadata
        2. ctx.output['__trace__'] - if returned by task function

        Args:
            ctx: Evaluator context

        Returns:
            Trace dictionary (empty dict if no trace found)
        """
        # Try metadata first
        if hasattr(ctx, "metadata") and ctx.metadata:
            trace = ctx.metadata.get("trace", {})
            if trace:
                return trace

        # Try output
        if isinstance(ctx.output, dict) and "__trace__" in ctx.output:
            return ctx.output["__trace__"]

        return {}

    def get_output(self, ctx: EvaluatorContext) -> Any:
        """
        Extract actual output (without trace wrapper).

        Args:
            ctx: Evaluator context

        Returns:
            Actual output value
        """
        if isinstance(ctx.output, dict) and "__output__" in ctx.output:
            return ctx.output["__output__"]
        return ctx.output


def create_evaluator(config: EvaluatorConfig) -> Evaluator:
    """
    Create a Pydantic Evals evaluator from Tactus config.

    Args:
        config: Tactus evaluator configuration

    Returns:
        Pydantic Evals Evaluator instance

    Raises:
        ValueError: If evaluator type is unknown
        ImportError: If pydantic_evals is not installed
    """
    if not PYDANTIC_EVALS_AVAILABLE:
        raise ImportError("pydantic_evals is required. Install with: pip install pydantic-evals")

    evaluator_type = config.type.lower()

    # Built-in Pydantic Evals evaluators
    if evaluator_type == "contains":
        return _create_contains_evaluator(config)
    elif evaluator_type == "contains_any":
        return _create_contains_any_evaluator(config)
    elif evaluator_type == "equals_expected":
        return _create_equals_expected_evaluator(config)
    elif evaluator_type == "exact_match":
        return _create_equals_expected_evaluator(config)
    elif evaluator_type == "is_instance":
        return _create_is_instance_evaluator(config)
    elif evaluator_type == "llm_judge":
        return _create_llm_judge_evaluator(config)
    elif evaluator_type == "min_length":
        return _create_min_length_evaluator(config)
    elif evaluator_type == "max_length":
        return _create_max_length_evaluator(config)

    # Tactus-specific evaluators
    elif evaluator_type == "max_iterations":
        return _create_max_iterations_evaluator(config)
    elif evaluator_type == "max_cost":
        return _create_max_cost_evaluator(config)
    elif evaluator_type == "max_tokens":
        return _create_max_tokens_evaluator(config)

    # Trace-based evaluators
    elif evaluator_type == "tool_called":
        return _create_tool_called_evaluator(config)
    elif evaluator_type == "state_check":
        return _create_state_check_evaluator(config)
    elif evaluator_type == "agent_turns":
        return _create_agent_turns_evaluator(config)

    # Advanced evaluators
    elif evaluator_type == "regex":
        return _create_regex_evaluator(config)
    elif evaluator_type == "json_schema":
        return _create_json_schema_evaluator(config)
    elif evaluator_type == "range":
        return _create_range_evaluator(config)

    else:
        raise ValueError(f"Unknown evaluator type: {config.type}")


def _create_contains_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create Contains evaluator."""
    if config.value is None:
        raise ValueError("Contains evaluator requires 'value' field")

    # If a field is specified, create custom evaluator for that field
    if config.field:

        @dataclass
        class FieldContains(Evaluator):
            """Check if specific field contains value."""

            field: str
            value: str
            case_sensitive: bool = True

            def evaluate(self, ctx: EvaluatorContext) -> bool:
                """Check if field contains value."""
                # Get field value
                if isinstance(ctx.output, dict):
                    output = str(ctx.output.get(self.field, ""))
                else:
                    output = str(ctx.output)

                # Check contains
                if self.case_sensitive:
                    return self.value in output
                else:
                    return self.value.lower() in output.lower()

        return FieldContains(
            field=config.field,
            value=config.value,
            case_sensitive=True,
        )

    # Otherwise use standard Contains (checks entire output)
    return Contains(
        value=config.value,
        case_sensitive=True,
    )


def _create_contains_any_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create custom ContainsAny evaluator that checks for any of multiple values."""

    @dataclass
    class ContainsAny(Evaluator):
        """Check if output contains any of the specified values."""

        field: Optional[str] = None
        check_expected: Optional[str] = None
        values: Optional[list] = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if output contains any of the values."""
            # Get the values to check
            if self.values:
                check_values = self.values
            elif self.check_expected and ctx.expected_output:
                check_values = ctx.expected_output.get(self.check_expected, [])
            else:
                return False

            # Get the output to check
            if self.field and isinstance(ctx.output, dict):
                output = ctx.output.get(self.field, "")
            else:
                output = str(ctx.output)

            # Check if any value is in output
            output_lower = output.lower()
            for value in check_values:
                if str(value).lower() in output_lower:
                    return True
            return False

    return ContainsAny(
        field=config.field,
        check_expected=config.check_expected,
        values=config.value if isinstance(config.value, list) else None,
    )


def _create_equals_expected_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create EqualsExpected evaluator or field-specific equality check."""

    # If a field is specified, create custom evaluator for that field
    if config.field:

        @dataclass
        class FieldEquals(Evaluator):
            """Check if specific field equals expected value."""

            field: str

            def evaluate(self, ctx: EvaluatorContext) -> bool:
                """Check if field equals expected value."""
                if not ctx.expected_output:
                    return True  # No expected output to compare

                # Get actual field value
                if isinstance(ctx.output, dict):
                    actual = ctx.output.get(self.field)
                else:
                    return False

                # Get expected field value
                expected = ctx.expected_output.get(self.field)

                return actual == expected

        return FieldEquals(field=config.field)

    # Otherwise use standard EqualsExpected (compares entire output)
    return EqualsExpected()


def _create_is_instance_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create IsInstance evaluator."""
    if config.value is None:
        raise ValueError("IsInstance evaluator requires 'value' field (type name)")

    return IsInstance(type_name=config.value)


def _create_llm_judge_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create LLMJudge evaluator."""
    if config.rubric is None:
        raise ValueError("LLMJudge evaluator requires 'rubric' field")

    # Note: include_expected is not a standard LLMJudge parameter
    # The rubric itself should specify if comparison is needed
    return LLMJudge(
        rubric=config.rubric,
        model=config.model or "openai:gpt-4o",
        include_input=True,
    )


def _create_min_length_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create custom MinLength evaluator."""

    @dataclass
    class MinLength(Evaluator):
        """Check if output meets minimum length."""

        field: Optional[str] = None
        min_length: int = 0
        check_expected: Optional[str] = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if output meets minimum length."""
            # Get min_length from expected_output if specified
            min_len = self.min_length
            if self.check_expected and ctx.expected_output:
                min_len = ctx.expected_output.get(self.check_expected, min_len)

            # Get the output to check
            if self.field and isinstance(ctx.output, dict):
                output = ctx.output.get(self.field, "")
            else:
                output = ctx.output

            # Check length
            if isinstance(output, (list, dict)):
                return len(output) >= min_len
            return len(str(output)) >= min_len

    return MinLength(
        field=config.field,
        min_length=config.value or 0,
        check_expected=config.check_expected,
    )


def _create_max_length_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create custom MaxLength evaluator."""

    @dataclass
    class MaxLength(Evaluator):
        """Check if output doesn't exceed maximum length."""

        field: Optional[str] = None
        max_length: int = 0

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if output doesn't exceed maximum length."""
            # Get the output to check
            if self.field and isinstance(ctx.output, dict):
                output = ctx.output.get(self.field, "")
            else:
                output = ctx.output

            # Check length
            if isinstance(output, (list, dict)):
                return len(output) <= self.max_length
            return len(str(output)) <= self.max_length

    return MaxLength(
        field=config.field,
        max_length=config.value or 0,
    )


def _create_max_iterations_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create Tactus-specific MaxIterations evaluator."""

    @dataclass
    class MaxIterations(Evaluator):
        """Check if procedure completed within iteration limit."""

        max_iterations: int

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if iterations are within limit."""
            # Check metadata for iterations count
            if hasattr(ctx, "metadata") and ctx.metadata:
                iterations = ctx.metadata.get("iterations", 0)
                return iterations <= self.max_iterations

            # Check output for iterations field
            if isinstance(ctx.output, dict):
                iterations = ctx.output.get("iterations", 0)
                return iterations <= self.max_iterations

            return True  # Pass if we can't find iterations

    return MaxIterations(max_iterations=config.value or 10)


def _create_max_cost_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create Tactus-specific MaxCost evaluator."""

    @dataclass
    class MaxCost(Evaluator):
        """Check if procedure cost is within budget."""

        max_cost: float

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if cost is within budget."""
            # Check metadata for cost
            if hasattr(ctx, "metadata") and ctx.metadata:
                cost = ctx.metadata.get("total_cost", 0.0)
                return cost <= self.max_cost

            # Check output for cost field
            if isinstance(ctx.output, dict):
                cost = ctx.output.get("total_cost", 0.0)
                return cost <= self.max_cost

            return True  # Pass if we can't find cost

    return MaxCost(max_cost=config.value or 1.0)


def _create_max_tokens_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create Tactus-specific MaxTokens evaluator."""

    @dataclass
    class MaxTokens(Evaluator):
        """Check if token usage is within limit."""

        max_tokens: int

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if tokens are within limit."""
            # Check metadata for tokens
            if hasattr(ctx, "metadata") and ctx.metadata:
                tokens = ctx.metadata.get("total_tokens", 0)
                return tokens <= self.max_tokens

            # Check output for tokens field
            if isinstance(ctx.output, dict):
                tokens = ctx.output.get("total_tokens", 0)
                return tokens <= self.max_tokens

            return True  # Pass if we can't find tokens

    return MaxTokens(max_tokens=config.value or 10000)


def _create_tool_called_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that checks if specific tool was called."""

    @dataclass
    class ToolCalled(TraceAwareEvaluator, Evaluator):
        """Check if tool was called during execution."""

        tool_name: str
        min_calls: int = 1
        max_calls: Optional[int] = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if tool was called the expected number of times."""
            trace = self.get_trace(ctx)
            tool_calls = trace.get("tool_calls", [])

            # Count calls to this tool
            count = sum(1 for call in tool_calls if call.get("name") == self.tool_name)

            if count < self.min_calls:
                return False
            if self.max_calls is not None and count > self.max_calls:
                return False
            return True

    return ToolCalled(
        tool_name=config.value,
        min_calls=getattr(config, "min_value", None) or 1,
        max_calls=getattr(config, "max_value", None),
    )


def _create_state_check_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that checks state variable values."""

    @dataclass
    class StateCheck(TraceAwareEvaluator, Evaluator):
        """Check if state variable has expected value."""

        variable: str
        expected_value: Any

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if state variable matches expected value."""
            trace = self.get_trace(ctx)
            state_changes = trace.get("state_changes", [])

            # Find final value of variable
            for change in reversed(state_changes):
                if isinstance(change, dict) and change.get("variable") == self.variable:
                    return change.get("value") == self.expected_value

            return False

    return StateCheck(variable=config.field or "", expected_value=config.value)


def _create_agent_turns_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that checks agent turn counts."""

    @dataclass
    class AgentTurns(TraceAwareEvaluator, Evaluator):
        """Check number of agent turns."""

        agent_name: Optional[str] = None
        min_turns: int = 1
        max_turns: Optional[int] = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if agent turn count is within expected range."""
            trace = self.get_trace(ctx)
            agent_turns = trace.get("agent_turns", [])

            # Filter by agent if specified
            if self.agent_name:
                agent_turns = [t for t in agent_turns if t.get("agent") == self.agent_name]

            count = len(agent_turns)
            if count < self.min_turns:
                return False
            if self.max_turns is not None and count > self.max_turns:
                return False
            return True

    return AgentTurns(
        agent_name=config.field,
        min_turns=getattr(config, "min_value", None) or 1,
        max_turns=getattr(config, "max_value", None),
    )


def _create_regex_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that matches output against regex pattern."""
    import re

    @dataclass
    class RegexMatch(Evaluator):
        """Check if output matches regex pattern."""

        field: Optional[str] = None
        pattern: str = ""
        case_sensitive: bool = True

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if output matches the regex pattern."""
            # Get output
            if self.field and isinstance(ctx.output, dict):
                output = str(ctx.output.get(self.field, ""))
            else:
                output = str(ctx.output)

            # Match pattern
            flags = 0 if self.case_sensitive else re.IGNORECASE
            return bool(re.search(self.pattern, output, flags))

    return RegexMatch(
        field=config.field,
        pattern=config.value or "",
        case_sensitive=getattr(config, "case_sensitive", True),
    )


def _create_json_schema_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that validates output against JSON schema."""

    @dataclass
    class JSONSchemaValidator(Evaluator):
        """Validate output against JSON schema."""

        field: Optional[str] = None
        schema: dict = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Validate output against JSON schema."""
            try:
                from jsonschema import validate, ValidationError
            except ImportError:
                logger.warning("jsonschema not installed, skipping validation")
                return True

            # Get output
            if self.field and isinstance(ctx.output, dict):
                output = ctx.output.get(self.field)
            else:
                output = ctx.output

            # Validate
            try:
                validate(instance=output, schema=self.schema)
                return True
            except ValidationError:
                return False

    return JSONSchemaValidator(field=config.field, schema=config.json_schema or config.value or {})


def _create_range_evaluator(config: EvaluatorConfig) -> Evaluator:
    """Create evaluator that checks if numeric value is within range."""

    @dataclass
    class NumericRange(Evaluator):
        """Check if numeric output is within range."""

        field: Optional[str] = None
        min_value: Optional[float] = None
        max_value: Optional[float] = None

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            """Check if value is within numeric range."""
            # Get output
            if self.field and isinstance(ctx.output, dict):
                value = ctx.output.get(self.field)
            else:
                value = ctx.output

            # Convert to float
            try:
                num = float(value)
            except (ValueError, TypeError):
                return False

            # Check range
            if self.min_value is not None and num < self.min_value:
                return False
            if self.max_value is not None and num > self.max_value:
                return False
            return True

    # Extract min/max from value dict or use separate fields
    if isinstance(config.value, dict):
        min_val = config.value.get("min")
        max_val = config.value.get("max")
    else:
        min_val = getattr(config, "min_value", None)
        max_val = getattr(config, "max_value", None)

    return NumericRange(field=config.field, min_value=min_val, max_value=max_val)
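For orientation, a minimal usage sketch of the module's entry point, create_evaluator, as added in this version. This is not part of the diff: it assumes EvaluatorConfig (from tactus/testing/eval_models.py) can be constructed with the attribute names the module reads (type, value, field, rubric, model); the exact constructor signature and the concrete config values are illustrative assumptions.

# Hypothetical sketch, not from the package: build evaluators from
# Tactus-style configs. EvaluatorConfig keyword arguments are assumed to
# mirror the attributes accessed in evaluators.py (type, value, field, rubric).
from tactus.testing.eval_models import EvaluatorConfig
from tactus.testing.evaluators import create_evaluator

configs = [
    EvaluatorConfig(type="contains", value="refund", field="summary"),
    EvaluatorConfig(type="max_cost", value=0.25),
    EvaluatorConfig(type="llm_judge", rubric="Response answers the question politely."),
]

# Each call returns a pydantic_evals Evaluator instance (or raises ImportError
# if pydantic_evals is missing, ValueError for an unknown type).
evaluators = [create_evaluator(c) for c in configs]

During an evaluation run, pydantic_evals invokes each evaluator's evaluate() with an EvaluatorContext carrying output, expected_output, and metadata; the trace-based evaluators (tool_called, state_check, agent_turns) additionally expect the trace payload that the TraceAwareEvaluator mixin looks up under metadata["trace"] or output["__trace__"].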