tactus 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. tactus/__init__.py +49 -0
  2. tactus/adapters/__init__.py +9 -0
  3. tactus/adapters/broker_log.py +76 -0
  4. tactus/adapters/cli_hitl.py +189 -0
  5. tactus/adapters/cli_log.py +223 -0
  6. tactus/adapters/cost_collector_log.py +56 -0
  7. tactus/adapters/file_storage.py +367 -0
  8. tactus/adapters/http_callback_log.py +109 -0
  9. tactus/adapters/ide_log.py +71 -0
  10. tactus/adapters/lua_tools.py +336 -0
  11. tactus/adapters/mcp.py +289 -0
  12. tactus/adapters/mcp_manager.py +196 -0
  13. tactus/adapters/memory.py +53 -0
  14. tactus/adapters/plugins.py +419 -0
  15. tactus/backends/http_backend.py +58 -0
  16. tactus/backends/model_backend.py +35 -0
  17. tactus/backends/pytorch_backend.py +110 -0
  18. tactus/broker/__init__.py +12 -0
  19. tactus/broker/client.py +247 -0
  20. tactus/broker/protocol.py +183 -0
  21. tactus/broker/server.py +1123 -0
  22. tactus/broker/stdio.py +12 -0
  23. tactus/cli/__init__.py +7 -0
  24. tactus/cli/app.py +2245 -0
  25. tactus/cli/commands/__init__.py +0 -0
  26. tactus/core/__init__.py +32 -0
  27. tactus/core/config_manager.py +790 -0
  28. tactus/core/dependencies/__init__.py +14 -0
  29. tactus/core/dependencies/registry.py +180 -0
  30. tactus/core/dsl_stubs.py +2117 -0
  31. tactus/core/exceptions.py +66 -0
  32. tactus/core/execution_context.py +480 -0
  33. tactus/core/lua_sandbox.py +508 -0
  34. tactus/core/message_history_manager.py +236 -0
  35. tactus/core/mocking.py +286 -0
  36. tactus/core/output_validator.py +291 -0
  37. tactus/core/registry.py +499 -0
  38. tactus/core/runtime.py +2907 -0
  39. tactus/core/template_resolver.py +142 -0
  40. tactus/core/yaml_parser.py +301 -0
  41. tactus/docker/Dockerfile +61 -0
  42. tactus/docker/entrypoint.sh +69 -0
  43. tactus/dspy/__init__.py +39 -0
  44. tactus/dspy/agent.py +1144 -0
  45. tactus/dspy/broker_lm.py +181 -0
  46. tactus/dspy/config.py +212 -0
  47. tactus/dspy/history.py +196 -0
  48. tactus/dspy/module.py +405 -0
  49. tactus/dspy/prediction.py +318 -0
  50. tactus/dspy/signature.py +185 -0
  51. tactus/formatting/__init__.py +7 -0
  52. tactus/formatting/formatter.py +437 -0
  53. tactus/ide/__init__.py +9 -0
  54. tactus/ide/coding_assistant.py +343 -0
  55. tactus/ide/server.py +2223 -0
  56. tactus/primitives/__init__.py +49 -0
  57. tactus/primitives/control.py +168 -0
  58. tactus/primitives/file.py +229 -0
  59. tactus/primitives/handles.py +378 -0
  60. tactus/primitives/host.py +94 -0
  61. tactus/primitives/human.py +342 -0
  62. tactus/primitives/json.py +189 -0
  63. tactus/primitives/log.py +187 -0
  64. tactus/primitives/message_history.py +157 -0
  65. tactus/primitives/model.py +163 -0
  66. tactus/primitives/procedure.py +564 -0
  67. tactus/primitives/procedure_callable.py +318 -0
  68. tactus/primitives/retry.py +155 -0
  69. tactus/primitives/session.py +152 -0
  70. tactus/primitives/state.py +182 -0
  71. tactus/primitives/step.py +209 -0
  72. tactus/primitives/system.py +93 -0
  73. tactus/primitives/tool.py +375 -0
  74. tactus/primitives/tool_handle.py +279 -0
  75. tactus/primitives/toolset.py +229 -0
  76. tactus/protocols/__init__.py +38 -0
  77. tactus/protocols/chat_recorder.py +81 -0
  78. tactus/protocols/config.py +97 -0
  79. tactus/protocols/cost.py +31 -0
  80. tactus/protocols/hitl.py +71 -0
  81. tactus/protocols/log_handler.py +27 -0
  82. tactus/protocols/models.py +355 -0
  83. tactus/protocols/result.py +33 -0
  84. tactus/protocols/storage.py +90 -0
  85. tactus/providers/__init__.py +13 -0
  86. tactus/providers/base.py +92 -0
  87. tactus/providers/bedrock.py +117 -0
  88. tactus/providers/google.py +105 -0
  89. tactus/providers/openai.py +98 -0
  90. tactus/sandbox/__init__.py +63 -0
  91. tactus/sandbox/config.py +171 -0
  92. tactus/sandbox/container_runner.py +1099 -0
  93. tactus/sandbox/docker_manager.py +433 -0
  94. tactus/sandbox/entrypoint.py +227 -0
  95. tactus/sandbox/protocol.py +213 -0
  96. tactus/stdlib/__init__.py +10 -0
  97. tactus/stdlib/io/__init__.py +13 -0
  98. tactus/stdlib/io/csv.py +88 -0
  99. tactus/stdlib/io/excel.py +136 -0
  100. tactus/stdlib/io/file.py +90 -0
  101. tactus/stdlib/io/fs.py +154 -0
  102. tactus/stdlib/io/hdf5.py +121 -0
  103. tactus/stdlib/io/json.py +109 -0
  104. tactus/stdlib/io/parquet.py +83 -0
  105. tactus/stdlib/io/tsv.py +88 -0
  106. tactus/stdlib/loader.py +274 -0
  107. tactus/stdlib/tac/tactus/tools/done.tac +33 -0
  108. tactus/stdlib/tac/tactus/tools/log.tac +50 -0
  109. tactus/testing/README.md +273 -0
  110. tactus/testing/__init__.py +61 -0
  111. tactus/testing/behave_integration.py +380 -0
  112. tactus/testing/context.py +486 -0
  113. tactus/testing/eval_models.py +114 -0
  114. tactus/testing/evaluation_runner.py +222 -0
  115. tactus/testing/evaluators.py +634 -0
  116. tactus/testing/events.py +94 -0
  117. tactus/testing/gherkin_parser.py +134 -0
  118. tactus/testing/mock_agent.py +315 -0
  119. tactus/testing/mock_dependencies.py +234 -0
  120. tactus/testing/mock_hitl.py +171 -0
  121. tactus/testing/mock_registry.py +168 -0
  122. tactus/testing/mock_tools.py +133 -0
  123. tactus/testing/models.py +115 -0
  124. tactus/testing/pydantic_eval_runner.py +508 -0
  125. tactus/testing/steps/__init__.py +13 -0
  126. tactus/testing/steps/builtin.py +902 -0
  127. tactus/testing/steps/custom.py +69 -0
  128. tactus/testing/steps/registry.py +68 -0
  129. tactus/testing/test_runner.py +489 -0
  130. tactus/tracing/__init__.py +5 -0
  131. tactus/tracing/trace_manager.py +417 -0
  132. tactus/utils/__init__.py +1 -0
  133. tactus/utils/cost_calculator.py +72 -0
  134. tactus/utils/model_pricing.py +132 -0
  135. tactus/utils/safe_file_library.py +502 -0
  136. tactus/utils/safe_libraries.py +234 -0
  137. tactus/validation/LuaLexerBase.py +66 -0
  138. tactus/validation/LuaParserBase.py +23 -0
  139. tactus/validation/README.md +224 -0
  140. tactus/validation/__init__.py +7 -0
  141. tactus/validation/error_listener.py +21 -0
  142. tactus/validation/generated/LuaLexer.interp +231 -0
  143. tactus/validation/generated/LuaLexer.py +5548 -0
  144. tactus/validation/generated/LuaLexer.tokens +124 -0
  145. tactus/validation/generated/LuaLexerBase.py +66 -0
  146. tactus/validation/generated/LuaParser.interp +173 -0
  147. tactus/validation/generated/LuaParser.py +6439 -0
  148. tactus/validation/generated/LuaParser.tokens +124 -0
  149. tactus/validation/generated/LuaParserBase.py +23 -0
  150. tactus/validation/generated/LuaParserVisitor.py +118 -0
  151. tactus/validation/generated/__init__.py +7 -0
  152. tactus/validation/grammar/LuaLexer.g4 +123 -0
  153. tactus/validation/grammar/LuaParser.g4 +178 -0
  154. tactus/validation/semantic_visitor.py +817 -0
  155. tactus/validation/validator.py +157 -0
  156. tactus-0.31.0.dist-info/METADATA +1809 -0
  157. tactus-0.31.0.dist-info/RECORD +160 -0
  158. tactus-0.31.0.dist-info/WHEEL +4 -0
  159. tactus-0.31.0.dist-info/entry_points.txt +2 -0
  160. tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
tactus/testing/evaluators.py
@@ -0,0 +1,634 @@
+ """
+ Evaluator mapping for Pydantic Evals integration.
+
+ This module maps Tactus evaluator configurations to Pydantic Evals
+ evaluator instances, including both built-in and custom evaluators.
+ """
+
+ import logging
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from .eval_models import EvaluatorConfig
+ from typing import Any
+
+ logger = logging.getLogger(__name__)
+
+ # Check if pydantic_evals is available
+ try:
+     from pydantic_evals.evaluators import (
+         Evaluator,
+         EvaluatorContext,
+         Contains,
+         EqualsExpected,
+         IsInstance,
+         LLMJudge,
+     )
+
+     PYDANTIC_EVALS_AVAILABLE = True
+ except ImportError:
+     PYDANTIC_EVALS_AVAILABLE = False
+
+     # Create dummy base class for type hints
+     class Evaluator:
+         pass
+
+     class EvaluatorContext:
+         pass
+
+
+ @dataclass
+ class TraceAwareEvaluator:
+     """
+     Mixin class for evaluators that inspect execution traces.
+
+     Provides helper methods to extract trace from context metadata or output.
+     Subclasses should also inherit from Evaluator and implement evaluate().
+     """
+
+     def get_trace(self, ctx: EvaluatorContext) -> dict:
+         """
+         Extract trace from context.
+
+         Trace can be in:
+         1. ctx.metadata['trace'] - if passed via Case metadata
+         2. ctx.output['__trace__'] - if returned by task function
+
+         Args:
+             ctx: Evaluator context
+
+         Returns:
+             Trace dictionary (empty dict if no trace found)
+         """
+         # Try metadata first
+         if hasattr(ctx, "metadata") and ctx.metadata:
+             trace = ctx.metadata.get("trace", {})
+             if trace:
+                 return trace
+
+         # Try output
+         if isinstance(ctx.output, dict) and "__trace__" in ctx.output:
+             return ctx.output["__trace__"]
+
+         return {}
+
+     def get_output(self, ctx: EvaluatorContext) -> Any:
+         """
+         Extract actual output (without trace wrapper).
+
+         Args:
+             ctx: Evaluator context
+
+         Returns:
+             Actual output value
+         """
+         if isinstance(ctx.output, dict) and "__output__" in ctx.output:
+             return ctx.output["__output__"]
+         return ctx.output
+
+
+ def create_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """
+     Create a Pydantic Evals evaluator from Tactus config.
+
+     Args:
+         config: Tactus evaluator configuration
+
+     Returns:
+         Pydantic Evals Evaluator instance
+
+     Raises:
+         ValueError: If evaluator type is unknown
+         ImportError: If pydantic_evals is not installed
+     """
+     if not PYDANTIC_EVALS_AVAILABLE:
+         raise ImportError("pydantic_evals is required. Install with: pip install pydantic-evals")
+
+     evaluator_type = config.type.lower()
+
+     # Built-in Pydantic Evals evaluators
+     if evaluator_type == "contains":
+         return _create_contains_evaluator(config)
+     elif evaluator_type == "contains_any":
+         return _create_contains_any_evaluator(config)
+     elif evaluator_type == "equals_expected":
+         return _create_equals_expected_evaluator(config)
+     elif evaluator_type == "exact_match":
+         return _create_equals_expected_evaluator(config)
+     elif evaluator_type == "is_instance":
+         return _create_is_instance_evaluator(config)
+     elif evaluator_type == "llm_judge":
+         return _create_llm_judge_evaluator(config)
+     elif evaluator_type == "min_length":
+         return _create_min_length_evaluator(config)
+     elif evaluator_type == "max_length":
+         return _create_max_length_evaluator(config)
+
+     # Tactus-specific evaluators
+     elif evaluator_type == "max_iterations":
+         return _create_max_iterations_evaluator(config)
+     elif evaluator_type == "max_cost":
+         return _create_max_cost_evaluator(config)
+     elif evaluator_type == "max_tokens":
+         return _create_max_tokens_evaluator(config)
+
+     # Trace-based evaluators
+     elif evaluator_type == "tool_called":
+         return _create_tool_called_evaluator(config)
+     elif evaluator_type == "state_check":
+         return _create_state_check_evaluator(config)
+     elif evaluator_type == "agent_turns":
+         return _create_agent_turns_evaluator(config)
+
+     # Advanced evaluators
+     elif evaluator_type == "regex":
+         return _create_regex_evaluator(config)
+     elif evaluator_type == "json_schema":
+         return _create_json_schema_evaluator(config)
+     elif evaluator_type == "range":
+         return _create_range_evaluator(config)
+
+     else:
+         raise ValueError(f"Unknown evaluator type: {config.type}")
+
+
+ def _create_contains_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create Contains evaluator."""
+     if config.value is None:
+         raise ValueError("Contains evaluator requires 'value' field")
+
+     # If a field is specified, create custom evaluator for that field
+     if config.field:
+
+         @dataclass
+         class FieldContains(Evaluator):
+             """Check if specific field contains value."""
+
+             field: str
+             value: str
+             case_sensitive: bool = True
+
+             def evaluate(self, ctx: EvaluatorContext) -> bool:
+                 """Check if field contains value."""
+                 # Get field value
+                 if isinstance(ctx.output, dict):
+                     output = str(ctx.output.get(self.field, ""))
+                 else:
+                     output = str(ctx.output)
+
+                 # Check contains
+                 if self.case_sensitive:
+                     return self.value in output
+                 else:
+                     return self.value.lower() in output.lower()
+
+         return FieldContains(
+             field=config.field,
+             value=config.value,
+             case_sensitive=True,
+         )
+
+     # Otherwise use standard Contains (checks entire output)
+     return Contains(
+         value=config.value,
+         case_sensitive=True,
+     )
+
+
+ def _create_contains_any_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create custom ContainsAny evaluator that checks for any of multiple values."""
+
+     @dataclass
+     class ContainsAny(Evaluator):
+         """Check if output contains any of the specified values."""
+
+         field: Optional[str] = None
+         check_expected: Optional[str] = None
+         values: Optional[list] = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if output contains any of the values."""
+             # Get the values to check
+             if self.values:
+                 check_values = self.values
+             elif self.check_expected and ctx.expected_output:
+                 check_values = ctx.expected_output.get(self.check_expected, [])
+             else:
+                 return False
+
+             # Get the output to check
+             if self.field and isinstance(ctx.output, dict):
+                 output = ctx.output.get(self.field, "")
+             else:
+                 output = str(ctx.output)
+
+             # Check if any value is in output
+             output_lower = output.lower()
+             for value in check_values:
+                 if str(value).lower() in output_lower:
+                     return True
+             return False
+
+     return ContainsAny(
+         field=config.field,
+         check_expected=config.check_expected,
+         values=config.value if isinstance(config.value, list) else None,
+     )
+
+
+ def _create_equals_expected_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create EqualsExpected evaluator or field-specific equality check."""
+
+     # If a field is specified, create custom evaluator for that field
+     if config.field:
+
+         @dataclass
+         class FieldEquals(Evaluator):
+             """Check if specific field equals expected value."""
+
+             field: str
+
+             def evaluate(self, ctx: EvaluatorContext) -> bool:
+                 """Check if field equals expected value."""
+                 if not ctx.expected_output:
+                     return True  # No expected output to compare
+
+                 # Get actual field value
+                 if isinstance(ctx.output, dict):
+                     actual = ctx.output.get(self.field)
+                 else:
+                     return False
+
+                 # Get expected field value
+                 expected = ctx.expected_output.get(self.field)
+
+                 return actual == expected
+
+         return FieldEquals(field=config.field)
+
+     # Otherwise use standard EqualsExpected (compares entire output)
+     return EqualsExpected()
+
+
+ def _create_is_instance_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create IsInstance evaluator."""
+     if config.value is None:
+         raise ValueError("IsInstance evaluator requires 'value' field (type name)")
+
+     return IsInstance(type_name=config.value)
+
+
+ def _create_llm_judge_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create LLMJudge evaluator."""
+     if config.rubric is None:
+         raise ValueError("LLMJudge evaluator requires 'rubric' field")
+
+     # Note: include_expected is not a standard LLMJudge parameter
+     # The rubric itself should specify if comparison is needed
+     return LLMJudge(
+         rubric=config.rubric,
+         model=config.model or "openai:gpt-4o",
+         include_input=True,
+     )
+
+
+ def _create_min_length_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create custom MinLength evaluator."""
+
+     @dataclass
+     class MinLength(Evaluator):
+         """Check if output meets minimum length."""
+
+         field: Optional[str] = None
+         min_length: int = 0
+         check_expected: Optional[str] = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if output meets minimum length."""
+             # Get min_length from expected_output if specified
+             min_len = self.min_length
+             if self.check_expected and ctx.expected_output:
+                 min_len = ctx.expected_output.get(self.check_expected, min_len)
+
+             # Get the output to check
+             if self.field and isinstance(ctx.output, dict):
+                 output = ctx.output.get(self.field, "")
+             else:
+                 output = ctx.output
+
+             # Check length
+             if isinstance(output, (list, dict)):
+                 return len(output) >= min_len
+             return len(str(output)) >= min_len
+
+     return MinLength(
+         field=config.field,
+         min_length=config.value or 0,
+         check_expected=config.check_expected,
+     )
+
+
+ def _create_max_length_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create custom MaxLength evaluator."""
+
+     @dataclass
+     class MaxLength(Evaluator):
+         """Check if output doesn't exceed maximum length."""
+
+         field: Optional[str] = None
+         max_length: int = 0
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if output doesn't exceed maximum length."""
+             # Get the output to check
+             if self.field and isinstance(ctx.output, dict):
+                 output = ctx.output.get(self.field, "")
+             else:
+                 output = ctx.output
+
+             # Check length
+             if isinstance(output, (list, dict)):
+                 return len(output) <= self.max_length
+             return len(str(output)) <= self.max_length
+
+     return MaxLength(
+         field=config.field,
+         max_length=config.value or 0,
+     )
+
+
+ def _create_max_iterations_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create Tactus-specific MaxIterations evaluator."""
+
+     @dataclass
+     class MaxIterations(Evaluator):
+         """Check if procedure completed within iteration limit."""
+
+         max_iterations: int
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if iterations are within limit."""
+             # Check metadata for iterations count
+             if hasattr(ctx, "metadata") and ctx.metadata:
+                 iterations = ctx.metadata.get("iterations", 0)
+                 return iterations <= self.max_iterations
+
+             # Check output for iterations field
+             if isinstance(ctx.output, dict):
+                 iterations = ctx.output.get("iterations", 0)
+                 return iterations <= self.max_iterations
+
+             return True  # Pass if we can't find iterations
+
+     return MaxIterations(max_iterations=config.value or 10)
+
+
+ def _create_max_cost_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create Tactus-specific MaxCost evaluator."""
+
+     @dataclass
+     class MaxCost(Evaluator):
+         """Check if procedure cost is within budget."""
+
+         max_cost: float
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if cost is within budget."""
+             # Check metadata for cost
+             if hasattr(ctx, "metadata") and ctx.metadata:
+                 cost = ctx.metadata.get("total_cost", 0.0)
+                 return cost <= self.max_cost
+
+             # Check output for cost field
+             if isinstance(ctx.output, dict):
+                 cost = ctx.output.get("total_cost", 0.0)
+                 return cost <= self.max_cost
+
+             return True  # Pass if we can't find cost
+
+     return MaxCost(max_cost=config.value or 1.0)
+
+
+ def _create_max_tokens_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create Tactus-specific MaxTokens evaluator."""
+
+     @dataclass
+     class MaxTokens(Evaluator):
+         """Check if token usage is within limit."""
+
+         max_tokens: int
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if tokens are within limit."""
+             # Check metadata for tokens
+             if hasattr(ctx, "metadata") and ctx.metadata:
+                 tokens = ctx.metadata.get("total_tokens", 0)
+                 return tokens <= self.max_tokens
+
+             # Check output for tokens field
+             if isinstance(ctx.output, dict):
+                 tokens = ctx.output.get("total_tokens", 0)
+                 return tokens <= self.max_tokens
+
+             return True  # Pass if we can't find tokens
+
+     return MaxTokens(max_tokens=config.value or 10000)
+
+
+ def _create_tool_called_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that checks if specific tool was called."""
+
+     @dataclass
+     class ToolCalled(TraceAwareEvaluator, Evaluator):
+         """Check if tool was called during execution."""
+
+         tool_name: str
+         min_calls: int = 1
+         max_calls: Optional[int] = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if tool was called the expected number of times."""
+             trace = self.get_trace(ctx)
+             tool_calls = trace.get("tool_calls", [])
+
+             # Count calls to this tool
+             count = sum(1 for call in tool_calls if call.get("name") == self.tool_name)
+
+             if count < self.min_calls:
+                 return False
+             if self.max_calls is not None and count > self.max_calls:
+                 return False
+             return True
+
+     return ToolCalled(
+         tool_name=config.value,
+         min_calls=getattr(config, "min_value", None) or 1,
+         max_calls=getattr(config, "max_value", None),
+     )
+
+
+ def _create_state_check_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that checks state variable values."""
+
+     @dataclass
+     class StateCheck(TraceAwareEvaluator, Evaluator):
+         """Check if state variable has expected value."""
+
+         variable: str
+         expected_value: Any
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if state variable matches expected value."""
+             trace = self.get_trace(ctx)
+             state_changes = trace.get("state_changes", [])
+
+             # Find final value of variable
+             for change in reversed(state_changes):
+                 if isinstance(change, dict) and change.get("variable") == self.variable:
+                     return change.get("value") == self.expected_value
+
+             return False
+
+     return StateCheck(variable=config.field or "", expected_value=config.value)
+
+
+ def _create_agent_turns_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that checks agent turn counts."""
+
+     @dataclass
+     class AgentTurns(TraceAwareEvaluator, Evaluator):
+         """Check number of agent turns."""
+
+         agent_name: Optional[str] = None
+         min_turns: int = 1
+         max_turns: Optional[int] = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if agent turn count is within expected range."""
+             trace = self.get_trace(ctx)
+             agent_turns = trace.get("agent_turns", [])
+
+             # Filter by agent if specified
+             if self.agent_name:
+                 agent_turns = [t for t in agent_turns if t.get("agent") == self.agent_name]
+
+             count = len(agent_turns)
+             if count < self.min_turns:
+                 return False
+             if self.max_turns is not None and count > self.max_turns:
+                 return False
+             return True
+
+     return AgentTurns(
+         agent_name=config.field,
+         min_turns=getattr(config, "min_value", None) or 1,
+         max_turns=getattr(config, "max_value", None),
+     )
+
+
+ def _create_regex_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that matches output against regex pattern."""
+     import re
+
+     @dataclass
+     class RegexMatch(Evaluator):
+         """Check if output matches regex pattern."""
+
+         field: Optional[str] = None
+         pattern: str = ""
+         case_sensitive: bool = True
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if output matches the regex pattern."""
+             # Get output
+             if self.field and isinstance(ctx.output, dict):
+                 output = str(ctx.output.get(self.field, ""))
+             else:
+                 output = str(ctx.output)
+
+             # Match pattern
+             flags = 0 if self.case_sensitive else re.IGNORECASE
+             return bool(re.search(self.pattern, output, flags))
+
+     return RegexMatch(
+         field=config.field,
+         pattern=config.value or "",
+         case_sensitive=getattr(config, "case_sensitive", True),
+     )
+
+
+ def _create_json_schema_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that validates output against JSON schema."""
+
+     @dataclass
+     class JSONSchemaValidator(Evaluator):
+         """Validate output against JSON schema."""
+
+         field: Optional[str] = None
+         schema: dict = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Validate output against JSON schema."""
+             try:
+                 from jsonschema import validate, ValidationError
+             except ImportError:
+                 logger.warning("jsonschema not installed, skipping validation")
+                 return True
+
+             # Get output
+             if self.field and isinstance(ctx.output, dict):
+                 output = ctx.output.get(self.field)
+             else:
+                 output = ctx.output
+
+             # Validate
+             try:
+                 validate(instance=output, schema=self.schema)
+                 return True
+             except ValidationError:
+                 return False
+
+     return JSONSchemaValidator(field=config.field, schema=config.json_schema or config.value or {})
+
+
+ def _create_range_evaluator(config: EvaluatorConfig) -> Evaluator:
+     """Create evaluator that checks if numeric value is within range."""
+
+     @dataclass
+     class NumericRange(Evaluator):
+         """Check if numeric output is within range."""
+
+         field: Optional[str] = None
+         min_value: Optional[float] = None
+         max_value: Optional[float] = None
+
+         def evaluate(self, ctx: EvaluatorContext) -> bool:
+             """Check if value is within numeric range."""
+             # Get output
+             if self.field and isinstance(ctx.output, dict):
+                 value = ctx.output.get(self.field)
+             else:
+                 value = ctx.output
+
+             # Convert to float
+             try:
+                 num = float(value)
+             except (ValueError, TypeError):
+                 return False
+
+             # Check range
+             if self.min_value is not None and num < self.min_value:
+                 return False
+             if self.max_value is not None and num > self.max_value:
+                 return False
+             return True
+
+     # Extract min/max from value dict or use separate fields
+     if isinstance(config.value, dict):
+         min_val = config.value.get("min")
+         max_val = config.value.get("max")
+     else:
+         min_val = getattr(config, "min_value", None)
+         max_val = getattr(config, "max_value", None)
+
+     return NumericRange(field=config.field, min_value=min_val, max_value=max_val)
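
The listing above corresponds to the new tactus/testing/evaluators.py module. As a rough sketch of how the create_evaluator() factory and the trace convention fit together, the example below builds a trace-aware evaluator from a config and runs it against a wrapped output. It is illustrative only: passing the fields to EvaluatorConfig as keyword arguments is an assumption (its definition lives in eval_models.py, not shown here), and the SimpleNamespace stand-in only mimics the context attributes these evaluators actually read; in real use the EvaluatorContext comes from pydantic_evals when a dataset case runs.

# Hypothetical usage sketch; not part of the wheel contents above.
from types import SimpleNamespace

from tactus.testing.eval_models import EvaluatorConfig
from tactus.testing.evaluators import create_evaluator

# "tool_called" dispatches to the trace-aware ToolCalled evaluator defined above.
# Keyword construction of EvaluatorConfig is assumed for illustration.
config = EvaluatorConfig(type="tool_called", value="search")
evaluator = create_evaluator(config)

# Trace-aware evaluators read the trace either from Case metadata under "trace"
# or from a dict output under "__trace__", with the real output under "__output__".
wrapped_output = {
    "__output__": "search completed",
    "__trace__": {
        "tool_calls": [{"name": "search", "args": {"query": "tactus"}}],
        "agent_turns": [{"agent": "researcher"}],
        "state_changes": [{"variable": "done", "value": True}],
    },
}

# Stand-in context exposing only the attributes these evaluators touch.
ctx = SimpleNamespace(output=wrapped_output, metadata=None, expected_output=None)
assert evaluator.evaluate(ctx) is True  # "search" was called at least once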