tactus-0.31.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. tactus/__init__.py +49 -0
  2. tactus/adapters/__init__.py +9 -0
  3. tactus/adapters/broker_log.py +76 -0
  4. tactus/adapters/cli_hitl.py +189 -0
  5. tactus/adapters/cli_log.py +223 -0
  6. tactus/adapters/cost_collector_log.py +56 -0
  7. tactus/adapters/file_storage.py +367 -0
  8. tactus/adapters/http_callback_log.py +109 -0
  9. tactus/adapters/ide_log.py +71 -0
  10. tactus/adapters/lua_tools.py +336 -0
  11. tactus/adapters/mcp.py +289 -0
  12. tactus/adapters/mcp_manager.py +196 -0
  13. tactus/adapters/memory.py +53 -0
  14. tactus/adapters/plugins.py +419 -0
  15. tactus/backends/http_backend.py +58 -0
  16. tactus/backends/model_backend.py +35 -0
  17. tactus/backends/pytorch_backend.py +110 -0
  18. tactus/broker/__init__.py +12 -0
  19. tactus/broker/client.py +247 -0
  20. tactus/broker/protocol.py +183 -0
  21. tactus/broker/server.py +1123 -0
  22. tactus/broker/stdio.py +12 -0
  23. tactus/cli/__init__.py +7 -0
  24. tactus/cli/app.py +2245 -0
  25. tactus/cli/commands/__init__.py +0 -0
  26. tactus/core/__init__.py +32 -0
  27. tactus/core/config_manager.py +790 -0
  28. tactus/core/dependencies/__init__.py +14 -0
  29. tactus/core/dependencies/registry.py +180 -0
  30. tactus/core/dsl_stubs.py +2117 -0
  31. tactus/core/exceptions.py +66 -0
  32. tactus/core/execution_context.py +480 -0
  33. tactus/core/lua_sandbox.py +508 -0
  34. tactus/core/message_history_manager.py +236 -0
  35. tactus/core/mocking.py +286 -0
  36. tactus/core/output_validator.py +291 -0
  37. tactus/core/registry.py +499 -0
  38. tactus/core/runtime.py +2907 -0
  39. tactus/core/template_resolver.py +142 -0
  40. tactus/core/yaml_parser.py +301 -0
  41. tactus/docker/Dockerfile +61 -0
  42. tactus/docker/entrypoint.sh +69 -0
  43. tactus/dspy/__init__.py +39 -0
  44. tactus/dspy/agent.py +1144 -0
  45. tactus/dspy/broker_lm.py +181 -0
  46. tactus/dspy/config.py +212 -0
  47. tactus/dspy/history.py +196 -0
  48. tactus/dspy/module.py +405 -0
  49. tactus/dspy/prediction.py +318 -0
  50. tactus/dspy/signature.py +185 -0
  51. tactus/formatting/__init__.py +7 -0
  52. tactus/formatting/formatter.py +437 -0
  53. tactus/ide/__init__.py +9 -0
  54. tactus/ide/coding_assistant.py +343 -0
  55. tactus/ide/server.py +2223 -0
  56. tactus/primitives/__init__.py +49 -0
  57. tactus/primitives/control.py +168 -0
  58. tactus/primitives/file.py +229 -0
  59. tactus/primitives/handles.py +378 -0
  60. tactus/primitives/host.py +94 -0
  61. tactus/primitives/human.py +342 -0
  62. tactus/primitives/json.py +189 -0
  63. tactus/primitives/log.py +187 -0
  64. tactus/primitives/message_history.py +157 -0
  65. tactus/primitives/model.py +163 -0
  66. tactus/primitives/procedure.py +564 -0
  67. tactus/primitives/procedure_callable.py +318 -0
  68. tactus/primitives/retry.py +155 -0
  69. tactus/primitives/session.py +152 -0
  70. tactus/primitives/state.py +182 -0
  71. tactus/primitives/step.py +209 -0
  72. tactus/primitives/system.py +93 -0
  73. tactus/primitives/tool.py +375 -0
  74. tactus/primitives/tool_handle.py +279 -0
  75. tactus/primitives/toolset.py +229 -0
  76. tactus/protocols/__init__.py +38 -0
  77. tactus/protocols/chat_recorder.py +81 -0
  78. tactus/protocols/config.py +97 -0
  79. tactus/protocols/cost.py +31 -0
  80. tactus/protocols/hitl.py +71 -0
  81. tactus/protocols/log_handler.py +27 -0
  82. tactus/protocols/models.py +355 -0
  83. tactus/protocols/result.py +33 -0
  84. tactus/protocols/storage.py +90 -0
  85. tactus/providers/__init__.py +13 -0
  86. tactus/providers/base.py +92 -0
  87. tactus/providers/bedrock.py +117 -0
  88. tactus/providers/google.py +105 -0
  89. tactus/providers/openai.py +98 -0
  90. tactus/sandbox/__init__.py +63 -0
  91. tactus/sandbox/config.py +171 -0
  92. tactus/sandbox/container_runner.py +1099 -0
  93. tactus/sandbox/docker_manager.py +433 -0
  94. tactus/sandbox/entrypoint.py +227 -0
  95. tactus/sandbox/protocol.py +213 -0
  96. tactus/stdlib/__init__.py +10 -0
  97. tactus/stdlib/io/__init__.py +13 -0
  98. tactus/stdlib/io/csv.py +88 -0
  99. tactus/stdlib/io/excel.py +136 -0
  100. tactus/stdlib/io/file.py +90 -0
  101. tactus/stdlib/io/fs.py +154 -0
  102. tactus/stdlib/io/hdf5.py +121 -0
  103. tactus/stdlib/io/json.py +109 -0
  104. tactus/stdlib/io/parquet.py +83 -0
  105. tactus/stdlib/io/tsv.py +88 -0
  106. tactus/stdlib/loader.py +274 -0
  107. tactus/stdlib/tac/tactus/tools/done.tac +33 -0
  108. tactus/stdlib/tac/tactus/tools/log.tac +50 -0
  109. tactus/testing/README.md +273 -0
  110. tactus/testing/__init__.py +61 -0
  111. tactus/testing/behave_integration.py +380 -0
  112. tactus/testing/context.py +486 -0
  113. tactus/testing/eval_models.py +114 -0
  114. tactus/testing/evaluation_runner.py +222 -0
  115. tactus/testing/evaluators.py +634 -0
  116. tactus/testing/events.py +94 -0
  117. tactus/testing/gherkin_parser.py +134 -0
  118. tactus/testing/mock_agent.py +315 -0
  119. tactus/testing/mock_dependencies.py +234 -0
  120. tactus/testing/mock_hitl.py +171 -0
  121. tactus/testing/mock_registry.py +168 -0
  122. tactus/testing/mock_tools.py +133 -0
  123. tactus/testing/models.py +115 -0
  124. tactus/testing/pydantic_eval_runner.py +508 -0
  125. tactus/testing/steps/__init__.py +13 -0
  126. tactus/testing/steps/builtin.py +902 -0
  127. tactus/testing/steps/custom.py +69 -0
  128. tactus/testing/steps/registry.py +68 -0
  129. tactus/testing/test_runner.py +489 -0
  130. tactus/tracing/__init__.py +5 -0
  131. tactus/tracing/trace_manager.py +417 -0
  132. tactus/utils/__init__.py +1 -0
  133. tactus/utils/cost_calculator.py +72 -0
  134. tactus/utils/model_pricing.py +132 -0
  135. tactus/utils/safe_file_library.py +502 -0
  136. tactus/utils/safe_libraries.py +234 -0
  137. tactus/validation/LuaLexerBase.py +66 -0
  138. tactus/validation/LuaParserBase.py +23 -0
  139. tactus/validation/README.md +224 -0
  140. tactus/validation/__init__.py +7 -0
  141. tactus/validation/error_listener.py +21 -0
  142. tactus/validation/generated/LuaLexer.interp +231 -0
  143. tactus/validation/generated/LuaLexer.py +5548 -0
  144. tactus/validation/generated/LuaLexer.tokens +124 -0
  145. tactus/validation/generated/LuaLexerBase.py +66 -0
  146. tactus/validation/generated/LuaParser.interp +173 -0
  147. tactus/validation/generated/LuaParser.py +6439 -0
  148. tactus/validation/generated/LuaParser.tokens +124 -0
  149. tactus/validation/generated/LuaParserBase.py +23 -0
  150. tactus/validation/generated/LuaParserVisitor.py +118 -0
  151. tactus/validation/generated/__init__.py +7 -0
  152. tactus/validation/grammar/LuaLexer.g4 +123 -0
  153. tactus/validation/grammar/LuaParser.g4 +178 -0
  154. tactus/validation/semantic_visitor.py +817 -0
  155. tactus/validation/validator.py +157 -0
  156. tactus-0.31.0.dist-info/METADATA +1809 -0
  157. tactus-0.31.0.dist-info/RECORD +160 -0
  158. tactus-0.31.0.dist-info/WHEEL +4 -0
  159. tactus-0.31.0.dist-info/entry_points.txt +2 -0
  160. tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
tactus/testing/context.py
@@ -0,0 +1,486 @@
+ """
+ Test context for Tactus BDD testing.
+
+ Provides the context object passed to step definitions,
+ with helper methods to access procedure execution results.
+ """
+
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TactusTestContext:
+     """
+     Context object passed to step definitions.
+
+     Provides access to procedure execution results and state
+     for making assertions in step functions.
+     """
+
+     def __init__(
+         self,
+         procedure_file: Path,
+         params: Optional[Dict] = None,
+         mock_tools: Optional[Dict] = None,
+         mcp_servers: Optional[Dict] = None,
+         tool_paths: Optional[List[str]] = None,
+         mocked: bool = False,
+     ):
+         self.procedure_file = procedure_file
+         self.params = params or {}
+         self.mock_tools = mock_tools  # tool_name -> mock_response
+         self.mcp_servers = mcp_servers or {}
+         self.tool_paths = tool_paths or []
+         self.mocked = mocked  # Whether to use mocked dependencies
+         self.mock_registry = None  # Unified mock registry for dependencies + HITL
+         self.runtime = None
+         self.execution_result: Optional[Dict] = None
+         self._primitives: Dict[str, Any] = {}  # Captured primitives
+         self._procedure_executed = False
+         self.total_cost: float = 0.0  # Track total cost
+         self.total_tokens: int = 0  # Track total tokens
+         self.cost_breakdown: List[Any] = []  # Track per-call costs
+         self._agent_mock_turns: Dict[str, List[Dict[str, Any]]] = {}
+         self._scenario_message: str | None = None
+
+     def set_scenario_message(self, message: str) -> None:
+         """Set the scenario's primary injected message (for in-spec mocking coordination)."""
+         self._scenario_message = message
+
+     def get_scenario_message(self) -> str | None:
+         """Get the scenario's primary injected message, if set."""
+         return self._scenario_message
+
+     def mock_agent_response(
+         self, agent: str, message: str, when_message: str | None = None
+     ) -> None:
+         """Add a mocked agent response for this scenario (temporal; one per agent turn).
+
+         If `when_message` is provided, the mock is selected when the agent is called
+         with that exact injected message.
+         """
+         turn: Dict[str, Any] = {"message": message}
+         effective_when = when_message if when_message is not None else self._scenario_message
+         if effective_when is not None:
+             turn["when_message"] = effective_when
+         self._agent_mock_turns.setdefault(agent, []).append(turn)
+
+         # Ensure runtime exists and sees the same dict reference for this scenario.
+         if self.runtime is None:
+             self.setup_runtime()
+         if self.runtime is not None:
+             self.runtime.external_agent_mocks = self._agent_mock_turns
+
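For orientation, a minimal sketch of how a scenario might drive this mock from Python; the procedure path, agent name, and message text are invented for illustration:

from pathlib import Path
from tactus.testing.context import TactusTestContext

ctx = TactusTestContext(Path("procedures/triage.tac"), mocked=True)

# Coordinate subsequent mocks around one injected message.
ctx.set_scenario_message("Summarize ticket #123")

# The "triage" agent's next turn returns this canned message; because a
# scenario message is set, the turn carries a matching when_message.
ctx.mock_agent_response("triage", "Ticket summarized: hardware fault.")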
+     def mock_agent_tool_call(
+         self,
+         agent: str,
+         tool: str,
+         args: Dict[str, Any] | None = None,
+         when_message: str | None = None,
+     ) -> None:
+         """Add a mocked tool call to an agent's next mocked turn for this scenario."""
+         args = args or {}
+
+         effective_when = when_message if when_message is not None else self._scenario_message
+         if (
+             agent in self._agent_mock_turns
+             and self._agent_mock_turns[agent]
+             and (
+                 effective_when is None
+                 or self._agent_mock_turns[agent][-1].get("when_message") == effective_when
+             )
+         ):
+             turn = self._agent_mock_turns[agent][-1]
+         else:
+             turn = {}
+             if effective_when is not None:
+                 turn["when_message"] = effective_when
+             self._agent_mock_turns.setdefault(agent, []).append(turn)
+
+         tool_calls = turn.get("tool_calls")
+         if not isinstance(tool_calls, list):
+             tool_calls = []
+             turn["tool_calls"] = tool_calls
+
+         tool_calls.append({"tool": tool, "args": args})
+
+         if self.runtime is None:
+             self.setup_runtime()
+         if self.runtime is not None:
+             self.runtime.external_agent_mocks = self._agent_mock_turns
+
+     def mock_agent_data(
+         self, agent: str, data: Dict[str, Any], when_message: str | None = None
+     ) -> None:
+         """Set structured output mock data for an agent's next mocked turn.
+
+         This is only used when an agent has an output schema; the DSPy agent mock
+         logic will apply `data` as the structured `result.output`.
+         """
+         if not isinstance(data, dict):
+             raise TypeError("mock_agent_data expects a dict")
+
+         effective_when = when_message if when_message is not None else self._scenario_message
+         if (
+             agent in self._agent_mock_turns
+             and self._agent_mock_turns[agent]
+             and (
+                 effective_when is None
+                 or self._agent_mock_turns[agent][-1].get("when_message") == effective_when
+             )
+         ):
+             turn = self._agent_mock_turns[agent][-1]
+         else:
+             turn = {}
+             if effective_when is not None:
+                 turn["when_message"] = effective_when
+             self._agent_mock_turns.setdefault(agent, []).append(turn)
+
+         turn["data"] = data
+
+         if self.runtime is None:
+             self.setup_runtime()
+         if self.runtime is not None:
+             self.runtime.external_agent_mocks = self._agent_mock_turns
+
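Continuing the sketch above: the three helpers merge into one pending turn whenever their effective when_message matches the last turn's, so the resulting structure is roughly the following (shape inferred from the merging logic, values illustrative):

ctx.mock_agent_tool_call("triage", "lookup_ticket", {"id": 123})
ctx.mock_agent_data("triage", {"severity": "low"})

# ctx._agent_mock_turns is now approximately:
# {
#     "triage": [{
#         "message": "Ticket summarized: hardware fault.",
#         "when_message": "Summarize ticket #123",
#         "tool_calls": [{"tool": "lookup_ticket", "args": {"id": 123}}],
#         "data": {"severity": "low"},
#     }]
# }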
+     def mock_tool_returns(self, tool: str, output: Any) -> None:
+         """Configure a runtime tool mock (Mocks { tool = { returns = ... } } equivalent)."""
+         if self.runtime is None:
+             self.setup_runtime()
+         if self.runtime is None:
+             raise AssertionError("Runtime not initialized")
+
+         if self.runtime.mock_manager is None:
+             from tactus.core.mocking import MockManager
+
+             self.runtime.mock_manager = MockManager()
+
+         self.runtime.mock_manager.register_mock(tool, {"output": output})
+         self.runtime.mock_manager.enable_mock(tool)
+
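A one-line usage sketch; the tool name and payload are hypothetical. This mirrors an in-spec Mocks { web_search = { returns = ... } } declaration, but driven from test code:

ctx.mock_tool_returns("web_search", {"results": ["cached answer"]})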
+     def setup_runtime(self) -> None:
+         """Initialize TactusRuntime with storage and handlers."""
+         import os
+         from tactus.core.runtime import TactusRuntime
+         from tactus.adapters.memory import MemoryStorage
+         from tactus.testing.mock_hitl import MockHITLHandler
+         from tactus.testing.mock_registry import UnifiedMockRegistry
+         from tactus.adapters.cli_log import CLILogHandler
+
+         storage = MemoryStorage()
+
+         # Setup mock registry if in mocked mode
+         if self.mocked:
+             self.mock_registry = UnifiedMockRegistry(hitl_handler=MockHITLHandler())
+             hitl = self.mock_registry.get_hitl_handler()
+             logger.info("Mock mode enabled - using UnifiedMockRegistry")
+         else:
+             hitl = MockHITLHandler()  # Auto-approve for tests
+
+         log_handler = CLILogHandler()  # Capture cost events
+
+         # Setup mocked tool primitive if mocks configured
+         tool_primitive = None
+         if self.mock_tools:
+             self._setup_mock_tools()
+             tool_primitive = self._mocked_tool_primitive
+             logger.info("Mock mode enabled - using MockedToolPrimitive")
+
+         self.runtime = TactusRuntime(
+             procedure_id=f"test_{self.procedure_file.stem}",
+             storage_backend=storage,
+             hitl_handler=hitl,
+             tool_primitive=tool_primitive,  # Inject mocked tool if configured
+             openai_api_key=os.environ.get("OPENAI_API_KEY"),  # Pass API key for real LLM calls
+             log_handler=log_handler,  # Enable cost tracking
+             source_file_path=str(self.procedure_file.resolve()),  # For require() path resolution
+             mcp_servers=self.mcp_servers,
+             tool_paths=self.tool_paths,
+         )
+
+         # Create MockManager for handling Mocks {} blocks when in mocked mode
+         if self.mocked or self.mock_tools:
+             from tactus.core.mocking import MockManager
+
+             self.runtime.mock_manager = MockManager()
+             logger.info("Created MockManager for Mocks {} block support")
+             # Mocked-mode tests should never call real LLMs by default.
+             self.runtime.mock_all_agents = True
+
+         logger.debug(f"Setup runtime for test: {self.procedure_file.stem}")
+
+     async def run_procedure_async(self) -> None:
+         """Execute procedure asynchronously and capture results."""
+         if self._procedure_executed:
+             logger.debug("Procedure already executed, skipping")
+             return
+
+         if not self.runtime:
+             self.setup_runtime()
+
+         # Read procedure source
+         source = self.procedure_file.read_text()
+
+         # Setup mock tools if provided
+         if self.mock_tools:
+             self._setup_mock_tools()
+
+         # Inject mocked dependencies if in mocked mode
+         if self.mocked and self.mock_registry:
+             await self._inject_mocked_dependencies()
+
+         # Execute procedure
+         logger.info(f"Executing procedure: {self.procedure_file}")
+         self.execution_result = await self.runtime.execute(
+             source=source, context=self.params, format="lua"
+         )
+
+         # Capture metrics from execution result
+         if self.execution_result:
+             self.total_cost = self.execution_result.get("total_cost", 0.0)
+             self.total_tokens = self.execution_result.get("total_tokens", 0)
+             self.cost_breakdown = self.execution_result.get("cost_breakdown", [])
+             # Stored as iteration_count so the iterations() accessor below is not shadowed.
+             self.iteration_count = self.execution_result.get("iterations", 0)
+             self.tools_used = self.execution_result.get("tools_used", [])
+
+         # Capture primitives for assertions
+         self._capture_primitives()
+
+         self._procedure_executed = True
+         logger.info(f"Procedure execution complete: success={self.execution_result.get('success')}")
+
+     def run_procedure(self) -> None:
+         """Execute procedure synchronously (wrapper for async)."""
+         asyncio.run(self.run_procedure_async())
+
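Putting the pieces together, a hedged end-to-end sketch; the path, parameters, and tool names are invented:

from pathlib import Path
from tactus.testing.context import TactusTestContext

ctx = TactusTestContext(
    Path("procedures/triage.tac"),
    params={"ticket_id": 123},
    mock_tools={"lookup_ticket": {"status": "open"}},
    mocked=True,  # mocked mode: agents are mocked rather than calling real LLMs
)
ctx.run_procedure()  # blocking wrapper around run_procedure_async()

assert ctx.stop_success()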
+     def _setup_mock_tools(self) -> None:
+         """Setup mock tool responses by creating MockedToolPrimitive."""
+         from tactus.testing.mock_tools import MockToolRegistry, MockedToolPrimitive
+
+         # Create mock registry
+         mock_registry = MockToolRegistry()
+         for tool_name, response in self.mock_tools.items():
+             mock_registry.register(tool_name, response)
+
+         # Create mocked tool primitive
+         self._mocked_tool_primitive = MockedToolPrimitive(mock_registry)
+
+         logger.info(f"Mock tools configured: {list(self.mock_tools.keys())}")
+
+     async def _inject_mocked_dependencies(self) -> None:
+         """Inject mocked dependencies into runtime."""
+         if not self.runtime or not self.runtime.registry:
+             logger.warning("Cannot inject mocked dependencies - runtime or registry not available")
+             return
+
+         # Get dependencies from registry
+         dependencies_config = {}
+         for dep_name, dep_decl in self.runtime.registry.dependencies.items():
+             dependencies_config[dep_name] = dep_decl.config
+
+         if not dependencies_config:
+             logger.debug("No dependencies declared in procedure")
+             return
+
+         # Create mock dependencies
+         mock_dependencies = await self.mock_registry.create_mock_dependencies(dependencies_config)
+
+         # Inject into runtime
+         self.runtime.user_dependencies = mock_dependencies
+
+         logger.info(f"Mocked dependencies injected: {list(mock_dependencies.keys())}")
+
+     def _capture_primitives(self) -> None:
+         """Capture primitive states after execution."""
+         if not self.runtime or not self.runtime.lua_sandbox:
+             logger.warning("Cannot capture primitives - runtime or sandbox not available")
+             return
+
+         # Capture Tool primitive
+         try:
+             self._primitives["tool"] = self.runtime.tool_primitive
+         except Exception as e:
+             logger.debug(f"Could not capture Tool primitive: {e}")
+
+         # Capture State primitive
+         try:
+             self._primitives["state"] = self.runtime.state_primitive
+         except Exception as e:
+             logger.debug(f"Could not capture State primitive: {e}")
+
+         # Capture Iterations primitive
+         try:
+             self._primitives["iterations"] = self.runtime.iterations_primitive
+         except Exception as e:
+             logger.debug(f"Could not capture Iterations primitive: {e}")
+
+         # Capture Stop primitive
+         try:
+             self._primitives["stop"] = self.runtime.stop_primitive
+         except Exception as e:
+             logger.debug(f"Could not capture Stop primitive: {e}")
+
+         logger.debug(f"Captured {len(self._primitives)} primitives")
+
+     def is_running(self) -> bool:
+         """Check if procedure has been executed."""
+         return self._procedure_executed
+
+     # Tool-related methods
+
+     def tool_called(self, tool_name: str) -> bool:
+         """Check if a tool was called."""
+         tool_prim = self._primitives.get("tool")
+         if tool_prim:
+             return tool_prim.called(tool_name)
+         # Fallback to execution result
+         tools_used = self.execution_result.get("tools_used", []) if self.execution_result else []
+         return tool_name in tools_used
+
+     def tool_call_count(self, tool_name: str) -> int:
+         """Get number of times a tool was called."""
+         tool_prim = self._primitives.get("tool")
+         if tool_prim and hasattr(tool_prim, "_tool_calls"):
+             return sum(1 for call in tool_prim._tool_calls if call.name == tool_name)
+         return 0
+
+     def tool_calls(self, tool_name: str) -> List[Dict]:
+         """Get all calls to a specific tool."""
+         tool_prim = self._primitives.get("tool")
+         if tool_prim and hasattr(tool_prim, "_tool_calls"):
+             return [
+                 {"tool": call.name, "args": call.args, "result": call.result}
+                 for call in tool_prim._tool_calls
+                 if call.name == tool_name
+             ]
+         return []
+
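These accessors support assertions in the following style (tool name and arguments carry over from the hypothetical scenario above):

assert ctx.tool_called("lookup_ticket")
assert ctx.tool_call_count("lookup_ticket") == 1

first = ctx.tool_calls("lookup_ticket")[0]
assert first["args"] == {"id": 123}
assert "result" in first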
+     # State-related methods
+
+     def state_get(self, key: str) -> Any:
+         """Get state value."""
+         state_prim = self._primitives.get("state")
+         if state_prim:
+             return state_prim.get(key)
+         return None
+
+     def state_exists(self, key: str) -> bool:
+         """Check if state key exists."""
+         state_prim = self._primitives.get("state")
+         if state_prim and hasattr(state_prim, "_state"):
+             return key in state_prim._state
+         return False
+
+     # Output-related methods
+
+     def output_get(self, key: str) -> Any:
+         """Get output value from procedure execution result."""
+         if self.execution_result:
+             # Check if outputs are in a dedicated field
+             if "output" in self.execution_result:
+                 output = self.execution_result["output"]
+                 if isinstance(output, dict):
+                     return output.get(key)
+                 return None
+             # Otherwise check in the result dict (procedure return value)
+             if "result" in self.execution_result:
+                 result = self.execution_result["result"]
+                 if isinstance(result, dict):
+                     return result.get(key)
+
+         return None
+
+     def output_exists(self, key: str) -> bool:
+         """Check if output key exists in procedure execution result."""
+         if self.execution_result:
+             # Check if outputs are in a dedicated field
+             if "output" in self.execution_result:
+                 output = self.execution_result["output"]
+                 return isinstance(output, dict) and key in output
+             # Otherwise check in the result dict (procedure return value)
+             if "result" in self.execution_result:
+                 result = self.execution_result["result"]
+                 if isinstance(result, dict):
+                     return key in result
+         return False
+
+     def output_value(self) -> Any:
+         """Get the full (possibly scalar) output value for the procedure."""
+         if not self.execution_result:
+             return None
+         if "output" in self.execution_result:
+             return self.execution_result["output"]
+         result = self.execution_result.get("result")
+         try:
+             from tactus.protocols.result import TactusResult
+
+             if isinstance(result, TactusResult):
+                 return result.output
+         except Exception:
+             pass
+         return result
+
+     # Completion methods
+
+     def stop_success(self) -> bool:
+         """Check if procedure completed successfully."""
+         if self.execution_result:
+             return self.execution_result.get("success", False)
+         return False
+
+     def stop_reason(self) -> str:
+         """Get stop reason."""
+         stop_prim = self._primitives.get("stop")
+         if stop_prim and hasattr(stop_prim, "_reason"):
+             return stop_prim._reason or ""
+         if self.execution_result:
+             return self.execution_result.get("stop_reason", "")
+         return ""
+
+     # Iteration methods
+
+     def iterations(self) -> int:
+         """Get total iterations."""
+         iterations_prim = self._primitives.get("iterations")
+         if iterations_prim and hasattr(iterations_prim, "_count"):
+             return iterations_prim._count
+         if self.execution_result:
+             return self.execution_result.get("iterations", 0)
+         return 0
+
+     def agent_turns(self) -> int:
+         """Get number of agent turns."""
+         # Count from execution result
+         if self.execution_result:
+             return self.execution_result.get("agent_turns", 0)
+         return 0
+
+     # Parameter/context methods
+
+     def get_params(self) -> Dict:
+         """Get procedure parameters."""
+         return self.params
+
+     def set_input(self, key: str, value: Any) -> None:
+         """Set an input parameter for the procedure.
+
+         Args:
+             key: Parameter name
+             value: Parameter value (will be parsed from string if needed)
+         """
+         self.params[key] = value
+         logger.debug(f"Set input parameter: {key}={value}")
+
+     def agent_context(self) -> str:
+         """Get agent context as string."""
+         # This would need to be populated by the runtime
+         if self.execution_result:
+             return self.execution_result.get("agent_context", "")
+         return ""
tactus/testing/eval_models.py
@@ -0,0 +1,114 @@
+ """
+ Pydantic models for Pydantic Evals integration.
+
+ These models define the structure of evaluation configurations
+ that can be declared in .tac files using the evaluations() function.
+ """
+
+ from typing import Any, Dict, List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class EvalCase(BaseModel):
13
+ """
14
+ Single evaluation test case.
15
+
16
+ Represents one test case in an evaluation dataset with inputs,
17
+ optional expected outputs, and metadata.
18
+ """
19
+
20
+ name: str
21
+ inputs: Dict[str, Any] # Procedure parameters
22
+ expected_output: Optional[Dict[str, Any]] = None
23
+ metadata: Dict[str, Any] = Field(default_factory=dict)
24
+
25
+
26
+ class EvaluatorConfig(BaseModel):
27
+ """
28
+ Configuration for an evaluator.
29
+
30
+ Defines how to evaluate procedure outputs. Different evaluator types
31
+ have different configuration requirements.
32
+ """
33
+
34
+ type: str # "contains", "llm_judge", "exact_match", "min_length", etc.
35
+
36
+ # Common fields (used by different evaluator types)
37
+ field: Optional[str] = None # Which output field to evaluate
38
+ value: Optional[Any] = None # Value to check against
39
+ check_expected: Optional[str] = None # Field name in expected_output to check
40
+
41
+ # LLM-as-judge specific
42
+ rubric: Optional[str] = None # Evaluation rubric for LLM judge
43
+ model: Optional[str] = None # Model to use for LLM judge
44
+ include_expected: bool = False # Whether to include expected_output in prompt
45
+
46
+ # Tactus-specific evaluators
47
+ max_iterations: Optional[int] = None
48
+ max_cost: Optional[float] = None
49
+ max_tokens: Optional[int] = None
50
+
51
+ # Regex evaluator
52
+ pattern: Optional[str] = None
53
+ case_sensitive: bool = True
54
+
55
+ # JSON Schema evaluator
56
+ json_schema: Optional[Dict[str, Any]] = None
57
+
58
+ # Numeric range evaluator
59
+ min_value: Optional[float] = None
60
+ max_value: Optional[float] = None
61
+
62
+
63
+ class EvaluationThresholds(BaseModel):
64
+ """
65
+ Quality gates for CI/CD integration.
66
+
67
+ Defines minimum acceptable thresholds for evaluation metrics.
68
+ If any threshold is not met, the evaluation fails.
69
+ """
70
+
71
+ min_success_rate: Optional[float] = None # 0.0-1.0 (e.g., 0.90 for 90%)
72
+ max_cost_per_run: Optional[float] = None # Maximum cost in dollars
73
+ max_duration: Optional[float] = None # Maximum duration in seconds
74
+ max_tokens_per_run: Optional[int] = None # Maximum tokens per run
75
+
76
+
77
+ class EvaluationConfig(BaseModel):
78
+ """
79
+ Complete evaluation configuration from evaluations() call.
80
+
81
+ Contains the dataset, evaluators, and execution settings for
82
+ running Pydantic Evals on a Tactus procedure.
83
+ """
84
+
85
+ dataset: List[EvalCase]
86
+ evaluators: List[EvaluatorConfig]
87
+ runs: int = 1 # Number of times to run each case
88
+ parallel: bool = True # Whether to run cases in parallel
89
+ dataset_file: Optional[str] = None # Path to external dataset file
90
+ thresholds: Optional[EvaluationThresholds] = None # Quality gates for CI/CD
91
+
92
+
93
+ class EvaluationResultSummary(BaseModel):
94
+ """
95
+ Summary of evaluation results.
96
+
97
+ Aggregates results across all cases and runs for reporting.
98
+ """
99
+
100
+ total_cases: int
101
+ passed_cases: int
102
+ failed_cases: int
103
+
104
+ # Aggregate metrics
105
+ mean_score: Optional[float] = None # Average score from LLM judges
106
+ consistency_score: Optional[float] = None # Consistency across runs
107
+
108
+ # Performance metrics
109
+ total_cost: float = 0.0
110
+ total_tokens: int = 0
111
+ total_duration: float = 0.0
112
+
113
+ # Per-case results
114
+ case_results: List[Dict[str, Any]] = Field(default_factory=list)