tactus-0.31.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. tactus/__init__.py +49 -0
  2. tactus/adapters/__init__.py +9 -0
  3. tactus/adapters/broker_log.py +76 -0
  4. tactus/adapters/cli_hitl.py +189 -0
  5. tactus/adapters/cli_log.py +223 -0
  6. tactus/adapters/cost_collector_log.py +56 -0
  7. tactus/adapters/file_storage.py +367 -0
  8. tactus/adapters/http_callback_log.py +109 -0
  9. tactus/adapters/ide_log.py +71 -0
  10. tactus/adapters/lua_tools.py +336 -0
  11. tactus/adapters/mcp.py +289 -0
  12. tactus/adapters/mcp_manager.py +196 -0
  13. tactus/adapters/memory.py +53 -0
  14. tactus/adapters/plugins.py +419 -0
  15. tactus/backends/http_backend.py +58 -0
  16. tactus/backends/model_backend.py +35 -0
  17. tactus/backends/pytorch_backend.py +110 -0
  18. tactus/broker/__init__.py +12 -0
  19. tactus/broker/client.py +247 -0
  20. tactus/broker/protocol.py +183 -0
  21. tactus/broker/server.py +1123 -0
  22. tactus/broker/stdio.py +12 -0
  23. tactus/cli/__init__.py +7 -0
  24. tactus/cli/app.py +2245 -0
  25. tactus/cli/commands/__init__.py +0 -0
  26. tactus/core/__init__.py +32 -0
  27. tactus/core/config_manager.py +790 -0
  28. tactus/core/dependencies/__init__.py +14 -0
  29. tactus/core/dependencies/registry.py +180 -0
  30. tactus/core/dsl_stubs.py +2117 -0
  31. tactus/core/exceptions.py +66 -0
  32. tactus/core/execution_context.py +480 -0
  33. tactus/core/lua_sandbox.py +508 -0
  34. tactus/core/message_history_manager.py +236 -0
  35. tactus/core/mocking.py +286 -0
  36. tactus/core/output_validator.py +291 -0
  37. tactus/core/registry.py +499 -0
  38. tactus/core/runtime.py +2907 -0
  39. tactus/core/template_resolver.py +142 -0
  40. tactus/core/yaml_parser.py +301 -0
  41. tactus/docker/Dockerfile +61 -0
  42. tactus/docker/entrypoint.sh +69 -0
  43. tactus/dspy/__init__.py +39 -0
  44. tactus/dspy/agent.py +1144 -0
  45. tactus/dspy/broker_lm.py +181 -0
  46. tactus/dspy/config.py +212 -0
  47. tactus/dspy/history.py +196 -0
  48. tactus/dspy/module.py +405 -0
  49. tactus/dspy/prediction.py +318 -0
  50. tactus/dspy/signature.py +185 -0
  51. tactus/formatting/__init__.py +7 -0
  52. tactus/formatting/formatter.py +437 -0
  53. tactus/ide/__init__.py +9 -0
  54. tactus/ide/coding_assistant.py +343 -0
  55. tactus/ide/server.py +2223 -0
  56. tactus/primitives/__init__.py +49 -0
  57. tactus/primitives/control.py +168 -0
  58. tactus/primitives/file.py +229 -0
  59. tactus/primitives/handles.py +378 -0
  60. tactus/primitives/host.py +94 -0
  61. tactus/primitives/human.py +342 -0
  62. tactus/primitives/json.py +189 -0
  63. tactus/primitives/log.py +187 -0
  64. tactus/primitives/message_history.py +157 -0
  65. tactus/primitives/model.py +163 -0
  66. tactus/primitives/procedure.py +564 -0
  67. tactus/primitives/procedure_callable.py +318 -0
  68. tactus/primitives/retry.py +155 -0
  69. tactus/primitives/session.py +152 -0
  70. tactus/primitives/state.py +182 -0
  71. tactus/primitives/step.py +209 -0
  72. tactus/primitives/system.py +93 -0
  73. tactus/primitives/tool.py +375 -0
  74. tactus/primitives/tool_handle.py +279 -0
  75. tactus/primitives/toolset.py +229 -0
  76. tactus/protocols/__init__.py +38 -0
  77. tactus/protocols/chat_recorder.py +81 -0
  78. tactus/protocols/config.py +97 -0
  79. tactus/protocols/cost.py +31 -0
  80. tactus/protocols/hitl.py +71 -0
  81. tactus/protocols/log_handler.py +27 -0
  82. tactus/protocols/models.py +355 -0
  83. tactus/protocols/result.py +33 -0
  84. tactus/protocols/storage.py +90 -0
  85. tactus/providers/__init__.py +13 -0
  86. tactus/providers/base.py +92 -0
  87. tactus/providers/bedrock.py +117 -0
  88. tactus/providers/google.py +105 -0
  89. tactus/providers/openai.py +98 -0
  90. tactus/sandbox/__init__.py +63 -0
  91. tactus/sandbox/config.py +171 -0
  92. tactus/sandbox/container_runner.py +1099 -0
  93. tactus/sandbox/docker_manager.py +433 -0
  94. tactus/sandbox/entrypoint.py +227 -0
  95. tactus/sandbox/protocol.py +213 -0
  96. tactus/stdlib/__init__.py +10 -0
  97. tactus/stdlib/io/__init__.py +13 -0
  98. tactus/stdlib/io/csv.py +88 -0
  99. tactus/stdlib/io/excel.py +136 -0
  100. tactus/stdlib/io/file.py +90 -0
  101. tactus/stdlib/io/fs.py +154 -0
  102. tactus/stdlib/io/hdf5.py +121 -0
  103. tactus/stdlib/io/json.py +109 -0
  104. tactus/stdlib/io/parquet.py +83 -0
  105. tactus/stdlib/io/tsv.py +88 -0
  106. tactus/stdlib/loader.py +274 -0
  107. tactus/stdlib/tac/tactus/tools/done.tac +33 -0
  108. tactus/stdlib/tac/tactus/tools/log.tac +50 -0
  109. tactus/testing/README.md +273 -0
  110. tactus/testing/__init__.py +61 -0
  111. tactus/testing/behave_integration.py +380 -0
  112. tactus/testing/context.py +486 -0
  113. tactus/testing/eval_models.py +114 -0
  114. tactus/testing/evaluation_runner.py +222 -0
  115. tactus/testing/evaluators.py +634 -0
  116. tactus/testing/events.py +94 -0
  117. tactus/testing/gherkin_parser.py +134 -0
  118. tactus/testing/mock_agent.py +315 -0
  119. tactus/testing/mock_dependencies.py +234 -0
  120. tactus/testing/mock_hitl.py +171 -0
  121. tactus/testing/mock_registry.py +168 -0
  122. tactus/testing/mock_tools.py +133 -0
  123. tactus/testing/models.py +115 -0
  124. tactus/testing/pydantic_eval_runner.py +508 -0
  125. tactus/testing/steps/__init__.py +13 -0
  126. tactus/testing/steps/builtin.py +902 -0
  127. tactus/testing/steps/custom.py +69 -0
  128. tactus/testing/steps/registry.py +68 -0
  129. tactus/testing/test_runner.py +489 -0
  130. tactus/tracing/__init__.py +5 -0
  131. tactus/tracing/trace_manager.py +417 -0
  132. tactus/utils/__init__.py +1 -0
  133. tactus/utils/cost_calculator.py +72 -0
  134. tactus/utils/model_pricing.py +132 -0
  135. tactus/utils/safe_file_library.py +502 -0
  136. tactus/utils/safe_libraries.py +234 -0
  137. tactus/validation/LuaLexerBase.py +66 -0
  138. tactus/validation/LuaParserBase.py +23 -0
  139. tactus/validation/README.md +224 -0
  140. tactus/validation/__init__.py +7 -0
  141. tactus/validation/error_listener.py +21 -0
  142. tactus/validation/generated/LuaLexer.interp +231 -0
  143. tactus/validation/generated/LuaLexer.py +5548 -0
  144. tactus/validation/generated/LuaLexer.tokens +124 -0
  145. tactus/validation/generated/LuaLexerBase.py +66 -0
  146. tactus/validation/generated/LuaParser.interp +173 -0
  147. tactus/validation/generated/LuaParser.py +6439 -0
  148. tactus/validation/generated/LuaParser.tokens +124 -0
  149. tactus/validation/generated/LuaParserBase.py +23 -0
  150. tactus/validation/generated/LuaParserVisitor.py +118 -0
  151. tactus/validation/generated/__init__.py +7 -0
  152. tactus/validation/grammar/LuaLexer.g4 +123 -0
  153. tactus/validation/grammar/LuaParser.g4 +178 -0
  154. tactus/validation/semantic_visitor.py +817 -0
  155. tactus/validation/validator.py +157 -0
  156. tactus-0.31.0.dist-info/METADATA +1809 -0
  157. tactus-0.31.0.dist-info/RECORD +160 -0
  158. tactus-0.31.0.dist-info/WHEEL +4 -0
  159. tactus-0.31.0.dist-info/entry_points.txt +2 -0
  160. tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
tactus/testing/models.py
@@ -0,0 +1,115 @@
+ """
+ Pydantic models for BDD testing results and parsed Gherkin.
+ """
+
+ from datetime import datetime
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ # Parsed Gherkin Models (from gherkin-official)
+
+
+ class ParsedStep(BaseModel):
+     """Parsed Gherkin step."""
+
+     keyword: str  # Given, When, Then, And, But
+     message: str
+     line: Optional[int] = None
+
+
+ class ParsedScenario(BaseModel):
+     """Parsed Gherkin scenario."""
+
+     name: str
+     tags: List[str] = Field(default_factory=list)
+     steps: List[ParsedStep] = Field(default_factory=list)
+     line: Optional[int] = None
+
+
+ class ParsedFeature(BaseModel):
+     """Parsed Gherkin feature."""
+
+     name: str
+     description: str = ""
+     scenarios: List[ParsedScenario] = Field(default_factory=list)
+     tags: List[str] = Field(default_factory=list)
+     line: Optional[int] = None
+
+
+ # Test Result Models (from Behave execution)
+
+
+ class StepResult(BaseModel):
+     """Result of executing a single step."""
+
+     keyword: str
+     message: str
+     status: str  # passed, failed, skipped, undefined
+     duration: float = 0.0
+     error_message: Optional[str] = None
+
+
+ class ScenarioResult(BaseModel):
+     """Result of executing a scenario."""
+
+     name: str
+     status: str  # passed, failed, skipped
+     duration: float
+     steps: List[StepResult] = Field(default_factory=list)
+     tags: List[str] = Field(default_factory=list)
+     iteration: Optional[int] = None  # For evaluation runs
+     timestamp: datetime = Field(default_factory=datetime.now)
+
+     # Execution metrics
+     total_cost: float = 0.0  # Total LLM cost for this scenario
+     total_tokens: int = 0  # Total tokens used in this scenario
+     iterations: int = 0  # Number of agent iterations
+     tools_used: List[str] = Field(default_factory=list)  # Tools called during execution
+     llm_calls: int = 0  # Number of LLM API calls made
+
+
+ class FeatureResult(BaseModel):
+     """Result of executing a feature."""
+
+     name: str
+     description: str = ""
+     status: str  # passed, failed, skipped
+     duration: float
+     scenarios: List[ScenarioResult] = Field(default_factory=list)
+     tags: List[str] = Field(default_factory=list)
+
+
+ class TestResult(BaseModel):
+     """Result from 'tactus test' command."""
+
+     features: List[FeatureResult] = Field(default_factory=list)
+     total_scenarios: int
+     passed_scenarios: int
+     failed_scenarios: int
+     total_duration: float
+
+     # Aggregated execution metrics
+     total_cost: float = 0.0  # Total LLM cost across all scenarios
+     total_tokens: int = 0  # Total tokens used across all scenarios
+     total_iterations: int = 0  # Total iterations across all scenarios
+     total_llm_calls: int = 0  # Total LLM API calls across all scenarios
+     unique_tools_used: List[str] = Field(
+         default_factory=list
+     )  # Unique tools used across all scenarios
+
+
+ class EvaluationResult(BaseModel):
+     """Result from 'tactus evaluate' command."""
+
+     scenario_name: str
+     total_runs: int
+     passed_runs: int
+     failed_runs: int
+     success_rate: float
+     mean_duration: float
+     median_duration: float
+     stddev_duration: float
+     consistency_score: float  # 0.0 to 1.0
+     is_flaky: bool
+     individual_results: List[ScenarioResult] = Field(default_factory=list)
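Note on how these models compose: StepResult rolls up into ScenarioResult, ScenarioResult into FeatureResult, and FeatureResult into the TestResult returned by the 'tactus test' command, while EvaluationResult backs 'tactus evaluate'. A minimal sketch of assembling and serializing them follows; the field values are illustrative rather than taken from a real run, and serialization assumes Pydantic v2.

from tactus.testing.models import StepResult, ScenarioResult, FeatureResult, TestResult

# Illustrative values only.
step = StepResult(keyword="Given", message="an agent is configured", status="passed", duration=0.01)
scenario = ScenarioResult(
    name="happy path",
    status="passed",
    duration=1.2,
    steps=[step],
    total_cost=0.003,  # aggregated LLM spend for this scenario
    llm_calls=2,
)
feature = FeatureResult(name="Agent basics", status="passed", duration=1.2, scenarios=[scenario])
result = TestResult(
    features=[feature],
    total_scenarios=1,
    passed_scenarios=1,
    failed_scenarios=0,
    total_duration=1.2,
)
print(result.model_dump_json(indent=2))  # Pydantic v2 serialization; v1 would use .json()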
tactus/testing/pydantic_eval_runner.py
@@ -0,0 +1,508 @@
+ """
+ Pydantic Evals runner for Tactus procedures.
+
+ This module bridges Tactus procedures to the Pydantic Evals framework,
+ allowing evaluation of LLM agent quality, consistency, and performance.
+ """
+
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING
+
+ from .eval_models import EvaluationConfig, EvalCase
+
+ if TYPE_CHECKING:
+     from tactus.core.runtime import TactusRuntime
+
+ logger = logging.getLogger(__name__)
+
+ # Check if pydantic_evals is available
+ try:
+     from pydantic_evals import Dataset
+     from pydantic_evals.evaluators import Evaluator
+
+     PYDANTIC_EVALS_AVAILABLE = True
+ except ImportError:
+     PYDANTIC_EVALS_AVAILABLE = False
+     logger.warning("pydantic_evals not installed. Install with: pip install pydantic-evals")
+
+
+ class TactusPydanticEvalRunner:
+     """
+     Runs Pydantic Evals on Tactus procedures.
+
+     Converts Tactus evaluation config to Pydantic Evals Dataset,
+     executes procedure as the "task", and collects results.
+     """
+
+     def __init__(
+         self,
+         procedure_file: Path,
+         eval_config: EvaluationConfig,
+         openai_api_key: Optional[str] = None,
+     ):
+         """
+         Initialize the evaluation runner.
+
+         Args:
+             procedure_file: Path to the .tac procedure file
+             eval_config: Evaluation configuration from evaluations() block
+             openai_api_key: Optional OpenAI API key for LLM calls
+         """
+         if not PYDANTIC_EVALS_AVAILABLE:
+             raise ImportError(
+                 "pydantic_evals is required for evaluations. "
+                 "Install with: pip install pydantic-evals"
+             )
+
+         self.procedure_file = procedure_file
+         self.eval_config = eval_config
+         self.openai_api_key = openai_api_key
+         self._procedure_source: Optional[str] = None
+
+     def run_evaluation(self):
+         """
+         Run evaluation using Pydantic Evals framework.
+
+         Flow:
+         1. Convert eval_config to pydantic_evals.Dataset
+         2. Create task function that runs Tactus procedure
+         3. Execute dataset.evaluate_sync(task)
+         4. Return EvaluationReport
+
+         Returns:
+             Pydantic Evals EvaluationReport
+         """
+         logger.info(f"Running evaluation on {self.procedure_file}")
+
+         # Load procedure source once
+         self._procedure_source = self.procedure_file.read_text()
+
+         # Create Pydantic Evals dataset
+         dataset = self._create_dataset()
+
+         # Create task function
+         task = self._create_task_function()
+
+         # Run evaluation
+         logger.info(f"Evaluating {len(self.eval_config.dataset)} cases...")
+         report = dataset.evaluate_sync(task)
+
+         logger.info("Evaluation complete")
+         return report
+
+     def _create_dataset(self) -> "Dataset":
+         """
+         Convert Tactus EvaluationConfig to Pydantic Evals Dataset.
+
+         Returns:
+             Pydantic Evals Dataset
+         """
+         from pydantic_evals import Case
+
+         # Load cases from file if specified
+         all_eval_cases = []
+         if self.eval_config.dataset_file:
+             all_eval_cases.extend(self._load_dataset_file(self.eval_config.dataset_file))
+
+         # Add inline dataset cases
+         all_eval_cases.extend(self.eval_config.dataset)
+
+         # Convert cases - duplicate each case N times for multiple runs
+         cases = []
+         runs = self.eval_config.runs or 1
+
+         for eval_case in all_eval_cases:
+             for run_num in range(runs):
+                 # Create a unique name for each run
+                 case_name = eval_case.name
+                 if runs > 1:
+                     case_name = f"{eval_case.name}_run{run_num + 1}"
+
+                 case = Case(
+                     name=case_name,
+                     inputs=eval_case.inputs,
+                     expected_output=eval_case.expected_output,
+                     metadata={
+                         **eval_case.metadata,
+                         "run_number": run_num + 1,
+                         "original_case_name": eval_case.name,
+                         # Trace will be populated during execution
+                         "trace": {},
+                     },
+                 )
+                 cases.append(case)
+
+         # Convert evaluators
+         evaluators = self._create_evaluators()
+
+         # Create dataset
+         dataset = Dataset(
+             cases=cases,
+             evaluators=evaluators,
+         )
+
+         return dataset
+
+     def _load_dataset_file(self, dataset_file: str) -> List[EvalCase]:
+         """
+         Load evaluation cases from external file.
+
+         Supports .jsonl, .json, and .csv formats.
+
+         Args:
+             dataset_file: Path to dataset file (relative to procedure file or absolute)
+
+         Returns:
+             List of EvalCase objects
+
+         Raises:
+             ValueError: If file format is unsupported
+             FileNotFoundError: If file doesn't exist
+         """
+         from pathlib import Path
+
+         # Resolve path
+         file_path = Path(dataset_file)
+         if not file_path.is_absolute():
+             # Resolve relative to procedure file
+             file_path = self.procedure_file.parent / file_path
+
+         if not file_path.exists():
+             raise FileNotFoundError(f"Dataset file not found: {file_path}")
+
+         # Load based on file extension
+         if file_path.suffix == ".jsonl":
+             return self._load_jsonl(file_path)
+         elif file_path.suffix == ".json":
+             return self._load_json(file_path)
+         elif file_path.suffix == ".csv":
+             return self._load_csv(file_path)
+         else:
+             raise ValueError(
+                 f"Unsupported dataset file format: {file_path.suffix}. "
+                 f"Supported formats: .jsonl, .json, .csv"
+             )
+
+     def _load_jsonl(self, file_path: Path) -> List[EvalCase]:
+         """Load cases from JSONL file (one JSON object per line)."""
+         import json
+
+         cases = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line_num, line in enumerate(f, 1):
+                 line = line.strip()
+                 if not line:
+                     continue
+                 try:
+                     data = json.loads(line)
+                     cases.append(EvalCase(**data))
+                 except json.JSONDecodeError as e:
+                     raise ValueError(f"Invalid JSON on line {line_num} in {file_path}: {e}")
+                 except Exception as e:
+                     raise ValueError(f"Invalid case data on line {line_num} in {file_path}: {e}")
+         return cases
+
+     def _load_json(self, file_path: Path) -> List[EvalCase]:
+         """Load cases from JSON file (array of objects)."""
+         import json
+
+         with open(file_path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+
+         if not isinstance(data, list):
+             raise ValueError(f"JSON file must contain an array of cases: {file_path}")
+
+         cases = []
+         for idx, item in enumerate(data):
+             try:
+                 cases.append(EvalCase(**item))
+             except Exception as e:
+                 raise ValueError(f"Invalid case data at index {idx} in {file_path}: {e}")
+         return cases
+
+     def _load_csv(self, file_path: Path) -> List[EvalCase]:
+         """
+         Load cases from CSV file.
+
+         Expected columns:
+         - name: Case name (required)
+         - inputs: JSON string of inputs dict (required)
+         - expected_output: JSON string of expected output dict (optional)
+         - metadata: JSON string of metadata dict (optional)
+         """
+         import csv
+         import json
+
+         cases = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             reader = csv.DictReader(f)
+
+             if not reader.fieldnames or "name" not in reader.fieldnames:
+                 raise ValueError(f"CSV must have 'name' column: {file_path}")
+             if "inputs" not in reader.fieldnames:
+                 raise ValueError(f"CSV must have 'inputs' column: {file_path}")
+
+             for row_num, row in enumerate(reader, 2):  # Start at 2 (header is 1)
+                 try:
+                     # Parse required fields
+                     name = row["name"]
+                     inputs = json.loads(row["inputs"])
+
+                     # Parse optional fields
+                     expected_output = None
+                     if "expected_output" in row and row["expected_output"]:
+                         expected_output = json.loads(row["expected_output"])
+
+                     metadata = {}
+                     if "metadata" in row and row["metadata"]:
+                         metadata = json.loads(row["metadata"])
+
+                     cases.append(
+                         EvalCase(
+                             name=name,
+                             inputs=inputs,
+                             expected_output=expected_output,
+                             metadata=metadata,
+                         )
+                     )
+                 except json.JSONDecodeError as e:
+                     raise ValueError(f"Invalid JSON in row {row_num} of {file_path}: {e}")
+                 except Exception as e:
+                     raise ValueError(f"Invalid case data in row {row_num} of {file_path}: {e}")
+
+         return cases
+
+     def _create_evaluators(self) -> List["Evaluator"]:
+         """
+         Convert Tactus evaluator configs to Pydantic Evals evaluators.
+
+         Returns:
+             List of Pydantic Evals Evaluator instances
+         """
+         from .evaluators import create_evaluator
+
+         evaluators = []
+         for config in self.eval_config.evaluators:
+             try:
+                 evaluator = create_evaluator(config)
+                 evaluators.append(evaluator)
+             except Exception as e:
+                 logger.warning(f"Failed to create evaluator {config.type}: {e}")
+
+         return evaluators
+
+     def _create_task_function(self) -> Callable:
+         """
+         Create task function that Pydantic Evals can call.
+
+         The task function:
+         - Takes inputs (Dict) as parameter
+         - Runs Tactus procedure with those inputs
+         - Returns procedure output
+
+         Returns:
+             Task function for Pydantic Evals
+         """
+
+         def tactus_task(inputs: Dict[str, Any]) -> Dict[str, Any]:
+             """
+             Execute Tactus procedure with given inputs.
+
+             Args:
+                 inputs: Procedure parameters (from EvalCase.inputs)
+
+             Returns:
+                 Procedure output (result dict) with execution trace in metadata
+             """
+             from tactus.core.runtime import TactusRuntime
+             from tactus.adapters.memory import MemoryStorage
+             from tactus.testing.mock_hitl import MockHITLHandler
+             import time
+
+             # Setup runtime
+             storage = MemoryStorage()
+             hitl = MockHITLHandler()  # Auto-approve for evals
+
+             runtime = TactusRuntime(
+                 procedure_id=f"eval_{self.procedure_file.stem}",
+                 storage_backend=storage,
+                 hitl_handler=hitl,
+                 openai_api_key=self.openai_api_key,
+             )
+
+             # Execute procedure with inputs as context
+             start_time = time.time()
+             try:
+                 result = asyncio.run(
+                     runtime.execute(source=self._procedure_source, context=inputs, format="lua")
+                 )
+                 duration = time.time() - start_time
+
+                 # Extract execution trace from runtime
+                 trace = self._extract_trace(runtime, duration)
+
+                 # Get procedure output
+                 output = result
+                 if isinstance(result, dict) and "result" in result:
+                     output = result["result"]
+
+                 # Return output with trace in special field
+                 # Pydantic Evals will pass this through to evaluators
+                 return {"__output__": output, "__trace__": trace}
+
+             except Exception as e:
+                 logger.error(f"Procedure execution failed: {e}")
+                 duration = time.time() - start_time
+                 # Return error info with trace for evaluation
+                 return {
+                     "__output__": {"error": str(e), "success": False},
+                     "__trace__": {"duration": duration, "error": str(e)},
+                 }
+
+         return tactus_task
+
+     def _extract_trace(self, runtime: "TactusRuntime", duration: float) -> Dict[str, Any]:
+         """
+         Extract execution trace from runtime for evaluators.
+
+         Args:
+             runtime: TactusRuntime instance after execution
+             duration: Execution duration in seconds
+
+         Returns:
+             Dictionary with execution trace information
+         """
+         trace = {
+             "duration": duration,
+             "tool_calls": [],
+             "state_changes": [],
+             "agent_turns": [],
+             "iterations": 0,
+             "cost": 0.0,
+             "tokens": 0,
+         }
+
+         # Extract from session if available
+         if hasattr(runtime, "session") and runtime.session:
+             session = runtime.session
+
+             # Extract tool calls
+             if hasattr(session, "tool_calls"):
+                 trace["tool_calls"] = [
+                     {
+                         "name": getattr(call, "tool_name", getattr(call, "name", "unknown")),
+                         "args": getattr(call, "args", {}),
+                         "result": getattr(call, "result", None),
+                     }
+                     for call in session.tool_calls
+                 ]
+
+             # Extract agent turns/messages
+             if hasattr(session, "messages"):
+                 for msg in session.messages:
+                     if hasattr(msg, "role") and msg.role == "assistant":
+                         trace["agent_turns"].append(
+                             {
+                                 "agent": getattr(msg, "agent_name", "unknown"),
+                                 "message": getattr(msg, "content", ""),
+                             }
+                         )
+
+             # Extract state changes if tracked
+             if hasattr(session, "state_history"):
+                 trace["state_changes"] = session.state_history
+
+             # Extract metrics
+             if hasattr(session, "iteration_count"):
+                 trace["iterations"] = session.iteration_count
+
+         # Extract cost/token metrics if available
+         if hasattr(runtime, "total_cost"):
+             trace["cost"] = runtime.total_cost
+         if hasattr(runtime, "total_tokens"):
+             trace["tokens"] = runtime.total_tokens
+
+         return trace
+
+     def check_thresholds(self, report) -> tuple[bool, list[str]]:
+         """
+         Check if evaluation results meet configured thresholds.
+
+         Args:
+             report: Pydantic Evals EvaluationReport
+
+         Returns:
+             Tuple of (passed, violations):
+             - passed: True if all thresholds met, False otherwise
+             - violations: List of violation messages
+         """
+         if not self.eval_config.thresholds:
+             return True, []
+
+         violations = []
+         thresholds = self.eval_config.thresholds
+
+         # Calculate metrics from report
+         total_cases = len(report.cases)
+         if total_cases == 0:
+             return True, []
+
+         # Calculate success rate (all assertions passed)
+         passed_cases = sum(
+             1
+             for case in report.cases
+             if hasattr(case, "assertions")
+             and case.assertions
+             and all(getattr(a, "value", False) for a in case.assertions.values())
+         )
+         success_rate = passed_cases / total_cases
+
+         # Check success rate threshold
+         if thresholds.min_success_rate is not None:
+             if success_rate < thresholds.min_success_rate:
+                 violations.append(
+                     f"Success rate {success_rate:.1%} below threshold {thresholds.min_success_rate:.1%}"
+                 )
+
+         # Calculate average cost per run
+         if thresholds.max_cost_per_run is not None:
+             total_cost = 0.0
+             for case in report.cases:
+                 if hasattr(case, "cost"):
+                     total_cost += getattr(case, "cost", 0.0)
+             avg_cost = total_cost / total_cases if total_cases > 0 else 0.0
+
+             if avg_cost > thresholds.max_cost_per_run:
+                 violations.append(
+                     f"Average cost per run ${avg_cost:.4f} exceeds threshold ${thresholds.max_cost_per_run:.4f}"
+                 )
+
+         # Calculate average duration
+         if thresholds.max_duration is not None:
+             total_duration = 0.0
+             for case in report.cases:
+                 if hasattr(case, "task_duration"):
+                     total_duration += getattr(case, "task_duration", 0.0)
+             avg_duration = total_duration / total_cases if total_cases > 0 else 0.0
+
+             if avg_duration > thresholds.max_duration:
+                 violations.append(
+                     f"Average duration {avg_duration:.2f}s exceeds threshold {thresholds.max_duration:.2f}s"
+                 )
+
+         # Calculate average tokens per run
+         if thresholds.max_tokens_per_run is not None:
+             total_tokens = 0
+             for case in report.cases:
+                 if hasattr(case, "tokens"):
+                     total_tokens += getattr(case, "tokens", 0)
+             avg_tokens = total_tokens // total_cases if total_cases > 0 else 0
+
+             if avg_tokens > thresholds.max_tokens_per_run:
+                 violations.append(
+                     f"Average tokens per run {avg_tokens} exceeds threshold {thresholds.max_tokens_per_run}"
+                 )
+
+         return len(violations) == 0, violations
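Taken as a whole, the runner builds a pydantic_evals Dataset from the evaluations() config (inline cases plus an optional .jsonl/.json/.csv dataset file), wraps the .tac procedure in a task function, runs dataset.evaluate_sync, and finally compares the report against any configured thresholds. A hedged sketch of the calling side follows; the EvaluationConfig keyword arguments mirror the attributes the runner reads (dataset, dataset_file, runs, evaluators, thresholds), but their exact types live in eval_models.py, which is not shown here, so treat the construction and all values as assumptions.

from pathlib import Path

from tactus.testing.eval_models import EvaluationConfig  # field names inferred from the runner above
from tactus.testing.pydantic_eval_runner import TactusPydanticEvalRunner

# Assumed config shape; normally this comes from the procedure's evaluations() block.
config = EvaluationConfig(
    dataset=[],                  # inline EvalCase objects, if any
    dataset_file="cases.jsonl",  # one JSON object per line, e.g. {"name": "basic", "inputs": {"query": "hi"}}
    runs=3,                      # each case is duplicated per run and renamed "<case>_runN"
    evaluators=[],               # evaluator configs handed to create_evaluator()
    thresholds=None,             # or an object exposing min_success_rate, max_cost_per_run, ...
)

runner = TactusPydanticEvalRunner(
    procedure_file=Path("my_procedure.tac"),  # hypothetical procedure path
    eval_config=config,
    openai_api_key=None,  # optional; forwarded to TactusRuntime
)

report = runner.run_evaluation()                      # pydantic_evals EvaluationReport
passed, violations = runner.check_thresholds(report)  # (bool, list of violation messages)
for message in violations:
    print(message)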
tactus/testing/steps/__init__.py
@@ -0,0 +1,13 @@
+ """
+ Step definitions for Tactus BDD testing.
+ """
+
+ from .registry import StepRegistry
+ from .builtin import register_builtin_steps
+ from .custom import CustomStepManager
+
+ __all__ = [
+     "StepRegistry",
+     "register_builtin_steps",
+     "CustomStepManager",
+ ]
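For orientation, the three re-exports come from steps/registry.py (+68), steps/builtin.py (+902), and steps/custom.py (+69) in the file list above, so harness code can import them from the subpackage directly. The import below is grounded in the __all__ here; the comments only restate what the names and source modules suggest, not confirmed behavior.

# Import path is grounded in the __all__ above; the roles in the comments are inferred from module names.
from tactus.testing.steps import (
    StepRegistry,            # step lookup/registration, defined in steps/registry.py
    register_builtin_steps,  # registers the built-in Given/When/Then steps from steps/builtin.py
    CustomStepManager,       # manages project-specific steps, defined in steps/custom.py
)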