tactus 0.31.0__py3-none-any.whl
This diff shows the content of a publicly released package version as published to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- tactus/__init__.py +49 -0
- tactus/adapters/__init__.py +9 -0
- tactus/adapters/broker_log.py +76 -0
- tactus/adapters/cli_hitl.py +189 -0
- tactus/adapters/cli_log.py +223 -0
- tactus/adapters/cost_collector_log.py +56 -0
- tactus/adapters/file_storage.py +367 -0
- tactus/adapters/http_callback_log.py +109 -0
- tactus/adapters/ide_log.py +71 -0
- tactus/adapters/lua_tools.py +336 -0
- tactus/adapters/mcp.py +289 -0
- tactus/adapters/mcp_manager.py +196 -0
- tactus/adapters/memory.py +53 -0
- tactus/adapters/plugins.py +419 -0
- tactus/backends/http_backend.py +58 -0
- tactus/backends/model_backend.py +35 -0
- tactus/backends/pytorch_backend.py +110 -0
- tactus/broker/__init__.py +12 -0
- tactus/broker/client.py +247 -0
- tactus/broker/protocol.py +183 -0
- tactus/broker/server.py +1123 -0
- tactus/broker/stdio.py +12 -0
- tactus/cli/__init__.py +7 -0
- tactus/cli/app.py +2245 -0
- tactus/cli/commands/__init__.py +0 -0
- tactus/core/__init__.py +32 -0
- tactus/core/config_manager.py +790 -0
- tactus/core/dependencies/__init__.py +14 -0
- tactus/core/dependencies/registry.py +180 -0
- tactus/core/dsl_stubs.py +2117 -0
- tactus/core/exceptions.py +66 -0
- tactus/core/execution_context.py +480 -0
- tactus/core/lua_sandbox.py +508 -0
- tactus/core/message_history_manager.py +236 -0
- tactus/core/mocking.py +286 -0
- tactus/core/output_validator.py +291 -0
- tactus/core/registry.py +499 -0
- tactus/core/runtime.py +2907 -0
- tactus/core/template_resolver.py +142 -0
- tactus/core/yaml_parser.py +301 -0
- tactus/docker/Dockerfile +61 -0
- tactus/docker/entrypoint.sh +69 -0
- tactus/dspy/__init__.py +39 -0
- tactus/dspy/agent.py +1144 -0
- tactus/dspy/broker_lm.py +181 -0
- tactus/dspy/config.py +212 -0
- tactus/dspy/history.py +196 -0
- tactus/dspy/module.py +405 -0
- tactus/dspy/prediction.py +318 -0
- tactus/dspy/signature.py +185 -0
- tactus/formatting/__init__.py +7 -0
- tactus/formatting/formatter.py +437 -0
- tactus/ide/__init__.py +9 -0
- tactus/ide/coding_assistant.py +343 -0
- tactus/ide/server.py +2223 -0
- tactus/primitives/__init__.py +49 -0
- tactus/primitives/control.py +168 -0
- tactus/primitives/file.py +229 -0
- tactus/primitives/handles.py +378 -0
- tactus/primitives/host.py +94 -0
- tactus/primitives/human.py +342 -0
- tactus/primitives/json.py +189 -0
- tactus/primitives/log.py +187 -0
- tactus/primitives/message_history.py +157 -0
- tactus/primitives/model.py +163 -0
- tactus/primitives/procedure.py +564 -0
- tactus/primitives/procedure_callable.py +318 -0
- tactus/primitives/retry.py +155 -0
- tactus/primitives/session.py +152 -0
- tactus/primitives/state.py +182 -0
- tactus/primitives/step.py +209 -0
- tactus/primitives/system.py +93 -0
- tactus/primitives/tool.py +375 -0
- tactus/primitives/tool_handle.py +279 -0
- tactus/primitives/toolset.py +229 -0
- tactus/protocols/__init__.py +38 -0
- tactus/protocols/chat_recorder.py +81 -0
- tactus/protocols/config.py +97 -0
- tactus/protocols/cost.py +31 -0
- tactus/protocols/hitl.py +71 -0
- tactus/protocols/log_handler.py +27 -0
- tactus/protocols/models.py +355 -0
- tactus/protocols/result.py +33 -0
- tactus/protocols/storage.py +90 -0
- tactus/providers/__init__.py +13 -0
- tactus/providers/base.py +92 -0
- tactus/providers/bedrock.py +117 -0
- tactus/providers/google.py +105 -0
- tactus/providers/openai.py +98 -0
- tactus/sandbox/__init__.py +63 -0
- tactus/sandbox/config.py +171 -0
- tactus/sandbox/container_runner.py +1099 -0
- tactus/sandbox/docker_manager.py +433 -0
- tactus/sandbox/entrypoint.py +227 -0
- tactus/sandbox/protocol.py +213 -0
- tactus/stdlib/__init__.py +10 -0
- tactus/stdlib/io/__init__.py +13 -0
- tactus/stdlib/io/csv.py +88 -0
- tactus/stdlib/io/excel.py +136 -0
- tactus/stdlib/io/file.py +90 -0
- tactus/stdlib/io/fs.py +154 -0
- tactus/stdlib/io/hdf5.py +121 -0
- tactus/stdlib/io/json.py +109 -0
- tactus/stdlib/io/parquet.py +83 -0
- tactus/stdlib/io/tsv.py +88 -0
- tactus/stdlib/loader.py +274 -0
- tactus/stdlib/tac/tactus/tools/done.tac +33 -0
- tactus/stdlib/tac/tactus/tools/log.tac +50 -0
- tactus/testing/README.md +273 -0
- tactus/testing/__init__.py +61 -0
- tactus/testing/behave_integration.py +380 -0
- tactus/testing/context.py +486 -0
- tactus/testing/eval_models.py +114 -0
- tactus/testing/evaluation_runner.py +222 -0
- tactus/testing/evaluators.py +634 -0
- tactus/testing/events.py +94 -0
- tactus/testing/gherkin_parser.py +134 -0
- tactus/testing/mock_agent.py +315 -0
- tactus/testing/mock_dependencies.py +234 -0
- tactus/testing/mock_hitl.py +171 -0
- tactus/testing/mock_registry.py +168 -0
- tactus/testing/mock_tools.py +133 -0
- tactus/testing/models.py +115 -0
- tactus/testing/pydantic_eval_runner.py +508 -0
- tactus/testing/steps/__init__.py +13 -0
- tactus/testing/steps/builtin.py +902 -0
- tactus/testing/steps/custom.py +69 -0
- tactus/testing/steps/registry.py +68 -0
- tactus/testing/test_runner.py +489 -0
- tactus/tracing/__init__.py +5 -0
- tactus/tracing/trace_manager.py +417 -0
- tactus/utils/__init__.py +1 -0
- tactus/utils/cost_calculator.py +72 -0
- tactus/utils/model_pricing.py +132 -0
- tactus/utils/safe_file_library.py +502 -0
- tactus/utils/safe_libraries.py +234 -0
- tactus/validation/LuaLexerBase.py +66 -0
- tactus/validation/LuaParserBase.py +23 -0
- tactus/validation/README.md +224 -0
- tactus/validation/__init__.py +7 -0
- tactus/validation/error_listener.py +21 -0
- tactus/validation/generated/LuaLexer.interp +231 -0
- tactus/validation/generated/LuaLexer.py +5548 -0
- tactus/validation/generated/LuaLexer.tokens +124 -0
- tactus/validation/generated/LuaLexerBase.py +66 -0
- tactus/validation/generated/LuaParser.interp +173 -0
- tactus/validation/generated/LuaParser.py +6439 -0
- tactus/validation/generated/LuaParser.tokens +124 -0
- tactus/validation/generated/LuaParserBase.py +23 -0
- tactus/validation/generated/LuaParserVisitor.py +118 -0
- tactus/validation/generated/__init__.py +7 -0
- tactus/validation/grammar/LuaLexer.g4 +123 -0
- tactus/validation/grammar/LuaParser.g4 +178 -0
- tactus/validation/semantic_visitor.py +817 -0
- tactus/validation/validator.py +157 -0
- tactus-0.31.0.dist-info/METADATA +1809 -0
- tactus-0.31.0.dist-info/RECORD +160 -0
- tactus-0.31.0.dist-info/WHEEL +4 -0
- tactus-0.31.0.dist-info/entry_points.txt +2 -0
- tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
tactus/testing/models.py
ADDED
@@ -0,0 +1,115 @@
"""
Pydantic models for BDD testing results and parsed Gherkin.
"""

from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, Field


# Parsed Gherkin Models (from gherkin-official)


class ParsedStep(BaseModel):
    """Parsed Gherkin step."""

    keyword: str  # Given, When, Then, And, But
    message: str
    line: Optional[int] = None


class ParsedScenario(BaseModel):
    """Parsed Gherkin scenario."""

    name: str
    tags: List[str] = Field(default_factory=list)
    steps: List[ParsedStep] = Field(default_factory=list)
    line: Optional[int] = None


class ParsedFeature(BaseModel):
    """Parsed Gherkin feature."""

    name: str
    description: str = ""
    scenarios: List[ParsedScenario] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    line: Optional[int] = None


# Test Result Models (from Behave execution)


class StepResult(BaseModel):
    """Result of executing a single step."""

    keyword: str
    message: str
    status: str  # passed, failed, skipped, undefined
    duration: float = 0.0
    error_message: Optional[str] = None


class ScenarioResult(BaseModel):
    """Result of executing a scenario."""

    name: str
    status: str  # passed, failed, skipped
    duration: float
    steps: List[StepResult] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    iteration: Optional[int] = None  # For evaluation runs
    timestamp: datetime = Field(default_factory=datetime.now)

    # Execution metrics
    total_cost: float = 0.0  # Total LLM cost for this scenario
    total_tokens: int = 0  # Total tokens used in this scenario
    iterations: int = 0  # Number of agent iterations
    tools_used: List[str] = Field(default_factory=list)  # Tools called during execution
    llm_calls: int = 0  # Number of LLM API calls made


class FeatureResult(BaseModel):
    """Result of executing a feature."""

    name: str
    description: str = ""
    status: str  # passed, failed, skipped
    duration: float
    scenarios: List[ScenarioResult] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)


class TestResult(BaseModel):
    """Result from 'tactus test' command."""

    features: List[FeatureResult] = Field(default_factory=list)
    total_scenarios: int
    passed_scenarios: int
    failed_scenarios: int
    total_duration: float

    # Aggregated execution metrics
    total_cost: float = 0.0  # Total LLM cost across all scenarios
    total_tokens: int = 0  # Total tokens used across all scenarios
    total_iterations: int = 0  # Total iterations across all scenarios
    total_llm_calls: int = 0  # Total LLM API calls across all scenarios
    unique_tools_used: List[str] = Field(
        default_factory=list
    )  # Unique tools used across all scenarios


class EvaluationResult(BaseModel):
    """Result from 'tactus evaluate' command."""

    scenario_name: str
    total_runs: int
    passed_runs: int
    failed_runs: int
    success_rate: float
    mean_duration: float
    median_duration: float
    stddev_duration: float
    consistency_score: float  # 0.0 to 1.0
    is_flaky: bool
    individual_results: List[ScenarioResult] = Field(default_factory=list)
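Editor's illustration (not part of the package): the result models above nest as ScenarioResult inside FeatureResult inside TestResult. A minimal sketch of building that hierarchy, using only fields declared in the models and made-up metric values:

# Illustrative only; assumes the models defined in tactus/testing/models.py above.
from tactus.testing.models import StepResult, ScenarioResult, FeatureResult, TestResult

step = StepResult(keyword="Then", message="the output contains a summary", status="passed", duration=0.4)
scenario = ScenarioResult(
    name="summarize a document",
    status="passed",
    duration=3.2,
    steps=[step],
    total_cost=0.0021,   # made-up execution metrics
    total_tokens=1830,
    iterations=2,
    tools_used=["done"],
    llm_calls=2,
)
feature = FeatureResult(name="Summarization", status="passed", duration=3.2, scenarios=[scenario])
result = TestResult(
    features=[feature],
    total_scenarios=1,
    passed_scenarios=1,
    failed_scenarios=0,
    total_duration=3.2,
    total_cost=scenario.total_cost,
    total_tokens=scenario.total_tokens,
)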
tactus/testing/pydantic_eval_runner.py
ADDED
@@ -0,0 +1,508 @@
"""
Pydantic Evals runner for Tactus procedures.

This module bridges Tactus procedures to the Pydantic Evals framework,
allowing evaluation of LLM agent quality, consistency, and performance.
"""

import asyncio
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING

from .eval_models import EvaluationConfig, EvalCase

if TYPE_CHECKING:
    from tactus.core.runtime import TactusRuntime

logger = logging.getLogger(__name__)

# Check if pydantic_evals is available
try:
    from pydantic_evals import Dataset
    from pydantic_evals.evaluators import Evaluator

    PYDANTIC_EVALS_AVAILABLE = True
except ImportError:
    PYDANTIC_EVALS_AVAILABLE = False
    logger.warning("pydantic_evals not installed. Install with: pip install pydantic-evals")


class TactusPydanticEvalRunner:
    """
    Runs Pydantic Evals on Tactus procedures.

    Converts Tactus evaluation config to a Pydantic Evals Dataset,
    executes the procedure as the "task", and collects results.
    """

    def __init__(
        self,
        procedure_file: Path,
        eval_config: EvaluationConfig,
        openai_api_key: Optional[str] = None,
    ):
        """
        Initialize the evaluation runner.

        Args:
            procedure_file: Path to the .tac procedure file
            eval_config: Evaluation configuration from evaluations() block
            openai_api_key: Optional OpenAI API key for LLM calls
        """
        if not PYDANTIC_EVALS_AVAILABLE:
            raise ImportError(
                "pydantic_evals is required for evaluations. "
                "Install with: pip install pydantic-evals"
            )

        self.procedure_file = procedure_file
        self.eval_config = eval_config
        self.openai_api_key = openai_api_key
        self._procedure_source: Optional[str] = None

    def run_evaluation(self):
        """
        Run evaluation using the Pydantic Evals framework.

        Flow:
        1. Convert eval_config to pydantic_evals.Dataset
        2. Create a task function that runs the Tactus procedure
        3. Execute dataset.evaluate_sync(task)
        4. Return EvaluationReport

        Returns:
            Pydantic Evals EvaluationReport
        """
        logger.info(f"Running evaluation on {self.procedure_file}")

        # Load procedure source once
        self._procedure_source = self.procedure_file.read_text()

        # Create Pydantic Evals dataset
        dataset = self._create_dataset()

        # Create task function
        task = self._create_task_function()

        # Run evaluation
        logger.info(f"Evaluating {len(self.eval_config.dataset)} cases...")
        report = dataset.evaluate_sync(task)

        logger.info("Evaluation complete")
        return report

    def _create_dataset(self) -> "Dataset":
        """
        Convert Tactus EvaluationConfig to a Pydantic Evals Dataset.

        Returns:
            Pydantic Evals Dataset
        """
        from pydantic_evals import Case

        # Load cases from file if specified
        all_eval_cases = []
        if self.eval_config.dataset_file:
            all_eval_cases.extend(self._load_dataset_file(self.eval_config.dataset_file))

        # Add inline dataset cases
        all_eval_cases.extend(self.eval_config.dataset)

        # Convert cases - duplicate each case N times for multiple runs
        cases = []
        runs = self.eval_config.runs or 1

        for eval_case in all_eval_cases:
            for run_num in range(runs):
                # Create a unique name for each run
                case_name = eval_case.name
                if runs > 1:
                    case_name = f"{eval_case.name}_run{run_num + 1}"

                case = Case(
                    name=case_name,
                    inputs=eval_case.inputs,
                    expected_output=eval_case.expected_output,
                    metadata={
                        **eval_case.metadata,
                        "run_number": run_num + 1,
                        "original_case_name": eval_case.name,
                        # Trace will be populated during execution
                        "trace": {},
                    },
                )
                cases.append(case)

        # Convert evaluators
        evaluators = self._create_evaluators()

        # Create dataset
        dataset = Dataset(
            cases=cases,
            evaluators=evaluators,
        )

        return dataset

    def _load_dataset_file(self, dataset_file: str) -> List[EvalCase]:
        """
        Load evaluation cases from an external file.

        Supports .jsonl, .json, and .csv formats.

        Args:
            dataset_file: Path to dataset file (relative to procedure file or absolute)

        Returns:
            List of EvalCase objects

        Raises:
            ValueError: If file format is unsupported
            FileNotFoundError: If file doesn't exist
        """
        from pathlib import Path

        # Resolve path
        file_path = Path(dataset_file)
        if not file_path.is_absolute():
            # Resolve relative to procedure file
            file_path = self.procedure_file.parent / file_path

        if not file_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {file_path}")

        # Load based on file extension
        if file_path.suffix == ".jsonl":
            return self._load_jsonl(file_path)
        elif file_path.suffix == ".json":
            return self._load_json(file_path)
        elif file_path.suffix == ".csv":
            return self._load_csv(file_path)
        else:
            raise ValueError(
                f"Unsupported dataset file format: {file_path.suffix}. "
                f"Supported formats: .jsonl, .json, .csv"
            )

    def _load_jsonl(self, file_path: Path) -> List[EvalCase]:
        """Load cases from JSONL file (one JSON object per line)."""
        import json

        cases = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    cases.append(EvalCase(**data))
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON on line {line_num} in {file_path}: {e}")
                except Exception as e:
                    raise ValueError(f"Invalid case data on line {line_num} in {file_path}: {e}")
        return cases

    def _load_json(self, file_path: Path) -> List[EvalCase]:
        """Load cases from JSON file (array of objects)."""
        import json

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError(f"JSON file must contain an array of cases: {file_path}")

        cases = []
        for idx, item in enumerate(data):
            try:
                cases.append(EvalCase(**item))
            except Exception as e:
                raise ValueError(f"Invalid case data at index {idx} in {file_path}: {e}")
        return cases

    def _load_csv(self, file_path: Path) -> List[EvalCase]:
        """
        Load cases from CSV file.

        Expected columns:
        - name: Case name (required)
        - inputs: JSON string of inputs dict (required)
        - expected_output: JSON string of expected output dict (optional)
        - metadata: JSON string of metadata dict (optional)
        """
        import csv
        import json

        cases = []
        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)

            if not reader.fieldnames or "name" not in reader.fieldnames:
                raise ValueError(f"CSV must have 'name' column: {file_path}")
            if "inputs" not in reader.fieldnames:
                raise ValueError(f"CSV must have 'inputs' column: {file_path}")

            for row_num, row in enumerate(reader, 2):  # Start at 2 (header is 1)
                try:
                    # Parse required fields
                    name = row["name"]
                    inputs = json.loads(row["inputs"])

                    # Parse optional fields
                    expected_output = None
                    if "expected_output" in row and row["expected_output"]:
                        expected_output = json.loads(row["expected_output"])

                    metadata = {}
                    if "metadata" in row and row["metadata"]:
                        metadata = json.loads(row["metadata"])

                    cases.append(
                        EvalCase(
                            name=name,
                            inputs=inputs,
                            expected_output=expected_output,
                            metadata=metadata,
                        )
                    )
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON in row {row_num} of {file_path}: {e}")
                except Exception as e:
                    raise ValueError(f"Invalid case data in row {row_num} of {file_path}: {e}")

        return cases

    def _create_evaluators(self) -> List["Evaluator"]:
        """
        Convert Tactus evaluator configs to Pydantic Evals evaluators.

        Returns:
            List of Pydantic Evals Evaluator instances
        """
        from .evaluators import create_evaluator

        evaluators = []
        for config in self.eval_config.evaluators:
            try:
                evaluator = create_evaluator(config)
                evaluators.append(evaluator)
            except Exception as e:
                logger.warning(f"Failed to create evaluator {config.type}: {e}")

        return evaluators

    def _create_task_function(self) -> Callable:
        """
        Create the task function that Pydantic Evals can call.

        The task function:
        - Takes inputs (Dict) as parameter
        - Runs the Tactus procedure with those inputs
        - Returns the procedure output

        Returns:
            Task function for Pydantic Evals
        """

        def tactus_task(inputs: Dict[str, Any]) -> Dict[str, Any]:
            """
            Execute the Tactus procedure with given inputs.

            Args:
                inputs: Procedure parameters (from EvalCase.inputs)

            Returns:
                Procedure output (result dict) with execution trace in metadata
            """
            from tactus.core.runtime import TactusRuntime
            from tactus.adapters.memory import MemoryStorage
            from tactus.testing.mock_hitl import MockHITLHandler
            import time

            # Setup runtime
            storage = MemoryStorage()
            hitl = MockHITLHandler()  # Auto-approve for evals

            runtime = TactusRuntime(
                procedure_id=f"eval_{self.procedure_file.stem}",
                storage_backend=storage,
                hitl_handler=hitl,
                openai_api_key=self.openai_api_key,
            )

            # Execute procedure with inputs as context
            start_time = time.time()
            try:
                result = asyncio.run(
                    runtime.execute(source=self._procedure_source, context=inputs, format="lua")
                )
                duration = time.time() - start_time

                # Extract execution trace from runtime
                trace = self._extract_trace(runtime, duration)

                # Get procedure output
                output = result
                if isinstance(result, dict) and "result" in result:
                    output = result["result"]

                # Return output with trace in special field
                # Pydantic Evals will pass this through to evaluators
                return {"__output__": output, "__trace__": trace}

            except Exception as e:
                logger.error(f"Procedure execution failed: {e}")
                duration = time.time() - start_time
                # Return error info with trace for evaluation
                return {
                    "__output__": {"error": str(e), "success": False},
                    "__trace__": {"duration": duration, "error": str(e)},
                }

        return tactus_task

    def _extract_trace(self, runtime: "TactusRuntime", duration: float) -> Dict[str, Any]:
        """
        Extract execution trace from runtime for evaluators.

        Args:
            runtime: TactusRuntime instance after execution
            duration: Execution duration in seconds

        Returns:
            Dictionary with execution trace information
        """
        trace = {
            "duration": duration,
            "tool_calls": [],
            "state_changes": [],
            "agent_turns": [],
            "iterations": 0,
            "cost": 0.0,
            "tokens": 0,
        }

        # Extract from session if available
        if hasattr(runtime, "session") and runtime.session:
            session = runtime.session

            # Extract tool calls
            if hasattr(session, "tool_calls"):
                trace["tool_calls"] = [
                    {
                        "name": getattr(call, "tool_name", getattr(call, "name", "unknown")),
                        "args": getattr(call, "args", {}),
                        "result": getattr(call, "result", None),
                    }
                    for call in session.tool_calls
                ]

            # Extract agent turns/messages
            if hasattr(session, "messages"):
                for msg in session.messages:
                    if hasattr(msg, "role") and msg.role == "assistant":
                        trace["agent_turns"].append(
                            {
                                "agent": getattr(msg, "agent_name", "unknown"),
                                "message": getattr(msg, "content", ""),
                            }
                        )

            # Extract state changes if tracked
            if hasattr(session, "state_history"):
                trace["state_changes"] = session.state_history

            # Extract metrics
            if hasattr(session, "iteration_count"):
                trace["iterations"] = session.iteration_count

        # Extract cost/token metrics if available
        if hasattr(runtime, "total_cost"):
            trace["cost"] = runtime.total_cost
        if hasattr(runtime, "total_tokens"):
            trace["tokens"] = runtime.total_tokens

        return trace

    def check_thresholds(self, report) -> tuple[bool, list[str]]:
        """
        Check if evaluation results meet configured thresholds.

        Args:
            report: Pydantic Evals EvaluationReport

        Returns:
            Tuple of (passed, violations):
            - passed: True if all thresholds met, False otherwise
            - violations: List of violation messages
        """
        if not self.eval_config.thresholds:
            return True, []

        violations = []
        thresholds = self.eval_config.thresholds

        # Calculate metrics from report
        total_cases = len(report.cases)
        if total_cases == 0:
            return True, []

        # Calculate success rate (all assertions passed)
        passed_cases = sum(
            1
            for case in report.cases
            if hasattr(case, "assertions")
            and case.assertions
            and all(getattr(a, "value", False) for a in case.assertions.values())
        )
        success_rate = passed_cases / total_cases

        # Check success rate threshold
        if thresholds.min_success_rate is not None:
            if success_rate < thresholds.min_success_rate:
                violations.append(
                    f"Success rate {success_rate:.1%} below threshold {thresholds.min_success_rate:.1%}"
                )

        # Calculate average cost per run
        if thresholds.max_cost_per_run is not None:
            total_cost = 0.0
            for case in report.cases:
                if hasattr(case, "cost"):
                    total_cost += getattr(case, "cost", 0.0)
            avg_cost = total_cost / total_cases if total_cases > 0 else 0.0

            if avg_cost > thresholds.max_cost_per_run:
                violations.append(
                    f"Average cost per run ${avg_cost:.4f} exceeds threshold ${thresholds.max_cost_per_run:.4f}"
                )

        # Calculate average duration
        if thresholds.max_duration is not None:
            total_duration = 0.0
            for case in report.cases:
                if hasattr(case, "task_duration"):
                    total_duration += getattr(case, "task_duration", 0.0)
            avg_duration = total_duration / total_cases if total_cases > 0 else 0.0

            if avg_duration > thresholds.max_duration:
                violations.append(
                    f"Average duration {avg_duration:.2f}s exceeds threshold {thresholds.max_duration:.2f}s"
                )

        # Calculate average tokens per run
        if thresholds.max_tokens_per_run is not None:
            total_tokens = 0
            for case in report.cases:
                if hasattr(case, "tokens"):
                    total_tokens += getattr(case, "tokens", 0)
            avg_tokens = total_tokens // total_cases if total_cases > 0 else 0

            if avg_tokens > thresholds.max_tokens_per_run:
                violations.append(
                    f"Average tokens per run {avg_tokens} exceeds threshold {thresholds.max_tokens_per_run}"
                )

        return len(violations) == 0, violations
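Editor's illustration (not part of the package): a sketch of how the runner above could be driven. The EvaluationConfig keyword arguments are inferred from the fields the runner reads (dataset, dataset_file, runs, evaluators, thresholds) and are assumptions about its exact constructor; the file names and API key are placeholders.

# Illustrative sketch only; EvaluationConfig fields inferred from pydantic_eval_runner.py, not confirmed.
from pathlib import Path
from tactus.testing.eval_models import EvaluationConfig
from tactus.testing.pydantic_eval_runner import TactusPydanticEvalRunner

config = EvaluationConfig(          # assumed constructor shape
    dataset=[],                     # inline EvalCase list (empty here)
    dataset_file="cases.jsonl",     # resolved relative to the procedure file;
                                    # each line is a JSON object built into EvalCase(**data)
    runs=3,                         # each case is duplicated once per run
    evaluators=[],                  # evaluator configs passed to create_evaluator()
    thresholds=None,                # skip check_thresholds() enforcement
)

runner = TactusPydanticEvalRunner(
    procedure_file=Path("summarize.tac"),   # hypothetical .tac procedure
    eval_config=config,
    openai_api_key="sk-...",                # placeholder key
)
report = runner.run_evaluation()                      # pydantic_evals EvaluationReport
passed, violations = runner.check_thresholds(report)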
tactus/testing/steps/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
Step definitions for Tactus BDD testing.
"""

from .registry import StepRegistry
from .builtin import register_builtin_steps
from .custom import CustomStepManager

__all__ = [
    "StepRegistry",
    "register_builtin_steps",
    "CustomStepManager",
]
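Editor's illustration (not part of the package): only the exported names are visible in this diff, so the call shapes below are assumptions about how the step registry might be wired up.

# Illustrative only; constructor and function signatures are assumed, not shown in this diff.
from tactus.testing.steps import StepRegistry, register_builtin_steps

registry = StepRegistry()          # assumed no-argument constructor
register_builtin_steps(registry)   # assumed to populate the registry with the built-in Gherkin steps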