tactus-0.31.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- tactus/__init__.py +49 -0
- tactus/adapters/__init__.py +9 -0
- tactus/adapters/broker_log.py +76 -0
- tactus/adapters/cli_hitl.py +189 -0
- tactus/adapters/cli_log.py +223 -0
- tactus/adapters/cost_collector_log.py +56 -0
- tactus/adapters/file_storage.py +367 -0
- tactus/adapters/http_callback_log.py +109 -0
- tactus/adapters/ide_log.py +71 -0
- tactus/adapters/lua_tools.py +336 -0
- tactus/adapters/mcp.py +289 -0
- tactus/adapters/mcp_manager.py +196 -0
- tactus/adapters/memory.py +53 -0
- tactus/adapters/plugins.py +419 -0
- tactus/backends/http_backend.py +58 -0
- tactus/backends/model_backend.py +35 -0
- tactus/backends/pytorch_backend.py +110 -0
- tactus/broker/__init__.py +12 -0
- tactus/broker/client.py +247 -0
- tactus/broker/protocol.py +183 -0
- tactus/broker/server.py +1123 -0
- tactus/broker/stdio.py +12 -0
- tactus/cli/__init__.py +7 -0
- tactus/cli/app.py +2245 -0
- tactus/cli/commands/__init__.py +0 -0
- tactus/core/__init__.py +32 -0
- tactus/core/config_manager.py +790 -0
- tactus/core/dependencies/__init__.py +14 -0
- tactus/core/dependencies/registry.py +180 -0
- tactus/core/dsl_stubs.py +2117 -0
- tactus/core/exceptions.py +66 -0
- tactus/core/execution_context.py +480 -0
- tactus/core/lua_sandbox.py +508 -0
- tactus/core/message_history_manager.py +236 -0
- tactus/core/mocking.py +286 -0
- tactus/core/output_validator.py +291 -0
- tactus/core/registry.py +499 -0
- tactus/core/runtime.py +2907 -0
- tactus/core/template_resolver.py +142 -0
- tactus/core/yaml_parser.py +301 -0
- tactus/docker/Dockerfile +61 -0
- tactus/docker/entrypoint.sh +69 -0
- tactus/dspy/__init__.py +39 -0
- tactus/dspy/agent.py +1144 -0
- tactus/dspy/broker_lm.py +181 -0
- tactus/dspy/config.py +212 -0
- tactus/dspy/history.py +196 -0
- tactus/dspy/module.py +405 -0
- tactus/dspy/prediction.py +318 -0
- tactus/dspy/signature.py +185 -0
- tactus/formatting/__init__.py +7 -0
- tactus/formatting/formatter.py +437 -0
- tactus/ide/__init__.py +9 -0
- tactus/ide/coding_assistant.py +343 -0
- tactus/ide/server.py +2223 -0
- tactus/primitives/__init__.py +49 -0
- tactus/primitives/control.py +168 -0
- tactus/primitives/file.py +229 -0
- tactus/primitives/handles.py +378 -0
- tactus/primitives/host.py +94 -0
- tactus/primitives/human.py +342 -0
- tactus/primitives/json.py +189 -0
- tactus/primitives/log.py +187 -0
- tactus/primitives/message_history.py +157 -0
- tactus/primitives/model.py +163 -0
- tactus/primitives/procedure.py +564 -0
- tactus/primitives/procedure_callable.py +318 -0
- tactus/primitives/retry.py +155 -0
- tactus/primitives/session.py +152 -0
- tactus/primitives/state.py +182 -0
- tactus/primitives/step.py +209 -0
- tactus/primitives/system.py +93 -0
- tactus/primitives/tool.py +375 -0
- tactus/primitives/tool_handle.py +279 -0
- tactus/primitives/toolset.py +229 -0
- tactus/protocols/__init__.py +38 -0
- tactus/protocols/chat_recorder.py +81 -0
- tactus/protocols/config.py +97 -0
- tactus/protocols/cost.py +31 -0
- tactus/protocols/hitl.py +71 -0
- tactus/protocols/log_handler.py +27 -0
- tactus/protocols/models.py +355 -0
- tactus/protocols/result.py +33 -0
- tactus/protocols/storage.py +90 -0
- tactus/providers/__init__.py +13 -0
- tactus/providers/base.py +92 -0
- tactus/providers/bedrock.py +117 -0
- tactus/providers/google.py +105 -0
- tactus/providers/openai.py +98 -0
- tactus/sandbox/__init__.py +63 -0
- tactus/sandbox/config.py +171 -0
- tactus/sandbox/container_runner.py +1099 -0
- tactus/sandbox/docker_manager.py +433 -0
- tactus/sandbox/entrypoint.py +227 -0
- tactus/sandbox/protocol.py +213 -0
- tactus/stdlib/__init__.py +10 -0
- tactus/stdlib/io/__init__.py +13 -0
- tactus/stdlib/io/csv.py +88 -0
- tactus/stdlib/io/excel.py +136 -0
- tactus/stdlib/io/file.py +90 -0
- tactus/stdlib/io/fs.py +154 -0
- tactus/stdlib/io/hdf5.py +121 -0
- tactus/stdlib/io/json.py +109 -0
- tactus/stdlib/io/parquet.py +83 -0
- tactus/stdlib/io/tsv.py +88 -0
- tactus/stdlib/loader.py +274 -0
- tactus/stdlib/tac/tactus/tools/done.tac +33 -0
- tactus/stdlib/tac/tactus/tools/log.tac +50 -0
- tactus/testing/README.md +273 -0
- tactus/testing/__init__.py +61 -0
- tactus/testing/behave_integration.py +380 -0
- tactus/testing/context.py +486 -0
- tactus/testing/eval_models.py +114 -0
- tactus/testing/evaluation_runner.py +222 -0
- tactus/testing/evaluators.py +634 -0
- tactus/testing/events.py +94 -0
- tactus/testing/gherkin_parser.py +134 -0
- tactus/testing/mock_agent.py +315 -0
- tactus/testing/mock_dependencies.py +234 -0
- tactus/testing/mock_hitl.py +171 -0
- tactus/testing/mock_registry.py +168 -0
- tactus/testing/mock_tools.py +133 -0
- tactus/testing/models.py +115 -0
- tactus/testing/pydantic_eval_runner.py +508 -0
- tactus/testing/steps/__init__.py +13 -0
- tactus/testing/steps/builtin.py +902 -0
- tactus/testing/steps/custom.py +69 -0
- tactus/testing/steps/registry.py +68 -0
- tactus/testing/test_runner.py +489 -0
- tactus/tracing/__init__.py +5 -0
- tactus/tracing/trace_manager.py +417 -0
- tactus/utils/__init__.py +1 -0
- tactus/utils/cost_calculator.py +72 -0
- tactus/utils/model_pricing.py +132 -0
- tactus/utils/safe_file_library.py +502 -0
- tactus/utils/safe_libraries.py +234 -0
- tactus/validation/LuaLexerBase.py +66 -0
- tactus/validation/LuaParserBase.py +23 -0
- tactus/validation/README.md +224 -0
- tactus/validation/__init__.py +7 -0
- tactus/validation/error_listener.py +21 -0
- tactus/validation/generated/LuaLexer.interp +231 -0
- tactus/validation/generated/LuaLexer.py +5548 -0
- tactus/validation/generated/LuaLexer.tokens +124 -0
- tactus/validation/generated/LuaLexerBase.py +66 -0
- tactus/validation/generated/LuaParser.interp +173 -0
- tactus/validation/generated/LuaParser.py +6439 -0
- tactus/validation/generated/LuaParser.tokens +124 -0
- tactus/validation/generated/LuaParserBase.py +23 -0
- tactus/validation/generated/LuaParserVisitor.py +118 -0
- tactus/validation/generated/__init__.py +7 -0
- tactus/validation/grammar/LuaLexer.g4 +123 -0
- tactus/validation/grammar/LuaParser.g4 +178 -0
- tactus/validation/semantic_visitor.py +817 -0
- tactus/validation/validator.py +157 -0
- tactus-0.31.0.dist-info/METADATA +1809 -0
- tactus-0.31.0.dist-info/RECORD +160 -0
- tactus-0.31.0.dist-info/WHEEL +4 -0
- tactus-0.31.0.dist-info/entry_points.txt +2 -0
- tactus-0.31.0.dist-info/licenses/LICENSE +21 -0

tactus/testing/steps/custom.py

@@ -0,0 +1,69 @@
+"""
+Custom step manager for user-defined Lua step functions.
+"""
+
+import logging
+from typing import Any, Dict
+
+
+logger = logging.getLogger(__name__)
+
+
+class CustomStepManager:
+    """
+    Manages custom Lua step definitions.
+
+    Allows users to define custom steps in their procedure files
+    using the step() function with Lua implementations.
+    """
+
+    def __init__(self, lua_sandbox=None):
+        self.lua_sandbox = lua_sandbox
+        self.custom_steps: Dict[str, Any] = {}
+
+    def register_from_lua(self, step_text: str, lua_function: Any) -> None:
+        """
+        Register a custom step from Lua code.
+
+        Args:
+            step_text: The step text pattern (exact match)
+            lua_function: Lua function reference to execute
+        """
+        self.custom_steps[step_text] = lua_function
+        logger.debug(f"Registered custom step: {step_text}")
+
+    def execute(self, step_text: str, context: Any) -> bool:
+        """
+        Execute custom Lua step if it exists.
+
+        Args:
+            step_text: The step text to match
+            context: Test context object
+
+        Returns:
+            True if step was found and executed, False otherwise
+        """
+        if step_text in self.custom_steps:
+            lua_func = self.custom_steps[step_text]
+            try:
+                # Call Lua function with context
+                # The Lua function should perform assertions
+                lua_func(context)
+                return True
+            except Exception as e:
+                logger.error(f"Custom step '{step_text}' failed: {e}")
+                raise AssertionError(f"Custom step failed: {e}")
+
+        return False
+
+    def has_step(self, step_text: str) -> bool:
+        """Check if custom step exists."""
+        return step_text in self.custom_steps
+
+    def get_all_steps(self) -> list[str]:
+        """Get all registered custom step texts."""
+        return list(self.custom_steps.keys())
+
+    def clear(self) -> None:
+        """Clear all custom steps."""
+        self.custom_steps.clear()
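
The manager stores Lua callables keyed by exact step text and surfaces failures as AssertionError. A minimal usage sketch, assuming a plain-Python callable as a stand-in for the Lua function reference (in Tactus the callable would come from a procedure's step() definition; the step text below is illustrative):

from tactus.testing.steps.custom import CustomStepManager

manager = CustomStepManager()

# Hypothetical stand-in for a Lua function reference: any callable
# taking the test context works, and it signals failure by raising.
def greeting_step(context):
    assert context is not None

manager.register_from_lua("the output contains a greeting", greeting_step)

assert manager.has_step("the output contains a greeting")
manager.execute("the output contains a greeting", context=object())  # returns True
manager.execute("some unregistered step", context=object())          # returns False
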
tactus/testing/steps/registry.py

@@ -0,0 +1,68 @@
+"""
+Step registry for pattern matching and execution.
+"""
+
+import re
+import logging
+from typing import Callable, Dict, Optional, Pattern, Tuple
+
+
+logger = logging.getLogger(__name__)
+
+
+class StepRegistry:
+    """
+    Registry of step definitions with regex pattern matching.
+
+    Matches step text against registered patterns and executes
+    the corresponding step functions.
+    """
+
+    def __init__(self):
+        self._steps: Dict[Pattern, Callable] = {}
+        self._step_patterns: Dict[str, Pattern] = {}  # pattern_str -> compiled pattern
+
+    def register(self, pattern: str, func: Callable, step_type: str = "any") -> None:
+        """
+        Register a step with regex pattern.
+
+        Args:
+            pattern: Regex pattern to match step text
+            func: Function to execute when pattern matches
+            step_type: Type of step (given, when, then, any)
+        """
+        try:
+            compiled = re.compile(pattern, re.IGNORECASE)
+            self._steps[compiled] = func
+            self._step_patterns[pattern] = compiled
+            logger.debug(f"Registered step pattern: {pattern}")
+        except re.error as e:
+            logger.error(f"Invalid regex pattern '{pattern}': {e}")
+            raise ValueError(f"Invalid step pattern: {e}")
+
+    def match(self, step_text: str) -> Optional[Tuple[Callable, dict]]:
+        """
+        Find matching step function for given step text.
+
+        Args:
+            step_text: The step text to match
+
+        Returns:
+            Tuple of (function, match_groups) or None if no match
+        """
+        for pattern, func in self._steps.items():
+            match = pattern.match(step_text)
+            if match:
+                # Return function and captured groups as dict
+                return func, match.groupdict()
+
+        return None
+
+    def get_all_patterns(self) -> list[str]:
+        """Get all registered pattern strings."""
+        return list(self._step_patterns.keys())
+
+    def clear(self) -> None:
+        """Clear all registered steps."""
+        self._steps.clear()
+        self._step_patterns.clear()
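
Unlike CustomStepManager's exact-match lookup, StepRegistry matches case-insensitively against compiled regexes and hands back named capture groups. A short sketch under that assumption (the pattern and handler below are illustrative, not steps shipped with the package):

from tactus.testing.steps.registry import StepRegistry

registry = StepRegistry()

def check_status(context, code):
    # match() returns match.groupdict(), so named groups arrive as strings
    assert code == "200"

# The named group (?P<code>...) is what match() hands back
registry.register(r'the response status is "(?P<code>\d+)"', check_status)

matched = registry.match('The response status is "200"')  # IGNORECASE applies
if matched:
    func, groups = matched  # (check_status, {"code": "200"})
    func(context=None, **groups)
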
tactus/testing/test_runner.py

@@ -0,0 +1,489 @@
+"""
+Test runner for Tactus BDD testing.
+
+Runs tests with parallel scenario execution using multiprocessing.
+"""
+
+import importlib.util
+import logging
+import os
+import sys
+import subprocess
+import multiprocessing
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from .models import (
+    ParsedFeature,
+    ScenarioResult,
+    StepResult,
+    FeatureResult,
+    TestResult,
+)
+from .gherkin_parser import GherkinParser
+from .behave_integration import setup_behave_directory
+from .steps.registry import StepRegistry
+from .steps.builtin import register_builtin_steps
+from .steps.custom import CustomStepManager
+
+BEHAVE_AVAILABLE = importlib.util.find_spec("behave") is not None
+
+
+logger = logging.getLogger(__name__)
+
+
+class TactusTestRunner:
+    """
+    Runs Tactus BDD tests with parallel scenario execution.
+
+    Parses Gherkin specifications, generates Behave files,
+    and executes scenarios in parallel for performance.
+    """
+
+    def __init__(
+        self,
+        procedure_file: Path,
+        mock_tools: Optional[Dict] = None,
+        params: Optional[Dict] = None,
+        mcp_servers: Optional[Dict] = None,
+        tool_paths: Optional[List[str]] = None,
+        mocked: bool = False,
+    ):
+        if not BEHAVE_AVAILABLE:
+            raise ImportError("behave library not installed. Install with: pip install behave")
+
+        self.procedure_file = procedure_file
+        self.mock_tools = mock_tools or {}
+        self.params = params or {}
+        self.mcp_servers = mcp_servers or {}
+        self.tool_paths = tool_paths or []
+        self.mocked = mocked  # Whether to use mocked dependencies
+        self.work_dir: Optional[Path] = None
+        self.parsed_feature: Optional[ParsedFeature] = None
+        self.step_registry = StepRegistry()
+        self.custom_steps = CustomStepManager()
+        self.generated_step_file: Optional[Path] = None
+
+        # Register built-in steps
+        register_builtin_steps(self.step_registry)
+
+    def setup(self, gherkin_text: str) -> None:
+        """
+        Setup test environment from Gherkin text.
+
+        Args:
+            gherkin_text: Raw Gherkin feature text
+        """
+        # Parse Gherkin
+        parser = GherkinParser()
+        self.parsed_feature = parser.parse(gherkin_text)
+
+        # Setup Behave directory with mock tools, params, and mocked flag
+        self.work_dir = setup_behave_directory(
+            self.parsed_feature,
+            self.step_registry,
+            self.custom_steps,
+            self.procedure_file,
+            mock_tools=self.mock_tools,
+            params=self.params,
+            mcp_servers=self.mcp_servers,
+            tool_paths=self.tool_paths,
+            mocked=self.mocked,
+        )
+
+        # Track the generated step file for cleanup
+        # The step file name is based on the work_dir hash
+        import hashlib
+
+        dir_hash = hashlib.md5(str(self.work_dir).encode()).hexdigest()[:8]
+        self.generated_step_file = self.work_dir / "steps" / f"tactus_steps_{dir_hash}.py"
+
+        logger.info(f"Test setup complete for feature: {self.parsed_feature.name}")
+
+    def run_tests(self, parallel: bool = True, scenario_filter: Optional[str] = None) -> TestResult:
+        """
+        Run all scenarios (optionally in parallel).
+
+        Args:
+            parallel: Whether to run scenarios in parallel
+            scenario_filter: Optional scenario name to run (runs only that scenario)
+
+        Returns:
+            TestResult with all scenario results
+        """
+        if not self.parsed_feature or not self.work_dir:
+            raise RuntimeError("Must call setup() before run_tests()")
+
+        # Get scenarios to run
+        scenarios = self.parsed_feature.scenarios
+        if scenario_filter:
+            scenarios = [s for s in scenarios if s.name == scenario_filter]
+            if not scenarios:
+                raise ValueError(f"Scenario not found: {scenario_filter}")
+
+        # Run scenarios
+        if parallel and len(scenarios) > 1:
+            # Run in parallel using 'spawn' to avoid Behave global state conflicts
+            # 'spawn' creates fresh Python interpreters for each worker
+            ctx = multiprocessing.get_context("spawn")
+            with ctx.Pool(processes=min(len(scenarios), os.cpu_count() or 1)) as pool:
+                scenario_results = pool.starmap(
+                    self._run_single_scenario, [(s.name, str(self.work_dir)) for s in scenarios]
+                )
+        else:
+            # Run sequentially
+            scenario_results = [
+                self._run_single_scenario(s.name, str(self.work_dir)) for s in scenarios
+            ]
+
+        # Build feature result
+        feature_result = self._build_feature_result(scenario_results)
+
+        # Build test result
+        return self._build_test_result([feature_result])
+
+    @staticmethod
+    def _run_single_scenario(scenario_name: str, work_dir: str) -> ScenarioResult:
+        """
+        Run a single scenario in subprocess to avoid event loop conflicts.
+
+        Args:
+            scenario_name: Name of scenario to run
+            work_dir: Path to Behave work directory
+
+        Returns:
+            ScenarioResult
+        """
+        # Create tag filter for this scenario
+        # Remove special characters that could interfere with behave tags
+        import re
+
+        sanitized_name = re.sub(r"[^a-z0-9_]", "_", scenario_name.lower())
+        sanitized_name = re.sub(r"_+", "_", sanitized_name)  # Collapse multiple underscores
+        tag_filter = f"scenario_{sanitized_name}"
+
+        # Use unique results file to avoid conflicts when running in parallel
+        import uuid
+
+        results_filename = f"results_{uuid.uuid4().hex[:8]}.json"
+
+        # Run behave in subprocess to isolate event loops
+        cmd = [
+            sys.executable,
+            "-m",
+            "behave",
+            str(work_dir),
+            "--tags",
+            tag_filter,
+            "--no-capture",
+            "--format",
+            "json",
+            "--outfile",
+            f"{work_dir}/{results_filename}",
+        ]
+
+        logger.debug(f"Running behave subprocess: {' '.join(cmd)}")
+
+        try:
+            # Ensure tactus module is importable in subprocess
+            env = os.environ.copy()
+            # Add parent directory to PYTHONPATH so tactus can be imported
+            project_root = Path(__file__).parent.parent.parent  # Go up to project root
+            if "PYTHONPATH" in env:
+                env["PYTHONPATH"] = f"{project_root}:{env['PYTHONPATH']}"
+            else:
+                env["PYTHONPATH"] = str(project_root)
+
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=600,  # 10 minute timeout
+                cwd=work_dir,
+                env=env,
+            )
+
+            # Check if behave ran successfully (even if tests failed)
+            if result.returncode not in [0, 1]:
+                # Return code 0 = all passed, 1 = some failed, other = error
+                raise RuntimeError(
+                    f"Behave subprocess failed with return code {result.returncode}\n"
+                    f"STDOUT: {result.stdout}\n"
+                    f"STDERR: {result.stderr}"
+                )
+
+            # Parse JSON results
+            import json
+
+            results_file = Path(work_dir) / results_filename
+            if not results_file.exists():
+                raise RuntimeError(
+                    f"Behave results file not found: {results_file}\n"
+                    f"Command: {' '.join(cmd)}\n"
+                    f"Return code: {result.returncode}\n"
+                    f"STDOUT: {result.stdout}\n"
+                    f"STDERR: {result.stderr}"
+                )
+
+            # Behave may write multiple JSON objects (one per feature run)
+            # We need to parse them separately and combine
+            with open(results_file) as f:
+                content = f.read().strip()
+                if not content:
+                    raise RuntimeError("Behave results file is empty")
+
+            # Try to parse as single JSON first
+            try:
+                behave_results = json.loads(content)
+            except json.JSONDecodeError:
+                # Multiple JSON objects - split and parse each
+                behave_results = []
+                for line in content.split("\n"):
+                    line = line.strip()
+                    if line:
+                        try:
+                            obj = json.loads(line)
+                            if isinstance(obj, list):
+                                behave_results.extend(obj)
+                            else:
+                                behave_results.append(obj)
+                        except json.JSONDecodeError:
+                            continue
+
+            # Extract the scenario result
+            found_scenarios = []
+            for feature_data in behave_results:
+                for element in feature_data.get("elements", []):
+                    element_name = element.get("name")
+                    found_scenarios.append(element_name)
+                    if element_name == scenario_name:
+                        scenario_result = TactusTestRunner._convert_json_scenario_result(element)
+                        # Clean up results file
+                        try:
+                            results_file.unlink()
+                        except Exception:
+                            pass
+                        return scenario_result
+
+            # Scenario not found (shouldn't happen)
+            raise RuntimeError(
+                f"Scenario '{scenario_name}' not found in Behave JSON results. "
+                f"Found scenarios: {found_scenarios}. "
+                f"Tag filter used: scenario_{scenario_name.lower().replace(' ', '_')}. "
+                f"Command: {' '.join(cmd)}. "
+                f"Behave output: {result.stdout[:500]}"
+            )
+
+        except subprocess.TimeoutExpired:
+            raise RuntimeError(f"Scenario '{scenario_name}' timed out after 10 minutes")
+        except Exception as e:
+            logger.error(f"Error running scenario '{scenario_name}': {e}", exc_info=True)
+            raise
+
+    @staticmethod
+    def _convert_json_scenario_result(scenario_data: Dict[str, Any]) -> ScenarioResult:
+        """Convert Behave JSON scenario data to ScenarioResult."""
+        steps = []
+        for step_data in scenario_data.get("steps", []):
+            result = step_data.get("result", {})
+            # error_message might be a list (behave can return traceback as list)
+            # Convert to string if needed
+            error_msg = result.get("error_message")
+            if isinstance(error_msg, list):
+                error_msg = "\n".join(str(e) for e in error_msg)
+            elif error_msg is not None and not isinstance(error_msg, str):
+                error_msg = str(error_msg)
+
+            steps.append(
+                StepResult(
+                    keyword=step_data.get("keyword", ""),
+                    message=step_data.get("name", ""),
+                    status=result.get("status", "skipped"),
+                    duration=result.get("duration", 0.0),
+                    error_message=error_msg,
+                )
+            )
+
+        # Calculate total duration from steps
+        total_duration = sum(s.duration for s in steps)
+
+        # Extract tags
+        tags = [tag.replace("@", "") for tag in scenario_data.get("tags", [])]
+
+        # Extract execution metrics from scenario properties (if attached by hook)
+        # These would be in the scenario_data dict if the hook sets them
+        total_cost = scenario_data.get("total_cost", 0.0)
+        total_tokens = scenario_data.get("total_tokens", 0)
+        iterations = scenario_data.get("iterations", 0)
+        tools_used = scenario_data.get("tools_used", [])
+        llm_calls = scenario_data.get("llm_calls", 0)
+
+        # Determine overall status
+        status = scenario_data.get("status", "passed")
+
+        return ScenarioResult(
+            name=scenario_data.get("name", ""),
+            status=status,
+            duration=total_duration,
+            steps=steps,
+            tags=tags,
+            timestamp=datetime.now(),
+            total_cost=total_cost,
+            total_tokens=total_tokens,
+            iterations=iterations,
+            tools_used=tools_used,
+            llm_calls=llm_calls,
+        )
+
+    @staticmethod
+    def _convert_scenario_result(behave_scenario) -> ScenarioResult:
+        """Convert Behave scenario object to ScenarioResult (legacy method)."""
+        steps = []
+        for behave_step in behave_scenario.steps:
+            steps.append(
+                StepResult(
+                    keyword=behave_step.keyword,
+                    message=behave_step.name,
+                    status=behave_step.status.name,
+                    duration=behave_step.duration,
+                    error_message=(
+                        behave_step.error_message if hasattr(behave_step, "error_message") else None
+                    ),
+                )
+            )
+
+        # Extract execution metrics (attached by after_scenario hook)
+        total_cost = getattr(behave_scenario, "total_cost", 0.0)
+        total_tokens = getattr(behave_scenario, "total_tokens", 0)
+        cost_breakdown = getattr(behave_scenario, "cost_breakdown", [])
+        # iterations is a method, not an attribute - call it if it exists
+        iterations_attr = getattr(behave_scenario, "iterations", None)
+        iterations = iterations_attr() if callable(iterations_attr) else 0
+        tools_used = getattr(behave_scenario, "tools_used", [])
+        llm_calls = len(cost_breakdown)  # Number of LLM calls = number of cost events
+
+        return ScenarioResult(
+            name=behave_scenario.name,
+            status=behave_scenario.status.name,
+            duration=behave_scenario.duration,
+            steps=steps,
+            tags=behave_scenario.tags,
+            timestamp=datetime.now(),
+            total_cost=total_cost,
+            total_tokens=total_tokens,
+            iterations=iterations,
+            tools_used=tools_used,
+            llm_calls=llm_calls,
+        )
+
+    def _build_feature_result(self, scenario_results: List[ScenarioResult]) -> FeatureResult:
+        """Build FeatureResult from scenario results."""
+        if not self.parsed_feature:
+            raise RuntimeError("No parsed feature available")
+
+        # Calculate feature status
+        all_passed = all(s.status == "passed" for s in scenario_results)
+        any_failed = any(s.status == "failed" for s in scenario_results)
+        status = "passed" if all_passed else ("failed" if any_failed else "skipped")
+
+        # Calculate total duration
+        total_duration = sum(s.duration for s in scenario_results)
+
+        return FeatureResult(
+            name=self.parsed_feature.name,
+            description=self.parsed_feature.description,
+            status=status,
+            duration=total_duration,
+            scenarios=scenario_results,
+            tags=self.parsed_feature.tags,
+        )
+
+    def _build_test_result(self, feature_results: List[FeatureResult]) -> TestResult:
+        """Build TestResult from feature results."""
+        total_scenarios = sum(len(f.scenarios) for f in feature_results)
+        passed_scenarios = sum(
+            1 for f in feature_results for s in f.scenarios if s.status == "passed"
+        )
+        failed_scenarios = total_scenarios - passed_scenarios
+        total_duration = sum(f.duration for f in feature_results)
+
+        # Aggregate execution metrics across all scenarios
+        total_cost = sum(s.total_cost for f in feature_results for s in f.scenarios)
+        total_tokens = sum(s.total_tokens for f in feature_results for s in f.scenarios)
+        total_iterations = sum(s.iterations for f in feature_results for s in f.scenarios)
+        total_llm_calls = sum(s.llm_calls for f in feature_results for s in f.scenarios)
+
+        # Collect unique tools used across all scenarios
+        all_tools = set()
+        for f in feature_results:
+            for s in f.scenarios:
+                all_tools.update(s.tools_used)
+        unique_tools_used = sorted(list(all_tools))
+
+        return TestResult(
+            features=feature_results,
+            total_scenarios=total_scenarios,
+            passed_scenarios=passed_scenarios,
+            failed_scenarios=failed_scenarios,
+            total_duration=total_duration,
+            total_cost=total_cost,
+            total_tokens=total_tokens,
+            total_iterations=total_iterations,
+            total_llm_calls=total_llm_calls,
+            unique_tools_used=unique_tools_used,
+        )
+
+    def cleanup(self) -> None:
+        """
+        Cleanup temporary files and clear Behave state.
+
+        This removes:
+        - The temporary work directory
+        - Generated step modules from sys.modules
+        - Behave's global step registry
+        """
+        import sys
+        import importlib
+
+        # Clear the generated step module from sys.modules
+        if self.generated_step_file:
+            # Compute the module name that Python would use
+            # The step file is in work_dir/steps/tactus_steps_<hash>.py
+            # Python imports it as "steps.tactus_steps_<hash>"
+            step_module_name = f"steps.{self.generated_step_file.stem}"
+
+            # Remove all variations of this module name
+            modules_to_clear = [
+                m
+                for m in list(sys.modules.keys())
+                if step_module_name in m or self.generated_step_file.stem in m
+            ]
+
+            for mod in modules_to_clear:
+                del sys.modules[mod]
+                logger.debug(f"Cleared module from sys.modules: {mod}")
+
+        # Clear Behave's global step registry IN-PLACE
+        # IMPORTANT: We call registry.clear() instead of creating a new registry
+        # because the @step decorator has a closure reference to the registry object.
+        try:
+            import behave.step_registry
+
+            behave.step_registry.registry.clear()
+            logger.debug("Cleared Behave step registry")
+        except ImportError:
+            pass
+
+        # Invalidate import caches
+        importlib.invalidate_caches()
+
+        # Clean up work directory
+        if self.work_dir and self.work_dir.exists():
+            import shutil
+
+            try:
+                shutil.rmtree(self.work_dir)
+                logger.debug(f"Cleaned up work directory: {self.work_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to cleanup work directory: {e}")
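
End to end, the intended call sequence is setup(), then run_tests(), then cleanup(); cleanup() matters even on failure because Behave's step registry and the generated step modules are process-global. A hedged sketch of that flow (the feature text and procedure path are placeholders, not files shipped in this package):

from pathlib import Path
from tactus.testing.test_runner import TactusTestRunner

# Placeholder Gherkin; real features come from the procedure's spec
gherkin_text = """
Feature: Greeting procedure
  Scenario: Says hello
    When the procedure runs
    Then it should succeed
"""

runner = TactusTestRunner(
    procedure_file=Path("procedures/greet.tac"),  # placeholder path
    mocked=True,  # run against mocked dependencies instead of live calls
)
try:
    runner.setup(gherkin_text)                # parse Gherkin, generate Behave dir
    result = runner.run_tests(parallel=True)  # behave subprocess per scenario
    print(f"{result.passed_scenarios}/{result.total_scenarios} scenarios passed")
finally:
    runner.cleanup()  # remove work dir, clear Behave's global registry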