tactus-0.31.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +49 -0
- tactus/adapters/__init__.py +9 -0
- tactus/adapters/broker_log.py +76 -0
- tactus/adapters/cli_hitl.py +189 -0
- tactus/adapters/cli_log.py +223 -0
- tactus/adapters/cost_collector_log.py +56 -0
- tactus/adapters/file_storage.py +367 -0
- tactus/adapters/http_callback_log.py +109 -0
- tactus/adapters/ide_log.py +71 -0
- tactus/adapters/lua_tools.py +336 -0
- tactus/adapters/mcp.py +289 -0
- tactus/adapters/mcp_manager.py +196 -0
- tactus/adapters/memory.py +53 -0
- tactus/adapters/plugins.py +419 -0
- tactus/backends/http_backend.py +58 -0
- tactus/backends/model_backend.py +35 -0
- tactus/backends/pytorch_backend.py +110 -0
- tactus/broker/__init__.py +12 -0
- tactus/broker/client.py +247 -0
- tactus/broker/protocol.py +183 -0
- tactus/broker/server.py +1123 -0
- tactus/broker/stdio.py +12 -0
- tactus/cli/__init__.py +7 -0
- tactus/cli/app.py +2245 -0
- tactus/cli/commands/__init__.py +0 -0
- tactus/core/__init__.py +32 -0
- tactus/core/config_manager.py +790 -0
- tactus/core/dependencies/__init__.py +14 -0
- tactus/core/dependencies/registry.py +180 -0
- tactus/core/dsl_stubs.py +2117 -0
- tactus/core/exceptions.py +66 -0
- tactus/core/execution_context.py +480 -0
- tactus/core/lua_sandbox.py +508 -0
- tactus/core/message_history_manager.py +236 -0
- tactus/core/mocking.py +286 -0
- tactus/core/output_validator.py +291 -0
- tactus/core/registry.py +499 -0
- tactus/core/runtime.py +2907 -0
- tactus/core/template_resolver.py +142 -0
- tactus/core/yaml_parser.py +301 -0
- tactus/docker/Dockerfile +61 -0
- tactus/docker/entrypoint.sh +69 -0
- tactus/dspy/__init__.py +39 -0
- tactus/dspy/agent.py +1144 -0
- tactus/dspy/broker_lm.py +181 -0
- tactus/dspy/config.py +212 -0
- tactus/dspy/history.py +196 -0
- tactus/dspy/module.py +405 -0
- tactus/dspy/prediction.py +318 -0
- tactus/dspy/signature.py +185 -0
- tactus/formatting/__init__.py +7 -0
- tactus/formatting/formatter.py +437 -0
- tactus/ide/__init__.py +9 -0
- tactus/ide/coding_assistant.py +343 -0
- tactus/ide/server.py +2223 -0
- tactus/primitives/__init__.py +49 -0
- tactus/primitives/control.py +168 -0
- tactus/primitives/file.py +229 -0
- tactus/primitives/handles.py +378 -0
- tactus/primitives/host.py +94 -0
- tactus/primitives/human.py +342 -0
- tactus/primitives/json.py +189 -0
- tactus/primitives/log.py +187 -0
- tactus/primitives/message_history.py +157 -0
- tactus/primitives/model.py +163 -0
- tactus/primitives/procedure.py +564 -0
- tactus/primitives/procedure_callable.py +318 -0
- tactus/primitives/retry.py +155 -0
- tactus/primitives/session.py +152 -0
- tactus/primitives/state.py +182 -0
- tactus/primitives/step.py +209 -0
- tactus/primitives/system.py +93 -0
- tactus/primitives/tool.py +375 -0
- tactus/primitives/tool_handle.py +279 -0
- tactus/primitives/toolset.py +229 -0
- tactus/protocols/__init__.py +38 -0
- tactus/protocols/chat_recorder.py +81 -0
- tactus/protocols/config.py +97 -0
- tactus/protocols/cost.py +31 -0
- tactus/protocols/hitl.py +71 -0
- tactus/protocols/log_handler.py +27 -0
- tactus/protocols/models.py +355 -0
- tactus/protocols/result.py +33 -0
- tactus/protocols/storage.py +90 -0
- tactus/providers/__init__.py +13 -0
- tactus/providers/base.py +92 -0
- tactus/providers/bedrock.py +117 -0
- tactus/providers/google.py +105 -0
- tactus/providers/openai.py +98 -0
- tactus/sandbox/__init__.py +63 -0
- tactus/sandbox/config.py +171 -0
- tactus/sandbox/container_runner.py +1099 -0
- tactus/sandbox/docker_manager.py +433 -0
- tactus/sandbox/entrypoint.py +227 -0
- tactus/sandbox/protocol.py +213 -0
- tactus/stdlib/__init__.py +10 -0
- tactus/stdlib/io/__init__.py +13 -0
- tactus/stdlib/io/csv.py +88 -0
- tactus/stdlib/io/excel.py +136 -0
- tactus/stdlib/io/file.py +90 -0
- tactus/stdlib/io/fs.py +154 -0
- tactus/stdlib/io/hdf5.py +121 -0
- tactus/stdlib/io/json.py +109 -0
- tactus/stdlib/io/parquet.py +83 -0
- tactus/stdlib/io/tsv.py +88 -0
- tactus/stdlib/loader.py +274 -0
- tactus/stdlib/tac/tactus/tools/done.tac +33 -0
- tactus/stdlib/tac/tactus/tools/log.tac +50 -0
- tactus/testing/README.md +273 -0
- tactus/testing/__init__.py +61 -0
- tactus/testing/behave_integration.py +380 -0
- tactus/testing/context.py +486 -0
- tactus/testing/eval_models.py +114 -0
- tactus/testing/evaluation_runner.py +222 -0
- tactus/testing/evaluators.py +634 -0
- tactus/testing/events.py +94 -0
- tactus/testing/gherkin_parser.py +134 -0
- tactus/testing/mock_agent.py +315 -0
- tactus/testing/mock_dependencies.py +234 -0
- tactus/testing/mock_hitl.py +171 -0
- tactus/testing/mock_registry.py +168 -0
- tactus/testing/mock_tools.py +133 -0
- tactus/testing/models.py +115 -0
- tactus/testing/pydantic_eval_runner.py +508 -0
- tactus/testing/steps/__init__.py +13 -0
- tactus/testing/steps/builtin.py +902 -0
- tactus/testing/steps/custom.py +69 -0
- tactus/testing/steps/registry.py +68 -0
- tactus/testing/test_runner.py +489 -0
- tactus/tracing/__init__.py +5 -0
- tactus/tracing/trace_manager.py +417 -0
- tactus/utils/__init__.py +1 -0
- tactus/utils/cost_calculator.py +72 -0
- tactus/utils/model_pricing.py +132 -0
- tactus/utils/safe_file_library.py +502 -0
- tactus/utils/safe_libraries.py +234 -0
- tactus/validation/LuaLexerBase.py +66 -0
- tactus/validation/LuaParserBase.py +23 -0
- tactus/validation/README.md +224 -0
- tactus/validation/__init__.py +7 -0
- tactus/validation/error_listener.py +21 -0
- tactus/validation/generated/LuaLexer.interp +231 -0
- tactus/validation/generated/LuaLexer.py +5548 -0
- tactus/validation/generated/LuaLexer.tokens +124 -0
- tactus/validation/generated/LuaLexerBase.py +66 -0
- tactus/validation/generated/LuaParser.interp +173 -0
- tactus/validation/generated/LuaParser.py +6439 -0
- tactus/validation/generated/LuaParser.tokens +124 -0
- tactus/validation/generated/LuaParserBase.py +23 -0
- tactus/validation/generated/LuaParserVisitor.py +118 -0
- tactus/validation/generated/__init__.py +7 -0
- tactus/validation/grammar/LuaLexer.g4 +123 -0
- tactus/validation/grammar/LuaParser.g4 +178 -0
- tactus/validation/semantic_visitor.py +817 -0
- tactus/validation/validator.py +157 -0
- tactus-0.31.2.dist-info/METADATA +1809 -0
- tactus-0.31.2.dist-info/RECORD +160 -0
- tactus-0.31.2.dist-info/WHEEL +4 -0
- tactus-0.31.2.dist-info/entry_points.txt +2 -0
- tactus-0.31.2.dist-info/licenses/LICENSE +21 -0
tactus/testing/evaluation_runner.py
@@ -0,0 +1,222 @@
+"""
+Evaluation runner for Tactus BDD testing.
+
+Runs scenarios multiple times in parallel to measure consistency and reliability.
+"""
+
+import logging
+import os
+import statistics
+import multiprocessing
+from collections import Counter
+from typing import List
+
+from .models import ScenarioResult, EvaluationResult
+from .test_runner import TactusTestRunner
+
+
+logger = logging.getLogger(__name__)
+
+
+class TactusEvaluationRunner(TactusTestRunner):
+    """
+    Runs Tactus BDD evaluations with multiple iterations per scenario.
+
+    Extends TactusTestRunner to run scenarios multiple times and
+    calculate consistency and reliability metrics.
+    """
+
+    def evaluate_all(
+        self,
+        runs: int = 10,
+        parallel: bool = True,
+    ) -> List[EvaluationResult]:
+        """
+        Evaluate all scenarios with N runs each.
+
+        Args:
+            runs: Number of times to run each scenario
+            parallel: Whether to run iterations in parallel
+
+        Returns:
+            List of EvaluationResult, one per scenario
+        """
+        if not self.parsed_feature or not self.work_dir:
+            raise RuntimeError("Must call setup() before evaluate_all()")
+
+        results = []
+        for scenario in self.parsed_feature.scenarios:
+            eval_result = self._evaluate_scenario(
+                scenario.name,
+                runs,
+                parallel,
+            )
+            results.append(eval_result)
+
+        return results
+
+    def evaluate_scenario(
+        self,
+        scenario_name: str,
+        runs: int = 10,
+        parallel: bool = True,
+    ) -> EvaluationResult:
+        """
+        Evaluate a single scenario with N runs.
+
+        Args:
+            scenario_name: Name of scenario to evaluate
+            runs: Number of times to run the scenario
+            parallel: Whether to run iterations in parallel
+
+        Returns:
+            EvaluationResult with consistency metrics
+        """
+        if not self.work_dir:
+            raise RuntimeError("Must call setup() before evaluate_scenario()")
+
+        return self._evaluate_scenario(scenario_name, runs, parallel)
+
+    def _evaluate_scenario(
+        self,
+        scenario_name: str,
+        runs: int,
+        parallel: bool,
+    ) -> EvaluationResult:
+        """
+        Run single scenario N times and calculate metrics.
+
+        Args:
+            scenario_name: Name of scenario to evaluate
+            runs: Number of iterations
+            parallel: Whether to run in parallel
+
+        Returns:
+            EvaluationResult with all metrics
+        """
+        logger.info(f"Evaluating scenario '{scenario_name}' with {runs} runs")
+
+        # Run scenario N times
+        if parallel:
+            workers = min(runs, os.cpu_count() or 1)
+            # Use 'spawn' to avoid Behave global state conflicts
+            ctx = multiprocessing.get_context("spawn")
+            with ctx.Pool(processes=workers) as pool:
+                iteration_args = [(scenario_name, str(self.work_dir), i) for i in range(runs)]
+                results = pool.starmap(self._run_single_iteration, iteration_args)
+        else:
+            results = [
+                self._run_single_iteration(scenario_name, str(self.work_dir), i)
+                for i in range(runs)
+            ]
+
+        # Calculate metrics
+        return self._calculate_metrics(scenario_name, results)
+
+    @staticmethod
+    def _run_single_iteration(
+        scenario_name: str,
+        work_dir: str,
+        iteration: int,
+    ) -> ScenarioResult:
+        """
+        Run one iteration of a scenario (called in subprocess).
+
+        Args:
+            scenario_name: Name of scenario to run
+            work_dir: Path to Behave work directory
+            iteration: Iteration number (for tracking)
+
+        Returns:
+            ScenarioResult with iteration number
+        """
+        result = TactusTestRunner._run_single_scenario(scenario_name, work_dir)
+        result.iteration = iteration
+        return result
+
+    def _calculate_metrics(
+        self,
+        scenario_name: str,
+        results: List[ScenarioResult],
+    ) -> EvaluationResult:
+        """
+        Calculate consistency and reliability metrics.
+
+        Args:
+            scenario_name: Name of scenario
+            results: List of ScenarioResult from all runs
+
+        Returns:
+            EvaluationResult with all metrics
+        """
+        total_runs = len(results)
+        passed_runs = sum(1 for r in results if r.status == "passed")
+        failed_runs = total_runs - passed_runs
+
+        # Success rate
+        success_rate = passed_runs / total_runs if total_runs > 0 else 0.0
+
+        # Timing statistics
+        durations = [r.duration for r in results]
+        mean_duration = statistics.mean(durations) if durations else 0.0
+        median_duration = statistics.median(durations) if durations else 0.0
+        stddev_duration = statistics.stdev(durations) if len(durations) > 1 else 0.0
+
+        # Consistency score - compare step outcomes
+        consistency_score = self._calculate_consistency(results)
+
+        # Flakiness detection
+        is_flaky = 0 < passed_runs < total_runs
+
+        logger.info(
+            f"Scenario '{scenario_name}': "
+            f"Success rate: {success_rate:.1%}, "
+            f"Consistency: {consistency_score:.1%}, "
+            f"Flaky: {is_flaky}"
+        )
+
+        return EvaluationResult(
+            scenario_name=scenario_name,
+            total_runs=total_runs,
+            passed_runs=passed_runs,
+            failed_runs=failed_runs,
+            success_rate=success_rate,
+            mean_duration=mean_duration,
+            median_duration=median_duration,
+            stddev_duration=stddev_duration,
+            consistency_score=consistency_score,
+            is_flaky=is_flaky,
+            individual_results=results,
+        )
+
+    def _calculate_consistency(self, results: List[ScenarioResult]) -> float:
+        """
+        Calculate consistency by comparing step outcomes.
+
+        Consistency score measures how often the scenario produces
+        identical step-by-step behavior across runs.
+
+        1.0 = all runs had identical step outcomes
+        0.0 = completely inconsistent
+
+        Args:
+            results: List of ScenarioResult
+
+        Returns:
+            Consistency score between 0.0 and 1.0
+        """
+        if not results:
+            return 0.0
+
+        # Create signature for each run (step statuses)
+        signatures = []
+        for result in results:
+            sig = tuple((step.keyword, step.message, step.status) for step in result.steps)
+            signatures.append(sig)
+
+        # Count most common signature
+        signature_counts = Counter(signatures)
+        most_common_count = signature_counts.most_common(1)[0][1]
+
+        # Consistency is the fraction of runs that match the most common pattern
+        return most_common_count / len(results)
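
For orientation, a minimal driver sketch follows; it is not part of the package. The constructor and setup() arguments are assumptions (neither appears in this diff), and the feature file path is hypothetical; the evaluate_all() signature and the EvaluationResult fields are taken from the module above.

# Illustrative sketch only -- not shipped in the wheel. Construction and
# setup() arguments are assumptions; method names and result fields come
# from tactus/testing/evaluation_runner.py above.
from tactus.testing.evaluation_runner import TactusEvaluationRunner

runner = TactusEvaluationRunner()              # assumed: no-argument construction
runner.setup("features/checkout.feature")      # assumed signature; setup() is required before evaluate_all()

# evaluate_all() runs every scenario `runs` times (via spawn-based
# multiprocessing when parallel=True) and returns one EvaluationResult each.
for ev in runner.evaluate_all(runs=10, parallel=True):
    print(
        f"{ev.scenario_name}: "
        f"pass rate {ev.success_rate:.1%}, "
        f"consistency {ev.consistency_score:.1%}, "
        f"flaky={ev.is_flaky}"
    )

Note that success_rate and consistency_score measure different things: a scenario can pass every run (success_rate 1.0) yet follow different step paths (consistency_score below 1.0), while is_flaky flags only the mixed pass/fail case.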