tactus 0.31.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160):
  1. tactus/__init__.py +49 -0
  2. tactus/adapters/__init__.py +9 -0
  3. tactus/adapters/broker_log.py +76 -0
  4. tactus/adapters/cli_hitl.py +189 -0
  5. tactus/adapters/cli_log.py +223 -0
  6. tactus/adapters/cost_collector_log.py +56 -0
  7. tactus/adapters/file_storage.py +367 -0
  8. tactus/adapters/http_callback_log.py +109 -0
  9. tactus/adapters/ide_log.py +71 -0
  10. tactus/adapters/lua_tools.py +336 -0
  11. tactus/adapters/mcp.py +289 -0
  12. tactus/adapters/mcp_manager.py +196 -0
  13. tactus/adapters/memory.py +53 -0
  14. tactus/adapters/plugins.py +419 -0
  15. tactus/backends/http_backend.py +58 -0
  16. tactus/backends/model_backend.py +35 -0
  17. tactus/backends/pytorch_backend.py +110 -0
  18. tactus/broker/__init__.py +12 -0
  19. tactus/broker/client.py +247 -0
  20. tactus/broker/protocol.py +183 -0
  21. tactus/broker/server.py +1123 -0
  22. tactus/broker/stdio.py +12 -0
  23. tactus/cli/__init__.py +7 -0
  24. tactus/cli/app.py +2245 -0
  25. tactus/cli/commands/__init__.py +0 -0
  26. tactus/core/__init__.py +32 -0
  27. tactus/core/config_manager.py +790 -0
  28. tactus/core/dependencies/__init__.py +14 -0
  29. tactus/core/dependencies/registry.py +180 -0
  30. tactus/core/dsl_stubs.py +2117 -0
  31. tactus/core/exceptions.py +66 -0
  32. tactus/core/execution_context.py +480 -0
  33. tactus/core/lua_sandbox.py +508 -0
  34. tactus/core/message_history_manager.py +236 -0
  35. tactus/core/mocking.py +286 -0
  36. tactus/core/output_validator.py +291 -0
  37. tactus/core/registry.py +499 -0
  38. tactus/core/runtime.py +2907 -0
  39. tactus/core/template_resolver.py +142 -0
  40. tactus/core/yaml_parser.py +301 -0
  41. tactus/docker/Dockerfile +61 -0
  42. tactus/docker/entrypoint.sh +69 -0
  43. tactus/dspy/__init__.py +39 -0
  44. tactus/dspy/agent.py +1144 -0
  45. tactus/dspy/broker_lm.py +181 -0
  46. tactus/dspy/config.py +212 -0
  47. tactus/dspy/history.py +196 -0
  48. tactus/dspy/module.py +405 -0
  49. tactus/dspy/prediction.py +318 -0
  50. tactus/dspy/signature.py +185 -0
  51. tactus/formatting/__init__.py +7 -0
  52. tactus/formatting/formatter.py +437 -0
  53. tactus/ide/__init__.py +9 -0
  54. tactus/ide/coding_assistant.py +343 -0
  55. tactus/ide/server.py +2223 -0
  56. tactus/primitives/__init__.py +49 -0
  57. tactus/primitives/control.py +168 -0
  58. tactus/primitives/file.py +229 -0
  59. tactus/primitives/handles.py +378 -0
  60. tactus/primitives/host.py +94 -0
  61. tactus/primitives/human.py +342 -0
  62. tactus/primitives/json.py +189 -0
  63. tactus/primitives/log.py +187 -0
  64. tactus/primitives/message_history.py +157 -0
  65. tactus/primitives/model.py +163 -0
  66. tactus/primitives/procedure.py +564 -0
  67. tactus/primitives/procedure_callable.py +318 -0
  68. tactus/primitives/retry.py +155 -0
  69. tactus/primitives/session.py +152 -0
  70. tactus/primitives/state.py +182 -0
  71. tactus/primitives/step.py +209 -0
  72. tactus/primitives/system.py +93 -0
  73. tactus/primitives/tool.py +375 -0
  74. tactus/primitives/tool_handle.py +279 -0
  75. tactus/primitives/toolset.py +229 -0
  76. tactus/protocols/__init__.py +38 -0
  77. tactus/protocols/chat_recorder.py +81 -0
  78. tactus/protocols/config.py +97 -0
  79. tactus/protocols/cost.py +31 -0
  80. tactus/protocols/hitl.py +71 -0
  81. tactus/protocols/log_handler.py +27 -0
  82. tactus/protocols/models.py +355 -0
  83. tactus/protocols/result.py +33 -0
  84. tactus/protocols/storage.py +90 -0
  85. tactus/providers/__init__.py +13 -0
  86. tactus/providers/base.py +92 -0
  87. tactus/providers/bedrock.py +117 -0
  88. tactus/providers/google.py +105 -0
  89. tactus/providers/openai.py +98 -0
  90. tactus/sandbox/__init__.py +63 -0
  91. tactus/sandbox/config.py +171 -0
  92. tactus/sandbox/container_runner.py +1099 -0
  93. tactus/sandbox/docker_manager.py +433 -0
  94. tactus/sandbox/entrypoint.py +227 -0
  95. tactus/sandbox/protocol.py +213 -0
  96. tactus/stdlib/__init__.py +10 -0
  97. tactus/stdlib/io/__init__.py +13 -0
  98. tactus/stdlib/io/csv.py +88 -0
  99. tactus/stdlib/io/excel.py +136 -0
  100. tactus/stdlib/io/file.py +90 -0
  101. tactus/stdlib/io/fs.py +154 -0
  102. tactus/stdlib/io/hdf5.py +121 -0
  103. tactus/stdlib/io/json.py +109 -0
  104. tactus/stdlib/io/parquet.py +83 -0
  105. tactus/stdlib/io/tsv.py +88 -0
  106. tactus/stdlib/loader.py +274 -0
  107. tactus/stdlib/tac/tactus/tools/done.tac +33 -0
  108. tactus/stdlib/tac/tactus/tools/log.tac +50 -0
  109. tactus/testing/README.md +273 -0
  110. tactus/testing/__init__.py +61 -0
  111. tactus/testing/behave_integration.py +380 -0
  112. tactus/testing/context.py +486 -0
  113. tactus/testing/eval_models.py +114 -0
  114. tactus/testing/evaluation_runner.py +222 -0
  115. tactus/testing/evaluators.py +634 -0
  116. tactus/testing/events.py +94 -0
  117. tactus/testing/gherkin_parser.py +134 -0
  118. tactus/testing/mock_agent.py +315 -0
  119. tactus/testing/mock_dependencies.py +234 -0
  120. tactus/testing/mock_hitl.py +171 -0
  121. tactus/testing/mock_registry.py +168 -0
  122. tactus/testing/mock_tools.py +133 -0
  123. tactus/testing/models.py +115 -0
  124. tactus/testing/pydantic_eval_runner.py +508 -0
  125. tactus/testing/steps/__init__.py +13 -0
  126. tactus/testing/steps/builtin.py +902 -0
  127. tactus/testing/steps/custom.py +69 -0
  128. tactus/testing/steps/registry.py +68 -0
  129. tactus/testing/test_runner.py +489 -0
  130. tactus/tracing/__init__.py +5 -0
  131. tactus/tracing/trace_manager.py +417 -0
  132. tactus/utils/__init__.py +1 -0
  133. tactus/utils/cost_calculator.py +72 -0
  134. tactus/utils/model_pricing.py +132 -0
  135. tactus/utils/safe_file_library.py +502 -0
  136. tactus/utils/safe_libraries.py +234 -0
  137. tactus/validation/LuaLexerBase.py +66 -0
  138. tactus/validation/LuaParserBase.py +23 -0
  139. tactus/validation/README.md +224 -0
  140. tactus/validation/__init__.py +7 -0
  141. tactus/validation/error_listener.py +21 -0
  142. tactus/validation/generated/LuaLexer.interp +231 -0
  143. tactus/validation/generated/LuaLexer.py +5548 -0
  144. tactus/validation/generated/LuaLexer.tokens +124 -0
  145. tactus/validation/generated/LuaLexerBase.py +66 -0
  146. tactus/validation/generated/LuaParser.interp +173 -0
  147. tactus/validation/generated/LuaParser.py +6439 -0
  148. tactus/validation/generated/LuaParser.tokens +124 -0
  149. tactus/validation/generated/LuaParserBase.py +23 -0
  150. tactus/validation/generated/LuaParserVisitor.py +118 -0
  151. tactus/validation/generated/__init__.py +7 -0
  152. tactus/validation/grammar/LuaLexer.g4 +123 -0
  153. tactus/validation/grammar/LuaParser.g4 +178 -0
  154. tactus/validation/semantic_visitor.py +817 -0
  155. tactus/validation/validator.py +157 -0
  156. tactus-0.31.2.dist-info/METADATA +1809 -0
  157. tactus-0.31.2.dist-info/RECORD +160 -0
  158. tactus-0.31.2.dist-info/WHEEL +4 -0
  159. tactus-0.31.2.dist-info/entry_points.txt +2 -0
  160. tactus-0.31.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,222 @@
1
+ """
2
+ Evaluation runner for Tactus BDD testing.
3
+
4
+ Runs scenarios multiple times in parallel to measure consistency and reliability.
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import statistics
10
+ import multiprocessing
11
+ from collections import Counter
12
+ from typing import List
13
+
14
+ from .models import ScenarioResult, EvaluationResult
15
+ from .test_runner import TactusTestRunner
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
class TactusEvaluationRunner(TactusTestRunner):
    """
    Runs Tactus BDD evaluations with multiple iterations per scenario.

    Extends TactusTestRunner to run scenarios multiple times and
    calculate consistency and reliability metrics (success rate, timing
    statistics, step-signature consistency, flakiness).
    """

    def evaluate_all(
        self,
        runs: int = 10,
        parallel: bool = True,
    ) -> List[EvaluationResult]:
        """
        Evaluate all scenarios with N runs each.

        Args:
            runs: Number of times to run each scenario (must be >= 1)
            parallel: Whether to run iterations in parallel

        Returns:
            List of EvaluationResult, one per scenario

        Raises:
            RuntimeError: If setup() has not been called first.
            ValueError: If runs < 1.
        """
        if not self.parsed_feature or not self.work_dir:
            raise RuntimeError("Must call setup() before evaluate_all()")

        results = []
        for scenario in self.parsed_feature.scenarios:
            eval_result = self._evaluate_scenario(
                scenario.name,
                runs,
                parallel,
            )
            results.append(eval_result)

        return results

    def evaluate_scenario(
        self,
        scenario_name: str,
        runs: int = 10,
        parallel: bool = True,
    ) -> EvaluationResult:
        """
        Evaluate a single scenario with N runs.

        Args:
            scenario_name: Name of scenario to evaluate
            runs: Number of times to run the scenario (must be >= 1)
            parallel: Whether to run iterations in parallel

        Returns:
            EvaluationResult with consistency metrics

        Raises:
            RuntimeError: If setup() has not been called first.
            ValueError: If runs < 1.
        """
        if not self.work_dir:
            raise RuntimeError("Must call setup() before evaluate_scenario()")

        return self._evaluate_scenario(scenario_name, runs, parallel)

    def _evaluate_scenario(
        self,
        scenario_name: str,
        runs: int,
        parallel: bool,
    ) -> EvaluationResult:
        """
        Run single scenario N times and calculate metrics.

        Args:
            scenario_name: Name of scenario to evaluate
            runs: Number of iterations (must be >= 1)
            parallel: Whether to run in parallel

        Returns:
            EvaluationResult with all metrics

        Raises:
            ValueError: If runs < 1 (previously this surfaced as an opaque
                multiprocessing error from Pool(processes=0)).
        """
        # Fail fast with a clear message; min(runs, cpu_count) below would
        # otherwise pass 0 workers to Pool and raise a confusing ValueError.
        if runs < 1:
            raise ValueError(f"runs must be >= 1, got {runs}")

        # Lazy %-style args: no formatting cost when INFO is disabled.
        logger.info("Evaluating scenario '%s' with %d runs", scenario_name, runs)

        # Run scenario N times
        if parallel:
            workers = min(runs, os.cpu_count() or 1)
            # Use 'spawn' to avoid Behave global state conflicts
            ctx = multiprocessing.get_context("spawn")
            with ctx.Pool(processes=workers) as pool:
                iteration_args = [(scenario_name, str(self.work_dir), i) for i in range(runs)]
                results = pool.starmap(self._run_single_iteration, iteration_args)
        else:
            results = [
                self._run_single_iteration(scenario_name, str(self.work_dir), i)
                for i in range(runs)
            ]

        # Calculate metrics
        return self._calculate_metrics(scenario_name, results)

    @staticmethod
    def _run_single_iteration(
        scenario_name: str,
        work_dir: str,
        iteration: int,
    ) -> ScenarioResult:
        """
        Run one iteration of a scenario (called in subprocess).

        Static so it pickles cleanly for the 'spawn' multiprocessing context.

        Args:
            scenario_name: Name of scenario to run
            work_dir: Path to Behave work directory
            iteration: Iteration number (for tracking)

        Returns:
            ScenarioResult with iteration number
        """
        result = TactusTestRunner._run_single_scenario(scenario_name, work_dir)
        result.iteration = iteration
        return result

    def _calculate_metrics(
        self,
        scenario_name: str,
        results: List[ScenarioResult],
    ) -> EvaluationResult:
        """
        Calculate consistency and reliability metrics.

        Args:
            scenario_name: Name of scenario
            results: List of ScenarioResult from all runs

        Returns:
            EvaluationResult with all metrics
        """
        total_runs = len(results)
        passed_runs = sum(1 for r in results if r.status == "passed")
        failed_runs = total_runs - passed_runs

        # Success rate (guarded: total_runs == 0 yields 0.0, not ZeroDivisionError)
        success_rate = passed_runs / total_runs if total_runs > 0 else 0.0

        # Timing statistics; stdev needs at least two samples
        durations = [r.duration for r in results]
        mean_duration = statistics.mean(durations) if durations else 0.0
        median_duration = statistics.median(durations) if durations else 0.0
        stddev_duration = statistics.stdev(durations) if len(durations) > 1 else 0.0

        # Consistency score - compare step outcomes
        consistency_score = self._calculate_consistency(results)

        # Flaky = some runs passed and some failed
        is_flaky = 0 < passed_runs < total_runs

        logger.info(
            "Scenario '%s': Success rate: %.1f%%, Consistency: %.1f%%, Flaky: %s",
            scenario_name,
            success_rate * 100,
            consistency_score * 100,
            is_flaky,
        )

        return EvaluationResult(
            scenario_name=scenario_name,
            total_runs=total_runs,
            passed_runs=passed_runs,
            failed_runs=failed_runs,
            success_rate=success_rate,
            mean_duration=mean_duration,
            median_duration=median_duration,
            stddev_duration=stddev_duration,
            consistency_score=consistency_score,
            is_flaky=is_flaky,
            individual_results=results,
        )

    def _calculate_consistency(self, results: List[ScenarioResult]) -> float:
        """
        Calculate consistency by comparing step outcomes.

        Consistency score measures how often the scenario produces
        identical step-by-step behavior across runs.

        1.0 = all runs had identical step outcomes
        0.0 = completely inconsistent

        Args:
            results: List of ScenarioResult

        Returns:
            Consistency score between 0.0 and 1.0
        """
        if not results:
            return 0.0

        # Create signature for each run (step statuses); tuples are hashable
        # so they can be counted directly.
        signatures = []
        for result in results:
            sig = tuple((step.keyword, step.message, step.status) for step in result.steps)
            signatures.append(sig)

        # Count most common signature
        signature_counts = Counter(signatures)
        most_common_count = signature_counts.most_common(1)[0][1]

        # Consistency is the fraction of runs that match the most common pattern
        return most_common_count / len(results)