tactus 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. tactus/__init__.py +49 -0
  2. tactus/adapters/__init__.py +9 -0
  3. tactus/adapters/broker_log.py +76 -0
  4. tactus/adapters/cli_hitl.py +189 -0
  5. tactus/adapters/cli_log.py +223 -0
  6. tactus/adapters/cost_collector_log.py +56 -0
  7. tactus/adapters/file_storage.py +367 -0
  8. tactus/adapters/http_callback_log.py +109 -0
  9. tactus/adapters/ide_log.py +71 -0
  10. tactus/adapters/lua_tools.py +336 -0
  11. tactus/adapters/mcp.py +289 -0
  12. tactus/adapters/mcp_manager.py +196 -0
  13. tactus/adapters/memory.py +53 -0
  14. tactus/adapters/plugins.py +419 -0
  15. tactus/backends/http_backend.py +58 -0
  16. tactus/backends/model_backend.py +35 -0
  17. tactus/backends/pytorch_backend.py +110 -0
  18. tactus/broker/__init__.py +12 -0
  19. tactus/broker/client.py +247 -0
  20. tactus/broker/protocol.py +183 -0
  21. tactus/broker/server.py +1123 -0
  22. tactus/broker/stdio.py +12 -0
  23. tactus/cli/__init__.py +7 -0
  24. tactus/cli/app.py +2245 -0
  25. tactus/cli/commands/__init__.py +0 -0
  26. tactus/core/__init__.py +32 -0
  27. tactus/core/config_manager.py +790 -0
  28. tactus/core/dependencies/__init__.py +14 -0
  29. tactus/core/dependencies/registry.py +180 -0
  30. tactus/core/dsl_stubs.py +2117 -0
  31. tactus/core/exceptions.py +66 -0
  32. tactus/core/execution_context.py +480 -0
  33. tactus/core/lua_sandbox.py +508 -0
  34. tactus/core/message_history_manager.py +236 -0
  35. tactus/core/mocking.py +286 -0
  36. tactus/core/output_validator.py +291 -0
  37. tactus/core/registry.py +499 -0
  38. tactus/core/runtime.py +2907 -0
  39. tactus/core/template_resolver.py +142 -0
  40. tactus/core/yaml_parser.py +301 -0
  41. tactus/docker/Dockerfile +61 -0
  42. tactus/docker/entrypoint.sh +69 -0
  43. tactus/dspy/__init__.py +39 -0
  44. tactus/dspy/agent.py +1144 -0
  45. tactus/dspy/broker_lm.py +181 -0
  46. tactus/dspy/config.py +212 -0
  47. tactus/dspy/history.py +196 -0
  48. tactus/dspy/module.py +405 -0
  49. tactus/dspy/prediction.py +318 -0
  50. tactus/dspy/signature.py +185 -0
  51. tactus/formatting/__init__.py +7 -0
  52. tactus/formatting/formatter.py +437 -0
  53. tactus/ide/__init__.py +9 -0
  54. tactus/ide/coding_assistant.py +343 -0
  55. tactus/ide/server.py +2223 -0
  56. tactus/primitives/__init__.py +49 -0
  57. tactus/primitives/control.py +168 -0
  58. tactus/primitives/file.py +229 -0
  59. tactus/primitives/handles.py +378 -0
  60. tactus/primitives/host.py +94 -0
  61. tactus/primitives/human.py +342 -0
  62. tactus/primitives/json.py +189 -0
  63. tactus/primitives/log.py +187 -0
  64. tactus/primitives/message_history.py +157 -0
  65. tactus/primitives/model.py +163 -0
  66. tactus/primitives/procedure.py +564 -0
  67. tactus/primitives/procedure_callable.py +318 -0
  68. tactus/primitives/retry.py +155 -0
  69. tactus/primitives/session.py +152 -0
  70. tactus/primitives/state.py +182 -0
  71. tactus/primitives/step.py +209 -0
  72. tactus/primitives/system.py +93 -0
  73. tactus/primitives/tool.py +375 -0
  74. tactus/primitives/tool_handle.py +279 -0
  75. tactus/primitives/toolset.py +229 -0
  76. tactus/protocols/__init__.py +38 -0
  77. tactus/protocols/chat_recorder.py +81 -0
  78. tactus/protocols/config.py +97 -0
  79. tactus/protocols/cost.py +31 -0
  80. tactus/protocols/hitl.py +71 -0
  81. tactus/protocols/log_handler.py +27 -0
  82. tactus/protocols/models.py +355 -0
  83. tactus/protocols/result.py +33 -0
  84. tactus/protocols/storage.py +90 -0
  85. tactus/providers/__init__.py +13 -0
  86. tactus/providers/base.py +92 -0
  87. tactus/providers/bedrock.py +117 -0
  88. tactus/providers/google.py +105 -0
  89. tactus/providers/openai.py +98 -0
  90. tactus/sandbox/__init__.py +63 -0
  91. tactus/sandbox/config.py +171 -0
  92. tactus/sandbox/container_runner.py +1099 -0
  93. tactus/sandbox/docker_manager.py +433 -0
  94. tactus/sandbox/entrypoint.py +227 -0
  95. tactus/sandbox/protocol.py +213 -0
  96. tactus/stdlib/__init__.py +10 -0
  97. tactus/stdlib/io/__init__.py +13 -0
  98. tactus/stdlib/io/csv.py +88 -0
  99. tactus/stdlib/io/excel.py +136 -0
  100. tactus/stdlib/io/file.py +90 -0
  101. tactus/stdlib/io/fs.py +154 -0
  102. tactus/stdlib/io/hdf5.py +121 -0
  103. tactus/stdlib/io/json.py +109 -0
  104. tactus/stdlib/io/parquet.py +83 -0
  105. tactus/stdlib/io/tsv.py +88 -0
  106. tactus/stdlib/loader.py +274 -0
  107. tactus/stdlib/tac/tactus/tools/done.tac +33 -0
  108. tactus/stdlib/tac/tactus/tools/log.tac +50 -0
  109. tactus/testing/README.md +273 -0
  110. tactus/testing/__init__.py +61 -0
  111. tactus/testing/behave_integration.py +380 -0
  112. tactus/testing/context.py +486 -0
  113. tactus/testing/eval_models.py +114 -0
  114. tactus/testing/evaluation_runner.py +222 -0
  115. tactus/testing/evaluators.py +634 -0
  116. tactus/testing/events.py +94 -0
  117. tactus/testing/gherkin_parser.py +134 -0
  118. tactus/testing/mock_agent.py +315 -0
  119. tactus/testing/mock_dependencies.py +234 -0
  120. tactus/testing/mock_hitl.py +171 -0
  121. tactus/testing/mock_registry.py +168 -0
  122. tactus/testing/mock_tools.py +133 -0
  123. tactus/testing/models.py +115 -0
  124. tactus/testing/pydantic_eval_runner.py +508 -0
  125. tactus/testing/steps/__init__.py +13 -0
  126. tactus/testing/steps/builtin.py +902 -0
  127. tactus/testing/steps/custom.py +69 -0
  128. tactus/testing/steps/registry.py +68 -0
  129. tactus/testing/test_runner.py +489 -0
  130. tactus/tracing/__init__.py +5 -0
  131. tactus/tracing/trace_manager.py +417 -0
  132. tactus/utils/__init__.py +1 -0
  133. tactus/utils/cost_calculator.py +72 -0
  134. tactus/utils/model_pricing.py +132 -0
  135. tactus/utils/safe_file_library.py +502 -0
  136. tactus/utils/safe_libraries.py +234 -0
  137. tactus/validation/LuaLexerBase.py +66 -0
  138. tactus/validation/LuaParserBase.py +23 -0
  139. tactus/validation/README.md +224 -0
  140. tactus/validation/__init__.py +7 -0
  141. tactus/validation/error_listener.py +21 -0
  142. tactus/validation/generated/LuaLexer.interp +231 -0
  143. tactus/validation/generated/LuaLexer.py +5548 -0
  144. tactus/validation/generated/LuaLexer.tokens +124 -0
  145. tactus/validation/generated/LuaLexerBase.py +66 -0
  146. tactus/validation/generated/LuaParser.interp +173 -0
  147. tactus/validation/generated/LuaParser.py +6439 -0
  148. tactus/validation/generated/LuaParser.tokens +124 -0
  149. tactus/validation/generated/LuaParserBase.py +23 -0
  150. tactus/validation/generated/LuaParserVisitor.py +118 -0
  151. tactus/validation/generated/__init__.py +7 -0
  152. tactus/validation/grammar/LuaLexer.g4 +123 -0
  153. tactus/validation/grammar/LuaParser.g4 +178 -0
  154. tactus/validation/semantic_visitor.py +817 -0
  155. tactus/validation/validator.py +157 -0
  156. tactus-0.31.0.dist-info/METADATA +1809 -0
  157. tactus-0.31.0.dist-info/RECORD +160 -0
  158. tactus-0.31.0.dist-info/WHEEL +4 -0
  159. tactus-0.31.0.dist-info/entry_points.txt +2 -0
  160. tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,50 @@
1
--[[
tactus.tools.log: Logging tool for agents

Usage:
    local log = require("tactus.tools.log")

    -- In an agent's toolset
    agent = Agent {
        tools = {"log"},
        ...
    }

This tool allows agents to log messages during execution.
Note: For direct logging in procedures, use the Log global directly:
    Log.info("message")
    Log.debug("message", {key = value})
]]--

local log_tool = Tool {
    name = "log",
    description = "Log a message during procedure execution",
    input = {
        message = field.string{required = true, description = "Message to log"},
        level = field.string{required = false, description = "Log level: debug, info, warn, error"},
        data = field.object{required = false, description = "Optional data to include"}
    },
    function(args)
        -- Default to "info" when no level is supplied, and to an empty
        -- table when no structured data accompanies the message.
        local level = args.level or "info"
        local payload = args.data or {}

        -- Select the matching sink on the runtime-injected Log global;
        -- any unrecognized level falls back to Log.info.
        local sink
        if level == "debug" then
            sink = Log.debug
        elseif level == "warn" then
            sink = Log.warn
        elseif level == "error" then
            sink = Log.error
        else
            sink = Log.info
        end
        sink(args.message, payload)

        -- Echo back what was logged so the agent can confirm the call.
        return {
            logged = true,
            level = level,
            message = args.message
        }
    end
}

return log_tool
@@ -0,0 +1,273 @@
1
+ # Tactus BDD Testing Framework
2
+
3
+ First-class Gherkin-style BDD testing integrated into the Tactus DSL.
4
+
5
+ ## Overview
6
+
7
+ The Tactus BDD Testing Framework allows you to write behavior-driven tests directly in your procedure files using Gherkin syntax. Tests are executed using Behave under the hood, with full support for:
8
+
9
+ - **Natural language specifications** - Write tests in plain English using Gherkin
10
+ - **Built-in step library** - Comprehensive steps for Tactus primitives (tools, state, etc.)
11
+ - **Custom steps** - Define your own steps in Lua for advanced assertions
12
+ - **Parallel execution** - Run scenarios in parallel for fast feedback
13
+ - **Consistency evaluation** - Run tests multiple times to measure reliability
14
+ - **Structured results** - All results are Pydantic models, no text parsing
15
+
16
+ ## Quick Start
17
+
18
+ ### 1. Add Specifications to Your Procedure
19
+
20
+ ```lua
21
+ -- procedure.tac
22
+ name("my_procedure")
23
+ version("1.0.0")
24
+
25
+ agent("worker", {
26
+ provider = "openai",
27
+ model = "gpt-4o-mini",
28
+ system_prompt = "Do the work",
29
+ tools = {"search", "done"}
30
+ })
31
+
32
+ procedure(function()
33
+ repeat
34
+ Worker()
35
+ until done.called()
36
+ end)
37
+
38
+ -- Add BDD specifications
39
+ specifications([[
40
+ Feature: My Procedure
41
+
42
+ Scenario: Worker completes task
43
+ Given the procedure has started
44
+ When the worker agent takes turns
45
+ Then the search tool should be called
46
+ And the done tool should be called
47
+ And the procedure should complete successfully
48
+ ]])
49
+ ```
50
+
51
+ ### 2. Run Tests
52
+
53
+ ```bash
54
+ # Run all scenarios once
55
+ tactus test procedure.tac
56
+
57
+ # Run specific scenario
58
+ tactus test procedure.tac --scenario "Worker completes task"
59
+
60
+ # Run without parallel execution
61
+ tactus test procedure.tac --no-parallel
62
+ ```
63
+
64
+ ### 3. Evaluate Consistency
65
+
66
+ ```bash
67
+ # Run each scenario 10 times to measure consistency
68
+ tactus test procedure.tac --runs 10
69
+
70
+ # Run with more iterations
71
+ tactus test procedure.tac --runs 50
72
+
73
+ # Evaluate specific scenario
74
+ tactus test procedure.tac --scenario "Worker completes task" --runs 20
75
+ ```
76
+
77
+ ## Built-in Steps
78
+
79
+ The framework provides a comprehensive library of built-in steps:
80
+
81
+ ### Tool Steps
82
+
83
+ ```gherkin
84
+ Then the search tool should be called
85
+ Then the search tool should not be called
86
+ Then the search tool should be called at least 3 times
87
+ Then the search tool should be called exactly 2 times
88
+ Then the search tool should be called with query=test
89
+ ```
90
+
91
+ ### State Steps
92
+
93
+ ```gherkin
94
+ Then the state count should be 5
95
+ Then the state error should exist
96
+ Then the state should contain results
97
+ ```
98
+
99
+ ### Completion Steps
100
+
101
+ ```gherkin
102
+ Then the procedure should complete successfully
103
+ Then the procedure should fail
104
+ Then the stop reason should be done
105
+ Then the stop reason should contain timeout
106
+ ```
107
+
108
+ ### Iteration Steps
109
+
110
+ ```gherkin
111
+ Then the total iterations should be less than 10
112
+ Then the total iterations should be between 5 and 15
113
+ Then the agent should take at least 3 turns
114
+ ```
115
+
116
+ ### Parameter Steps
117
+
118
+ ```gherkin
119
+ Given the topic parameter is quantum computing
120
+ Then the agent's context should include quantum computing
121
+ ```
122
+
123
+ ### Agent Steps
124
+
125
+ ```gherkin
126
+ When the worker agent takes turns
127
+ When the procedure runs
128
+ ```
129
+
130
+ ## Custom Steps
131
+
132
+ Define custom steps in Lua for advanced assertions:
133
+
134
+ ```lua
135
+ -- Custom step definition
136
+ step("the research quality is high", function()
137
+ local results = State.get("research_results")
138
+ assert(#results > 5, "Should have at least 5 results")
139
+ assert(results[1].quality == "high", "First result should be high quality")
140
+ end)
141
+
142
+ -- Use in specifications
143
+ specifications([[
144
+ Feature: Research Quality
145
+
146
+ Scenario: High quality research
147
+ Given the procedure has started
148
+ When the procedure runs
149
+ Then the research quality is high
150
+ ]])
151
+ ```
152
+
153
+ ## Evaluation Metrics
154
+
155
+ The `evaluate` command runs scenarios multiple times and provides:
156
+
157
+ - **Success Rate** - Percentage of runs that passed
158
+ - **Mean Duration** - Average execution time
159
+ - **Standard Deviation** - Timing consistency
160
+ - **Consistency Score** - How often runs produce identical step outcomes (0.0 to 1.0)
161
+ - **Flakiness Detection** - Identifies scenarios with inconsistent results
162
+
163
+ Example output:
164
+
165
+ ```
166
+ Scenario: Agent completes research
167
+ Success Rate: 90% (9/10)
168
+ Duration: 1.23s (±0.15s)
169
+ Consistency: 90%
170
+ ⚠️ FLAKY - Inconsistent results detected
171
+ ```
172
+
173
+ ## Parser Warnings
174
+
175
+ The Tactus validator will warn if your procedure has no specifications:
176
+
177
+ ```bash
178
+ $ tactus validate procedure.tac
179
+
180
+ ⚠ Warning: No specifications defined - consider adding BDD tests using specifications([[...]])
181
+ ```
182
+
183
+ ## Note on Evaluations
184
+
185
+ This framework is for **testing logic** (BDD). If you want to evaluate **LLM output quality** using datasets and metrics (Pydantic Evals), see the main [README](../../README.md#evaluations-testing-agent-intelligence) and use the `tactus eval` command.
186
+
187
+ ## Architecture
188
+
189
+ ```
190
+ Tactus Procedure (.tac)
191
+ └─ specifications([[ Gherkin text ]])
192
+ └─ step("custom step", function() ... end)
193
+
194
+ Gherkin Parser (gherkin-official)
195
+
196
+ Feature/Scenario/Step AST
197
+
198
+ Step Matcher (built-in + custom steps)
199
+
200
+ Behave Integration Layer
201
+ ├─ Generate .feature files
202
+ ├─ Generate step_definitions.py
203
+ └─ Run via Behave Runner API
204
+
205
+ Parallel Execution (multiprocessing)
206
+
207
+ Structured Results (Pydantic models)
208
+
209
+ CLI Output / IDE Display / Log Events
210
+ ```
211
+
212
+ ## API Usage
213
+
214
+ You can also use the testing framework programmatically:
215
+
216
+ ```python
217
+ from pathlib import Path
218
+ from tactus.testing import TactusTestRunner, TactusEvaluationRunner
219
+
220
+ # Run tests
221
+ runner = TactusTestRunner(Path("procedure.tac"))
222
+ runner.setup(gherkin_text)
223
+ result = runner.run_tests(parallel=True)
224
+
225
+ print(f"Passed: {result.passed_scenarios}/{result.total_scenarios}")
226
+
227
+ # Run evaluation
228
+ evaluator = TactusEvaluationRunner(Path("procedure.tac"))
229
+ evaluator.setup(gherkin_text)
230
+ eval_results = evaluator.evaluate_all(runs=10, parallel=True)
231
+
232
+ for result in eval_results:
233
+ print(f"{result.scenario_name}: {result.success_rate:.1%} success rate")
234
+ ```
235
+
236
+ ## IDE Integration
237
+
238
+ Test and evaluation results are emitted as structured log events for IDE display:
239
+
240
+ - `TestStartedEvent`
241
+ - `TestCompletedEvent`
242
+ - `TestScenarioStartedEvent`
243
+ - `TestScenarioCompletedEvent`
244
+ - `EvaluationStartedEvent`
245
+ - `EvaluationCompletedEvent`
246
+ - `EvaluationScenarioStartedEvent`
247
+ - `EvaluationScenarioCompletedEvent`
248
+ - `EvaluationProgressEvent`
249
+
250
+ All events are Pydantic models that can be serialized to JSON for display in the IDE's execution panel.
251
+
252
+ ## Dependencies
253
+
254
+ The testing framework requires:
255
+
256
+ - `behave>=1.2.6` - BDD test execution
257
+ - `gherkin-official>=28.0.0` - Gherkin parsing
258
+
259
+ These are automatically installed with Tactus.
260
+
261
+ ## Examples
262
+
263
+ See `examples/with-bdd-tests.tac` for a complete example with:
264
+ - Multiple scenarios
265
+ - Custom steps
266
+ - Evaluation configuration
267
+ - All major step types
268
+
269
+
270
+
271
+
272
+
273
+
@@ -0,0 +1,61 @@
1
+ """
2
+ Tactus BDD Testing Framework.
3
+
4
+ Provides Gherkin-style BDD testing integrated into the Tactus DSL.
5
+ """
6
+
7
+ from .models import (
8
+ StepResult,
9
+ ScenarioResult,
10
+ FeatureResult,
11
+ TestResult,
12
+ EvaluationResult,
13
+ ParsedStep,
14
+ ParsedScenario,
15
+ ParsedFeature,
16
+ )
17
+ from .gherkin_parser import GherkinParser
18
+ from .test_runner import TactusTestRunner
19
+ from .evaluation_runner import TactusEvaluationRunner
20
+ from .context import TactusTestContext
21
+ from .mock_tools import MockToolRegistry, MockedToolPrimitive, create_default_mocks
22
+ from .mock_hitl import MockHITLHandler
23
+ from .events import (
24
+ TestStartedEvent,
25
+ TestCompletedEvent,
26
+ TestScenarioStartedEvent,
27
+ TestScenarioCompletedEvent,
28
+ EvaluationStartedEvent,
29
+ EvaluationCompletedEvent,
30
+ EvaluationScenarioStartedEvent,
31
+ EvaluationScenarioCompletedEvent,
32
+ EvaluationProgressEvent,
33
+ )
34
+
35
+ __all__ = [
36
+ "StepResult",
37
+ "ScenarioResult",
38
+ "FeatureResult",
39
+ "TestResult",
40
+ "EvaluationResult",
41
+ "ParsedStep",
42
+ "ParsedScenario",
43
+ "ParsedFeature",
44
+ "GherkinParser",
45
+ "TactusTestRunner",
46
+ "TactusEvaluationRunner",
47
+ "TactusTestContext",
48
+ "MockToolRegistry",
49
+ "MockedToolPrimitive",
50
+ "create_default_mocks",
51
+ "MockHITLHandler",
52
+ "TestStartedEvent",
53
+ "TestCompletedEvent",
54
+ "TestScenarioStartedEvent",
55
+ "TestScenarioCompletedEvent",
56
+ "EvaluationStartedEvent",
57
+ "EvaluationCompletedEvent",
58
+ "EvaluationScenarioStartedEvent",
59
+ "EvaluationScenarioCompletedEvent",
60
+ "EvaluationProgressEvent",
61
+ ]