sandboxy 0.0.2__tar.gz → 0.0.3__tar.gz

This diff compares the contents of two publicly available versions of a package released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (114)
  1. {sandboxy-0.0.2 → sandboxy-0.0.3}/PKG-INFO +1 -1
  2. {sandboxy-0.0.2 → sandboxy-0.0.3}/pyproject.toml +1 -1
  3. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/cli/main.py +238 -143
  4. {sandboxy-0.0.2 → sandboxy-0.0.3}/.env.example +0 -0
  5. {sandboxy-0.0.2 → sandboxy-0.0.3}/.github/workflows/ci.yml +0 -0
  6. {sandboxy-0.0.2 → sandboxy-0.0.3}/.github/workflows/publish.yml +0 -0
  7. {sandboxy-0.0.2 → sandboxy-0.0.3}/.gitignore +0 -0
  8. {sandboxy-0.0.2 → sandboxy-0.0.3}/CONTRIBUTING.md +0 -0
  9. {sandboxy-0.0.2 → sandboxy-0.0.3}/LICENSE +0 -0
  10. {sandboxy-0.0.2 → sandboxy-0.0.3}/Makefile +0 -0
  11. {sandboxy-0.0.2 → sandboxy-0.0.3}/README.md +0 -0
  12. {sandboxy-0.0.2 → sandboxy-0.0.3}/docs/yaml-tools.md +0 -0
  13. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/index.html +0 -0
  14. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/package-lock.json +0 -0
  15. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/package.json +0 -0
  16. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/postcss.config.js +0 -0
  17. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/App.tsx +0 -0
  18. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/components/Layout.tsx +0 -0
  19. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/components/ModelSelector.tsx +0 -0
  20. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/components/ResultDisplay.tsx +0 -0
  21. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
  22. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/hooks/useScenarioRun.ts +0 -0
  23. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/hooks/useToolBuilder.ts +0 -0
  24. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/index.css +0 -0
  25. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/lib/api.ts +0 -0
  26. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/main.tsx +0 -0
  27. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/BuilderPage.tsx +0 -0
  28. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/DashboardPage.tsx +0 -0
  29. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/DatasetPage.tsx +0 -0
  30. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/ResultsPage.tsx +0 -0
  31. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/RunPage.tsx +0 -0
  32. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
  33. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/tailwind.config.js +0 -0
  34. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/tsconfig.json +0 -0
  35. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/tsconfig.node.json +0 -0
  36. {sandboxy-0.0.2 → sandboxy-0.0.3}/local-ui/vite.config.ts +0 -0
  37. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/__init__.py +0 -0
  38. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/agents/__init__.py +0 -0
  39. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/agents/base.py +0 -0
  40. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/agents/llm_prompt.py +0 -0
  41. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/agents/loader.py +0 -0
  42. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/__init__.py +0 -0
  43. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/app.py +0 -0
  44. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/routes/__init__.py +0 -0
  45. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/routes/agents.py +0 -0
  46. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/routes/local.py +0 -0
  47. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/api/routes/tools.py +0 -0
  48. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/cli/__init__.py +0 -0
  49. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/cli/type_detector.py +0 -0
  50. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/config.py +0 -0
  51. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/__init__.py +0 -0
  52. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/async_runner.py +0 -0
  53. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/mdl_parser.py +0 -0
  54. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/runner.py +0 -0
  55. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/safe_eval.py +0 -0
  56. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/core/state.py +0 -0
  57. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/datasets/__init__.py +0 -0
  58. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/datasets/loader.py +0 -0
  59. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/datasets/runner.py +0 -0
  60. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/errors.py +0 -0
  61. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/local/context.py +0 -0
  62. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/local/results.py +0 -0
  63. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/logging.py +0 -0
  64. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/mcp/__init__.py +0 -0
  65. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/mcp/client.py +0 -0
  66. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/mcp/wrapper.py +0 -0
  67. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/__init__.py +0 -0
  68. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/anthropic_provider.py +0 -0
  69. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/base.py +0 -0
  70. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/http_client.py +0 -0
  71. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/openai_provider.py +0 -0
  72. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/openrouter.py +0 -0
  73. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/providers/registry.py +0 -0
  74. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/scenarios/__init__.py +0 -0
  75. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/scenarios/comparison.py +0 -0
  76. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/scenarios/loader.py +0 -0
  77. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/scenarios/runner.py +0 -0
  78. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/scenarios/unified.py +0 -0
  79. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/session/__init__.py +0 -0
  80. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/session/manager.py +0 -0
  81. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/tools/__init__.py +0 -0
  82. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/tools/base.py +0 -0
  83. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/tools/loader.py +0 -0
  84. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/tools/yaml_tools.py +0 -0
  85. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/ui/__init__.py +0 -0
  86. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -0
  87. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -0
  88. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/ui/dist/index.html +0 -0
  89. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/utils/__init__.py +0 -0
  90. {sandboxy-0.0.2 → sandboxy-0.0.3}/sandboxy/utils/time.py +0 -0
  91. {sandboxy-0.0.2 → sandboxy-0.0.3}/scenarios/customer_service.yml +0 -0
  92. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/__init__.py +0 -0
  93. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/conftest.py +0 -0
  94. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/factories.py +0 -0
  95. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/integration/__init__.py +0 -0
  96. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/integration/api/__init__.py +0 -0
  97. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/mocks/__init__.py +0 -0
  98. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/mocks/providers.py +0 -0
  99. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/__init__.py +0 -0
  100. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/agents/__init__.py +0 -0
  101. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/agents/test_base.py +0 -0
  102. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/agents/test_llm_prompt.py +0 -0
  103. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/agents/test_loader.py +0 -0
  104. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/__init__.py +0 -0
  105. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/test_async_runner.py +0 -0
  106. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/test_mdl_parser.py +0 -0
  107. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/test_runner.py +0 -0
  108. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/test_safe_eval.py +0 -0
  109. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/core/test_state.py +0 -0
  110. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/providers/test_openrouter.py +0 -0
  111. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/tools/__init__.py +0 -0
  112. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/tools/test_base.py +0 -0
  113. {sandboxy-0.0.2 → sandboxy-0.0.3}/tests/unit/tools/test_loader.py +0 -0
  114. {sandboxy-0.0.2 → sandboxy-0.0.3}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sandboxy
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: Open-source agent simulation and benchmarking platform
5
5
  Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
6
6
  Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "sandboxy"
7
- version = "0.0.2"
7
+ version = "0.0.3"
8
8
  description = "Open-source agent simulation and benchmarking platform"
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -1,6 +1,5 @@
1
1
  """CLI entrypoint for Sandboxy."""
2
2
 
3
- import csv
4
3
  import json
5
4
  import os
6
5
  import sys
@@ -35,6 +34,59 @@ def main() -> None:
35
34
  pass
36
35
 
37
36
 
37
+ @main.command()
38
+ @click.argument("shell", type=click.Choice(["bash", "zsh", "fish"]), default="bash")
39
+ def completion(shell: str) -> None:
40
+ """Generate shell completion and show setup instructions.
41
+
42
+ Writes completion script to ~/.sandboxy-completion.<shell>
43
+ and shows the line to add to your shell config.
44
+
45
+ Examples:
46
+ sandboxy completion # Generate bash completion
47
+ sandboxy completion zsh # Generate zsh completion
48
+ """
49
+ import subprocess
50
+
51
+ home = Path.home()
52
+ ext = shell if shell != "bash" else "bash"
53
+ completion_file = home / f".sandboxy-completion.{ext}"
54
+
55
+ # Generate completion script using Click's built-in mechanism
56
+ env = os.environ.copy()
57
+ env["_SANDBOXY_COMPLETE"] = f"{shell}_source"
58
+
59
+ result = subprocess.run( # noqa: S603
60
+ [sys.executable, "-m", "sandboxy.cli.main"],
61
+ env=env,
62
+ capture_output=True,
63
+ text=True,
64
+ )
65
+
66
+ # Write to file
67
+ completion_file.write_text(result.stdout)
68
+ click.echo(f"Generated: {completion_file}")
69
+ click.echo("")
70
+ click.echo("Add this line to your shell config:")
71
+ click.echo("")
72
+
73
+ if shell == "bash":
74
+ click.echo("# Sandboxy completion")
75
+ click.echo(f'. "{completion_file}"')
76
+ click.echo("")
77
+ click.echo("(Add to ~/.bashrc)")
78
+ elif shell == "zsh":
79
+ click.echo("# Sandboxy completion")
80
+ click.echo(f'. "{completion_file}"')
81
+ click.echo("")
82
+ click.echo("(Add to ~/.zshrc)")
83
+ elif shell == "fish":
84
+ click.echo("# Sandboxy completion")
85
+ click.echo(f'source "{completion_file}"')
86
+ click.echo("")
87
+ click.echo("(Add to ~/.config/fish/config.fish)")
88
+
89
+
38
90
  def _load_variables_from_env() -> dict:
39
91
  """Load variables from SANDBOXY_VARIABLES environment variable."""
40
92
  env_vars = os.environ.get("SANDBOXY_VARIABLES", "")
@@ -46,6 +98,191 @@ def _load_variables_from_env() -> dict:
46
98
  return {}
47
99
 
48
100
 
101
+ @main.command()
102
+ @click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
103
+ @click.option(
104
+ "--dir",
105
+ "-d",
106
+ "directory",
107
+ type=click.Path(path_type=Path),
108
+ default=None,
109
+ help="Directory to initialize (default: current directory)",
110
+ )
111
+ def init(with_examples: bool, directory: Path | None) -> None:
112
+ """Initialize a new Sandboxy project.
113
+
114
+ Creates the standard folder structure for scenarios, tools, agents, and datasets.
115
+
116
+ Examples:
117
+ sandboxy init
118
+ sandboxy init --with-examples
119
+ sandboxy init --dir my-project
120
+ """
121
+ root = directory or Path.cwd()
122
+
123
+ # Create directory if specified and doesn't exist
124
+ if directory and not root.exists():
125
+ root.mkdir(parents=True)
126
+ click.echo(f"Created directory: {root}")
127
+
128
+ # Standard folders
129
+ folders = ["scenarios", "tools", "agents", "datasets", "runs"]
130
+ created = []
131
+
132
+ for folder in folders:
133
+ folder_path = root / folder
134
+ if not folder_path.exists():
135
+ folder_path.mkdir(parents=True)
136
+ created.append(folder)
137
+
138
+ if created:
139
+ click.echo(f"Created folders: {', '.join(created)}")
140
+ else:
141
+ click.echo("All folders already exist")
142
+
143
+ # Create .env.example if it doesn't exist
144
+ env_example = root / ".env.example"
145
+ if not env_example.exists():
146
+ env_example.write_text(
147
+ """# Sandboxy Environment Variables
148
+ # Copy this to .env and fill in your API keys
149
+
150
+ # OpenRouter API key (recommended - access to 400+ models)
151
+ OPENROUTER_API_KEY=
152
+
153
+ # Or use direct provider keys
154
+ OPENAI_API_KEY=
155
+ ANTHROPIC_API_KEY=
156
+ """
157
+ )
158
+ click.echo("Created .env.example")
159
+
160
+ # Create .gitignore if it doesn't exist
161
+ gitignore = root / ".gitignore"
162
+ if not gitignore.exists():
163
+ gitignore.write_text(
164
+ """.env
165
+ runs/
166
+ __pycache__/
167
+ *.pyc
168
+ """
169
+ )
170
+ click.echo("Created .gitignore")
171
+
172
+ # Add examples if requested
173
+ if with_examples:
174
+ _create_example_files(root)
175
+
176
+ click.echo("")
177
+ click.echo("Project initialized! Next steps:")
178
+ click.echo(" 1. Copy .env.example to .env and add your API key")
179
+ click.echo(" 2. Create scenarios in scenarios/")
180
+ click.echo(" 3. Run: sandboxy open")
181
+
182
+
183
+ def _create_example_files(root: Path) -> None:
184
+ """Create example scenario and tool files."""
185
+ # Example scenario
186
+ example_scenario = root / "scenarios" / "hello-world.yml"
187
+ if not example_scenario.exists():
188
+ example_scenario.write_text(
189
+ """name: Hello World
190
+ description: A simple greeting scenario to test your setup
191
+
192
+ system_prompt: |
193
+ You are a friendly assistant. Greet the user warmly.
194
+
195
+ prompt: |
196
+ Hello! Can you introduce yourself?
197
+
198
+ evaluation:
199
+ goals:
200
+ - id: greeted
201
+ name: Greeted the user
202
+ description: The assistant should greet the user
203
+ outcome: true
204
+ check: "'hello' in response.lower() or 'hi' in response.lower()"
205
+ """
206
+ )
207
+ click.echo("Created scenarios/hello-world.yml")
208
+
209
+ # Example tool
210
+ example_tool = root / "tools" / "calculator.yml"
211
+ if not example_tool.exists():
212
+ example_tool.write_text(
213
+ """name: calculator
214
+ description: A simple calculator tool
215
+
216
+ tools:
217
+ calculator:
218
+ description: Perform basic math operations
219
+ actions:
220
+ add:
221
+ description: Add two numbers
222
+ parameters:
223
+ type: object
224
+ properties:
225
+ a:
226
+ type: number
227
+ description: First number
228
+ b:
229
+ type: number
230
+ description: Second number
231
+ required: [a, b]
232
+ returns:
233
+ result: "{{a}} + {{b}}"
234
+
235
+ multiply:
236
+ description: Multiply two numbers
237
+ parameters:
238
+ type: object
239
+ properties:
240
+ a:
241
+ type: number
242
+ b:
243
+ type: number
244
+ required: [a, b]
245
+ returns:
246
+ result: "{{a}} * {{b}}"
247
+ """
248
+ )
249
+ click.echo("Created tools/calculator.yml")
250
+
251
+ # Example scenario using the tool
252
+ tool_scenario = root / "scenarios" / "calculator-test.yml"
253
+ if not tool_scenario.exists():
254
+ tool_scenario.write_text(
255
+ """name: Calculator Test
256
+ description: Test the calculator tool
257
+
258
+ system_prompt: |
259
+ You are a helpful assistant with access to a calculator.
260
+ Use the calculator tool to perform math operations.
261
+
262
+ tools_from:
263
+ - calculator
264
+
265
+ prompt: |
266
+ What is 42 + 17?
267
+
268
+ evaluation:
269
+ goals:
270
+ - id: used_calculator
271
+ name: Used calculator
272
+ description: The agent should use the calculator tool
273
+ outcome: true
274
+ check: "any(tc.tool == 'calculator' for tc in tool_calls)"
275
+
276
+ - id: correct_answer
277
+ name: Correct answer
278
+ description: The response should contain 59
279
+ outcome: true
280
+ check: "'59' in response"
281
+ """
282
+ )
283
+ click.echo("Created scenarios/calculator-test.yml")
284
+
285
+
49
286
  @main.command()
50
287
  @click.argument("module_path", type=click.Path(exists=True))
51
288
  @click.option("--agent-id", "-a", help="Agent ID to use", default=None)
@@ -135,148 +372,6 @@ def validate(module_path: str) -> None:
135
372
  click.echo("Module is valid.")
136
373
 
137
374
 
138
- @main.command()
139
- @click.argument("module_path", type=click.Path(exists=True))
140
- @click.option("--agents", required=True, help="Comma-separated agent IDs")
141
- @click.option("--runs-per-agent", type=int, default=1, help="Number of runs per agent")
142
- @click.option("--output", "-o", type=click.Path(), default=None, help="Output CSV file")
143
- @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
144
- @click.option("--seed", type=int, default=None, help="Random seed for reproducibility")
145
- def bench(
146
- module_path: str,
147
- agents: str,
148
- runs_per_agent: int,
149
- output: str | None,
150
- var: tuple[str, ...],
151
- seed: int | None,
152
- ) -> None:
153
- """Benchmark a module against multiple agents.
154
-
155
- MODULE_PATH is the path to an MDL YAML file.
156
-
157
- Examples:
158
- sandboxy bench modules/lemonade.yml --agents gpt4,claude --runs 5
159
- sandboxy bench modules/lemonade.yml --agents gpt4 -v difficulty=8 -v starting_cash=100
160
- """
161
- import random
162
-
163
- # Set random seed for reproducibility
164
- if seed is not None:
165
- random.seed(seed)
166
-
167
- try:
168
- module = load_module(Path(module_path))
169
- except MDLParseError as e:
170
- click.echo(f"Error loading module: {e}", err=True)
171
- sys.exit(1)
172
-
173
- # Load variables from environment and CLI
174
- variables = _load_variables_from_env()
175
- for v in var:
176
- if "=" in v:
177
- name, value = v.split("=", 1)
178
- try:
179
- variables[name] = json.loads(value)
180
- except json.JSONDecodeError:
181
- variables[name] = value
182
-
183
- # Apply variables to module
184
- if variables:
185
- module = apply_variables(module, variables)
186
- click.echo(f"Variables: {variables}")
187
-
188
- loader = AgentLoader(DEFAULT_AGENT_DIRS)
189
- agent_ids = [a.strip() for a in agents.split(",")]
190
-
191
- results: list[dict[str, str | float | int]] = []
192
-
193
- for agent_id in agent_ids:
194
- try:
195
- agent = loader.load(agent_id)
196
- except ValueError as e:
197
- click.echo(f"Warning: Skipping agent {agent_id}: {e}", err=True)
198
- continue
199
-
200
- # Apply module's agent_config overrides
201
- if module.agent_config:
202
- if "system_prompt" in module.agent_config:
203
- agent.config.system_prompt = module.agent_config["system_prompt"]
204
-
205
- click.echo(f"Benchmarking agent: {agent_id}")
206
-
207
- for run_idx in range(runs_per_agent):
208
- runner = Runner(module=module, agent=agent)
209
- result = runner.run()
210
-
211
- row: dict[str, str | float | int] = {
212
- "agent_id": agent_id,
213
- "run_idx": run_idx,
214
- "score": result.evaluation.score,
215
- "num_events": result.evaluation.num_events,
216
- "status": result.evaluation.status,
217
- }
218
-
219
- # Add seed if used for reproducibility tracking
220
- if seed is not None:
221
- row["seed"] = seed
222
-
223
- # Add env_state metrics if available
224
- if "cash_balance" in runner.env_state:
225
- row["final_cash"] = runner.env_state["cash_balance"]
226
- if "starting_cash" in module.environment.initial_state:
227
- initial = module.environment.initial_state["starting_cash"]
228
- if "final_cash" in row:
229
- row["profit"] = float(row["final_cash"]) - float(initial)
230
-
231
- # Add all evaluation check results
232
- for check_name, check_result in result.evaluation.checks.items():
233
- if isinstance(check_result, int | float | bool):
234
- row[f"check_{check_name}"] = check_result
235
-
236
- results.append(row)
237
- click.echo(f" Run {run_idx + 1}: score={result.evaluation.score:.2f}")
238
-
239
- if not results:
240
- click.echo("No results to report.", err=True)
241
- sys.exit(1)
242
-
243
- # Output results
244
- if output:
245
- fieldnames = list(results[0].keys())
246
- with open(output, "w", newline="") as f:
247
- writer = csv.DictWriter(f, fieldnames=fieldnames)
248
- writer.writeheader()
249
- writer.writerows(results)
250
- click.echo(f"\nResults saved to: {output}")
251
- else:
252
- # Print summary table
253
- click.echo("\nBenchmark Results:")
254
- click.echo("-" * 60)
255
-
256
- # Group by agent
257
- from collections import defaultdict
258
-
259
- by_agent: dict[str, list[dict[str, str | float | int]]] = defaultdict(list)
260
- for r in results:
261
- by_agent[str(r["agent_id"])].append(r)
262
-
263
- for agent_id, runs in by_agent.items():
264
- scores = [r["score"] for r in runs if isinstance(r["score"], int | float)]
265
- avg_score = sum(scores) / len(scores) if scores else 0
266
- click.echo(f"{agent_id}:")
267
- click.echo(f" Runs: {len(runs)}")
268
- click.echo(f" Avg Score: {avg_score:.3f}")
269
- if "final_cash" in runs[0]:
270
- cash_values = [
271
- float(r["final_cash"])
272
- for r in runs
273
- if "final_cash" in r and isinstance(r["final_cash"], int | float)
274
- ]
275
- avg_cash = sum(cash_values) / len(cash_values) if cash_values else 0.0
276
- click.echo(f" Avg Final Cash: {avg_cash:.2f}")
277
- click.echo("")
278
-
279
-
280
375
  @main.command()
281
376
  @click.option("--port", "-p", type=int, default=8000, help="Port to run server on")
282
377
  @click.option("--host", default="127.0.0.1", help="Host to bind to")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes