cube-harness 0.1.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. cube_harness-0.1.0rc2/PKG-INFO +220 -0
  2. cube_harness-0.1.0rc2/README.md +180 -0
  3. cube_harness-0.1.0rc2/pyproject.toml +109 -0
  4. cube_harness-0.1.0rc2/src/cube_harness/__init__.py +10 -0
  5. cube_harness-0.1.0rc2/src/cube_harness/agent.py +282 -0
  6. cube_harness-0.1.0rc2/src/cube_harness/agents/__init__.py +19 -0
  7. cube_harness-0.1.0rc2/src/cube_harness/agents/genny.py +609 -0
  8. cube_harness-0.1.0rc2/src/cube_harness/agents/genny_configs.py +101 -0
  9. cube_harness-0.1.0rc2/src/cube_harness/agents/legacy_generic_agent.py +1269 -0
  10. cube_harness-0.1.0rc2/src/cube_harness/agents/react.py +210 -0
  11. cube_harness-0.1.0rc2/src/cube_harness/agents/react_configs.py +17 -0
  12. cube_harness-0.1.0rc2/src/cube_harness/analyze/__init__.py +5 -0
  13. cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/__init__.py +26 -0
  14. cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/cross_investigation_agreement.py +107 -0
  15. cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/joint_csv.py +203 -0
  16. cube_harness-0.1.0rc2/src/cube_harness/analyze/inspect_results.py +191 -0
  17. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigation_report.py +91 -0
  18. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/__init__.py +45 -0
  19. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/agent_driver.py +486 -0
  20. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/audit.py +207 -0
  21. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/benchmark_context_agent.py +226 -0
  22. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/cli.py +233 -0
  23. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/context.py +167 -0
  24. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/core.py +958 -0
  25. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/episode_discovery.py +135 -0
  26. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/meta_analysis.py +516 -0
  27. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/parse.py +86 -0
  28. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/recipe.py +95 -0
  29. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/schema_prompt.py +138 -0
  30. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/selectors.py +157 -0
  31. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/transcript.py +180 -0
  32. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/__init__.py +67 -0
  33. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/SKILL.md +24 -0
  34. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/__init__.py +9 -0
  35. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/recipe.py +122 -0
  36. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/SKILL.md +20 -0
  37. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/__init__.py +5 -0
  38. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/recipe.py +138 -0
  39. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/SKILL.md +58 -0
  40. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/__init__.py +9 -0
  41. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/recipe.py +167 -0
  42. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/SKILL.md +21 -0
  43. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/__init__.py +5 -0
  44. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/recipe.py +106 -0
  45. cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/scripts/summarise_profile.py +5 -0
  46. cube_harness-0.1.0rc2/src/cube_harness/analyze/reset_episodes.py +135 -0
  47. cube_harness-0.1.0rc2/src/cube_harness/analyze/stats.py +54 -0
  48. cube_harness-0.1.0rc2/src/cube_harness/analyze/trace.py +253 -0
  49. cube_harness-0.1.0rc2/src/cube_harness/analyze/xray.py +1834 -0
  50. cube_harness-0.1.0rc2/src/cube_harness/analyze/xray_events.py +441 -0
  51. cube_harness-0.1.0rc2/src/cube_harness/analyze/xray_utils.py +1680 -0
  52. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/README.md +160 -0
  53. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/diagram.png +0 -0
  54. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/exp_config.py +52 -0
  55. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/fix_report.md +128 -0
  56. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/notes.md +32 -0
  57. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/report.md +37 -0
  58. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/session.md +22 -0
  59. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/debug/SKILL.md +252 -0
  60. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/debug/investigator_extra.md +52 -0
  61. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/SKILL.md +182 -0
  62. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/investigator_extra.md +40 -0
  63. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/templates/exp_config.py +60 -0
  64. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/profile/SKILL.md +142 -0
  65. cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/profile/templates/exp_config.py +55 -0
  66. cube_harness-0.1.0rc2/src/cube_harness/budget.py +187 -0
  67. cube_harness-0.1.0rc2/src/cube_harness/core.py +286 -0
  68. cube_harness-0.1.0rc2/src/cube_harness/episode.py +450 -0
  69. cube_harness-0.1.0rc2/src/cube_harness/episode_logs.py +64 -0
  70. cube_harness-0.1.0rc2/src/cube_harness/episode_status.py +146 -0
  71. cube_harness-0.1.0rc2/src/cube_harness/eval_log.py +874 -0
  72. cube_harness-0.1.0rc2/src/cube_harness/exp_runner.py +846 -0
  73. cube_harness-0.1.0rc2/src/cube_harness/experiment.py +376 -0
  74. cube_harness-0.1.0rc2/src/cube_harness/experiment_status.py +158 -0
  75. cube_harness-0.1.0rc2/src/cube_harness/infra.py +56 -0
  76. cube_harness-0.1.0rc2/src/cube_harness/llm.py +823 -0
  77. cube_harness-0.1.0rc2/src/cube_harness/mcp/__init__.py +11 -0
  78. cube_harness-0.1.0rc2/src/cube_harness/mcp/convert.py +59 -0
  79. cube_harness-0.1.0rc2/src/cube_harness/mcp/server.py +121 -0
  80. cube_harness-0.1.0rc2/src/cube_harness/metrics/__init__.py +0 -0
  81. cube_harness-0.1.0rc2/src/cube_harness/metrics/profile_rollup.py +273 -0
  82. cube_harness-0.1.0rc2/src/cube_harness/metrics/profiler.py +390 -0
  83. cube_harness-0.1.0rc2/src/cube_harness/metrics/tracer.py +234 -0
  84. cube_harness-0.1.0rc2/src/cube_harness/multi_agent.py +157 -0
  85. cube_harness-0.1.0rc2/src/cube_harness/py.typed +0 -0
  86. cube_harness-0.1.0rc2/src/cube_harness/recipe.py +92 -0
  87. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/__init__.py +36 -0
  88. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/eee.py +220 -0
  89. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/journal.py +309 -0
  90. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/samples.py +77 -0
  91. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/scan.py +357 -0
  92. cube_harness-0.1.0rc2/src/cube_harness/reproducibility/submissions.py +184 -0
  93. cube_harness-0.1.0rc2/src/cube_harness/results.py +224 -0
  94. cube_harness-0.1.0rc2/src/cube_harness/rl/__init__.py +47 -0
  95. cube_harness-0.1.0rc2/src/cube_harness/rl/__main__.py +39 -0
  96. cube_harness-0.1.0rc2/src/cube_harness/rl/engine.py +339 -0
  97. cube_harness-0.1.0rc2/src/cube_harness/rl/event_publisher.py +169 -0
  98. cube_harness-0.1.0rc2/src/cube_harness/rl/events.py +96 -0
  99. cube_harness-0.1.0rc2/src/cube_harness/rl/executor.py +407 -0
  100. cube_harness-0.1.0rc2/src/cube_harness/rl/llm.py +72 -0
  101. cube_harness-0.1.0rc2/src/cube_harness/rl/ray_runtime.py +113 -0
  102. cube_harness-0.1.0rc2/src/cube_harness/rl/rollout.py +67 -0
  103. cube_harness-0.1.0rc2/src/cube_harness/rl/service.py +130 -0
  104. cube_harness-0.1.0rc2/src/cube_harness/rl/task_runner.py +65 -0
  105. cube_harness-0.1.0rc2/src/cube_harness/rl/trajectory_sink.py +186 -0
  106. cube_harness-0.1.0rc2/src/cube_harness/rl/utils.py +39 -0
  107. cube_harness-0.1.0rc2/src/cube_harness/storage.py +1560 -0
  108. cube_harness-0.1.0rc2/src/cube_harness/streamer.py +373 -0
  109. cube_harness-0.1.0rc2/src/cube_harness/summary.py +36 -0
  110. cube_harness-0.1.0rc2/src/cube_harness/tool.py +195 -0
  111. cube_harness-0.1.0rc2/src/cube_harness/utils.py +61 -0
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.3
2
+ Name: cube-harness
3
+ Version: 0.1.0rc2
4
+ Summary: cube-harness, open source agentic evaluation and data generation framework.
5
+ Author: Nicolas Gontier, Aman Jaiswal, Oleh Shliazhko
6
+ Author-email: Nicolas Gontier <nicolas.gontier@servicenow.com>, Aman Jaiswal <amanjaiswal73892@gmail.com>, Oleh Shliazhko <oleh.shliazhko@servicenow.com>
7
+ Requires-Dist: cube-standard>=0.1.0rc9
8
+ Requires-Dist: cube-browser-playwright>=0.2.0
9
+ Requires-Dist: pydantic~=2.0
10
+ Requires-Dist: litellm~=1.80.8
11
+ Requires-Dist: pillow~=11.0
12
+ Requires-Dist: beautifulsoup4~=4.14
13
+ Requires-Dist: numpy~=2.3.5
14
+ Requires-Dist: ray[default]>=2.52.1
15
+ Requires-Dist: tenacity>=8.5.0
16
+ Requires-Dist: typer~=0.25
17
+ Requires-Dist: python-dotenv~=1.2.0
18
+ Requires-Dist: termcolor~=3.2
19
+ Requires-Dist: opentelemetry-api>=1.20.0
20
+ Requires-Dist: opentelemetry-sdk>=1.20.0
21
+ Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20.0
22
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.39.1
23
+ Requires-Dist: browsergym-core>=0.14.3
24
+ Requires-Dist: mcp[cli]>=1.26
25
+ Requires-Dist: msgpack>=1.0.0
26
+ Requires-Dist: zstandard>=0.20.0
27
+ Requires-Dist: docker>=7.1.0
28
+ Requires-Dist: psutil>=5.9.0
29
+ Requires-Dist: gradio~=5.49 ; extra == 'analyze'
30
+ Requires-Dist: pandas~=2.0 ; extra == 'analyze'
31
+ Requires-Dist: claude-agent-sdk>=0.1.6 ; extra == 'investigator'
32
+ Requires-Dist: fastapi>=0.115.0 ; extra == 'rl'
33
+ Requires-Dist: uvicorn>=0.29.0 ; extra == 'rl'
34
+ Requires-Dist: transformers~=4.57.0 ; extra == 'rl'
35
+ Requires-Python: >=3.12
36
+ Provides-Extra: analyze
37
+ Provides-Extra: investigator
38
+ Provides-Extra: rl
39
+ Description-Content-Type: text/markdown
40
+
41
+ <img alt="cube-harness banner" src="docs/assets/images/cube_harness_banner.png" />
42
+
43
+ # cube-harness
44
+
45
+ Open source harness for building and evaluating AI agents using the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
46
+
47
+ **[CUBE Standard](https://github.com/The-AI-Alliance/cube-standard)** defines the benchmark protocol. **cube-harness** is the evaluation runtime: it runs agents against any CUBE-compatible benchmark, records trajectories, and scales execution with Ray.
48
+
49
+ > [!NOTE]
50
+ > **cube-harness is in active development (alpha).** Interfaces may change. We welcome early adopters and contributors who want to shape the framework, not just use it.
51
+ > See our [Roadmap](ROADMAP.md) and [Contributing Guide](CONTRIBUTING.md).
52
+ >
53
+ > **Want to change the harness itself?** Start with [Changing cube-harness](CONTRIBUTING.md#changing-cube-harness) and the project [Design Philosophy](https://the-ai-alliance.github.io/cube-standard/design-philosophy); the `/gatekeep-rfc` skill lets you check your own draft before anyone else reads it.
54
+ >
55
+ > **Have a benchmark to contribute?** [Fill out this short form](https://docs.google.com/forms/d/e/1FAIpQLSddMFyRXZJPpD0I2K27OEmIPUpj57w--u2NuMscrjNlkqy8rQ/viewform) — no commitment required. Want to go deeper? [Apply to join the core team](https://forms.gle/JFiBi4ynfVLMghAH8).
56
+
57
+ <!-- [Published Documentation](https://the-ai-alliance.github.io/cube-harness/) -->
58
+
59
+ ## Quickstart
60
+
61
+ ### Installation
62
+
63
+ ```bash
64
+ # Clone the repository
65
+ git clone https://github.com/The-AI-Alliance/cube-harness.git
66
+ cd cube-harness
67
+
68
+ # Install dependencies
69
+ make install
70
+ ```
71
+
72
+ ### API Keys
73
+
74
+ Set your OpenAI API key:
75
+
76
+ ```bash
77
+ export OPENAI_API_KEY=your-key-here
78
+ ```
79
+
80
+ Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works — just change `model_name` in the recipe.
81
+
82
+ ### Run Tests
83
+
84
+ ```bash
85
+ make test
86
+ ```
87
+
88
+ ### Run Hello Example
89
+
90
+ The [`hello_miniwob`](recipes/hello_miniwob.py) recipe demonstrates running a ReAct agent on the MiniWob benchmark.
91
+
92
+ **Start here** — first 2 tasks, in-process (fast, no Ray required):
93
+
94
+ ```bash
95
+ make debug # → uv run recipes/hello_miniwob.py --limit 2
96
+ ```
97
+
98
+ Full benchmark (parallel via Ray):
99
+
100
+ ```bash
101
+ make hello # → uv run recipes/hello_miniwob.py
102
+ ```
103
+
104
+ ### Configuration
105
+
106
+ A recipe is a declarative config file: it imports canonical configs by name,
107
+ tweaks a few attributes, builds one or more `Experiment` objects, and ends
108
+ with `run(...)`. **Copy a recipe from [`recipes/`](recipes/) and edit it** —
109
+ recipes are documentation-by-example, not a CLI.
110
+
111
+ ```python
112
+ from cube_harness.agents.genny_configs import GENNY_CONFIGS # "default", "swe"
113
+ from cube_harness.infra import INFRA_CONFIGS # ~/.cube/infra.py; "local" built in
114
+ from cube_harness.recipe import run
115
+
116
+ agent = GENNY_CONFIGS["swe"] # every lookup is a fresh deep copy
117
+ agent.budget.cost_limit = 2.0 # validated at the assignment site
118
+
119
+ exp = Experiment(name="x", agent_config=agent, benchmark_config=..., infra=INFRA_CONFIGS["local"])
120
+ if __name__ == "__main__":
121
+ run(exp) # or run(exp_a, exp_b)
122
+ ```
123
+
124
+ `run()` is the only CLI, identical for every recipe and not extensible:
125
+ `--limit N` (first N tasks, in-process), `--ray N` (worker count),
126
+ `--set dotted.path=value` (ad-hoc override). For anything structural, clone
127
+ the file. Config objects are typed Pydantic models, serialized with every
128
+ experiment for reproducibility.
129
+
130
+ **Infra** is machine-local in `~/.cube/infra.py` (a `dict[str, InfraConfig]`,
131
+ never committed; credentials come from env). `"local"` works with zero setup.
132
+ To use a cluster/cloud, copy [`recipes/infra_template.py`](recipes/infra_template.py)
133
+ to `~/.cube/infra.py` and edit it — it documents the process and shows
134
+ `LocalInfraConfig` plus commented Toolkit/Azure examples.
135
+
136
+ See **[docs/configuration.md](docs/configuration.md)** for the full philosophy, a comparison with Hydra/YAML/CLI approaches, and how to run sweeps.
137
+
138
+ ## Experiment Viewer
139
+
140
+ cube-harness includes a Gradio-based XRay UI for exploring experiment results, trajectories, and OpenTelemetry spans:
141
+
142
+ ```bash
143
+ make xray
144
+ # or: uv run ch-xray
145
+ ```
146
+
147
+ The viewer displays:
148
+ - **Trajectory list** — all runs with task ID, steps, reward, and duration
149
+ - **Visual timeline** — color-coded steps (blue=environment, green=agent) with duration-based widths
150
+ - **Screenshots** — environment state at each step
151
+ - **Step details** — observations, agent actions, and LLM reasoning
152
+ - **Debug data** — raw JSON, LLM calls, and tool configurations
153
+
154
+ ![cube-harness Viewer Screenshot](docs/assets/images/al2_viewer.png)
155
+
156
+ ## Architecture Overview
157
+
158
+ cube-harness is a **universal evaluation platform** for agentic benchmarks and an **RL data generation** framework built on top of the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
159
+
160
+ ### Core Components
161
+
162
+ ![cube-harness Overview](docs/assets/images/al2_overview.png)
163
+
164
+ - **Agent** — LLM-powered decision maker that receives observations and produces actions
165
+ - **Environment** — Executes actions, provides observations and rewards (tool + task composition)
166
+ - **Tool** — Modular action provider that exposes an action space, reusable across benchmarks
167
+ - **ActionSpace** — Defines the set of possible actions a tool can execute
168
+ - **Task** — Defines goals, validation logic, and action subsets
169
+ - **Trajectory** — Stores interaction history (observations, actions, rewards)
170
+ - **Episode** — Single agent-environment loop for one task; records a trajectory
171
+ - **Benchmark** — Collection of tasks; produces env configs for episodes
172
+ - **Experiment** — Coordinates execution of multiple episodes across a benchmark
173
+ - **ExpRunner** — Execution runtime (sequential or parallel via Ray)
174
+
175
+ ### Design Goals
176
+
177
+ 1. **Benchmark Agnostic** — Plug in any CUBE-standard benchmark (MiniWob, WebArena, OSWorld, …) via the `Benchmark` interface
178
+ 2. **Agent Agnostic** — Support any agent architecture by implementing the `Agent` protocol
179
+ 3. **RL-Ready** — Trajectory format designed for training data generation with full LLM call logging
180
+ 4. **Scalable** — Ray integration for parallel episode execution across multiple workers
181
+ 5. **Observable** — Structured trajectory output for analysis and debugging
182
+
183
+ ## Development
184
+
185
+ ```bash
186
+ make format # Format code
187
+ make lint # Lint and auto-fix
188
+ make help # Show all commands
189
+ make test # Run tests
190
+ make coverage # Run tests with coverage report
191
+ ```
192
+
193
+ ### Pre-commit hooks
194
+
195
+ Install once after cloning to get ruff lint/format, trailing-whitespace checks, and DCO sign-off enforcement on every commit:
196
+
197
+ ```bash
198
+ pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type prepare-commit-msg
199
+ ```
200
+
201
+ The `prepare-commit-msg` hook automatically appends `Signed-off-by: Your Name <email>` to every commit message (required by the DCO). You can also sign off manually with `git commit -s`.
202
+
203
+ ## Project Structure
204
+
205
+ ```
206
+ cube-harness/
207
+ ├── src/cube_harness/ # Source code for the framework
208
+ ├── tests/ # Test suite
209
+ ├── recipes/ # Example recipes and configurations
210
+ ├── docs/ # Project documentation
211
+ └── Makefile # Common task shortcuts
212
+ ```
213
+
214
+ ## Getting Involved
215
+
216
+ All contributions are welcome — open an issue, submit a PR, or wrap a new benchmark. See [CONTRIBUTING.md](CONTRIBUTING.md) for the development guide, DCO requirements, and RFC process.
217
+
218
+ Want deeper involvement? Join the core team, shape the roadmap, and get credit for what you build. [Apply here](https://forms.gle/JFiBi4ynfVLMghAH8).
219
+
220
+ For general AI Alliance contribution guidelines, see the [community repo](https://github.com/The-AI-Alliance/community/).
@@ -0,0 +1,180 @@
1
+ <img alt="cube-harness banner" src="docs/assets/images/cube_harness_banner.png" />
2
+
3
+ # cube-harness
4
+
5
+ Open source harness for building and evaluating AI agents using the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
6
+
7
+ **[CUBE Standard](https://github.com/The-AI-Alliance/cube-standard)** defines the benchmark protocol. **cube-harness** is the evaluation runtime: it runs agents against any CUBE-compatible benchmark, records trajectories, and scales execution with Ray.
8
+
9
+ > [!NOTE]
10
+ > **cube-harness is in active development (alpha).** Interfaces may change. We welcome early adopters and contributors who want to shape the framework, not just use it.
11
+ > See our [Roadmap](ROADMAP.md) and [Contributing Guide](CONTRIBUTING.md).
12
+ >
13
+ > **Want to change the harness itself?** Start with [Changing cube-harness](CONTRIBUTING.md#changing-cube-harness) and the project [Design Philosophy](https://the-ai-alliance.github.io/cube-standard/design-philosophy); the `/gatekeep-rfc` skill lets you check your own draft before anyone else reads it.
14
+ >
15
+ > **Have a benchmark to contribute?** [Fill out this short form](https://docs.google.com/forms/d/e/1FAIpQLSddMFyRXZJPpD0I2K27OEmIPUpj57w--u2NuMscrjNlkqy8rQ/viewform) — no commitment required. Want to go deeper? [Apply to join the core team](https://forms.gle/JFiBi4ynfVLMghAH8).
16
+
17
+ <!-- [Published Documentation](https://the-ai-alliance.github.io/cube-harness/) -->
18
+
19
+ ## Quickstart
20
+
21
+ ### Installation
22
+
23
+ ```bash
24
+ # Clone the repository
25
+ git clone https://github.com/The-AI-Alliance/cube-harness.git
26
+ cd cube-harness
27
+
28
+ # Install dependencies
29
+ make install
30
+ ```
31
+
32
+ ### API Keys
33
+
34
+ Set your OpenAI API key:
35
+
36
+ ```bash
37
+ export OPENAI_API_KEY=your-key-here
38
+ ```
39
+
40
+ Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works — just change `model_name` in the recipe.
41
+
42
+ ### Run Tests
43
+
44
+ ```bash
45
+ make test
46
+ ```
47
+
48
+ ### Run Hello Example
49
+
50
+ The [`hello_miniwob`](recipes/hello_miniwob.py) recipe demonstrates running a ReAct agent on the MiniWob benchmark.
51
+
52
+ **Start here** — first 2 tasks, in-process (fast, no Ray required):
53
+
54
+ ```bash
55
+ make debug # → uv run recipes/hello_miniwob.py --limit 2
56
+ ```
57
+
58
+ Full benchmark (parallel via Ray):
59
+
60
+ ```bash
61
+ make hello # → uv run recipes/hello_miniwob.py
62
+ ```
63
+
64
+ ### Configuration
65
+
66
+ A recipe is a declarative config file: it imports canonical configs by name,
67
+ tweaks a few attributes, builds one or more `Experiment` objects, and ends
68
+ with `run(...)`. **Copy a recipe from [`recipes/`](recipes/) and edit it** —
69
+ recipes are documentation-by-example, not a CLI.
70
+
71
+ ```python
72
+ from cube_harness.agents.genny_configs import GENNY_CONFIGS # "default", "swe"
73
+ from cube_harness.infra import INFRA_CONFIGS # ~/.cube/infra.py; "local" built in
74
+ from cube_harness.recipe import run
75
+
76
+ agent = GENNY_CONFIGS["swe"] # every lookup is a fresh deep copy
77
+ agent.budget.cost_limit = 2.0 # validated at the assignment site
78
+
79
+ exp = Experiment(name="x", agent_config=agent, benchmark_config=..., infra=INFRA_CONFIGS["local"])
80
+ if __name__ == "__main__":
81
+ run(exp) # or run(exp_a, exp_b)
82
+ ```
83
+
84
+ `run()` is the only CLI, identical for every recipe and not extensible:
85
+ `--limit N` (first N tasks, in-process), `--ray N` (worker count),
86
+ `--set dotted.path=value` (ad-hoc override). For anything structural, clone
87
+ the file. Config objects are typed Pydantic models, serialized with every
88
+ experiment for reproducibility.
89
+
90
+ **Infra** is machine-local in `~/.cube/infra.py` (a `dict[str, InfraConfig]`,
91
+ never committed; credentials come from env). `"local"` works with zero setup.
92
+ To use a cluster/cloud, copy [`recipes/infra_template.py`](recipes/infra_template.py)
93
+ to `~/.cube/infra.py` and edit it — it documents the process and shows
94
+ `LocalInfraConfig` plus commented Toolkit/Azure examples.
95
+
96
+ See **[docs/configuration.md](docs/configuration.md)** for the full philosophy, a comparison with Hydra/YAML/CLI approaches, and how to run sweeps.
97
+
98
+ ## Experiment Viewer
99
+
100
+ cube-harness includes a Gradio-based XRay UI for exploring experiment results, trajectories, and OpenTelemetry spans:
101
+
102
+ ```bash
103
+ make xray
104
+ # or: uv run ch-xray
105
+ ```
106
+
107
+ The viewer displays:
108
+ - **Trajectory list** — all runs with task ID, steps, reward, and duration
109
+ - **Visual timeline** — color-coded steps (blue=environment, green=agent) with duration-based widths
110
+ - **Screenshots** — environment state at each step
111
+ - **Step details** — observations, agent actions, and LLM reasoning
112
+ - **Debug data** — raw JSON, LLM calls, and tool configurations
113
+
114
+ ![cube-harness Viewer Screenshot](docs/assets/images/al2_viewer.png)
115
+
116
+ ## Architecture Overview
117
+
118
+ cube-harness is a **universal evaluation platform** for agentic benchmarks and an **RL data generation** framework built on top of the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
119
+
120
+ ### Core Components
121
+
122
+ ![cube-harness Overview](docs/assets/images/al2_overview.png)
123
+
124
+ - **Agent** — LLM-powered decision maker that receives observations and produces actions
125
+ - **Environment** — Executes actions, provides observations and rewards (tool + task composition)
126
+ - **Tool** — Modular action provider that exposes an action space, reusable across benchmarks
127
+ - **ActionSpace** — Defines the set of possible actions a tool can execute
128
+ - **Task** — Defines goals, validation logic, and action subsets
129
+ - **Trajectory** — Stores interaction history (observations, actions, rewards)
130
+ - **Episode** — Single agent-environment loop for one task; records a trajectory
131
+ - **Benchmark** — Collection of tasks; produces env configs for episodes
132
+ - **Experiment** — Coordinates execution of multiple episodes across a benchmark
133
+ - **ExpRunner** — Execution runtime (sequential or parallel via Ray)
134
+
135
+ ### Design Goals
136
+
137
+ 1. **Benchmark Agnostic** — Plug in any CUBE-standard benchmark (MiniWob, WebArena, OSWorld, …) via the `Benchmark` interface
138
+ 2. **Agent Agnostic** — Support any agent architecture by implementing the `Agent` protocol
139
+ 3. **RL-Ready** — Trajectory format designed for training data generation with full LLM call logging
140
+ 4. **Scalable** — Ray integration for parallel episode execution across multiple workers
141
+ 5. **Observable** — Structured trajectory output for analysis and debugging
142
+
143
+ ## Development
144
+
145
+ ```bash
146
+ make format # Format code
147
+ make lint # Lint and auto-fix
148
+ make help # Show all commands
149
+ make test # Run tests
150
+ make coverage # Run tests with coverage report
151
+ ```
152
+
153
+ ### Pre-commit hooks
154
+
155
+ Install once after cloning to get ruff lint/format, trailing-whitespace checks, and DCO sign-off enforcement on every commit:
156
+
157
+ ```bash
158
+ pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type prepare-commit-msg
159
+ ```
160
+
161
+ The `prepare-commit-msg` hook automatically appends `Signed-off-by: Your Name <email>` to every commit message (required by the DCO). You can also sign off manually with `git commit -s`.
162
+
163
+ ## Project Structure
164
+
165
+ ```
166
+ cube-harness/
167
+ ├── src/cube_harness/ # Source code for the framework
168
+ ├── tests/ # Test suite
169
+ ├── recipes/ # Example recipes and configurations
170
+ ├── docs/ # Project documentation
171
+ └── Makefile # Common task shortcuts
172
+ ```
173
+
174
+ ## Getting Involved
175
+
176
+ All contributions are welcome — open an issue, submit a PR, or wrap a new benchmark. See [CONTRIBUTING.md](CONTRIBUTING.md) for the development guide, DCO requirements, and RFC process.
177
+
178
+ Want deeper involvement? Join the core team, shape the roadmap, and get credit for what you build. [Apply here](https://forms.gle/JFiBi4ynfVLMghAH8).
179
+
180
+ For general AI Alliance contribution guidelines, see the [community repo](https://github.com/The-AI-Alliance/community/).
@@ -0,0 +1,109 @@
1
+
2
+ [project]
3
+ name = "cube-harness"
4
+ version = "0.1.0rc2"
5
+ description = "cube-harness, open source agentic evaluation and data generation framework."
6
+ readme = "README.md"
7
+ authors = [
8
+ { name = "Nicolas Gontier", email = "nicolas.gontier@servicenow.com" },
9
+ { name = "Aman Jaiswal", email = "amanjaiswal73892@gmail.com" },
10
+ { name = "Oleh Shliazhko", email = "oleh.shliazhko@servicenow.com" },
11
+ ]
12
+ requires-python = ">=3.12"
13
+ dependencies = [
14
+ "cube-standard>=0.1.0rc9",
15
+ "cube-browser-playwright>=0.2.0", # used by cubes/workarena (PlaywrightSessionConfig)
16
+ "pydantic~=2.0",
17
+ "litellm~=1.80.8",
18
+ "Pillow~=11.0",
19
+ "beautifulsoup4~=4.14",
20
+ "numpy~=2.3.5",
21
+ "ray[default]>=2.52.1",
22
+ "tenacity>=8.5.0",
23
+ "typer~=0.25",
24
+ "python-dotenv~=1.2.0",
25
+ "termcolor~=3.2",
26
+ "opentelemetry-api>=1.20.0",
27
+ "opentelemetry-sdk>=1.20.0",
28
+ "opentelemetry-exporter-otlp-proto-http>=1.20.0",
29
+ "opentelemetry-exporter-otlp-proto-grpc>=1.39.1",
30
+ "browsergym-core>=0.14.3",
31
+ "mcp[cli]>=1.26",
32
+ "msgpack>=1.0.0",
33
+ "zstandard>=0.20.0",
34
+ "docker>=7.1.0",
35
+ "psutil>=5.9.0",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ # XRay trajectory viewer
40
+ analyze = [
41
+ "gradio~=5.49",
42
+ "pandas~=2.0",
43
+ ]
44
+ # Trajectory investigator — post-hoc LLM analysis via Claude Code
45
+ investigator = [
46
+ "claude-agent-sdk>=0.1.6",
47
+ ]
48
+ # Rollout generation
49
+ rl = [
50
+ "fastapi>=0.115.0",
51
+ "uvicorn>=0.29.0",
52
+ "transformers~=4.57.0"
53
+ ]
54
+
55
+ [build-system]
56
+ requires = ["uv_build>=0.8.22,<0.9.0"]
57
+ build-backend = "uv_build"
58
+
59
+ [tool.uv]
60
+ constraint-dependencies = ["pyasn1>=0.6.2", "cryptography>=46.0.5", "protobuf>=6.33.5"]
61
+
62
+
63
+ [tool.ruff]
64
+ fix = true
65
+ line-length = 120
66
+ indent-width = 4
67
+
68
+ [tool.ruff.format]
69
+ quote-style = "double"
70
+ indent-style = "space"
71
+ skip-magic-trailing-comma = false
72
+ line-ending = "auto"
73
+
74
+ [tool.ruff.lint]
75
+ extend-select = ["I"] # sort imports
76
+
77
+ [project.scripts]
78
+ ch-mcp-server = "cube_harness.mcp.server:main"
79
+ ch-xray = "cube_harness.analyze.xray:main"
80
+ ch-trace = "cube_harness.analyze.trace:main"
81
+ ch-investigate = "cube_harness.analyze.investigator:main"
82
+ ch-investigation-report = "cube_harness.analyze.investigation_report:main"
83
+ ch-reset-episodes = "cube_harness.analyze.reset_episodes:main"
84
+ ch-rollout = "cube_harness.rl.__main__:cli"
85
+ ch-profile = "cube_harness.metrics.profile_rollup:cli"
86
+
87
+ [dependency-groups]
88
+ dev = [
89
+ "ruff>=0.14.7",
90
+ "pytest>=8.0",
91
+ "pytest-cov>=6.0",
92
+ "pytest-xdist>=3.8.0",
93
+ "pytest-asyncio>=1.3.0",
94
+ "pytest-playwright>=0.7.2",
95
+ ]
96
+
97
+ [tool.pytest.ini_options]
98
+ markers = [
99
+ "slow: tests taking >5s but no external deps (Ray retry, xray e2e). CI runs them; deselect locally for fast feedback ('-m \"not slow\"').",
100
+ "serial: mark test to be run sequentially (deselect with '-m \"not serial\"').",
101
+ "integration: requires a live browser (headless Chromium via Playwright). CI runs them after `playwright install`.",
102
+ "live_api: hits a real LLM provider — costs money, needs API key (e.g. ANTHROPIC_API_KEY). Auto-skipped without the key; never run by default CI. Run locally to validate cache-control / streaming / etc. against a real model.",
103
+ ]
104
+
105
+ [tool.setuptools.packages.find]
106
+ where = ["src"]
107
+
108
+ [tool.uv.sources]
109
+ cube-standard = { git = "https://github.com/The-AI-Alliance/cube-standard.git", branch = "dev" }
@@ -0,0 +1,10 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ from cube_harness.experiment import EXP_DIR, make_experiment_output_dir
4
+
5
+ try:
6
+ __version__ = version("cube-harness")
7
+ except PackageNotFoundError:
8
+ __version__ = "unknown"
9
+
10
+ __all__ = ["EXP_DIR", "__version__", "make_experiment_output_dir"]