cube-harness 0.1.0rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cube_harness-0.1.0rc2/PKG-INFO +220 -0
- cube_harness-0.1.0rc2/README.md +180 -0
- cube_harness-0.1.0rc2/pyproject.toml +109 -0
- cube_harness-0.1.0rc2/src/cube_harness/__init__.py +10 -0
- cube_harness-0.1.0rc2/src/cube_harness/agent.py +282 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/__init__.py +19 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/genny.py +609 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/genny_configs.py +101 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/legacy_generic_agent.py +1269 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/react.py +210 -0
- cube_harness-0.1.0rc2/src/cube_harness/agents/react_configs.py +17 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/__init__.py +5 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/__init__.py +26 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/cross_investigation_agreement.py +107 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/cross_experiment/joint_csv.py +203 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/inspect_results.py +191 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigation_report.py +91 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/__init__.py +45 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/agent_driver.py +486 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/audit.py +207 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/benchmark_context_agent.py +226 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/cli.py +233 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/context.py +167 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/core.py +958 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/episode_discovery.py +135 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/meta_analysis.py +516 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/parse.py +86 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/recipe.py +95 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/schema_prompt.py +138 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/selectors.py +157 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/transcript.py +180 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/__init__.py +67 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/SKILL.md +24 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/__init__.py +9 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/agent_scaffolding/recipe.py +122 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/SKILL.md +20 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/__init__.py +5 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/general_blame/recipe.py +138 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/SKILL.md +58 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/__init__.py +9 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/hinter/recipe.py +167 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/SKILL.md +21 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/__init__.py +5 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/recipe.py +106 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/investigator/use_cases/profiling/scripts/summarise_profile.py +5 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/reset_episodes.py +135 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/stats.py +54 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/trace.py +253 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/xray.py +1834 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/xray_events.py +441 -0
- cube_harness-0.1.0rc2/src/cube_harness/analyze/xray_utils.py +1680 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/README.md +160 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/diagram.png +0 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/exp_config.py +52 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/fix_report.md +128 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/notes.md +32 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/report.md +37 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/templates/session.md +22 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/debug/SKILL.md +252 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/debug/investigator_extra.md +52 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/SKILL.md +182 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/investigator_extra.md +40 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/hinter/templates/exp_config.py +60 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/profile/SKILL.md +142 -0
- cube_harness-0.1.0rc2/src/cube_harness/auto_cube/use_cases/profile/templates/exp_config.py +55 -0
- cube_harness-0.1.0rc2/src/cube_harness/budget.py +187 -0
- cube_harness-0.1.0rc2/src/cube_harness/core.py +286 -0
- cube_harness-0.1.0rc2/src/cube_harness/episode.py +450 -0
- cube_harness-0.1.0rc2/src/cube_harness/episode_logs.py +64 -0
- cube_harness-0.1.0rc2/src/cube_harness/episode_status.py +146 -0
- cube_harness-0.1.0rc2/src/cube_harness/eval_log.py +874 -0
- cube_harness-0.1.0rc2/src/cube_harness/exp_runner.py +846 -0
- cube_harness-0.1.0rc2/src/cube_harness/experiment.py +376 -0
- cube_harness-0.1.0rc2/src/cube_harness/experiment_status.py +158 -0
- cube_harness-0.1.0rc2/src/cube_harness/infra.py +56 -0
- cube_harness-0.1.0rc2/src/cube_harness/llm.py +823 -0
- cube_harness-0.1.0rc2/src/cube_harness/mcp/__init__.py +11 -0
- cube_harness-0.1.0rc2/src/cube_harness/mcp/convert.py +59 -0
- cube_harness-0.1.0rc2/src/cube_harness/mcp/server.py +121 -0
- cube_harness-0.1.0rc2/src/cube_harness/metrics/__init__.py +0 -0
- cube_harness-0.1.0rc2/src/cube_harness/metrics/profile_rollup.py +273 -0
- cube_harness-0.1.0rc2/src/cube_harness/metrics/profiler.py +390 -0
- cube_harness-0.1.0rc2/src/cube_harness/metrics/tracer.py +234 -0
- cube_harness-0.1.0rc2/src/cube_harness/multi_agent.py +157 -0
- cube_harness-0.1.0rc2/src/cube_harness/py.typed +0 -0
- cube_harness-0.1.0rc2/src/cube_harness/recipe.py +92 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/__init__.py +36 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/eee.py +220 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/journal.py +309 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/samples.py +77 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/scan.py +357 -0
- cube_harness-0.1.0rc2/src/cube_harness/reproducibility/submissions.py +184 -0
- cube_harness-0.1.0rc2/src/cube_harness/results.py +224 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/__init__.py +47 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/__main__.py +39 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/engine.py +339 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/event_publisher.py +169 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/events.py +96 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/executor.py +407 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/llm.py +72 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/ray_runtime.py +113 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/rollout.py +67 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/service.py +130 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/task_runner.py +65 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/trajectory_sink.py +186 -0
- cube_harness-0.1.0rc2/src/cube_harness/rl/utils.py +39 -0
- cube_harness-0.1.0rc2/src/cube_harness/storage.py +1560 -0
- cube_harness-0.1.0rc2/src/cube_harness/streamer.py +373 -0
- cube_harness-0.1.0rc2/src/cube_harness/summary.py +36 -0
- cube_harness-0.1.0rc2/src/cube_harness/tool.py +195 -0
- cube_harness-0.1.0rc2/src/cube_harness/utils.py +61 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: cube-harness
|
|
3
|
+
Version: 0.1.0rc2
|
|
4
|
+
Summary: cube-harness, open source agentic evaluation and data generation framework.
|
|
5
|
+
Author: Nicolas Gontier, Aman Jaiswal, Oleh Shliazhko
|
|
6
|
+
Author-email: Nicolas Gontier <nicolas.gontier@servicenow.com>, Aman Jaiswal <amanjaiswal73892@gmail.com>, Oleh Shliazhko <oleh.shliazhko@servicenow.com>
|
|
7
|
+
Requires-Dist: cube-standard>=0.1.0rc9
|
|
8
|
+
Requires-Dist: cube-browser-playwright>=0.2.0
|
|
9
|
+
Requires-Dist: pydantic~=2.0
|
|
10
|
+
Requires-Dist: litellm~=1.80.8
|
|
11
|
+
Requires-Dist: pillow~=11.0
|
|
12
|
+
Requires-Dist: beautifulsoup4~=4.14
|
|
13
|
+
Requires-Dist: numpy~=2.3.5
|
|
14
|
+
Requires-Dist: ray[default]>=2.52.1
|
|
15
|
+
Requires-Dist: tenacity>=8.5.0
|
|
16
|
+
Requires-Dist: typer~=0.25
|
|
17
|
+
Requires-Dist: python-dotenv~=1.2.0
|
|
18
|
+
Requires-Dist: termcolor~=3.2
|
|
19
|
+
Requires-Dist: opentelemetry-api>=1.20.0
|
|
20
|
+
Requires-Dist: opentelemetry-sdk>=1.20.0
|
|
21
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20.0
|
|
22
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.39.1
|
|
23
|
+
Requires-Dist: browsergym-core>=0.14.3
|
|
24
|
+
Requires-Dist: mcp[cli]>=1.26
|
|
25
|
+
Requires-Dist: msgpack>=1.0.0
|
|
26
|
+
Requires-Dist: zstandard>=0.20.0
|
|
27
|
+
Requires-Dist: docker>=7.1.0
|
|
28
|
+
Requires-Dist: psutil>=5.9.0
|
|
29
|
+
Requires-Dist: gradio~=5.49 ; extra == 'analyze'
|
|
30
|
+
Requires-Dist: pandas~=2.0 ; extra == 'analyze'
|
|
31
|
+
Requires-Dist: claude-agent-sdk>=0.1.6 ; extra == 'investigator'
|
|
32
|
+
Requires-Dist: fastapi>=0.115.0 ; extra == 'rl'
|
|
33
|
+
Requires-Dist: uvicorn>=0.29.0 ; extra == 'rl'
|
|
34
|
+
Requires-Dist: transformers~=4.57.0 ; extra == 'rl'
|
|
35
|
+
Requires-Python: >=3.12
|
|
36
|
+
Provides-Extra: analyze
|
|
37
|
+
Provides-Extra: investigator
|
|
38
|
+
Provides-Extra: rl
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
<img alt="cube-harness banner" src="docs/assets/images/cube_harness_banner.png" />
|
|
42
|
+
|
|
43
|
+
# cube-harness
|
|
44
|
+
|
|
45
|
+
Open source harness for building and evaluating AI agents using the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
|
|
46
|
+
|
|
47
|
+
**[CUBE Standard](https://github.com/The-AI-Alliance/cube-standard)** defines the benchmark protocol. **cube-harness** is the evaluation runtime: it runs agents against any CUBE-compatible benchmark, records trajectories, and scales execution with Ray.
|
|
48
|
+
|
|
49
|
+
> [!NOTE]
|
|
50
|
+
> **cube-harness is in active development (alpha).** Interfaces may change. We welcome early adopters and contributors who want to shape the framework, not just use it.
|
|
51
|
+
> See our [Roadmap](ROADMAP.md) and [Contributing Guide](CONTRIBUTING.md).
|
|
52
|
+
>
|
|
53
|
+
> **Want to change the harness itself?** Start with [Changing cube-harness](CONTRIBUTING.md#changing-cube-harness) and the project [Design Philosophy](https://the-ai-alliance.github.io/cube-standard/design-philosophy); the `/gatekeep-rfc` skill lets you check your own draft before anyone else reads it.
|
|
54
|
+
>
|
|
55
|
+
> **Have a benchmark to contribute?** [Fill out this short form](https://docs.google.com/forms/d/e/1FAIpQLSddMFyRXZJPpD0I2K27OEmIPUpj57w--u2NuMscrjNlkqy8rQ/viewform) — no commitment required. Want to go deeper? [Apply to join the core team](https://forms.gle/JFiBi4ynfVLMghAH8).
|
|
56
|
+
|
|
57
|
+
<!-- [Published Documentation](https://the-ai-alliance.github.io/cube-harness/) -->
|
|
58
|
+
|
|
59
|
+
## Quickstart
|
|
60
|
+
|
|
61
|
+
### Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Clone the repository
|
|
65
|
+
git clone https://github.com/The-AI-Alliance/cube-harness.git
|
|
66
|
+
cd cube-harness
|
|
67
|
+
|
|
68
|
+
# Install dependencies
|
|
69
|
+
make install
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### API Keys
|
|
73
|
+
|
|
74
|
+
Set your OpenAI API key:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
export OPENAI_API_KEY=your-key-here
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works — just change `model_name` in the recipe.
|
|
81
|
+
|
|
82
|
+
### Run Tests
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
make test
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Run Hello Example
|
|
89
|
+
|
|
90
|
+
The [`hello_miniwob`](recipes/hello_miniwob.py) recipe demonstrates running a ReAct agent on the MiniWob benchmark.
|
|
91
|
+
|
|
92
|
+
**Start here** — first 2 tasks, in-process (fast, no Ray required):
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
make debug # → uv run recipes/hello_miniwob.py --limit 2
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Full benchmark (parallel via Ray):
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
make hello # → uv run recipes/hello_miniwob.py
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Configuration
|
|
105
|
+
|
|
106
|
+
A recipe is a declarative config file: it imports canonical configs by name,
|
|
107
|
+
tweaks a few attributes, builds one or more `Experiment` objects, and ends
|
|
108
|
+
with `run(...)`. **Copy a recipe from [`recipes/`](recipes/) and edit it** —
|
|
109
|
+
recipes are documentation-by-example, not a CLI.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from cube_harness.agents.genny_configs import GENNY_CONFIGS # "default", "swe"
|
|
113
|
+
from cube_harness.experiment import Experiment # assumed module path — confirm against recipes/
from cube_harness.infra import INFRA_CONFIGS # ~/.cube/infra.py; "local" built in
|
|
114
|
+
from cube_harness.recipe import run
|
|
115
|
+
|
|
116
|
+
agent = GENNY_CONFIGS["swe"] # every lookup is a fresh deep copy
|
|
117
|
+
agent.budget.cost_limit = 2.0 # validated at the assignment site
|
|
118
|
+
|
|
119
|
+
exp = Experiment(name="x", agent_config=agent, benchmark_config=..., infra=INFRA_CONFIGS["local"])
|
|
120
|
+
if __name__ == "__main__":
|
|
121
|
+
run(exp) # or run(exp_a, exp_b)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`run()` is the only CLI, identical for every recipe and not extensible:
|
|
125
|
+
`--limit N` (first N tasks, in-process), `--ray N` (worker count),
|
|
126
|
+
`--set dotted.path=value` (ad-hoc override). For anything structural, clone
|
|
127
|
+
the file. Config objects are typed Pydantic models, serialized with every
|
|
128
|
+
experiment for reproducibility.
|
|
129
|
+
|
|
130
|
+
**Infra** is machine-local in `~/.cube/infra.py` (a `dict[str, InfraConfig]`,
|
|
131
|
+
never committed; credentials come from env). `"local"` works with zero setup.
|
|
132
|
+
To use a cluster/cloud, copy [`recipes/infra_template.py`](recipes/infra_template.py)
|
|
133
|
+
to `~/.cube/infra.py` and edit it — it documents the process and shows
|
|
134
|
+
`LocalInfraConfig` plus commented-out Toolkit/Azure examples.
|
|
135
|
+
|
|
136
|
+
See **[docs/configuration.md](docs/configuration.md)** for the full philosophy, a comparison with Hydra/YAML/CLI approaches, and how to run sweeps.
|
|
137
|
+
|
|
138
|
+
## Experiment Viewer
|
|
139
|
+
|
|
140
|
+
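cube-harness includes a Gradio-based XRay UI for exploring experiment results, trajectories, and OpenTelemetry spans. Its dependencies (Gradio, pandas) are declared in the optional `analyze` extra, so that extra must be installed before launching the viewer: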
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
make xray
|
|
144
|
+
# or: uv run ch-xray
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
The viewer displays:
|
|
148
|
+
- **Trajectory list** — all runs with task ID, steps, reward, and duration
|
|
149
|
+
- **Visual timeline** — color-coded steps (blue=environment, green=agent) with duration-based widths
|
|
150
|
+
- **Screenshots** — environment state at each step
|
|
151
|
+
- **Step details** — observations, agent actions, and LLM reasoning
|
|
152
|
+
- **Debug data** — raw JSON, LLM calls, and tool configurations
|
|
153
|
+
|
|
154
|
+

|
|
155
|
+
|
|
156
|
+
## Architecture Overview
|
|
157
|
+
|
|
158
|
+
cube-harness is a **universal evaluation platform** for agentic benchmarks and an **RL data generation** framework built on top of the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
|
|
159
|
+
|
|
160
|
+
### Core Components
|
|
161
|
+
|
|
162
|
+

|
|
163
|
+
|
|
164
|
+
- **Agent** — LLM-powered decision maker that receives observations and produces actions
|
|
165
|
+
- **Environment** — Executes actions, provides observations and rewards (tool + task composition)
|
|
166
|
+
- **Tool** — Modular action provider that exposes an action space, reusable across benchmarks
|
|
167
|
+
- **ActionSpace** — Defines the set of possible actions a tool can execute
|
|
168
|
+
- **Task** — Defines goals, validation logic, and action subsets
|
|
169
|
+
- **Trajectory** — Stores interaction history (observations, actions, rewards)
|
|
170
|
+
- **Episode** — Single agent-environment loop for one task; records a trajectory
|
|
171
|
+
- **Benchmark** — Collection of tasks; produces env configs for episodes
|
|
172
|
+
- **Experiment** — Coordinates execution of multiple episodes across a benchmark
|
|
173
|
+
- **ExpRunner** — Execution runtime (sequential or parallel via Ray)
|
|
174
|
+
|
|
175
|
+
### Design Goals
|
|
176
|
+
|
|
177
|
+
1. **Benchmark Agnostic** — Plug in any CUBE-standard benchmark (MiniWob, WebArena, OSWorld, …) via the `Benchmark` interface
|
|
178
|
+
2. **Agent Agnostic** — Support any agent architecture by implementing the `Agent` protocol
|
|
179
|
+
3. **RL-Ready** — Trajectory format designed for training data generation with full LLM call logging
|
|
180
|
+
4. **Scalable** — Ray integration for parallel episode execution across multiple workers
|
|
181
|
+
5. **Observable** — Structured trajectory output for analysis and debugging
|
|
182
|
+
|
|
183
|
+
## Development
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
make format # Format code
|
|
187
|
+
make lint # Lint and auto-fix
|
|
188
|
+
make help # Show all commands
|
|
189
|
+
make test # Run tests
|
|
190
|
+
make coverage # Run tests with coverage report
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Pre-commit hooks
|
|
194
|
+
|
|
195
|
+
Install once after cloning to get ruff lint/format, trailing-whitespace checks, and DCO sign-off enforcement on every commit:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type prepare-commit-msg
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
The `prepare-commit-msg` hook automatically appends `Signed-off-by: Your Name <email>` to every commit message (required by the DCO). You can also sign off manually with `git commit -s`.
|
|
202
|
+
|
|
203
|
+
## Project Structure
|
|
204
|
+
|
|
205
|
+
```
|
|
206
|
+
cube-harness/
|
|
207
|
+
├── src/cube_harness/ # Source code for the framework
|
|
208
|
+
├── tests/ # Test suite
|
|
209
|
+
├── recipes/ # Example recipes and configurations
|
|
210
|
+
├── docs/ # Project documentation
|
|
211
|
+
└── Makefile # Common task shortcuts
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Getting Involved
|
|
215
|
+
|
|
216
|
+
All contributions are welcome — open an issue, submit a PR, or wrap a new benchmark. See [CONTRIBUTING.md](CONTRIBUTING.md) for the development guide, DCO requirements, and RFC process.
|
|
217
|
+
|
|
218
|
+
Want deeper involvement? Join the core team, shape the roadmap, and get credit for what you build. [Apply here](https://forms.gle/JFiBi4ynfVLMghAH8).
|
|
219
|
+
|
|
220
|
+
For general AI Alliance contribution guidelines, see the [community repo](https://github.com/The-AI-Alliance/community/).
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
<img alt="cube-harness banner" src="docs/assets/images/cube_harness_banner.png" />
|
|
2
|
+
|
|
3
|
+
# cube-harness
|
|
4
|
+
|
|
5
|
+
Open source harness for building and evaluating AI agents using the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
|
|
6
|
+
|
|
7
|
+
**[CUBE Standard](https://github.com/The-AI-Alliance/cube-standard)** defines the benchmark protocol. **cube-harness** is the evaluation runtime: it runs agents against any CUBE-compatible benchmark, records trajectories, and scales execution with Ray.
|
|
8
|
+
|
|
9
|
+
> [!NOTE]
|
|
10
|
+
> **cube-harness is in active development (alpha).** Interfaces may change. We welcome early adopters and contributors who want to shape the framework, not just use it.
|
|
11
|
+
> See our [Roadmap](ROADMAP.md) and [Contributing Guide](CONTRIBUTING.md).
|
|
12
|
+
>
|
|
13
|
+
> **Want to change the harness itself?** Start with [Changing cube-harness](CONTRIBUTING.md#changing-cube-harness) and the project [Design Philosophy](https://the-ai-alliance.github.io/cube-standard/design-philosophy); the `/gatekeep-rfc` skill lets you check your own draft before anyone else reads it.
|
|
14
|
+
>
|
|
15
|
+
> **Have a benchmark to contribute?** [Fill out this short form](https://docs.google.com/forms/d/e/1FAIpQLSddMFyRXZJPpD0I2K27OEmIPUpj57w--u2NuMscrjNlkqy8rQ/viewform) — no commitment required. Want to go deeper? [Apply to join the core team](https://forms.gle/JFiBi4ynfVLMghAH8).
|
|
16
|
+
|
|
17
|
+
<!-- [Published Documentation](https://the-ai-alliance.github.io/cube-harness/) -->
|
|
18
|
+
|
|
19
|
+
## Quickstart
|
|
20
|
+
|
|
21
|
+
### Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
# Clone the repository
|
|
25
|
+
git clone https://github.com/The-AI-Alliance/cube-harness.git
|
|
26
|
+
cd cube-harness
|
|
27
|
+
|
|
28
|
+
# Install dependencies
|
|
29
|
+
make install
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### API Keys
|
|
33
|
+
|
|
34
|
+
Set your OpenAI API key:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
export OPENAI_API_KEY=your-key-here
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works — just change `model_name` in the recipe.
|
|
41
|
+
|
|
42
|
+
### Run Tests
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
make test
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Run Hello Example
|
|
49
|
+
|
|
50
|
+
The [`hello_miniwob`](recipes/hello_miniwob.py) recipe demonstrates running a ReAct agent on the MiniWob benchmark.
|
|
51
|
+
|
|
52
|
+
**Start here** — first 2 tasks, in-process (fast, no Ray required):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
make debug # → uv run recipes/hello_miniwob.py --limit 2
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Full benchmark (parallel via Ray):
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
make hello # → uv run recipes/hello_miniwob.py
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Configuration
|
|
65
|
+
|
|
66
|
+
A recipe is a declarative config file: it imports canonical configs by name,
|
|
67
|
+
tweaks a few attributes, builds one or more `Experiment` objects, and ends
|
|
68
|
+
with `run(...)`. **Copy a recipe from [`recipes/`](recipes/) and edit it** —
|
|
69
|
+
recipes are documentation-by-example, not a CLI.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from cube_harness.agents.genny_configs import GENNY_CONFIGS # "default", "swe"
|
|
73
|
+
from cube_harness.experiment import Experiment # assumed module path — confirm against recipes/
from cube_harness.infra import INFRA_CONFIGS # ~/.cube/infra.py; "local" built in
|
|
74
|
+
from cube_harness.recipe import run
|
|
75
|
+
|
|
76
|
+
agent = GENNY_CONFIGS["swe"] # every lookup is a fresh deep copy
|
|
77
|
+
agent.budget.cost_limit = 2.0 # validated at the assignment site
|
|
78
|
+
|
|
79
|
+
exp = Experiment(name="x", agent_config=agent, benchmark_config=..., infra=INFRA_CONFIGS["local"])
|
|
80
|
+
if __name__ == "__main__":
|
|
81
|
+
run(exp) # or run(exp_a, exp_b)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`run()` is the only CLI, identical for every recipe and not extensible:
|
|
85
|
+
`--limit N` (first N tasks, in-process), `--ray N` (worker count),
|
|
86
|
+
`--set dotted.path=value` (ad-hoc override). For anything structural, clone
|
|
87
|
+
the file. Config objects are typed Pydantic models, serialized with every
|
|
88
|
+
experiment for reproducibility.
|
|
89
|
+
|
|
90
|
+
**Infra** is machine-local in `~/.cube/infra.py` (a `dict[str, InfraConfig]`,
|
|
91
|
+
never committed; credentials come from env). `"local"` works with zero setup.
|
|
92
|
+
To use a cluster/cloud, copy [`recipes/infra_template.py`](recipes/infra_template.py)
|
|
93
|
+
to `~/.cube/infra.py` and edit it — it documents the process and shows
|
|
94
|
+
`LocalInfraConfig` plus commented-out Toolkit/Azure examples.
|
|
95
|
+
|
|
96
|
+
See **[docs/configuration.md](docs/configuration.md)** for the full philosophy, a comparison with Hydra/YAML/CLI approaches, and how to run sweeps.
|
|
97
|
+
|
|
98
|
+
## Experiment Viewer
|
|
99
|
+
|
|
100
|
+
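cube-harness includes a Gradio-based XRay UI for exploring experiment results, trajectories, and OpenTelemetry spans. Its dependencies (Gradio, pandas) are declared in the optional `analyze` extra, so that extra must be installed before launching the viewer: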
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
make xray
|
|
104
|
+
# or: uv run ch-xray
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The viewer displays:
|
|
108
|
+
- **Trajectory list** — all runs with task ID, steps, reward, and duration
|
|
109
|
+
- **Visual timeline** — color-coded steps (blue=environment, green=agent) with duration-based widths
|
|
110
|
+
- **Screenshots** — environment state at each step
|
|
111
|
+
- **Step details** — observations, agent actions, and LLM reasoning
|
|
112
|
+
- **Debug data** — raw JSON, LLM calls, and tool configurations
|
|
113
|
+
|
|
114
|
+

|
|
115
|
+
|
|
116
|
+
## Architecture Overview
|
|
117
|
+
|
|
118
|
+
cube-harness is a **universal evaluation platform** for agentic benchmarks and an **RL data generation** framework built on top of the [CUBE Standard](https://github.com/The-AI-Alliance/cube-standard).
|
|
119
|
+
|
|
120
|
+
### Core Components
|
|
121
|
+
|
|
122
|
+

|
|
123
|
+
|
|
124
|
+
- **Agent** — LLM-powered decision maker that receives observations and produces actions
|
|
125
|
+
- **Environment** — Executes actions, provides observations and rewards (tool + task composition)
|
|
126
|
+
- **Tool** — Modular action provider that exposes an action space, reusable across benchmarks
|
|
127
|
+
- **ActionSpace** — Defines the set of possible actions a tool can execute
|
|
128
|
+
- **Task** — Defines goals, validation logic, and action subsets
|
|
129
|
+
- **Trajectory** — Stores interaction history (observations, actions, rewards)
|
|
130
|
+
- **Episode** — Single agent-environment loop for one task; records a trajectory
|
|
131
|
+
- **Benchmark** — Collection of tasks; produces env configs for episodes
|
|
132
|
+
- **Experiment** — Coordinates execution of multiple episodes across a benchmark
|
|
133
|
+
- **ExpRunner** — Execution runtime (sequential or parallel via Ray)
|
|
134
|
+
|
|
135
|
+
### Design Goals
|
|
136
|
+
|
|
137
|
+
1. **Benchmark Agnostic** — Plug in any CUBE-standard benchmark (MiniWob, WebArena, OSWorld, …) via the `Benchmark` interface
|
|
138
|
+
2. **Agent Agnostic** — Support any agent architecture by implementing the `Agent` protocol
|
|
139
|
+
3. **RL-Ready** — Trajectory format designed for training data generation with full LLM call logging
|
|
140
|
+
4. **Scalable** — Ray integration for parallel episode execution across multiple workers
|
|
141
|
+
5. **Observable** — Structured trajectory output for analysis and debugging
|
|
142
|
+
|
|
143
|
+
## Development
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
make format # Format code
|
|
147
|
+
make lint # Lint and auto-fix
|
|
148
|
+
make help # Show all commands
|
|
149
|
+
make test # Run tests
|
|
150
|
+
make coverage # Run tests with coverage report
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Pre-commit hooks
|
|
154
|
+
|
|
155
|
+
Install once after cloning to get ruff lint/format, trailing-whitespace checks, and DCO sign-off enforcement on every commit:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
pre-commit install --hook-type pre-commit --hook-type commit-msg --hook-type prepare-commit-msg
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The `prepare-commit-msg` hook automatically appends `Signed-off-by: Your Name <email>` to every commit message (required by the DCO). You can also sign off manually with `git commit -s`.
|
|
162
|
+
|
|
163
|
+
## Project Structure
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
cube-harness/
|
|
167
|
+
├── src/cube_harness/ # Source code for the framework
|
|
168
|
+
├── tests/ # Test suite
|
|
169
|
+
├── recipes/ # Example recipes and configurations
|
|
170
|
+
├── docs/ # Project documentation
|
|
171
|
+
└── Makefile # Common task shortcuts
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Getting Involved
|
|
175
|
+
|
|
176
|
+
All contributions are welcome — open an issue, submit a PR, or wrap a new benchmark. See [CONTRIBUTING.md](CONTRIBUTING.md) for the development guide, DCO requirements, and RFC process.
|
|
177
|
+
|
|
178
|
+
Want deeper involvement? Join the core team, shape the roadmap, and get credit for what you build. [Apply here](https://forms.gle/JFiBi4ynfVLMghAH8).
|
|
179
|
+
|
|
180
|
+
For general AI Alliance contribution guidelines, see the [community repo](https://github.com/The-AI-Alliance/community/).
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
|
|
2
|
+
[project]
|
|
3
|
+
name = "cube-harness"
|
|
4
|
+
version = "0.1.0rc2"
|
|
5
|
+
description = "cube-harness, open source agentic evaluation and data generation framework."
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Nicolas Gontier", email = "nicolas.gontier@servicenow.com" },
|
|
9
|
+
{ name = "Aman Jaiswal", email = "amanjaiswal73892@gmail.com" },
|
|
10
|
+
{ name = "Oleh Shliazhko", email = "oleh.shliazhko@servicenow.com" },
|
|
11
|
+
]
|
|
12
|
+
requires-python = ">=3.12"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"cube-standard>=0.1.0rc9",
|
|
15
|
+
"cube-browser-playwright>=0.2.0", # used by cubes/workarena (PlaywrightSessionConfig)
|
|
16
|
+
"pydantic~=2.0",
|
|
17
|
+
"litellm~=1.80.8",
|
|
18
|
+
"Pillow~=11.0",
|
|
19
|
+
"beautifulsoup4~=4.14",
|
|
20
|
+
"numpy~=2.3.5",
|
|
21
|
+
"ray[default]>=2.52.1",
|
|
22
|
+
"tenacity>=8.5.0",
|
|
23
|
+
"typer~=0.25",
|
|
24
|
+
"python-dotenv~=1.2.0",
|
|
25
|
+
"termcolor~=3.2",
|
|
26
|
+
"opentelemetry-api>=1.20.0",
|
|
27
|
+
"opentelemetry-sdk>=1.20.0",
|
|
28
|
+
"opentelemetry-exporter-otlp-proto-http>=1.20.0",
|
|
29
|
+
"opentelemetry-exporter-otlp-proto-grpc>=1.39.1",
|
|
30
|
+
"browsergym-core>=0.14.3",
|
|
31
|
+
"mcp[cli]>=1.26",
|
|
32
|
+
"msgpack>=1.0.0",
|
|
33
|
+
"zstandard>=0.20.0",
|
|
34
|
+
"docker>=7.1.0",
|
|
35
|
+
"psutil>=5.9.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
# XRay trajectory viewer
|
|
40
|
+
analyze = [
|
|
41
|
+
"gradio~=5.49",
|
|
42
|
+
"pandas~=2.0",
|
|
43
|
+
]
|
|
44
|
+
# Trajectory investigator — post-hoc LLM analysis via Claude Code
|
|
45
|
+
investigator = [
|
|
46
|
+
"claude-agent-sdk>=0.1.6",
|
|
47
|
+
]
|
|
48
|
+
# Rollout generation
|
|
49
|
+
rl = [
|
|
50
|
+
"fastapi>=0.115.0",
|
|
51
|
+
"uvicorn>=0.29.0",
|
|
52
|
+
"transformers~=4.57.0"
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[build-system]
|
|
56
|
+
requires = ["uv_build>=0.8.22,<0.9.0"]
|
|
57
|
+
build-backend = "uv_build"
|
|
58
|
+
|
|
59
|
+
[tool.uv]
|
|
60
|
+
constraint-dependencies = ["pyasn1>=0.6.2", "cryptography>=46.0.5", "protobuf>=6.33.5"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
[tool.ruff]
|
|
64
|
+
fix = true
|
|
65
|
+
line-length = 120
|
|
66
|
+
indent-width = 4
|
|
67
|
+
|
|
68
|
+
[tool.ruff.format]
|
|
69
|
+
quote-style = "double"
|
|
70
|
+
indent-style = "space"
|
|
71
|
+
skip-magic-trailing-comma = false
|
|
72
|
+
line-ending = "auto"
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint]
|
|
75
|
+
extend-select = ["I"] # sort imports
|
|
76
|
+
|
|
77
|
+
[project.scripts]
|
|
78
|
+
ch-mcp-server = "cube_harness.mcp.server:main"
|
|
79
|
+
ch-xray = "cube_harness.analyze.xray:main"
|
|
80
|
+
ch-trace = "cube_harness.analyze.trace:main"
|
|
81
|
+
ch-investigate = "cube_harness.analyze.investigator:main"
|
|
82
|
+
ch-investigation-report = "cube_harness.analyze.investigation_report:main"
|
|
83
|
+
ch-reset-episodes = "cube_harness.analyze.reset_episodes:main"
|
|
84
|
+
ch-rollout = "cube_harness.rl.__main__:cli"
|
|
85
|
+
ch-profile = "cube_harness.metrics.profile_rollup:cli"
|
|
86
|
+
|
|
87
|
+
[dependency-groups]
|
|
88
|
+
dev = [
|
|
89
|
+
"ruff>=0.14.7",
|
|
90
|
+
"pytest>=8.0",
|
|
91
|
+
"pytest-cov>=6.0",
|
|
92
|
+
"pytest-xdist>=3.8.0",
|
|
93
|
+
"pytest-asyncio>=1.3.0",
|
|
94
|
+
"pytest-playwright>=0.7.2",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
[tool.pytest.ini_options]
|
|
98
|
+
markers = [
|
|
99
|
+
"slow: tests taking >5s but no external deps (Ray retry, xray e2e). CI runs them; deselect locally for fast feedback ('-m \"not slow\"').",
|
|
100
|
+
"serial: mark test to be run sequentially (deselect with '-m \"not serial\"').",
|
|
101
|
+
"integration: requires a live browser (headless Chromium via Playwright). CI runs them after `playwright install`.",
|
|
102
|
+
"live_api: hits a real LLM provider — costs money, needs API key (e.g. ANTHROPIC_API_KEY). Auto-skipped without the key; never run by default CI. Run locally to validate cache-control / streaming / etc. against a real model.",
|
|
103
|
+
]
|
|
104
|
+
|
|
105
|
+
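[tool.setuptools.packages.find] # NOTE(review): build-backend above is uv_build, so this setuptools table looks unused — confirm before relying on it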
|
|
106
|
+
where = ["src"]
|
|
107
|
+
|
|
108
|
+
[tool.uv.sources]
|
|
109
|
+
cube-standard = { git = "https://github.com/The-AI-Alliance/cube-standard.git", branch = "dev" }
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
from cube_harness.experiment import EXP_DIR, make_experiment_output_dir
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
__version__ = version("cube-harness")
|
|
7
|
+
except PackageNotFoundError:
|
|
8
|
+
__version__ = "unknown"
|
|
9
|
+
|
|
10
|
+
__all__ = ["EXP_DIR", "__version__", "make_experiment_output_dir"]
|