swecc-mesocosm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ dist/
2
+ build/
3
+ *.egg-info/
4
+ __pycache__/
5
+ *.pyc
6
+ .pytest_cache/
7
+ .mypy_cache/
8
+ .ruff_cache/
9
+ .coverage
10
+ htmlcov/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 SWECC Labs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.4
2
+ Name: swecc-mesocosm
3
+ Version: 0.1.0
4
+ Summary: CLI and Python client for SWECC's benchmark and eval platform.
5
+ Project-URL: Homepage, https://github.com/swecc-uw/swecc-core
6
+ Project-URL: Repository, https://github.com/swecc-uw/swecc-core
7
+ Project-URL: Issues, https://github.com/swecc-uw/swecc-core/issues
8
+ Author-email: SWECC Labs <swecc@uw.edu>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 SWECC Labs
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: benchmark,evaluation,llm,swecc
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: Python :: 3.13
39
+ Classifier: Topic :: Software Development :: Testing
40
+ Requires-Python: >=3.11
41
+ Requires-Dist: httpx>=0.27
42
+ Requires-Dist: pydantic-settings>=2.5
43
+ Requires-Dist: pydantic>=2.9
44
+ Requires-Dist: rich>=13.7
45
+ Requires-Dist: typer>=0.12
46
+ Provides-Extra: dev
47
+ Requires-Dist: black; extra == 'dev'
48
+ Requires-Dist: build; extra == 'dev'
49
+ Requires-Dist: mypy; extra == 'dev'
50
+ Requires-Dist: pre-commit; extra == 'dev'
51
+ Requires-Dist: pytest; extra == 'dev'
52
+ Requires-Dist: pytest-asyncio; extra == 'dev'
53
+ Requires-Dist: ruff; extra == 'dev'
54
+ Requires-Dist: twine; extra == 'dev'
55
+ Provides-Extra: lint
56
+ Requires-Dist: black; extra == 'lint'
57
+ Requires-Dist: mypy; extra == 'lint'
58
+ Requires-Dist: ruff; extra == 'lint'
59
+ Provides-Extra: test
60
+ Requires-Dist: pytest; extra == 'test'
61
+ Requires-Dist: pytest-asyncio; extra == 'test'
62
+ Description-Content-Type: text/markdown
63
+
64
+ # swecc-mesocosm
65
+
66
+ CLI and Python client for SWECC's benchmark and eval platform.
67
+
68
+ A *mesocosm* is a small, enclosed environment used for controlled experiments — which is exactly what this tool helps you build, register, and run evals against.
69
+
70
+ ## Install
71
+
72
+ ```bash
73
+ pip install swecc-mesocosm
74
+ # or, with uv:
75
+ uv tool install swecc-mesocosm
76
+ # or, with pipx:
77
+ pipx install swecc-mesocosm
78
+ ```
79
+
80
+ For local development against this monorepo:
81
+
82
+ ```bash
83
+ pip install -e ./packages/swecc-mesocosm
84
+ ```
85
+
86
+ ## Configure
87
+
88
+ The CLI reads `MESOCOSM_BASE_URL` from the environment (default: `http://127.0.0.1:8010`, matching `BENCH_API_PORT` in docker compose). You can also pass `--base-url` to any command.
89
+
90
+ Production:
91
+
92
+ ```bash
93
+ export MESOCOSM_BASE_URL=https://api.swecc.org/bench
94
+ mesocosm doctor # verify health + openapi
95
+ ```
96
+
97
+ See `infra/mesocosm.env.example` in the monorepo root for a template. Set the variable to one of:
98
+
99
+ ```bash
100
+ export MESOCOSM_BASE_URL=http://127.0.0.1:8010 # docker compose
101
+ # or
102
+ export MESOCOSM_BASE_URL=https://api.swecc.org/bench
103
+ ```
104
+
105
+ ## Commands
106
+
107
+ ```bash
108
+ mesocosm --help
109
+
110
+ # connectivity check (bench-api health + openapi)
111
+ mesocosm doctor
112
+ mesocosm doctor --base-url https://api.swecc.org/bench
113
+
114
+ # inference + validation (no network)
115
+ mesocosm suggest "Wordle clone where the agent gets 6 guesses."
116
+ mesocosm validate ./my-domain.json
117
+
118
+ # domain CRUD
119
+ mesocosm register --id my-bench --name "My Bench" --owner-id me \
120
+ --description "Trivia about Python." --env-url https://envs.example.com/mybench
121
+ mesocosm publish my-bench
122
+ mesocosm get my-bench --artifacts
123
+ mesocosm list --status published
124
+
125
+ # evals
126
+ mesocosm eval test --domain-id my-bench --vow-version 1.0.0 --model openai/gpt-4o-mini
127
+ mesocosm eval run --domain-id my-bench --vow-version 1.0.0 --model openai/gpt-4o-mini \
128
+ --num-episodes 20 --seed-set '[1,2,3]'
129
+
130
+ # results
131
+ mesocosm run get <run-id>
132
+ mesocosm run episodes <run-id> --traces
133
+ ```
134
+
135
+ All commands print JSON to stdout (pretty when stdout is a TTY, compact otherwise), so they pipe cleanly into `jq`:
136
+
137
+ ```bash
138
+ mesocosm list --status published | jq '.[].id'
139
+ ```
140
+
141
+ ## Local vs bench-api commands
142
+
143
+ **Local** means the CLI does not call bench-api at `MESOCOSM_BASE_URL` (no HTTP to `/v1/...`). That is not the same as “no LLM”: model calls happen on the **server** when you use `eval` commands.
144
+
145
+ **Bench-api** means the command needs a reachable bench-api (`MESOCOSM_BASE_URL` or `--base-url` on the command).
146
+
147
+ ### Local (no bench-api)
148
+
149
+ | Command | What it does |
150
+ | --------| -------------|
151
+ | `mesocosm --version` / `-V` | Print the installed package version. |
152
+ | `mesocosm suggest <description>` | Regex heuristics on your text → JSON defaults (`benchmark_kind`, `scoring_source`, `max_steps`, `primary_metric`, `reasoning`, `tags`). Preview only; does not register. |
153
+ | `mesocosm validate <path>` | Check a domain JSON payload against shipped `policy/constraints.json` (`-` = stdin). Exit 0 if `ok`, else 1. |
154
+
155
+ These work without bench-api running.
156
+
157
+ ### Bench-api (HTTP)
158
+
159
+ | Command | API | What it does |
160
+ | --------|-----| -------------|
161
+ | `mesocosm register` | `POST /v1/domains` (409 → `PATCH`) | Build or load a payload, optionally run local `validate`, then upsert a draft domain. |
162
+ | `mesocosm publish <id>` | `POST /v1/domains/{id}/publish` | Publish a domain; print artifact SHA-256 digests. |
163
+ | `mesocosm get <id>` | `GET /v1/domains/{id}` | Fetch a domain; `--artifacts` adds synthesized contract files locally. |
164
+ | `mesocosm list` | `GET /v1/domains` | List domains (`--status`, `--json` for raw output). |
165
+ | `mesocosm eval test` | `POST /v1/test/episode` | One test episode (model + env on the server). |
166
+ | `mesocosm eval run` | `GET` domain + `POST /v1/runs` | Full eval run with aggregated scores. |
167
+ | `mesocosm run get <run-id>` | `GET /v1/runs/{id}` (+ episodes) | Run status and aggregate scores. |
168
+ | `mesocosm run episodes <run-id>` | `GET /v1/runs/{id}/episodes` | Episode list; `--traces` fetches traces too. |
169
+
170
+ `register` is hybrid: inference and `validate` run locally; the upsert step needs bench-api.
171
+
172
+ ```text
173
+ LOCAL BENCH-API
174
+ ──────────────────────────── ─────────────────────────────────────
175
+ mesocosm --version mesocosm register
176
+ mesocosm suggest "<desc>" mesocosm publish <id>
177
+ mesocosm validate <file> mesocosm get <id> [--artifacts]
178
+ mesocosm list [--status ...] [--json]
179
+ mesocosm eval test ...
180
+ mesocosm eval run ...
181
+ mesocosm run get <run-id>
182
+ mesocosm run episodes <run-id> [--traces]
183
+ ```
184
+
185
+ ## Python client
186
+
187
+ ```python
188
+ import asyncio
189
+ from swecc_mesocosm import BenchClient
190
+
191
+ async def main():
192
+ c = BenchClient(base_url="http://127.0.0.1:8010")
193
+ try:
194
+ domains = await c.list_domains(published_only=True)
195
+ print(len(domains), "published")
196
+ finally:
197
+ await c.aclose()
198
+
199
+ asyncio.run(main())
200
+ ```
201
+
202
+ ## Policy / constraints
203
+
204
+ `mesocosm validate` reads `swecc_mesocosm/policy/constraints.json` shipped with the package — required register fields, allowed model prefixes, etc. Edit that file (or fork the package) to tune for your event.
@@ -0,0 +1,141 @@
1
+ # swecc-mesocosm
2
+
3
+ CLI and Python client for SWECC's benchmark and eval platform.
4
+
5
+ A *mesocosm* is a small, enclosed environment used for controlled experiments — which is exactly what this tool helps you build, register, and run evals against.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install swecc-mesocosm
11
+ # or, with uv:
12
+ uv tool install swecc-mesocosm
13
+ # or, with pipx:
14
+ pipx install swecc-mesocosm
15
+ ```
16
+
17
+ For local development against this monorepo:
18
+
19
+ ```bash
20
+ pip install -e ./packages/swecc-mesocosm
21
+ ```
22
+
23
+ ## Configure
24
+
25
+ The CLI reads `MESOCOSM_BASE_URL` from the environment (default: `http://127.0.0.1:8010`, matching `BENCH_API_PORT` in docker compose). You can also pass `--base-url` to any command.
26
+
27
+ Production:
28
+
29
+ ```bash
30
+ export MESOCOSM_BASE_URL=https://api.swecc.org/bench
31
+ mesocosm doctor # verify health + openapi
32
+ ```
33
+
34
+ See `infra/mesocosm.env.example` in the monorepo root for a template. Set the variable to one of:
35
+
36
+ ```bash
37
+ export MESOCOSM_BASE_URL=http://127.0.0.1:8010 # docker compose
38
+ # or
39
+ export MESOCOSM_BASE_URL=https://api.swecc.org/bench
40
+ ```
41
+
42
+ ## Commands
43
+
44
+ ```bash
45
+ mesocosm --help
46
+
47
+ # connectivity check (bench-api health + openapi)
48
+ mesocosm doctor
49
+ mesocosm doctor --base-url https://api.swecc.org/bench
50
+
51
+ # inference + validation (no network)
52
+ mesocosm suggest "Wordle clone where the agent gets 6 guesses."
53
+ mesocosm validate ./my-domain.json
54
+
55
+ # domain CRUD
56
+ mesocosm register --id my-bench --name "My Bench" --owner-id me \
57
+ --description "Trivia about Python." --env-url https://envs.example.com/mybench
58
+ mesocosm publish my-bench
59
+ mesocosm get my-bench --artifacts
60
+ mesocosm list --status published
61
+
62
+ # evals
63
+ mesocosm eval test --domain-id my-bench --vow-version 1.0.0 --model openai/gpt-4o-mini
64
+ mesocosm eval run --domain-id my-bench --vow-version 1.0.0 --model openai/gpt-4o-mini \
65
+ --num-episodes 20 --seed-set '[1,2,3]'
66
+
67
+ # results
68
+ mesocosm run get <run-id>
69
+ mesocosm run episodes <run-id> --traces
70
+ ```
71
+
72
+ All commands print JSON to stdout (pretty when stdout is a TTY, compact otherwise), so they pipe cleanly into `jq`:
73
+
74
+ ```bash
75
+ mesocosm list --status published | jq '.[].id'
76
+ ```
77
+
78
+ ## Local vs bench-api commands
79
+
80
+ **Local** means the CLI does not call bench-api at `MESOCOSM_BASE_URL` (no HTTP to `/v1/...`). That is not the same as “no LLM”: model calls happen on the **server** when you use `eval` commands.
81
+
82
+ **Bench-api** means the command needs a reachable bench-api (`MESOCOSM_BASE_URL` or `--base-url` on the command).
83
+
84
+ ### Local (no bench-api)
85
+
86
+ | Command | What it does |
87
+ | --------| -------------|
88
+ | `mesocosm --version` / `-V` | Print the installed package version. |
89
+ | `mesocosm suggest <description>` | Regex heuristics on your text → JSON defaults (`benchmark_kind`, `scoring_source`, `max_steps`, `primary_metric`, `reasoning`, `tags`). Preview only; does not register. |
90
+ | `mesocosm validate <path>` | Check a domain JSON payload against shipped `policy/constraints.json` (`-` = stdin). Exit 0 if `ok`, else 1. |
91
+
92
+ These work without bench-api running.
93
+
94
+ ### Bench-api (HTTP)
95
+
96
+ | Command | API | What it does |
97
+ | --------|-----| -------------|
98
+ | `mesocosm register` | `POST /v1/domains` (409 → `PATCH`) | Build or load a payload, optionally run local `validate`, then upsert a draft domain. |
99
+ | `mesocosm publish <id>` | `POST /v1/domains/{id}/publish` | Publish a domain; print artifact SHA-256 digests. |
100
+ | `mesocosm get <id>` | `GET /v1/domains/{id}` | Fetch a domain; `--artifacts` adds synthesized contract files locally. |
101
+ | `mesocosm list` | `GET /v1/domains` | List domains (`--status`, `--json` for raw output). |
102
+ | `mesocosm eval test` | `POST /v1/test/episode` | One test episode (model + env on the server). |
103
+ | `mesocosm eval run` | `GET` domain + `POST /v1/runs` | Full eval run with aggregated scores. |
104
+ | `mesocosm run get <run-id>` | `GET /v1/runs/{id}` (+ episodes) | Run status and aggregate scores. |
105
+ | `mesocosm run episodes <run-id>` | `GET /v1/runs/{id}/episodes` | Episode list; `--traces` fetches traces too. |
106
+
107
+ `register` is hybrid: inference and `validate` run locally; the upsert step needs bench-api.
108
+
109
+ ```text
110
+ LOCAL BENCH-API
111
+ ──────────────────────────── ─────────────────────────────────────
112
+ mesocosm --version mesocosm register
113
+ mesocosm suggest "<desc>" mesocosm publish <id>
114
+ mesocosm validate <file> mesocosm get <id> [--artifacts]
115
+ mesocosm list [--status ...] [--json]
116
+ mesocosm eval test ...
117
+ mesocosm eval run ...
118
+ mesocosm run get <run-id>
119
+ mesocosm run episodes <run-id> [--traces]
120
+ ```
121
+
122
+ ## Python client
123
+
124
+ ```python
125
+ import asyncio
126
+ from swecc_mesocosm import BenchClient
127
+
128
+ async def main():
129
+ c = BenchClient(base_url="http://127.0.0.1:8010")
130
+ try:
131
+ domains = await c.list_domains(published_only=True)
132
+ print(len(domains), "published")
133
+ finally:
134
+ await c.aclose()
135
+
136
+ asyncio.run(main())
137
+ ```
138
+
139
+ ## Policy / constraints
140
+
141
+ `mesocosm validate` reads `swecc_mesocosm/policy/constraints.json` shipped with the package — required register fields, allowed model prefixes, etc. Edit that file (or fork the package) to tune for your event.
@@ -0,0 +1,108 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.21.0"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "swecc-mesocosm"
7
+ version = "0.1.0"
8
+ description = "CLI and Python client for SWECC's benchmark and eval platform."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { file = "LICENSE" }
12
+ keywords = ["benchmark", "evaluation", "llm", "swecc"]
13
+ authors = [
14
+ { name = "SWECC Labs", email = "swecc@uw.edu" },
15
+ ]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Software Development :: Testing",
25
+ ]
26
+ dependencies = [
27
+ "httpx>=0.27",
28
+ "pydantic>=2.9",
29
+ "pydantic-settings>=2.5",
30
+ "typer>=0.12",
31
+ "rich>=13.7",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ dev = [
36
+ "pytest",
37
+ "pytest-asyncio",
38
+ "mypy",
39
+ "ruff",
40
+ "black",
41
+ "pre-commit",
42
+ "build",
43
+ "twine",
44
+ ]
45
+ lint = [
46
+ "ruff",
47
+ "black",
48
+ "mypy",
49
+ ]
50
+ test = [
51
+ "pytest",
52
+ "pytest-asyncio",
53
+ ]
54
+
55
+ [project.urls]
56
+ Homepage = "https://github.com/swecc-uw/swecc-core"
57
+ Repository = "https://github.com/swecc-uw/swecc-core"
58
+ Issues = "https://github.com/swecc-uw/swecc-core/issues"
59
+
60
+ [project.scripts]
61
+ mesocosm = "swecc_mesocosm.cli:main"
62
+
63
+ [tool.hatch.build]
64
+ exclude = [
65
+ "tests/*",
66
+ "venv/*",
67
+ ".git/*",
68
+ "*.pyc",
69
+ "__pycache__/*",
70
+ ".pytest_cache/*",
71
+ ".mypy_cache/*",
72
+ ".ruff_cache/*",
73
+ "dist/*",
74
+ "build/*",
75
+ ]
76
+
77
+ [tool.hatch.build.targets.wheel]
78
+ packages = ["swecc_mesocosm"]
79
+
80
+ [tool.ruff]
81
+ line-length = 100
82
+ target-version = "py311"
83
+ fix = true
84
+ unsafe-fixes = false
85
+ lint.select = ["E", "F", "I", "N", "W", "B", "UP", "RUF"]
86
+ lint.ignore = []
87
+
88
+ [tool.ruff.lint.isort]
89
+ known-first-party = ["swecc_mesocosm"]
90
+
91
+ [tool.mypy]
92
+ python_version = "3.11"
93
+ strict = true
94
+ ignore_missing_imports = true
95
+ disallow_untyped_defs = true
96
+ disallow_incomplete_defs = true
97
+ warn_redundant_casts = true
98
+ warn_unused_ignores = true
99
+ no_implicit_optional = true
100
+ show_error_codes = true
101
+
102
+ [tool.pytest.ini_options]
103
+ testpaths = ["tests"]
104
+ python_files = ["test_*.py"]
105
+ filterwarnings = [
106
+ "ignore::DeprecationWarning",
107
+ "ignore::UserWarning",
108
+ ]
@@ -0,0 +1,12 @@
1
+ """swecc-mesocosm — CLI and Python client for SWECC's bench platform."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from swecc_mesocosm.client import BenchClient
6
+
7
+ try:
8
+ __version__ = version("swecc-mesocosm")
9
+ except PackageNotFoundError:
10
+ __version__ = "0.0.0.dev"
11
+
12
+ __all__ = ["BenchClient", "__version__"]
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from typing import Any
6
+
7
+
8
def _canonical_json(obj: Any) -> bytes:
    """Serialize *obj* to a deterministic JSON byte string.

    Keys are sorted, separators carry no whitespace, and non-ASCII characters
    are written literally (not ``\\u``-escaped) before the text is encoded as
    UTF-8.
    """
    text = json.dumps(
        obj,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ":"),
    )
    return text.encode("utf-8")
12
+
13
+
14
def sha256_digest(obj: Any) -> str:
    """Return the SHA-256 digest of *obj* formatted as ``sha256:<hex>``.

    The hash is computed over the bytes returned by ``_canonical_json(obj)``.
    """
    hex_digest = hashlib.sha256(_canonical_json(obj)).hexdigest()
    return "sha256:" + hex_digest
17
+
18
+
19
def compile_benchmark_artifacts(domain: dict[str, Any]) -> dict[str, Any]:
    """Build the artifact views derived from a Domain mapping.

    Args:
        domain: Domain data; the keys ``binding_vow``, ``scoring``, ``id`` and
            ``status`` are read with ``dict.get``, so any of them may be absent.

    Returns:
        A dict with three entries, keyed by artifact file name:

        * ``contract.json`` - the domain's ``binding_vow`` value (``None`` if
          the key is missing).
        * ``eval_profile.json`` - the domain's ``scoring``, ``id`` (as
          ``domain_id``) and ``status``.
        * ``dataset.lock.json`` - a fixed explanatory note plus ``domain_id``.
    """
    domain_id = domain.get("id")
    lock_note = (
        "No dataset lock is stored on the server yet. Pin seeds and env version in your repo."
    )
    return {
        "contract.json": domain.get("binding_vow"),
        "eval_profile.json": {
            "scoring": domain.get("scoring"),
            "domain_id": domain_id,
            "status": domain.get("status"),
        },
        "dataset.lock.json": {
            "note": lock_note,
            "domain_id": domain_id,
        },
    }