itemeval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- itemeval-0.1.0/.github/workflows/ci.yml +42 -0
- itemeval-0.1.0/.github/workflows/release.yml +32 -0
- itemeval-0.1.0/.github/workflows/sync-wiki.yml +33 -0
- itemeval-0.1.0/.gitignore +26 -0
- itemeval-0.1.0/.python-version +1 -0
- itemeval-0.1.0/CHANGELOG.md +117 -0
- itemeval-0.1.0/CLAUDE.md +35 -0
- itemeval-0.1.0/DEVELOPMENT.md +92 -0
- itemeval-0.1.0/LICENSE +21 -0
- itemeval-0.1.0/PKG-INFO +279 -0
- itemeval-0.1.0/README.md +239 -0
- itemeval-0.1.0/ROADMAP.md +96 -0
- itemeval-0.1.0/configs/quickstart_aime.yaml +30 -0
- itemeval-0.1.0/configs/usamo_demo.yaml +31 -0
- itemeval-0.1.0/configs/usamo_demo_gate.yaml +31 -0
- itemeval-0.1.0/configs/usamo_smoke.yaml +37 -0
- itemeval-0.1.0/docs/wiki/Architecture.md +121 -0
- itemeval-0.1.0/docs/wiki/Budget-and-Costs.md +76 -0
- itemeval-0.1.0/docs/wiki/CLI.md +116 -0
- itemeval-0.1.0/docs/wiki/Configuration.md +133 -0
- itemeval-0.1.0/docs/wiki/Error-Handling.md +148 -0
- itemeval-0.1.0/docs/wiki/FAQ.md +97 -0
- itemeval-0.1.0/docs/wiki/Getting-Started.md +95 -0
- itemeval-0.1.0/docs/wiki/Home.md +59 -0
- itemeval-0.1.0/docs/wiki/Outputs-and-Schemas.md +97 -0
- itemeval-0.1.0/docs/wiki/Pipeline-Concepts.md +101 -0
- itemeval-0.1.0/docs/wiki/Python-API.md +130 -0
- itemeval-0.1.0/pyproject.toml +73 -0
- itemeval-0.1.0/src/itemeval/__init__.py +60 -0
- itemeval-0.1.0/src/itemeval/_builtin/config.yaml +48 -0
- itemeval-0.1.0/src/itemeval/_builtin/prompts/solver/minimal.md +4 -0
- itemeval-0.1.0/src/itemeval/_builtin/prompts/solver/standard.md +6 -0
- itemeval-0.1.0/src/itemeval/_builtin/rubrics/standard.md +17 -0
- itemeval-0.1.0/src/itemeval/_config.py +252 -0
- itemeval-0.1.0/src/itemeval/_errors.py +25 -0
- itemeval-0.1.0/src/itemeval/_item.py +29 -0
- itemeval-0.1.0/src/itemeval/_manifest.py +196 -0
- itemeval-0.1.0/src/itemeval/_mockmodels.py +68 -0
- itemeval-0.1.0/src/itemeval/_prepare.py +85 -0
- itemeval-0.1.0/src/itemeval/_status.py +165 -0
- itemeval-0.1.0/src/itemeval/_templates.py +161 -0
- itemeval-0.1.0/src/itemeval/_util.py +46 -0
- itemeval-0.1.0/src/itemeval/adapters/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/adapters/_base.py +98 -0
- itemeval-0.1.0/src/itemeval/adapters/_hf.py +85 -0
- itemeval-0.1.0/src/itemeval/budget/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/budget/_estimator.py +195 -0
- itemeval-0.1.0/src/itemeval/budget/_gate.py +69 -0
- itemeval-0.1.0/src/itemeval/budget/_policies.py +40 -0
- itemeval-0.1.0/src/itemeval/budget/_pricing.py +131 -0
- itemeval-0.1.0/src/itemeval/budget/pricing_seed.json +13 -0
- itemeval-0.1.0/src/itemeval/cli.py +407 -0
- itemeval-0.1.0/src/itemeval/design/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/design/_grid.py +178 -0
- itemeval-0.1.0/src/itemeval/design/_ids.py +25 -0
- itemeval-0.1.0/src/itemeval/generate/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/generate/_params.py +50 -0
- itemeval-0.1.0/src/itemeval/generate/_run.py +397 -0
- itemeval-0.1.0/src/itemeval/generate/_task.py +61 -0
- itemeval-0.1.0/src/itemeval/grade/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/grade/_judge.py +96 -0
- itemeval-0.1.0/src/itemeval/grade/_parse.py +94 -0
- itemeval-0.1.0/src/itemeval/grade/_run.py +333 -0
- itemeval-0.1.0/src/itemeval/grade/_verifiable.py +84 -0
- itemeval-0.1.0/src/itemeval/py.typed +0 -0
- itemeval-0.1.0/src/itemeval/store/__init__.py +1 -0
- itemeval-0.1.0/src/itemeval/store/_base.py +59 -0
- itemeval-0.1.0/src/itemeval/store/_export.py +183 -0
- itemeval-0.1.0/src/itemeval/store/_gradings.py +87 -0
- itemeval-0.1.0/src/itemeval/store/_items.py +47 -0
- itemeval-0.1.0/src/itemeval/store/_layout.py +47 -0
- itemeval-0.1.0/src/itemeval/store/_ledger.py +37 -0
- itemeval-0.1.0/src/itemeval/store/_logs.py +38 -0
- itemeval-0.1.0/src/itemeval/store/_solutions.py +93 -0
- itemeval-0.1.0/tests/conftest.py +129 -0
- itemeval-0.1.0/tests/test_adapter_hf.py +30 -0
- itemeval-0.1.0/tests/test_adapter_hf_io.py +107 -0
- itemeval-0.1.0/tests/test_adapter_mapping.py +125 -0
- itemeval-0.1.0/tests/test_cli.py +87 -0
- itemeval-0.1.0/tests/test_condition_ids.py +35 -0
- itemeval-0.1.0/tests/test_config.py +148 -0
- itemeval-0.1.0/tests/test_demo_configs.py +61 -0
- itemeval-0.1.0/tests/test_estimator.py +91 -0
- itemeval-0.1.0/tests/test_export.py +71 -0
- itemeval-0.1.0/tests/test_generate_run.py +95 -0
- itemeval-0.1.0/tests/test_grade_run.py +206 -0
- itemeval-0.1.0/tests/test_grid.py +131 -0
- itemeval-0.1.0/tests/test_integration_pipeline.py +54 -0
- itemeval-0.1.0/tests/test_item.py +31 -0
- itemeval-0.1.0/tests/test_judge_parse.py +80 -0
- itemeval-0.1.0/tests/test_manifest.py +44 -0
- itemeval-0.1.0/tests/test_mockmodels.py +45 -0
- itemeval-0.1.0/tests/test_policies_gate.py +67 -0
- itemeval-0.1.0/tests/test_pricing.py +97 -0
- itemeval-0.1.0/tests/test_public_api.py +109 -0
- itemeval-0.1.0/tests/test_status.py +99 -0
- itemeval-0.1.0/tests/test_store.py +238 -0
- itemeval-0.1.0/tests/test_templates.py +100 -0
- itemeval-0.1.0/tests/test_util.py +48 -0
- itemeval-0.1.0/tests/test_verifiable.py +104 -0
- itemeval-0.1.0/uv.lock +3428 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
concurrency:
|
|
9
|
+
group: ci-${{ github.ref }}
|
|
10
|
+
cancel-in-progress: true
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
name: lint + test (py${{ matrix.python-version }})
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
strategy:
|
|
17
|
+
fail-fast: false
|
|
18
|
+
matrix:
|
|
19
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v5
|
|
22
|
+
|
|
23
|
+
- name: Install uv + Python ${{ matrix.python-version }}
|
|
24
|
+
uses: astral-sh/setup-uv@v8.2.0
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
enable-cache: true
|
|
28
|
+
cache-dependency-glob: "uv.lock"
|
|
29
|
+
|
|
30
|
+
- name: Sync (locked, dev group)
|
|
31
|
+
run: uv sync --locked
|
|
32
|
+
|
|
33
|
+
- name: Ruff lint
|
|
34
|
+
run: uv run ruff check .
|
|
35
|
+
|
|
36
|
+
- name: Ruff format check
|
|
37
|
+
run: uv run ruff format --check .
|
|
38
|
+
|
|
39
|
+
# Unit tests only; the `network` test (HF Hub) is excluded to keep CI
|
|
40
|
+
# hermetic — run it manually (`uv run pytest -m network`) when touching the adapter.
|
|
41
|
+
- name: Pytest
|
|
42
|
+
run: uv run pytest -m "not network"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Publishes to PyPI via trusted publishing (OIDC) — no API token stored.
|
|
4
|
+
# Triggered when a GitHub Release is published; the workflow is taken from the
|
|
5
|
+
# default branch, so it fires for tags created before this file existed.
|
|
6
|
+
#
|
|
7
|
+
# One-time PyPI setup (project settings -> Publishing -> add a trusted publisher):
|
|
8
|
+
# Owner: luozm | Repository: itemeval | Workflow: release.yml | Environment: (blank)
|
|
9
|
+
on:
|
|
10
|
+
release:
|
|
11
|
+
types: [published]
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
pypi-publish:
|
|
15
|
+
name: Build + publish to PyPI
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
permissions:
|
|
18
|
+
id-token: write # mint the OIDC token PyPI trusted publishing verifies
|
|
19
|
+
contents: read
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v5
|
|
22
|
+
|
|
23
|
+
- name: Install uv
|
|
24
|
+
uses: astral-sh/setup-uv@v8.2.0
|
|
25
|
+
|
|
26
|
+
- name: Build sdist + wheel
|
|
27
|
+
run: uv build
|
|
28
|
+
|
|
29
|
+
# --trusted-publishing always: fail loudly if OIDC isn't available rather
|
|
30
|
+
# than silently falling back to looking for credentials.
|
|
31
|
+
- name: Publish
|
|
32
|
+
run: uv publish --trusted-publishing always
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Sync wiki
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- "docs/wiki/**"
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
sync:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v5
|
|
17
|
+
|
|
18
|
+
- name: Push docs/wiki to the wiki repo
|
|
19
|
+
env:
|
|
20
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
21
|
+
run: |
|
|
22
|
+
git clone "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.wiki.git" wiki
|
|
23
|
+
rsync -av --delete --exclude .git docs/wiki/ wiki/
|
|
24
|
+
cd wiki
|
|
25
|
+
git config user.name "github-actions[bot]"
|
|
26
|
+
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
|
27
|
+
git add -A
|
|
28
|
+
if git diff --cached --quiet; then
|
|
29
|
+
echo "Wiki already up to date."
|
|
30
|
+
exit 0
|
|
31
|
+
fi
|
|
32
|
+
git commit -m "Sync wiki from docs/wiki @ ${GITHUB_SHA::7}"
|
|
33
|
+
git push
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
|
|
8
|
+
# Environments & secrets
|
|
9
|
+
.venv/
|
|
10
|
+
.env
|
|
11
|
+
|
|
12
|
+
# Tooling caches
|
|
13
|
+
.pytest_cache/
|
|
14
|
+
.ruff_cache/
|
|
15
|
+
.coverage
|
|
16
|
+
.coverage.*
|
|
17
|
+
htmlcov/
|
|
18
|
+
|
|
19
|
+
# Demo study outputs (itemeval generate/grade/export artifacts)
|
|
20
|
+
studies/
|
|
21
|
+
|
|
22
|
+
# Internal build-time design contract — kept locally, not published
|
|
23
|
+
docs/DESIGN.md
|
|
24
|
+
|
|
25
|
+
# OS
|
|
26
|
+
.DS_Store
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to itemeval are documented here. Format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com); versioning follows
|
|
5
|
+
[SemVer](https://semver.org) (pre-1.0: minor bumps may break APIs).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0] - 2026-06-10
|
|
10
|
+
|
|
11
|
+
First public release. Item-level LLM evaluation over any inspect_ai-supported
|
|
12
|
+
provider, with a two-stage generate/grade pipeline, long-format item-response
|
|
13
|
+
export, and a budget layer.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- Core data model and config (M1): canonical `Item` model; full pydantic
|
|
17
|
+
experiment-config schema validating the README YAML sketch as-is
|
|
18
|
+
(`load_config`); content-derived stable condition ids; facet grid expansion
|
|
19
|
+
with full crossing.
|
|
20
|
+
- HuggingFace benchmark adapter (M1): field-mapping spec → canonical items,
|
|
21
|
+
revision pinned at first run via a per-study `dataset_locks.json`.
|
|
22
|
+
- Run manifests (M1): dataset revisions, template content hashes, model ids,
|
|
23
|
+
requested sampling params (effective values backfilled per condition after
|
|
24
|
+
each run), package versions, full condition grid — one JSON per run.
|
|
25
|
+
- Generate stage (M2): one inspect task per (model × prompt × model-config)
|
|
26
|
+
cell, `epochs` = replications, thinking/reasoning toggles as model-config
|
|
27
|
+
facets, requested vs effective sampling params recorded per row, resumable
|
|
28
|
+
solutions parquet store + raw `.eval` log index.
|
|
29
|
+
- Grade stage (M3): verifiable scorers (exact match / multiple choice /
|
|
30
|
+
numeric, $0) and judge-as-task (grading dataset built from stored solutions,
|
|
31
|
+
judge temperature pinned to 0, prompt caching enabled); strict structured
|
|
32
|
+
score parsing with parse failures flagged in-table, never dropped;
|
|
33
|
+
re-runnable per (grader × rubric) without touching solutions.
|
|
34
|
+
- Export (M4): long-format gradings table (45 columns: scores, judge
|
|
35
|
+
reasoning, tokens, USD, latency, full provenance), parquet + CSV mirrors,
|
|
36
|
+
per-run cost ledger attributed generation vs grading with internal
|
|
37
|
+
reconciliation check.
|
|
38
|
+
- Budget layer (M5): packaged pricing seed + OpenRouter pricing refresh,
|
|
39
|
+
per-stage dry-run estimator, `confirm_above_usd` gate (exit 3) and
|
|
40
|
+
non-overridable `max_usd` cap (exit 4), `dev`/`full-interactive`/
|
|
41
|
+
`full-batch` policies, batch-API wiring with documented ~50% discount
|
|
42
|
+
approximation.
|
|
43
|
+
- CLI (M6): `estimate | generate | grade | export | status` with consistent
|
|
44
|
+
UX, `--json` output, repeatable `--condition/--grader/--rubric` filters,
|
|
45
|
+
resumability and grid-completion reporting.
|
|
46
|
+
- `mockllm/*` pass-through: any mock model id runs the full pipeline free and
|
|
47
|
+
deterministically (used by all demos and tests; `configs/usamo_demo.yaml`).
|
|
48
|
+
- Public Python API: the pipeline is drivable programmatically as well as via
|
|
49
|
+
the CLI — `prepare_study`, `estimate_study`, `run_generate`, `run_grade`,
|
|
50
|
+
`export_study`, `build_status` exported from `itemeval` (lazily, so
|
|
51
|
+
`import itemeval` stays light). The budget confirmation gate remains a
|
|
52
|
+
CLI-layer feature.
|
|
53
|
+
- Dependency: `datasets` (HuggingFace) for the HF adapter.
|
|
54
|
+
- Built-in template library: prompts `minimal`/`standard` and rubric `standard`
|
|
55
|
+
ship inside the package and are referenced as `builtin:<name>`. A bare name
|
|
56
|
+
still resolves to a local file under `prompts_dir`/`rubrics_dir`; the two
|
|
57
|
+
namespaces are distinct and never silently shadow each other — each template
|
|
58
|
+
is recorded in the run manifest with its `source` (`local`/`builtin`) and
|
|
59
|
+
content hash, and built-in templates record a machine-independent path.
|
|
60
|
+
- `itemeval init DIR [--with-templates] [--force]`: scaffold a runnable starter
|
|
61
|
+
study (`config.yaml`). `--with-templates` also copies the referenced built-in
|
|
62
|
+
prompts/rubrics locally as editable starters. Makes `pip install itemeval`
|
|
63
|
+
usable without cloning the repo.
|
|
64
|
+
- `solvers.on_empty` policy (`skip` default / `rerun` / `grade`) for completed
|
|
65
|
+
generations that produced no gradable text (empty/blank `solution`, no API
|
|
66
|
+
error — e.g. a reasoning model whose token budget was spent entirely on
|
|
67
|
+
hidden reasoning). Empty no-error completions are a distinct channel from API
|
|
68
|
+
errors (re-attempted) and parse failures (final): `skip` excludes them from
|
|
69
|
+
grading, `rerun` also makes them eligible for regeneration on the next
|
|
70
|
+
`generate`, `grade` sends them to the judge as-is. They are always surfaced —
|
|
71
|
+
`grade` reports the count and stop-reason breakdown, and `status` gains an
|
|
72
|
+
`empty` column — never silently folded into a green "complete".
|
|
73
|
+
- Provider/endpoint provenance for cost attribution: `ledger.parquet` gains a
|
|
74
|
+
`provider` column (the inspect prefix of `model`), and run manifests gain
|
|
75
|
+
`endpoints_effective` per condition (`{provider, base_url, served_model}`,
|
|
76
|
+
backfilled after the run) — recording which provider, endpoint, and
|
|
77
|
+
provider-returned model snapshot actually answered. `base_url` is null on the
|
|
78
|
+
provider's default endpoint; a non-null value flags traffic routed elsewhere
|
|
79
|
+
(Azure/proxy/gateway).
|
|
80
|
+
|
|
81
|
+
### Changed
|
|
82
|
+
- **Path resolution split by intent** (behavior change). Inputs (`prompts_dir`,
|
|
83
|
+
`rubrics_dir`, `budget.pricing_path`) still anchor to the config file's
|
|
84
|
+
directory; outputs (`output_dir`, i.e. the study tree) now anchor to a **work
|
|
85
|
+
directory** defaulting to the current directory, never the config dir or the
|
|
86
|
+
installed package. New `-C/--base-dir` (CLI) and `load_config(work_dir=...)`
|
|
87
|
+
(Python) override the output anchor. The example configs drop their `../`
|
|
88
|
+
prefixes accordingly.
|
|
89
|
+
- Default `facets.prompt` / `facets.rubric` are now `[builtin:standard]`
|
|
90
|
+
(were `[default]`, which referenced a template that never existed).
|
|
91
|
+
- Template reference resolution and validation moved ahead of study-directory creation:
|
|
92
|
+
an unresolved template now fails before any output directory is written.
|
|
93
|
+
|
|
94
|
+
### Packaging
|
|
95
|
+
- Provider-SDK optional extras (`openai`, `anthropic`, `google`, `all`),
|
|
96
|
+
mirroring inspect_ai's lazy provider imports. Install the extra for the
|
|
97
|
+
provider you run, e.g. `pip install itemeval[openai]` — the `openai` extra
|
|
98
|
+
also covers OpenRouter and other OpenAI-compatible providers. The base
|
|
99
|
+
install stays SDK-free; running a real provider without its extra raises
|
|
100
|
+
inspect_ai's `PrerequisiteError` with the install hint.
|
|
101
|
+
- Ship a `py.typed` marker (PEP 561): downstream type checkers now see
|
|
102
|
+
itemeval's annotations. Added the `Typing :: Typed` and Python 3.11/3.12
|
|
103
|
+
classifiers.
|
|
104
|
+
- Relaxed the `pyarrow` (`>=24` → `>=15`) and `datasets` (`>=5` → `>=3`)
|
|
105
|
+
lower bounds to the oldest versions whose APIs we actually use, easing
|
|
106
|
+
co-installation; dev/CI still pin the latest via `uv.lock`. The full test
|
|
107
|
+
suite passes at both the floor and the locked versions.
|
|
108
|
+
- Expanded `[project.urls]` (Homepage, Documentation → wiki, Changelog, Issues)
|
|
109
|
+
and switched the README's PyPI-facing links to absolute GitHub URLs.
|
|
110
|
+
- Minimum Python is now 3.11 (was 3.10). The tested dependency stack resolves
|
|
111
|
+
pandas 3.x, which requires Python >=3.11, so 3.10 could only ever install a
|
|
112
|
+
different (pandas 2.x) stack that was never tested. Floor now matches the
|
|
113
|
+
tested stack; `uv.lock` reconciled to a single resolution (dropped the
|
|
114
|
+
3.10-only `exceptiongroup`/`tomli`/`async-timeout`/`pytz` backports).
|
|
115
|
+
|
|
116
|
+
[Unreleased]: https://github.com/luozm/itemeval/compare/v0.1.0...HEAD
|
|
117
|
+
[0.1.0]: https://github.com/luozm/itemeval/releases/tag/v0.1.0
|
itemeval-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# CLAUDE.md — itemeval (package development)
|
|
2
|
+
|
|
3
|
+
Publishable Python package: item-level LLM evaluation on inspect_ai.
|
|
4
|
+
`README.md` is the spec; `ROADMAP.md` is the milestone plan — keep it updated
|
|
5
|
+
as milestones complete. `DEVELOPMENT.md` defines the inspect_ai upgrade
|
|
6
|
+
pipeline and the versioning/release process — follow it for any dependency
|
|
7
|
+
bump or release; update `CHANGELOG.md` ([Unreleased]) in the same change that
|
|
8
|
+
makes a user-visible difference. Any study consuming this package lives in its
|
|
9
|
+
own separate repo; never put study-specific content (a particular study's
|
|
10
|
+
datasets, rubric texts, or analysis) in this package.
|
|
11
|
+
|
|
12
|
+
## Python environment
|
|
13
|
+
|
|
14
|
+
- Managed by `uv`. All Python runs against `./.venv`; system Python is never used.
|
|
15
|
+
- Invoke by explicit venv path, never activate: `./.venv/bin/python -m pytest`.
|
|
16
|
+
Never emit `source .venv/bin/activate`.
|
|
17
|
+
- New deps: `uv add <pkg>` (runtime) / `uv add --dev <pkg>` (dev). Never call
|
|
18
|
+
pip directly. `pyproject.toml` carries ranges; `uv.lock` pins exactly and is
|
|
19
|
+
committed.
|
|
20
|
+
- Recreate from scratch: `uv sync`. `./.venv` is disposable.
|
|
21
|
+
- Supports Python >=3.11 (don't use 3.12+ syntax); develop on 3.12. Floor is
|
|
22
|
+
3.11 because the tested stack pulls pandas 3.x, which requires >=3.11.
|
|
23
|
+
|
|
24
|
+
## Conventions
|
|
25
|
+
|
|
26
|
+
- src layout: code in `src/itemeval/`; tests import the installed package.
|
|
27
|
+
- Public API is exported from `itemeval/__init__.py`; everything else is
|
|
28
|
+
internal and free to refactor. Prefix private modules with `_`.
|
|
29
|
+
- pydantic models for all config/data schemas; YAML configs validated at load.
|
|
30
|
+
- Lint/format: `./.venv/bin/python -m ruff check . && ./.venv/bin/python -m ruff format .`
|
|
31
|
+
- Tests: `./.venv/bin/python -m pytest`. Unit tests must not call paid APIs;
|
|
32
|
+
anything touching providers is mocked or marked for manual runs.
|
|
33
|
+
- No real API keys in tests, fixtures, or examples.
|
|
34
|
+
- Conventional commits (feat:/fix:/docs:/test:/refactor:); update CHANGELOG.md
|
|
35
|
+
for user-visible changes (under `[Unreleased]`).
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Development Guide
|
|
2
|
+
|
|
3
|
+
Process documentation for maintaining itemeval. For coding conventions see
|
|
4
|
+
`CLAUDE.md`; for milestones see `ROADMAP.md`.
|
|
5
|
+
|
|
6
|
+
## Dependency policy
|
|
7
|
+
|
|
8
|
+
- `pyproject.toml` declares **ranges** (lower bounds, e.g. `inspect-ai>=0.3.239`)
|
|
9
|
+
— this is the contract published to package users.
|
|
10
|
+
- `uv.lock` pins **exact versions** of everything — this is what `uv sync`
|
|
11
|
+
reproduces locally and in CI. Always committed.
|
|
12
|
+
- Add/remove only via `uv add <pkg>` / `uv remove <pkg>`; never edit dependency
|
|
13
|
+
lists by hand and never call pip.
|
|
14
|
+
- Upper-bound pins (`<X`) are allowed only as a temporary response to a known
|
|
15
|
+
breakage, with a linked issue and removal plan.
|
|
16
|
+
|
|
17
|
+
## inspect_ai upgrade pipeline
|
|
18
|
+
|
|
19
|
+
inspect-ai is the load-bearing dependency and releases frequently. Upgrades are
|
|
20
|
+
**deliberate, never incidental** — routine `uv sync` keeps using the lockfile
|
|
21
|
+
pin, so versions only move when we move them.
|
|
22
|
+
|
|
23
|
+
Cadence: at the start of each ROADMAP milestone, and before any large paid run
|
|
24
|
+
in a consuming study.
|
|
25
|
+
|
|
26
|
+
1. **Check what's new**
|
|
27
|
+
```bash
|
|
28
|
+
uv tree --package inspect-ai --depth 0 # current pinned version
|
|
29
|
+
```
|
|
30
|
+
Read the release notes: https://github.com/UKGovernmentBEIS/inspect_ai/releases
|
|
31
|
+
Watch for: model-provider changes, batch/caching behavior, log-format (.eval
|
|
32
|
+
schema) changes, dataframe API (`samples_df`/`evals_df`) changes.
|
|
33
|
+
2. **Upgrade on a branch**
|
|
34
|
+
```bash
|
|
35
|
+
git checkout -b chore/bump-inspect-ai
|
|
36
|
+
uv lock --upgrade-package inspect-ai && uv sync
|
|
37
|
+
```
|
|
38
|
+
3. **Unit tests** (no API calls): `./.venv/bin/python -m pytest`
|
|
39
|
+
4. **Live smoke test** (manual, costs cents): run the consuming study's pilot
|
|
40
|
+
config at `dev` scope end-to-end (generate → grade → export) and confirm
|
|
41
|
+
the export schema and cost ledger are unchanged.
|
|
42
|
+
5. **Commit** the lockfile bump: `chore: bump inspect-ai 0.3.X -> 0.3.Y`, with
|
|
43
|
+
any behavior notes in the body. Merge.
|
|
44
|
+
6. **On breakage**: pin a temporary upper bound in `pyproject.toml`
|
|
45
|
+
(`inspect-ai>=0.3.X,<0.3.Y`), open an issue describing the incompatibility,
|
|
46
|
+
and remove the bound when fixed.
|
|
47
|
+
|
|
48
|
+
Once CI exists (ROADMAP M7): add a scheduled weekly GitHub Actions job (or
|
|
49
|
+
Renovate/Dependabot) that opens the upgrade PR automatically; steps 3 and 5 run in
|
|
50
|
+
CI, step 4 stays manual.
|
|
51
|
+
|
|
52
|
+
All other dependencies: `uv lock --upgrade && uv sync` quarterly, same
|
|
53
|
+
branch-test-commit flow, less scrutiny.
|
|
54
|
+
|
|
55
|
+
## Versioning discipline
|
|
56
|
+
|
|
57
|
+
**Semantic versioning**, version lives in one place: `pyproject.toml`
|
|
58
|
+
(`itemeval.__version__` reads it via package metadata — never duplicate it).
|
|
59
|
+
|
|
60
|
+
- **Pre-1.0 (now)**: `0.MINOR.PATCH`. Minor bumps may break APIs (expected at
|
|
61
|
+
this stage and noted in the changelog); patch bumps are fixes only.
|
|
62
|
+
Between releases the version carries a `.devN` suffix (e.g. `0.1.0.dev0`).
|
|
63
|
+
- **Post-1.0**: MAJOR = breaking, MINOR = backwards-compatible features,
|
|
64
|
+
PATCH = backwards-compatible fixes. Breaking changes are deprecated with a
|
|
65
|
+
runtime warning for at least one minor release before removal.
|
|
66
|
+
|
|
67
|
+
**CHANGELOG.md** follows [Keep a Changelog](https://keepachangelog.com):
|
|
68
|
+
user-visible changes are added to the `[Unreleased]` section in the same PR
|
|
69
|
+
that makes them — never reconstructed at release time.
|
|
70
|
+
|
|
71
|
+
PyPI publishing uses **trusted publishing** (OIDC from GitHub Actions — no API
|
|
72
|
+
token stored). One-time setup: on PyPI, add a trusted publisher for the project
|
|
73
|
+
(owner `luozm`, repo `itemeval`, workflow `release.yml`, environment blank). The
|
|
74
|
+
publish itself runs in `.github/workflows/release.yml`, triggered when a GitHub
|
|
75
|
+
release is published; locally you only build/tag.
|
|
76
|
+
|
|
77
|
+
**Release checklist** (applies from v0.1.0, ROADMAP M7):
|
|
78
|
+
|
|
79
|
+
1. Tests and lint green: `./.venv/bin/python -m pytest && ./.venv/bin/python -m ruff check .`
|
|
80
|
+
2. Move `[Unreleased]` entries under a new `[X.Y.Z] - YYYY-MM-DD` heading.
|
|
81
|
+
3. Set `version = "X.Y.Z"` in `pyproject.toml` (drop the `.devN`).
|
|
82
|
+
4. Optionally verify the build locally: `uv build` (the same command the release workflow runs).
|
|
83
|
+
5. Commit `release: vX.Y.Z`; tag and push: `git tag vX.Y.Z && git push origin main --tags`.
|
|
84
|
+
6. Create a GitHub release from the tag (body = the changelog section). Publishing
|
|
85
|
+
to PyPI is automatic: the `release: published` event triggers `release.yml`,
|
|
86
|
+
which runs `uv build && uv publish` via trusted publishing. Watch the Actions
|
|
87
|
+
run and confirm the new version appears on PyPI.
|
|
88
|
+
7. Bump to the next dev version (e.g. `0.2.0.dev0`) in a follow-up commit.
|
|
89
|
+
|
|
90
|
+
Consuming studies pin itemeval like any dependency: editable path source during
|
|
91
|
+
development, exact-version pin from PyPI once published — their `uv.lock` plus
|
|
92
|
+
run manifests (which record `itemeval.__version__`) keep results reproducible.
|
itemeval-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Zhimeng (Brian) Luo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|