itemeval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. itemeval-0.1.0/.github/workflows/ci.yml +42 -0
  2. itemeval-0.1.0/.github/workflows/release.yml +32 -0
  3. itemeval-0.1.0/.github/workflows/sync-wiki.yml +33 -0
  4. itemeval-0.1.0/.gitignore +26 -0
  5. itemeval-0.1.0/.python-version +1 -0
  6. itemeval-0.1.0/CHANGELOG.md +117 -0
  7. itemeval-0.1.0/CLAUDE.md +35 -0
  8. itemeval-0.1.0/DEVELOPMENT.md +92 -0
  9. itemeval-0.1.0/LICENSE +21 -0
  10. itemeval-0.1.0/PKG-INFO +279 -0
  11. itemeval-0.1.0/README.md +239 -0
  12. itemeval-0.1.0/ROADMAP.md +96 -0
  13. itemeval-0.1.0/configs/quickstart_aime.yaml +30 -0
  14. itemeval-0.1.0/configs/usamo_demo.yaml +31 -0
  15. itemeval-0.1.0/configs/usamo_demo_gate.yaml +31 -0
  16. itemeval-0.1.0/configs/usamo_smoke.yaml +37 -0
  17. itemeval-0.1.0/docs/wiki/Architecture.md +121 -0
  18. itemeval-0.1.0/docs/wiki/Budget-and-Costs.md +76 -0
  19. itemeval-0.1.0/docs/wiki/CLI.md +116 -0
  20. itemeval-0.1.0/docs/wiki/Configuration.md +133 -0
  21. itemeval-0.1.0/docs/wiki/Error-Handling.md +148 -0
  22. itemeval-0.1.0/docs/wiki/FAQ.md +97 -0
  23. itemeval-0.1.0/docs/wiki/Getting-Started.md +95 -0
  24. itemeval-0.1.0/docs/wiki/Home.md +59 -0
  25. itemeval-0.1.0/docs/wiki/Outputs-and-Schemas.md +97 -0
  26. itemeval-0.1.0/docs/wiki/Pipeline-Concepts.md +101 -0
  27. itemeval-0.1.0/docs/wiki/Python-API.md +130 -0
  28. itemeval-0.1.0/pyproject.toml +73 -0
  29. itemeval-0.1.0/src/itemeval/__init__.py +60 -0
  30. itemeval-0.1.0/src/itemeval/_builtin/config.yaml +48 -0
  31. itemeval-0.1.0/src/itemeval/_builtin/prompts/solver/minimal.md +4 -0
  32. itemeval-0.1.0/src/itemeval/_builtin/prompts/solver/standard.md +6 -0
  33. itemeval-0.1.0/src/itemeval/_builtin/rubrics/standard.md +17 -0
  34. itemeval-0.1.0/src/itemeval/_config.py +252 -0
  35. itemeval-0.1.0/src/itemeval/_errors.py +25 -0
  36. itemeval-0.1.0/src/itemeval/_item.py +29 -0
  37. itemeval-0.1.0/src/itemeval/_manifest.py +196 -0
  38. itemeval-0.1.0/src/itemeval/_mockmodels.py +68 -0
  39. itemeval-0.1.0/src/itemeval/_prepare.py +85 -0
  40. itemeval-0.1.0/src/itemeval/_status.py +165 -0
  41. itemeval-0.1.0/src/itemeval/_templates.py +161 -0
  42. itemeval-0.1.0/src/itemeval/_util.py +46 -0
  43. itemeval-0.1.0/src/itemeval/adapters/__init__.py +1 -0
  44. itemeval-0.1.0/src/itemeval/adapters/_base.py +98 -0
  45. itemeval-0.1.0/src/itemeval/adapters/_hf.py +85 -0
  46. itemeval-0.1.0/src/itemeval/budget/__init__.py +1 -0
  47. itemeval-0.1.0/src/itemeval/budget/_estimator.py +195 -0
  48. itemeval-0.1.0/src/itemeval/budget/_gate.py +69 -0
  49. itemeval-0.1.0/src/itemeval/budget/_policies.py +40 -0
  50. itemeval-0.1.0/src/itemeval/budget/_pricing.py +131 -0
  51. itemeval-0.1.0/src/itemeval/budget/pricing_seed.json +13 -0
  52. itemeval-0.1.0/src/itemeval/cli.py +407 -0
  53. itemeval-0.1.0/src/itemeval/design/__init__.py +1 -0
  54. itemeval-0.1.0/src/itemeval/design/_grid.py +178 -0
  55. itemeval-0.1.0/src/itemeval/design/_ids.py +25 -0
  56. itemeval-0.1.0/src/itemeval/generate/__init__.py +1 -0
  57. itemeval-0.1.0/src/itemeval/generate/_params.py +50 -0
  58. itemeval-0.1.0/src/itemeval/generate/_run.py +397 -0
  59. itemeval-0.1.0/src/itemeval/generate/_task.py +61 -0
  60. itemeval-0.1.0/src/itemeval/grade/__init__.py +1 -0
  61. itemeval-0.1.0/src/itemeval/grade/_judge.py +96 -0
  62. itemeval-0.1.0/src/itemeval/grade/_parse.py +94 -0
  63. itemeval-0.1.0/src/itemeval/grade/_run.py +333 -0
  64. itemeval-0.1.0/src/itemeval/grade/_verifiable.py +84 -0
  65. itemeval-0.1.0/src/itemeval/py.typed +0 -0
  66. itemeval-0.1.0/src/itemeval/store/__init__.py +1 -0
  67. itemeval-0.1.0/src/itemeval/store/_base.py +59 -0
  68. itemeval-0.1.0/src/itemeval/store/_export.py +183 -0
  69. itemeval-0.1.0/src/itemeval/store/_gradings.py +87 -0
  70. itemeval-0.1.0/src/itemeval/store/_items.py +47 -0
  71. itemeval-0.1.0/src/itemeval/store/_layout.py +47 -0
  72. itemeval-0.1.0/src/itemeval/store/_ledger.py +37 -0
  73. itemeval-0.1.0/src/itemeval/store/_logs.py +38 -0
  74. itemeval-0.1.0/src/itemeval/store/_solutions.py +93 -0
  75. itemeval-0.1.0/tests/conftest.py +129 -0
  76. itemeval-0.1.0/tests/test_adapter_hf.py +30 -0
  77. itemeval-0.1.0/tests/test_adapter_hf_io.py +107 -0
  78. itemeval-0.1.0/tests/test_adapter_mapping.py +125 -0
  79. itemeval-0.1.0/tests/test_cli.py +87 -0
  80. itemeval-0.1.0/tests/test_condition_ids.py +35 -0
  81. itemeval-0.1.0/tests/test_config.py +148 -0
  82. itemeval-0.1.0/tests/test_demo_configs.py +61 -0
  83. itemeval-0.1.0/tests/test_estimator.py +91 -0
  84. itemeval-0.1.0/tests/test_export.py +71 -0
  85. itemeval-0.1.0/tests/test_generate_run.py +95 -0
  86. itemeval-0.1.0/tests/test_grade_run.py +206 -0
  87. itemeval-0.1.0/tests/test_grid.py +131 -0
  88. itemeval-0.1.0/tests/test_integration_pipeline.py +54 -0
  89. itemeval-0.1.0/tests/test_item.py +31 -0
  90. itemeval-0.1.0/tests/test_judge_parse.py +80 -0
  91. itemeval-0.1.0/tests/test_manifest.py +44 -0
  92. itemeval-0.1.0/tests/test_mockmodels.py +45 -0
  93. itemeval-0.1.0/tests/test_policies_gate.py +67 -0
  94. itemeval-0.1.0/tests/test_pricing.py +97 -0
  95. itemeval-0.1.0/tests/test_public_api.py +109 -0
  96. itemeval-0.1.0/tests/test_status.py +99 -0
  97. itemeval-0.1.0/tests/test_store.py +238 -0
  98. itemeval-0.1.0/tests/test_templates.py +100 -0
  99. itemeval-0.1.0/tests/test_util.py +48 -0
  100. itemeval-0.1.0/tests/test_verifiable.py +104 -0
  101. itemeval-0.1.0/uv.lock +3428 -0
@@ -0,0 +1,42 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ concurrency:
9
+ group: ci-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ test:
14
+ name: lint + test (py${{ matrix.python-version }})
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.11", "3.12", "3.13"]
20
+ steps:
21
+ - uses: actions/checkout@v5
22
+
23
+ - name: Install uv + Python ${{ matrix.python-version }}
24
+ uses: astral-sh/setup-uv@v8.2.0
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ enable-cache: true
28
+ cache-dependency-glob: "uv.lock"
29
+
30
+ - name: Sync (locked, dev group)
31
+ run: uv sync --locked
32
+
33
+ - name: Ruff lint
34
+ run: uv run ruff check .
35
+
36
+ - name: Ruff format check
37
+ run: uv run ruff format --check .
38
+
39
+ # Unit tests only; the `network` test (HF Hub) is excluded to keep CI
40
+ # hermetic — run it manually per DEVELOPMENT.md when touching the adapter.
41
+ - name: Pytest
42
+ run: uv run pytest -m "not network"
@@ -0,0 +1,32 @@
1
+ name: Release
2
+
3
+ # Publishes to PyPI via trusted publishing (OIDC) — no API token stored.
4
+ # Triggered when a GitHub Release is published; the workflow is taken from the
5
+ # default branch, so it fires for tags created before this file existed.
6
+ #
7
+ # One-time PyPI setup (project settings -> Publishing -> add a trusted publisher):
8
+ # Owner: luozm | Repository: itemeval | Workflow: release.yml | Environment: (blank)
9
+ on:
10
+ release:
11
+ types: [published]
12
+
13
+ jobs:
14
+ pypi-publish:
15
+ name: Build + publish to PyPI
16
+ runs-on: ubuntu-latest
17
+ permissions:
18
+ id-token: write # mint the OIDC token PyPI trusted publishing verifies
19
+ contents: read
20
+ steps:
21
+ - uses: actions/checkout@v5
22
+
23
+ - name: Install uv
24
+ uses: astral-sh/setup-uv@v8.2.0
25
+
26
+ - name: Build sdist + wheel
27
+ run: uv build
28
+
29
+ # --trusted-publishing always: fail loudly if OIDC isn't available rather
30
+ # than silently falling back to looking for credentials.
31
+ - name: Publish
32
+ run: uv publish --trusted-publishing always
@@ -0,0 +1,33 @@
1
+ name: Sync wiki
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - "docs/wiki/**"
8
+
9
+ permissions:
10
+ contents: write
11
+
12
+ jobs:
13
+ sync:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v5
17
+
18
+ - name: Push docs/wiki to the wiki repo
19
+ env:
20
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21
+ run: |
22
+ git clone "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.wiki.git" wiki
23
+ rsync -av --delete --exclude .git docs/wiki/ wiki/
24
+ cd wiki
25
+ git config user.name "github-actions[bot]"
26
+ git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
27
+ git add -A
28
+ if git diff --cached --quiet; then
29
+ echo "Wiki already up to date."
30
+ exit 0
31
+ fi
32
+ git commit -m "Sync wiki from docs/wiki @ ${GITHUB_SHA::7}"
33
+ git push
@@ -0,0 +1,26 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+
8
+ # Environments & secrets
9
+ .venv/
10
+ .env
11
+
12
+ # Tooling caches
13
+ .pytest_cache/
14
+ .ruff_cache/
15
+ .coverage
16
+ .coverage.*
17
+ htmlcov/
18
+
19
+ # Demo study outputs (itemeval generate/grade/export artifacts)
20
+ studies/
21
+
22
+ # Internal build-time design contract — kept locally, not published
23
+ docs/DESIGN.md
24
+
25
+ # OS
26
+ .DS_Store
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,117 @@
1
+ # Changelog
2
+
3
+ All notable changes to itemeval are documented here. Format follows
4
+ [Keep a Changelog](https://keepachangelog.com); versioning follows
5
+ [SemVer](https://semver.org) (pre-1.0: minor bumps may break APIs).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.1.0] - 2026-06-10
10
+
11
+ First public release. Item-level LLM evaluation over any inspect_ai-supported
12
+ provider, with a two-stage generate/grade pipeline, long-format item-response
13
+ export, and a budget layer.
14
+
15
+ ### Added
16
+ - Core data model and config (M1): canonical `Item` model; full pydantic
17
+ experiment-config schema validating the README YAML sketch as-is
18
+ (`load_config`); content-derived stable condition ids; facet grid expansion
19
+ with full crossing.
20
+ - HuggingFace benchmark adapter (M1): field-mapping spec → canonical items,
21
+ revision pinned at first run via a per-study `dataset_locks.json`.
22
+ - Run manifests (M1): dataset revisions, template content hashes, model ids,
23
+ requested sampling params (effective values backfilled per condition after
24
+ each run), package versions, full condition grid — one JSON per run.
25
+ - Generate stage (M2): one inspect task per (model × prompt × model-config)
26
+ cell, `epochs` = replications, thinking/reasoning toggles as model-config
27
+ facets, requested vs effective sampling params recorded per row, resumable
28
+ solutions parquet store + raw `.eval` log index.
29
+ - Grade stage (M3): verifiable scorers (exact match / multiple choice /
30
+ numeric, $0) and judge-as-task (grading dataset built from stored solutions,
31
+ judge temperature pinned to 0, prompt caching enabled); strict structured
32
+ score parsing with parse failures flagged in-table, never dropped;
33
+ re-runnable per (grader × rubric) without touching solutions.
34
+ - Export (M4): long-format gradings table (45 columns: scores, judge
35
+ reasoning, tokens, USD, latency, full provenance), parquet + CSV mirrors,
36
+ per-run cost ledger attributed generation vs grading with internal
37
+ reconciliation check.
38
+ - Budget layer (M5): packaged pricing seed + OpenRouter pricing refresh,
39
+ per-stage dry-run estimator, `confirm_above_usd` gate (exit 3) and
40
+ non-overridable `max_usd` cap (exit 4), `dev`/`full-interactive`/
41
+ `full-batch` policies, batch-API wiring with documented ~50% discount
42
+ approximation.
43
+ - CLI (M6): `estimate | generate | grade | export | status` with consistent
44
+ UX, `--json` output, repeatable `--condition/--grader/--rubric` filters,
45
+ resumability and grid-completion reporting.
46
+ - `mockllm/*` pass-through: any mock model id runs the full pipeline free and
47
+ deterministically (used by all demos and tests; `configs/usamo_demo.yaml`).
48
+ - Public Python API: the pipeline is drivable programmatically as well as via
49
+ the CLI — `prepare_study`, `estimate_study`, `run_generate`, `run_grade`,
50
+ `export_study`, `build_status` exported from `itemeval` (lazily, so
51
+ `import itemeval` stays light). The budget confirmation gate remains a
52
+ CLI-layer feature.
53
+ - Dependency: `datasets` (HuggingFace) for the HF adapter.
54
+ - Built-in template library: prompts `minimal`/`standard` and rubric `standard`
55
+ ship inside the package and are referenced as `builtin:<name>`. A bare name
56
+ still resolves to a local file under `prompts_dir`/`rubrics_dir`; the two
57
+ namespaces are distinct and never silently shadow each other — each template
58
+ is recorded in the run manifest with its `source` (`local`/`builtin`) and
59
+ content hash, and built-in templates record a machine-independent path.
60
+ - `itemeval init DIR [--with-templates] [--force]`: scaffold a runnable starter
61
+ study (`config.yaml`). `--with-templates` also copies the referenced built-in
62
+ prompts/rubrics locally as editable starters. Makes `pip install itemeval`
63
+ usable without cloning the repo.
64
+ - `solvers.on_empty` policy (`skip` default / `rerun` / `grade`) for completed
65
+ generations that produced no gradable text (empty/blank `solution`, no API
66
+ error — e.g. a reasoning model whose token budget was spent entirely on
67
+ hidden reasoning). Empty no-error completions are a distinct channel from API
68
+ errors (re-attempted) and parse failures (final): `skip` excludes them from
69
+ grading, `rerun` also makes them eligible for regeneration on the next
70
+ `generate`, `grade` sends them to the judge as-is. They are always surfaced —
71
+ `grade` reports the count and stop-reason breakdown, and `status` gains an
72
+ `empty` column — never silently folded into a green "complete".
73
+ - Provider/endpoint provenance for cost attribution: `ledger.parquet` gains a
74
+ `provider` column (the inspect prefix of `model`), and run manifests gain
75
+ `endpoints_effective` per condition (`{provider, base_url, served_model}`,
76
+ backfilled after the run) — recording which provider, endpoint, and
77
+ provider-returned model snapshot actually answered. `base_url` is null on the
78
+ provider's default endpoint; a non-null value flags traffic routed elsewhere
79
+ (Azure/proxy/gateway).
80
+
81
+ ### Changed
82
+ - **Path resolution split by intent** (behavior change). Inputs (`prompts_dir`,
83
+ `rubrics_dir`, `budget.pricing_path`) still anchor to the config file's
84
+ directory; outputs (`output_dir`, i.e. the study tree) now anchor to a **work
85
+ directory** defaulting to the current directory, never the config dir or the
86
+ installed package. New `-C/--base-dir` (CLI) and `load_config(work_dir=...)`
87
+ (Python) override the output anchor. The example configs drop their `../`
88
+ prefixes accordingly.
89
+ - Default `facets.prompt` / `facets.rubric` are now `[builtin:standard]`
90
+ (were `[default]`, which referenced a template that never existed).
91
+ - Template references and validation moved ahead of study-directory creation:
92
+ an unresolved template now fails before any output directory is written.
93
+
94
+ ### Packaging
95
+ - Provider-SDK optional extras (`openai`, `anthropic`, `google`, `all`),
96
+ mirroring inspect_ai's lazy provider imports. Install the extra for the
97
+ provider you run, e.g. `pip install itemeval[openai]` — the `openai` extra
98
+ also covers OpenRouter and other OpenAI-compatible providers. The base
99
+ install stays SDK-free; running a real provider without its extra raises
100
+ inspect_ai's `PrerequisiteError` with the install hint.
101
+ - Ship a `py.typed` marker (PEP 561): downstream type checkers now see
102
+ itemeval's annotations. Added the `Typing :: Typed` and Python 3.11/3.12
103
+ classifiers.
104
+ - Relaxed the `pyarrow` (`>=24` → `>=15`) and `datasets` (`>=5` → `>=3`)
105
+ lower bounds to the oldest versions whose APIs we actually use, easing
106
+ co-installation; dev/CI still pin the latest via `uv.lock`. The full test
107
+ suite passes at both the floor and the locked versions.
108
+ - Expanded `[project.urls]` (Homepage, Documentation → wiki, Changelog, Issues)
109
+ and switched the README's PyPI-facing links to absolute GitHub URLs.
110
+ - Minimum Python is now 3.11 (was 3.10). The tested dependency stack resolves
111
+ pandas 3.x, which requires Python >=3.11, so 3.10 could only ever install a
112
+ different (pandas 2.x) stack that was never tested. Floor now matches the
113
+ tested stack; `uv.lock` reconciled to a single resolution (dropped the
114
+ 3.10-only `exceptiongroup`/`tomli`/`async-timeout`/`pytz` backports).
115
+
116
+ [Unreleased]: https://github.com/luozm/itemeval/compare/v0.1.0...HEAD
117
+ [0.1.0]: https://github.com/luozm/itemeval/releases/tag/v0.1.0
@@ -0,0 +1,35 @@
1
+ # CLAUDE.md — itemeval (package development)
2
+
3
+ Publishable Python package: item-level LLM evaluation on inspect_ai.
4
+ `README.md` is the spec; `ROADMAP.md` is the milestone plan — keep it updated
5
+ as milestones complete. `DEVELOPMENT.md` defines the inspect_ai upgrade
6
+ pipeline and the versioning/release process — follow it for any dependency
7
+ bump or release; update `CHANGELOG.md` ([Unreleased]) in the same change that
8
+ makes a user-visible difference. Any study consuming this package lives in its
9
+ own separate repo; never put study-specific content (a particular study's
10
+ datasets, rubric texts, or analysis) in this package.
11
+
12
+ ## Python environment
13
+
14
+ - Managed by `uv`. All Python runs against `./.venv`; system Python is never used.
15
+ - Invoke by explicit venv path, never activate: `./.venv/bin/python -m pytest`.
16
+ Never emit `source .venv/bin/activate`.
17
+ - New deps: `uv add <pkg>` (runtime) / `uv add --dev <pkg>` (dev). Never call
18
+ pip directly. `pyproject.toml` carries ranges; `uv.lock` pins exactly and is
19
+ committed.
20
+ - Recreate from scratch: `uv sync`. `./.venv` is disposable.
21
+ - Supports Python >=3.11 (don't use 3.12+ syntax); develop on 3.12. Floor is
22
+ 3.11 because the tested stack pulls pandas 3.x, which requires >=3.11.
23
+
24
+ ## Conventions
25
+
26
+ - src layout: code in `src/itemeval/`; tests import the installed package.
27
+ - Public API is exported from `itemeval/__init__.py`; everything else is
28
+ internal and free to refactor. Prefix private modules with `_`.
29
+ - pydantic models for all config/data schemas; YAML configs validated at load.
30
+ - Lint/format: `./.venv/bin/python -m ruff check . && ./.venv/bin/python -m ruff format .`
31
+ - Tests: `./.venv/bin/python -m pytest`. Unit tests must not call paid APIs;
32
+ anything touching providers is mocked or marked for manual runs.
33
+ - No real API keys in tests, fixtures, or examples.
34
+ - Conventional commits (feat:/fix:/docs:/test:/refactor:); update CHANGELOG.md
35
+ for user-visible changes once releases start.
@@ -0,0 +1,92 @@
1
+ # Development Guide
2
+
3
+ Process documentation for maintaining itemeval. For coding conventions see
4
+ `CLAUDE.md`; for milestones see `ROADMAP.md`.
5
+
6
+ ## Dependency policy
7
+
8
+ - `pyproject.toml` declares **ranges** (lower bounds, e.g. `inspect-ai>=0.3.239`)
9
+ — this is the contract published to package users.
10
+ - `uv.lock` pins **exact versions** of everything — this is what `uv sync`
11
+ reproduces locally and in CI. Always committed.
12
+ - Add/remove only via `uv add <pkg>` / `uv remove <pkg>`; never edit dependency
13
+ lists by hand and never call pip.
14
+ - Upper-bound pins (`<X`) are allowed only as a temporary response to a known
15
+ breakage, with a linked issue and removal plan.
16
+
17
+ ## inspect_ai upgrade pipeline
18
+
19
+ inspect-ai is the load-bearing dependency and releases frequently. Upgrades are
20
+ **deliberate, never incidental** — routine `uv sync` keeps using the lockfile
21
+ pin, so versions only move when we move them.
22
+
23
+ Cadence: at the start of each ROADMAP milestone, and before any large paid run
24
+ in a consuming study.
25
+
26
+ 1. **Check what's new**
27
+ ```bash
28
+ uv tree --package inspect-ai --depth 0 # current pinned version
29
+ ```
30
+ Read the release notes: https://github.com/UKGovernmentBEIS/inspect_ai/releases
31
+ Watch for: model-provider changes, batch/caching behavior, log-format (.eval
32
+ schema) changes, dataframe API (`samples_df`/`evals_df`) changes.
33
+ 2. **Upgrade on a branch**
34
+ ```bash
35
+ git checkout -b chore/bump-inspect-ai
36
+ uv lock --upgrade-package inspect-ai && uv sync
37
+ ```
38
+ 3. **Unit tests** (no API calls): `./.venv/bin/python -m pytest`
39
+ 4. **Live smoke test** (manual, costs cents): run the consuming study's pilot
40
+ config at `dev` scope end-to-end (generate → grade → export) and confirm
41
+ the export schema and cost ledger are unchanged.
42
+ 5. **Commit** the lockfile bump: `chore: bump inspect-ai 0.3.X -> 0.3.Y`, with
43
+ any behavior notes in the body. Merge.
44
+ 6. **On breakage**: pin a temporary upper bound in `pyproject.toml`
45
+ (`inspect-ai>=0.3.X,<0.3.Y`), open an issue describing the incompatibility,
46
+ and remove the bound when fixed.
47
+
48
+ Once CI exists (ROADMAP M7): add a scheduled weekly GitHub Actions job (or
49
+ Renovate/Dependabot) that opens the upgrade PR automatically; steps 3 and 5 run in
50
+ CI, step 4 stays manual.
51
+
52
+ All other dependencies: `uv lock --upgrade && uv sync` quarterly, same
53
+ branch-test-commit flow, less scrutiny.
54
+
55
+ ## Versioning discipline
56
+
57
+ **Semantic versioning**, version lives in one place: `pyproject.toml`
58
+ (`itemeval.__version__` reads it via package metadata — never duplicate it).
59
+
60
+ - **Pre-1.0 (now)**: `0.MINOR.PATCH`. Minor bumps may break APIs (expected at
61
+ this stage and noted in the changelog); patch bumps are fixes only.
62
+ Between releases the version carries a `.devN` suffix (e.g. `0.1.0.dev0`).
63
+ - **Post-1.0**: MAJOR = breaking, MINOR = backwards-compatible features,
64
+ PATCH = backwards-compatible fixes. Breaking changes are deprecated with a
65
+ runtime warning for at least one minor release before removal.
66
+
67
+ **CHANGELOG.md** follows [Keep a Changelog](https://keepachangelog.com):
68
+ user-visible changes are added to the `[Unreleased]` section in the same PR
69
+ that makes them — never reconstructed at release time.
70
+
71
+ PyPI publishing uses **trusted publishing** (OIDC from GitHub Actions — no API
72
+ token stored). One-time setup: on PyPI, add a trusted publisher for the project
73
+ (owner `luozm`, repo `itemeval`, workflow `release.yml`, environment blank). The
74
+ publish itself runs in `.github/workflows/release.yml`, triggered when a GitHub
75
+ release is published; locally you only build/tag.
76
+
77
+ **Release checklist** (applies from v0.1.0, ROADMAP M7):
78
+
79
+ 1. Tests and lint green: `./.venv/bin/python -m pytest && ./.venv/bin/python -m ruff check .`
80
+ 2. Move `[Unreleased]` entries under a new `[X.Y.Z] - YYYY-MM-DD` heading.
81
+ 3. Set `version = "X.Y.Z"` in `pyproject.toml` (drop the `.devN`).
82
+ 4. Optionally verify the build locally: `uv build` (the same command the release workflow runs).
83
+ 5. Commit `release: vX.Y.Z`; tag and push: `git tag vX.Y.Z && git push origin main --tags`.
84
+ 6. Create a GitHub release from the tag (body = the changelog section). Publishing
85
+ to PyPI is automatic: the `release: published` event triggers `release.yml`,
86
+ which runs `uv build && uv publish` via trusted publishing. Watch the Actions
87
+ run and confirm the new version appears on PyPI.
88
+ 7. Bump to the next dev version (e.g. `0.2.0.dev0`) in a follow-up commit.
89
+
90
+ Consuming studies pin itemeval like any dependency: editable path source during
91
+ development, exact-version pin from PyPI once published — their `uv.lock` plus
92
+ run manifests (which record `itemeval.__version__`) keep results reproducible.
itemeval-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Zhimeng (Brian) Luo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.