lodlina 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lodlina-0.2.0/.env.example +18 -0
- lodlina-0.2.0/.github/workflows/ci.yml +34 -0
- lodlina-0.2.0/.github/workflows/release.yml +38 -0
- lodlina-0.2.0/.gitignore +34 -0
- lodlina-0.2.0/CHANGELOG.md +55 -0
- lodlina-0.2.0/LICENSE +21 -0
- lodlina-0.2.0/PKG-INFO +396 -0
- lodlina-0.2.0/README.md +358 -0
- lodlina-0.2.0/docs/ROADMAP.md +139 -0
- lodlina-0.2.0/docs/methodology.md +270 -0
- lodlina-0.2.0/leaderboard/README.md +21 -0
- lodlina-0.2.0/pyproject.toml +65 -0
- lodlina-0.2.0/src/lodlina/__init__.py +27 -0
- lodlina-0.2.0/src/lodlina/_offline.py +75 -0
- lodlina-0.2.0/src/lodlina/cli.py +243 -0
- lodlina-0.2.0/src/lodlina/data/eligibility_fairness.jsonl +12 -0
- lodlina-0.2.0/src/lodlina/data/grounded_qa.jsonl +15 -0
- lodlina-0.2.0/src/lodlina/data/plain_language.jsonl +12 -0
- lodlina-0.2.0/src/lodlina/data/records_redaction.jsonl +18 -0
- lodlina-0.2.0/src/lodlina/datagen/__init__.py +11 -0
- lodlina-0.2.0/src/lodlina/datagen/generate_eligibility.py +123 -0
- lodlina-0.2.0/src/lodlina/datagen/generate_grounded_qa.py +203 -0
- lodlina-0.2.0/src/lodlina/datagen/generate_plain_language.py +112 -0
- lodlina-0.2.0/src/lodlina/datagen/generate_redaction.py +241 -0
- lodlina-0.2.0/src/lodlina/leaderboard.py +389 -0
- lodlina-0.2.0/src/lodlina/models.py +209 -0
- lodlina-0.2.0/src/lodlina/packs/__init__.py +45 -0
- lodlina-0.2.0/src/lodlina/packs/builtin/README.md +74 -0
- lodlina-0.2.0/src/lodlina/packs/builtin/eligibility-fairness/manifest.yaml +14 -0
- lodlina-0.2.0/src/lodlina/packs/builtin/grounded-qa/manifest.yaml +14 -0
- lodlina-0.2.0/src/lodlina/packs/builtin/plain-language/manifest.yaml +14 -0
- lodlina-0.2.0/src/lodlina/packs/builtin/records-redaction/manifest.yaml +16 -0
- lodlina-0.2.0/src/lodlina/packs/pack.py +282 -0
- lodlina-0.2.0/src/lodlina/packs/types.py +195 -0
- lodlina-0.2.0/src/lodlina/scorers/__init__.py +1 -0
- lodlina-0.2.0/src/lodlina/scorers/citation.py +204 -0
- lodlina-0.2.0/src/lodlina/scorers/common.py +118 -0
- lodlina-0.2.0/src/lodlina/scorers/fairness.py +124 -0
- lodlina-0.2.0/src/lodlina/scorers/readability.py +142 -0
- lodlina-0.2.0/src/lodlina/scorers/redaction.py +111 -0
- lodlina-0.2.0/src/lodlina/tasks/__init__.py +59 -0
- lodlina-0.2.0/src/lodlina/tasks/eligibility_fairness.py +79 -0
- lodlina-0.2.0/src/lodlina/tasks/grounded_qa.py +71 -0
- lodlina-0.2.0/src/lodlina/tasks/plain_language.py +55 -0
- lodlina-0.2.0/src/lodlina/tasks/records_redaction.py +72 -0
- lodlina-0.2.0/src/lodlina/validate.py +45 -0
- lodlina-0.2.0/tests/conftest.py +26 -0
- lodlina-0.2.0/tests/test_cli.py +61 -0
- lodlina-0.2.0/tests/test_eligibility.py +86 -0
- lodlina-0.2.0/tests/test_grounded_qa.py +89 -0
- lodlina-0.2.0/tests/test_leaderboard.py +79 -0
- lodlina-0.2.0/tests/test_models.py +77 -0
- lodlina-0.2.0/tests/test_offline.py +31 -0
- lodlina-0.2.0/tests/test_packs.py +138 -0
- lodlina-0.2.0/tests/test_plain_language.py +68 -0
- lodlina-0.2.0/tests/test_redaction.py +124 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Lodlina local environment (copy to .env.local; .env and .env.* are gitignored, except this .env.example).
|
|
2
|
+
#
|
|
3
|
+
# --- Amazon Bedrock: Claude line-up + the model-graded grader (us-east-1) ---
|
|
4
|
+
# Claude models and the pinned grader run via Inspect's bedrock/ provider
|
|
5
|
+
# (Converse API), using your standard AWS credentials. Either set a profile:
|
|
6
|
+
# export AWS_PROFILE=bedrock-test
|
|
7
|
+
# ...or set explicit keys:
|
|
8
|
+
# export AWS_ACCESS_KEY_ID=...
|
|
9
|
+
# export AWS_SECRET_ACCESS_KEY=...
|
|
10
|
+
export AWS_DEFAULT_REGION=us-east-1
|
|
11
|
+
|
|
12
|
+
# --- Bedrock Mantle: OpenAI GPT-5.x (us-east-2 / us-west-2 / us-gov-west-1) ---
|
|
13
|
+
# GPT-5.4 / GPT-5.5 are served on the separate Bedrock "Mantle" endpoint via the
|
|
14
|
+
# OpenAI Responses API, addressed through Inspect's openai-api provider. They use
|
|
15
|
+
# a Bedrock long-term API key (bearer token), NOT SigV4 credentials. Mantle is
|
|
16
|
+
# not available in us-east-1, so set the base URL's region to a supported one.
|
|
17
|
+
export BEDROCK_MANTLE_BASE_URL=https://bedrock-mantle.us-east-2.api.aws/openai/v1
|
|
18
|
+
export BEDROCK_MANTLE_API_KEY=ABSK...your-bedrock-api-key...
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install (dev = tests + linter + all providers)
|
|
25
|
+
# setup-uv already provisions a .venv (VIRTUAL_ENV is set); install into it.
|
|
26
|
+
run: uv pip install -e ".[dev]"
|
|
27
|
+
|
|
28
|
+
- name: Lint
|
|
29
|
+
run: uv run ruff check src/ tests/
|
|
30
|
+
|
|
31
|
+
- name: Test (offline; no credentials needed)
|
|
32
|
+
# The suite drives the full Inspect pipeline with mock models, so it
|
|
33
|
+
# needs neither cloud credentials nor network access.
|
|
34
|
+
run: uv run pytest -q
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Publishes to PyPI via Trusted Publishing (OIDC) when a GitHub Release is
|
|
4
|
+
# published. No API token is stored — PyPI verifies the workflow's identity.
|
|
5
|
+
# One-time setup on PyPI: create a "pending publisher" for project `lodlina`
|
|
6
|
+
# (owner: Lodlina, repo: Lodlina, workflow: release.yml, environment: pypi).
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
release:
|
|
10
|
+
types: [published]
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
publish:
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
environment: pypi
|
|
19
|
+
permissions:
|
|
20
|
+
# Job-level permissions REPLACE workflow-level, so both are needed here:
|
|
21
|
+
contents: read # for actions/checkout
|
|
22
|
+
id-token: write # required for OIDC Trusted Publishing
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
uses: astral-sh/setup-uv@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: "3.12"
|
|
30
|
+
|
|
31
|
+
- name: Build sdist + wheel
|
|
32
|
+
run: uv build
|
|
33
|
+
|
|
34
|
+
- name: Verify metadata
|
|
35
|
+
run: uvx twine check dist/*
|
|
36
|
+
|
|
37
|
+
- name: Publish to PyPI (Trusted Publishing)
|
|
38
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
lodlina-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
|
|
11
|
+
# uv
|
|
12
|
+
uv.lock
|
|
13
|
+
|
|
14
|
+
# Inspect
|
|
15
|
+
logs/
|
|
16
|
+
.inspect/
|
|
17
|
+
.inspect-logs*/
|
|
18
|
+
|
|
19
|
+
# Leaderboard output (regenerated; see leaderboard/README.md)
|
|
20
|
+
leaderboard/results.md
|
|
21
|
+
leaderboard/results.json
|
|
22
|
+
leaderboard/results.html
|
|
23
|
+
leaderboard/results/*.json
|
|
24
|
+
|
|
25
|
+
# Secrets / local env (Bedrock Mantle API key, etc.) — never commit
|
|
26
|
+
.env
|
|
27
|
+
.env.*
|
|
28
|
+
!.env.example
|
|
29
|
+
|
|
30
|
+
# OS / editor
|
|
31
|
+
.DS_Store
|
|
32
|
+
.idea/
|
|
33
|
+
.vscode/
|
|
34
|
+
*.swp
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to Lodlina are documented here. Format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/); versions follow
|
|
5
|
+
[SemVer](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.2.0] — 2026-06-09
|
|
8
|
+
|
|
9
|
+
Theme: a real, installable, multi-provider package — Bedrock-first, with a
|
|
10
|
+
shareable eval-pack ecosystem and the groundwork for government (air-gapped) use.
|
|
11
|
+
Provider paths live-verified: AWS Bedrock (Claude), Bedrock Mantle (OpenAI
|
|
12
|
+
GPT-5.x), and direct OpenAI; direct Anthropic is wired and unit-tested.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- **Eval-pack ecosystem** (Phase 2 complete): the leaderboard now runs **over
|
|
16
|
+
packs** (everything-is-a-pack); third parties can distribute packs as
|
|
17
|
+
pip-installable packages discovered via the `lodlina_packs` entry-point group
|
|
18
|
+
(built-ins always win on id collisions); and `lodlina new-pack <id>
|
|
19
|
+
--task-type <t>` scaffolds a valid starter pack. `lodlina list` shows built-in
|
|
20
|
+
and installed packs; `lodlina validate --pack <id|path>` validates one pack.
|
|
21
|
+
- **Eval-pack architecture** (Phase 2 spine): a pack is a `manifest.yaml` +
|
|
22
|
+
synthetic `dataset.jsonl` that references a curated **task type** (sample
|
|
23
|
+
mapping + prompt + vetted scorers) by name — **data + config only, no
|
|
24
|
+
contributed code**. The four built-in tasks now ship as packs. New:
|
|
25
|
+
`lodlina.packs` (task-type registry, manifest loader, discovery, validation),
|
|
26
|
+
built-in pack manifests, and `packs/builtin/README.md` documenting the format.
|
|
27
|
+
`lodlina run` loads packs (built-in id or `--pack <path>`); `lodlina validate`
|
|
28
|
+
validates packs (incl. verbatim gold spans + `synthetic: true`).
|
|
29
|
+
- **Unified `lodlina` CLI**: `lodlina list | run | leaderboard | validate`.
|
|
30
|
+
`run` evaluates one task against a model (alias-aware, grader bound, air-gap
|
|
31
|
+
safe); `validate` checks the built-in datasets (incl. that every gold span is
|
|
32
|
+
verbatim in its source). The `lodlina-leaderboard` script is retained.
|
|
33
|
+
- **Provider extras**: install only what you need — `lodlina[bedrock]`,
|
|
34
|
+
`[openai]`, `[anthropic]`, `[all]`, `[dev]`. The core install is
|
|
35
|
+
provider-agnostic (eval framework + graders, no cloud SDKs).
|
|
36
|
+
- **Model registry / aliases** (`src/lodlina/models.py`): pick a model by a short
|
|
37
|
+
alias (`claude-sonnet-4-6`, `gpt-5.5`, …) that resolves **Bedrock-first**.
|
|
38
|
+
Direct OpenAI/Anthropic are secondary routes selected only via `--provider`
|
|
39
|
+
(no silent cross-boundary fallback). Full Inspect model strings still work.
|
|
40
|
+
- **Data-boundary reporting**: the leaderboard labels each model's provider and
|
|
41
|
+
whether the run was in-boundary (Bedrock) or off-boundary (commercial API).
|
|
42
|
+
- **Air-gap support** (`src/lodlina/_offline.py`): Inspect's remote token-estimate
|
|
43
|
+
is replaced with an offline fallback at CLI startup; never affects grading.
|
|
44
|
+
- **CI** (GitHub Actions): ruff + the offline test suite on Python 3.10–3.12.
|
|
45
|
+
|
|
46
|
+
### Changed
|
|
47
|
+
- Default leaderboard line-up is now expressed as aliases (Bedrock-first).
|
|
48
|
+
- The leaderboard now accepts a `--provider` flag to force an off-boundary route.
|
|
49
|
+
|
|
50
|
+
## [0.1.0]
|
|
51
|
+
- Initial four tasks (records-redaction, eligibility-fairness, grounded-qa,
|
|
52
|
+
plain-language), each with a synthetic dataset, solver, and defensible scorer.
|
|
53
|
+
- Leaderboard runner (Markdown/JSON/HTML); methodology docs; offline test suite.
|
|
54
|
+
- Live on Amazon Bedrock (Claude via Converse) and Bedrock Mantle (OpenAI
|
|
55
|
+
GPT-5.x via the Responses API); a pinned neutral grader for model-graded scorers.
|
lodlina-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lodlina contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
lodlina-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lodlina
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A plumb line for government AI: realistic U.S. public-sector tasks and automated graders for evaluating LLMs, built on Inspect.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Lodlina/Lodlina
|
|
6
|
+
Project-URL: Repository, https://github.com/Lodlina/Lodlina
|
|
7
|
+
Author: Lodlina contributors
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: ai-evaluation,evals,government,inspect,llm,public-sector
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: inspect-ai>=0.3.50
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Dist: textstat>=0.7.3
|
|
18
|
+
Provides-Extra: all
|
|
19
|
+
Requires-Dist: aioboto3>=13.0; extra == 'all'
|
|
20
|
+
Requires-Dist: anthropic>=0.40; extra == 'all'
|
|
21
|
+
Requires-Dist: boto3>=1.34; extra == 'all'
|
|
22
|
+
Requires-Dist: openai>=1.40; extra == 'all'
|
|
23
|
+
Provides-Extra: anthropic
|
|
24
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
25
|
+
Provides-Extra: bedrock
|
|
26
|
+
Requires-Dist: aioboto3>=13.0; extra == 'bedrock'
|
|
27
|
+
Requires-Dist: boto3>=1.34; extra == 'bedrock'
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: aioboto3>=13.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: anthropic>=0.40; extra == 'dev'
|
|
31
|
+
Requires-Dist: boto3>=1.34; extra == 'dev'
|
|
32
|
+
Requires-Dist: openai>=1.40; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
35
|
+
Provides-Extra: openai
|
|
36
|
+
Requires-Dist: openai>=1.40; extra == 'openai'
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# Lodlina
|
|
40
|
+
|
|
41
|
+
**A plumb line for government AI.**
|
|
42
|
+
|
|
43
|
+
*Lodlina* is Swedish for **plumb line** — the weighted cord builders have used for
|
|
44
|
+
millennia to check whether something is true and upright. That is exactly what
|
|
45
|
+
this project is: a fair, reproducible standard for checking whether AI systems do
|
|
46
|
+
government work correctly, fairly, and honestly.
|
|
47
|
+
|
|
48
|
+
Lodlina is an open-source suite of realistic U.S. public-sector tasks paired with
|
|
49
|
+
**automated, defensible graders**, built on
|
|
50
|
+
[Inspect](https://inspect.aisi.org.uk) (the open evaluation framework from the UK
|
|
51
|
+
AI Security Institute, formerly the AI Safety Institute). It scores how well any LLM performs real government work and
|
|
52
|
+
produces a model-comparison leaderboard.
|
|
53
|
+
|
|
54
|
+
> A plumb line doesn't argue about which wall is prettier — it tells you, without
|
|
55
|
+
> opinion, whether the wall is true. Lodlina aims for the same: measurement you can
|
|
56
|
+
> defend, not vibes.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Why this exists
|
|
61
|
+
|
|
62
|
+
Public-sector agencies are under real pressure to adopt AI for tasks like
|
|
63
|
+
processing records, making eligibility determinations, answering the public from
|
|
64
|
+
policy manuals, and communicating plainly. These tasks have a property most LLM
|
|
65
|
+
benchmarks ignore: **the cost of being wrong is asymmetric and concrete.** Leaking
|
|
66
|
+
a citizen's Social Security number is not a rounding error. Flipping an
|
|
67
|
+
eligibility decision because an applicant's name "sounds" a certain way is not a
|
|
68
|
+
style preference. Inventing a citation in a determination letter is not a minor
|
|
69
|
+
hallucination.
|
|
70
|
+
|
|
71
|
+
Lodlina measures the things that actually matter for government adoption, with
|
|
72
|
+
graders that an evaluation practitioner — or an inspector general — could audit.
|
|
73
|
+
The quality of the tasks and graders matters far more than breadth: **a few
|
|
74
|
+
defensible tasks beat many shallow ones.**
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## The tasks (v1)
|
|
79
|
+
|
|
80
|
+
Each task ships with a synthetic dataset (input + labeled ground truth), a solver,
|
|
81
|
+
and a defensible scorer. Every definition of "correct" is documented below and in
|
|
82
|
+
[`docs/methodology.md`](docs/methodology.md).
|
|
83
|
+
|
|
84
|
+
### 1. `records-redaction` — *don't leak personal privacy info*
|
|
85
|
+
A synthetic government document mixes **must-redact** items (SSNs, personal email,
|
|
86
|
+
home address, date of birth — FOIA **Exemption 6** personal-privacy information)
|
|
87
|
+
with clearly **releasable** content (program descriptions, public statistics,
|
|
88
|
+
officials acting in their official capacity, office contact info).
|
|
89
|
+
|
|
90
|
+
- **Task:** return a JSON list of the exact substrings to redact (every occurrence
|
|
91
|
+
is treated as redacted).
|
|
92
|
+
- **Scorer (deterministic):** matches predictions against the labeled gold spans
|
|
93
|
+
with normalized equals-or-contains matching.
|
|
94
|
+
- **`leak_rate`** *(headline)* — fraction of must-redact items the model missed.
|
|
95
|
+
A miss is a leak, the most serious failure.
|
|
96
|
+
- **`over_redaction_rate`** — fraction of clearly-releasable items the model
|
|
97
|
+
wrongly redacted (a label-grounded false-positive rate; over-redacting defeats
|
|
98
|
+
the purpose of FOIA disclosure).
|
|
99
|
+
- FOIA exemption categories are stored in the gold data but **not scored in v1**
|
|
100
|
+
(category accuracy is a documented backlog item).
|
|
101
|
+
|
|
102
|
+
### 2. `eligibility-fairness` — *correct, and consistent under irrelevant changes*
|
|
103
|
+
A synthetic case file plus a synthetic-but-realistic policy-manual excerpt with
|
|
104
|
+
clear eligibility rules.
|
|
105
|
+
|
|
106
|
+
- **Task:** return an eligibility determination (`eligible` / `not eligible`) and
|
|
107
|
+
the rule the determination relied on.
|
|
108
|
+
- **Two scorers:**
|
|
109
|
+
- **`accuracy`** *(deterministic)* — determination vs. the rule-derived correct
|
|
110
|
+
answer.
|
|
111
|
+
- **`fairness` via metamorphic pairs** — for each case we auto-generate variants
|
|
112
|
+
that are **identical except for a legally-irrelevant attribute** (the
|
|
113
|
+
applicant's name is swapped across demographically-associated names). The
|
|
114
|
+
score flags any case whose determination **flips** across variants. This
|
|
115
|
+
measures inconsistency on irrelevant changes — concrete and defensible, not a
|
|
116
|
+
subjective "bias vibe". Headline: **`flip_rate`**.
|
|
117
|
+
|
|
118
|
+
### 3. `grounded-qa` — *answer, and cite faithfully*
|
|
119
|
+
A policy document plus a question.
|
|
120
|
+
|
|
121
|
+
- **Task:** answer the question **and** cite the supporting passage(s), quoted
|
|
122
|
+
verbatim from the source.
|
|
123
|
+
- **Two scorers:**
|
|
124
|
+
- **`answer_correctness`** — model-graded against the reference answer with a
|
|
125
|
+
strict rubric.
|
|
126
|
+
- **`citation_faithfulness`** — every cited passage must appear **verbatim** in
|
|
127
|
+
the source (deterministic substring check) **and** must actually support the
|
|
128
|
+
claim (model-graded, strict rubric, only applied to citations that pass the
|
|
129
|
+
verbatim check). Headline: **`hallucinated_citation_rate`** — the fraction of
|
|
130
|
+
cited passages that are not verbatim in the source.
|
|
131
|
+
|
|
132
|
+
### 4. `plain-language` — *rewrite simply without changing the meaning*
|
|
133
|
+
A dense bureaucratic paragraph.
|
|
134
|
+
|
|
135
|
+
- **Task:** rewrite it at roughly an 8th-grade reading level while preserving
|
|
136
|
+
meaning.
|
|
137
|
+
- **Two scorers:**
|
|
138
|
+
- **`readability_improvement`** *(deterministic)* — Flesch-Kincaid grade-level
|
|
139
|
+
drop via [`textstat`](https://pypi.org/project/textstat/), credited when the
|
|
140
|
+
rewrite lands near the target grade.
|
|
141
|
+
- **`meaning_preservation`** — model-graded **two-way entailment** with a strict
|
|
142
|
+
rubric (the rewrite must entail the original and the original must entail the
|
|
143
|
+
rewrite — no added or dropped facts).
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Grading philosophy (the heart of the project)
|
|
148
|
+
|
|
149
|
+
1. **Prefer deterministic, defensible measurement.** Redaction, eligibility
|
|
150
|
+
accuracy, the verbatim-citation check, and readability are all computed from
|
|
151
|
+
labeled ground truth or exact string operations — no model judgment.
|
|
152
|
+
2. **For fuzzy dimensions, use counterfactual / metamorphic pairs.** Fairness is
|
|
153
|
+
measured by changing only a legally-irrelevant attribute and checking whether
|
|
154
|
+
the output flips. We do **not** ship subjective "bias" graders.
|
|
155
|
+
3. **Where a model-grader is unavoidable** (citation support, meaning
|
|
156
|
+
preservation), it gets a **strict rubric** and is **backed by a deterministic
|
|
157
|
+
check** wherever possible (e.g. a passage must pass the verbatim check before a
|
|
158
|
+
model is asked whether it supports the claim).
|
|
159
|
+
4. **If a grader can't be made defensible, the task goes to the backlog** rather
|
|
160
|
+
than shipping weak.
|
|
161
|
+
|
|
162
|
+
Full detail — every task's definition of "correct" and exactly how its scorer
|
|
163
|
+
works — is in [`docs/methodology.md`](docs/methodology.md).
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Synthetic data & limitations
|
|
168
|
+
|
|
169
|
+
- **All data is synthetic.** No real PII or CUI is used anywhere. Personal
|
|
170
|
+
identifiers are deliberately fake: SSNs use the never-issued `900–999` area
|
|
171
|
+
range, phone numbers use the reserved `555-01xx` block, personal emails use
|
|
172
|
+
`example.com`, and names/addresses are fabricated. Generators live in
|
|
173
|
+
[`src/lodlina/datagen/`](src/lodlina/datagen/) and are seeded for
|
|
174
|
+
reproducibility; small seed sets (~12–18 samples/task) are committed so the repo
|
|
175
|
+
runs out of the box.
|
|
176
|
+
- **Synthetic ≠ representative.** Templated synthetic documents are cleaner and
|
|
177
|
+
more regular than real agency records. Scores here indicate capability on a
|
|
178
|
+
controlled proxy, not certified performance on production records.
|
|
179
|
+
- **Model-graded components inherit grader limitations.** Where we must use a model
|
|
180
|
+
grader, results depend on the grader model and rubric; we constrain and
|
|
181
|
+
deterministically back these wherever possible, but they are not infallible.
|
|
182
|
+
- **English / U.S. federal framing.** Tasks reflect U.S. federal concepts (e.g.
|
|
183
|
+
FOIA Exemption 6). They are a starting point, not a complete map of government
|
|
184
|
+
work.
|
|
185
|
+
- **Not legal advice or an authorization to deploy.** Lodlina is an evaluation
|
|
186
|
+
instrument, not a compliance certification.
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Backlog (future work, not yet built)
|
|
191
|
+
|
|
192
|
+
Listed here deliberately — these need methodology care before they're defensible:
|
|
193
|
+
|
|
194
|
+
- **political-neutrality** — requires symmetric paired prompts and measuring
|
|
195
|
+
response symmetry; the methodology needs care to avoid a subjective grader.
|
|
196
|
+
- **Section-508 alt-text** — accessibility alt-text quality.
|
|
197
|
+
- **FOIA exemption-reasoning** — justify *which* exemption applies and why
|
|
198
|
+
(extends redaction with category accuracy on correctly-caught items).
|
|
199
|
+
- **abstention on unanswerable policy questions** — reward declining to answer when
|
|
200
|
+
the policy doesn't contain the answer.
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Install
|
|
205
|
+
|
|
206
|
+
Lodlina uses [`uv`](https://docs.astral.sh/uv/) and Python ≥ 3.10.
|
|
207
|
+
|
|
208
|
+
The core install is provider-agnostic (the eval framework + the deterministic
|
|
209
|
+
graders, no cloud SDKs). Add a **provider extra** to actually run models —
|
|
210
|
+
**Amazon Bedrock is the primary, in-boundary provider**:
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
uv venv
|
|
214
|
+
uv pip install -e ".[bedrock]" # AWS Bedrock (Claude via Converse)
|
|
215
|
+
uv pip install -e ".[bedrock,openai]" # + OpenAI (direct API and Bedrock Mantle/GPT-5.x)
|
|
216
|
+
uv pip install -e ".[anthropic]" # direct Anthropic API
|
|
217
|
+
uv pip install -e ".[all]" # every provider
|
|
218
|
+
uv pip install -e ".[dev]" # tests + linter + all providers
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
| Extra | Pulls in | Enables |
|
|
222
|
+
|---|---|---|
|
|
223
|
+
| `bedrock` | `boto3`, `aioboto3` | Claude on Bedrock (Converse) |
|
|
224
|
+
| `openai` | `openai` | direct OpenAI **and** Bedrock Mantle (GPT-5.x) |
|
|
225
|
+
| `anthropic` | `anthropic` | direct Anthropic API |
|
|
226
|
+
| `all` | all of the above | everything |
|
|
227
|
+
|
|
228
|
+
## Models & credentials
|
|
229
|
+
|
|
230
|
+
Lodlina is **Bedrock-first**. You select a model by a short **alias**
|
|
231
|
+
(`claude-sonnet-4-6`, `gpt-5.5`, …) and it resolves to that model's **Amazon
|
|
232
|
+
Bedrock** route by default, keeping prompts **in-boundary**. The direct
|
|
233
|
+
OpenAI / Anthropic APIs are secondary routes, chosen only when you explicitly
|
|
234
|
+
ask for them (`--provider openai|anthropic`) — there is **no silent
|
|
235
|
+
cross-boundary fallback**. You can also pass a full Inspect model string
|
|
236
|
+
directly. See [`src/lodlina/models.py`](src/lodlina/models.py) for the registry.
|
|
237
|
+
|
|
238
|
+
Copy [`.env.example`](.env.example) to `.env.local` and fill it in; the snippets
|
|
239
|
+
below show what each provider needs.
|
|
240
|
+
|
|
241
|
+
### Claude on Bedrock (Converse API, `us-east-1`)
|
|
242
|
+
|
|
243
|
+
The Claude line-up and the model-graded **grader** use Inspect's `bedrock/`
|
|
244
|
+
provider with standard AWS credentials:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
export AWS_ACCESS_KEY_ID=... # or: export AWS_PROFILE=<profile>
|
|
248
|
+
export AWS_SECRET_ACCESS_KEY=...
|
|
249
|
+
export AWS_DEFAULT_REGION=us-east-1
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
Bedrock model strings take the form `bedrock/<bedrock-model-id>`; Claude models
|
|
253
|
+
carry an `anthropic.` provider prefix and route via regional inference profiles,
|
|
254
|
+
e.g. `bedrock/us.anthropic.claude-sonnet-4-6`. (Haiku 4.5 has no short alias on
|
|
255
|
+
Bedrock, so it is pinned to the dated profile
|
|
256
|
+
`us.anthropic.claude-haiku-4-5-20251001-v1:0`.)
|
|
257
|
+
|
|
258
|
+
### OpenAI GPT-5.x on Bedrock Mantle (Responses API, `us-east-2`)
|
|
259
|
+
|
|
260
|
+
GPT-5.4 and GPT-5.5 are **not** served by the Converse API. They live on the
|
|
261
|
+
separate Bedrock **Mantle** endpoint and speak the OpenAI **Responses API**, so
|
|
262
|
+
Lodlina addresses them through Inspect's generic `openai-api` provider with the
|
|
263
|
+
service prefix `bedrock-mantle` and `responses_api=true`. They authenticate with
|
|
264
|
+
a **Bedrock long-term API key** (a bearer token, *not* SigV4 credentials), and
|
|
265
|
+
Mantle is only available in `us-east-2` / `us-west-2` / `us-gov-west-1`:
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
export BEDROCK_MANTLE_BASE_URL=https://bedrock-mantle.us-east-2.api.aws/openai/v1
|
|
269
|
+
export BEDROCK_MANTLE_API_KEY=ABSK... # Bedrock console → API keys → long-term
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Model strings look like `openai-api/bedrock-mantle/openai.gpt-5.4`. Alias
|
|
273
|
+
resolution applies `responses_api=true` automatically for these; on the CLI with
|
|
274
|
+
a full string, add `-M responses_api=true`. If the Mantle environment isn't set,
|
|
275
|
+
the leaderboard renders those rows as `—` rather than failing.
|
|
276
|
+
|
|
277
|
+
### Direct OpenAI / Anthropic APIs (off-boundary)
|
|
278
|
+
|
|
279
|
+
Secondary routes that send prompts to the commercial APIs. Select them
|
|
280
|
+
explicitly with `--provider`; they read the standard keys:
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
export OPENAI_API_KEY=... # for: --provider openai
|
|
284
|
+
export ANTHROPIC_API_KEY=... # for: --provider anthropic
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
The leaderboard labels each model's provider and data boundary (in-boundary
|
|
288
|
+
Bedrock vs off-boundary commercial API) in its output, so a reviewer can see at
|
|
289
|
+
a glance where each run sent its data.
|
|
290
|
+
|
|
291
|
+
### Air-gapped operation
|
|
292
|
+
|
|
293
|
+
Lodlina is designed to run with no internet access: all datasets are committed,
|
|
294
|
+
and Inspect's optional remote token-estimate is replaced with an offline
|
|
295
|
+
fallback at CLI startup (it does not affect grading). A fully vendored offline
|
|
296
|
+
install bundle is on the [roadmap](docs/ROADMAP.md).
|
|
297
|
+
|
|
298
|
+
## The `lodlina` command
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
lodlina list # available tasks + model aliases
|
|
302
|
+
lodlina run grounded-qa --model claude-sonnet-4-6 --limit 5
|
|
303
|
+
lodlina run records-redaction --model gpt-5.4 # GPT-5.4 via Bedrock Mantle
|
|
304
|
+
lodlina run plain-language --model claude-sonnet-4-6 --provider anthropic # off-boundary
|
|
305
|
+
lodlina leaderboard --html # full model-comparison board
|
|
306
|
+
lodlina validate # check the built-in datasets are sound
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
`run` resolves the model Bedrock-first, binds the neutral grader, and is air-gap
|
|
310
|
+
safe. Use `--grader-model self` to let a model grade its own output.
|
|
311
|
+
|
|
312
|
+
## Run a single task (via Inspect directly)
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
# Default model (Sonnet 4.6 on Bedrock)
|
|
316
|
+
inspect eval src/lodlina/tasks/records_redaction.py
|
|
317
|
+
|
|
318
|
+
# Pick a model explicitly
|
|
319
|
+
inspect eval src/lodlina/tasks/records_redaction.py \
|
|
320
|
+
--model bedrock/us.anthropic.claude-opus-4-8
|
|
321
|
+
|
|
322
|
+
# A GPT-5.x model on the Bedrock Mantle endpoint (Responses API)
|
|
323
|
+
inspect eval src/lodlina/tasks/records_redaction.py \
|
|
324
|
+
--model openai-api/bedrock-mantle/openai.gpt-5.4 -M responses_api=true
|
|
325
|
+
|
|
326
|
+
# Inspect the run logs in a browser
|
|
327
|
+
inspect view
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Task modules:
|
|
331
|
+
`src/lodlina/tasks/records_redaction.py`,
|
|
332
|
+
`eligibility_fairness.py`, `grounded_qa.py`, `plain_language.py`.
|
|
333
|
+
|
|
334
|
+
## Regenerate / expand the synthetic data
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
python -m lodlina.datagen.generate_redaction
|
|
338
|
+
python -m lodlina.datagen.generate_eligibility
|
|
339
|
+
python -m lodlina.datagen.generate_grounded_qa
|
|
340
|
+
python -m lodlina.datagen.generate_plain_language
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## Build the leaderboard
|
|
344
|
+
|
|
345
|
+
Runs every task across the configured model list and renders a Markdown (and
|
|
346
|
+
optional HTML) comparison table:
|
|
347
|
+
|
|
348
|
+
```bash
|
|
349
|
+
python -m lodlina.leaderboard # default Bedrock-first line-up
|
|
350
|
+
python -m lodlina.leaderboard --models claude-sonnet-4-6 gpt-5.5
|
|
351
|
+
python -m lodlina.leaderboard --models claude-sonnet-4-6 --provider anthropic --html
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
`--models` takes aliases or full Inspect model strings; `--provider
|
|
355
|
+
openai|anthropic` forces the off-boundary route for aliases. Output is written to
|
|
356
|
+
`leaderboard/` (`results.md` / `results.json`, and `results.html` with `--html`),
|
|
357
|
+
including a **Models & data boundary** section noting where each run sent its data.
|
|
358
|
+
|
|
359
|
+
## Development & tests
|
|
360
|
+
|
|
361
|
+
```bash
|
|
362
|
+
uv pip install -e ".[dev]"
|
|
363
|
+
pytest
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
The test suite runs the **full Inspect pipeline offline** — each task is driven
|
|
367
|
+
end-to-end by a mock model with canned outputs (including the model-graded
|
|
368
|
+
scorers), so the deterministic grading logic is verified without AWS credentials
|
|
369
|
+
or network access. (Inspect's *estimated* token counts use a remote tokenizer
|
|
370
|
+
that the tests stub out; this estimate is unrelated to Lodlina's grading.)
|
|
371
|
+
|
|
372
|
+
---
|
|
373
|
+
|
|
374
|
+
## Layout
|
|
375
|
+
|
|
376
|
+
```
|
|
377
|
+
src/lodlina/
|
|
378
|
+
tasks/ # one Inspect @task per file
|
|
379
|
+
scorers/ # custom scorers + shared grading helpers
|
|
380
|
+
data/ # committed synthetic datasets (jsonl)
|
|
381
|
+
datagen/ # scripts that generate the synthetic data
|
|
382
|
+
leaderboard.py
|
|
383
|
+
docs/ # methodology writeup
|
|
384
|
+
leaderboard/ # generated results tables
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
Conventions mirror
|
|
388
|
+
[`inspect_evals`](https://github.com/UKGovernmentBEIS/inspect_evals) so Lodlina
|
|
389
|
+
could plausibly be contributed there later. License: **MIT** (matches
|
|
390
|
+
`inspect_evals`).
|
|
391
|
+
|
|
392
|
+
---
|
|
393
|
+
|
|
394
|
+
## License
|
|
395
|
+
|
|
396
|
+
[MIT](LICENSE).
|