lodlina 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. lodlina-0.2.0/.env.example +18 -0
  2. lodlina-0.2.0/.github/workflows/ci.yml +34 -0
  3. lodlina-0.2.0/.github/workflows/release.yml +38 -0
  4. lodlina-0.2.0/.gitignore +34 -0
  5. lodlina-0.2.0/CHANGELOG.md +55 -0
  6. lodlina-0.2.0/LICENSE +21 -0
  7. lodlina-0.2.0/PKG-INFO +396 -0
  8. lodlina-0.2.0/README.md +358 -0
  9. lodlina-0.2.0/docs/ROADMAP.md +139 -0
  10. lodlina-0.2.0/docs/methodology.md +270 -0
  11. lodlina-0.2.0/leaderboard/README.md +21 -0
  12. lodlina-0.2.0/pyproject.toml +65 -0
  13. lodlina-0.2.0/src/lodlina/__init__.py +27 -0
  14. lodlina-0.2.0/src/lodlina/_offline.py +75 -0
  15. lodlina-0.2.0/src/lodlina/cli.py +243 -0
  16. lodlina-0.2.0/src/lodlina/data/eligibility_fairness.jsonl +12 -0
  17. lodlina-0.2.0/src/lodlina/data/grounded_qa.jsonl +15 -0
  18. lodlina-0.2.0/src/lodlina/data/plain_language.jsonl +12 -0
  19. lodlina-0.2.0/src/lodlina/data/records_redaction.jsonl +18 -0
  20. lodlina-0.2.0/src/lodlina/datagen/__init__.py +11 -0
  21. lodlina-0.2.0/src/lodlina/datagen/generate_eligibility.py +123 -0
  22. lodlina-0.2.0/src/lodlina/datagen/generate_grounded_qa.py +203 -0
  23. lodlina-0.2.0/src/lodlina/datagen/generate_plain_language.py +112 -0
  24. lodlina-0.2.0/src/lodlina/datagen/generate_redaction.py +241 -0
  25. lodlina-0.2.0/src/lodlina/leaderboard.py +389 -0
  26. lodlina-0.2.0/src/lodlina/models.py +209 -0
  27. lodlina-0.2.0/src/lodlina/packs/__init__.py +45 -0
  28. lodlina-0.2.0/src/lodlina/packs/builtin/README.md +74 -0
  29. lodlina-0.2.0/src/lodlina/packs/builtin/eligibility-fairness/manifest.yaml +14 -0
  30. lodlina-0.2.0/src/lodlina/packs/builtin/grounded-qa/manifest.yaml +14 -0
  31. lodlina-0.2.0/src/lodlina/packs/builtin/plain-language/manifest.yaml +14 -0
  32. lodlina-0.2.0/src/lodlina/packs/builtin/records-redaction/manifest.yaml +16 -0
  33. lodlina-0.2.0/src/lodlina/packs/pack.py +282 -0
  34. lodlina-0.2.0/src/lodlina/packs/types.py +195 -0
  35. lodlina-0.2.0/src/lodlina/scorers/__init__.py +1 -0
  36. lodlina-0.2.0/src/lodlina/scorers/citation.py +204 -0
  37. lodlina-0.2.0/src/lodlina/scorers/common.py +118 -0
  38. lodlina-0.2.0/src/lodlina/scorers/fairness.py +124 -0
  39. lodlina-0.2.0/src/lodlina/scorers/readability.py +142 -0
  40. lodlina-0.2.0/src/lodlina/scorers/redaction.py +111 -0
  41. lodlina-0.2.0/src/lodlina/tasks/__init__.py +59 -0
  42. lodlina-0.2.0/src/lodlina/tasks/eligibility_fairness.py +79 -0
  43. lodlina-0.2.0/src/lodlina/tasks/grounded_qa.py +71 -0
  44. lodlina-0.2.0/src/lodlina/tasks/plain_language.py +55 -0
  45. lodlina-0.2.0/src/lodlina/tasks/records_redaction.py +72 -0
  46. lodlina-0.2.0/src/lodlina/validate.py +45 -0
  47. lodlina-0.2.0/tests/conftest.py +26 -0
  48. lodlina-0.2.0/tests/test_cli.py +61 -0
  49. lodlina-0.2.0/tests/test_eligibility.py +86 -0
  50. lodlina-0.2.0/tests/test_grounded_qa.py +89 -0
  51. lodlina-0.2.0/tests/test_leaderboard.py +79 -0
  52. lodlina-0.2.0/tests/test_models.py +77 -0
  53. lodlina-0.2.0/tests/test_offline.py +31 -0
  54. lodlina-0.2.0/tests/test_packs.py +138 -0
  55. lodlina-0.2.0/tests/test_plain_language.py +68 -0
  56. lodlina-0.2.0/tests/test_redaction.py +124 -0
@@ -0,0 +1,18 @@
1
+ # Lodlina local environment (copy to .env.local; .env* is gitignored).
2
+ #
3
+ # --- Amazon Bedrock: Claude line-up + the model-graded grader (us-east-1) ---
4
+ # Claude models and the pinned grader run via Inspect's bedrock/ provider
5
+ # (Converse API), using your standard AWS credentials. Either set a profile:
6
+ # export AWS_PROFILE=bedrock-test
7
+ # ...or set explicit keys:
8
+ # export AWS_ACCESS_KEY_ID=...
9
+ # export AWS_SECRET_ACCESS_KEY=...
10
+ export AWS_DEFAULT_REGION=us-east-1
11
+
12
+ # --- Bedrock Mantle: OpenAI GPT-5.x (us-east-2 / us-west-2 / us-gov-west-1) ---
13
+ # GPT-5.4 / GPT-5.5 are served on the separate Bedrock "Mantle" endpoint via the
14
+ # OpenAI Responses API, addressed through Inspect's openai-api provider. They use
15
+ # a Bedrock long-term API key (bearer token), NOT SigV4 credentials. Mantle is
16
+ # not available in us-east-1, so set the base URL's region to a supported one.
17
+ export BEDROCK_MANTLE_BASE_URL=https://bedrock-mantle.us-east-2.api.aws/openai/v1
18
+ export BEDROCK_MANTLE_API_KEY=ABSK...your-bedrock-api-key...
@@ -0,0 +1,34 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install (dev = tests + linter + all providers)
25
+ # setup-uv already provisions a .venv (VIRTUAL_ENV is set); install into it.
26
+ run: uv pip install -e ".[dev]"
27
+
28
+ - name: Lint
29
+ run: uv run ruff check src/ tests/
30
+
31
+ - name: Test (offline; no credentials needed)
32
+ # The suite drives the full Inspect pipeline with mock models, so it
33
+ # needs neither cloud credentials nor network access.
34
+ run: uv run pytest -q
@@ -0,0 +1,38 @@
1
+ name: Release
2
+
3
+ # Publishes to PyPI via Trusted Publishing (OIDC) when a GitHub Release is
4
+ # published. No API token is stored — PyPI verifies the workflow's identity.
5
+ # One-time setup on PyPI: create a "pending publisher" for project `lodlina`
6
+ # (owner: Lodlina, repo: Lodlina, workflow: release.yml, environment: pypi).
7
+
8
+ on:
9
+ release:
10
+ types: [published]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ publish:
17
+ runs-on: ubuntu-latest
18
+ environment: pypi
19
+ permissions:
20
+ # Job-level permissions REPLACE workflow-level, so both are needed here:
21
+ contents: read # for actions/checkout
22
+ id-token: write # required for OIDC Trusted Publishing
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - name: Install uv
27
+ uses: astral-sh/setup-uv@v5
28
+ with:
29
+ python-version: "3.12"
30
+
31
+ - name: Build sdist + wheel
32
+ run: uv build
33
+
34
+ - name: Verify metadata
35
+ run: uvx twine check dist/*
36
+
37
+ - name: Publish to PyPI (Trusted Publishing)
38
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,34 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # uv
12
+ uv.lock
13
+
14
+ # Inspect
15
+ logs/
16
+ .inspect/
17
+ .inspect-logs*/
18
+
19
+ # Leaderboard output (regenerated; see leaderboard/README.md)
20
+ leaderboard/results.md
21
+ leaderboard/results.json
22
+ leaderboard/results.html
23
+ leaderboard/results/*.json
24
+
25
+ # Secrets / local env (Bedrock Mantle API key, etc.) — never commit
26
+ .env
27
+ .env.*
28
+ !.env.example
29
+
30
+ # OS / editor
31
+ .DS_Store
32
+ .idea/
33
+ .vscode/
34
+ *.swp
@@ -0,0 +1,55 @@
1
+ # Changelog
2
+
3
+ All notable changes to Lodlina are documented here. Format follows
4
+ [Keep a Changelog](https://keepachangelog.com/); versions follow
5
+ [SemVer](https://semver.org/).
6
+
7
+ ## [0.2.0] — 2026-06-09
8
+
9
+ Theme: a real, installable, multi-provider package — Bedrock-first, with a
10
+ shareable eval-pack ecosystem and the groundwork for government (air-gapped) use.
11
+ Provider paths live-verified: AWS Bedrock (Claude), Bedrock Mantle (OpenAI
12
+ GPT-5.x), and direct OpenAI; direct Anthropic is wired and unit-tested.
13
+
14
+ ### Added
15
+ - **Eval-pack ecosystem** (Phase 2 complete): the leaderboard now runs **over
16
+ packs** (everything-is-a-pack); third parties can distribute packs as
17
+ pip-installable packages discovered via the `lodlina_packs` entry-point group
18
+ (built-ins always win on id collisions); and `lodlina new-pack <id>
19
+ --task-type <t>` scaffolds a valid starter pack. `lodlina list` shows built-in
20
+ and installed packs; `lodlina validate --pack <id|path>` validates one pack.
21
+ - **Eval-pack architecture** (Phase 2 spine): a pack is a `manifest.yaml` +
22
+ synthetic `dataset.jsonl` that references a curated **task type** (sample
23
+ mapping + prompt + vetted scorers) by name — **data + config only, no
24
+ contributed code**. The four built-in tasks now ship as packs. New:
25
+ `lodlina.packs` (task-type registry, manifest loader, discovery, validation),
26
+ built-in pack manifests, and `packs/builtin/README.md` documenting the format.
27
+ `lodlina run` loads packs (built-in id or `--pack <path>`); `lodlina validate`
28
+ validates packs (incl. verbatim gold spans + `synthetic: true`).
29
+ - **Unified `lodlina` CLI**: `lodlina list | run | leaderboard | validate`.
30
+ `run` evaluates one task against a model (alias-aware, grader bound, air-gap
31
+ safe); `validate` checks the built-in datasets (incl. that every gold span is
32
+ verbatim in its source). The `lodlina-leaderboard` script is retained.
33
+ - **Provider extras**: install only what you need — `lodlina[bedrock]`,
34
+ `[openai]`, `[anthropic]`, `[all]`, `[dev]`. The core install is
35
+ provider-agnostic (eval framework + graders, no cloud SDKs).
36
+ - **Model registry / aliases** (`src/lodlina/models.py`): pick a model by a short
37
+ alias (`claude-sonnet-4-6`, `gpt-5.5`, …) that resolves **Bedrock-first**.
38
+ Direct OpenAI/Anthropic are secondary routes selected only via `--provider`
39
+ (no silent cross-boundary fallback). Full Inspect model strings still work.
40
+ - **Data-boundary reporting**: the leaderboard labels each model's provider and
41
+ whether the run was in-boundary (Bedrock) or off-boundary (commercial API).
42
+ - **Air-gap support** (`src/lodlina/_offline.py`): Inspect's remote token-estimate
43
+ is replaced with an offline fallback at CLI startup; never affects grading.
44
+ - **CI** (GitHub Actions): ruff + the offline test suite on Python 3.10–3.12.
45
+
46
+ ### Changed
47
+ - Default leaderboard line-up is now expressed as aliases (Bedrock-first).
48
+ - The leaderboard now accepts a `--provider` flag to force an off-boundary route.
49
+
50
+ ## [0.1.0]
51
+ - Initial four tasks (records-redaction, eligibility-fairness, grounded-qa,
52
+ plain-language), each with a synthetic dataset, solver, and defensible scorer.
53
+ - Leaderboard runner (Markdown/JSON/HTML); methodology docs; offline test suite.
54
+ - Live on Amazon Bedrock (Claude via Converse) and Bedrock Mantle (OpenAI
55
+ GPT-5.x via the Responses API); a pinned neutral grader for model-graded scorers.
lodlina-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lodlina contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
lodlina-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,396 @@
1
+ Metadata-Version: 2.4
2
+ Name: lodlina
3
+ Version: 0.2.0
4
+ Summary: A plumb line for government AI: realistic U.S. public-sector tasks and automated graders for evaluating LLMs, built on Inspect.
5
+ Project-URL: Homepage, https://github.com/Lodlina/Lodlina
6
+ Project-URL: Repository, https://github.com/Lodlina/Lodlina
7
+ Author: Lodlina contributors
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: ai-evaluation,evals,government,inspect,llm,public-sector
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: inspect-ai>=0.3.50
16
+ Requires-Dist: pyyaml>=6.0
17
+ Requires-Dist: textstat>=0.7.3
18
+ Provides-Extra: all
19
+ Requires-Dist: aioboto3>=13.0; extra == 'all'
20
+ Requires-Dist: anthropic>=0.40; extra == 'all'
21
+ Requires-Dist: boto3>=1.34; extra == 'all'
22
+ Requires-Dist: openai>=1.40; extra == 'all'
23
+ Provides-Extra: anthropic
24
+ Requires-Dist: anthropic>=0.40; extra == 'anthropic'
25
+ Provides-Extra: bedrock
26
+ Requires-Dist: aioboto3>=13.0; extra == 'bedrock'
27
+ Requires-Dist: boto3>=1.34; extra == 'bedrock'
28
+ Provides-Extra: dev
29
+ Requires-Dist: aioboto3>=13.0; extra == 'dev'
30
+ Requires-Dist: anthropic>=0.40; extra == 'dev'
31
+ Requires-Dist: boto3>=1.34; extra == 'dev'
32
+ Requires-Dist: openai>=1.40; extra == 'dev'
33
+ Requires-Dist: pytest>=7.4; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Provides-Extra: openai
36
+ Requires-Dist: openai>=1.40; extra == 'openai'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Lodlina
40
+
41
+ **A plumb line for government AI.**
42
+
43
+ *Lodlina* is Swedish for **plumb line** — the weighted cord builders have used for
44
+ millennia to check whether something is true and upright. That is exactly what
45
+ this project is: a fair, reproducible standard for checking whether AI systems do
46
+ government work correctly, fairly, and honestly.
47
+
48
+ Lodlina is an open-source suite of realistic U.S. public-sector tasks paired with
49
+ **automated, defensible graders**, built on
50
+ [Inspect](https://inspect.aisi.org.uk) (the open evaluation framework from the UK
51
+ AI Security Institute, formerly the AI Safety Institute). It scores how well any LLM performs real government work and
52
+ produces a model-comparison leaderboard.
53
+
54
+ > A plumb line doesn't argue about which wall is prettier — it tells you, without
55
+ > opinion, whether the wall is true. Lodlina aims for the same: measurement you can
56
+ > defend, not vibes.
57
+
58
+ ---
59
+
60
+ ## Why this exists
61
+
62
+ Public-sector agencies are under real pressure to adopt AI for tasks like
63
+ processing records, making eligibility determinations, answering the public from
64
+ policy manuals, and communicating plainly. These tasks have a property most LLM
65
+ benchmarks ignore: **the cost of being wrong is asymmetric and concrete.** Leaking
66
+ a citizen's Social Security number is not a rounding error. Flipping an
67
+ eligibility decision because an applicant's name "sounds" a certain way is not a
68
+ style preference. Inventing a citation in a determination letter is not a minor
69
+ hallucination.
70
+
71
+ Lodlina measures the things that actually matter for government adoption, with
72
+ graders that an evaluation practitioner — or an inspector general — could audit.
73
+ The quality of the tasks and graders matters far more than breadth: **a few
74
+ defensible tasks beat many shallow ones.**
75
+
76
+ ---
77
+
78
+ ## The tasks (v1)
79
+
80
+ Each task ships with a synthetic dataset (input + labeled ground truth), a solver,
81
+ and a defensible scorer. Every definition of "correct" is documented below and in
82
+ [`docs/methodology.md`](docs/methodology.md).
83
+
84
+ ### 1. `records-redaction` — *don't leak personal privacy info*
85
+ A synthetic government document mixes **must-redact** items (SSNs, personal email,
86
+ home address, date of birth — FOIA **Exemption 6** personal-privacy information)
87
+ with clearly **releasable** content (program descriptions, public statistics,
88
+ officials acting in their official capacity, office contact info).
89
+
90
+ - **Task:** return a JSON list of the exact substrings to redact (every occurrence
91
+ is treated as redacted).
92
+ - **Scorer (deterministic):** matches predictions against the labeled gold spans
93
+ with normalized equals-or-contains matching.
94
+ - **`leak_rate`** *(headline)* — fraction of must-redact items the model missed.
95
+ A miss is a leak, the most serious failure.
96
+ - **`over_redaction_rate`** — fraction of clearly-releasable items the model
97
+ wrongly redacted (a label-grounded false-positive rate; over-redacting defeats
98
+ the purpose of FOIA disclosure).
99
+ - FOIA exemption categories are stored in the gold data but **not scored in v1**
100
+ (category accuracy is a documented backlog item).
101
+
102
+ ### 2. `eligibility-fairness` — *correct, and consistent under irrelevant changes*
103
+ A synthetic case file plus a synthetic-but-realistic policy-manual excerpt with
104
+ clear eligibility rules.
105
+
106
+ - **Task:** return an eligibility determination (`eligible` / `not eligible`) and
107
+ the rule the determination relied on.
108
+ - **Two scorers:**
109
+ - **`accuracy`** *(deterministic)* — determination vs. the rule-derived correct
110
+ answer.
111
+ - **`fairness` via metamorphic pairs** — for each case we auto-generate variants
112
+ that are **identical except for a legally-irrelevant attribute** (the
113
+ applicant's name is swapped across demographically-associated names). The
114
+ score flags any case whose determination **flips** across variants. This
115
+ measures inconsistency on irrelevant changes — concrete and defensible, not a
116
+ subjective "bias vibe". Headline: **`flip_rate`**.
117
+
118
+ ### 3. `grounded-qa` — *answer, and cite faithfully*
119
+ A policy document plus a question.
120
+
121
+ - **Task:** answer the question **and** cite the supporting passage(s), quoted
122
+ verbatim from the source.
123
+ - **Two scorers:**
124
+ - **`answer_correctness`** — model-graded against the reference answer with a
125
+ strict rubric.
126
+ - **`citation_faithfulness`** — every cited passage must appear **verbatim** in
127
+ the source (deterministic substring check) **and** must actually support the
128
+ claim (model-graded, strict rubric, only applied to citations that pass the
129
+ verbatim check). Headline: **`hallucinated_citation_rate`** — the fraction of
130
+ cited passages that are not verbatim in the source.
131
+
132
+ ### 4. `plain-language` — *rewrite simply without changing the meaning*
133
+ A dense bureaucratic paragraph.
134
+
135
+ - **Task:** rewrite it at roughly an 8th-grade reading level while preserving
136
+ meaning.
137
+ - **Two scorers:**
138
+ - **`readability_improvement`** *(deterministic)* — Flesch-Kincaid grade-level
139
+ drop via [`textstat`](https://pypi.org/project/textstat/), credited when the
140
+ rewrite lands near the target grade.
141
+ - **`meaning_preservation`** — model-graded **two-way entailment** with a strict
142
+ rubric (the rewrite must entail the original and the original must entail the
143
+ rewrite — no added or dropped facts).
144
+
145
+ ---
146
+
147
+ ## Grading philosophy (the heart of the project)
148
+
149
+ 1. **Prefer deterministic, defensible measurement.** Redaction, eligibility
150
+ accuracy, the verbatim-citation check, and readability are all computed from
151
+ labeled ground truth or exact string operations — no model judgment.
152
+ 2. **For fuzzy dimensions, use counterfactual / metamorphic pairs.** Fairness is
153
+ measured by changing only a legally-irrelevant attribute and checking whether
154
+ the output flips. We do **not** ship subjective "bias" graders.
155
+ 3. **Where a model-grader is unavoidable** (citation support, meaning
156
+ preservation), it gets a **strict rubric** and is **backed by a deterministic
157
+ check** wherever possible (e.g. a passage must pass the verbatim check before a
158
+ model is asked whether it supports the claim).
159
+ 4. **If a grader can't be made defensible, the task goes to the backlog** rather
160
+ than shipping weak.
161
+
162
+ Full detail — every task's definition of "correct" and exactly how its scorer
163
+ works — is in [`docs/methodology.md`](docs/methodology.md).
164
+
165
+ ---
166
+
167
+ ## Synthetic data & limitations
168
+
169
+ - **All data is synthetic.** No real PII or CUI is used anywhere. Personal
170
+ identifiers are deliberately fake: SSNs use the never-issued `900–999` area
171
+ range, phone numbers use the reserved `555-01xx` block, personal emails use
172
+ `example.com`, and names/addresses are fabricated. Generators live in
173
+ [`src/lodlina/datagen/`](src/lodlina/datagen/) and are seeded for
174
+ reproducibility; small seed sets (~12–18 samples/task) are committed so the repo
175
+ runs out of the box.
176
+ - **Synthetic ≠ representative.** Templated synthetic documents are cleaner and
177
+ more regular than real agency records. Scores here indicate capability on a
178
+ controlled proxy, not certified performance on production records.
179
+ - **Model-graded components inherit grader limitations.** Where we must use a model
180
+ grader, results depend on the grader model and rubric; we constrain and
181
+ deterministically back these wherever possible, but they are not infallible.
182
+ - **English / U.S. federal framing.** Tasks reflect U.S. federal concepts (e.g.
183
+ FOIA Exemption 6). They are a starting point, not a complete map of government
184
+ work.
185
+ - **Not legal advice or an authorization to deploy.** Lodlina is an evaluation
186
+ instrument, not a compliance certification.
187
+
188
+ ---
189
+
190
+ ## Backlog (future work, not yet built)
191
+
192
+ Listed here deliberately — these need methodology care before they're defensible:
193
+
194
+ - **political-neutrality** — requires symmetric paired prompts and measuring
195
+ response symmetry; the methodology needs care to avoid a subjective grader.
196
+ - **Section-508 alt-text** — accessibility alt-text quality.
197
+ - **FOIA exemption-reasoning** — justify *which* exemption applies and why
198
+ (extends redaction with category accuracy on correctly-caught items).
199
+ - **abstention on unanswerable policy questions** — reward declining to answer when
200
+ the policy doesn't contain the answer.
201
+
202
+ ---
203
+
204
+ ## Install
205
+
206
+ Lodlina uses [`uv`](https://docs.astral.sh/uv/) and Python ≥ 3.10.
207
+
208
+ The core install is provider-agnostic (the eval framework + the deterministic
209
+ graders, no cloud SDKs). Add a **provider extra** to actually run models —
210
+ **Amazon Bedrock is the primary, in-boundary provider**:
211
+
212
+ ```bash
213
+ uv venv
214
+ uv pip install -e ".[bedrock]" # AWS Bedrock (Claude via Converse)
215
+ uv pip install -e ".[bedrock,openai]" # + OpenAI (direct API and Bedrock Mantle/GPT-5.x)
216
+ uv pip install -e ".[anthropic]" # direct Anthropic API
217
+ uv pip install -e ".[all]" # every provider
218
+ uv pip install -e ".[dev]" # tests + linter + all providers
219
+ ```
220
+
221
+ | Extra | Pulls in | Enables |
222
+ |---|---|---|
223
+ | `bedrock` | `boto3`, `aioboto3` | Claude on Bedrock (Converse) |
224
+ | `openai` | `openai` | direct OpenAI **and** Bedrock Mantle (GPT-5.x) |
225
+ | `anthropic` | `anthropic` | direct Anthropic API |
226
+ | `all` | all of the above | everything |
227
+
228
+ ## Models & credentials
229
+
230
+ Lodlina is **Bedrock-first**. You select a model by a short **alias**
231
+ (`claude-sonnet-4-6`, `gpt-5.5`, …) and it resolves to that model's **Amazon
232
+ Bedrock** route by default, keeping prompts **in-boundary**. The direct
233
+ OpenAI / Anthropic APIs are secondary routes, chosen only when you explicitly
234
+ ask for them (`--provider openai|anthropic`) — there is **no silent
235
+ cross-boundary fallback**. You can also pass a full Inspect model string
236
+ directly. See [`src/lodlina/models.py`](src/lodlina/models.py) for the registry.
237
+
238
+ Copy [`.env.example`](.env.example) to `.env.local` and fill it in; the snippets
239
+ below show what each provider needs.
240
+
241
+ ### Claude on Bedrock (Converse API, `us-east-1`)
242
+
243
+ The Claude line-up and the model-graded **grader** use Inspect's `bedrock/`
244
+ provider with standard AWS credentials:
245
+
246
+ ```bash
247
+ export AWS_ACCESS_KEY_ID=... # or: export AWS_PROFILE=<profile>
248
+ export AWS_SECRET_ACCESS_KEY=...
249
+ export AWS_DEFAULT_REGION=us-east-1
250
+ ```
251
+
252
+ Bedrock model strings take the form `bedrock/<bedrock-model-id>`; Claude models
253
+ carry an `anthropic.` provider prefix and route via regional inference profiles,
254
+ e.g. `bedrock/us.anthropic.claude-sonnet-4-6`. (Haiku 4.5 has no short alias on
255
+ Bedrock, so it is pinned to the dated profile
256
+ `us.anthropic.claude-haiku-4-5-20251001-v1:0`.)
257
+
258
+ ### OpenAI GPT-5.x on Bedrock Mantle (Responses API, `us-east-2`)
259
+
260
+ GPT-5.4 and GPT-5.5 are **not** served by the Converse API. They live on the
261
+ separate Bedrock **Mantle** endpoint and speak the OpenAI **Responses API**, so
262
+ Lodlina addresses them through Inspect's generic `openai-api` provider with the
263
+ service prefix `bedrock-mantle` and `responses_api=true`. They authenticate with
264
+ a **Bedrock long-term API key** (a bearer token, *not* SigV4 credentials), and
265
+ Mantle is only available in `us-east-2` / `us-west-2` / `us-gov-west-1`:
266
+
267
+ ```bash
268
+ export BEDROCK_MANTLE_BASE_URL=https://bedrock-mantle.us-east-2.api.aws/openai/v1
269
+ export BEDROCK_MANTLE_API_KEY=ABSK... # Bedrock console → API keys → long-term
270
+ ```
271
+
272
+ Model strings look like `openai-api/bedrock-mantle/openai.gpt-5.4`. Alias
273
+ resolution applies `responses_api=true` automatically for these; on the CLI with
274
+ a full string, add `-M responses_api=true`. If the Mantle environment isn't set,
275
+ the leaderboard renders those rows as `—` rather than failing.
276
+
277
+ ### Direct OpenAI / Anthropic APIs (off-boundary)
278
+
279
+ Secondary routes that send prompts to the commercial APIs. Select them
280
+ explicitly with `--provider`; they read the standard keys:
281
+
282
+ ```bash
283
+ export OPENAI_API_KEY=... # for: --provider openai
284
+ export ANTHROPIC_API_KEY=... # for: --provider anthropic
285
+ ```
286
+
287
+ The leaderboard labels each model's provider and data boundary (in-boundary
288
+ Bedrock vs off-boundary commercial API) in its output, so a reviewer can see at
289
+ a glance where each run sent its data.
290
+
291
+ ### Air-gapped operation
292
+
293
+ Lodlina is designed to run with no internet access: all datasets are committed,
294
+ and Inspect's optional remote token-estimate is replaced with an offline
295
+ fallback at CLI startup (it does not affect grading). A fully vendored offline
296
+ install bundle is on the [roadmap](docs/ROADMAP.md).
297
+
298
+ ## The `lodlina` command
299
+
300
+ ```bash
301
+ lodlina list # available packs (built-in + installed) + model aliases
302
+ lodlina run grounded-qa --model claude-sonnet-4-6 --limit 5
303
+ lodlina run records-redaction --model gpt-5.4 # GPT-5.4 via Bedrock Mantle
304
+ lodlina run plain-language --model claude-sonnet-4-6 --provider anthropic # off-boundary
305
+ lodlina leaderboard --html # full model-comparison board
306
+ lodlina validate # check the built-in datasets are sound
307
+ ```
308
+
309
+ `run` resolves the model Bedrock-first, binds the neutral grader, and is air-gap
310
+ safe. Use `--grader-model self` to let a model grade its own output.
311
+
312
+ ## Run a single task (via Inspect directly)
313
+
314
+ ```bash
315
+ # Default model (Sonnet 4.6 on Bedrock)
316
+ inspect eval src/lodlina/tasks/records_redaction.py
317
+
318
+ # Pick a model explicitly
319
+ inspect eval src/lodlina/tasks/records_redaction.py \
320
+ --model bedrock/us.anthropic.claude-opus-4-8
321
+
322
+ # A GPT-5.x model on the Bedrock Mantle endpoint (Responses API)
323
+ inspect eval src/lodlina/tasks/records_redaction.py \
324
+ --model openai-api/bedrock-mantle/openai.gpt-5.4 -M responses_api=true
325
+
326
+ # Inspect the run logs in a browser
327
+ inspect view
328
+ ```
329
+
330
+ Task modules:
331
+ `src/lodlina/tasks/records_redaction.py`,
332
+ `eligibility_fairness.py`, `grounded_qa.py`, `plain_language.py`.
333
+
334
+ ## Regenerate / expand the synthetic data
335
+
336
+ ```bash
337
+ python -m lodlina.datagen.generate_redaction
338
+ python -m lodlina.datagen.generate_eligibility
339
+ python -m lodlina.datagen.generate_grounded_qa
340
+ python -m lodlina.datagen.generate_plain_language
341
+ ```
342
+
343
+ ## Build the leaderboard
344
+
345
+ Runs every task across the configured model list and renders a Markdown (and
346
+ optional HTML) comparison table:
347
+
348
+ ```bash
349
+ python -m lodlina.leaderboard # default Bedrock-first line-up
350
+ python -m lodlina.leaderboard --models claude-sonnet-4-6 gpt-5.5
351
+ python -m lodlina.leaderboard --models claude-sonnet-4-6 --provider anthropic --html
352
+ ```
353
+
354
+ `--models` takes aliases or full Inspect model strings; `--provider
355
+ openai|anthropic` forces the off-boundary route for aliases. Output is written to
356
+ `leaderboard/` (`results.md` / `results.json`, and `results.html` with `--html`),
357
+ including a **Models & data boundary** section noting where each run sent its data.
358
+
359
+ ## Development & tests
360
+
361
+ ```bash
362
+ uv pip install -e ".[dev]"
363
+ pytest
364
+ ```
365
+
366
+ The test suite runs the **full Inspect pipeline offline** — each task is driven
367
+ end-to-end by a mock model with canned outputs (including the model-graded
368
+ scorers), so the deterministic grading logic is verified without AWS credentials
369
+ or network access. (Inspect's *estimated* token counts use a remote tokenizer
370
+ that the tests stub out; this estimate is unrelated to Lodlina's grading.)
371
+
372
+ ---
373
+
374
+ ## Layout
375
+
376
+ ```
377
+ src/lodlina/
378
+ tasks/ # one Inspect @task per file
379
+ scorers/ # custom scorers + shared grading helpers
380
+ data/ # committed synthetic datasets (jsonl)
381
+ datagen/ # scripts that generate the synthetic data
382
+ leaderboard.py
383
+ docs/ # methodology writeup + roadmap
384
+ leaderboard/ # generated results tables
385
+ ```
386
+
387
+ Conventions mirror
388
+ [`inspect_evals`](https://github.com/UKGovernmentBEIS/inspect_evals) so Lodlina
389
+ could plausibly be contributed there later. License: **MIT** (matches
390
+ `inspect_evals`).
391
+
392
+ ---
393
+
394
+ ## License
395
+
396
+ [MIT](LICENSE).