neurodock-evals 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neurodock_evals-0.0.1/.gitignore +189 -0
- neurodock_evals-0.0.1/CHANGELOG.md +28 -0
- neurodock_evals-0.0.1/PKG-INFO +96 -0
- neurodock_evals-0.0.1/README.md +80 -0
- neurodock_evals-0.0.1/corpora/README.md +30 -0
- neurodock_evals-0.0.1/corpora/translation/README.md +14 -0
- neurodock_evals-0.0.1/corpora/translation/incoming/001-circle-back.example.yaml +27 -0
- neurodock_evals-0.0.1/corpora/translation/incoming/002-loop-you-in.example.yaml +32 -0
- neurodock_evals-0.0.1/corpora/translation/incoming/003-revisit-next-week.example.yaml +37 -0
- neurodock_evals-0.0.1/corpora/translation/incoming/README.md +8 -0
- neurodock_evals-0.0.1/corpora/translation/meetings/001-short-product-meeting.example.yaml +38 -0
- neurodock_evals-0.0.1/corpora/translation/meetings/002-hedged-decision.example.yaml +37 -0
- neurodock_evals-0.0.1/corpora/translation/meetings/README.md +9 -0
- neurodock_evals-0.0.1/corpora/translation/outgoing/001-rewrite-for-warmth.example.yaml +29 -0
- neurodock_evals-0.0.1/corpora/translation/outgoing/002-preserve-technical-terms.example.yaml +33 -0
- neurodock_evals-0.0.1/corpora/translation/outgoing/README.md +6 -0
- neurodock_evals-0.0.1/corpora/translation/tone/001-blunt-pr-comment.example.yaml +29 -0
- neurodock_evals-0.0.1/corpora/translation/tone/002-warm-deadline-reminder.example.yaml +27 -0
- neurodock_evals-0.0.1/corpora/translation/tone/README.md +4 -0
- neurodock_evals-0.0.1/pyproject.toml +48 -0
- neurodock_evals-0.0.1/schemas/annotation.schema.json +39 -0
- neurodock_evals-0.0.1/schemas/example.schema.json +79 -0
- neurodock_evals-0.0.1/src/neurodock_evals/__init__.py +62 -0
- neurodock_evals-0.0.1/src/neurodock_evals/anonymise.py +278 -0
- neurodock_evals-0.0.1/src/neurodock_evals/corpus.py +170 -0
- neurodock_evals-0.0.1/src/neurodock_evals/dedupe.py +90 -0
- neurodock_evals-0.0.1/src/neurodock_evals/harness.py +192 -0
- neurodock_evals-0.0.1/src/neurodock_evals/py.typed +0 -0
- neurodock_evals-0.0.1/src/neurodock_evals/runner.py +118 -0
- neurodock_evals-0.0.1/src/neurodock_evals/scoring.py +144 -0
- neurodock_evals-0.0.1/src/neurodock_evals/types.py +95 -0
- neurodock_evals-0.0.1/tests/conftest.py +18 -0
- neurodock_evals-0.0.1/tests/test_anonymise.py +85 -0
- neurodock_evals-0.0.1/tests/test_corpus_load.py +38 -0
- neurodock_evals-0.0.1/tests/test_dedupe.py +45 -0
- neurodock_evals-0.0.1/tests/test_harness_cli.py +105 -0
- neurodock_evals-0.0.1/tests/test_runner.py +56 -0
- neurodock_evals-0.0.1/tests/test_scoring.py +55 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# ─────────────────────────────────────────────────────────────
|
|
2
|
+
# Node / pnpm (TypeScript packages: core, cli, extension-browser, docs)
|
|
3
|
+
# ─────────────────────────────────────────────────────────────
|
|
4
|
+
node_modules/
|
|
5
|
+
.pnpm-store/
|
|
6
|
+
.pnpm-debug.log*
|
|
7
|
+
npm-debug.log*
|
|
8
|
+
yarn-debug.log*
|
|
9
|
+
yarn-error.log*
|
|
10
|
+
|
|
11
|
+
# Build outputs
|
|
12
|
+
dist/
|
|
13
|
+
build/
|
|
14
|
+
out/
|
|
15
|
+
*.tsbuildinfo
|
|
16
|
+
.next/
|
|
17
|
+
.nuxt/
|
|
18
|
+
|
|
19
|
+
# Turborepo cache
|
|
20
|
+
.turbo/
|
|
21
|
+
|
|
22
|
+
# ─────────────────────────────────────────────────────────────
|
|
23
|
+
# Python / uv (MCP servers, clinical, evals)
|
|
24
|
+
# ─────────────────────────────────────────────────────────────
|
|
25
|
+
__pycache__/
|
|
26
|
+
*.py[cod]
|
|
27
|
+
*$py.class
|
|
28
|
+
*.so
|
|
29
|
+
|
|
30
|
+
# uv / virtualenvs
|
|
31
|
+
.venv/
|
|
32
|
+
venv/
|
|
33
|
+
env/
|
|
34
|
+
.python-version
|
|
35
|
+
|
|
36
|
+
# Packaging
|
|
37
|
+
*.egg-info/
|
|
38
|
+
*.egg
|
|
39
|
+
dist/
|
|
40
|
+
build/
|
|
41
|
+
wheels/
|
|
42
|
+
*.whl
|
|
43
|
+
|
|
44
|
+
# Test / coverage
|
|
45
|
+
.pytest_cache/
|
|
46
|
+
.mypy_cache/
|
|
47
|
+
.ruff_cache/
|
|
48
|
+
.coverage
|
|
49
|
+
.coverage.*
|
|
50
|
+
coverage.xml
|
|
51
|
+
htmlcov/
|
|
52
|
+
.tox/
|
|
53
|
+
.hypothesis/
|
|
54
|
+
|
|
55
|
+
# Jupyter (in case any analysis notebooks land in scripts/)
|
|
56
|
+
.ipynb_checkpoints/
|
|
57
|
+
|
|
58
|
+
# ─────────────────────────────────────────────────────────────
|
|
59
|
+
# WXT browser extension (packages/extension-browser/)
|
|
60
|
+
# ─────────────────────────────────────────────────────────────
|
|
61
|
+
.output/
|
|
62
|
+
.wxt/
|
|
63
|
+
web-ext-artifacts/
|
|
64
|
+
*.zip
|
|
65
|
+
*.crx
|
|
66
|
+
*.xpi
|
|
67
|
+
|
|
68
|
+
# Extension store submission artefacts — keep these local
|
|
69
|
+
extension-submission/
|
|
70
|
+
|
|
71
|
+
# ─────────────────────────────────────────────────────────────
|
|
72
|
+
# Astro / Starlight docs (docs/)
|
|
73
|
+
# ─────────────────────────────────────────────────────────────
|
|
74
|
+
.astro/
|
|
75
|
+
|
|
76
|
+
# ─────────────────────────────────────────────────────────────
|
|
77
|
+
# Local NeuroDock runtime data — NEVER commit
|
|
78
|
+
# ─────────────────────────────────────────────────────────────
|
|
79
|
+
# Profile files contain user neurotype declarations
|
|
80
|
+
profile.yaml
|
|
81
|
+
profile.yml
|
|
82
|
+
.neurodock/
|
|
83
|
+
**/.neurodock/
|
|
84
|
+
|
|
85
|
+
# SQLite stores (cognitive graph, embeddings, traces)
|
|
86
|
+
*.sqlite
|
|
87
|
+
*.sqlite-journal
|
|
88
|
+
*.sqlite-wal
|
|
89
|
+
*.sqlite-shm
|
|
90
|
+
*.db
|
|
91
|
+
*.db-journal
|
|
92
|
+
|
|
93
|
+
# JSONL event logs
|
|
94
|
+
*.jsonl
|
|
95
|
+
# eval corpora are checked in
!packages/evals/**/*.jsonl
|
|
96
|
+
# test fixtures are checked in
!**/fixtures/*.jsonl
|
|
97
|
+
|
|
98
|
+
# Local embedding caches
|
|
99
|
+
.embeddings/
|
|
100
|
+
embeddings-cache/
|
|
101
|
+
|
|
102
|
+
# OpenTelemetry local trace dumps
|
|
103
|
+
.otel/
|
|
104
|
+
traces/
|
|
105
|
+
|
|
106
|
+
# ─────────────────────────────────────────────────────────────
|
|
107
|
+
# Secrets / environment
|
|
108
|
+
# ─────────────────────────────────────────────────────────────
|
|
109
|
+
.env
|
|
110
|
+
.env.*
|
|
111
|
+
!.env.example
|
|
112
|
+
!.env.template
|
|
113
|
+
*.pem
|
|
114
|
+
*.key
|
|
115
|
+
*.p12
|
|
116
|
+
secrets/
|
|
117
|
+
|
|
118
|
+
# Anthropic / LLM provider keys (defence in depth)
|
|
119
|
+
.anthropic/
|
|
120
|
+
.openai/
|
|
121
|
+
|
|
122
|
+
# Browser extension signing keys — local only
|
|
123
|
+
*.signing-key
|
|
124
|
+
extension-private-key.pem
|
|
125
|
+
|
|
126
|
+
# ─────────────────────────────────────────────────────────────
|
|
127
|
+
# OS / editor (you're on Windows + VS Code)
|
|
128
|
+
# ─────────────────────────────────────────────────────────────
|
|
129
|
+
# Windows
|
|
130
|
+
Thumbs.db
|
|
131
|
+
ehthumbs.db
|
|
132
|
+
ehthumbs_vista.db
|
|
133
|
+
Desktop.ini
|
|
134
|
+
$RECYCLE.BIN/
|
|
135
|
+
*.lnk
|
|
136
|
+
|
|
137
|
+
# macOS (in case any contributor PRs from a Mac)
|
|
138
|
+
.DS_Store
|
|
139
|
+
.AppleDouble
|
|
140
|
+
.LSOverride
|
|
141
|
+
._*
|
|
142
|
+
|
|
143
|
+
# Linux
|
|
144
|
+
*~
|
|
145
|
+
.directory
|
|
146
|
+
.Trash-*
|
|
147
|
+
|
|
148
|
+
# VS Code — share recommendations, ignore personal state
|
|
149
|
+
.vscode/*
|
|
150
|
+
!.vscode/extensions.json
|
|
151
|
+
!.vscode/settings.shared.json
|
|
152
|
+
!.vscode/launch.shared.json
|
|
153
|
+
|
|
154
|
+
# JetBrains
|
|
155
|
+
.idea/
|
|
156
|
+
*.iml
|
|
157
|
+
*.iws
|
|
158
|
+
|
|
159
|
+
# Vim / Emacs
|
|
160
|
+
*.swp
|
|
161
|
+
*.swo
|
|
162
|
+
*~
|
|
163
|
+
.\#*
|
|
164
|
+
\#*\#
|
|
165
|
+
|
|
166
|
+
# ─────────────────────────────────────────────────────────────
|
|
167
|
+
# Claude Code / agent runtime — keep agents tracked, ignore state
|
|
168
|
+
# ─────────────────────────────────────────────────────────────
|
|
169
|
+
.claude/cache/
|
|
170
|
+
.claude/history/
|
|
171
|
+
.claude/sessions/
|
|
172
|
+
.claude/local/
|
|
173
|
+
|
|
174
|
+
# Do keep: .claude/agents/, .claude/skills/, .claude/settings.json
|
|
175
|
+
|
|
176
|
+
# ─────────────────────────────────────────────────────────────
|
|
177
|
+
# Misc
|
|
178
|
+
# ─────────────────────────────────────────────────────────────
|
|
179
|
+
*.log
|
|
180
|
+
logs/
|
|
181
|
+
tmp/
|
|
182
|
+
.tmp/
|
|
183
|
+
.cache/
|
|
184
|
+
.parcel-cache/
|
|
185
|
+
|
|
186
|
+
# Lockfiles policy: KEEP these committed
|
|
187
|
+
# !pnpm-lock.yaml
|
|
188
|
+
# !uv.lock
|
|
189
|
+
# (intentionally not listed above so they're tracked)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `neurodock-evals`. Format loosely follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com).
|
|
5
|
+
|
|
6
|
+
## [0.0.1] — 2026-05-17
|
|
7
|
+
|
|
8
|
+
### Added
|
|
9
|
+
|
|
10
|
+
- Initial harness scaffold: `harness.py`, `corpus.py`, `runner.py`, `scoring.py`,
|
|
11
|
+
`anonymise.py`, `dedupe.py`, `types.py`.
|
|
12
|
+
- JSON Schemas: `schemas/example.schema.json`, `schemas/annotation.schema.json`.
|
|
13
|
+
- Ten hand-authored, **synthesised** seed examples across four translation
|
|
14
|
+
slices: `translation/incoming/` (3), `translation/tone/` (2),
|
|
15
|
+
`translation/outgoing/` (2), `translation/meetings/` (3, including a
|
|
16
|
+
multi-rater agreement metric demonstration).
|
|
17
|
+
- `corpora/guardrail/` placeholder, reserved for future guardrail corpora.
|
|
18
|
+
- `CONTRIBUTING.md` describing the consent + anonymisation pipeline.
|
|
19
|
+
- Pytest suite covering corpus load, anonymisation, dedupe, runner, scoring,
|
|
20
|
+
and harness CLI.
|
|
21
|
+
|
|
22
|
+
### Not yet
|
|
23
|
+
|
|
24
|
+
- Real contributed corpora — gated on the consent + anonymisation pipeline.
|
|
25
|
+
- HuggingFace publication — deferred to v0.1.0.
|
|
26
|
+
- Tuned CI thresholds — the harness ships with permissive defaults; tuning
|
|
27
|
+
follows the first real contributions.
|
|
28
|
+
- Rater-agreement adjudication workflow beyond a single demo example.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neurodock-evals
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: NeuroDock eval corpora and harness — versioned datasets for translation, skills, and guardrails.
|
|
5
|
+
Author: NeuroDock contributors
|
|
6
|
+
License: AGPL-3.0-or-later
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: jsonschema>=4.21
|
|
9
|
+
Requires-Dist: neurodock-mcp-translation
|
|
10
|
+
Requires-Dist: pydantic>=2.7
|
|
11
|
+
Requires-Dist: pyyaml>=6.0
|
|
12
|
+
Provides-Extra: test
|
|
13
|
+
Requires-Dist: pytest-asyncio>=1.3.0; extra == 'test'
|
|
14
|
+
Requires-Dist: pytest>=8.3.0; extra == 'test'
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# neurodock-evals
|
|
18
|
+
|
|
19
|
+
The versioned eval corpora and the air-gapped harness that runs ND prompts
|
|
20
|
+
against them.
|
|
21
|
+
|
|
22
|
+
The corpus is the strategic asset that makes the translation layer honest. We
|
|
23
|
+
prove that ND-aware prompts help neurodivergent users in real situations, and
|
|
24
|
+
we catch regressions when prompts change. The harness gates prompt PRs in CI.
|
|
25
|
+
|
|
26
|
+
This package is **v0.0.1** — the scaffold, the harness, and 6-10 hand-authored
|
|
27
|
+
seed examples. The seeds are **synthesised to
|
|
28
|
+
demonstrate the format** — they are NOT real corporate messages. Real
|
|
29
|
+
contributed corpora arrive over Phase 2 (target ~300 examples by month
|
|
30
|
+
6).
|
|
31
|
+
|
|
32
|
+
## What's here
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
packages/evals/
|
|
36
|
+
├── src/neurodock_evals/ # Harness, anonymiser, deduper, scorer
|
|
37
|
+
├── corpora/ # Versioned YAML eval examples by slice
|
|
38
|
+
├── schemas/ # JSON Schemas for examples + annotations
|
|
39
|
+
└── tests/ # Tests for the harness itself
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick start
|
|
43
|
+
|
|
44
|
+
Run the harness against the seed corpora:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
uv run python -m neurodock_evals.harness --corpus translation/incoming \
|
|
48
|
+
--tool translate_incoming
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Run all four translation slices:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uv run python -m neurodock_evals.harness --ci
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Anonymise a contribution before opening a PR:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
uv run python -m neurodock_evals.anonymise path/to/example.yaml
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Air-gapped by design
|
|
64
|
+
|
|
65
|
+
The harness never calls an LLM. It exercises each tool's **deterministic
|
|
66
|
+
baseline** (the heuristic layer the translation server returns even before any
|
|
67
|
+
LLM refinement) and scores the baseline against the human-rated `expected`
|
|
68
|
+
block. Any LLM-side eval is a separate concern that the maintainer reviews
|
|
69
|
+
under a different policy.
|
|
70
|
+
|
|
71
|
+
## Privacy
|
|
72
|
+
|
|
73
|
+
- The harness never logs example contents to stdout or to anywhere outside
|
|
74
|
+
`.eval-reports/`.
|
|
75
|
+
- Reports contain example IDs and scores only — never verbatim text.
|
|
76
|
+
- The contribution pipeline (`anonymise.py`) is a safety net, NOT a substitute
|
|
77
|
+
for contributor judgement. See `CONTRIBUTING.md`.
|
|
78
|
+
- All corpora are licensed **AGPL-3.0-or-later**.
|
|
79
|
+
|
|
80
|
+
## Glossary
|
|
81
|
+
|
|
82
|
+
| Term | Meaning |
|
|
83
|
+
| ---- | ------- |
|
|
84
|
+
| corpus slice | a directory under `corpora/<server>/<slice>/`; the unit of versioning |
|
|
85
|
+
| example | one YAML file under a slice — one input, one `expected` block, multiple ratings |
|
|
86
|
+
| rating | one ND-rater's judgement of how close the `expected` block matches their read |
|
|
87
|
+
| deterministic baseline | the heuristic output a translation tool returns without invoking an LLM |
|
|
88
|
+
| eval-corpus binding | every `mcp-translation` tool cites the slice that validates it (ADR 0005 §4) |
|
|
89
|
+
|
|
90
|
+
## Status
|
|
91
|
+
|
|
92
|
+
- v0.0.1 (current): scaffold + harness + 10 synthesised seed examples
|
|
93
|
+
- v0.0.2 (planned): first contributed corpus (gated on the consent pipeline)
|
|
94
|
+
- v0.1.0 (planned): HuggingFace publication pipeline under the `neurodock` org
|
|
95
|
+
|
|
96
|
+
See `CHANGELOG.md` for detail.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# neurodock-evals
|
|
2
|
+
|
|
3
|
+
The versioned eval corpora and the air-gapped harness that runs ND prompts
|
|
4
|
+
against them.
|
|
5
|
+
|
|
6
|
+
The corpus is the strategic asset that makes the translation layer honest. We
|
|
7
|
+
prove that ND-aware prompts help neurodivergent users in real situations, and
|
|
8
|
+
we catch regressions when prompts change. The harness gates prompt PRs in CI.
|
|
9
|
+
|
|
10
|
+
This package is **v0.0.1** — the scaffold, the harness, and 6-10 hand-authored
|
|
11
|
+
seed examples. The seeds are **synthesised to
|
|
12
|
+
demonstrate the format** — they are NOT real corporate messages. Real
|
|
13
|
+
contributed corpora arrive over Phase 2 (target ~300 examples by month
|
|
14
|
+
6).
|
|
15
|
+
|
|
16
|
+
## What's here
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
packages/evals/
|
|
20
|
+
├── src/neurodock_evals/ # Harness, anonymiser, deduper, scorer
|
|
21
|
+
├── corpora/ # Versioned YAML eval examples by slice
|
|
22
|
+
├── schemas/ # JSON Schemas for examples + annotations
|
|
23
|
+
└── tests/ # Tests for the harness itself
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
Run the harness against the seed corpora:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv run python -m neurodock_evals.harness --corpus translation/incoming \
|
|
32
|
+
--tool translate_incoming
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Run all four translation slices:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv run python -m neurodock_evals.harness --ci
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Anonymise a contribution before opening a PR:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv run python -m neurodock_evals.anonymise path/to/example.yaml
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Air-gapped by design
|
|
48
|
+
|
|
49
|
+
The harness never calls an LLM. It exercises each tool's **deterministic
|
|
50
|
+
baseline** (the heuristic layer the translation server returns even before any
|
|
51
|
+
LLM refinement) and scores the baseline against the human-rated `expected`
|
|
52
|
+
block. Any LLM-side eval is a separate concern that the maintainer reviews
|
|
53
|
+
under a different policy.
|
|
54
|
+
|
|
55
|
+
## Privacy
|
|
56
|
+
|
|
57
|
+
- The harness never logs example contents to stdout or to anywhere outside
|
|
58
|
+
`.eval-reports/`.
|
|
59
|
+
- Reports contain example IDs and scores only — never verbatim text.
|
|
60
|
+
- The contribution pipeline (`anonymise.py`) is a safety net, NOT a substitute
|
|
61
|
+
for contributor judgement. See `CONTRIBUTING.md`.
|
|
62
|
+
- All corpora are licensed **AGPL-3.0-or-later**.
|
|
63
|
+
|
|
64
|
+
## Glossary
|
|
65
|
+
|
|
66
|
+
| Term | Meaning |
|
|
67
|
+
| ---- | ------- |
|
|
68
|
+
| corpus slice | a directory under `corpora/<server>/<slice>/`; the unit of versioning |
|
|
69
|
+
| example | one YAML file under a slice — one input, one `expected` block, multiple ratings |
|
|
70
|
+
| rating | one ND-rater's judgement of how close the `expected` block matches their read |
|
|
71
|
+
| deterministic baseline | the heuristic output a translation tool returns without invoking an LLM |
|
|
72
|
+
| eval-corpus binding | every `mcp-translation` tool cites the slice that validates it (ADR 0005 §4) |
|
|
73
|
+
|
|
74
|
+
## Status
|
|
75
|
+
|
|
76
|
+
- v0.0.1 (current): scaffold + harness + 10 synthesised seed examples
|
|
77
|
+
- v0.0.2 (planned): first contributed corpus (gated on the consent pipeline)
|
|
78
|
+
- v0.1.0 (planned): HuggingFace publication pipeline under the `neurodock` org
|
|
79
|
+
|
|
80
|
+
See `CHANGELOG.md` for detail.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Eval corpora
|
|
2
|
+
|
|
3
|
+
Versioned datasets that anchor the prompt regression suite. Every directory
|
|
4
|
+
under here is **one slice**: one server, one tool-family, one set of
|
|
5
|
+
hand-rated examples.
|
|
6
|
+
|
|
7
|
+
## Layout
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
corpora/
|
|
11
|
+
├── translation/ # Slices for the mcp-translation server
|
|
12
|
+
│ ├── incoming/ # translate_incoming evaluator
|
|
13
|
+
│ ├── tone/ # check_tone evaluator
|
|
14
|
+
│ ├── outgoing/ # rewrite_outgoing evaluator
|
|
15
|
+
│ └── meetings/ # brief_meeting evaluator
|
|
16
|
+
└── guardrail/ # Reserved for future guardrail slices
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## File format
|
|
20
|
+
|
|
21
|
+
One YAML file per example, named `NNN-slug.example.yaml`. Validated against
|
|
22
|
+
`packages/evals/schemas/example.schema.json`. See `CONTRIBUTING.md` for the
|
|
23
|
+
contribution flow.
|
|
24
|
+
|
|
25
|
+
## Provenance
|
|
26
|
+
|
|
27
|
+
Every example in v0.0.1 is **synthesised** to
|
|
28
|
+
demonstrate the format and exercise the harness end-to-end. These are NOT
|
|
29
|
+
real corporate messages. Phase 2 brings the first contributed examples
|
|
30
|
+
through the consent pipeline at `evals.neurodock.org/contribute`.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# translation/ slices
|
|
2
|
+
|
|
3
|
+
Eval corpora for the four `mcp-translation` tools, per ADR 0005 §4
|
|
4
|
+
(eval-corpus binding).
|
|
5
|
+
|
|
6
|
+
| Slice | Tool | What it tests |
|
|
7
|
+
| ----- | ---- | ------------- |
|
|
8
|
+
| `translation/incoming` | `translate_incoming` | Subtext, ambiguity, recommended next action on incoming messages |
|
|
9
|
+
| `translation/tone` | `check_tone` | Directness/warmth/urgency scoring and flagged phrases |
|
|
10
|
+
| `translation/outgoing` | `rewrite_outgoing` | Register-targeted rewrites that preserve specified terms |
|
|
11
|
+
| `translation/meetings` | `brief_meeting` | Verbatim-anchored partition of a transcript into asks/decisions/ambiguities |
|
|
12
|
+
|
|
13
|
+
Each slice is loaded by `neurodock_evals.corpus.load_slice("translation/<name>")`
|
|
14
|
+
and run via `neurodock_evals.runner.run_example`.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
id: "translation.incoming.001"
|
|
2
|
+
slice: "translation/incoming"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
text: "I'll circle back on the migration plan next week — quick one, just need a sanity check."
|
|
12
|
+
channel: "slack"
|
|
13
|
+
expected:
|
|
14
|
+
explicit_ask: null
|
|
15
|
+
ambiguity:
|
|
16
|
+
detected: true
|
|
17
|
+
recommended_next_action:
|
|
18
|
+
action: "set_reminder"
|
|
19
|
+
ratings:
|
|
20
|
+
- rater_id: "rater-curator-A"
|
|
21
|
+
rater_neurotypes: ["adhd"]
|
|
22
|
+
agreement_with_expected: 0.9
|
|
23
|
+
notes: "Classic soft-deferral. 'I'll circle back' + 'next week' both signal the response will not self-fulfil."
|
|
24
|
+
notes: |
|
|
25
|
+
Demonstrates the hedged-commitment / vague-timeline combination. The baseline
|
|
26
|
+
should flag both phrases and recommend `set_reminder` because there is no
|
|
27
|
+
explicit ask to act on.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
id: "translation.incoming.002"
|
|
2
|
+
slice: "translation/incoming"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
text: "Have you had a chance to look at the rollout doc? No rush but it's been a while."
|
|
12
|
+
channel: "email"
|
|
13
|
+
thread_context:
|
|
14
|
+
- "Sharing the rollout doc — would love your thoughts before Friday."
|
|
15
|
+
- "(no reply)"
|
|
16
|
+
expected:
|
|
17
|
+
ambiguity:
|
|
18
|
+
detected: true
|
|
19
|
+
recommended_next_action:
|
|
20
|
+
action: "reply"
|
|
21
|
+
ratings:
|
|
22
|
+
- rater_id: "rater-curator-A"
|
|
23
|
+
rater_neurotypes: ["audhd"]
|
|
24
|
+
agreement_with_expected: 0.85
|
|
25
|
+
notes: |
|
|
26
|
+
Polite-overdue framing. "Have you had a chance" + "no rush" reads as
|
|
27
|
+
implied urgency once you account for the thread context. The baseline
|
|
28
|
+
should flag this and recommend acknowledging receipt with a concrete
|
|
29
|
+
next step.
|
|
30
|
+
notes: |
|
|
31
|
+
Implied-urgency pattern: polite words masking a follow-up. The baseline
|
|
32
|
+
flags both `have you had a chance` and `no rush` as implied_urgency.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
id: "translation.incoming.003"
|
|
2
|
+
slice: "translation/incoming"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
text: "Hey — can we revisit the rollout timeline? I'm not sure everyone is aligned."
|
|
12
|
+
channel: "slack"
|
|
13
|
+
thread_context:
|
|
14
|
+
- "We agreed last sprint to ship the rollout by end of May."
|
|
15
|
+
- "Two engineers flagged risk on the migration script."
|
|
16
|
+
expected:
|
|
17
|
+
ambiguity:
|
|
18
|
+
detected: true
|
|
19
|
+
recommended_next_action:
|
|
20
|
+
action: "clarify"
|
|
21
|
+
ratings:
|
|
22
|
+
- rater_id: "rater-curator-A"
|
|
23
|
+
rater_neurotypes: ["asd"]
|
|
24
|
+
agreement_with_expected: 0.95
|
|
25
|
+
notes: |
|
|
26
|
+
Soft-request + implied-blame combo. The 'can we revisit' phrase reads as
|
|
27
|
+
'I want to change' rather than a neutral re-open; 'not sure everyone is
|
|
28
|
+
aligned' softly attributes misalignment without naming who.
|
|
29
|
+
- rater_id: "rater-curator-B"
|
|
30
|
+
rater_neurotypes: ["adhd", "asd"]
|
|
31
|
+
agreement_with_expected: 0.8
|
|
32
|
+
notes: |
|
|
33
|
+
Agree on subtext, but I'd want the recommendation to push harder on
|
|
34
|
+
naming the specific stakeholders before agreeing to reopen scope.
|
|
35
|
+
notes: |
|
|
36
|
+
The canonical 'can we revisit' example from the v0.1.0 schema. Carries two
|
|
37
|
+
raters so the harness can demonstrate inter-rater agreement reporting.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# translation/incoming/
|
|
2
|
+
|
|
3
|
+
Examples for the `translate_incoming` tool. Each example carries an `input`
|
|
4
|
+
matching `TranslateIncomingInput` and an `expected` block naming the fields
|
|
5
|
+
the rater considers diagnostic.
|
|
6
|
+
|
|
7
|
+
The runner scores against the deterministic baseline (no LLM). v0.0.1 ships
|
|
8
|
+
three synthesised seed examples; real contributed examples arrive in Phase 2.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
id: "translation.meetings.001"
|
|
2
|
+
slice: "translation/meetings"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
transcript: |
|
|
12
|
+
Alex: Sam, can you own the migration script for the rollout?
|
|
13
|
+
Sam: Yes — I'll have it by Wednesday.
|
|
14
|
+
Alex: Great. We agreed to ship by end of May.
|
|
15
|
+
me: "Sam"
|
|
16
|
+
speakers: ["Alex", "Sam"]
|
|
17
|
+
project: "rollout"
|
|
18
|
+
expected:
|
|
19
|
+
my_asks:
|
|
20
|
+
__len__: 1
|
|
21
|
+
others_asks:
|
|
22
|
+
__len__: 0
|
|
23
|
+
decisions:
|
|
24
|
+
__len__: 2
|
|
25
|
+
ambiguous_items:
|
|
26
|
+
__len__: 0
|
|
27
|
+
ratings:
|
|
28
|
+
- rater_id: "rater-curator-A"
|
|
29
|
+
rater_neurotypes: ["asd"]
|
|
30
|
+
agreement_with_expected: 0.95
|
|
31
|
+
notes: |
|
|
32
|
+
Clean partition: Alex's 'Can you own ...' becomes Sam's my_ask; Sam's
|
|
33
|
+
'Yes — I'll have it' is a decision; Alex's 'We agreed' is also a
|
|
34
|
+
decision. No ambiguity in this transcript.
|
|
35
|
+
notes: |
|
|
36
|
+
Demonstrates the four-section partition with verbatim-anchored spans. Both
|
|
37
|
+
the 'me' ask and the two decision lines are line-anchored in the transcript;
|
|
38
|
+
the runner enforces the verbatim invariant on every load.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
id: "translation.meetings.002"
|
|
2
|
+
slice: "translation/meetings"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
transcript: |
|
|
12
|
+
Alex: Sam, can you draft the migration plan?
|
|
13
|
+
Sam: I'll circle back next week with thoughts.
|
|
14
|
+
Alex: Someone should also think about the rollback path.
|
|
15
|
+
me: "Sam"
|
|
16
|
+
speakers: ["Alex", "Sam"]
|
|
17
|
+
project: "rollout"
|
|
18
|
+
expected:
|
|
19
|
+
my_asks:
|
|
20
|
+
__len__: 1
|
|
21
|
+
decisions:
|
|
22
|
+
__len__: 0
|
|
23
|
+
ambiguous_items:
|
|
24
|
+
__len__: 2
|
|
25
|
+
ratings:
|
|
26
|
+
- rater_id: "rater-curator-A"
|
|
27
|
+
rater_neurotypes: ["audhd"]
|
|
28
|
+
agreement_with_expected: 0.85
|
|
29
|
+
notes: |
|
|
30
|
+
Sam's 'I'll circle back next week' is both a (weak) decision and a
|
|
31
|
+
hedged commitment — the baseline surfaces it under both categories.
|
|
32
|
+
Alex's 'Someone should also think' has unassigned owner + hedged commitment.
|
|
33
|
+
notes: |
|
|
34
|
+
Tests the verbatim-anchor invariant on ambiguous_items: every quoted_span
|
|
35
|
+
must slice cleanly from the transcript. This example also documents the
|
|
36
|
+
case where one line becomes both a decision and an ambiguous_item, which
|
|
37
|
+
the LLM-refinement layer can later collapse.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# translation/meetings/
|
|
2
|
+
|
|
3
|
+
Examples for the `brief_meeting` tool. Each example carries an `input`
|
|
4
|
+
matching `BriefMeetingInput` (a transcript, the rater's `me` handle, and
|
|
5
|
+
optional speakers/project) and an `expected` block over my_asks /
|
|
6
|
+
others_asks / decisions / ambiguous_items counts and salient fields.
|
|
7
|
+
|
|
8
|
+
Per ADR 0005 §5, every `ambiguous_items[*].quoted_span` is verbatim-anchored.
|
|
9
|
+
The runner exercises that invariant on every meeting example.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
id: "translation.outgoing.001"
|
|
2
|
+
slice: "translation/outgoing"
|
|
3
|
+
created_at: "2026-05-17"
|
|
4
|
+
consent:
|
|
5
|
+
contributor: "synth-curator-001"
|
|
6
|
+
consent_token: "sha256:synthesised-seed-2026-05-17"
|
|
7
|
+
anonymisation_pass: 1
|
|
8
|
+
status: "synthesised"
|
|
9
|
+
license: "AGPL-3.0-or-later"
|
|
10
|
+
input:
|
|
11
|
+
text: "Strong nack. This will block the release."
|
|
12
|
+
target_register: "warm"
|
|
13
|
+
channel: "github"
|
|
14
|
+
expected:
|
|
15
|
+
preserved_terms: []
|
|
16
|
+
unpreserved_terms: []
|
|
17
|
+
diff_summary:
|
|
18
|
+
tone_shift: "Lifted warmth by opening with a relational acknowledgement; preserved the underlying ask."
|
|
19
|
+
ratings:
|
|
20
|
+
- rater_id: "rater-curator-A"
|
|
21
|
+
rater_neurotypes: ["asd"]
|
|
22
|
+
agreement_with_expected: 0.85
|
|
23
|
+
notes: |
|
|
24
|
+
The baseline drops 'strong nack' and prepends a 'Hey — ' opener. The
|
|
25
|
+
underlying concern survives; the relational opener softens the blow.
|
|
26
|
+
notes: |
|
|
27
|
+
Warmth-rewrite baseline test. The structural_changes array should contain
|
|
28
|
+
"Removed opening rejection token" and "Added relational opener". The
|
|
29
|
+
`tone_shift` summary is deterministic and matched verbatim.
|