neurodock-evals 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. neurodock_evals-0.0.1/.gitignore +189 -0
  2. neurodock_evals-0.0.1/CHANGELOG.md +28 -0
  3. neurodock_evals-0.0.1/PKG-INFO +96 -0
  4. neurodock_evals-0.0.1/README.md +80 -0
  5. neurodock_evals-0.0.1/corpora/README.md +30 -0
  6. neurodock_evals-0.0.1/corpora/translation/README.md +14 -0
  7. neurodock_evals-0.0.1/corpora/translation/incoming/001-circle-back.example.yaml +27 -0
  8. neurodock_evals-0.0.1/corpora/translation/incoming/002-loop-you-in.example.yaml +32 -0
  9. neurodock_evals-0.0.1/corpora/translation/incoming/003-revisit-next-week.example.yaml +37 -0
  10. neurodock_evals-0.0.1/corpora/translation/incoming/README.md +8 -0
  11. neurodock_evals-0.0.1/corpora/translation/meetings/001-short-product-meeting.example.yaml +38 -0
  12. neurodock_evals-0.0.1/corpora/translation/meetings/002-hedged-decision.example.yaml +37 -0
  13. neurodock_evals-0.0.1/corpora/translation/meetings/README.md +9 -0
  14. neurodock_evals-0.0.1/corpora/translation/outgoing/001-rewrite-for-warmth.example.yaml +29 -0
  15. neurodock_evals-0.0.1/corpora/translation/outgoing/002-preserve-technical-terms.example.yaml +33 -0
  16. neurodock_evals-0.0.1/corpora/translation/outgoing/README.md +6 -0
  17. neurodock_evals-0.0.1/corpora/translation/tone/001-blunt-pr-comment.example.yaml +29 -0
  18. neurodock_evals-0.0.1/corpora/translation/tone/002-warm-deadline-reminder.example.yaml +27 -0
  19. neurodock_evals-0.0.1/corpora/translation/tone/README.md +4 -0
  20. neurodock_evals-0.0.1/pyproject.toml +48 -0
  21. neurodock_evals-0.0.1/schemas/annotation.schema.json +39 -0
  22. neurodock_evals-0.0.1/schemas/example.schema.json +79 -0
  23. neurodock_evals-0.0.1/src/neurodock_evals/__init__.py +62 -0
  24. neurodock_evals-0.0.1/src/neurodock_evals/anonymise.py +278 -0
  25. neurodock_evals-0.0.1/src/neurodock_evals/corpus.py +170 -0
  26. neurodock_evals-0.0.1/src/neurodock_evals/dedupe.py +90 -0
  27. neurodock_evals-0.0.1/src/neurodock_evals/harness.py +192 -0
  28. neurodock_evals-0.0.1/src/neurodock_evals/py.typed +0 -0
  29. neurodock_evals-0.0.1/src/neurodock_evals/runner.py +118 -0
  30. neurodock_evals-0.0.1/src/neurodock_evals/scoring.py +144 -0
  31. neurodock_evals-0.0.1/src/neurodock_evals/types.py +95 -0
  32. neurodock_evals-0.0.1/tests/conftest.py +18 -0
  33. neurodock_evals-0.0.1/tests/test_anonymise.py +85 -0
  34. neurodock_evals-0.0.1/tests/test_corpus_load.py +38 -0
  35. neurodock_evals-0.0.1/tests/test_dedupe.py +45 -0
  36. neurodock_evals-0.0.1/tests/test_harness_cli.py +105 -0
  37. neurodock_evals-0.0.1/tests/test_runner.py +56 -0
  38. neurodock_evals-0.0.1/tests/test_scoring.py +55 -0
@@ -0,0 +1,189 @@
1
+ # ─────────────────────────────────────────────────────────────
2
+ # Node / pnpm (TypeScript packages: core, cli, extension-browser, docs)
3
+ # ─────────────────────────────────────────────────────────────
4
+ node_modules/
5
+ .pnpm-store/
6
+ .pnpm-debug.log*
7
+ npm-debug.log*
8
+ yarn-debug.log*
9
+ yarn-error.log*
10
+
11
+ # Build outputs
12
+ dist/
13
+ build/
14
+ out/
15
+ *.tsbuildinfo
16
+ .next/
17
+ .nuxt/
18
+
19
+ # Turborepo cache
20
+ .turbo/
21
+
22
+ # ─────────────────────────────────────────────────────────────
23
+ # Python / uv (MCP servers, clinical, evals)
24
+ # ─────────────────────────────────────────────────────────────
25
+ __pycache__/
26
+ *.py[cod]
27
+ *$py.class
28
+ *.so
29
+
30
+ # uv / virtualenvs
31
+ .venv/
32
+ venv/
33
+ env/
34
+ .python-version
35
+
36
+ # Packaging
37
+ *.egg-info/
38
+ *.egg
39
+ dist/
40
+ build/
41
+ wheels/
42
+ *.whl
43
+
44
+ # Test / coverage
45
+ .pytest_cache/
46
+ .mypy_cache/
47
+ .ruff_cache/
48
+ .coverage
49
+ .coverage.*
50
+ coverage.xml
51
+ htmlcov/
52
+ .tox/
53
+ .hypothesis/
54
+
55
+ # Jupyter (in case any analysis notebooks land in scripts/)
56
+ .ipynb_checkpoints/
57
+
58
+ # ─────────────────────────────────────────────────────────────
59
+ # WXT browser extension (packages/extension-browser/)
60
+ # ─────────────────────────────────────────────────────────────
61
+ .output/
62
+ .wxt/
63
+ web-ext-artifacts/
64
+ *.zip
65
+ *.crx
66
+ *.xpi
67
+
68
+ # Extension store submission artefacts — keep these local
69
+ extension-submission/
70
+
71
+ # ─────────────────────────────────────────────────────────────
72
+ # Astro / Starlight docs (docs/)
73
+ # ─────────────────────────────────────────────────────────────
74
+ .astro/
75
+
76
+ # ─────────────────────────────────────────────────────────────
77
+ # Local NeuroDock runtime data — NEVER commit
78
+ # ─────────────────────────────────────────────────────────────
79
+ # Profile files contain user neurotype declarations
80
+ profile.yaml
81
+ profile.yml
82
+ .neurodock/
83
+ **/.neurodock/
84
+
85
+ # SQLite stores (cognitive graph, embeddings, traces)
86
+ *.sqlite
87
+ *.sqlite-journal
88
+ *.sqlite-wal
89
+ *.sqlite-shm
90
+ *.db
91
+ *.db-journal
92
+
93
+ # JSONL event logs
94
+ *.jsonl
95
+ # eval corpora are checked in
+ !packages/evals/**/*.jsonl
96
+ # test fixtures are checked in
+ !**/fixtures/*.jsonl
97
+
98
+ # Local embedding caches
99
+ .embeddings/
100
+ embeddings-cache/
101
+
102
+ # OpenTelemetry local trace dumps
103
+ .otel/
104
+ traces/
105
+
106
+ # ─────────────────────────────────────────────────────────────
107
+ # Secrets / environment
108
+ # ─────────────────────────────────────────────────────────────
109
+ .env
110
+ .env.*
111
+ !.env.example
112
+ !.env.template
113
+ *.pem
114
+ *.key
115
+ *.p12
116
+ secrets/
117
+
118
+ # Anthropic / LLM provider keys (defence in depth)
119
+ .anthropic/
120
+ .openai/
121
+
122
+ # Browser extension signing keys — local only
123
+ *.signing-key
124
+ extension-private-key.pem
125
+
126
+ # ─────────────────────────────────────────────────────────────
127
+ # OS / editor (you're on Windows + VS Code)
128
+ # ─────────────────────────────────────────────────────────────
129
+ # Windows
130
+ Thumbs.db
131
+ ehthumbs.db
132
+ ehthumbs_vista.db
133
+ Desktop.ini
134
+ $RECYCLE.BIN/
135
+ *.lnk
136
+
137
+ # macOS (in case any contributor PRs from a Mac)
138
+ .DS_Store
139
+ .AppleDouble
140
+ .LSOverride
141
+ ._*
142
+
143
+ # Linux
144
+ *~
145
+ .directory
146
+ .Trash-*
147
+
148
+ # VS Code — share recommendations, ignore personal state
149
+ .vscode/*
150
+ !.vscode/extensions.json
151
+ !.vscode/settings.shared.json
152
+ !.vscode/launch.shared.json
153
+
154
+ # JetBrains
155
+ .idea/
156
+ *.iml
157
+ *.iws
158
+
159
+ # Vim / Emacs
160
+ *.swp
161
+ *.swo
162
+ *~
163
+ .\#*
164
+ \#*\#
165
+
166
+ # ─────────────────────────────────────────────────────────────
167
+ # Claude Code / agent runtime — keep agents tracked, ignore state
168
+ # ─────────────────────────────────────────────────────────────
169
+ .claude/cache/
170
+ .claude/history/
171
+ .claude/sessions/
172
+ .claude/local/
173
+
174
+ # Do keep: .claude/agents/, .claude/skills/, .claude/settings.json
175
+
176
+ # ─────────────────────────────────────────────────────────────
177
+ # Misc
178
+ # ─────────────────────────────────────────────────────────────
179
+ *.log
180
+ logs/
181
+ tmp/
182
+ .tmp/
183
+ .cache/
184
+ .parcel-cache/
185
+
186
+ # Lockfiles policy: KEEP these committed
187
+ # !pnpm-lock.yaml
188
+ # !uv.lock
189
+ # (intentionally not listed above so they're tracked)
@@ -0,0 +1,28 @@
1
+ # Changelog
2
+
3
+ All notable changes to `neurodock-evals`. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com).
5
+
6
+ ## [0.0.1] — 2026-05-17
7
+
8
+ ### Added
9
+
10
+ - Initial harness scaffold: `harness.py`, `corpus.py`, `runner.py`, `scoring.py`,
11
+ `anonymise.py`, `dedupe.py`, `types.py`.
12
+ - JSON Schemas: `schemas/example.schema.json`, `schemas/annotation.schema.json`.
13
+ - Nine hand-authored, **synthesised** seed examples across four translation
14
+ slices: `translation/incoming/` (3, including a multi-rater agreement
15
+ demonstration), `translation/tone/` (2), `translation/outgoing/` (2),
16
+ `translation/meetings/` (2).
17
+ - `corpora/guardrail/` placeholder reserved for future guardrail corpora.
18
+ - `CONTRIBUTING.md` describing the consent + anonymisation pipeline.
19
+ - Pytest suite covering corpus load, anonymisation, dedupe, runner, scoring,
20
+ and harness CLI.
21
+
22
+ ### Not yet
23
+
24
+ - Real contributed corpora — gated on the consent + anonymisation pipeline.
25
+ - HuggingFace publication — deferred to v0.1.0.
26
+ - Tuned CI thresholds — the harness ships with permissive defaults; tuning
27
+ follows the first real contributions.
28
+ - Inter-rater agreement adjudication workflow beyond a single demo example.
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: neurodock-evals
3
+ Version: 0.0.1
4
+ Summary: NeuroDock eval corpora and harness — versioned datasets for translation, skills, and guardrails.
5
+ Author: NeuroDock contributors
6
+ License: AGPL-3.0-or-later
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: jsonschema>=4.21
9
+ Requires-Dist: neurodock-mcp-translation
10
+ Requires-Dist: pydantic>=2.7
11
+ Requires-Dist: pyyaml>=6.0
12
+ Provides-Extra: test
13
+ Requires-Dist: pytest-asyncio>=1.3.0; extra == 'test'
14
+ Requires-Dist: pytest>=8.3.0; extra == 'test'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # neurodock-evals
18
+
19
+ The versioned eval corpora and the air-gapped harness that runs ND prompts
20
+ against them.
21
+
22
+ The corpus is the strategic asset that makes the translation layer honest. We
23
+ prove that ND-aware prompts help neurodivergent users in real situations, and
24
+ we catch regressions when prompts change. The harness gates prompt PRs in CI.
25
+
26
+ This package is **v0.0.1** — the scaffold, the harness, and 6-10 hand-authored
27
+ seed examples. The seeds are **synthesised to demonstrate the
28
+ format** — they are NOT real corporate messages. Real contributed
29
+ corpora arrive over Phase 2 (target ~300 examples by
30
+ month 6).
31
+
32
+ ## What's here
33
+
34
+ ```
35
+ packages/evals/
36
+ ├── src/neurodock_evals/ # Harness, anonymiser, deduper, scorer
37
+ ├── corpora/ # Versioned YAML eval examples by slice
38
+ ├── schemas/ # JSON Schemas for examples + annotations
39
+ └── tests/ # Tests for the harness itself
40
+ ```
41
+
42
+ ## Quick start
43
+
44
+ Run the harness against the seed corpora:
45
+
46
+ ```bash
47
+ uv run python -m neurodock_evals.harness --corpus translation/incoming \
48
+ --tool translate_incoming
49
+ ```
50
+
51
+ Run all four translation slices:
52
+
53
+ ```bash
54
+ uv run python -m neurodock_evals.harness --ci
55
+ ```
56
+
57
+ Anonymise a contribution before opening a PR:
58
+
59
+ ```bash
60
+ uv run python -m neurodock_evals.anonymise path/to/example.yaml
61
+ ```
62
+
63
+ ## Air-gapped by design
64
+
65
+ The harness never calls an LLM. It exercises each tool's **deterministic
66
+ baseline** (the heuristic layer the translation server returns even before any
67
+ LLM refinement) and scores the baseline against the human-rated `expected`
68
+ block. Any LLM-side eval is a separate concern that the maintainer reviews
69
+ under a different policy.
70
+
71
+ ## Privacy
72
+
73
+ - The harness never logs example contents to stdout or to anywhere outside
74
+ `.eval-reports/`.
75
+ - Reports contain example IDs and scores only — never verbatim text.
76
+ - The contribution pipeline (`anonymise.py`) is a safety net, NOT a substitute
77
+ for contributor judgement. See `CONTRIBUTING.md`.
78
+ - All corpora are licensed **AGPL-3.0-or-later**.
79
+
80
+ ## Glossary
81
+
82
+ | Term | Meaning |
83
+ | ---- | ------- |
84
+ | corpus slice | a directory under `corpora/<server>/<slice>/`; the unit of versioning |
85
+ | example | one YAML file under a slice — one input, one `expected` block, multiple ratings |
86
+ | rating | one ND-rater's judgement of how closely the `expected` block matches their read |
87
+ | deterministic baseline | the heuristic output a translation tool returns without invoking an LLM |
88
+ | eval-corpus binding | every `mcp-translation` tool cites the slice that validates it (ADR 0005 §4) |
89
+
90
+ ## Status
91
+
92
+ - v0.0.1 (current): scaffold + harness + 9 synthesised seed examples
93
+ - v0.0.2 (planned): first contributed corpus (once the contribution pipeline is live)
94
+ - v0.1.0 (planned): HuggingFace publication pipeline under the `neurodock` org
95
+
96
+ See `CHANGELOG.md` for detail.
@@ -0,0 +1,80 @@
1
+ # neurodock-evals
2
+
3
+ The versioned eval corpora and the air-gapped harness that runs ND prompts
4
+ against them.
5
+
6
+ The corpus is the strategic asset that makes the translation layer honest. We
7
+ prove that ND-aware prompts help neurodivergent users in real situations, and
8
+ we catch regressions when prompts change. The harness gates prompt PRs in CI.
9
+
10
+ This package is **v0.0.1** — the scaffold, the harness, and 6-10 hand-authored
11
+ seed examples. The seeds are **synthesised to demonstrate the
12
+ format** — they are NOT real corporate messages. Real contributed
13
+ corpora arrive over Phase 2 (target ~300 examples by
14
+ month 6).
15
+
16
+ ## What's here
17
+
18
+ ```
19
+ packages/evals/
20
+ ├── src/neurodock_evals/ # Harness, anonymiser, deduper, scorer
21
+ ├── corpora/ # Versioned YAML eval examples by slice
22
+ ├── schemas/ # JSON Schemas for examples + annotations
23
+ └── tests/ # Tests for the harness itself
24
+ ```
25
+
26
+ ## Quick start
27
+
28
+ Run the harness against the seed corpora:
29
+
30
+ ```bash
31
+ uv run python -m neurodock_evals.harness --corpus translation/incoming \
32
+ --tool translate_incoming
33
+ ```
34
+
35
+ Run all four translation slices:
36
+
37
+ ```bash
38
+ uv run python -m neurodock_evals.harness --ci
39
+ ```
40
+
41
+ Anonymise a contribution before opening a PR:
42
+
43
+ ```bash
44
+ uv run python -m neurodock_evals.anonymise path/to/example.yaml
45
+ ```
46
+
47
+ ## Air-gapped by design
48
+
49
+ The harness never calls an LLM. It exercises each tool's **deterministic
50
+ baseline** (the heuristic layer the translation server returns even before any
51
+ LLM refinement) and scores the baseline against the human-rated `expected`
52
+ block. Any LLM-side eval is a separate concern that the maintainer reviews
53
+ under a different policy.
54
+
55
+ ## Privacy
56
+
57
+ - The harness never logs example contents to stdout or to anywhere outside
58
+ `.eval-reports/`.
59
+ - Reports contain example IDs and scores only — never verbatim text.
60
+ - The contribution pipeline (`anonymise.py`) is a safety net, NOT a substitute
61
+ for contributor judgement. See `CONTRIBUTING.md`.
62
+ - All corpora are licensed **AGPL-3.0-or-later**.
63
+
64
+ ## Glossary
65
+
66
+ | Term | Meaning |
67
+ | ---- | ------- |
68
+ | corpus slice | a directory under `corpora/<server>/<slice>/`; the unit of versioning |
69
+ | example | one YAML file under a slice — one input, one `expected` block, multiple ratings |
70
+ | rating | one ND-rater's judgement of how closely the `expected` block matches their read |
71
+ | deterministic baseline | the heuristic output a translation tool returns without invoking an LLM |
72
+ | eval-corpus binding | every `mcp-translation` tool cites the slice that validates it (ADR 0005 §4) |
73
+
74
+ ## Status
75
+
76
+ - v0.0.1 (current): scaffold + harness + 9 synthesised seed examples
77
+ - v0.0.2 (planned): first contributed corpus (once the contribution pipeline is live)
78
+ - v0.1.0 (planned): HuggingFace publication pipeline under the `neurodock` org
79
+
80
+ See `CHANGELOG.md` for detail.
@@ -0,0 +1,30 @@
1
+ # Eval corpora
2
+
3
+ Versioned datasets that anchor the prompt regression suite. Every directory
4
+ under here is **one slice**: one server, one tool-family, one set of
5
+ hand-rated examples.
6
+
7
+ ## Layout
8
+
9
+ ```
10
+ corpora/
11
+ ├── translation/ # Slices for the mcp-translation server
12
+ │ ├── incoming/ # translate_incoming evaluator
13
+ │ ├── tone/ # check_tone evaluator
14
+ │ ├── outgoing/ # rewrite_outgoing evaluator
15
+ │ └── meetings/ # brief_meeting evaluator
16
+ └── guardrail/ # Reserved for future guardrail slices
17
+ ```
18
+
19
+ ## File format
20
+
21
+ One YAML file per example, named `NNN-slug.example.yaml`. Validated against
22
+ `packages/evals/schemas/example.schema.json`. See `CONTRIBUTING.md` for the
23
+ contribution flow.
24
+
25
+ ## Provenance
26
+
27
+ Every example in v0.0.1 is **synthesised** to
28
+ demonstrate the format and exercise the harness end-to-end. These are NOT
29
+ real corporate messages. Phase 2 brings the first contributed examples
30
+ through the consent pipeline at `evals.neurodock.org/contribute`.
@@ -0,0 +1,14 @@
1
+ # translation/ slices
2
+
3
+ Eval corpora for the four `mcp-translation` tools, per ADR 0005 §4
4
+ (eval-corpus binding).
5
+
6
+ | Slice | Tool | What it tests |
7
+ | ----- | ---- | ------------- |
8
+ | `translation/incoming` | `translate_incoming` | Subtext, ambiguity, recommended next action on incoming messages |
9
+ | `translation/tone` | `check_tone` | Directness/warmth/urgency scoring and flagged phrases |
10
+ | `translation/outgoing` | `rewrite_outgoing` | Register-targeted rewrites that preserve specified terms |
11
+ | `translation/meetings` | `brief_meeting` | Verbatim-anchored partition of a transcript into asks/decisions/ambiguities |
12
+
13
+ Each slice is loaded by `neurodock_evals.corpus.load_slice("translation/<name>")`
14
+ and run via `neurodock_evals.runner.run_example`.
@@ -0,0 +1,27 @@
1
+ id: "translation.incoming.001"
2
+ slice: "translation/incoming"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ text: "I'll circle back on the migration plan next week — quick one, just need a sanity check."
12
+ channel: "slack"
13
+ expected:
14
+ explicit_ask: null
15
+ ambiguity:
16
+ detected: true
17
+ recommended_next_action:
18
+ action: "set_reminder"
19
+ ratings:
20
+ - rater_id: "rater-curator-A"
21
+ rater_neurotypes: ["adhd"]
22
+ agreement_with_expected: 0.9
23
+ notes: "Classic soft-deferral. 'I'll circle back' + 'next week' both signal the response will not self-fulfil."
24
+ notes: |
25
+ Demonstrates the hedged-commitment / vague-timeline combination. The baseline
26
+ should flag both phrases and recommend `set_reminder` because there is no
27
+ explicit ask to act on.
@@ -0,0 +1,32 @@
1
+ id: "translation.incoming.002"
2
+ slice: "translation/incoming"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ text: "Have you had a chance to look at the rollout doc? No rush but it's been a while."
12
+ channel: "email"
13
+ thread_context:
14
+ - "Sharing the rollout doc — would love your thoughts before Friday."
15
+ - "(no reply)"
16
+ expected:
17
+ ambiguity:
18
+ detected: true
19
+ recommended_next_action:
20
+ action: "reply"
21
+ ratings:
22
+ - rater_id: "rater-curator-A"
23
+ rater_neurotypes: ["audhd"]
24
+ agreement_with_expected: 0.85
25
+ notes: |
26
+ Polite-overdue framing. "Have you had a chance" + "no rush" reads as
27
+ implied urgency once you account for the thread context. The baseline
28
+ should flag this and recommend acknowledging receipt with a concrete
29
+ next step.
30
+ notes: |
31
+ Implied-urgency pattern: polite words masking a follow-up. The baseline
32
+ flags both `have you had a chance` and `no rush` as implied_urgency.
@@ -0,0 +1,37 @@
1
+ id: "translation.incoming.003"
2
+ slice: "translation/incoming"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ text: "Hey — can we revisit the rollout timeline? I'm not sure everyone is aligned."
12
+ channel: "slack"
13
+ thread_context:
14
+ - "We agreed last sprint to ship the rollout by end of May."
15
+ - "Two engineers flagged risk on the migration script."
16
+ expected:
17
+ ambiguity:
18
+ detected: true
19
+ recommended_next_action:
20
+ action: "clarify"
21
+ ratings:
22
+ - rater_id: "rater-curator-A"
23
+ rater_neurotypes: ["asd"]
24
+ agreement_with_expected: 0.95
25
+ notes: |
26
+ Soft-request + implied-blame combo. The 'can we revisit' phrase reads as
27
+ 'I want to change' rather than a neutral re-open; 'not sure everyone is
28
+ aligned' softly attributes misalignment without naming who.
29
+ - rater_id: "rater-curator-B"
30
+ rater_neurotypes: ["adhd", "asd"]
31
+ agreement_with_expected: 0.8
32
+ notes: |
33
+ Agree on subtext, but I'd want the recommendation to push harder on
34
+ naming the specific stakeholders before agreeing to reopen scope.
35
+ notes: |
36
+ The canonical 'can we revisit' example from the v0.1.0 schema. Carries two
37
+ raters so the harness can demonstrate inter-rater agreement reporting.
@@ -0,0 +1,8 @@
1
+ # translation/incoming/
2
+
3
+ Examples for the `translate_incoming` tool. Each example carries an `input`
4
+ matching `TranslateIncomingInput` and an `expected` block naming the fields
5
+ the rater considers diagnostic.
6
+
7
+ The runner scores against the deterministic baseline (no LLM). v0.0.1 ships
8
+ three synthesised seed examples; real contributed examples arrive in Phase 2.
@@ -0,0 +1,38 @@
1
+ id: "translation.meetings.001"
2
+ slice: "translation/meetings"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ transcript: |
12
+ Alex: Sam, can you own the migration script for the rollout?
13
+ Sam: Yes — I'll have it by Wednesday.
14
+ Alex: Great. We agreed to ship by end of May.
15
+ me: "Sam"
16
+ speakers: ["Alex", "Sam"]
17
+ project: "rollout"
18
+ expected:
19
+ my_asks:
20
+ __len__: 1
21
+ others_asks:
22
+ __len__: 0
23
+ decisions:
24
+ __len__: 2
25
+ ambiguous_items:
26
+ __len__: 0
27
+ ratings:
28
+ - rater_id: "rater-curator-A"
29
+ rater_neurotypes: ["asd"]
30
+ agreement_with_expected: 0.95
31
+ notes: |
32
+ Clean partition: Alex's 'Can you own ...' becomes Sam's my_ask; Sam's
33
+ 'Yes — I'll have it' is a decision; Alex's 'We agreed' is also a
34
+ decision. No ambiguity in this transcript.
35
+ notes: |
36
+ Demonstrates the four-section partition with verbatim-anchored spans. Both
37
+ the 'me' ask and the two decision lines are line-anchored in the transcript;
38
+ the runner enforces the verbatim invariant on every load.
@@ -0,0 +1,37 @@
1
+ id: "translation.meetings.002"
2
+ slice: "translation/meetings"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ transcript: |
12
+ Alex: Sam, can you draft the migration plan?
13
+ Sam: I'll circle back next week with thoughts.
14
+ Alex: Someone should also think about the rollback path.
15
+ me: "Sam"
16
+ speakers: ["Alex", "Sam"]
17
+ project: "rollout"
18
+ expected:
19
+ my_asks:
20
+ __len__: 1
21
+ decisions:
22
+ __len__: 0
23
+ ambiguous_items:
24
+ __len__: 2
25
+ ratings:
26
+ - rater_id: "rater-curator-A"
27
+ rater_neurotypes: ["audhd"]
28
+ agreement_with_expected: 0.85
29
+ notes: |
30
+ Sam's 'I'll circle back next week' is both a (weak) decision and a
31
+ hedged commitment — the baseline surfaces it under both categories.
32
+ Alex's 'Someone should also think' has unassigned owner + hedged commitment.
33
+ notes: |
34
+ Tests the verbatim-anchor invariant on ambiguous_items: every quoted_span
35
+ must slice cleanly from the transcript. This example also documents the
36
+ case where one line becomes both a decision and an ambiguous_item, which
37
+ the LLM-refinement layer can later collapse.
@@ -0,0 +1,9 @@
1
+ # translation/meetings/
2
+
3
+ Examples for the `brief_meeting` tool. Each example carries an `input`
4
+ matching `BriefMeetingInput` (a transcript, the rater's `me` handle, and
5
+ optional speakers/project) and an `expected` block over my_asks /
6
+ others_asks / decisions / ambiguous_items counts and salient fields.
7
+
8
+ Per ADR 0005 §5, every `ambiguous_items[*].quoted_span` is verbatim-anchored.
9
+ The runner exercises that invariant on every meeting example.
@@ -0,0 +1,29 @@
1
+ id: "translation.outgoing.001"
2
+ slice: "translation/outgoing"
3
+ created_at: "2026-05-17"
4
+ consent:
5
+ contributor: "synth-curator-001"
6
+ consent_token: "sha256:synthesised-seed-2026-05-17"
7
+ anonymisation_pass: 1
8
+ status: "synthesised"
9
+ license: "AGPL-3.0-or-later"
10
+ input:
11
+ text: "Strong nack. This will block the release."
12
+ target_register: "warm"
13
+ channel: "github"
14
+ expected:
15
+ preserved_terms: []
16
+ unpreserved_terms: []
17
+ diff_summary:
18
+ tone_shift: "Lifted warmth by opening with a relational acknowledgement; preserved the underlying ask."
19
+ ratings:
20
+ - rater_id: "rater-curator-A"
21
+ rater_neurotypes: ["asd"]
22
+ agreement_with_expected: 0.85
23
+ notes: |
24
+ The baseline drops 'strong nack' and prepends a 'Hey — ' opener. The
25
+ underlying concern survives; the relational opener softens the blow.
26
+ notes: |
27
+ Warmth-rewrite baseline test. The structural_changes array should contain
28
+ "Removed opening rejection token" and "Added relational opener". The
29
+ `tone_shift` summary is deterministic and matched verbatim.