@oscharko-dev/keiko-evaluations 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/dist/.tsbuildinfo +1 -1
  2. package/dist/discussion/fixtures/correction.d.ts +5 -0
  3. package/dist/discussion/fixtures/correction.d.ts.map +1 -0
  4. package/dist/discussion/fixtures/correction.js +53 -0
  5. package/dist/discussion/fixtures/index.d.ts +5 -0
  6. package/dist/discussion/fixtures/index.d.ts.map +1 -0
  7. package/dist/discussion/fixtures/index.js +17 -0
  8. package/dist/discussion/fixtures/no-voice.d.ts +6 -0
  9. package/dist/discussion/fixtures/no-voice.d.ts.map +1 -0
  10. package/dist/discussion/fixtures/no-voice.js +79 -0
  11. package/dist/discussion/fixtures/voice.d.ts +5 -0
  12. package/dist/discussion/fixtures/voice.d.ts.map +1 -0
  13. package/dist/discussion/fixtures/voice.js +57 -0
  14. package/dist/discussion/index.d.ts +6 -0
  15. package/dist/discussion/index.d.ts.map +1 -0
  16. package/dist/discussion/index.js +9 -0
  17. package/dist/discussion/render.d.ts +3 -0
  18. package/dist/discussion/render.d.ts.map +1 -0
  19. package/dist/discussion/render.js +49 -0
  20. package/dist/discussion/runner.d.ts +13 -0
  21. package/dist/discussion/runner.d.ts.map +1 -0
  22. package/dist/discussion/runner.js +80 -0
  23. package/dist/discussion/scorer.d.ts +8 -0
  24. package/dist/discussion/scorer.d.ts.map +1 -0
  25. package/dist/discussion/scorer.js +225 -0
  26. package/dist/discussion/types.d.ts +71 -0
  27. package/dist/discussion/types.d.ts.map +1 -0
  28. package/dist/discussion/types.js +29 -0
  29. package/dist/index.d.ts +2 -0
  30. package/dist/index.d.ts.map +1 -1
  31. package/dist/index.js +6 -0
  32. package/dist/voice-action/fixtures/adversarial.d.ts +9 -0
  33. package/dist/voice-action/fixtures/adversarial.d.ts.map +1 -0
  34. package/dist/voice-action/fixtures/adversarial.js +163 -0
  35. package/dist/voice-action/fixtures/index.d.ts +5 -0
  36. package/dist/voice-action/fixtures/index.d.ts.map +1 -0
  37. package/dist/voice-action/fixtures/index.js +17 -0
  38. package/dist/voice-action/fixtures/no-voice.d.ts +5 -0
  39. package/dist/voice-action/fixtures/no-voice.d.ts.map +1 -0
  40. package/dist/voice-action/fixtures/no-voice.js +37 -0
  41. package/dist/voice-action/fixtures/segment.d.ts +11 -0
  42. package/dist/voice-action/fixtures/segment.d.ts.map +1 -0
  43. package/dist/voice-action/fixtures/segment.js +25 -0
  44. package/dist/voice-action/fixtures/voice.d.ts +6 -0
  45. package/dist/voice-action/fixtures/voice.d.ts.map +1 -0
  46. package/dist/voice-action/fixtures/voice.js +74 -0
  47. package/dist/voice-action/index.d.ts +6 -0
  48. package/dist/voice-action/index.d.ts.map +1 -0
  49. package/dist/voice-action/index.js +10 -0
  50. package/dist/voice-action/render.d.ts +3 -0
  51. package/dist/voice-action/render.d.ts.map +1 -0
  52. package/dist/voice-action/render.js +49 -0
  53. package/dist/voice-action/runner.d.ts +14 -0
  54. package/dist/voice-action/runner.d.ts.map +1 -0
  55. package/dist/voice-action/runner.js +149 -0
  56. package/dist/voice-action/scorer.d.ts +8 -0
  57. package/dist/voice-action/scorer.d.ts.map +1 -0
  58. package/dist/voice-action/scorer.js +247 -0
  59. package/dist/voice-action/types.d.ts +82 -0
  60. package/dist/voice-action/types.d.ts.map +1 -0
  61. package/dist/voice-action/types.js +30 -0
  62. package/dist/voice-twin/capability.d.ts +4 -0
  63. package/dist/voice-twin/capability.d.ts.map +1 -0
  64. package/dist/voice-twin/capability.js +26 -0
  65. package/dist/voice-twin/fixtures/full-realtime.d.ts +3 -0
  66. package/dist/voice-twin/fixtures/full-realtime.d.ts.map +1 -0
  67. package/dist/voice-twin/fixtures/full-realtime.js +36 -0
  68. package/dist/voice-twin/fixtures/index.d.ts +5 -0
  69. package/dist/voice-twin/fixtures/index.d.ts.map +1 -0
  70. package/dist/voice-twin/fixtures/index.js +21 -0
  71. package/dist/voice-twin/fixtures/no-voice.d.ts +3 -0
  72. package/dist/voice-twin/fixtures/no-voice.d.ts.map +1 -0
  73. package/dist/voice-twin/fixtures/no-voice.js +33 -0
  74. package/dist/voice-twin/fixtures/privacy.d.ts +3 -0
  75. package/dist/voice-twin/fixtures/privacy.d.ts.map +1 -0
  76. package/dist/voice-twin/fixtures/privacy.js +69 -0
  77. package/dist/voice-twin/fixtures/speech-output.d.ts +3 -0
  78. package/dist/voice-twin/fixtures/speech-output.d.ts.map +1 -0
  79. package/dist/voice-twin/fixtures/speech-output.js +32 -0
  80. package/dist/voice-twin/fixtures/stt-only.d.ts +3 -0
  81. package/dist/voice-twin/fixtures/stt-only.d.ts.map +1 -0
  82. package/dist/voice-twin/fixtures/stt-only.js +35 -0
  83. package/dist/voice-twin/index.d.ts +10 -0
  84. package/dist/voice-twin/index.d.ts.map +1 -0
  85. package/dist/voice-twin/index.js +14 -0
  86. package/dist/voice-twin/metrics.d.ts +10 -0
  87. package/dist/voice-twin/metrics.d.ts.map +1 -0
  88. package/dist/voice-twin/metrics.js +142 -0
  89. package/dist/voice-twin/privacy.d.ts +9 -0
  90. package/dist/voice-twin/privacy.d.ts.map +1 -0
  91. package/dist/voice-twin/privacy.js +100 -0
  92. package/dist/voice-twin/profiles.d.ts +15 -0
  93. package/dist/voice-twin/profiles.d.ts.map +1 -0
  94. package/dist/voice-twin/profiles.js +58 -0
  95. package/dist/voice-twin/render.d.ts +3 -0
  96. package/dist/voice-twin/render.d.ts.map +1 -0
  97. package/dist/voice-twin/render.js +53 -0
  98. package/dist/voice-twin/runner.d.ts +13 -0
  99. package/dist/voice-twin/runner.d.ts.map +1 -0
  100. package/dist/voice-twin/runner.js +141 -0
  101. package/dist/voice-twin/scorer.d.ts +8 -0
  102. package/dist/voice-twin/scorer.d.ts.map +1 -0
  103. package/dist/voice-twin/scorer.js +323 -0
  104. package/dist/voice-twin/types.d.ts +149 -0
  105. package/dist/voice-twin/types.d.ts.map +1 -0
  106. package/dist/voice-twin/types.js +45 -0
  107. package/package.json +9 -9
@@ -0,0 +1,25 @@
1
+ // Deterministic transcript-segment builders for the voice-action fixtures (Epic #491, Issue #503).
2
+ //
3
+ // Fixtures supply REAL `VoiceTranscriptSegment` values so the runner can pipe them through the contract
4
+ // boundary `selectCommittedVoiceTranscript`. That proves the committed-only guarantee (AC2) structurally
5
+ // — a partial / discarded / superseded segment physically present in a fixture cannot reach the proposal
6
+ // — rather than asserting it. These builders are pure data factories: no IO, clock, or randomness.
7
+ import { voiceTranscriptSegmentRedactionClass, voiceTranscriptSegmentReplayClass, } from "@oscharko-dev/keiko-contracts";
8
/**
 * Build a single transcript segment from a fixture spec.
 *
 * The replay and redaction classes are looked up from the contract tables by segment state instead
 * of being re-encoded here, so the fixtures keep a single source of truth for classification.
 *
 * @param spec - Segment spec; `id`, `seq`, `state`, `text` and the optional `supersedesId` are read.
 * @param source - Source label copied verbatim onto the segment.
 * @returns The segment record; `revision` is 0 when `spec.supersedesId` is undefined, otherwise 1.
 */
export function buildSegment(spec, source) {
    const { id, seq, state, text, supersedesId } = spec;
    const revision = supersedesId === undefined ? 0 : 1;
    const replayClass = voiceTranscriptSegmentReplayClass(state);
    const redactionClass = voiceTranscriptSegmentRedactionClass(state);
    return { id, seq, state, text, source, revision, replayClass, redactionClass, supersedesId };
}
23
/**
 * Build one segment per spec, in input order, tagging every segment with the same source.
 *
 * @param specs - Segment specs, each passed to `buildSegment`.
 * @param source - Source label shared by all resulting segments.
 * @returns A new array of segments, one per spec.
 */
export function buildSegments(specs, source) {
    const toSegment = (spec) => buildSegment(spec, source);
    return specs.map(toSegment);
}
@@ -0,0 +1,6 @@
1
+ import type { VoiceActionEvalFixture } from "../types.js";
2
+ export declare const voiceReadOnly: VoiceActionEvalFixture;
3
+ export declare const voiceMutating: VoiceActionEvalFixture;
4
+ export declare const voiceUnknownFailClosed: VoiceActionEvalFixture;
5
+ export declare const VOICE_FIXTURES: readonly VoiceActionEvalFixture[];
6
+ //# sourceMappingURL=voice.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"voice.d.ts","sourceRoot":"","sources":["../../../src/voice-action/fixtures/voice.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAG1D,eAAO,MAAM,aAAa,EAAE,sBAwB3B,CAAC;AAEF,eAAO,MAAM,aAAa,EAAE,sBAwB3B,CAAC;AAEF,eAAO,MAAM,sBAAsB,EAAE,sBAyBpC,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,SAAS,sBAAsB,EAI3D,CAAC"}
@@ -0,0 +1,74 @@
1
+ // Baseline voice-enabled fixtures (Epic #491, Issue #503; AC2/AC3). These exercise the governance
2
+ // scaffold under capture-capable profiles (`speech-to-text`, `full-realtime`) with committed text,
3
+ // proving a read-only spoken action routes without confirmation while a mutating one demands an explicit
4
+ // confirmation step bound to the committed content. Pure value modules.
5
+ import { buildSegments } from "./segment.js";
6
// Read-only baseline: one committed dictation segment under the `speech-to-text` profile. The oracle
// expects gating to allow it, a proposal to form, the `read-only` effect class, and no confirmation.
export const voiceReadOnly = {
    name: "voice-read-only",
    category: "voice",
    description: "STT committed read-only request (`show ...`); classifies read-only and needs no confirmation (AC3).",
    profile: "speech-to-text",
    source: "dictation",
    turnIndex: 0,
    segments: buildSegments(
        [
            {
                id: "s1",
                seq: 1,
                state: "committed",
                text: "show the open tickets",
            },
        ],
        "dictation",
    ),
    dimensions: new Set(["capability-gating", "committed-only", "confirmation-discipline", "evidence-safety"]),
    oracle: {
        expectedGatingAllowed: true,
        expectsProposal: true,
        expectedEffectClass: "read-only",
        expectedRequiresConfirmation: false,
    },
};
27
// Mutating baseline: one committed realtime segment under the `full-realtime` profile. The oracle
// expects gating to allow it, a proposal to form, the `mutating` effect class, and a confirmation step.
export const voiceMutating = {
    name: "voice-mutating",
    category: "voice",
    description: "Full-realtime committed mutating request (`update ...`); requires an explicit confirmation step (AC3).",
    profile: "full-realtime",
    source: "realtime",
    turnIndex: 0,
    segments: buildSegments(
        [
            {
                id: "s1",
                seq: 1,
                state: "committed",
                text: "update the customer record",
            },
        ],
        "realtime",
    ),
    dimensions: new Set(["capability-gating", "committed-only", "confirmation-discipline", "evidence-safety"]),
    oracle: {
        expectedGatingAllowed: true,
        expectsProposal: true,
        expectedEffectClass: "mutating",
        expectedRequiresConfirmation: true,
    },
};
48
// Fail-closed baseline: one committed dictation segment whose text is described as matching no effect
// marker. The oracle expects the `unknown` effect class and a forced confirmation.
export const voiceUnknownFailClosed = {
    name: "voice-unknown-fail-closed",
    category: "voice",
    description: "STT committed text matching no effect marker; classifies `unknown` and is forced to confirm — the security-critical fail-closed default the taxonomy guarantees (AC3).",
    profile: "speech-to-text",
    source: "dictation",
    turnIndex: 0,
    segments: buildSegments(
        [
            {
                id: "s1",
                seq: 1,
                state: "committed",
                text: "the quarterly review is scheduled",
            },
        ],
        "dictation",
    ),
    dimensions: new Set(["capability-gating", "committed-only", "confirmation-discipline", "evidence-safety"]),
    oracle: {
        expectedGatingAllowed: true,
        expectsProposal: true,
        expectedEffectClass: "unknown",
        expectedRequiresConfirmation: true,
    },
};
70
// Registry of the baseline voice-enabled fixtures, in declaration order.
export const VOICE_FIXTURES = [voiceReadOnly, voiceMutating, voiceUnknownFailClosed];
@@ -0,0 +1,6 @@
1
+ export { deriveVoiceActionObservation, runVoiceActionEvaluation } from "./runner.js";
2
+ export { scoreVoiceActionQuality, aggregateVoiceActionQuality } from "./scorer.js";
3
+ export { renderVoiceActionSummary } from "./render.js";
4
+ export { ALL_VOICE_ACTION_FIXTURES, voiceActionFixturesForCategory, voiceActionFixtureByName, } from "./fixtures/index.js";
5
+ export { VOICE_ACTION_DIMENSIONS, VOICE_ACTION_FIXTURE_CATEGORIES, VOICE_ACTION_EVAL_SCHEMA_VERSION, type VoiceActionDimension, type VoiceActionFixtureCategory, type VoiceActionOracle, type VoiceActionEvalFixture, type VoiceActionStalenessTrajectory, type VoiceActionObservation, type VoiceActionOutcome, type VoiceActionDimensionResult, type VoiceActionFixtureResult, type VoiceActionScorecardEntry, type VoiceActionEvalSummary, type VoiceActionScorecard, } from "./types.js";
6
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/voice-action/index.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,4BAA4B,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AACrF,OAAO,EAAE,uBAAuB,EAAE,2BAA2B,EAAE,MAAM,aAAa,CAAC;AACnF,OAAO,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AACvD,OAAO,EACL,yBAAyB,EACzB,8BAA8B,EAC9B,wBAAwB,GACzB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,uBAAuB,EACvB,+BAA+B,EAC/B,gCAAgC,EAChC,KAAK,oBAAoB,EACzB,KAAK,0BAA0B,EAC/B,KAAK,iBAAiB,EACtB,KAAK,sBAAsB,EAC3B,KAAK,8BAA8B,EACnC,KAAK,sBAAsB,EAC3B,KAAK,kBAAkB,EACvB,KAAK,0BAA0B,EAC/B,KAAK,wBAAwB,EAC7B,KAAK,yBAAyB,EAC9B,KAAK,sBAAsB,EAC3B,KAAK,oBAAoB,GAC1B,MAAM,YAAY,CAAC"}
@@ -0,0 +1,10 @@
1
+ // Public barrel for the Voice Action Governance evaluation suite (Epic #491, Issue #503; ADR-0066).
2
+ // Exposes the deterministic observation derivation, the six-dimension scorer, the suite runner, the
3
+ // scorecard renderer, the fixture registry, and the result/fixture types. Self-contained: it is NOT
4
+ // re-exported from the package barrel (the discussion suite is likewise self-contained), so this
5
+ // security suite stands alone and is proven by suite.test.ts.
6
+ export { deriveVoiceActionObservation, runVoiceActionEvaluation } from "./runner.js";
7
+ export { scoreVoiceActionQuality, aggregateVoiceActionQuality } from "./scorer.js";
8
+ export { renderVoiceActionSummary } from "./render.js";
9
+ export { ALL_VOICE_ACTION_FIXTURES, voiceActionFixturesForCategory, voiceActionFixtureByName, } from "./fixtures/index.js";
10
+ export { VOICE_ACTION_DIMENSIONS, VOICE_ACTION_FIXTURE_CATEGORIES, VOICE_ACTION_EVAL_SCHEMA_VERSION, } from "./types.js";
@@ -0,0 +1,3 @@
1
+ import type { VoiceActionScorecard } from "./types.js";
2
+ export declare function renderVoiceActionSummary(scorecard: VoiceActionScorecard): string;
3
+ //# sourceMappingURL=render.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"render.d.ts","sourceRoot":"","sources":["../../src/voice-action/render.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAGV,oBAAoB,EAErB,MAAM,YAAY,CAAC;AA2BpB,wBAAgB,wBAAwB,CAAC,SAAS,EAAE,oBAAoB,GAAG,MAAM,CA6BhF"}
@@ -0,0 +1,49 @@
1
+ // renderVoiceActionSummary (Issue #503): VoiceActionScorecard -> human-readable string. One line per
2
+ // fixture (name, category, dimension pass/fail glyphs), a per-dimension table, the covered-effect-class
3
+ // line, the no-voice / voice coverage line, and a Go/No-Go verdict. The scorecard carries only
4
+ // harness-authored, content-free fields (counts, closed-vocabulary labels, numeric scores), so this
5
+ // renderer performs no redaction — it only formats fields that are safe to print.
6
/**
 * Map a dimension result's outcome to its display label.
 *
 * @param result - Dimension result; only `outcome` is read.
 * @returns "PASS" for "pass", "FAIL" for "fail", and "n/a" for any other outcome.
 */
function glyph(result) {
    switch (result.outcome) {
        case "pass":
            return "PASS";
        case "fail":
            return "FAIL";
        default:
            return "n/a";
    }
}
15
/**
 * Render one fixture as a single summary line: name, category, overall verdict, then one
 * `dimension=LABEL` pair per dimension whose outcome is not "not-applicable".
 *
 * @param fixture - Fixture result; `fixtureName`, `category`, `fullyPassed` and `dimensionResults` are read.
 * @returns The formatted line with trailing whitespace removed (so a fixture with no applicable
 *          dimensions does not end in a dangling space).
 */
function fixtureLine(fixture) {
    const labelled = [];
    for (const result of fixture.dimensionResults) {
        if (result.outcome === "not-applicable") {
            continue;
        }
        labelled.push(`${result.dimension}=${glyph(result)}`);
    }
    const status = fixture.fullyPassed ? "OK" : "FAIL";
    const line = `- ${fixture.fixtureName} [${fixture.category}] ${status} ${labelled.join(" ")}`;
    return line.trimEnd();
}
23
/**
 * Render one row of the per-dimension table.
 *
 * @param entry - Scorecard entry; `dimension`, `passRate` (number or null), `passCount`, `failCount`
 *                and `notApplicableCount` are read.
 * @returns The row: dimension name padded to 26 columns, verdict padded to 5, then the counts and the
 *          pass rate as a whole-number percentage ("n/a" when `passRate` is null).
 */
function dimensionLine(entry) {
    const { dimension, passRate, passCount, failCount, notApplicableCount } = entry;
    // Any failure wins; otherwise at least one pass is needed to claim PASS.
    let verdict = "n/a";
    if (failCount > 0) {
        verdict = "FAIL";
    } else if (passCount > 0) {
        verdict = "PASS";
    }
    const rate = passRate === null ? "n/a" : `${(passRate * 100).toFixed(0)}%`;
    const columns = [
        dimension.padEnd(26),
        verdict.padEnd(5),
        `pass=${String(passCount)}`,
        `fail=${String(failCount)}`,
        `n/a=${String(notApplicableCount)}`,
        `rate=${rate}`,
    ];
    return ` ${columns.join(" ")}`;
}
28
/**
 * Format a voice-action scorecard as a multi-line, human-readable report.
 *
 * Layout: a header with the schema version, fixture totals, covered effect classes, and
 * no-voice / voice profile coverage; then one line per fixture, one row per dimension, and the final
 * Go/No-Go verdict. Sections are separated by blank lines.
 *
 * @param scorecard - Scorecard; `schemaVersion`, `summary`, `coveredEffectClasses`, `fixtureResults`
 *                    and `dimensions` are read.
 * @returns The report, lines joined with "\n" (no trailing newline).
 */
export function renderVoiceActionSummary(scorecard) {
    const { summary, coveredEffectClasses } = scorecard;
    const yesNo = (flag) => (flag ? "yes" : "no");
    const verdictLine = summary.goNoGo === "GO"
        ? "Verdict: GO - every exercised security dimension passed across no-voice and voice profiles."
        : "Verdict: NO-GO - a dimension failed or a profile coverage gate was unmet (see table above).";
    const report = [
        `Voice Action Governance evaluation summary (schema v${scorecard.schemaVersion})`,
        `Fixtures: ${String(summary.totalFixtures)} total, ${String(summary.fullyPassedFixtures)} fully passed`,
        `Effect classes covered: ${String(coveredEffectClasses.length)} (${coveredEffectClasses.join(", ")})`,
        `Profile coverage: no-voice=${yesNo(summary.coversNoVoiceProfile)} voice=${yesNo(summary.coversVoiceProfile)}`,
        "",
        "Fixtures:",
        ...scorecard.fixtureResults.map((fixture) => fixtureLine(fixture)),
        "",
        "Dimensions:",
        ...scorecard.dimensions.map((entry) => dimensionLine(entry)),
        "",
        verdictLine,
    ];
    return report.join("\n");
}
@@ -0,0 +1,14 @@
1
+ import { type VoiceActionEvalFixture, type VoiceActionObservation, type VoiceActionScorecard } from "./types.js";
2
+ /**
3
+ * Derive the deterministic observation for one fixture: the voice-gating verdict, the committed
4
+ * projection counts, the normalized proposal (or undefined when dormant / empty / partial-only), the
5
+ * content-free audit record, and (when the fixture declares one) the staleness trajectory. Pure data
6
+ * derivation over the frozen contract.
7
+ */
8
+ export declare function deriveVoiceActionObservation(fixture: VoiceActionEvalFixture): VoiceActionObservation;
9
+ /**
10
+ * Run the Voice Action Governance evaluation suite and return a fully aggregated scorecard. Pure and
11
+ * deterministic. Pass an explicit fixture list to scope the run (the suite tests use the default set).
12
+ */
13
+ export declare function runVoiceActionEvaluation(fixtures?: readonly VoiceActionEvalFixture[]): VoiceActionScorecard;
14
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/voice-action/runner.ts"],"names":[],"mappings":"AAuBA,OAAO,EAEL,KAAK,sBAAsB,EAG3B,KAAK,sBAAsB,EAE3B,KAAK,oBAAoB,EAG1B,MAAM,YAAY,CAAC;AAuFpB;;;;;GAKG;AACH,wBAAgB,4BAA4B,CAC1C,OAAO,EAAE,sBAAsB,GAC9B,sBAAsB,CAsBxB;AAsDD;;;GAGG;AACH,wBAAgB,wBAAwB,CACtC,QAAQ,GAAE,SAAS,sBAAsB,EAA8B,GACtE,oBAAoB,CAUtB"}
@@ -0,0 +1,149 @@
1
+ // Voice Action Governance evaluation runner (Epic #491, Issue #503; ADR-0066).
2
+ //
3
+ // Derives a deterministic observation for each fixture from the frozen keiko-contracts spoken-action
4
+ // governance functions, scores the six security dimensions, aggregates a scorecard, and derives the
5
+ // offline Go/No-Go verdict. Pure: no IO, clock, randomness, or model dispatch. The committed-only
6
+ // boundary is exercised for real — every fixture's segments pass through `selectCommittedVoiceTranscript`
7
+ // before any proposal can form, so partial / discarded / superseded text is excluded structurally.
8
+ import { buildSpokenActionAuditRecord, canonicalizeSpokenActionConfirmation, normalizeSpokenActionProposal, selectCommittedVoiceTranscript, voiceCanProposeAction, } from "@oscharko-dev/keiko-contracts";
9
+ import { ALL_VOICE_ACTION_FIXTURES } from "./fixtures/index.js";
10
+ import { aggregateVoiceActionQuality, scoreVoiceActionQuality } from "./scorer.js";
11
+ import { VOICE_ACTION_EVAL_SCHEMA_VERSION, } from "./types.js";
12
// Assemble the content-free audit record the governance layer would persist. `bindingDigest` is left
// as "" because this eval never computes the downstream sha256 (that is Artifact 2's `node:crypto`);
// the empty digest is the contract-valid "no bound digest" sentinel, and the AC4 staleness proof works
// on the canonical SEED (a plain string), which is what the digest is derived from.
//
// When no proposal formed, the record falls back to effect class "unknown", state "expired", no
// confirmation requirement, and outcome "not-applicable"; otherwise the outcome is "routed".
function buildAuditFor(proposal, fixture, committedSegmentCount, committedChars) {
    const record = {
        effectClass: proposal?.effectClass ?? "unknown",
        state: proposal?.state ?? "expired",
        confirmationRequired: proposal?.requiresConfirmation ?? false,
        confirmed: false,
        outcome: proposal === undefined ? "not-applicable" : "routed",
        source: fixture.source,
        turnIndex: fixture.turnIndex,
        committedSegmentCount,
        committedChars,
        bindingDigest: "",
    };
    return buildSpokenActionAuditRecord(record);
}
32
// Derive the AC4 staleness trajectory: compare the canonical confirmation seed of the proposal at the
// fixture's turn with the seed obtained after the fixture's declared change. A declared correction
// (`correctedSegments`) takes precedence over a turn advance (`turnAdvance === true`); a fixture that
// declares neither yields undefined. Only the CONTENT-FREE comparison result is returned: the raw
// seeds embed committed text and never leave this function (they are compared then discarded).
function deriveStaleness(fixture, proposal) {
    const originalSeed = canonicalizeSpokenActionConfirmation(proposal.confirmationInput);
    const { correctedSegments, segments, turnIndex } = fixture;
    if (correctedSegments !== undefined) {
        const correctedSeed = projectionConfirmationSeed(fixture, correctedSegments, turnIndex);
        return compareSeeds(originalSeed, correctedSeed, "correction");
    }
    if (fixture.turnAdvance !== true) {
        return undefined;
    }
    const advancedSeed = projectionConfirmationSeed(fixture, segments, turnIndex + 1);
    return compareSeeds(originalSeed, advancedSeed, "turn-advance");
}
48
// Collapse two raw canonical seeds into the content-free flags the scorer needs (AC4). The seeds are
// consumed here and never propagated, so no committed text reaches the observation.
//   seedChanged      - a changed seed exists and differs from the original
//   bothSeedsPresent - both seeds exist and are non-empty
//   trigger          - passed through unchanged
function compareSeeds(original, changed, trigger) {
    const hasChanged = changed !== undefined;
    const seedChanged = hasChanged && changed !== original;
    const bothSeedsPresent = original.length > 0 && hasChanged && changed.length > 0;
    return { seedChanged, bothSeedsPresent, trigger };
}
57
// Replay the projection -> normalize pipeline for the given segment set and turn index (profile and
// source come from the fixture) and return the canonical confirmation seed of the resulting proposal,
// or undefined when no proposal forms.
function projectionConfirmationSeed(fixture, segments, turnIndex) {
    const proposal = normalizeSpokenActionProposal(
        selectCommittedVoiceTranscript(segments),
        fixture.profile,
        turnIndex,
        fixture.source,
    );
    return proposal === undefined
        ? undefined
        : canonicalizeSpokenActionConfirmation(proposal.confirmationInput);
}
68
/**
 * Derive the deterministic observation for a single fixture.
 *
 * The observation holds the voice-gating verdict, the committed projection's segment and character
 * counts, the content-free proposal summary (undefined when no proposal forms), the content-free
 * audit record, and - only when a proposal formed and the fixture declares a change - the staleness
 * trajectory.
 *
 * @param fixture - Fixture; `profile`, `segments`, `turnIndex` and `source` feed the contract calls.
 * @returns The observation; the `staleness` key is present only when a trajectory was derived.
 */
export function deriveVoiceActionObservation(fixture) {
    const { profile, segments, turnIndex, source } = fixture;
    const gatingAllowed = voiceCanProposeAction(profile);
    const projection = selectCommittedVoiceTranscript(segments);
    const proposal = normalizeSpokenActionProposal(projection, profile, turnIndex, source);
    const committedSegmentCount = projection.segmentCount;
    const committedChars = projection.text.length;
    const audit = buildAuditFor(proposal, fixture, committedSegmentCount, committedChars);
    const observation = {
        gatingAllowed,
        committedSegmentCount,
        committedChars,
        proposal: proposal === undefined ? undefined : summarizeProposal(proposal),
        audit,
    };
    // Staleness is only meaningful when there is a proposal to go stale.
    const staleness = proposal === undefined ? undefined : deriveStaleness(fixture, proposal);
    if (staleness === undefined) {
        return observation;
    }
    return { ...observation, staleness };
}
92
// Reduce the contract proposal to its content-free fields. The full proposal carries the transient
// `committedText` digest seed, which must never enter the observation / scorecard (AC5), so only the
// six fields named below are copied and everything else is dropped.
function summarizeProposal(proposal) {
    const {
        effectClass,
        requiresConfirmation,
        state,
        turnIndex,
        committedSegmentCount,
        committedChars,
    } = proposal;
    return {
        effectClass,
        requiresConfirmation,
        state,
        turnIndex,
        committedSegmentCount,
        committedChars,
    };
}
104
// Evaluate one fixture: derive its observation, score every dimension, and flag the fixture as fully
// passed when no dimension result has outcome "fail" (not-applicable results do not count against it).
function runFixture(fixture) {
    const observation = deriveVoiceActionObservation(fixture);
    const dimensionResults = scoreVoiceActionQuality(fixture, observation);
    const hasFailure = dimensionResults.some((result) => result.outcome === "fail");
    return {
        fixtureName: fixture.name,
        category: fixture.category,
        observation,
        dimensionResults,
        fullyPassed: !hasFailure,
    };
}
115
// Roll fixture results and per-dimension entries up into the suite summary. The verdict is "GO" only
// when no dimension recorded a failure AND the run covered both a gating-denied (no-voice) and a
// gating-allowed (voice) fixture; anything else is "NO-GO".
function summarize(fixtureResults, dimensions) {
    const isVoice = (result) => result.observation.gatingAllowed;
    const coversVoiceProfile = fixtureResults.some(isVoice);
    const coversNoVoiceProfile = fixtureResults.some((result) => !isVoice(result));
    const fullyPassedFixtures = fixtureResults.reduce(
        (count, result) => (result.fullyPassed ? count + 1 : count),
        0,
    );
    const hasFailingDimension = dimensions.some((entry) => entry.failCount !== 0);
    let goNoGo = "NO-GO";
    if (!hasFailingDimension && coversNoVoiceProfile && coversVoiceProfile) {
        goNoGo = "GO";
    }
    return {
        totalFixtures: fixtureResults.length,
        fullyPassedFixtures,
        coversNoVoiceProfile,
        coversVoiceProfile,
        goNoGo,
    };
}
127
// Gather the distinct effect classes of every fixture that produced a proposal. The result is sorted
// into a canonical order so the covered-class list is stable regardless of fixture ordering (matches
// the codebase determinism convention).
function collectEffectClasses(fixtureResults) {
    const seen = new Set();
    for (const { observation } of fixtureResults) {
        const effectClass = observation.proposal?.effectClass;
        if (effectClass !== undefined) {
            seen.add(effectClass);
        }
    }
    return Array.from(seen).sort();
}
135
/**
 * Run the Voice Action Governance evaluation suite and return the aggregated scorecard.
 *
 * Each fixture is evaluated independently, the per-fixture dimension results are aggregated into
 * per-dimension entries, and the summary (including the Go/No-Go verdict) and the covered effect
 * classes are derived from those results.
 *
 * @param fixtures - Fixtures to run; defaults to the full registered set. Pass an explicit list to
 *                   scope the run.
 * @returns The scorecard: schema version, fixture results, dimension entries, summary, and the
 *          covered effect classes.
 */
export function runVoiceActionEvaluation(fixtures = ALL_VOICE_ACTION_FIXTURES) {
    const fixtureResults = fixtures.map((fixture) => runFixture(fixture));
    const perFixtureDimensions = fixtureResults.map(({ dimensionResults }) => dimensionResults);
    const dimensions = aggregateVoiceActionQuality(perFixtureDimensions);
    const summary = summarize(fixtureResults, dimensions);
    const coveredEffectClasses = collectEffectClasses(fixtureResults);
    return {
        schemaVersion: VOICE_ACTION_EVAL_SCHEMA_VERSION,
        fixtureResults,
        dimensions,
        summary,
        coveredEffectClasses,
    };
}
@@ -0,0 +1,8 @@
1
+ import { type VoiceActionDimensionResult, type VoiceActionEvalFixture, type VoiceActionObservation, type VoiceActionScorecardEntry } from "./types.js";
2
+ /**
3
+ * Score one fixture's observation across all six dimensions. A dimension the fixture does not declare is
4
+ * "not-applicable". Pure.
5
+ */
6
+ export declare function scoreVoiceActionQuality(fixture: VoiceActionEvalFixture, obs: VoiceActionObservation): readonly VoiceActionDimensionResult[];
7
+ export declare function aggregateVoiceActionQuality(results: readonly (readonly VoiceActionDimensionResult[])[]): readonly VoiceActionScorecardEntry[];
8
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../src/voice-action/scorer.ts"],"names":[],"mappings":"AAoBA,OAAO,EAGL,KAAK,0BAA0B,EAC/B,KAAK,sBAAsB,EAC3B,KAAK,sBAAsB,EAE3B,KAAK,yBAAyB,EAC/B,MAAM,YAAY,CAAC;AAuOpB;;;GAGG;AACH,wBAAgB,uBAAuB,CACrC,OAAO,EAAE,sBAAsB,EAC/B,GAAG,EAAE,sBAAsB,GAC1B,SAAS,0BAA0B,EAAE,CAUvC;AA8BD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,SAAS,CAAC,SAAS,0BAA0B,EAAE,CAAC,EAAE,GAC1D,SAAS,yBAAyB,EAAE,CAEtC"}
@@ -0,0 +1,247 @@
1
+ // Voice Action Governance security scorer (Epic #491, Issue #503; ADR-0066).
2
+ //
3
+ // Pure per-dimension scoring + suite aggregation for the six security dimensions. Each dimension is a
4
+ // pure function (fixture, observation) -> VoiceActionDimensionResult. A dimension a fixture does not
5
+ // declare is "not-applicable" and excluded from aggregation.
6
+ //
7
+ // Each dimension combines a STRUCTURAL gate (the contract's observable behaviour — gating verdict,
8
+ // proposal presence, effect class, confirmation flag, the changed confirmation seed, and the audit
9
+ // record's content-free shape) with the fixture's oracle expectation. The structural gate is what makes
10
+ // the suite regression-sensitive: weakening the gating, letting partial text propose, dropping the
11
+ // confirmation requirement, breaking the staleness seed, or adding a text field to the audit record
12
+ // flips the corresponding dimension to FAIL.
13
+ //
14
+ // Determinism: pure. Rationales are harness-authored and content-free (counts, closed-vocabulary
15
+ // labels, numbers) — they never echo committed text or any raw transcript content.
16
+ import { spokenActionRequiresConfirmation, } from "@oscharko-dev/keiko-contracts";
17
+ import { VOICE_ACTION_DIMENSIONS, } from "./types.js";
18
// The audit record keys that MUST exist (content-free) and the substring fragments that must NEVER
// appear in any key (a text / audio field would be a content leak — AC5). Keyed explicitly so adding a
// text field to the record, or a fixture's text leaking into a key, fails the evidence-safety gate.
//
// Both lists are frozen: they define a security gate, so an accidental in-module mutation (push /
// splice / index assignment) must throw instead of silently weakening the check.
const FORBIDDEN_AUDIT_KEY_FRAGMENTS = Object.freeze(["text", "audio", "transcript", "raw"]);
const REQUIRED_AUDIT_KEYS = Object.freeze([
    "schemaVersion",
    "effectClass",
    "state",
    "confirmationRequired",
    "confirmed",
    "outcome",
    "source",
    "turnIndex",
    "committedSegmentCount",
    "committedChars",
    "bindingDigest",
]);
35
/**
 * Collapse a list of structural checks into a single dimension result.
 *
 * @param dimension Dimension label copied onto the result.
 * @param checks Array of `{ label, ok }` entries; an entry counts as failed when `ok` is falsy.
 * @returns `{ dimension, outcome, rationale }`. The outcome is "pass" when no check failed (the
 *   rationale reports the check count) and "fail" otherwise (the rationale lists the failed labels
 *   joined by "; ").
 */
function gate(dimension, checks) {
    const failedLabels = [];
    for (const check of checks) {
        if (!check.ok) {
            failedLabels.push(check.label);
        }
    }
    if (failedLabels.length > 0) {
        return {
            dimension,
            outcome: "fail",
            rationale: `failed: ${failedLabels.join("; ")}.`,
        };
    }
    const total = String(checks.length);
    return {
        dimension,
        outcome: "pass",
        rationale: `${total}/${total} structural checks met.`,
    };
}
50
+ // ─── Dimension scorers ─────────────────────────────────────────────────────────────
51
/**
 * Score the capability-gating dimension.
 *
 * @param obs Observation; reads `gatingAllowed` and `proposal`.
 * @param oracle Fixture oracle; reads `expectedGatingAllowed`.
 * @returns The `gate` result for "capability-gating".
 */
function scoreCapabilityGating(obs, oracle) {
    const verdictMatches = obs.gatingAllowed === oracle.expectedGatingAllowed;
    // A denied profile must never yield a proposal, regardless of committed text (AC1 dormancy).
    const deniedStaysDormant = oracle.expectedGatingAllowed || obs.proposal === undefined;
    const checks = [
        { label: "voice-gating verdict matches profile expectation", ok: verdictMatches },
        { label: "denied profile yields no proposal", ok: deniedStaysDormant },
    ];
    return gate("capability-gating", checks);
}
64
/**
 * Score the committed-only dimension.
 *
 * @param obs Observation; reads `proposal`, `gatingAllowed`, `committedSegmentCount` and
 *   `committedChars`.
 * @param oracle Fixture oracle; reads `expectsProposal`.
 * @returns The `gate` result for "committed-only".
 */
function scoreCommittedOnly(obs, oracle) {
    const hasProposal = obs.proposal !== undefined;
    let countsAgree;
    if (hasProposal) {
        // A formed proposal must rest on at least one committed segment with committed chars.
        countsAgree = obs.committedSegmentCount > 0 && obs.committedChars > 0;
    }
    else {
        // With no proposal, a capture-capable profile must show an empty committed projection (partial /
        // discarded text excluded by AC2). A capability-denied profile is exempt: it never proposes
        // regardless of committed content, which capability-gating proves separately (AC1).
        countsAgree =
            !obs.gatingAllowed || obs.committedSegmentCount === 0 || obs.committedChars === 0;
    }
    return gate("committed-only", [
        {
            label: "proposal presence matches committed-projection expectation",
            ok: hasProposal === oracle.expectsProposal,
        },
        {
            label: "proposal presence agrees with committed projection counts",
            ok: countsAgree,
        },
    ]);
}
82
/**
 * Score the confirmation-discipline dimension.
 *
 * Fails outright when the observation carries no proposal. Otherwise the proposal's effect class,
 * `requiresConfirmation` flag and `state` are checked against the oracle and against
 * `spokenActionRequiresConfirmation`.
 *
 * @param obs Observation; reads `proposal`.
 * @param oracle Fixture oracle; reads `expectedRequiresConfirmation` and `expectedEffectClass`.
 * @returns A dimension result for "confirmation-discipline".
 */
function scoreConfirmationDiscipline(obs, oracle) {
    const { proposal } = obs;
    if (proposal === undefined) {
        return {
            dimension: "confirmation-discipline",
            outcome: "fail",
            rationale: "failed: confirmation-discipline declared but no proposal was derived.",
        };
    }
    // The oracle may pin the expected flag; when it does not, fall back to the taxonomy's answer.
    const expectedRequires = oracle.expectedRequiresConfirmation ?? spokenActionRequiresConfirmation(proposal.effectClass);
    const checks = [];
    checks.push({
        label: "effect class matches oracle expectation",
        ok: oracle.expectedEffectClass === undefined ||
            proposal.effectClass === oracle.expectedEffectClass,
    });
    checks.push({
        label: "requiresConfirmation matches the fail-closed taxonomy",
        ok: proposal.requiresConfirmation === spokenActionRequiresConfirmation(proposal.effectClass),
    });
    checks.push({
        label: "requiresConfirmation matches oracle expectation",
        ok: proposal.requiresConfirmation === expectedRequires,
    });
    // A confirmation-requiring proposal must START in `awaiting-confirmation`, never `proposed` — it
    // cannot be routed before the explicit confirm step (AC3).
    checks.push({
        label: "confirmation-requiring proposal awaits confirmation",
        ok: !proposal.requiresConfirmation || proposal.state === "awaiting-confirmation",
    });
    checks.push({
        label: "non-confirming proposal is in the proposed state",
        ok: proposal.requiresConfirmation || proposal.state === "proposed",
    });
    return gate("confirmation-discipline", checks);
}
118
/**
 * Score the stale-intent-prevention dimension.
 *
 * @param obs Observation; reads `staleness` (with its `seedChanged` and `bothSeedsPresent` fields).
 * @returns A failing result when no staleness trajectory is present, otherwise the `gate` result
 *   for "stale-intent-prevention".
 */
function scoreStaleIntentPrevention(obs) {
    const { staleness } = obs;
    if (staleness === undefined) {
        return {
            dimension: "stale-intent-prevention",
            outcome: "fail",
            rationale: "failed: stale-intent-prevention declared but no staleness trajectory was derived.",
        };
    }
    // The load-bearing AC4 proof: a correction or turn advance changes the canonical confirmation
    // seed, so a digest captured before the change can no longer match (re-confirmation required).
    const seedChangedCheck = {
        label: "changed confirmation seed differs from the original",
        ok: staleness.seedChanged,
    };
    const seedsPresentCheck = {
        label: "both sides produced a real confirmation seed",
        ok: staleness.bothSeedsPresent,
    };
    return gate("stale-intent-prevention", [seedChangedCheck, seedsPresentCheck]);
}
140
/**
 * Score the injection-resistance dimension.
 *
 * @param obs Observation; reads `proposal` (its `effectClass` and `requiresConfirmation`).
 * @param oracle Fixture oracle; reads `forbidsReadOnly`.
 * @returns A failing result when no proposal is present, otherwise the `gate` result for
 *   "injection-resistance".
 */
function scoreInjectionResistance(obs, oracle) {
    const { proposal } = obs;
    if (proposal === undefined) {
        return {
            dimension: "injection-resistance",
            outcome: "fail",
            rationale: "failed: injection-resistance declared but no proposal was derived.",
        };
    }
    // An injected instruction must never classify as a no-confirmation read-only action.
    const readOnlyViolation = oracle.forbidsReadOnly === true && proposal.effectClass === "read-only";
    const checks = [
        {
            label: "injected text does not read-only-classify",
            ok: !readOnlyViolation,
        },
        {
            // A proposal is never an authorization: an injection still requires the explicit confirmation
            // step, so it cannot self-execute (the downstream approval gate independently decides).
            label: "injected action still requires confirmation",
            ok: proposal.requiresConfirmation,
        },
    ];
    return gate("injection-resistance", checks);
}
163
// A key whose name hints at text / audio content would be a content leak. This returns true only when
// no own enumerable key of the record contains a forbidden fragment (compared case-insensitively) AND
// every required content-free key is present — so adding a text field to the record fails the gate.
function auditKeysAreContentFree(audit) {
    const presentKeys = Object.keys(audit);
    for (const key of presentKeys) {
        const lowered = key.toLowerCase();
        for (const fragment of FORBIDDEN_AUDIT_KEY_FRAGMENTS) {
            if (lowered.includes(fragment)) {
                return false;
            }
        }
    }
    for (const required of REQUIRED_AUDIT_KEYS) {
        if (!presentKeys.includes(required)) {
            return false;
        }
    }
    return true;
}
171
/**
 * Score the evidence-safety dimension.
 *
 * Fails closed when the observation carries no audit record, so that one missing record yields a
 * FAIL result instead of a TypeError that aborts the whole evaluation run. This matches how the
 * sibling scorers treat a missing proposal or staleness trajectory.
 *
 * @param obs Observation; reads `audit` and `committedChars`.
 * @returns A dimension result for "evidence-safety".
 */
function scoreEvidenceSafety(obs) {
    const audit = obs.audit;
    if (audit === undefined || audit === null) {
        return {
            dimension: "evidence-safety",
            outcome: "fail",
            rationale: "failed: evidence-safety declared but no audit record was derived.",
        };
    }
    const digest = audit.bindingDigest;
    return gate("evidence-safety", [
        {
            label: "audit record carries no text / audio key and all content-free keys",
            ok: auditKeysAreContentFree(audit),
        },
        {
            // The audit's committedChars must equal the observation's committed length — a count, never text.
            label: "audit committedChars equals the committed length count",
            ok: audit.committedChars === obs.committedChars,
        },
        {
            // RegExp#test coerces its argument to a string, so require an actual string first: a
            // non-string value must not be accepted as a digest by way of coercion.
            label: "audit bindingDigest is the empty sentinel or a content-free digest",
            ok: digest === "" || (typeof digest === "string" && /^[0-9a-f]{64}$/.test(digest)),
        },
    ]);
}
189
/**
 * Route one dimension to its scorer.
 *
 * @param dimension One of the six dimension labels handled below.
 * @param fixture Fixture; its `oracle` is forwarded to the scorers that take one.
 * @param obs Observation forwarded to the scorer.
 * @returns The scorer's result; `undefined` when `dimension` matches none of the known labels.
 */
function scoreDimension(dimension, fixture, obs) {
    const { oracle } = fixture;
    if (dimension === "capability-gating") {
        return scoreCapabilityGating(obs, oracle);
    }
    if (dimension === "committed-only") {
        return scoreCommittedOnly(obs, oracle);
    }
    if (dimension === "confirmation-discipline") {
        return scoreConfirmationDiscipline(obs, oracle);
    }
    if (dimension === "stale-intent-prevention") {
        return scoreStaleIntentPrevention(obs);
    }
    if (dimension === "injection-resistance") {
        return scoreInjectionResistance(obs, oracle);
    }
    if (dimension === "evidence-safety") {
        return scoreEvidenceSafety(obs);
    }
    return undefined;
}
206
/**
 * Produce one result per entry of `VOICE_ACTION_DIMENSIONS` for a single fixture observation.
 * Dimensions absent from `fixture.dimensions` are reported as "not-applicable" instead of being
 * scored. Documented as pure.
 *
 * @param fixture Fixture; reads `dimensions` (queried with `.has`) and, via the scorers, `oracle`.
 * @param obs Observation produced for the fixture.
 * @returns Array of dimension results in `VOICE_ACTION_DIMENSIONS` order.
 */
export function scoreVoiceActionQuality(fixture, obs) {
    return VOICE_ACTION_DIMENSIONS.map((dimension) => {
        if (fixture.dimensions.has(dimension)) {
            return scoreDimension(dimension, fixture, obs);
        }
        return {
            dimension,
            outcome: "not-applicable",
            rationale: "not exercised by this fixture.",
        };
    });
}
219
+ // ─── Suite aggregation ─────────────────────────────────────────────────────────────
220
/**
 * Tally one dimension's outcomes across all fixtures.
 *
 * @param dimension Dimension label to look up in each fixture's result list.
 * @param results Array of per-fixture result arrays (`{ dimension, outcome }` entries).
 * @returns `{ dimension, passCount, failCount, notApplicableCount, passRate }`. Any outcome other
 *   than "pass" / "fail" — including a fixture with no entry for the dimension — counts as
 *   not-applicable. `passRate` is `null` when nothing was scored.
 */
function aggregateDimension(dimension, results) {
    const tally = { pass: 0, fail: 0, other: 0 };
    for (const fixtureResults of results) {
        const entry = fixtureResults.find((candidate) => candidate.dimension === dimension);
        const outcome = entry?.outcome;
        if (outcome === "pass" || outcome === "fail") {
            tally[outcome] += 1;
        }
        else {
            tally.other += 1;
        }
    }
    const scoredTotal = tally.pass + tally.fail;
    let passRate = null;
    if (scoredTotal !== 0) {
        passRate = tally.pass / scoredTotal;
    }
    return {
        dimension,
        passCount: tally.pass,
        failCount: tally.fail,
        notApplicableCount: tally.other,
        passRate,
    };
}
245
/**
 * Build the suite scorecard: one aggregated entry per dimension in `VOICE_ACTION_DIMENSIONS`.
 *
 * @param results Array of per-fixture dimension-result arrays.
 * @returns Array of aggregated entries in `VOICE_ACTION_DIMENSIONS` order.
 */
export function aggregateVoiceActionQuality(results) {
    const toScorecardEntry = (dimension) => aggregateDimension(dimension, results);
    return VOICE_ACTION_DIMENSIONS.map(toScorecardEntry);
}