@oscharko-dev/keiko-evaluations 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/discussion/fixtures/correction.d.ts +5 -0
- package/dist/discussion/fixtures/correction.d.ts.map +1 -0
- package/dist/discussion/fixtures/correction.js +53 -0
- package/dist/discussion/fixtures/index.d.ts +5 -0
- package/dist/discussion/fixtures/index.d.ts.map +1 -0
- package/dist/discussion/fixtures/index.js +17 -0
- package/dist/discussion/fixtures/no-voice.d.ts +6 -0
- package/dist/discussion/fixtures/no-voice.d.ts.map +1 -0
- package/dist/discussion/fixtures/no-voice.js +79 -0
- package/dist/discussion/fixtures/voice.d.ts +5 -0
- package/dist/discussion/fixtures/voice.d.ts.map +1 -0
- package/dist/discussion/fixtures/voice.js +57 -0
- package/dist/discussion/index.d.ts +6 -0
- package/dist/discussion/index.d.ts.map +1 -0
- package/dist/discussion/index.js +9 -0
- package/dist/discussion/render.d.ts +3 -0
- package/dist/discussion/render.d.ts.map +1 -0
- package/dist/discussion/render.js +49 -0
- package/dist/discussion/runner.d.ts +13 -0
- package/dist/discussion/runner.d.ts.map +1 -0
- package/dist/discussion/runner.js +80 -0
- package/dist/discussion/scorer.d.ts +8 -0
- package/dist/discussion/scorer.d.ts.map +1 -0
- package/dist/discussion/scorer.js +225 -0
- package/dist/discussion/types.d.ts +71 -0
- package/dist/discussion/types.d.ts.map +1 -0
- package/dist/discussion/types.js +29 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/voice-action/fixtures/adversarial.d.ts +9 -0
- package/dist/voice-action/fixtures/adversarial.d.ts.map +1 -0
- package/dist/voice-action/fixtures/adversarial.js +163 -0
- package/dist/voice-action/fixtures/index.d.ts +5 -0
- package/dist/voice-action/fixtures/index.d.ts.map +1 -0
- package/dist/voice-action/fixtures/index.js +17 -0
- package/dist/voice-action/fixtures/no-voice.d.ts +5 -0
- package/dist/voice-action/fixtures/no-voice.d.ts.map +1 -0
- package/dist/voice-action/fixtures/no-voice.js +37 -0
- package/dist/voice-action/fixtures/segment.d.ts +11 -0
- package/dist/voice-action/fixtures/segment.d.ts.map +1 -0
- package/dist/voice-action/fixtures/segment.js +25 -0
- package/dist/voice-action/fixtures/voice.d.ts +6 -0
- package/dist/voice-action/fixtures/voice.d.ts.map +1 -0
- package/dist/voice-action/fixtures/voice.js +74 -0
- package/dist/voice-action/index.d.ts +6 -0
- package/dist/voice-action/index.d.ts.map +1 -0
- package/dist/voice-action/index.js +10 -0
- package/dist/voice-action/render.d.ts +3 -0
- package/dist/voice-action/render.d.ts.map +1 -0
- package/dist/voice-action/render.js +49 -0
- package/dist/voice-action/runner.d.ts +14 -0
- package/dist/voice-action/runner.d.ts.map +1 -0
- package/dist/voice-action/runner.js +149 -0
- package/dist/voice-action/scorer.d.ts +8 -0
- package/dist/voice-action/scorer.d.ts.map +1 -0
- package/dist/voice-action/scorer.js +247 -0
- package/dist/voice-action/types.d.ts +82 -0
- package/dist/voice-action/types.d.ts.map +1 -0
- package/dist/voice-action/types.js +30 -0
- package/dist/voice-twin/capability.d.ts +4 -0
- package/dist/voice-twin/capability.d.ts.map +1 -0
- package/dist/voice-twin/capability.js +26 -0
- package/dist/voice-twin/fixtures/full-realtime.d.ts +3 -0
- package/dist/voice-twin/fixtures/full-realtime.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/full-realtime.js +36 -0
- package/dist/voice-twin/fixtures/index.d.ts +5 -0
- package/dist/voice-twin/fixtures/index.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/index.js +21 -0
- package/dist/voice-twin/fixtures/no-voice.d.ts +3 -0
- package/dist/voice-twin/fixtures/no-voice.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/no-voice.js +33 -0
- package/dist/voice-twin/fixtures/privacy.d.ts +3 -0
- package/dist/voice-twin/fixtures/privacy.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/privacy.js +69 -0
- package/dist/voice-twin/fixtures/speech-output.d.ts +3 -0
- package/dist/voice-twin/fixtures/speech-output.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/speech-output.js +32 -0
- package/dist/voice-twin/fixtures/stt-only.d.ts +3 -0
- package/dist/voice-twin/fixtures/stt-only.d.ts.map +1 -0
- package/dist/voice-twin/fixtures/stt-only.js +35 -0
- package/dist/voice-twin/index.d.ts +10 -0
- package/dist/voice-twin/index.d.ts.map +1 -0
- package/dist/voice-twin/index.js +14 -0
- package/dist/voice-twin/metrics.d.ts +10 -0
- package/dist/voice-twin/metrics.d.ts.map +1 -0
- package/dist/voice-twin/metrics.js +142 -0
- package/dist/voice-twin/privacy.d.ts +9 -0
- package/dist/voice-twin/privacy.d.ts.map +1 -0
- package/dist/voice-twin/privacy.js +100 -0
- package/dist/voice-twin/profiles.d.ts +15 -0
- package/dist/voice-twin/profiles.d.ts.map +1 -0
- package/dist/voice-twin/profiles.js +58 -0
- package/dist/voice-twin/render.d.ts +3 -0
- package/dist/voice-twin/render.d.ts.map +1 -0
- package/dist/voice-twin/render.js +53 -0
- package/dist/voice-twin/runner.d.ts +13 -0
- package/dist/voice-twin/runner.d.ts.map +1 -0
- package/dist/voice-twin/runner.js +141 -0
- package/dist/voice-twin/scorer.d.ts +8 -0
- package/dist/voice-twin/scorer.d.ts.map +1 -0
- package/dist/voice-twin/scorer.js +323 -0
- package/dist/voice-twin/types.d.ts +149 -0
- package/dist/voice-twin/types.d.ts.map +1 -0
- package/dist/voice-twin/types.js +45 -0
- package/package.json +9 -9
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DiscussionEvalFixture } from "../types.js";
|
|
2
|
+
export declare const evidenceCheckCorrection: DiscussionEvalFixture;
|
|
3
|
+
export declare const voiceEvidenceCheckCorrection: DiscussionEvalFixture;
|
|
4
|
+
export declare const CORRECTION_FIXTURES: readonly DiscussionEvalFixture[];
|
|
5
|
+
//# sourceMappingURL=correction.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"correction.d.ts","sourceRoot":"","sources":["../../../src/discussion/fixtures/correction.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,aAAa,CAAC;AAEzD,eAAO,MAAM,uBAAuB,EAAE,qBAuBrC,CAAC;AAEF,eAAO,MAAM,4BAA4B,EAAE,qBAuB1C,CAAC;AAEF,eAAO,MAAM,mBAAmB,EAAE,SAAS,qBAAqB,EAG/D,CAAC"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Correction / contradiction-handling discussion fixtures (Epic #491, Issue #502; AC3). These pin the
|
|
2
|
+
// evidence-check mode's strict citation discipline and the disclose-and-defer contradiction policy that
|
|
3
|
+
// governs how Keiko handles an unresolved contradiction (defer to the user). Pure value modules.
|
|
4
|
+
/**
 * Correction fixture "evidence-check-correction": evidence-check mode under the `none` profile.
 * The oracle declares the three mandated facets, expects voice gating to be denied, expects
 * uncertainty disclosure, expects no decision recommendation, and pins the
 * "disclose-and-defer" contradiction policy.
 */
export const evidenceCheckCorrection = {
    name: "evidence-check-correction",
    category: "correction",
    description: "Evidence-check mode in the `none` profile; strict citation discipline and disclose-and-defer on contradiction.",
    profile: "none",
    mode: "evidence-check",
    topicId: "topic-evidence-check",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: false,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: false,
        expectedContradictionPolicies: ["disclose-and-defer"],
    },
};
|
|
27
|
+
/**
 * Correction fixture "voice-evidence-check-correction": evidence-check mode under the
 * `speech-to-text` profile. The oracle declares the same facets and contradiction policy as the
 * `none`-profile correction fixture, but expects voice gating to be allowed.
 */
export const voiceEvidenceCheckCorrection = {
    name: "voice-evidence-check-correction",
    category: "correction",
    description: "Evidence-check mode under a speech-to-text profile; same correction handling reached via voice.",
    profile: "speech-to-text",
    mode: "evidence-check",
    topicId: "topic-evidence-check-stt",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: true,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: false,
        expectedContradictionPolicies: ["disclose-and-defer"],
    },
};
|
|
50
|
+
/** Both correction fixtures, text-first one first, then the speech-to-text one. */
export const CORRECTION_FIXTURES = [evidenceCheckCorrection, voiceEvidenceCheckCorrection];
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DiscussionEvalFixture, DiscussionFixtureCategory } from "../types.js";
|
|
2
|
+
export declare const ALL_DISCUSSION_FIXTURES: readonly DiscussionEvalFixture[];
|
|
3
|
+
export declare function discussionFixturesForCategory(category: DiscussionFixtureCategory): readonly DiscussionEvalFixture[];
|
|
4
|
+
export declare function discussionFixtureByName(name: string): DiscussionEvalFixture | undefined;
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/discussion/fixtures/index.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,qBAAqB,EAAE,yBAAyB,EAAE,MAAM,aAAa,CAAC;AAEpF,eAAO,MAAM,uBAAuB,EAAE,SAAS,qBAAqB,EAInE,CAAC;AAEF,wBAAgB,6BAA6B,CAC3C,QAAQ,EAAE,yBAAyB,GAClC,SAAS,qBAAqB,EAAE,CAElC;AAED,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,GAAG,qBAAqB,GAAG,SAAS,CAEvF"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Discussion Intelligence fixture registry (Epic #491, Issue #502). ALL_DISCUSSION_FIXTURES is the
|
|
2
|
+
// canonical list the runner and suite tests consume; the selectors resolve a category or a fixture name
|
|
3
|
+
// against it. Mirrors the Prompt Enhancer fixtures/index.ts layout.
|
|
4
|
+
import { NO_VOICE_FIXTURES } from "./no-voice.js";
|
|
5
|
+
import { VOICE_FIXTURES } from "./voice.js";
|
|
6
|
+
import { CORRECTION_FIXTURES } from "./correction.js";
|
|
7
|
+
/**
 * Every discussion fixture in one flat list: the no-voice fixtures first, then the voice
 * fixtures, then the correction fixtures. A new array is built; the source lists are not mutated.
 */
export const ALL_DISCUSSION_FIXTURES = NO_VOICE_FIXTURES.concat(VOICE_FIXTURES, CORRECTION_FIXTURES);
|
|
12
|
+
/**
 * Select the registered fixtures whose `category` strictly equals the given one.
 *
 * @param category Category label to match.
 * @returns A new array of matching fixtures in registry order (empty when none match).
 */
export function discussionFixturesForCategory(category) {
    const matching = [];
    for (const fixture of ALL_DISCUSSION_FIXTURES) {
        if (fixture.category === category) {
            matching.push(fixture);
        }
    }
    return matching;
}
|
|
15
|
+
/**
 * Look up a registered fixture by its exact `name`.
 *
 * @param name Fixture name to match (strict equality).
 * @returns The first fixture with that name, or `undefined` when none is registered.
 */
export function discussionFixtureByName(name) {
    for (const fixture of ALL_DISCUSSION_FIXTURES) {
        if (fixture.name === name) {
            return fixture;
        }
    }
    return undefined;
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { DiscussionEvalFixture } from "../types.js";
|
|
2
|
+
export declare const noVoiceChallenge: DiscussionEvalFixture;
|
|
3
|
+
export declare const noVoiceDecide: DiscussionEvalFixture;
|
|
4
|
+
export declare const noVoiceBrainstorm: DiscussionEvalFixture;
|
|
5
|
+
export declare const NO_VOICE_FIXTURES: readonly DiscussionEvalFixture[];
|
|
6
|
+
//# sourceMappingURL=no-voice.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"no-voice.d.ts","sourceRoot":"","sources":["../../../src/discussion/fixtures/no-voice.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,aAAa,CAAC;AAEzD,eAAO,MAAM,gBAAgB,EAAE,qBAuB9B,CAAC;AAEF,eAAO,MAAM,aAAa,EAAE,qBAsB3B,CAAC;AAEF,eAAO,MAAM,iBAAiB,EAAE,qBAyB/B,CAAC;AAEF,eAAO,MAAM,iBAAiB,EAAE,SAAS,qBAAqB,EAI7D,CAAC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// No-voice (text-first) discussion fixtures (Epic #491, Issue #502; AC1/AC6). These exercise the
|
|
2
|
+
// discussion scaffold in the `none` voice profile, proving every mode works text-first and that
|
|
3
|
+
// capability gating denies spoken turns for a profile with no capture capability. Pure value modules.
|
|
4
|
+
/**
 * No-voice fixture "no-voice-challenge": challenge mode under the `none` profile. The oracle
 * declares all three mandated facets, expects voice gating to be denied, expects uncertainty
 * disclosure, and expects no decision recommendation.
 */
export const noVoiceChallenge = {
    name: "no-voice-challenge",
    category: "no-voice",
    description: "Challenge mode in the text-first `none` profile; full disagreement structure mandated.",
    profile: "none",
    mode: "challenge",
    topicId: "topic-challenge-text",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: false,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: false,
        expectedContradictionPolicies: ["disclose-and-defer"],
    },
};
|
|
27
|
+
/**
 * No-voice fixture "no-voice-decide": decide mode under the `none` profile. Unlike the challenge
 * fixture, its oracle expects a decision recommendation; voice gating is still expected to be
 * denied.
 */
export const noVoiceDecide = {
    name: "no-voice-decide",
    category: "no-voice",
    description: "Decide mode in the text-first `none` profile; produces a decision recommendation.",
    profile: "none",
    mode: "decide",
    topicId: "topic-decide-text",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: false,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: true,
        expectedContradictionPolicies: ["disclose-and-defer"],
    },
};
|
|
50
|
+
/**
 * No-voice fixture "no-voice-brainstorm": brainstorm mode under the `none` profile. Its oracle
 * differs from the other no-voice fixtures: only two facets are mandated, uncertainty disclosure
 * is not expected, and the contradiction policy is "synthesize-with-caveats".
 */
export const noVoiceBrainstorm = {
    name: "no-voice-brainstorm",
    category: "no-voice",
    description: "Brainstorm mode in the `none` profile; relaxes uncertainty disclosure and expands options.",
    profile: "none",
    mode: "brainstorm",
    topicId: "topic-brainstorm-text",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    oracle: {
        // Brainstorm mandates evidence and assumptions but not uncertainty, so this list is a
        // strict subset of the three facets the other fixtures expect.
        expectedMandatedFacets: ["evidence", "assumptions"],
        expectedGatingAllowed: false,
        expectedUncertaintyDisclosure: false,
        expectedDecisionRecommendation: false,
        expectedContradictionPolicies: ["synthesize-with-caveats"],
    },
};
|
|
75
|
+
/** The three `none`-profile fixtures, in order: challenge, decide, brainstorm. */
export const NO_VOICE_FIXTURES = [noVoiceChallenge, noVoiceDecide, noVoiceBrainstorm];
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DiscussionEvalFixture } from "../types.js";
|
|
2
|
+
export declare const voiceSttReview: DiscussionEvalFixture;
|
|
3
|
+
export declare const voiceRealtimeBargeInRecovery: DiscussionEvalFixture;
|
|
4
|
+
export declare const VOICE_FIXTURES: readonly DiscussionEvalFixture[];
|
|
5
|
+
//# sourceMappingURL=voice.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"voice.d.ts","sourceRoot":"","sources":["../../../src/discussion/fixtures/voice.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,aAAa,CAAC;AAEzD,eAAO,MAAM,cAAc,EAAE,qBAuB5B,CAAC;AAEF,eAAO,MAAM,4BAA4B,EAAE,qBA0B1C,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,SAAS,qBAAqB,EAG1D,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// Voice-enabled discussion fixtures (Epic #491, Issue #502; AC2/AC4/AC6). These exercise the SAME
|
|
2
|
+
// discussion scaffold under voice-capable profiles (`speech-to-text`, `full-realtime`), proving voice
|
|
3
|
+
// reuses the intelligence (no parallel stack) and that an interrupted spoken turn recovers without
|
|
4
|
+
// losing the active mode/topicId/turnIndex. Pure value modules.
|
|
5
|
+
/**
 * Voice fixture "voice-stt-review": review mode under the `speech-to-text` profile. The oracle
 * expects voice gating to be allowed, all three mandated facets, uncertainty disclosure, and no
 * decision recommendation.
 */
export const voiceSttReview = {
    name: "voice-stt-review",
    category: "voice",
    description: "Review mode under a speech-to-text profile; voice may drive the same intelligence (no parallel stack).",
    profile: "speech-to-text",
    mode: "review",
    topicId: "topic-review-stt",
    // Dimensions this fixture declares; "interruption-recovery" is not among them.
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: true,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: false,
        expectedContradictionPolicies: ["disclose-and-defer"],
    },
};
|
|
28
|
+
/**
 * Voice fixture "voice-realtime-barge-in-recovery": decide mode under the `full-realtime` profile
 * with `interruption: true`. It additionally declares the "interruption-recovery" dimension, and
 * its oracle sets `expectsRecoveredContext`.
 */
export const voiceRealtimeBargeInRecovery = {
    name: "voice-realtime-barge-in-recovery",
    category: "voice",
    description: "Decide mode under full-realtime voice with a barge-in interruption; recovery must preserve context (AC4).",
    profile: "full-realtime",
    mode: "decide",
    topicId: "topic-decide-realtime",
    // Marks this fixture as carrying an interrupted turn.
    interruption: true,
    // Seven dimensions: the six shared ones plus "interruption-recovery".
    dimensions: new Set([
        "mode-appropriateness",
        "disagreement-completeness",
        "uncertainty-discipline",
        "evidence-citation-discipline",
        "correction-handling",
        "interruption-recovery",
        "capability-gating",
    ]),
    // Expected values the fixture declares for this mode/profile pair.
    oracle: {
        expectedMandatedFacets: ["evidence", "assumptions", "uncertainty"],
        expectedGatingAllowed: true,
        expectedUncertaintyDisclosure: true,
        expectedDecisionRecommendation: true,
        expectedContradictionPolicies: ["disclose-and-defer"],
        expectsRecoveredContext: true,
    },
};
|
|
54
|
+
/** The two voice-profile fixtures: speech-to-text review, then full-realtime barge-in recovery. */
export const VOICE_FIXTURES = [voiceSttReview, voiceRealtimeBargeInRecovery];
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { deriveDiscussionObservation, runDiscussionEvaluation } from "./runner.js";
|
|
2
|
+
export { scoreDiscussionQuality, aggregateDiscussionQuality } from "./scorer.js";
|
|
3
|
+
export { renderDiscussionSummary } from "./render.js";
|
|
4
|
+
export { ALL_DISCUSSION_FIXTURES, discussionFixturesForCategory, discussionFixtureByName, } from "./fixtures/index.js";
|
|
5
|
+
export { DISCUSSION_QUALITY_DIMENSIONS, DISCUSSION_FIXTURE_CATEGORIES, DISCUSSION_EVAL_SCHEMA_VERSION, type DiscussionQualityDimension, type DiscussionFixtureCategory, type DiscussionOracle, type DiscussionEvalFixture, type DiscussionRecoveryTrajectory, type DiscussionObservation, type DiscussionQualityOutcome, type DiscussionDimensionResult, type DiscussionFixtureResult, type DiscussionScorecardEntry, type DiscussionEvalSummary, type DiscussionScorecard, } from "./types.js";
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/discussion/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,2BAA2B,EAAE,uBAAuB,EAAE,MAAM,aAAa,CAAC;AACnF,OAAO,EAAE,sBAAsB,EAAE,0BAA0B,EAAE,MAAM,aAAa,CAAC;AACjF,OAAO,EAAE,uBAAuB,EAAE,MAAM,aAAa,CAAC;AACtD,OAAO,EACL,uBAAuB,EACvB,6BAA6B,EAC7B,uBAAuB,GACxB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,6BAA6B,EAC7B,6BAA6B,EAC7B,8BAA8B,EAC9B,KAAK,0BAA0B,EAC/B,KAAK,yBAAyB,EAC9B,KAAK,gBAAgB,EACrB,KAAK,qBAAqB,EAC1B,KAAK,4BAA4B,EACjC,KAAK,qBAAqB,EAC1B,KAAK,wBAAwB,EAC7B,KAAK,yBAAyB,EAC9B,KAAK,uBAAuB,EAC5B,KAAK,wBAAwB,EAC7B,KAAK,qBAAqB,EAC1B,KAAK,mBAAmB,GACzB,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Public barrel for the Discussion Intelligence evaluation suite (Epic #491, Issue #502; ADR-0065).
|
|
2
|
+
// Exposes the deterministic observation derivation, the seven-dimension scorer, the suite runner, the
|
|
3
|
+
// scorecard renderer, the fixture registry, and the result/fixture types. Re-exported from the package
|
|
4
|
+
// barrel as a single auditable namespace (`DiscussionEval`), mirroring `PromptEnhancerEval`.
|
|
5
|
+
export { deriveDiscussionObservation, runDiscussionEvaluation } from "./runner.js";
|
|
6
|
+
export { scoreDiscussionQuality, aggregateDiscussionQuality } from "./scorer.js";
|
|
7
|
+
export { renderDiscussionSummary } from "./render.js";
|
|
8
|
+
export { ALL_DISCUSSION_FIXTURES, discussionFixturesForCategory, discussionFixtureByName, } from "./fixtures/index.js";
|
|
9
|
+
export { DISCUSSION_QUALITY_DIMENSIONS, DISCUSSION_FIXTURE_CATEGORIES, DISCUSSION_EVAL_SCHEMA_VERSION, } from "./types.js";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"render.d.ts","sourceRoot":"","sources":["../../src/discussion/render.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAGV,mBAAmB,EAEpB,MAAM,YAAY,CAAC;AA2BpB,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,mBAAmB,GAAG,MAAM,CA6B9E"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// renderDiscussionSummary (Issue #502): DiscussionScorecard -> human-readable string. One line per
|
|
2
|
+
// fixture (name, category, dimension pass/fail glyphs), a per-dimension table, the covered-mode line,
|
|
3
|
+
// the AC6 no-voice / voice coverage line, and a Go/No-Go verdict. The scorecard carries only
|
|
4
|
+
// harness-authored, content-free fields (counts, closed-vocabulary labels, numeric scores), so this
|
|
5
|
+
// renderer performs no redaction — it only formats fields that are safe to print.
|
|
6
|
+
// Map a dimension result's `outcome` to its printed label: "pass" -> "PASS", "fail" -> "FAIL",
// and every other value -> "n/a".
function glyph(result) {
    switch (result.outcome) {
        case "pass":
            return "PASS";
        case "fail":
            return "FAIL";
        default:
            return "n/a";
    }
}
|
|
15
|
+
// Build the one-line summary for a fixture result: "- <name> [<category>] <OK|FAIL>" followed by
// a `dimension=LABEL` pair for every dimension whose outcome is not "not-applicable".
function fixtureLine(fixture) {
    const scoredPairs = [];
    for (const result of fixture.dimensionResults) {
        if (result.outcome === "not-applicable") {
            continue;
        }
        scoredPairs.push(`${result.dimension}=${glyph(result)}`);
    }
    const status = fixture.fullyPassed ? "OK" : "FAIL";
    const line = `- ${fixture.fixtureName} [${fixture.category}] ${status} ${scoredPairs.join(" ")}`;
    // With no scored dimensions the line would end in a dangling space; strip it.
    return line.trimEnd();
}
|
|
23
|
+
// Format one row of the per-dimension table: padded dimension name, padded verdict, the three
// counts, and the pass rate as a whole-number percentage ("n/a" when `passRate` is null).
function dimensionLine(entry) {
    let rate = "n/a";
    if (entry.passRate !== null) {
        rate = `${(entry.passRate * 100).toFixed(0)}%`;
    }
    // Any failure makes the row FAIL; otherwise at least one pass makes it PASS.
    let verdict = "n/a";
    if (entry.failCount > 0) {
        verdict = "FAIL";
    }
    else if (entry.passCount > 0) {
        verdict = "PASS";
    }
    const cells = [
        "", // leading empty cell yields the row's single leading space
        entry.dimension.padEnd(28),
        verdict.padEnd(5),
        `pass=${String(entry.passCount)}`,
        `fail=${String(entry.failCount)}`,
        `n/a=${String(entry.notApplicableCount)}`,
        `rate=${rate}`,
    ];
    return cells.join(" ");
}
|
|
28
|
+
/**
 * Render a discussion scorecard as a newline-joined, human-readable report.
 *
 * The report contains a header with the schema version, fixture totals, the covered modes, the
 * no-voice / voice coverage flags, one line per fixture, one row per dimension, and a closing
 * verdict line chosen by `summary.goNoGo`.
 *
 * @param scorecard Scorecard to render; only read, never mutated.
 * @returns The report text (no trailing newline).
 */
export function renderDiscussionSummary(scorecard) {
    const yesNo = (flag) => (flag ? "yes" : "no");
    const report = [
        `Discussion Intelligence evaluation summary (schema v${scorecard.schemaVersion})`,
        `Fixtures: ${String(scorecard.summary.totalFixtures)} total, ${String(scorecard.summary.fullyPassedFixtures)} fully passed`,
        `Modes covered: ${String(scorecard.coveredModes.length)} (${scorecard.coveredModes.join(", ")})`,
        `Profile coverage: no-voice=${yesNo(scorecard.summary.coversNoVoiceProfile)} voice=${yesNo(scorecard.summary.coversVoiceProfile)}`,
        "",
        "Fixtures:",
        ...Array.from(scorecard.fixtureResults, (fixture) => fixtureLine(fixture)),
        "",
        "Dimensions:",
        ...Array.from(scorecard.dimensions, (entry) => dimensionLine(entry)),
        "",
        // Anything other than an exact "GO" is reported as NO-GO.
        scorecard.summary.goNoGo === "GO"
            ? "Verdict: GO - every exercised discussion-quality dimension passed across no-voice and voice profiles."
            : "Verdict: NO-GO - a dimension failed or a profile coverage gate was unmet (see table above).",
    ];
    return report.join("\n");
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { type DiscussionEvalFixture, type DiscussionObservation, type DiscussionScorecard } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Derive the deterministic observation for one fixture: the mode plan, its directive templates rendered
|
|
4
|
+
* verbatim, the voice-gating verdict, and (when the fixture declares one) the interruption-recovery
|
|
5
|
+
* trajectory. Pure data derivation over the frozen contract tables.
|
|
6
|
+
*/
|
|
7
|
+
export declare function deriveDiscussionObservation(fixture: DiscussionEvalFixture): DiscussionObservation;
|
|
8
|
+
/**
|
|
9
|
+
* Run the Discussion Intelligence evaluation suite and return a fully aggregated scorecard. Pure and
|
|
10
|
+
* deterministic. Pass an explicit fixture list to scope the run (the suite tests use the default set).
|
|
11
|
+
*/
|
|
12
|
+
export declare function runDiscussionEvaluation(fixtures?: readonly DiscussionEvalFixture[]): DiscussionScorecard;
|
|
13
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/discussion/runner.ts"],"names":[],"mappings":"AAiBA,OAAO,EAEL,KAAK,qBAAqB,EAG1B,KAAK,qBAAqB,EAE1B,KAAK,mBAAmB,EAEzB,MAAM,YAAY,CAAC;AAkBpB;;;;GAIG;AACH,wBAAgB,2BAA2B,CAAC,OAAO,EAAE,qBAAqB,GAAG,qBAAqB,CAejG;AA8BD;;;GAGG;AACH,wBAAgB,uBAAuB,CACrC,QAAQ,GAAE,SAAS,qBAAqB,EAA4B,GACnE,mBAAmB,CAarB"}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
// Discussion Intelligence evaluation runner (Epic #491, Issue #502; ADR-0065).
|
|
2
|
+
//
|
|
3
|
+
// Derives a deterministic observation for each fixture from the frozen keiko-contracts tables and the
|
|
4
|
+
// pure recovery helpers, scores the seven discussion-quality dimensions, aggregates a scorecard, and
|
|
5
|
+
// derives the offline Go/No-Go verdict. Pure: no IO, clock, randomness, or model dispatch.
|
|
6
|
+
import { DISCUSSION_DIRECTIVE_TEMPLATES, applyDiscussionInterruption, applyDiscussionRecovery, beginDiscussionTurn, discussionModePlan, voiceCanDriveDiscussion, } from "@oscharko-dev/keiko-contracts";
|
|
7
|
+
import { ALL_DISCUSSION_FIXTURES } from "./fixtures/index.js";
|
|
8
|
+
import { aggregateDiscussionQuality, scoreDiscussionQuality } from "./scorer.js";
|
|
9
|
+
import { DISCUSSION_EVAL_SCHEMA_VERSION, } from "./types.js";
|
|
10
|
+
// Turn index passed to `beginDiscussionTurn` for every fixture. One constant is enough because
// what matters for recovery is that the value survives interrupt -> recover, not how large it is.
const FIXTURE_TURN_INDEX = 0;
// Build the three-step trajectory for an interrupted turn: begin it, interrupt the begun turn,
// then recover the interrupted turn. Each step's result is returned so the scorer can compare
// them (AC4).
function deriveRecoveryTrajectory(mode, topicId) {
    const begun = beginDiscussionTurn(mode, topicId, FIXTURE_TURN_INDEX);
    const paused = applyDiscussionInterruption(begun);
    return {
        initial: begun,
        interrupted: paused,
        recovered: applyDiscussionRecovery(paused),
    };
}
|
|
21
|
+
/**
 * Derive the observation for one fixture.
 *
 * Looks up the mode plan for `fixture.mode`, maps each of the plan's directives to its entry in
 * `DISCUSSION_DIRECTIVE_TEMPLATES`, and asks `voiceCanDriveDiscussion` whether `fixture.profile`
 * is allowed. A `recovery` trajectory is attached only when `fixture.interruption` is exactly
 * `true`; otherwise the returned object has no `recovery` key.
 *
 * @param fixture Fixture to observe.
 * @returns `{ plan, renderedDirectives, gatingAllowed }`, plus `recovery` for interrupted fixtures.
 */
export function deriveDiscussionObservation(fixture) {
    const plan = discussionModePlan(fixture.mode);
    const base = {
        plan,
        renderedDirectives: plan.directives.map((directive) => DISCUSSION_DIRECTIVE_TEMPLATES[directive]),
        gatingAllowed: voiceCanDriveDiscussion(fixture.profile),
    };
    return fixture.interruption === true
        ? { ...base, recovery: deriveRecoveryTrajectory(fixture.mode, fixture.topicId) }
        : base;
}
|
|
40
|
+
// Evaluate a single fixture: derive its observation, score it, and package the result. A fixture
// is "fully passed" when none of its dimension results has the outcome "fail" (so a fixture with
// only "not-applicable" results also counts as fully passed).
function runFixture(fixture) {
    const observation = deriveDiscussionObservation(fixture);
    const dimensionResults = scoreDiscussionQuality(fixture, observation);
    const hasFailure = dimensionResults.some((result) => result.outcome === "fail");
    return {
        fixtureName: fixture.name,
        category: fixture.category,
        observation,
        dimensionResults,
        fullyPassed: !hasFailure,
    };
}
|
|
51
|
+
// Collapse fixture results and per-dimension entries into the scorecard summary. The verdict is
// "GO" only when no dimension has a failure AND both coverage gates are met; otherwise "NO-GO".
function summarize(fixtureResults, dimensions) {
    const anyDimensionFailed = dimensions.some((entry) => entry.failCount !== 0);
    // NOTE(review): the no-voice gate keys on the fixture *category*, while the voice gate keys on
    // the observed gating verdict. A `none`-profile fixture in another category therefore does not
    // count towards no-voice coverage - confirm this asymmetry is intended.
    const coversNoVoiceProfile = fixtureResults.some(({ category }) => category === "no-voice");
    const coversVoiceProfile = fixtureResults.some(({ observation }) => observation.gatingAllowed);
    const fullyPassedFixtures = fixtureResults.reduce((count, result) => (result.fullyPassed ? count + 1 : count), 0);
    const allGatesMet = !anyDimensionFailed && coversNoVoiceProfile && coversVoiceProfile;
    return {
        totalFixtures: fixtureResults.length,
        fullyPassedFixtures,
        coversNoVoiceProfile,
        coversVoiceProfile,
        goNoGo: allGatesMet ? "GO" : "NO-GO",
    };
}
|
|
63
|
+
/**
 * Run the discussion evaluation over a list of fixtures and assemble the scorecard.
 *
 * Each fixture is evaluated independently, the per-fixture dimension results are aggregated into
 * per-dimension entries, and the distinct plan modes are collected in first-seen order.
 *
 * @param fixtures Fixtures to run; defaults to the full registry (`ALL_DISCUSSION_FIXTURES`).
 * @returns `{ schemaVersion, fixtureResults, dimensions, summary, coveredModes }`.
 */
export function runDiscussionEvaluation(fixtures = ALL_DISCUSSION_FIXTURES) {
    const fixtureResults = fixtures.map((fixture) => runFixture(fixture));
    const dimensions = aggregateDiscussionQuality(fixtureResults.map((result) => result.dimensionResults));
    // A Set removes duplicate modes while preserving the order in which they first appear.
    const seenModes = new Set();
    for (const result of fixtureResults) {
        seenModes.add(result.observation.plan.mode);
    }
    const coveredModes = Array.from(seenModes);
    const summary = summarize(fixtureResults, dimensions);
    return {
        schemaVersion: DISCUSSION_EVAL_SCHEMA_VERSION,
        fixtureResults,
        dimensions,
        summary,
        coveredModes,
    };
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { type DiscussionDimensionResult, type DiscussionEvalFixture, type DiscussionObservation, type DiscussionScorecardEntry } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Score one fixture's observation across all seven dimensions. A dimension the fixture does not declare
|
|
4
|
+
* is "not-applicable". Pure.
|
|
5
|
+
*/
|
|
6
|
+
export declare function scoreDiscussionQuality(fixture: DiscussionEvalFixture, obs: DiscussionObservation): readonly DiscussionDimensionResult[];
|
|
7
|
+
export declare function aggregateDiscussionQuality(results: readonly (readonly DiscussionDimensionResult[])[]): readonly DiscussionScorecardEntry[];
|
|
8
|
+
//# sourceMappingURL=scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../src/discussion/scorer.ts"],"names":[],"mappings":"AAuBA,OAAO,EAEL,KAAK,yBAAyB,EAC9B,KAAK,qBAAqB,EAC1B,KAAK,qBAAqB,EAG1B,KAAK,wBAAwB,EAC9B,MAAM,YAAY,CAAC;AAyNpB;;;GAGG;AACH,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,qBAAqB,EAC9B,GAAG,EAAE,qBAAqB,GACzB,SAAS,yBAAyB,EAAE,CAUtC;AA8BD,wBAAgB,0BAA0B,CACxC,OAAO,EAAE,SAAS,CAAC,SAAS,yBAAyB,EAAE,CAAC,EAAE,GACzD,SAAS,wBAAwB,EAAE,CAErC"}
|