@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +60 -1
- package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.js +6 -4
- package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
- package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/esm/simulator-ui/src/verify_unified.js +25 -0
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/esm/src/server.d.ts.map +1 -1
- package/esm/src/server.js +227 -45
- package/esm/src/server_isograph_environment.d.ts +11 -4
- package/esm/src/server_isograph_environment.d.ts.map +1 -1
- package/esm/src/server_isograph_environment.js +5 -0
- package/esm/src/simulator_graphql.d.ts +11 -4
- package/esm/src/simulator_graphql.d.ts.map +1 -1
- package/esm/src/simulator_graphql.js +278 -245
- package/package.json +2 -2
- package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.js +6 -4
- package/script/simulator-ui/src/verify_unified.d.ts +17 -0
- package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/script/simulator-ui/src/verify_unified.js +29 -0
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/script/src/server.d.ts.map +1 -1
- package/script/src/server.js +227 -45
- package/script/src/server_isograph_environment.d.ts +11 -4
- package/script/src/server_isograph_environment.d.ts.map +1 -1
- package/script/src/server_isograph_environment.js +5 -0
- package/script/src/simulator_graphql.d.ts +11 -4
- package/script/src/simulator_graphql.d.ts.map +1 -1
- package/script/src/simulator_graphql.js +278 -245
- package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/esm/simulator-ui/src/verify_metrics.js +0 -260
- package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/script/simulator-ui/src/verify_metrics.js +0 -264
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
export type VerifyVerdict = "PASS" | "WARN" | "FAIL";
|
|
2
|
-
export type VerifyCalibrationRun = {
|
|
3
|
-
id: string;
|
|
4
|
-
status: "running" | "completed" | "error";
|
|
5
|
-
runAt?: string;
|
|
6
|
-
result?: unknown;
|
|
7
|
-
};
|
|
8
|
-
export type VerifyOutlier = {
|
|
9
|
-
key: string;
|
|
10
|
-
label: string;
|
|
11
|
-
sampleSize: number;
|
|
12
|
-
agreementRate: number | null;
|
|
13
|
-
scoreDelta: number | null;
|
|
14
|
-
passFlip: boolean;
|
|
15
|
-
instability: boolean;
|
|
16
|
-
minScore: number | null;
|
|
17
|
-
maxScore: number | null;
|
|
18
|
-
minRunId?: string;
|
|
19
|
-
maxRunId?: string;
|
|
20
|
-
turnIndex?: number;
|
|
21
|
-
messageRefId?: string;
|
|
22
|
-
};
|
|
23
|
-
export type VerifyConsistencyReport = {
|
|
24
|
-
sampleSize: number;
|
|
25
|
-
comparableExampleCount: number;
|
|
26
|
-
agreementRate: number | null;
|
|
27
|
-
scoreSpreadMin: number | null;
|
|
28
|
-
scoreSpreadMedian: number | null;
|
|
29
|
-
scoreSpreadMax: number | null;
|
|
30
|
-
instabilityCount: number;
|
|
31
|
-
verdict: VerifyVerdict;
|
|
32
|
-
verdictReason: string;
|
|
33
|
-
outliers: Array<VerifyOutlier>;
|
|
34
|
-
};
|
|
35
|
-
export declare const VERIFY_CONSISTENCY_THRESHOLDS: {
|
|
36
|
-
readonly minSampleSize: 6;
|
|
37
|
-
readonly instabilityScoreDelta: 1.5;
|
|
38
|
-
readonly pass: {
|
|
39
|
-
readonly agreementMin: 0.9;
|
|
40
|
-
readonly maxSpread: 1;
|
|
41
|
-
readonly maxInstabilityCount: 0;
|
|
42
|
-
};
|
|
43
|
-
readonly warn: {
|
|
44
|
-
readonly agreementMin: 0.75;
|
|
45
|
-
readonly maxSpread: 2;
|
|
46
|
-
readonly maxInstabilityCount: 2;
|
|
47
|
-
};
|
|
48
|
-
};
|
|
49
|
-
export declare function buildVerifyConsistencyReport(runs: Array<VerifyCalibrationRun>): VerifyConsistencyReport;
|
|
50
|
-
//# sourceMappingURL=verify_metrics.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"verify_metrics.d.ts","sourceRoot":"","sources":["../../../src/simulator-ui/src/verify_metrics.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,aAAa,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAC;AAErD,MAAM,MAAM,oBAAoB,GAAG;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB,CAAC;AAkBF,MAAM,MAAM,aAAa,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,QAAQ,EAAE,OAAO,CAAC;IAClB,WAAW,EAAE,OAAO,CAAC;IACrB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF,MAAM,MAAM,uBAAuB,GAAG;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,aAAa,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,KAAK,CAAC,aAAa,CAAC,CAAC;CAChC,CAAC;AAEF,eAAO,MAAM,6BAA6B;;;;;;;;;;;;;CAahC,CAAC;AA0JX,wBAAgB,4BAA4B,CAC1C,IAAI,EAAE,KAAK,CAAC,oBAAoB,CAAC,GAChC,uBAAuB,CA0HzB"}
|
|
@@ -1,260 +0,0 @@
|
|
|
1
|
-
export const VERIFY_CONSISTENCY_THRESHOLDS = {
|
|
2
|
-
minSampleSize: 6,
|
|
3
|
-
instabilityScoreDelta: 1.5,
|
|
4
|
-
pass: {
|
|
5
|
-
agreementMin: 0.9,
|
|
6
|
-
maxSpread: 1,
|
|
7
|
-
maxInstabilityCount: 0,
|
|
8
|
-
},
|
|
9
|
-
warn: {
|
|
10
|
-
agreementMin: 0.75,
|
|
11
|
-
maxSpread: 2,
|
|
12
|
-
maxInstabilityCount: 2,
|
|
13
|
-
},
|
|
14
|
-
};
|
|
15
|
-
const pickPayload = (result) => {
|
|
16
|
-
if (!result || typeof result !== "object")
|
|
17
|
-
return {};
|
|
18
|
-
const record = result;
|
|
19
|
-
if (record.payload &&
|
|
20
|
-
typeof record.payload === "object" &&
|
|
21
|
-
!Array.isArray(record.payload)) {
|
|
22
|
-
return record.payload;
|
|
23
|
-
}
|
|
24
|
-
return record;
|
|
25
|
-
};
|
|
26
|
-
const extractScoreReasonPass = (result) => {
|
|
27
|
-
const payload = pickPayload(result);
|
|
28
|
-
const score = typeof payload.score === "number" && Number.isFinite(payload.score)
|
|
29
|
-
? payload.score
|
|
30
|
-
: undefined;
|
|
31
|
-
const reason = typeof payload.reason === "string"
|
|
32
|
-
? payload.reason
|
|
33
|
-
: undefined;
|
|
34
|
-
if (typeof payload.pass === "boolean") {
|
|
35
|
-
return { score, reason, pass: payload.pass };
|
|
36
|
-
}
|
|
37
|
-
if (typeof payload.passed === "boolean") {
|
|
38
|
-
return { score, reason, pass: payload.passed };
|
|
39
|
-
}
|
|
40
|
-
if (typeof payload.verdict === "string") {
|
|
41
|
-
const verdict = payload.verdict.trim().toLowerCase();
|
|
42
|
-
if (verdict === "pass")
|
|
43
|
-
return { score, reason, pass: true };
|
|
44
|
-
if (verdict === "fail")
|
|
45
|
-
return { score, reason, pass: false };
|
|
46
|
-
}
|
|
47
|
-
if (typeof score === "number") {
|
|
48
|
-
return { score, reason, pass: score >= 0 };
|
|
49
|
-
}
|
|
50
|
-
return { score, reason };
|
|
51
|
-
};
|
|
52
|
-
const flattenRunExamples = (run) => {
|
|
53
|
-
if (!run.result || typeof run.result !== "object")
|
|
54
|
-
return [];
|
|
55
|
-
const record = run.result;
|
|
56
|
-
if (record.mode === "turns" && Array.isArray(record.turns)) {
|
|
57
|
-
const buckets = [];
|
|
58
|
-
record.turns.forEach((turn, fallbackIndex) => {
|
|
59
|
-
if (!turn || typeof turn !== "object")
|
|
60
|
-
return;
|
|
61
|
-
const turnRecord = turn;
|
|
62
|
-
const index = typeof turnRecord.index === "number" &&
|
|
63
|
-
Number.isFinite(turnRecord.index)
|
|
64
|
-
? Math.max(0, Math.round(turnRecord.index))
|
|
65
|
-
: fallbackIndex;
|
|
66
|
-
const messageRefId = typeof turnRecord.messageRefId === "string" &&
|
|
67
|
-
turnRecord.messageRefId.trim().length > 0
|
|
68
|
-
? turnRecord.messageRefId
|
|
69
|
-
: undefined;
|
|
70
|
-
const key = messageRefId ? `ref:${messageRefId}` : `turn:${index}`;
|
|
71
|
-
const label = `Assistant turn ${fallbackIndex + 1}`;
|
|
72
|
-
const parsed = extractScoreReasonPass(turnRecord.result);
|
|
73
|
-
buckets.push({
|
|
74
|
-
key,
|
|
75
|
-
label,
|
|
76
|
-
points: [{
|
|
77
|
-
runId: run.id,
|
|
78
|
-
runAt: run.runAt,
|
|
79
|
-
score: parsed.score,
|
|
80
|
-
pass: parsed.pass,
|
|
81
|
-
reason: parsed.reason,
|
|
82
|
-
turnIndex: index,
|
|
83
|
-
messageRefId,
|
|
84
|
-
}],
|
|
85
|
-
});
|
|
86
|
-
});
|
|
87
|
-
return buckets;
|
|
88
|
-
}
|
|
89
|
-
const parsed = extractScoreReasonPass(run.result);
|
|
90
|
-
return [{
|
|
91
|
-
key: "conversation",
|
|
92
|
-
label: "Conversation score",
|
|
93
|
-
points: [{
|
|
94
|
-
runId: run.id,
|
|
95
|
-
runAt: run.runAt,
|
|
96
|
-
score: parsed.score,
|
|
97
|
-
pass: parsed.pass,
|
|
98
|
-
reason: parsed.reason,
|
|
99
|
-
}],
|
|
100
|
-
}];
|
|
101
|
-
};
|
|
102
|
-
const median = (values) => {
|
|
103
|
-
if (!values.length)
|
|
104
|
-
return null;
|
|
105
|
-
const sorted = [...values].sort((a, b) => a - b);
|
|
106
|
-
const mid = Math.floor(sorted.length / 2);
|
|
107
|
-
if (sorted.length % 2 === 1)
|
|
108
|
-
return sorted[mid];
|
|
109
|
-
return (sorted[mid - 1] + sorted[mid]) / 2;
|
|
110
|
-
};
|
|
111
|
-
const round2 = (value) => Math.round(value * 100) / 100;
|
|
112
|
-
const resolveVerdict = (input) => {
|
|
113
|
-
const t = VERIFY_CONSISTENCY_THRESHOLDS;
|
|
114
|
-
if (input.sampleSize < t.minSampleSize) {
|
|
115
|
-
return {
|
|
116
|
-
verdict: "WARN",
|
|
117
|
-
reason: `Need at least ${t.minSampleSize} samples before issuing a firm verdict.`,
|
|
118
|
-
};
|
|
119
|
-
}
|
|
120
|
-
if (input.agreementRate === null) {
|
|
121
|
-
return {
|
|
122
|
-
verdict: "WARN",
|
|
123
|
-
reason: "No comparable pass/fail evidence was found in the sampled runs.",
|
|
124
|
-
};
|
|
125
|
-
}
|
|
126
|
-
const spreadMax = input.spreadMax ?? 0;
|
|
127
|
-
if (input.agreementRate >= t.pass.agreementMin &&
|
|
128
|
-
spreadMax <= t.pass.maxSpread &&
|
|
129
|
-
input.instabilityCount <= t.pass.maxInstabilityCount) {
|
|
130
|
-
return {
|
|
131
|
-
verdict: "PASS",
|
|
132
|
-
reason: "Agreement, spread, and instability all meet PASS thresholds.",
|
|
133
|
-
};
|
|
134
|
-
}
|
|
135
|
-
if (input.agreementRate >= t.warn.agreementMin &&
|
|
136
|
-
spreadMax <= t.warn.maxSpread &&
|
|
137
|
-
input.instabilityCount <= t.warn.maxInstabilityCount) {
|
|
138
|
-
return {
|
|
139
|
-
verdict: "WARN",
|
|
140
|
-
reason: "Some variation was detected, but results remain within WARN thresholds.",
|
|
141
|
-
};
|
|
142
|
-
}
|
|
143
|
-
return {
|
|
144
|
-
verdict: "FAIL",
|
|
145
|
-
reason: "Agreement/spread instability exceeds WARN thresholds.",
|
|
146
|
-
};
|
|
147
|
-
};
|
|
148
|
-
export function buildVerifyConsistencyReport(runs) {
|
|
149
|
-
const completedRuns = runs.filter((run) => run.status === "completed");
|
|
150
|
-
const sampleSize = completedRuns.length;
|
|
151
|
-
const bucketsByKey = new Map();
|
|
152
|
-
completedRuns.forEach((run) => {
|
|
153
|
-
flattenRunExamples(run).forEach((entry) => {
|
|
154
|
-
const existing = bucketsByKey.get(entry.key);
|
|
155
|
-
if (!existing) {
|
|
156
|
-
bucketsByKey.set(entry.key, {
|
|
157
|
-
key: entry.key,
|
|
158
|
-
label: entry.label,
|
|
159
|
-
points: [...entry.points],
|
|
160
|
-
});
|
|
161
|
-
return;
|
|
162
|
-
}
|
|
163
|
-
existing.points.push(...entry.points);
|
|
164
|
-
});
|
|
165
|
-
});
|
|
166
|
-
const outliers = [];
|
|
167
|
-
let agreementVotes = 0;
|
|
168
|
-
let agreementTotal = 0;
|
|
169
|
-
const scoreDeltas = [];
|
|
170
|
-
bucketsByKey.forEach((bucket) => {
|
|
171
|
-
const scores = bucket.points
|
|
172
|
-
.map((point) => point.score)
|
|
173
|
-
.filter((score) => typeof score === "number" && Number.isFinite(score));
|
|
174
|
-
const minScore = scores.length ? Math.min(...scores) : null;
|
|
175
|
-
const maxScore = scores.length ? Math.max(...scores) : null;
|
|
176
|
-
const scoreDelta = minScore !== null && maxScore !== null
|
|
177
|
-
? round2(maxScore - minScore)
|
|
178
|
-
: null;
|
|
179
|
-
const passVotes = bucket.points
|
|
180
|
-
.map((point) => point.pass)
|
|
181
|
-
.filter((pass) => typeof pass === "boolean");
|
|
182
|
-
const passCount = passVotes.filter((value) => value).length;
|
|
183
|
-
const failCount = passVotes.length - passCount;
|
|
184
|
-
const agreementRate = passVotes.length > 0
|
|
185
|
-
? round2(Math.max(passCount, failCount) / passVotes.length)
|
|
186
|
-
: null;
|
|
187
|
-
if (passVotes.length > 0) {
|
|
188
|
-
agreementVotes += Math.max(passCount, failCount);
|
|
189
|
-
agreementTotal += passVotes.length;
|
|
190
|
-
}
|
|
191
|
-
if (scoreDelta !== null) {
|
|
192
|
-
scoreDeltas.push(scoreDelta);
|
|
193
|
-
}
|
|
194
|
-
const passFlip = passCount > 0 && failCount > 0;
|
|
195
|
-
const instability = passFlip ||
|
|
196
|
-
(scoreDelta !== null &&
|
|
197
|
-
scoreDelta > VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
|
|
198
|
-
const minPoint = minScore === null
|
|
199
|
-
? undefined
|
|
200
|
-
: bucket.points.find((point) => point.score === minScore);
|
|
201
|
-
const maxPoint = maxScore === null
|
|
202
|
-
? undefined
|
|
203
|
-
: bucket.points.find((point) => point.score === maxScore);
|
|
204
|
-
outliers.push({
|
|
205
|
-
key: bucket.key,
|
|
206
|
-
label: bucket.label,
|
|
207
|
-
sampleSize: bucket.points.length,
|
|
208
|
-
agreementRate,
|
|
209
|
-
scoreDelta,
|
|
210
|
-
passFlip,
|
|
211
|
-
instability,
|
|
212
|
-
minScore,
|
|
213
|
-
maxScore,
|
|
214
|
-
minRunId: minPoint?.runId,
|
|
215
|
-
maxRunId: maxPoint?.runId,
|
|
216
|
-
turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
|
|
217
|
-
messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
|
|
218
|
-
});
|
|
219
|
-
});
|
|
220
|
-
outliers.sort((a, b) => {
|
|
221
|
-
if (a.instability !== b.instability)
|
|
222
|
-
return a.instability ? -1 : 1;
|
|
223
|
-
if (a.passFlip !== b.passFlip)
|
|
224
|
-
return a.passFlip ? -1 : 1;
|
|
225
|
-
const aDelta = a.scoreDelta ?? -1;
|
|
226
|
-
const bDelta = b.scoreDelta ?? -1;
|
|
227
|
-
if (aDelta !== bDelta)
|
|
228
|
-
return bDelta - aDelta;
|
|
229
|
-
if (a.sampleSize !== b.sampleSize)
|
|
230
|
-
return b.sampleSize - a.sampleSize;
|
|
231
|
-
return a.label.localeCompare(b.label);
|
|
232
|
-
});
|
|
233
|
-
const agreementRate = agreementTotal > 0
|
|
234
|
-
? round2(agreementVotes / agreementTotal)
|
|
235
|
-
: null;
|
|
236
|
-
const scoreSpreadMin = scoreDeltas.length ? Math.min(...scoreDeltas) : null;
|
|
237
|
-
const scoreSpreadMax = scoreDeltas.length ? Math.max(...scoreDeltas) : null;
|
|
238
|
-
const scoreSpreadMedian = median(scoreDeltas);
|
|
239
|
-
const instabilityCount = outliers.filter((entry) => entry.instability).length;
|
|
240
|
-
const verdict = resolveVerdict({
|
|
241
|
-
sampleSize,
|
|
242
|
-
agreementRate,
|
|
243
|
-
spreadMax: scoreSpreadMax,
|
|
244
|
-
instabilityCount,
|
|
245
|
-
});
|
|
246
|
-
return {
|
|
247
|
-
sampleSize,
|
|
248
|
-
comparableExampleCount: outliers.length,
|
|
249
|
-
agreementRate,
|
|
250
|
-
scoreSpreadMin: scoreSpreadMin === null ? null : round2(scoreSpreadMin),
|
|
251
|
-
scoreSpreadMedian: scoreSpreadMedian === null
|
|
252
|
-
? null
|
|
253
|
-
: round2(scoreSpreadMedian),
|
|
254
|
-
scoreSpreadMax: scoreSpreadMax === null ? null : round2(scoreSpreadMax),
|
|
255
|
-
instabilityCount,
|
|
256
|
-
verdict: verdict.verdict,
|
|
257
|
-
verdictReason: verdict.reason,
|
|
258
|
-
outliers,
|
|
259
|
-
};
|
|
260
|
-
}
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
export type VerifyVerdict = "PASS" | "WARN" | "FAIL";
|
|
2
|
-
export type VerifyCalibrationRun = {
|
|
3
|
-
id: string;
|
|
4
|
-
status: "running" | "completed" | "error";
|
|
5
|
-
runAt?: string;
|
|
6
|
-
result?: unknown;
|
|
7
|
-
};
|
|
8
|
-
export type VerifyOutlier = {
|
|
9
|
-
key: string;
|
|
10
|
-
label: string;
|
|
11
|
-
sampleSize: number;
|
|
12
|
-
agreementRate: number | null;
|
|
13
|
-
scoreDelta: number | null;
|
|
14
|
-
passFlip: boolean;
|
|
15
|
-
instability: boolean;
|
|
16
|
-
minScore: number | null;
|
|
17
|
-
maxScore: number | null;
|
|
18
|
-
minRunId?: string;
|
|
19
|
-
maxRunId?: string;
|
|
20
|
-
turnIndex?: number;
|
|
21
|
-
messageRefId?: string;
|
|
22
|
-
};
|
|
23
|
-
export type VerifyConsistencyReport = {
|
|
24
|
-
sampleSize: number;
|
|
25
|
-
comparableExampleCount: number;
|
|
26
|
-
agreementRate: number | null;
|
|
27
|
-
scoreSpreadMin: number | null;
|
|
28
|
-
scoreSpreadMedian: number | null;
|
|
29
|
-
scoreSpreadMax: number | null;
|
|
30
|
-
instabilityCount: number;
|
|
31
|
-
verdict: VerifyVerdict;
|
|
32
|
-
verdictReason: string;
|
|
33
|
-
outliers: Array<VerifyOutlier>;
|
|
34
|
-
};
|
|
35
|
-
export declare const VERIFY_CONSISTENCY_THRESHOLDS: {
|
|
36
|
-
readonly minSampleSize: 6;
|
|
37
|
-
readonly instabilityScoreDelta: 1.5;
|
|
38
|
-
readonly pass: {
|
|
39
|
-
readonly agreementMin: 0.9;
|
|
40
|
-
readonly maxSpread: 1;
|
|
41
|
-
readonly maxInstabilityCount: 0;
|
|
42
|
-
};
|
|
43
|
-
readonly warn: {
|
|
44
|
-
readonly agreementMin: 0.75;
|
|
45
|
-
readonly maxSpread: 2;
|
|
46
|
-
readonly maxInstabilityCount: 2;
|
|
47
|
-
};
|
|
48
|
-
};
|
|
49
|
-
export declare function buildVerifyConsistencyReport(runs: Array<VerifyCalibrationRun>): VerifyConsistencyReport;
|
|
50
|
-
//# sourceMappingURL=verify_metrics.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"verify_metrics.d.ts","sourceRoot":"","sources":["../../../src/simulator-ui/src/verify_metrics.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,aAAa,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAC;AAErD,MAAM,MAAM,oBAAoB,GAAG;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB,CAAC;AAkBF,MAAM,MAAM,aAAa,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,QAAQ,EAAE,OAAO,CAAC;IAClB,WAAW,EAAE,OAAO,CAAC;IACrB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF,MAAM,MAAM,uBAAuB,GAAG;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,aAAa,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,KAAK,CAAC,aAAa,CAAC,CAAC;CAChC,CAAC;AAEF,eAAO,MAAM,6BAA6B;;;;;;;;;;;;;CAahC,CAAC;AA0JX,wBAAgB,4BAA4B,CAC1C,IAAI,EAAE,KAAK,CAAC,oBAAoB,CAAC,GAChC,uBAAuB,CA0HzB"}
|
|
@@ -1,264 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.VERIFY_CONSISTENCY_THRESHOLDS = void 0;
|
|
4
|
-
exports.buildVerifyConsistencyReport = buildVerifyConsistencyReport;
|
|
5
|
-
exports.VERIFY_CONSISTENCY_THRESHOLDS = {
|
|
6
|
-
minSampleSize: 6,
|
|
7
|
-
instabilityScoreDelta: 1.5,
|
|
8
|
-
pass: {
|
|
9
|
-
agreementMin: 0.9,
|
|
10
|
-
maxSpread: 1,
|
|
11
|
-
maxInstabilityCount: 0,
|
|
12
|
-
},
|
|
13
|
-
warn: {
|
|
14
|
-
agreementMin: 0.75,
|
|
15
|
-
maxSpread: 2,
|
|
16
|
-
maxInstabilityCount: 2,
|
|
17
|
-
},
|
|
18
|
-
};
|
|
19
|
-
const pickPayload = (result) => {
|
|
20
|
-
if (!result || typeof result !== "object")
|
|
21
|
-
return {};
|
|
22
|
-
const record = result;
|
|
23
|
-
if (record.payload &&
|
|
24
|
-
typeof record.payload === "object" &&
|
|
25
|
-
!Array.isArray(record.payload)) {
|
|
26
|
-
return record.payload;
|
|
27
|
-
}
|
|
28
|
-
return record;
|
|
29
|
-
};
|
|
30
|
-
/**
 * Normalizes a grader result into `{ score, reason, pass }`.
 *
 * `score` is kept only when it is a finite number and `reason` only when it is
 * a string; otherwise each is `undefined`. The pass flag is taken from the
 * first source that applies: a boolean `pass`, a boolean `passed`, a `verdict`
 * string equal to "pass"/"fail" (trimmed, case-insensitive), and finally
 * `score >= 0` when a score exists. When none applies the returned object has
 * no `pass` property.
 *
 * @param {unknown} result - Raw result; fields are read via `pickPayload`.
 * @returns {{score: (number|undefined), reason: (string|undefined), pass?: boolean}}
 */
const extractScoreReasonPass = (result) => {
    const {
        score: rawScore,
        reason: rawReason,
        pass,
        passed,
        verdict,
    } = pickPayload(result);
    // Number.isFinite is false for every non-number, so no typeof guard needed.
    const score = Number.isFinite(rawScore) ? rawScore : undefined;
    const reason = typeof rawReason === "string" ? rawReason : undefined;
    const withPass = (value) => ({ score, reason, pass: value });

    if (typeof pass === "boolean") {
        return withPass(pass);
    }
    if (typeof passed === "boolean") {
        return withPass(passed);
    }
    if (typeof verdict === "string") {
        switch (verdict.trim().toLowerCase()) {
            case "pass":
                return withPass(true);
            case "fail":
                return withPass(false);
            default:
                // Unrecognized verdict text: fall back to the score below.
                break;
        }
    }
    return score === undefined ? { score, reason } : withPass(score >= 0);
};
|
|
56
|
-
/**
 * Converts one run into a list of example buckets, each holding one point.
 *
 * - No object result: returns an empty list.
 * - `result.mode === "turns"` with an array `result.turns`: one bucket per
 *   object-valued turn. The bucket key is `ref:<messageRefId>` when the turn
 *   has a non-blank string `messageRefId`, else `turn:<index>`, where index is
 *   the turn's finite `index` (rounded, floored at 0) or its array position.
 *   The label always uses the array position (1-based).
 * - Anything else: a single "conversation" bucket built from the whole result.
 *
 * @param {{id: *, runAt: *, result: *}} run - Run whose result is flattened.
 * @returns {Array<{key: string, label: string, points: Array<object>}>}
 */
const flattenRunExamples = (run) => {
    const { result } = run;
    if (!result || typeof result !== "object") {
        return [];
    }

    // Builds a point for this run; `extra` carries the turn-only fields.
    const toPoint = (graded, extra) => {
        const { score, pass, reason } = extractScoreReasonPass(graded);
        return { runId: run.id, runAt: run.runAt, score, pass, reason, ...extra };
    };

    const isTurnMode = result.mode === "turns" && Array.isArray(result.turns);
    if (!isTurnMode) {
        return [{
                key: "conversation",
                label: "Conversation score",
                points: [toPoint(result)],
            }];
    }

    const examples = [];
    for (const [position, turn] of result.turns.entries()) {
        if (!turn || typeof turn !== "object") {
            continue;
        }
        const turnIndex = Number.isFinite(turn.index)
            ? Math.max(0, Math.round(turn.index))
            : position;
        const hasRef = typeof turn.messageRefId === "string" &&
            turn.messageRefId.trim().length > 0;
        const messageRefId = hasRef ? turn.messageRefId : undefined;
        examples.push({
            key: hasRef ? `ref:${messageRefId}` : `turn:${turnIndex}`,
            label: `Assistant turn ${position + 1}`,
            points: [toPoint(turn.result, { turnIndex, messageRefId })],
        });
    }
    return examples;
};
|
|
106
|
-
/**
 * Computes the median of a list of numbers without modifying the input.
 *
 * @param {number[]} values - Numbers to summarize.
 * @returns {number|null} The middle value (mean of the two middle values for
 *   an even count), or `null` when the list is empty.
 */
const median = (values) => {
    const count = values.length;
    if (!count) {
        return null;
    }
    // Sort a copy numerically; the default sort would compare as strings.
    const ordered = Array.from(values).sort((left, right) => left - right);
    const upper = Math.floor(count / 2);
    const isOddCount = count % 2 === 1;
    return isOddCount
        ? ordered[upper]
        : (ordered[upper - 1] + ordered[upper]) / 2;
};
|
|
115
|
-
/**
 * Rounds a number to two decimal places using `Math.round` on the value
 * scaled by 100.
 *
 * @param {number} value - Number to round.
 * @returns {number} The value rounded to hundredths.
 */
const round2 = (value) => {
    const scale = 100;
    return Math.round(value * scale) / scale;
};
|
|
116
|
-
/**
 * Maps aggregate consistency metrics to a PASS / WARN / FAIL verdict using
 * `exports.VERIFY_CONSISTENCY_THRESHOLDS`.
 *
 * Checks run in order: too few samples (WARN), no agreement rate (WARN), the
 * PASS tier, the WARN tier, and otherwise FAIL. A nullish `spreadMax` is
 * treated as 0.
 *
 * @param {{sampleSize: number, agreementRate: (number|null),
 *   spreadMax: (number|null|undefined), instabilityCount: number}} input
 * @returns {{verdict: string, reason: string}}
 */
const resolveVerdict = (input) => {
    const { minSampleSize, pass, warn } = exports.VERIFY_CONSISTENCY_THRESHOLDS;
    const { sampleSize, agreementRate, instabilityCount } = input;
    const outcome = (verdict, reason) => ({ verdict, reason });

    if (sampleSize < minSampleSize) {
        return outcome("WARN", `Need at least ${minSampleSize} samples before issuing a firm verdict.`);
    }
    if (agreementRate === null) {
        return outcome("WARN", "No comparable pass/fail evidence was found in the sampled runs.");
    }

    const spreadMax = input.spreadMax ?? 0;
    // True when all three metrics sit inside the given tier's limits.
    const withinTier = (tier) => agreementRate >= tier.agreementMin &&
        spreadMax <= tier.maxSpread &&
        instabilityCount <= tier.maxInstabilityCount;

    if (withinTier(pass)) {
        return outcome("PASS", "Agreement, spread, and instability all meet PASS thresholds.");
    }
    if (withinTier(warn)) {
        return outcome("WARN", "Some variation was detected, but results remain within WARN thresholds.");
    }
    return outcome("FAIL", "Agreement/spread instability exceeds WARN thresholds.");
};
|
|
152
|
-
/**
 * Builds a consistency report across repeated verify runs.
 *
 * Only runs whose `status` is "completed" are considered. Their examples (from
 * `flattenRunExamples`) are grouped by key; for each group the score range,
 * pass/fail agreement, and instability flag are computed. Groups are then
 * ordered with unstable ones first, then pass/fail flips, then larger score
 * range, then larger sample size, then label. The overall verdict comes from
 * `resolveVerdict`.
 *
 * @param {Array<{id: *, status: string, runAt: *, result: *}>} runs
 * @returns {{sampleSize: number, comparableExampleCount: number,
 *   agreementRate: (number|null), scoreSpreadMin: (number|null),
 *   scoreSpreadMedian: (number|null), scoreSpreadMax: (number|null),
 *   instabilityCount: number, verdict: string, verdictReason: string,
 *   outliers: Array<object>}}
 */
function buildVerifyConsistencyReport(runs) {
    const completedRuns = runs.filter(({ status }) => status === "completed");
    const sampleSize = completedRuns.length;

    // Group every example point by its key, keeping the first label seen.
    const grouped = new Map();
    for (const run of completedRuns) {
        for (const { key, label, points } of flattenRunExamples(run)) {
            const bucket = grouped.get(key);
            if (bucket) {
                bucket.points.push(...points);
            }
            else {
                grouped.set(key, { key, label, points: [...points] });
            }
        }
    }

    const outliers = [];
    const scoreDeltas = [];
    let agreementVotes = 0;
    let agreementTotal = 0;
    let instabilityCount = 0;

    for (const { key, label, points } of grouped.values()) {
        // Score range over the points that carry a finite score.
        const scores = points
            .map(({ score }) => score)
            .filter((score) => Number.isFinite(score));
        const hasScores = scores.length > 0;
        const minScore = hasScores ? Math.min(...scores) : null;
        const maxScore = hasScores ? Math.max(...scores) : null;
        const scoreDelta = hasScores ? round2(maxScore - minScore) : null;
        if (scoreDelta !== null) {
            scoreDeltas.push(scoreDelta);
        }

        // Majority share among the points that carry a boolean pass flag.
        let passCount = 0;
        let failCount = 0;
        for (const { pass } of points) {
            if (pass === true) {
                passCount += 1;
            }
            else if (pass === false) {
                failCount += 1;
            }
        }
        const voteCount = passCount + failCount;
        const majority = Math.max(passCount, failCount);
        const agreementRate = voteCount > 0 ? round2(majority / voteCount) : null;
        if (voteCount > 0) {
            agreementVotes += majority;
            agreementTotal += voteCount;
        }

        const passFlip = passCount > 0 && failCount > 0;
        const instability = passFlip ||
            (scoreDelta !== null &&
                scoreDelta > exports.VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
        if (instability) {
            instabilityCount += 1;
        }

        // First point that reaches each extreme identifies the run to inspect.
        const minPoint = hasScores
            ? points.find((point) => point.score === minScore)
            : undefined;
        const maxPoint = hasScores
            ? points.find((point) => point.score === maxScore)
            : undefined;

        outliers.push({
            key,
            label,
            sampleSize: points.length,
            agreementRate,
            scoreDelta,
            passFlip,
            instability,
            minScore,
            maxScore,
            minRunId: minPoint?.runId,
            maxRunId: maxPoint?.runId,
            turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
            messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
        });
    }

    // Each term is 0 on a tie, so `||` falls through to the next criterion.
    outliers.sort((a, b) => Number(b.instability) - Number(a.instability) ||
        Number(b.passFlip) - Number(a.passFlip) ||
        (b.scoreDelta ?? -1) - (a.scoreDelta ?? -1) ||
        b.sampleSize - a.sampleSize ||
        a.label.localeCompare(b.label));

    const agreementRate = agreementTotal > 0
        ? round2(agreementVotes / agreementTotal)
        : null;
    const hasSpread = scoreDeltas.length > 0;
    const scoreSpreadMin = hasSpread ? Math.min(...scoreDeltas) : null;
    const scoreSpreadMax = hasSpread ? Math.max(...scoreDeltas) : null;
    const scoreSpreadMedian = median(scoreDeltas);

    const { verdict, reason: verdictReason } = resolveVerdict({
        sampleSize,
        agreementRate,
        spreadMax: scoreSpreadMax,
        instabilityCount,
    });

    const roundOrNull = (value) => (value === null ? null : round2(value));
    return {
        sampleSize,
        comparableExampleCount: outliers.length,
        agreementRate,
        scoreSpreadMin: roundOrNull(scoreSpreadMin),
        scoreSpreadMedian: roundOrNull(scoreSpreadMedian),
        scoreSpreadMax: roundOrNull(scoreSpreadMax),
        instabilityCount,
        verdict,
        verdictReason,
        outliers,
    };
}
|