@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/CHANGELOG.md +60 -1
  2. package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
  3. package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
  4. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  5. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  6. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  7. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  8. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  9. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  10. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  11. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  12. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  13. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  14. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  15. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  16. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  17. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  18. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  19. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  20. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  21. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  22. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  23. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  24. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  25. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  26. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  27. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  28. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  29. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  30. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  31. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  32. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  33. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  34. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  35. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  36. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  37. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  38. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  39. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  40. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  41. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  42. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
  43. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  44. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  45. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  46. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  47. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  48. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  49. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  50. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  51. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  52. package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
  53. package/esm/simulator-ui/src/AppShell.js +6 -4
  54. package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
  55. package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
  56. package/esm/simulator-ui/src/verify_unified.js +25 -0
  57. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  58. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  59. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  60. package/esm/src/server.d.ts.map +1 -1
  61. package/esm/src/server.js +227 -45
  62. package/esm/src/server_isograph_environment.d.ts +11 -4
  63. package/esm/src/server_isograph_environment.d.ts.map +1 -1
  64. package/esm/src/server_isograph_environment.js +5 -0
  65. package/esm/src/simulator_graphql.d.ts +11 -4
  66. package/esm/src/simulator_graphql.d.ts.map +1 -1
  67. package/esm/src/simulator_graphql.js +278 -245
  68. package/package.json +2 -2
  69. package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
  70. package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
  71. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  72. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  73. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  74. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  75. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  76. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  77. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  78. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  79. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  80. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  81. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  82. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  83. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  84. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  85. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  86. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  87. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  88. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  89. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  90. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  91. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  92. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  93. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  94. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  95. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  96. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  97. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  98. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  99. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  100. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  101. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  102. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  103. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  104. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  105. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  106. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  107. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  108. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  109. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
  110. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  111. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  112. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  113. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  114. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  115. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  116. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  117. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  118. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  119. package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
  120. package/script/simulator-ui/src/AppShell.js +6 -4
  121. package/script/simulator-ui/src/verify_unified.d.ts +17 -0
  122. package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
  123. package/script/simulator-ui/src/verify_unified.js +29 -0
  124. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  125. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  126. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  127. package/script/src/server.d.ts.map +1 -1
  128. package/script/src/server.js +227 -45
  129. package/script/src/server_isograph_environment.d.ts +11 -4
  130. package/script/src/server_isograph_environment.d.ts.map +1 -1
  131. package/script/src/server_isograph_environment.js +5 -0
  132. package/script/src/simulator_graphql.d.ts +11 -4
  133. package/script/src/simulator_graphql.d.ts.map +1 -1
  134. package/script/src/simulator_graphql.js +278 -245
  135. package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
  136. package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  137. package/esm/simulator-ui/src/verify_metrics.js +0 -260
  138. package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
  139. package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  140. package/script/simulator-ui/src/verify_metrics.js +0 -264
@@ -1,50 +0,0 @@
1
- export type VerifyVerdict = "PASS" | "WARN" | "FAIL";
2
- export type VerifyCalibrationRun = {
3
- id: string;
4
- status: "running" | "completed" | "error";
5
- runAt?: string;
6
- result?: unknown;
7
- };
8
- export type VerifyOutlier = {
9
- key: string;
10
- label: string;
11
- sampleSize: number;
12
- agreementRate: number | null;
13
- scoreDelta: number | null;
14
- passFlip: boolean;
15
- instability: boolean;
16
- minScore: number | null;
17
- maxScore: number | null;
18
- minRunId?: string;
19
- maxRunId?: string;
20
- turnIndex?: number;
21
- messageRefId?: string;
22
- };
23
- export type VerifyConsistencyReport = {
24
- sampleSize: number;
25
- comparableExampleCount: number;
26
- agreementRate: number | null;
27
- scoreSpreadMin: number | null;
28
- scoreSpreadMedian: number | null;
29
- scoreSpreadMax: number | null;
30
- instabilityCount: number;
31
- verdict: VerifyVerdict;
32
- verdictReason: string;
33
- outliers: Array<VerifyOutlier>;
34
- };
35
- export declare const VERIFY_CONSISTENCY_THRESHOLDS: {
36
- readonly minSampleSize: 6;
37
- readonly instabilityScoreDelta: 1.5;
38
- readonly pass: {
39
- readonly agreementMin: 0.9;
40
- readonly maxSpread: 1;
41
- readonly maxInstabilityCount: 0;
42
- };
43
- readonly warn: {
44
- readonly agreementMin: 0.75;
45
- readonly maxSpread: 2;
46
- readonly maxInstabilityCount: 2;
47
- };
48
- };
49
- export declare function buildVerifyConsistencyReport(runs: Array<VerifyCalibrationRun>): VerifyConsistencyReport;
50
- //# sourceMappingURL=verify_metrics.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"verify_metrics.d.ts","sourceRoot":"","sources":["../../../src/simulator-ui/src/verify_metrics.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,aAAa,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAC;AAErD,MAAM,MAAM,oBAAoB,GAAG;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB,CAAC;AAkBF,MAAM,MAAM,aAAa,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,QAAQ,EAAE,OAAO,CAAC;IAClB,WAAW,EAAE,OAAO,CAAC;IACrB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF,MAAM,MAAM,uBAAuB,GAAG;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,aAAa,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,KAAK,CAAC,aAAa,CAAC,CAAC;CAChC,CAAC;AAEF,eAAO,MAAM,6BAA6B;;;;;;;;;;;;;CAahC,CAAC;AA0JX,wBAAgB,4BAA4B,CAC1C,IAAI,EAAE,KAAK,CAAC,oBAAoB,CAAC,GAChC,uBAAuB,CA0HzB"}
@@ -1,260 +0,0 @@
1
- export const VERIFY_CONSISTENCY_THRESHOLDS = {
2
- minSampleSize: 6,
3
- instabilityScoreDelta: 1.5,
4
- pass: {
5
- agreementMin: 0.9,
6
- maxSpread: 1,
7
- maxInstabilityCount: 0,
8
- },
9
- warn: {
10
- agreementMin: 0.75,
11
- maxSpread: 2,
12
- maxInstabilityCount: 2,
13
- },
14
- };
15
- const pickPayload = (result) => {
16
- if (!result || typeof result !== "object")
17
- return {};
18
- const record = result;
19
- if (record.payload &&
20
- typeof record.payload === "object" &&
21
- !Array.isArray(record.payload)) {
22
- return record.payload;
23
- }
24
- return record;
25
- };
26
- const extractScoreReasonPass = (result) => {
27
- const payload = pickPayload(result);
28
- const score = typeof payload.score === "number" && Number.isFinite(payload.score)
29
- ? payload.score
30
- : undefined;
31
- const reason = typeof payload.reason === "string"
32
- ? payload.reason
33
- : undefined;
34
- if (typeof payload.pass === "boolean") {
35
- return { score, reason, pass: payload.pass };
36
- }
37
- if (typeof payload.passed === "boolean") {
38
- return { score, reason, pass: payload.passed };
39
- }
40
- if (typeof payload.verdict === "string") {
41
- const verdict = payload.verdict.trim().toLowerCase();
42
- if (verdict === "pass")
43
- return { score, reason, pass: true };
44
- if (verdict === "fail")
45
- return { score, reason, pass: false };
46
- }
47
- if (typeof score === "number") {
48
- return { score, reason, pass: score >= 0 };
49
- }
50
- return { score, reason };
51
- };
52
- const flattenRunExamples = (run) => {
53
- if (!run.result || typeof run.result !== "object")
54
- return [];
55
- const record = run.result;
56
- if (record.mode === "turns" && Array.isArray(record.turns)) {
57
- const buckets = [];
58
- record.turns.forEach((turn, fallbackIndex) => {
59
- if (!turn || typeof turn !== "object")
60
- return;
61
- const turnRecord = turn;
62
- const index = typeof turnRecord.index === "number" &&
63
- Number.isFinite(turnRecord.index)
64
- ? Math.max(0, Math.round(turnRecord.index))
65
- : fallbackIndex;
66
- const messageRefId = typeof turnRecord.messageRefId === "string" &&
67
- turnRecord.messageRefId.trim().length > 0
68
- ? turnRecord.messageRefId
69
- : undefined;
70
- const key = messageRefId ? `ref:${messageRefId}` : `turn:${index}`;
71
- const label = `Assistant turn ${fallbackIndex + 1}`;
72
- const parsed = extractScoreReasonPass(turnRecord.result);
73
- buckets.push({
74
- key,
75
- label,
76
- points: [{
77
- runId: run.id,
78
- runAt: run.runAt,
79
- score: parsed.score,
80
- pass: parsed.pass,
81
- reason: parsed.reason,
82
- turnIndex: index,
83
- messageRefId,
84
- }],
85
- });
86
- });
87
- return buckets;
88
- }
89
- const parsed = extractScoreReasonPass(run.result);
90
- return [{
91
- key: "conversation",
92
- label: "Conversation score",
93
- points: [{
94
- runId: run.id,
95
- runAt: run.runAt,
96
- score: parsed.score,
97
- pass: parsed.pass,
98
- reason: parsed.reason,
99
- }],
100
- }];
101
- };
102
- const median = (values) => {
103
- if (!values.length)
104
- return null;
105
- const sorted = [...values].sort((a, b) => a - b);
106
- const mid = Math.floor(sorted.length / 2);
107
- if (sorted.length % 2 === 1)
108
- return sorted[mid];
109
- return (sorted[mid - 1] + sorted[mid]) / 2;
110
- };
111
- const round2 = (value) => Math.round(value * 100) / 100;
112
- const resolveVerdict = (input) => {
113
- const t = VERIFY_CONSISTENCY_THRESHOLDS;
114
- if (input.sampleSize < t.minSampleSize) {
115
- return {
116
- verdict: "WARN",
117
- reason: `Need at least ${t.minSampleSize} samples before issuing a firm verdict.`,
118
- };
119
- }
120
- if (input.agreementRate === null) {
121
- return {
122
- verdict: "WARN",
123
- reason: "No comparable pass/fail evidence was found in the sampled runs.",
124
- };
125
- }
126
- const spreadMax = input.spreadMax ?? 0;
127
- if (input.agreementRate >= t.pass.agreementMin &&
128
- spreadMax <= t.pass.maxSpread &&
129
- input.instabilityCount <= t.pass.maxInstabilityCount) {
130
- return {
131
- verdict: "PASS",
132
- reason: "Agreement, spread, and instability all meet PASS thresholds.",
133
- };
134
- }
135
- if (input.agreementRate >= t.warn.agreementMin &&
136
- spreadMax <= t.warn.maxSpread &&
137
- input.instabilityCount <= t.warn.maxInstabilityCount) {
138
- return {
139
- verdict: "WARN",
140
- reason: "Some variation was detected, but results remain within WARN thresholds.",
141
- };
142
- }
143
- return {
144
- verdict: "FAIL",
145
- reason: "Agreement/spread instability exceeds WARN thresholds.",
146
- };
147
- };
148
- export function buildVerifyConsistencyReport(runs) {
149
- const completedRuns = runs.filter((run) => run.status === "completed");
150
- const sampleSize = completedRuns.length;
151
- const bucketsByKey = new Map();
152
- completedRuns.forEach((run) => {
153
- flattenRunExamples(run).forEach((entry) => {
154
- const existing = bucketsByKey.get(entry.key);
155
- if (!existing) {
156
- bucketsByKey.set(entry.key, {
157
- key: entry.key,
158
- label: entry.label,
159
- points: [...entry.points],
160
- });
161
- return;
162
- }
163
- existing.points.push(...entry.points);
164
- });
165
- });
166
- const outliers = [];
167
- let agreementVotes = 0;
168
- let agreementTotal = 0;
169
- const scoreDeltas = [];
170
- bucketsByKey.forEach((bucket) => {
171
- const scores = bucket.points
172
- .map((point) => point.score)
173
- .filter((score) => typeof score === "number" && Number.isFinite(score));
174
- const minScore = scores.length ? Math.min(...scores) : null;
175
- const maxScore = scores.length ? Math.max(...scores) : null;
176
- const scoreDelta = minScore !== null && maxScore !== null
177
- ? round2(maxScore - minScore)
178
- : null;
179
- const passVotes = bucket.points
180
- .map((point) => point.pass)
181
- .filter((pass) => typeof pass === "boolean");
182
- const passCount = passVotes.filter((value) => value).length;
183
- const failCount = passVotes.length - passCount;
184
- const agreementRate = passVotes.length > 0
185
- ? round2(Math.max(passCount, failCount) / passVotes.length)
186
- : null;
187
- if (passVotes.length > 0) {
188
- agreementVotes += Math.max(passCount, failCount);
189
- agreementTotal += passVotes.length;
190
- }
191
- if (scoreDelta !== null) {
192
- scoreDeltas.push(scoreDelta);
193
- }
194
- const passFlip = passCount > 0 && failCount > 0;
195
- const instability = passFlip ||
196
- (scoreDelta !== null &&
197
- scoreDelta > VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
198
- const minPoint = minScore === null
199
- ? undefined
200
- : bucket.points.find((point) => point.score === minScore);
201
- const maxPoint = maxScore === null
202
- ? undefined
203
- : bucket.points.find((point) => point.score === maxScore);
204
- outliers.push({
205
- key: bucket.key,
206
- label: bucket.label,
207
- sampleSize: bucket.points.length,
208
- agreementRate,
209
- scoreDelta,
210
- passFlip,
211
- instability,
212
- minScore,
213
- maxScore,
214
- minRunId: minPoint?.runId,
215
- maxRunId: maxPoint?.runId,
216
- turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
217
- messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
218
- });
219
- });
220
- outliers.sort((a, b) => {
221
- if (a.instability !== b.instability)
222
- return a.instability ? -1 : 1;
223
- if (a.passFlip !== b.passFlip)
224
- return a.passFlip ? -1 : 1;
225
- const aDelta = a.scoreDelta ?? -1;
226
- const bDelta = b.scoreDelta ?? -1;
227
- if (aDelta !== bDelta)
228
- return bDelta - aDelta;
229
- if (a.sampleSize !== b.sampleSize)
230
- return b.sampleSize - a.sampleSize;
231
- return a.label.localeCompare(b.label);
232
- });
233
- const agreementRate = agreementTotal > 0
234
- ? round2(agreementVotes / agreementTotal)
235
- : null;
236
- const scoreSpreadMin = scoreDeltas.length ? Math.min(...scoreDeltas) : null;
237
- const scoreSpreadMax = scoreDeltas.length ? Math.max(...scoreDeltas) : null;
238
- const scoreSpreadMedian = median(scoreDeltas);
239
- const instabilityCount = outliers.filter((entry) => entry.instability).length;
240
- const verdict = resolveVerdict({
241
- sampleSize,
242
- agreementRate,
243
- spreadMax: scoreSpreadMax,
244
- instabilityCount,
245
- });
246
- return {
247
- sampleSize,
248
- comparableExampleCount: outliers.length,
249
- agreementRate,
250
- scoreSpreadMin: scoreSpreadMin === null ? null : round2(scoreSpreadMin),
251
- scoreSpreadMedian: scoreSpreadMedian === null
252
- ? null
253
- : round2(scoreSpreadMedian),
254
- scoreSpreadMax: scoreSpreadMax === null ? null : round2(scoreSpreadMax),
255
- instabilityCount,
256
- verdict: verdict.verdict,
257
- verdictReason: verdict.reason,
258
- outliers,
259
- };
260
- }
@@ -1,50 +0,0 @@
1
- export type VerifyVerdict = "PASS" | "WARN" | "FAIL";
2
- export type VerifyCalibrationRun = {
3
- id: string;
4
- status: "running" | "completed" | "error";
5
- runAt?: string;
6
- result?: unknown;
7
- };
8
- export type VerifyOutlier = {
9
- key: string;
10
- label: string;
11
- sampleSize: number;
12
- agreementRate: number | null;
13
- scoreDelta: number | null;
14
- passFlip: boolean;
15
- instability: boolean;
16
- minScore: number | null;
17
- maxScore: number | null;
18
- minRunId?: string;
19
- maxRunId?: string;
20
- turnIndex?: number;
21
- messageRefId?: string;
22
- };
23
- export type VerifyConsistencyReport = {
24
- sampleSize: number;
25
- comparableExampleCount: number;
26
- agreementRate: number | null;
27
- scoreSpreadMin: number | null;
28
- scoreSpreadMedian: number | null;
29
- scoreSpreadMax: number | null;
30
- instabilityCount: number;
31
- verdict: VerifyVerdict;
32
- verdictReason: string;
33
- outliers: Array<VerifyOutlier>;
34
- };
35
- export declare const VERIFY_CONSISTENCY_THRESHOLDS: {
36
- readonly minSampleSize: 6;
37
- readonly instabilityScoreDelta: 1.5;
38
- readonly pass: {
39
- readonly agreementMin: 0.9;
40
- readonly maxSpread: 1;
41
- readonly maxInstabilityCount: 0;
42
- };
43
- readonly warn: {
44
- readonly agreementMin: 0.75;
45
- readonly maxSpread: 2;
46
- readonly maxInstabilityCount: 2;
47
- };
48
- };
49
- export declare function buildVerifyConsistencyReport(runs: Array<VerifyCalibrationRun>): VerifyConsistencyReport;
50
- //# sourceMappingURL=verify_metrics.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"verify_metrics.d.ts","sourceRoot":"","sources":["../../../src/simulator-ui/src/verify_metrics.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,aAAa,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAC;AAErD,MAAM,MAAM,oBAAoB,GAAG;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB,CAAC;AAkBF,MAAM,MAAM,aAAa,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,QAAQ,EAAE,OAAO,CAAC;IAClB,WAAW,EAAE,OAAO,CAAC;IACrB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF,MAAM,MAAM,uBAAuB,GAAG;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,gBAAgB,EAAE,MAAM,CAAC;IACzB,OAAO,EAAE,aAAa,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,KAAK,CAAC,aAAa,CAAC,CAAC;CAChC,CAAC;AAEF,eAAO,MAAM,6BAA6B;;;;;;;;;;;;;CAahC,CAAC;AA0JX,wBAAgB,4BAA4B,CAC1C,IAAI,EAAE,KAAK,CAAC,oBAAoB,CAAC,GAChC,uBAAuB,CA0HzB"}
@@ -1,264 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.VERIFY_CONSISTENCY_THRESHOLDS = void 0;
4
- exports.buildVerifyConsistencyReport = buildVerifyConsistencyReport;
5
- exports.VERIFY_CONSISTENCY_THRESHOLDS = {
6
- minSampleSize: 6,
7
- instabilityScoreDelta: 1.5,
8
- pass: {
9
- agreementMin: 0.9,
10
- maxSpread: 1,
11
- maxInstabilityCount: 0,
12
- },
13
- warn: {
14
- agreementMin: 0.75,
15
- maxSpread: 2,
16
- maxInstabilityCount: 2,
17
- },
18
- };
19
- const pickPayload = (result) => {
20
- if (!result || typeof result !== "object")
21
- return {};
22
- const record = result;
23
- if (record.payload &&
24
- typeof record.payload === "object" &&
25
- !Array.isArray(record.payload)) {
26
- return record.payload;
27
- }
28
- return record;
29
- };
30
- const extractScoreReasonPass = (result) => {
31
- const payload = pickPayload(result);
32
- const score = typeof payload.score === "number" && Number.isFinite(payload.score)
33
- ? payload.score
34
- : undefined;
35
- const reason = typeof payload.reason === "string"
36
- ? payload.reason
37
- : undefined;
38
- if (typeof payload.pass === "boolean") {
39
- return { score, reason, pass: payload.pass };
40
- }
41
- if (typeof payload.passed === "boolean") {
42
- return { score, reason, pass: payload.passed };
43
- }
44
- if (typeof payload.verdict === "string") {
45
- const verdict = payload.verdict.trim().toLowerCase();
46
- if (verdict === "pass")
47
- return { score, reason, pass: true };
48
- if (verdict === "fail")
49
- return { score, reason, pass: false };
50
- }
51
- if (typeof score === "number") {
52
- return { score, reason, pass: score >= 0 };
53
- }
54
- return { score, reason };
55
- };
56
- const flattenRunExamples = (run) => {
57
- if (!run.result || typeof run.result !== "object")
58
- return [];
59
- const record = run.result;
60
- if (record.mode === "turns" && Array.isArray(record.turns)) {
61
- const buckets = [];
62
- record.turns.forEach((turn, fallbackIndex) => {
63
- if (!turn || typeof turn !== "object")
64
- return;
65
- const turnRecord = turn;
66
- const index = typeof turnRecord.index === "number" &&
67
- Number.isFinite(turnRecord.index)
68
- ? Math.max(0, Math.round(turnRecord.index))
69
- : fallbackIndex;
70
- const messageRefId = typeof turnRecord.messageRefId === "string" &&
71
- turnRecord.messageRefId.trim().length > 0
72
- ? turnRecord.messageRefId
73
- : undefined;
74
- const key = messageRefId ? `ref:${messageRefId}` : `turn:${index}`;
75
- const label = `Assistant turn ${fallbackIndex + 1}`;
76
- const parsed = extractScoreReasonPass(turnRecord.result);
77
- buckets.push({
78
- key,
79
- label,
80
- points: [{
81
- runId: run.id,
82
- runAt: run.runAt,
83
- score: parsed.score,
84
- pass: parsed.pass,
85
- reason: parsed.reason,
86
- turnIndex: index,
87
- messageRefId,
88
- }],
89
- });
90
- });
91
- return buckets;
92
- }
93
- const parsed = extractScoreReasonPass(run.result);
94
- return [{
95
- key: "conversation",
96
- label: "Conversation score",
97
- points: [{
98
- runId: run.id,
99
- runAt: run.runAt,
100
- score: parsed.score,
101
- pass: parsed.pass,
102
- reason: parsed.reason,
103
- }],
104
- }];
105
- };
106
- const median = (values) => {
107
- if (!values.length)
108
- return null;
109
- const sorted = [...values].sort((a, b) => a - b);
110
- const mid = Math.floor(sorted.length / 2);
111
- if (sorted.length % 2 === 1)
112
- return sorted[mid];
113
- return (sorted[mid - 1] + sorted[mid]) / 2;
114
- };
115
- const round2 = (value) => Math.round(value * 100) / 100;
116
- const resolveVerdict = (input) => {
117
- const t = exports.VERIFY_CONSISTENCY_THRESHOLDS;
118
- if (input.sampleSize < t.minSampleSize) {
119
- return {
120
- verdict: "WARN",
121
- reason: `Need at least ${t.minSampleSize} samples before issuing a firm verdict.`,
122
- };
123
- }
124
- if (input.agreementRate === null) {
125
- return {
126
- verdict: "WARN",
127
- reason: "No comparable pass/fail evidence was found in the sampled runs.",
128
- };
129
- }
130
- const spreadMax = input.spreadMax ?? 0;
131
- if (input.agreementRate >= t.pass.agreementMin &&
132
- spreadMax <= t.pass.maxSpread &&
133
- input.instabilityCount <= t.pass.maxInstabilityCount) {
134
- return {
135
- verdict: "PASS",
136
- reason: "Agreement, spread, and instability all meet PASS thresholds.",
137
- };
138
- }
139
- if (input.agreementRate >= t.warn.agreementMin &&
140
- spreadMax <= t.warn.maxSpread &&
141
- input.instabilityCount <= t.warn.maxInstabilityCount) {
142
- return {
143
- verdict: "WARN",
144
- reason: "Some variation was detected, but results remain within WARN thresholds.",
145
- };
146
- }
147
- return {
148
- verdict: "FAIL",
149
- reason: "Agreement/spread instability exceeds WARN thresholds.",
150
- };
151
- };
152
- function buildVerifyConsistencyReport(runs) {
153
- const completedRuns = runs.filter((run) => run.status === "completed");
154
- const sampleSize = completedRuns.length;
155
- const bucketsByKey = new Map();
156
- completedRuns.forEach((run) => {
157
- flattenRunExamples(run).forEach((entry) => {
158
- const existing = bucketsByKey.get(entry.key);
159
- if (!existing) {
160
- bucketsByKey.set(entry.key, {
161
- key: entry.key,
162
- label: entry.label,
163
- points: [...entry.points],
164
- });
165
- return;
166
- }
167
- existing.points.push(...entry.points);
168
- });
169
- });
170
- const outliers = [];
171
- let agreementVotes = 0;
172
- let agreementTotal = 0;
173
- const scoreDeltas = [];
174
- bucketsByKey.forEach((bucket) => {
175
- const scores = bucket.points
176
- .map((point) => point.score)
177
- .filter((score) => typeof score === "number" && Number.isFinite(score));
178
- const minScore = scores.length ? Math.min(...scores) : null;
179
- const maxScore = scores.length ? Math.max(...scores) : null;
180
- const scoreDelta = minScore !== null && maxScore !== null
181
- ? round2(maxScore - minScore)
182
- : null;
183
- const passVotes = bucket.points
184
- .map((point) => point.pass)
185
- .filter((pass) => typeof pass === "boolean");
186
- const passCount = passVotes.filter((value) => value).length;
187
- const failCount = passVotes.length - passCount;
188
- const agreementRate = passVotes.length > 0
189
- ? round2(Math.max(passCount, failCount) / passVotes.length)
190
- : null;
191
- if (passVotes.length > 0) {
192
- agreementVotes += Math.max(passCount, failCount);
193
- agreementTotal += passVotes.length;
194
- }
195
- if (scoreDelta !== null) {
196
- scoreDeltas.push(scoreDelta);
197
- }
198
- const passFlip = passCount > 0 && failCount > 0;
199
- const instability = passFlip ||
200
- (scoreDelta !== null &&
201
- scoreDelta > exports.VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
202
- const minPoint = minScore === null
203
- ? undefined
204
- : bucket.points.find((point) => point.score === minScore);
205
- const maxPoint = maxScore === null
206
- ? undefined
207
- : bucket.points.find((point) => point.score === maxScore);
208
- outliers.push({
209
- key: bucket.key,
210
- label: bucket.label,
211
- sampleSize: bucket.points.length,
212
- agreementRate,
213
- scoreDelta,
214
- passFlip,
215
- instability,
216
- minScore,
217
- maxScore,
218
- minRunId: minPoint?.runId,
219
- maxRunId: maxPoint?.runId,
220
- turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
221
- messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
222
- });
223
- });
224
- outliers.sort((a, b) => {
225
- if (a.instability !== b.instability)
226
- return a.instability ? -1 : 1;
227
- if (a.passFlip !== b.passFlip)
228
- return a.passFlip ? -1 : 1;
229
- const aDelta = a.scoreDelta ?? -1;
230
- const bDelta = b.scoreDelta ?? -1;
231
- if (aDelta !== bDelta)
232
- return bDelta - aDelta;
233
- if (a.sampleSize !== b.sampleSize)
234
- return b.sampleSize - a.sampleSize;
235
- return a.label.localeCompare(b.label);
236
- });
237
- const agreementRate = agreementTotal > 0
238
- ? round2(agreementVotes / agreementTotal)
239
- : null;
240
- const scoreSpreadMin = scoreDeltas.length ? Math.min(...scoreDeltas) : null;
241
- const scoreSpreadMax = scoreDeltas.length ? Math.max(...scoreDeltas) : null;
242
- const scoreSpreadMedian = median(scoreDeltas);
243
- const instabilityCount = outliers.filter((entry) => entry.instability).length;
244
- const verdict = resolveVerdict({
245
- sampleSize,
246
- agreementRate,
247
- spreadMax: scoreSpreadMax,
248
- instabilityCount,
249
- });
250
- return {
251
- sampleSize,
252
- comparableExampleCount: outliers.length,
253
- agreementRate,
254
- scoreSpreadMin: scoreSpreadMin === null ? null : round2(scoreSpreadMin),
255
- scoreSpreadMedian: scoreSpreadMedian === null
256
- ? null
257
- : round2(scoreSpreadMedian),
258
- scoreSpreadMax: scoreSpreadMax === null ? null : round2(scoreSpreadMax),
259
- instabilityCount,
260
- verdict: verdict.verdict,
261
- verdictReason: verdict.reason,
262
- outliers,
263
- };
264
- }