@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +60 -1
- package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.js +6 -4
- package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
- package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/esm/simulator-ui/src/verify_unified.js +25 -0
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/esm/src/server.d.ts.map +1 -1
- package/esm/src/server.js +227 -45
- package/esm/src/server_isograph_environment.d.ts +11 -4
- package/esm/src/server_isograph_environment.d.ts.map +1 -1
- package/esm/src/server_isograph_environment.js +5 -0
- package/esm/src/simulator_graphql.d.ts +11 -4
- package/esm/src/simulator_graphql.d.ts.map +1 -1
- package/esm/src/simulator_graphql.js +278 -245
- package/package.json +2 -2
- package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.js +6 -4
- package/script/simulator-ui/src/verify_unified.d.ts +17 -0
- package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/script/simulator-ui/src/verify_unified.js +29 -0
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/script/src/server.d.ts.map +1 -1
- package/script/src/server.js +227 -45
- package/script/src/server_isograph_environment.d.ts +11 -4
- package/script/src/server_isograph_environment.d.ts.map +1 -1
- package/script/src/server_isograph_environment.js +5 -0
- package/script/src/simulator_graphql.d.ts +11 -4
- package/script/src/simulator_graphql.d.ts.map +1 -1
- package/script/src/simulator_graphql.js +278 -245
- package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/esm/simulator-ui/src/verify_metrics.js +0 -260
- package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/script/simulator-ui/src/verify_metrics.js +0 -264
|
@@ -576,20 +576,6 @@ function toWorkspaceGradeRunStatus(value) {
|
|
|
576
576
|
return "COMPLETED";
|
|
577
577
|
return "ERROR";
|
|
578
578
|
}
|
|
579
|
-
const VERIFY_CONSISTENCY_THRESHOLDS = {
|
|
580
|
-
minSampleSize: 6,
|
|
581
|
-
instabilityScoreDelta: 1.5,
|
|
582
|
-
pass: {
|
|
583
|
-
agreementMin: 0.9,
|
|
584
|
-
maxSpread: 1,
|
|
585
|
-
maxInstabilityCount: 0,
|
|
586
|
-
},
|
|
587
|
-
warn: {
|
|
588
|
-
agreementMin: 0.75,
|
|
589
|
-
maxSpread: 2,
|
|
590
|
-
maxInstabilityCount: 2,
|
|
591
|
-
},
|
|
592
|
-
};
|
|
593
579
|
function round2(value) {
|
|
594
580
|
return Math.round(value * 100) / 100;
|
|
595
581
|
}
|
|
@@ -602,195 +588,179 @@ function median(values) {
|
|
|
602
588
|
return sorted[mid];
|
|
603
589
|
return (sorted[mid - 1] + sorted[mid]) / 2;
|
|
604
590
|
}
|
|
605
|
-
function
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
? turn.turnIndex
|
|
610
|
-
: fallbackIndex;
|
|
611
|
-
const key = turn.refId && turn.refId.trim().length > 0
|
|
612
|
-
? `ref:${turn.refId}`
|
|
613
|
-
: `turn:${index}`;
|
|
614
|
-
const pass = typeof turn.score === "number" ? turn.score >= 0 : undefined;
|
|
615
|
-
return {
|
|
616
|
-
key,
|
|
617
|
-
label: `Assistant turn ${turn.turnNumber || fallbackIndex + 1}`,
|
|
618
|
-
points: [{
|
|
619
|
-
runId: run.id,
|
|
620
|
-
score: turn.score,
|
|
621
|
-
pass,
|
|
622
|
-
reason: turn.reason,
|
|
623
|
-
turnIndex: index,
|
|
624
|
-
messageRefId: turn.refId,
|
|
625
|
-
}],
|
|
626
|
-
};
|
|
627
|
-
});
|
|
591
|
+
function extractRepresentativeScore(run) {
|
|
592
|
+
const summaryScore = run.summary?.score;
|
|
593
|
+
if (typeof summaryScore === "number" && Number.isFinite(summaryScore)) {
|
|
594
|
+
return summaryScore;
|
|
628
595
|
}
|
|
629
|
-
const
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
points: [{
|
|
636
|
-
runId: run.id,
|
|
637
|
-
score,
|
|
638
|
-
pass,
|
|
639
|
-
reason,
|
|
640
|
-
}],
|
|
641
|
-
}];
|
|
596
|
+
const turnScores = run.turns
|
|
597
|
+
.map((turn) => turn.score)
|
|
598
|
+
.filter((score) => typeof score === "number" && Number.isFinite(score));
|
|
599
|
+
if (turnScores.length === 0)
|
|
600
|
+
return null;
|
|
601
|
+
return median(turnScores);
|
|
642
602
|
}
|
|
643
|
-
function
|
|
644
|
-
if (
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
reason: `Need at least ${VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} samples before issuing a firm verdict.`,
|
|
648
|
-
};
|
|
649
|
-
}
|
|
650
|
-
if (input.agreementRate === null) {
|
|
651
|
-
return {
|
|
652
|
-
verdict: "WARN",
|
|
653
|
-
reason: "No comparable pass/fail evidence was found in the sampled runs.",
|
|
654
|
-
};
|
|
655
|
-
}
|
|
656
|
-
const spreadMax = input.spreadMax ?? 0;
|
|
657
|
-
if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin &&
|
|
658
|
-
spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread &&
|
|
659
|
-
input.instabilityCount <=
|
|
660
|
-
VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount) {
|
|
661
|
-
return {
|
|
662
|
-
verdict: "PASS",
|
|
663
|
-
reason: "Agreement, spread, and instability all meet PASS thresholds.",
|
|
664
|
-
};
|
|
665
|
-
}
|
|
666
|
-
if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin &&
|
|
667
|
-
spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread &&
|
|
668
|
-
input.instabilityCount <=
|
|
669
|
-
VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount) {
|
|
670
|
-
return {
|
|
671
|
-
verdict: "WARN",
|
|
672
|
-
reason: "Some variation was detected, but results remain within WARN thresholds.",
|
|
673
|
-
};
|
|
603
|
+
function extractRepresentativeReason(run) {
|
|
604
|
+
if (typeof run.summary?.reason === "string" &&
|
|
605
|
+
run.summary.reason.trim().length > 0) {
|
|
606
|
+
return run.summary.reason.trim();
|
|
674
607
|
}
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
reason: "Agreement/spread instability exceeds WARN thresholds.",
|
|
678
|
-
};
|
|
608
|
+
const turnReason = run.turns.find((turn) => typeof turn.reason === "string" && turn.reason.trim().length > 0)?.reason;
|
|
609
|
+
return typeof turnReason === "string" ? turnReason.trim() : null;
|
|
679
610
|
}
|
|
680
|
-
function
|
|
681
|
-
const
|
|
682
|
-
const
|
|
683
|
-
const
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
existing.
|
|
696
|
-
|
|
697
|
-
});
|
|
698
|
-
const outliers = [];
|
|
699
|
-
let agreementVotes = 0;
|
|
700
|
-
let agreementTotal = 0;
|
|
701
|
-
const scoreDeltas = [];
|
|
702
|
-
bucketsByKey.forEach((bucket) => {
|
|
703
|
-
const scores = bucket.points
|
|
704
|
-
.map((point) => point.score)
|
|
705
|
-
.filter((score) => typeof score === "number" && Number.isFinite(score));
|
|
706
|
-
const minScore = scores.length > 0 ? Math.min(...scores) : null;
|
|
707
|
-
const maxScore = scores.length > 0 ? Math.max(...scores) : null;
|
|
708
|
-
const scoreDelta = minScore !== null && maxScore !== null
|
|
709
|
-
? round2(maxScore - minScore)
|
|
710
|
-
: null;
|
|
711
|
-
const passVotes = bucket.points
|
|
712
|
-
.map((point) => point.pass)
|
|
713
|
-
.filter((pass) => typeof pass === "boolean");
|
|
714
|
-
const passCount = passVotes.filter((value) => value).length;
|
|
715
|
-
const failCount = passVotes.length - passCount;
|
|
716
|
-
const agreementRate = passVotes.length > 0
|
|
717
|
-
? round2(Math.max(passCount, failCount) / passVotes.length)
|
|
718
|
-
: null;
|
|
719
|
-
if (passVotes.length > 0) {
|
|
720
|
-
agreementVotes += Math.max(passCount, failCount);
|
|
721
|
-
agreementTotal += passVotes.length;
|
|
611
|
+
function buildVerifyMetrics(args) {
|
|
612
|
+
const runById = new Map(args.runs.map((run) => [run.id, run]));
|
|
613
|
+
const reasonCounter = new Map();
|
|
614
|
+
const scoreValues = [];
|
|
615
|
+
let passCount = 0;
|
|
616
|
+
let scoreCount = 0;
|
|
617
|
+
let gradingFailureCount = 0;
|
|
618
|
+
const scenarioGroups = new Map();
|
|
619
|
+
const addReason = (kind, reason) => {
|
|
620
|
+
const normalized = typeof reason === "string" ? reason.trim() : "";
|
|
621
|
+
if (!normalized)
|
|
622
|
+
return;
|
|
623
|
+
const key = `${kind}:${normalized}`;
|
|
624
|
+
const existing = reasonCounter.get(key);
|
|
625
|
+
if (existing) {
|
|
626
|
+
existing.count += 1;
|
|
627
|
+
return;
|
|
722
628
|
}
|
|
723
|
-
|
|
724
|
-
|
|
629
|
+
reasonCounter.set(key, { key, kind, reason: normalized, count: 1 });
|
|
630
|
+
};
|
|
631
|
+
for (const request of args.batch.requests) {
|
|
632
|
+
if (!request.scenarioRunId)
|
|
633
|
+
continue;
|
|
634
|
+
const existing = scenarioGroups.get(request.scenarioRunId);
|
|
635
|
+
if (existing) {
|
|
636
|
+
existing.requests.push(request);
|
|
637
|
+
continue;
|
|
725
638
|
}
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
:
|
|
733
|
-
const maxPoint = maxScore === null
|
|
734
|
-
? undefined
|
|
735
|
-
: bucket.points.find((point) => point.score === maxScore);
|
|
736
|
-
outliers.push({
|
|
737
|
-
key: bucket.key,
|
|
738
|
-
label: bucket.label,
|
|
739
|
-
sampleSize: bucket.points.length,
|
|
740
|
-
agreementRate,
|
|
741
|
-
scoreDelta,
|
|
742
|
-
passFlip,
|
|
743
|
-
instability,
|
|
744
|
-
minRunId: minPoint?.runId,
|
|
745
|
-
maxRunId: maxPoint?.runId,
|
|
746
|
-
turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
|
|
747
|
-
messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
|
|
639
|
+
scenarioGroups.set(request.scenarioRunId, {
|
|
640
|
+
scenarioRunId: request.scenarioRunId,
|
|
641
|
+
requests: [request],
|
|
642
|
+
scores: [],
|
|
643
|
+
executionFailureCount: 0,
|
|
644
|
+
gradingFailureCount: 0,
|
|
645
|
+
completedSampleCount: 0,
|
|
748
646
|
});
|
|
749
|
-
}
|
|
750
|
-
|
|
751
|
-
if (
|
|
752
|
-
|
|
647
|
+
}
|
|
648
|
+
for (const request of args.batch.requests) {
|
|
649
|
+
if (request.status === "error") {
|
|
650
|
+
addReason("execution", request.error);
|
|
651
|
+
if (request.scenarioRunId) {
|
|
652
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
653
|
+
if (group)
|
|
654
|
+
group.executionFailureCount += 1;
|
|
655
|
+
}
|
|
656
|
+
continue;
|
|
657
|
+
}
|
|
658
|
+
if (request.status !== "completed" || !request.runId)
|
|
659
|
+
continue;
|
|
660
|
+
const run = runById.get(request.runId);
|
|
661
|
+
if (!run || run.status !== "completed")
|
|
662
|
+
continue;
|
|
663
|
+
const score = extractRepresentativeScore(run);
|
|
664
|
+
if (request.scenarioRunId) {
|
|
665
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
666
|
+
if (group) {
|
|
667
|
+
group.completedSampleCount += 1;
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
if (score === null)
|
|
671
|
+
continue;
|
|
672
|
+
const rounded = round2(score);
|
|
673
|
+
scoreValues.push(rounded);
|
|
674
|
+
scoreCount += 1;
|
|
675
|
+
if (rounded >= 0) {
|
|
676
|
+
passCount += 1;
|
|
753
677
|
}
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
678
|
+
else {
|
|
679
|
+
gradingFailureCount += 1;
|
|
680
|
+
addReason("grading", extractRepresentativeReason(run));
|
|
681
|
+
if (request.scenarioRunId) {
|
|
682
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
683
|
+
if (group)
|
|
684
|
+
group.gradingFailureCount += 1;
|
|
685
|
+
}
|
|
762
686
|
}
|
|
763
|
-
|
|
687
|
+
if (request.scenarioRunId) {
|
|
688
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
689
|
+
if (group) {
|
|
690
|
+
const messageRefId = run.turns.find((turn) => typeof turn.refId === "string" && turn.refId.trim().length > 0)?.refId;
|
|
691
|
+
group.scores.push({
|
|
692
|
+
runId: run.id,
|
|
693
|
+
value: rounded,
|
|
694
|
+
messageRefId,
|
|
695
|
+
});
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
const outlierScenarioRuns = [
|
|
700
|
+
...scenarioGroups.values(),
|
|
701
|
+
].map((group) => {
|
|
702
|
+
const sortedScores = [...group.scores].sort((left, right) => left.value - right.value);
|
|
703
|
+
const numericScores = sortedScores.map((entry) => entry.value);
|
|
704
|
+
const min = numericScores.length > 0 ? numericScores[0] : null;
|
|
705
|
+
const max = numericScores.length > 0
|
|
706
|
+
? numericScores[numericScores.length - 1]
|
|
707
|
+
: null;
|
|
708
|
+
const avg = numericScores.length > 0
|
|
709
|
+
? round2(numericScores.reduce((sum, value) => sum + value, 0) /
|
|
710
|
+
numericScores.length)
|
|
711
|
+
: null;
|
|
712
|
+
return {
|
|
713
|
+
key: group.scenarioRunId,
|
|
714
|
+
scenarioRunId: group.scenarioRunId,
|
|
715
|
+
gradeSampleCount: group.requests.length,
|
|
716
|
+
completedSampleCount: group.completedSampleCount,
|
|
717
|
+
executionFailureCount: group.executionFailureCount,
|
|
718
|
+
gradingFailureCount: group.gradingFailureCount,
|
|
719
|
+
averageScore: avg,
|
|
720
|
+
minScore: min === null ? null : round2(min),
|
|
721
|
+
maxScore: max === null ? null : round2(max),
|
|
722
|
+
failed: group.executionFailureCount > 0,
|
|
723
|
+
minRunId: sortedScores[0]?.runId,
|
|
724
|
+
maxRunId: sortedScores[sortedScores.length - 1]?.runId,
|
|
725
|
+
messageRefId: sortedScores[0]?.messageRefId ??
|
|
726
|
+
sortedScores[sortedScores.length - 1]?.messageRefId,
|
|
727
|
+
};
|
|
728
|
+
}).sort((left, right) => {
|
|
729
|
+
if (left.failed !== right.failed)
|
|
730
|
+
return left.failed ? -1 : 1;
|
|
731
|
+
const leftScore = left.averageScore ?? Number.POSITIVE_INFINITY;
|
|
732
|
+
const rightScore = right.averageScore ?? Number.POSITIVE_INFINITY;
|
|
733
|
+
if (leftScore !== rightScore)
|
|
734
|
+
return leftScore - rightScore;
|
|
735
|
+
return left.scenarioRunId.localeCompare(right.scenarioRunId);
|
|
764
736
|
});
|
|
765
|
-
const
|
|
766
|
-
|
|
737
|
+
const scoreMin = scoreValues.length > 0 ? Math.min(...scoreValues) : null;
|
|
738
|
+
const scoreMax = scoreValues.length > 0 ? Math.max(...scoreValues) : null;
|
|
739
|
+
const scoreMedian = median(scoreValues);
|
|
740
|
+
const scoreMean = scoreValues.length > 0
|
|
741
|
+
? round2(scoreValues.reduce((sum, value) => sum + value, 0) / scoreValues.length)
|
|
767
742
|
: null;
|
|
768
|
-
const
|
|
769
|
-
? Math.min(...scoreDeltas)
|
|
770
|
-
: null;
|
|
771
|
-
const scoreSpreadMax = scoreDeltas.length > 0
|
|
772
|
-
? Math.max(...scoreDeltas)
|
|
773
|
-
: null;
|
|
774
|
-
const scoreSpreadMedian = median(scoreDeltas);
|
|
775
|
-
const instabilityCount = outliers.filter((entry) => entry.instability).length;
|
|
776
|
-
const verdict = resolveVerifyVerdict({
|
|
777
|
-
sampleSize,
|
|
778
|
-
agreementRate,
|
|
779
|
-
spreadMax: scoreSpreadMax,
|
|
780
|
-
instabilityCount,
|
|
781
|
-
});
|
|
743
|
+
const passRate = scoreCount > 0 ? round2(passCount / scoreCount) : null;
|
|
782
744
|
return {
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
745
|
+
scenarioRunCountRequested: args.batch.scenarioRuns,
|
|
746
|
+
scenarioRunCountCompleted: args.batch.scenarioRunsCompleted,
|
|
747
|
+
scenarioRunCountFailed: args.batch.scenarioRunsFailed,
|
|
748
|
+
gradeSampleCountRequested: args.batch.requested,
|
|
749
|
+
gradeSampleCountCompleted: args.batch.completed,
|
|
750
|
+
gradeSampleCountFailed: args.batch.failed,
|
|
751
|
+
executionFailureCount: args.batch.failed,
|
|
752
|
+
gradingFailureCount,
|
|
753
|
+
passRate,
|
|
754
|
+
scoreMin: scoreMin === null ? null : round2(scoreMin),
|
|
755
|
+
scoreMedian: scoreMedian === null ? null : round2(scoreMedian),
|
|
756
|
+
scoreMax: scoreMax === null ? null : round2(scoreMax),
|
|
757
|
+
scoreMean,
|
|
758
|
+
outlierScenarioRuns,
|
|
759
|
+
failureReasons: [...reasonCounter.values()].sort((left, right) => {
|
|
760
|
+
if (left.count !== right.count)
|
|
761
|
+
return right.count - left.count;
|
|
762
|
+
return left.reason.localeCompare(right.reason);
|
|
763
|
+
}),
|
|
794
764
|
};
|
|
795
765
|
}
|
|
796
766
|
function toWorkspaceVerifyBatchStatus(value) {
|
|
@@ -817,8 +787,8 @@ const WorkspaceVerifyBatchStatusEnum = builder.enumType("WorkspaceVerifyBatchSta
|
|
|
817
787
|
const WorkspaceVerifyBatchRequestStatusEnum = builder.enumType("WorkspaceVerifyBatchRequestStatus", {
|
|
818
788
|
values: ["QUEUED", "RUNNING", "COMPLETED", "ERROR"],
|
|
819
789
|
});
|
|
820
|
-
const
|
|
821
|
-
values: ["
|
|
790
|
+
const WorkspaceVerifyFailureReasonKindEnum = builder.enumType("WorkspaceVerifyFailureReasonKind", {
|
|
791
|
+
values: ["EXECUTION", "GRADING"],
|
|
822
792
|
});
|
|
823
793
|
const WorkspaceGradeTurnType = builder.objectRef("WorkspaceGradeTurn");
|
|
824
794
|
WorkspaceGradeTurnType.implement({
|
|
@@ -969,28 +939,44 @@ WorkspaceVerifyBatchRequestType.implement({
|
|
|
969
939
|
nullable: true,
|
|
970
940
|
resolve: (parent) => parent.runId ?? null,
|
|
971
941
|
}),
|
|
942
|
+
scenarioRunId: t.id({
|
|
943
|
+
nullable: true,
|
|
944
|
+
resolve: (parent) => parent.scenarioRunId ?? null,
|
|
945
|
+
}),
|
|
972
946
|
error: t.string({
|
|
973
947
|
nullable: true,
|
|
974
948
|
resolve: (parent) => parent.error ?? null,
|
|
975
949
|
}),
|
|
976
950
|
}),
|
|
977
951
|
});
|
|
978
|
-
const
|
|
979
|
-
|
|
952
|
+
const WorkspaceVerifyScenarioOutlierType = builder.objectRef("WorkspaceVerifyScenarioOutlier");
|
|
953
|
+
WorkspaceVerifyScenarioOutlierType.implement({
|
|
980
954
|
fields: (t) => ({
|
|
981
955
|
key: t.id({ resolve: (parent) => parent.key }),
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
956
|
+
scenarioRunId: t.id({ resolve: (parent) => parent.scenarioRunId }),
|
|
957
|
+
gradeSampleCount: t.int({ resolve: (parent) => parent.gradeSampleCount }),
|
|
958
|
+
completedSampleCount: t.int({
|
|
959
|
+
resolve: (parent) => parent.completedSampleCount,
|
|
960
|
+
}),
|
|
961
|
+
executionFailureCount: t.int({
|
|
962
|
+
resolve: (parent) => parent.executionFailureCount,
|
|
963
|
+
}),
|
|
964
|
+
gradingFailureCount: t.int({
|
|
965
|
+
resolve: (parent) => parent.gradingFailureCount,
|
|
966
|
+
}),
|
|
967
|
+
averageScore: t.float({
|
|
968
|
+
nullable: true,
|
|
969
|
+
resolve: (parent) => parent.averageScore,
|
|
970
|
+
}),
|
|
971
|
+
minScore: t.float({
|
|
985
972
|
nullable: true,
|
|
986
|
-
resolve: (parent) => parent.
|
|
973
|
+
resolve: (parent) => parent.minScore,
|
|
987
974
|
}),
|
|
988
|
-
|
|
975
|
+
maxScore: t.float({
|
|
989
976
|
nullable: true,
|
|
990
|
-
resolve: (parent) => parent.
|
|
977
|
+
resolve: (parent) => parent.maxScore,
|
|
991
978
|
}),
|
|
992
|
-
|
|
993
|
-
instability: t.boolean({ resolve: (parent) => parent.instability }),
|
|
979
|
+
failed: t.boolean({ resolve: (parent) => parent.failed }),
|
|
994
980
|
minRunId: t.id({
|
|
995
981
|
nullable: true,
|
|
996
982
|
resolve: (parent) => parent.minRunId ?? null,
|
|
@@ -999,53 +985,86 @@ WorkspaceVerifyOutlierType.implement({
|
|
|
999
985
|
nullable: true,
|
|
1000
986
|
resolve: (parent) => parent.maxRunId ?? null,
|
|
1001
987
|
}),
|
|
1002
|
-
turnIndex: t.int({
|
|
1003
|
-
nullable: true,
|
|
1004
|
-
resolve: (parent) => parent.turnIndex ?? null,
|
|
1005
|
-
}),
|
|
1006
988
|
messageRefId: t.id({
|
|
1007
989
|
nullable: true,
|
|
1008
990
|
resolve: (parent) => parent.messageRefId ?? null,
|
|
1009
991
|
}),
|
|
1010
992
|
}),
|
|
1011
993
|
});
|
|
994
|
+
const WorkspaceVerifyFailureReasonGroupType = builder.objectRef("WorkspaceVerifyFailureReasonGroup");
|
|
995
|
+
WorkspaceVerifyFailureReasonGroupType.implement({
|
|
996
|
+
fields: (t) => ({
|
|
997
|
+
key: t.id({ resolve: (parent) => parent.key }),
|
|
998
|
+
kind: t.field({
|
|
999
|
+
type: WorkspaceVerifyFailureReasonKindEnum,
|
|
1000
|
+
resolve: (parent) => parent.kind === "execution" ? "EXECUTION" : "GRADING",
|
|
1001
|
+
}),
|
|
1002
|
+
reason: t.string({ resolve: (parent) => parent.reason }),
|
|
1003
|
+
count: t.int({ resolve: (parent) => parent.count }),
|
|
1004
|
+
}),
|
|
1005
|
+
});
|
|
1012
1006
|
const WorkspaceVerifyMetricsType = builder.objectRef("WorkspaceVerifyMetrics");
|
|
1013
1007
|
WorkspaceVerifyMetricsType.implement({
|
|
1014
1008
|
fields: (t) => ({
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1009
|
+
scenarioRunCountRequested: t.int({
|
|
1010
|
+
resolve: (parent) => parent.scenarioRunCountRequested,
|
|
1011
|
+
}),
|
|
1012
|
+
scenarioRunCountCompleted: t.int({
|
|
1013
|
+
resolve: (parent) => parent.scenarioRunCountCompleted,
|
|
1014
|
+
}),
|
|
1015
|
+
scenarioRunCountFailed: t.int({
|
|
1016
|
+
resolve: (parent) => parent.scenarioRunCountFailed,
|
|
1017
|
+
}),
|
|
1018
|
+
gradeSampleCountRequested: t.int({
|
|
1019
|
+
resolve: (parent) => parent.gradeSampleCountRequested,
|
|
1020
|
+
}),
|
|
1021
|
+
gradeSampleCountCompleted: t.int({
|
|
1022
|
+
resolve: (parent) => parent.gradeSampleCountCompleted,
|
|
1023
|
+
}),
|
|
1024
|
+
gradeSampleCountFailed: t.int({
|
|
1025
|
+
resolve: (parent) => parent.gradeSampleCountFailed,
|
|
1019
1026
|
}),
|
|
1020
|
-
|
|
1027
|
+
executionFailureCount: t.int({
|
|
1028
|
+
resolve: (parent) => parent.executionFailureCount,
|
|
1029
|
+
}),
|
|
1030
|
+
gradingFailureCount: t.int({
|
|
1031
|
+
resolve: (parent) => parent.gradingFailureCount,
|
|
1032
|
+
}),
|
|
1033
|
+
passRate: t.float({
|
|
1021
1034
|
nullable: true,
|
|
1022
|
-
resolve: (parent) => parent.
|
|
1035
|
+
resolve: (parent) => parent.passRate,
|
|
1023
1036
|
}),
|
|
1024
|
-
|
|
1037
|
+
scoreMin: t.float({
|
|
1025
1038
|
nullable: true,
|
|
1026
|
-
resolve: (parent) => parent.
|
|
1039
|
+
resolve: (parent) => parent.scoreMin,
|
|
1027
1040
|
}),
|
|
1028
|
-
|
|
1041
|
+
scoreMedian: t.float({
|
|
1029
1042
|
nullable: true,
|
|
1030
|
-
resolve: (parent) => parent.
|
|
1043
|
+
resolve: (parent) => parent.scoreMedian,
|
|
1031
1044
|
}),
|
|
1032
|
-
|
|
1033
|
-
|
|
1045
|
+
scoreMax: t.float({
|
|
1046
|
+
nullable: true,
|
|
1047
|
+
resolve: (parent) => parent.scoreMax,
|
|
1034
1048
|
}),
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
resolve: (parent) => parent.
|
|
1049
|
+
scoreMean: t.float({
|
|
1050
|
+
nullable: true,
|
|
1051
|
+
resolve: (parent) => parent.scoreMean,
|
|
1038
1052
|
}),
|
|
1039
|
-
|
|
1040
|
-
|
|
1053
|
+
outlierScenarioRuns: t.connection({
|
|
1054
|
+
type: WorkspaceVerifyScenarioOutlierType,
|
|
1055
|
+
args: {
|
|
1056
|
+
first: t.arg.int(),
|
|
1057
|
+
after: t.arg.string(),
|
|
1058
|
+
},
|
|
1059
|
+
resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.outlierScenarioRuns),
|
|
1041
1060
|
}),
|
|
1042
|
-
|
|
1043
|
-
type:
|
|
1061
|
+
failureReasons: t.connection({
|
|
1062
|
+
type: WorkspaceVerifyFailureReasonGroupType,
|
|
1044
1063
|
args: {
|
|
1045
1064
|
first: t.arg.int(),
|
|
1046
1065
|
after: t.arg.string(),
|
|
1047
1066
|
},
|
|
1048
|
-
resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.
|
|
1067
|
+
resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.failureReasons),
|
|
1049
1068
|
}),
|
|
1050
1069
|
}),
|
|
1051
1070
|
});
|
|
@@ -1054,10 +1073,22 @@ WorkspaceVerifyBatchType.implement({
|
|
|
1054
1073
|
fields: (t) => ({
|
|
1055
1074
|
id: t.id({ resolve: (parent) => parent.id }),
|
|
1056
1075
|
workspaceId: t.id({ resolve: (parent) => parent.workspaceId }),
|
|
1057
|
-
|
|
1058
|
-
scenarioRunId: t.id({
|
|
1076
|
+
scenarioDeckId: t.id({
|
|
1059
1077
|
nullable: true,
|
|
1060
|
-
resolve: (parent) => parent.
|
|
1078
|
+
resolve: (parent) => parent.scenarioDeckId ?? null,
|
|
1079
|
+
}),
|
|
1080
|
+
graderId: t.id({ resolve: (parent) => parent.graderId }),
|
|
1081
|
+
scenarioRuns: t.int({
|
|
1082
|
+
resolve: (parent) => parent.scenarioRuns,
|
|
1083
|
+
}),
|
|
1084
|
+
graderRepeatsPerScenario: t.int({
|
|
1085
|
+
resolve: (parent) => parent.graderRepeatsPerScenario,
|
|
1086
|
+
}),
|
|
1087
|
+
scenarioRunsCompleted: t.int({
|
|
1088
|
+
resolve: (parent) => parent.scenarioRunsCompleted,
|
|
1089
|
+
}),
|
|
1090
|
+
scenarioRunsFailed: t.int({
|
|
1091
|
+
resolve: (parent) => parent.scenarioRunsFailed,
|
|
1061
1092
|
}),
|
|
1062
1093
|
status: t.field({
|
|
1063
1094
|
type: WorkspaceVerifyBatchStatusEnum,
|
|
@@ -1089,19 +1120,17 @@ WorkspaceVerifyBatchType.implement({
|
|
|
1089
1120
|
resolve: async (parent, _args, context) => {
|
|
1090
1121
|
if (!context.readWorkspaceGradeRuns)
|
|
1091
1122
|
return null;
|
|
1092
|
-
const
|
|
1093
|
-
.filter((request) => request.
|
|
1094
|
-
typeof request.runId === "string" &&
|
|
1123
|
+
const runIds = parent.requests
|
|
1124
|
+
.filter((request) => typeof request.runId === "string" &&
|
|
1095
1125
|
request.runId.trim().length > 0)
|
|
1096
1126
|
.map((request) => request.runId);
|
|
1097
|
-
|
|
1098
|
-
return null;
|
|
1099
|
-
const runIdSet = new Set(completedRunIds);
|
|
1127
|
+
const runIdSet = new Set(runIds);
|
|
1100
1128
|
const allRuns = await context.readWorkspaceGradeRuns(parent.workspaceId);
|
|
1101
1129
|
const selectedRuns = allRuns.filter((run) => runIdSet.has(run.id));
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1130
|
+
return buildVerifyMetrics({
|
|
1131
|
+
batch: parent,
|
|
1132
|
+
runs: selectedRuns,
|
|
1133
|
+
});
|
|
1105
1134
|
},
|
|
1106
1135
|
}),
|
|
1107
1136
|
}),
|
|
@@ -1946,7 +1975,8 @@ const WorkspaceConversationSessionStartInput = builder.inputType("WorkspaceConve
|
|
|
1946
1975
|
assistantInit: t.field({ type: "JSON" }),
|
|
1947
1976
|
graderId: t.id(),
|
|
1948
1977
|
scenarioRunId: t.id(),
|
|
1949
|
-
|
|
1978
|
+
scenarioRuns: t.int(),
|
|
1979
|
+
graderRepeatsPerScenario: t.int(),
|
|
1950
1980
|
concurrency: t.int(),
|
|
1951
1981
|
}),
|
|
1952
1982
|
});
|
|
@@ -1978,9 +2008,10 @@ const WorkspaceGradeRunCreateInput = builder.inputType("WorkspaceGradeRunCreateI
|
|
|
1978
2008
|
const WorkspaceVerifyBatchRunCreateInput = builder.inputType("WorkspaceVerifyBatchRunCreateInput", {
|
|
1979
2009
|
fields: (t) => ({
|
|
1980
2010
|
workspaceId: t.id({ required: true }),
|
|
2011
|
+
scenarioDeckId: t.id(),
|
|
1981
2012
|
graderId: t.id({ required: true }),
|
|
1982
|
-
|
|
1983
|
-
|
|
2013
|
+
scenarioRuns: t.int({ required: true }),
|
|
2014
|
+
graderRepeatsPerScenario: t.int({ required: true }),
|
|
1984
2015
|
concurrency: t.int({ required: true }),
|
|
1985
2016
|
}),
|
|
1986
2017
|
});
|
|
@@ -2188,7 +2219,8 @@ builder.mutationType({
|
|
|
2188
2219
|
assistantInit: args.input.assistantInit,
|
|
2189
2220
|
graderId: args.input.graderId ?? null,
|
|
2190
2221
|
scenarioRunId: args.input.scenarioRunId ?? null,
|
|
2191
|
-
|
|
2222
|
+
scenarioRuns: args.input.scenarioRuns ?? null,
|
|
2223
|
+
graderRepeatsPerScenario: args.input.graderRepeatsPerScenario ?? null,
|
|
2192
2224
|
concurrency: args.input.concurrency ?? null,
|
|
2193
2225
|
});
|
|
2194
2226
|
return {
|
|
@@ -2308,9 +2340,10 @@ builder.mutationType({
|
|
|
2308
2340
|
}
|
|
2309
2341
|
const batch = await context.createWorkspaceVerifyBatchRun({
|
|
2310
2342
|
workspaceId: args.input.workspaceId,
|
|
2343
|
+
scenarioDeckId: args.input.scenarioDeckId ?? null,
|
|
2311
2344
|
graderId: args.input.graderId,
|
|
2312
|
-
|
|
2313
|
-
|
|
2345
|
+
scenarioRuns: args.input.scenarioRuns,
|
|
2346
|
+
graderRepeatsPerScenario: args.input.graderRepeatsPerScenario,
|
|
2314
2347
|
concurrency: args.input.concurrency,
|
|
2315
2348
|
});
|
|
2316
2349
|
return {
|