@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +60 -1
- package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/esm/simulator-ui/src/AppShell.js +6 -4
- package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
- package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/esm/simulator-ui/src/verify_unified.js +25 -0
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/esm/src/server.d.ts.map +1 -1
- package/esm/src/server.js +227 -45
- package/esm/src/server_isograph_environment.d.ts +11 -4
- package/esm/src/server_isograph_environment.d.ts.map +1 -1
- package/esm/src/server_isograph_environment.js +5 -0
- package/esm/src/simulator_graphql.d.ts +11 -4
- package/esm/src/simulator_graphql.d.ts.map +1 -1
- package/esm/src/simulator_graphql.js +278 -245
- package/package.json +2 -2
- package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
- package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
- package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
- package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
- package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
- package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
- package/script/simulator-ui/src/AppShell.js +6 -4
- package/script/simulator-ui/src/verify_unified.d.ts +17 -0
- package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
- package/script/simulator-ui/src/verify_unified.js +29 -0
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
- package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
- package/script/src/server.d.ts.map +1 -1
- package/script/src/server.js +227 -45
- package/script/src/server_isograph_environment.d.ts +11 -4
- package/script/src/server_isograph_environment.d.ts.map +1 -1
- package/script/src/server_isograph_environment.js +5 -0
- package/script/src/simulator_graphql.d.ts +11 -4
- package/script/src/simulator_graphql.d.ts.map +1 -1
- package/script/src/simulator_graphql.js +278 -245
- package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/esm/simulator-ui/src/verify_metrics.js +0 -260
- package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
- package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
- package/script/simulator-ui/src/verify_metrics.js +0 -264
|
@@ -537,20 +537,6 @@ function toWorkspaceGradeRunStatus(value) {
|
|
|
537
537
|
return "COMPLETED";
|
|
538
538
|
return "ERROR";
|
|
539
539
|
}
|
|
540
|
-
const VERIFY_CONSISTENCY_THRESHOLDS = {
|
|
541
|
-
minSampleSize: 6,
|
|
542
|
-
instabilityScoreDelta: 1.5,
|
|
543
|
-
pass: {
|
|
544
|
-
agreementMin: 0.9,
|
|
545
|
-
maxSpread: 1,
|
|
546
|
-
maxInstabilityCount: 0,
|
|
547
|
-
},
|
|
548
|
-
warn: {
|
|
549
|
-
agreementMin: 0.75,
|
|
550
|
-
maxSpread: 2,
|
|
551
|
-
maxInstabilityCount: 2,
|
|
552
|
-
},
|
|
553
|
-
};
|
|
554
540
|
function round2(value) {
|
|
555
541
|
return Math.round(value * 100) / 100;
|
|
556
542
|
}
|
|
@@ -563,195 +549,179 @@ function median(values) {
|
|
|
563
549
|
return sorted[mid];
|
|
564
550
|
return (sorted[mid - 1] + sorted[mid]) / 2;
|
|
565
551
|
}
|
|
566
|
-
function
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
? turn.turnIndex
|
|
571
|
-
: fallbackIndex;
|
|
572
|
-
const key = turn.refId && turn.refId.trim().length > 0
|
|
573
|
-
? `ref:${turn.refId}`
|
|
574
|
-
: `turn:${index}`;
|
|
575
|
-
const pass = typeof turn.score === "number" ? turn.score >= 0 : undefined;
|
|
576
|
-
return {
|
|
577
|
-
key,
|
|
578
|
-
label: `Assistant turn ${turn.turnNumber || fallbackIndex + 1}`,
|
|
579
|
-
points: [{
|
|
580
|
-
runId: run.id,
|
|
581
|
-
score: turn.score,
|
|
582
|
-
pass,
|
|
583
|
-
reason: turn.reason,
|
|
584
|
-
turnIndex: index,
|
|
585
|
-
messageRefId: turn.refId,
|
|
586
|
-
}],
|
|
587
|
-
};
|
|
588
|
-
});
|
|
552
|
+
function extractRepresentativeScore(run) {
|
|
553
|
+
const summaryScore = run.summary?.score;
|
|
554
|
+
if (typeof summaryScore === "number" && Number.isFinite(summaryScore)) {
|
|
555
|
+
return summaryScore;
|
|
589
556
|
}
|
|
590
|
-
const
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
points: [{
|
|
597
|
-
runId: run.id,
|
|
598
|
-
score,
|
|
599
|
-
pass,
|
|
600
|
-
reason,
|
|
601
|
-
}],
|
|
602
|
-
}];
|
|
557
|
+
const turnScores = run.turns
|
|
558
|
+
.map((turn) => turn.score)
|
|
559
|
+
.filter((score) => typeof score === "number" && Number.isFinite(score));
|
|
560
|
+
if (turnScores.length === 0)
|
|
561
|
+
return null;
|
|
562
|
+
return median(turnScores);
|
|
603
563
|
}
|
|
604
|
-
function
|
|
605
|
-
if (
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
reason: `Need at least ${VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} samples before issuing a firm verdict.`,
|
|
609
|
-
};
|
|
610
|
-
}
|
|
611
|
-
if (input.agreementRate === null) {
|
|
612
|
-
return {
|
|
613
|
-
verdict: "WARN",
|
|
614
|
-
reason: "No comparable pass/fail evidence was found in the sampled runs.",
|
|
615
|
-
};
|
|
616
|
-
}
|
|
617
|
-
const spreadMax = input.spreadMax ?? 0;
|
|
618
|
-
if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin &&
|
|
619
|
-
spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread &&
|
|
620
|
-
input.instabilityCount <=
|
|
621
|
-
VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount) {
|
|
622
|
-
return {
|
|
623
|
-
verdict: "PASS",
|
|
624
|
-
reason: "Agreement, spread, and instability all meet PASS thresholds.",
|
|
625
|
-
};
|
|
626
|
-
}
|
|
627
|
-
if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin &&
|
|
628
|
-
spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread &&
|
|
629
|
-
input.instabilityCount <=
|
|
630
|
-
VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount) {
|
|
631
|
-
return {
|
|
632
|
-
verdict: "WARN",
|
|
633
|
-
reason: "Some variation was detected, but results remain within WARN thresholds.",
|
|
634
|
-
};
|
|
564
|
+
function extractRepresentativeReason(run) {
|
|
565
|
+
if (typeof run.summary?.reason === "string" &&
|
|
566
|
+
run.summary.reason.trim().length > 0) {
|
|
567
|
+
return run.summary.reason.trim();
|
|
635
568
|
}
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
reason: "Agreement/spread instability exceeds WARN thresholds.",
|
|
639
|
-
};
|
|
569
|
+
const turnReason = run.turns.find((turn) => typeof turn.reason === "string" && turn.reason.trim().length > 0)?.reason;
|
|
570
|
+
return typeof turnReason === "string" ? turnReason.trim() : null;
|
|
640
571
|
}
|
|
641
|
-
function
|
|
642
|
-
const
|
|
643
|
-
const
|
|
644
|
-
const
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
existing.
|
|
657
|
-
|
|
658
|
-
});
|
|
659
|
-
const outliers = [];
|
|
660
|
-
let agreementVotes = 0;
|
|
661
|
-
let agreementTotal = 0;
|
|
662
|
-
const scoreDeltas = [];
|
|
663
|
-
bucketsByKey.forEach((bucket) => {
|
|
664
|
-
const scores = bucket.points
|
|
665
|
-
.map((point) => point.score)
|
|
666
|
-
.filter((score) => typeof score === "number" && Number.isFinite(score));
|
|
667
|
-
const minScore = scores.length > 0 ? Math.min(...scores) : null;
|
|
668
|
-
const maxScore = scores.length > 0 ? Math.max(...scores) : null;
|
|
669
|
-
const scoreDelta = minScore !== null && maxScore !== null
|
|
670
|
-
? round2(maxScore - minScore)
|
|
671
|
-
: null;
|
|
672
|
-
const passVotes = bucket.points
|
|
673
|
-
.map((point) => point.pass)
|
|
674
|
-
.filter((pass) => typeof pass === "boolean");
|
|
675
|
-
const passCount = passVotes.filter((value) => value).length;
|
|
676
|
-
const failCount = passVotes.length - passCount;
|
|
677
|
-
const agreementRate = passVotes.length > 0
|
|
678
|
-
? round2(Math.max(passCount, failCount) / passVotes.length)
|
|
679
|
-
: null;
|
|
680
|
-
if (passVotes.length > 0) {
|
|
681
|
-
agreementVotes += Math.max(passCount, failCount);
|
|
682
|
-
agreementTotal += passVotes.length;
|
|
572
|
+
function buildVerifyMetrics(args) {
|
|
573
|
+
const runById = new Map(args.runs.map((run) => [run.id, run]));
|
|
574
|
+
const reasonCounter = new Map();
|
|
575
|
+
const scoreValues = [];
|
|
576
|
+
let passCount = 0;
|
|
577
|
+
let scoreCount = 0;
|
|
578
|
+
let gradingFailureCount = 0;
|
|
579
|
+
const scenarioGroups = new Map();
|
|
580
|
+
const addReason = (kind, reason) => {
|
|
581
|
+
const normalized = typeof reason === "string" ? reason.trim() : "";
|
|
582
|
+
if (!normalized)
|
|
583
|
+
return;
|
|
584
|
+
const key = `${kind}:${normalized}`;
|
|
585
|
+
const existing = reasonCounter.get(key);
|
|
586
|
+
if (existing) {
|
|
587
|
+
existing.count += 1;
|
|
588
|
+
return;
|
|
683
589
|
}
|
|
684
|
-
|
|
685
|
-
|
|
590
|
+
reasonCounter.set(key, { key, kind, reason: normalized, count: 1 });
|
|
591
|
+
};
|
|
592
|
+
for (const request of args.batch.requests) {
|
|
593
|
+
if (!request.scenarioRunId)
|
|
594
|
+
continue;
|
|
595
|
+
const existing = scenarioGroups.get(request.scenarioRunId);
|
|
596
|
+
if (existing) {
|
|
597
|
+
existing.requests.push(request);
|
|
598
|
+
continue;
|
|
686
599
|
}
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
:
|
|
694
|
-
const maxPoint = maxScore === null
|
|
695
|
-
? undefined
|
|
696
|
-
: bucket.points.find((point) => point.score === maxScore);
|
|
697
|
-
outliers.push({
|
|
698
|
-
key: bucket.key,
|
|
699
|
-
label: bucket.label,
|
|
700
|
-
sampleSize: bucket.points.length,
|
|
701
|
-
agreementRate,
|
|
702
|
-
scoreDelta,
|
|
703
|
-
passFlip,
|
|
704
|
-
instability,
|
|
705
|
-
minRunId: minPoint?.runId,
|
|
706
|
-
maxRunId: maxPoint?.runId,
|
|
707
|
-
turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
|
|
708
|
-
messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
|
|
600
|
+
scenarioGroups.set(request.scenarioRunId, {
|
|
601
|
+
scenarioRunId: request.scenarioRunId,
|
|
602
|
+
requests: [request],
|
|
603
|
+
scores: [],
|
|
604
|
+
executionFailureCount: 0,
|
|
605
|
+
gradingFailureCount: 0,
|
|
606
|
+
completedSampleCount: 0,
|
|
709
607
|
});
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
if (
|
|
713
|
-
|
|
608
|
+
}
|
|
609
|
+
for (const request of args.batch.requests) {
|
|
610
|
+
if (request.status === "error") {
|
|
611
|
+
addReason("execution", request.error);
|
|
612
|
+
if (request.scenarioRunId) {
|
|
613
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
614
|
+
if (group)
|
|
615
|
+
group.executionFailureCount += 1;
|
|
616
|
+
}
|
|
617
|
+
continue;
|
|
618
|
+
}
|
|
619
|
+
if (request.status !== "completed" || !request.runId)
|
|
620
|
+
continue;
|
|
621
|
+
const run = runById.get(request.runId);
|
|
622
|
+
if (!run || run.status !== "completed")
|
|
623
|
+
continue;
|
|
624
|
+
const score = extractRepresentativeScore(run);
|
|
625
|
+
if (request.scenarioRunId) {
|
|
626
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
627
|
+
if (group) {
|
|
628
|
+
group.completedSampleCount += 1;
|
|
629
|
+
}
|
|
714
630
|
}
|
|
715
|
-
if (
|
|
716
|
-
|
|
717
|
-
const
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
631
|
+
if (score === null)
|
|
632
|
+
continue;
|
|
633
|
+
const rounded = round2(score);
|
|
634
|
+
scoreValues.push(rounded);
|
|
635
|
+
scoreCount += 1;
|
|
636
|
+
if (rounded >= 0) {
|
|
637
|
+
passCount += 1;
|
|
638
|
+
}
|
|
639
|
+
else {
|
|
640
|
+
gradingFailureCount += 1;
|
|
641
|
+
addReason("grading", extractRepresentativeReason(run));
|
|
642
|
+
if (request.scenarioRunId) {
|
|
643
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
644
|
+
if (group)
|
|
645
|
+
group.gradingFailureCount += 1;
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
if (request.scenarioRunId) {
|
|
649
|
+
const group = scenarioGroups.get(request.scenarioRunId);
|
|
650
|
+
if (group) {
|
|
651
|
+
const messageRefId = run.turns.find((turn) => typeof turn.refId === "string" && turn.refId.trim().length > 0)?.refId;
|
|
652
|
+
group.scores.push({
|
|
653
|
+
runId: run.id,
|
|
654
|
+
value: rounded,
|
|
655
|
+
messageRefId,
|
|
656
|
+
});
|
|
657
|
+
}
|
|
723
658
|
}
|
|
724
|
-
|
|
659
|
+
}
|
|
660
|
+
const outlierScenarioRuns = [
|
|
661
|
+
...scenarioGroups.values(),
|
|
662
|
+
].map((group) => {
|
|
663
|
+
const sortedScores = [...group.scores].sort((left, right) => left.value - right.value);
|
|
664
|
+
const numericScores = sortedScores.map((entry) => entry.value);
|
|
665
|
+
const min = numericScores.length > 0 ? numericScores[0] : null;
|
|
666
|
+
const max = numericScores.length > 0
|
|
667
|
+
? numericScores[numericScores.length - 1]
|
|
668
|
+
: null;
|
|
669
|
+
const avg = numericScores.length > 0
|
|
670
|
+
? round2(numericScores.reduce((sum, value) => sum + value, 0) /
|
|
671
|
+
numericScores.length)
|
|
672
|
+
: null;
|
|
673
|
+
return {
|
|
674
|
+
key: group.scenarioRunId,
|
|
675
|
+
scenarioRunId: group.scenarioRunId,
|
|
676
|
+
gradeSampleCount: group.requests.length,
|
|
677
|
+
completedSampleCount: group.completedSampleCount,
|
|
678
|
+
executionFailureCount: group.executionFailureCount,
|
|
679
|
+
gradingFailureCount: group.gradingFailureCount,
|
|
680
|
+
averageScore: avg,
|
|
681
|
+
minScore: min === null ? null : round2(min),
|
|
682
|
+
maxScore: max === null ? null : round2(max),
|
|
683
|
+
failed: group.executionFailureCount > 0,
|
|
684
|
+
minRunId: sortedScores[0]?.runId,
|
|
685
|
+
maxRunId: sortedScores[sortedScores.length - 1]?.runId,
|
|
686
|
+
messageRefId: sortedScores[0]?.messageRefId ??
|
|
687
|
+
sortedScores[sortedScores.length - 1]?.messageRefId,
|
|
688
|
+
};
|
|
689
|
+
}).sort((left, right) => {
|
|
690
|
+
if (left.failed !== right.failed)
|
|
691
|
+
return left.failed ? -1 : 1;
|
|
692
|
+
const leftScore = left.averageScore ?? Number.POSITIVE_INFINITY;
|
|
693
|
+
const rightScore = right.averageScore ?? Number.POSITIVE_INFINITY;
|
|
694
|
+
if (leftScore !== rightScore)
|
|
695
|
+
return leftScore - rightScore;
|
|
696
|
+
return left.scenarioRunId.localeCompare(right.scenarioRunId);
|
|
725
697
|
});
|
|
726
|
-
const
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
const
|
|
730
|
-
?
|
|
698
|
+
const scoreMin = scoreValues.length > 0 ? Math.min(...scoreValues) : null;
|
|
699
|
+
const scoreMax = scoreValues.length > 0 ? Math.max(...scoreValues) : null;
|
|
700
|
+
const scoreMedian = median(scoreValues);
|
|
701
|
+
const scoreMean = scoreValues.length > 0
|
|
702
|
+
? round2(scoreValues.reduce((sum, value) => sum + value, 0) / scoreValues.length)
|
|
731
703
|
: null;
|
|
732
|
-
const
|
|
733
|
-
? Math.max(...scoreDeltas)
|
|
734
|
-
: null;
|
|
735
|
-
const scoreSpreadMedian = median(scoreDeltas);
|
|
736
|
-
const instabilityCount = outliers.filter((entry) => entry.instability).length;
|
|
737
|
-
const verdict = resolveVerifyVerdict({
|
|
738
|
-
sampleSize,
|
|
739
|
-
agreementRate,
|
|
740
|
-
spreadMax: scoreSpreadMax,
|
|
741
|
-
instabilityCount,
|
|
742
|
-
});
|
|
704
|
+
const passRate = scoreCount > 0 ? round2(passCount / scoreCount) : null;
|
|
743
705
|
return {
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
706
|
+
scenarioRunCountRequested: args.batch.scenarioRuns,
|
|
707
|
+
scenarioRunCountCompleted: args.batch.scenarioRunsCompleted,
|
|
708
|
+
scenarioRunCountFailed: args.batch.scenarioRunsFailed,
|
|
709
|
+
gradeSampleCountRequested: args.batch.requested,
|
|
710
|
+
gradeSampleCountCompleted: args.batch.completed,
|
|
711
|
+
gradeSampleCountFailed: args.batch.failed,
|
|
712
|
+
executionFailureCount: args.batch.failed,
|
|
713
|
+
gradingFailureCount,
|
|
714
|
+
passRate,
|
|
715
|
+
scoreMin: scoreMin === null ? null : round2(scoreMin),
|
|
716
|
+
scoreMedian: scoreMedian === null ? null : round2(scoreMedian),
|
|
717
|
+
scoreMax: scoreMax === null ? null : round2(scoreMax),
|
|
718
|
+
scoreMean,
|
|
719
|
+
outlierScenarioRuns,
|
|
720
|
+
failureReasons: [...reasonCounter.values()].sort((left, right) => {
|
|
721
|
+
if (left.count !== right.count)
|
|
722
|
+
return right.count - left.count;
|
|
723
|
+
return left.reason.localeCompare(right.reason);
|
|
724
|
+
}),
|
|
755
725
|
};
|
|
756
726
|
}
|
|
757
727
|
function toWorkspaceVerifyBatchStatus(value) {
|
|
@@ -778,8 +748,8 @@ const WorkspaceVerifyBatchStatusEnum = builder.enumType("WorkspaceVerifyBatchSta
|
|
|
778
748
|
const WorkspaceVerifyBatchRequestStatusEnum = builder.enumType("WorkspaceVerifyBatchRequestStatus", {
|
|
779
749
|
values: ["QUEUED", "RUNNING", "COMPLETED", "ERROR"],
|
|
780
750
|
});
|
|
781
|
-
const
|
|
782
|
-
values: ["
|
|
751
|
+
const WorkspaceVerifyFailureReasonKindEnum = builder.enumType("WorkspaceVerifyFailureReasonKind", {
|
|
752
|
+
values: ["EXECUTION", "GRADING"],
|
|
783
753
|
});
|
|
784
754
|
const WorkspaceGradeTurnType = builder.objectRef("WorkspaceGradeTurn");
|
|
785
755
|
WorkspaceGradeTurnType.implement({
|
|
@@ -930,28 +900,44 @@ WorkspaceVerifyBatchRequestType.implement({
|
|
|
930
900
|
nullable: true,
|
|
931
901
|
resolve: (parent) => parent.runId ?? null,
|
|
932
902
|
}),
|
|
903
|
+
scenarioRunId: t.id({
|
|
904
|
+
nullable: true,
|
|
905
|
+
resolve: (parent) => parent.scenarioRunId ?? null,
|
|
906
|
+
}),
|
|
933
907
|
error: t.string({
|
|
934
908
|
nullable: true,
|
|
935
909
|
resolve: (parent) => parent.error ?? null,
|
|
936
910
|
}),
|
|
937
911
|
}),
|
|
938
912
|
});
|
|
939
|
-
const
|
|
940
|
-
|
|
913
|
+
const WorkspaceVerifyScenarioOutlierType = builder.objectRef("WorkspaceVerifyScenarioOutlier");
|
|
914
|
+
WorkspaceVerifyScenarioOutlierType.implement({
|
|
941
915
|
fields: (t) => ({
|
|
942
916
|
key: t.id({ resolve: (parent) => parent.key }),
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
917
|
+
scenarioRunId: t.id({ resolve: (parent) => parent.scenarioRunId }),
|
|
918
|
+
gradeSampleCount: t.int({ resolve: (parent) => parent.gradeSampleCount }),
|
|
919
|
+
completedSampleCount: t.int({
|
|
920
|
+
resolve: (parent) => parent.completedSampleCount,
|
|
921
|
+
}),
|
|
922
|
+
executionFailureCount: t.int({
|
|
923
|
+
resolve: (parent) => parent.executionFailureCount,
|
|
924
|
+
}),
|
|
925
|
+
gradingFailureCount: t.int({
|
|
926
|
+
resolve: (parent) => parent.gradingFailureCount,
|
|
927
|
+
}),
|
|
928
|
+
averageScore: t.float({
|
|
946
929
|
nullable: true,
|
|
947
|
-
resolve: (parent) => parent.
|
|
930
|
+
resolve: (parent) => parent.averageScore,
|
|
948
931
|
}),
|
|
949
|
-
|
|
932
|
+
minScore: t.float({
|
|
950
933
|
nullable: true,
|
|
951
|
-
resolve: (parent) => parent.
|
|
934
|
+
resolve: (parent) => parent.minScore,
|
|
952
935
|
}),
|
|
953
|
-
|
|
954
|
-
|
|
936
|
+
maxScore: t.float({
|
|
937
|
+
nullable: true,
|
|
938
|
+
resolve: (parent) => parent.maxScore,
|
|
939
|
+
}),
|
|
940
|
+
failed: t.boolean({ resolve: (parent) => parent.failed }),
|
|
955
941
|
minRunId: t.id({
|
|
956
942
|
nullable: true,
|
|
957
943
|
resolve: (parent) => parent.minRunId ?? null,
|
|
@@ -960,53 +946,86 @@ WorkspaceVerifyOutlierType.implement({
|
|
|
960
946
|
nullable: true,
|
|
961
947
|
resolve: (parent) => parent.maxRunId ?? null,
|
|
962
948
|
}),
|
|
963
|
-
turnIndex: t.int({
|
|
964
|
-
nullable: true,
|
|
965
|
-
resolve: (parent) => parent.turnIndex ?? null,
|
|
966
|
-
}),
|
|
967
949
|
messageRefId: t.id({
|
|
968
950
|
nullable: true,
|
|
969
951
|
resolve: (parent) => parent.messageRefId ?? null,
|
|
970
952
|
}),
|
|
971
953
|
}),
|
|
972
954
|
});
|
|
955
|
+
const WorkspaceVerifyFailureReasonGroupType = builder.objectRef("WorkspaceVerifyFailureReasonGroup");
|
|
956
|
+
WorkspaceVerifyFailureReasonGroupType.implement({
|
|
957
|
+
fields: (t) => ({
|
|
958
|
+
key: t.id({ resolve: (parent) => parent.key }),
|
|
959
|
+
kind: t.field({
|
|
960
|
+
type: WorkspaceVerifyFailureReasonKindEnum,
|
|
961
|
+
resolve: (parent) => parent.kind === "execution" ? "EXECUTION" : "GRADING",
|
|
962
|
+
}),
|
|
963
|
+
reason: t.string({ resolve: (parent) => parent.reason }),
|
|
964
|
+
count: t.int({ resolve: (parent) => parent.count }),
|
|
965
|
+
}),
|
|
966
|
+
});
|
|
973
967
|
const WorkspaceVerifyMetricsType = builder.objectRef("WorkspaceVerifyMetrics");
|
|
974
968
|
WorkspaceVerifyMetricsType.implement({
|
|
975
969
|
fields: (t) => ({
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
970
|
+
scenarioRunCountRequested: t.int({
|
|
971
|
+
resolve: (parent) => parent.scenarioRunCountRequested,
|
|
972
|
+
}),
|
|
973
|
+
scenarioRunCountCompleted: t.int({
|
|
974
|
+
resolve: (parent) => parent.scenarioRunCountCompleted,
|
|
975
|
+
}),
|
|
976
|
+
scenarioRunCountFailed: t.int({
|
|
977
|
+
resolve: (parent) => parent.scenarioRunCountFailed,
|
|
978
|
+
}),
|
|
979
|
+
gradeSampleCountRequested: t.int({
|
|
980
|
+
resolve: (parent) => parent.gradeSampleCountRequested,
|
|
981
|
+
}),
|
|
982
|
+
gradeSampleCountCompleted: t.int({
|
|
983
|
+
resolve: (parent) => parent.gradeSampleCountCompleted,
|
|
984
|
+
}),
|
|
985
|
+
gradeSampleCountFailed: t.int({
|
|
986
|
+
resolve: (parent) => parent.gradeSampleCountFailed,
|
|
980
987
|
}),
|
|
981
|
-
|
|
988
|
+
executionFailureCount: t.int({
|
|
989
|
+
resolve: (parent) => parent.executionFailureCount,
|
|
990
|
+
}),
|
|
991
|
+
gradingFailureCount: t.int({
|
|
992
|
+
resolve: (parent) => parent.gradingFailureCount,
|
|
993
|
+
}),
|
|
994
|
+
passRate: t.float({
|
|
982
995
|
nullable: true,
|
|
983
|
-
resolve: (parent) => parent.
|
|
996
|
+
resolve: (parent) => parent.passRate,
|
|
984
997
|
}),
|
|
985
|
-
|
|
998
|
+
scoreMin: t.float({
|
|
986
999
|
nullable: true,
|
|
987
|
-
resolve: (parent) => parent.
|
|
1000
|
+
resolve: (parent) => parent.scoreMin,
|
|
988
1001
|
}),
|
|
989
|
-
|
|
1002
|
+
scoreMedian: t.float({
|
|
990
1003
|
nullable: true,
|
|
991
|
-
resolve: (parent) => parent.
|
|
1004
|
+
resolve: (parent) => parent.scoreMedian,
|
|
992
1005
|
}),
|
|
993
|
-
|
|
994
|
-
|
|
1006
|
+
scoreMax: t.float({
|
|
1007
|
+
nullable: true,
|
|
1008
|
+
resolve: (parent) => parent.scoreMax,
|
|
995
1009
|
}),
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
resolve: (parent) => parent.
|
|
1010
|
+
scoreMean: t.float({
|
|
1011
|
+
nullable: true,
|
|
1012
|
+
resolve: (parent) => parent.scoreMean,
|
|
999
1013
|
}),
|
|
1000
|
-
|
|
1001
|
-
|
|
1014
|
+
outlierScenarioRuns: t.connection({
|
|
1015
|
+
type: WorkspaceVerifyScenarioOutlierType,
|
|
1016
|
+
args: {
|
|
1017
|
+
first: t.arg.int(),
|
|
1018
|
+
after: t.arg.string(),
|
|
1019
|
+
},
|
|
1020
|
+
resolve: (parent, args) => resolveArrayConnection({ args }, parent.outlierScenarioRuns),
|
|
1002
1021
|
}),
|
|
1003
|
-
|
|
1004
|
-
type:
|
|
1022
|
+
failureReasons: t.connection({
|
|
1023
|
+
type: WorkspaceVerifyFailureReasonGroupType,
|
|
1005
1024
|
args: {
|
|
1006
1025
|
first: t.arg.int(),
|
|
1007
1026
|
after: t.arg.string(),
|
|
1008
1027
|
},
|
|
1009
|
-
resolve: (parent, args) => resolveArrayConnection({ args }, parent.
|
|
1028
|
+
resolve: (parent, args) => resolveArrayConnection({ args }, parent.failureReasons),
|
|
1010
1029
|
}),
|
|
1011
1030
|
}),
|
|
1012
1031
|
});
|
|
@@ -1015,10 +1034,22 @@ WorkspaceVerifyBatchType.implement({
|
|
|
1015
1034
|
fields: (t) => ({
|
|
1016
1035
|
id: t.id({ resolve: (parent) => parent.id }),
|
|
1017
1036
|
workspaceId: t.id({ resolve: (parent) => parent.workspaceId }),
|
|
1018
|
-
|
|
1019
|
-
scenarioRunId: t.id({
|
|
1037
|
+
scenarioDeckId: t.id({
|
|
1020
1038
|
nullable: true,
|
|
1021
|
-
resolve: (parent) => parent.
|
|
1039
|
+
resolve: (parent) => parent.scenarioDeckId ?? null,
|
|
1040
|
+
}),
|
|
1041
|
+
graderId: t.id({ resolve: (parent) => parent.graderId }),
|
|
1042
|
+
scenarioRuns: t.int({
|
|
1043
|
+
resolve: (parent) => parent.scenarioRuns,
|
|
1044
|
+
}),
|
|
1045
|
+
graderRepeatsPerScenario: t.int({
|
|
1046
|
+
resolve: (parent) => parent.graderRepeatsPerScenario,
|
|
1047
|
+
}),
|
|
1048
|
+
scenarioRunsCompleted: t.int({
|
|
1049
|
+
resolve: (parent) => parent.scenarioRunsCompleted,
|
|
1050
|
+
}),
|
|
1051
|
+
scenarioRunsFailed: t.int({
|
|
1052
|
+
resolve: (parent) => parent.scenarioRunsFailed,
|
|
1022
1053
|
}),
|
|
1023
1054
|
status: t.field({
|
|
1024
1055
|
type: WorkspaceVerifyBatchStatusEnum,
|
|
@@ -1050,19 +1081,17 @@ WorkspaceVerifyBatchType.implement({
|
|
|
1050
1081
|
resolve: async (parent, _args, context) => {
|
|
1051
1082
|
if (!context.readWorkspaceGradeRuns)
|
|
1052
1083
|
return null;
|
|
1053
|
-
const
|
|
1054
|
-
.filter((request) => request.
|
|
1055
|
-
typeof request.runId === "string" &&
|
|
1084
|
+
const runIds = parent.requests
|
|
1085
|
+
.filter((request) => typeof request.runId === "string" &&
|
|
1056
1086
|
request.runId.trim().length > 0)
|
|
1057
1087
|
.map((request) => request.runId);
|
|
1058
|
-
|
|
1059
|
-
return null;
|
|
1060
|
-
const runIdSet = new Set(completedRunIds);
|
|
1088
|
+
const runIdSet = new Set(runIds);
|
|
1061
1089
|
const allRuns = await context.readWorkspaceGradeRuns(parent.workspaceId);
|
|
1062
1090
|
const selectedRuns = allRuns.filter((run) => runIdSet.has(run.id));
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1091
|
+
return buildVerifyMetrics({
|
|
1092
|
+
batch: parent,
|
|
1093
|
+
runs: selectedRuns,
|
|
1094
|
+
});
|
|
1066
1095
|
},
|
|
1067
1096
|
}),
|
|
1068
1097
|
}),
|
|
@@ -1907,7 +1936,8 @@ const WorkspaceConversationSessionStartInput = builder.inputType("WorkspaceConve
|
|
|
1907
1936
|
assistantInit: t.field({ type: "JSON" }),
|
|
1908
1937
|
graderId: t.id(),
|
|
1909
1938
|
scenarioRunId: t.id(),
|
|
1910
|
-
|
|
1939
|
+
scenarioRuns: t.int(),
|
|
1940
|
+
graderRepeatsPerScenario: t.int(),
|
|
1911
1941
|
concurrency: t.int(),
|
|
1912
1942
|
}),
|
|
1913
1943
|
});
|
|
@@ -1939,9 +1969,10 @@ const WorkspaceGradeRunCreateInput = builder.inputType("WorkspaceGradeRunCreateI
|
|
|
1939
1969
|
const WorkspaceVerifyBatchRunCreateInput = builder.inputType("WorkspaceVerifyBatchRunCreateInput", {
|
|
1940
1970
|
fields: (t) => ({
|
|
1941
1971
|
workspaceId: t.id({ required: true }),
|
|
1972
|
+
scenarioDeckId: t.id(),
|
|
1942
1973
|
graderId: t.id({ required: true }),
|
|
1943
|
-
|
|
1944
|
-
|
|
1974
|
+
scenarioRuns: t.int({ required: true }),
|
|
1975
|
+
graderRepeatsPerScenario: t.int({ required: true }),
|
|
1945
1976
|
concurrency: t.int({ required: true }),
|
|
1946
1977
|
}),
|
|
1947
1978
|
});
|
|
@@ -2149,7 +2180,8 @@ builder.mutationType({
|
|
|
2149
2180
|
assistantInit: args.input.assistantInit,
|
|
2150
2181
|
graderId: args.input.graderId ?? null,
|
|
2151
2182
|
scenarioRunId: args.input.scenarioRunId ?? null,
|
|
2152
|
-
|
|
2183
|
+
scenarioRuns: args.input.scenarioRuns ?? null,
|
|
2184
|
+
graderRepeatsPerScenario: args.input.graderRepeatsPerScenario ?? null,
|
|
2153
2185
|
concurrency: args.input.concurrency ?? null,
|
|
2154
2186
|
});
|
|
2155
2187
|
return {
|
|
@@ -2269,9 +2301,10 @@ builder.mutationType({
|
|
|
2269
2301
|
}
|
|
2270
2302
|
const batch = await context.createWorkspaceVerifyBatchRun({
|
|
2271
2303
|
workspaceId: args.input.workspaceId,
|
|
2304
|
+
scenarioDeckId: args.input.scenarioDeckId ?? null,
|
|
2272
2305
|
graderId: args.input.graderId,
|
|
2273
|
-
|
|
2274
|
-
|
|
2306
|
+
scenarioRuns: args.input.scenarioRuns,
|
|
2307
|
+
graderRepeatsPerScenario: args.input.graderRepeatsPerScenario,
|
|
2275
2308
|
concurrency: args.input.concurrency,
|
|
2276
2309
|
});
|
|
2277
2310
|
return {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@bolt-foundry/gambit",
|
|
3
|
-
"version": "0.8.6-rc.
|
|
3
|
+
"version": "0.8.6-rc.2",
|
|
4
4
|
"description": "Agent harness framework for building, running, and verifying LLM workflows in Markdown and code.",
|
|
5
5
|
"homepage": "https://github.com/bolt-foundry/gambit",
|
|
6
6
|
"repository": {
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"gambit": "bin/gambit.cjs"
|
|
25
25
|
},
|
|
26
26
|
"dependencies": {
|
|
27
|
-
"@bolt-foundry/gambit-core": "
|
|
27
|
+
"@bolt-foundry/gambit-core": "file:../../packages/gambit-core/dist/npm",
|
|
28
28
|
"@isograph/react": "0.5.2",
|
|
29
29
|
"@pothos/core": "^4.7.3",
|
|
30
30
|
"@pothos/plugin-relay": "^4.4.2",
|