agentv 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-MHWYA4CS.js → chunk-AX4CQS45.js} +300 -283
- package/dist/chunk-AX4CQS45.js.map +1 -0
- package/dist/{chunk-YXXD27OK.js → chunk-I6UE4LHZ.js} +1232 -439
- package/dist/chunk-I6UE4LHZ.js.map +1 -0
- package/dist/{chunk-NSVFUL27.js → chunk-VEAOMKNS.js} +4420 -3603
- package/dist/chunk-VEAOMKNS.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-BN5NUVAB.js → dist-XRVHRBJF.js} +16 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-DMSVE6CS.js → interactive-UBEMNJZG.js} +10 -47
- package/dist/interactive-UBEMNJZG.js.map +1 -0
- package/dist/studio/assets/index-DHxVz6M9.css +1 -0
- package/dist/studio/assets/{index-C7TnyYee.js → index-DcwjOyrk.js} +1 -1
- package/dist/studio/assets/index-Y5InSvcS.js +65 -0
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-MHWYA4CS.js.map +0 -1
- package/dist/chunk-NSVFUL27.js.map +0 -1
- package/dist/chunk-YXXD27OK.js.map +0 -1
- package/dist/interactive-DMSVE6CS.js.map +0 -1
- package/dist/studio/assets/index-jJVIJh8b.css +0 -1
- package/dist/studio/assets/index-vn54AYtS.js +0 -65
- /package/dist/{dist-BN5NUVAB.js.map → dist-XRVHRBJF.js.map} +0 -0
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-75RFVESM.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-75RFVESM.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -633,15 +633,13 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
633
633
|
}
|
|
634
634
|
var CliHealthcheckHttpInputSchema = external_exports2.object({
|
|
635
635
|
url: external_exports2.string().min(1, "healthcheck URL is required"),
|
|
636
|
-
timeout_seconds: external_exports2.number().positive().optional()
|
|
637
|
-
|
|
638
|
-
});
|
|
636
|
+
timeout_seconds: external_exports2.number().positive().optional()
|
|
637
|
+
}).passthrough();
|
|
639
638
|
var CliHealthcheckCommandInputSchema = external_exports2.object({
|
|
640
639
|
command: external_exports2.string().min(1, "healthcheck command is required"),
|
|
641
640
|
cwd: external_exports2.string().optional(),
|
|
642
|
-
timeout_seconds: external_exports2.number().positive().optional()
|
|
643
|
-
|
|
644
|
-
});
|
|
641
|
+
timeout_seconds: external_exports2.number().positive().optional()
|
|
642
|
+
}).passthrough();
|
|
645
643
|
var CliHealthcheckInputSchema = external_exports2.union([
|
|
646
644
|
CliHealthcheckHttpInputSchema,
|
|
647
645
|
CliHealthcheckCommandInputSchema
|
|
@@ -653,36 +651,28 @@ var CliTargetInputSchema = external_exports2.object({
|
|
|
653
651
|
command: external_exports2.string(),
|
|
654
652
|
// Files format - optional
|
|
655
653
|
files_format: external_exports2.string().optional(),
|
|
656
|
-
filesFormat: external_exports2.string().optional(),
|
|
657
654
|
attachments_format: external_exports2.string().optional(),
|
|
658
|
-
attachmentsFormat: external_exports2.string().optional(),
|
|
659
655
|
// Working directory - optional
|
|
660
656
|
cwd: external_exports2.string().optional(),
|
|
661
657
|
// Workspace template directory - optional (mutually exclusive with cwd)
|
|
662
658
|
workspace_template: external_exports2.string().optional(),
|
|
663
|
-
workspaceTemplate: external_exports2.string().optional(),
|
|
664
659
|
// Timeout in seconds - optional
|
|
665
660
|
timeout_seconds: external_exports2.number().positive().optional(),
|
|
666
|
-
timeoutSeconds: external_exports2.number().positive().optional(),
|
|
667
661
|
// Healthcheck configuration - optional
|
|
668
662
|
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
669
663
|
// Verbose mode - optional
|
|
670
664
|
verbose: external_exports2.boolean().optional(),
|
|
671
665
|
cli_verbose: external_exports2.boolean().optional(),
|
|
672
|
-
cliVerbose: external_exports2.boolean().optional(),
|
|
673
666
|
// Keep temp files - optional
|
|
674
667
|
keep_temp_files: external_exports2.boolean().optional(),
|
|
675
|
-
keepTempFiles: external_exports2.boolean().optional(),
|
|
676
668
|
keep_output_files: external_exports2.boolean().optional(),
|
|
677
|
-
keepOutputFiles: external_exports2.boolean().optional(),
|
|
678
669
|
// Common target fields
|
|
679
670
|
grader_target: external_exports2.string().optional(),
|
|
680
671
|
judge_target: external_exports2.string().optional(),
|
|
681
672
|
// backward compat
|
|
682
673
|
workers: external_exports2.number().int().min(1).optional(),
|
|
683
|
-
provider_batching: external_exports2.boolean().optional()
|
|
684
|
-
|
|
685
|
-
});
|
|
674
|
+
provider_batching: external_exports2.boolean().optional()
|
|
675
|
+
}).passthrough();
|
|
686
676
|
var CliHealthcheckHttpSchema = external_exports2.object({
|
|
687
677
|
url: external_exports2.string().min(1),
|
|
688
678
|
timeoutMs: external_exports2.number().positive().optional()
|
|
@@ -707,7 +697,7 @@ var CliTargetConfigSchema = external_exports2.object({
|
|
|
707
697
|
keepTempFiles: external_exports2.boolean().optional()
|
|
708
698
|
}).strict();
|
|
709
699
|
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
710
|
-
const timeoutSeconds = input.timeout_seconds
|
|
700
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
711
701
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
712
702
|
if ("url" in input && input.url) {
|
|
713
703
|
const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
|
|
@@ -741,9 +731,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
741
731
|
function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
742
732
|
const targetName = input.name;
|
|
743
733
|
const command = resolveString(input.command, env, `${targetName} CLI command`, true);
|
|
744
|
-
const filesFormatSource = input.files_format ?? input.
|
|
734
|
+
const filesFormatSource = input.files_format ?? input.attachments_format;
|
|
745
735
|
const filesFormat = resolveOptionalLiteralString(filesFormatSource);
|
|
746
|
-
const workspaceTemplateSource = input.workspace_template
|
|
736
|
+
const workspaceTemplateSource = input.workspace_template;
|
|
747
737
|
let workspaceTemplate = resolveOptionalString(
|
|
748
738
|
workspaceTemplateSource,
|
|
749
739
|
env,
|
|
@@ -771,12 +761,10 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
771
761
|
if (!cwd && !workspaceTemplate && evalFilePath) {
|
|
772
762
|
cwd = path2.dirname(path2.resolve(evalFilePath));
|
|
773
763
|
}
|
|
774
|
-
const timeoutSeconds = input.timeout_seconds
|
|
764
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
775
765
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
776
|
-
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose
|
|
777
|
-
const keepTempFiles = resolveOptionalBoolean(
|
|
778
|
-
input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
|
|
779
|
-
);
|
|
766
|
+
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
|
|
767
|
+
const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
|
|
780
768
|
const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
|
|
781
769
|
return {
|
|
782
770
|
command,
|
|
@@ -797,14 +785,104 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
797
785
|
"FILES",
|
|
798
786
|
"OUTPUT_FILE"
|
|
799
787
|
]);
|
|
788
|
+
var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
789
|
+
["providerBatching", "provider_batching"],
|
|
790
|
+
["subagentModeAllowed", "subagent_mode_allowed"],
|
|
791
|
+
["fallbackTargets", "fallback_targets"],
|
|
792
|
+
["resourceName", "endpoint"],
|
|
793
|
+
["baseUrl", "base_url"],
|
|
794
|
+
["apiKey", "api_key"],
|
|
795
|
+
["deploymentName", "model"],
|
|
796
|
+
["thinkingBudget", "thinking_budget"],
|
|
797
|
+
["maxTokens", "max_output_tokens"],
|
|
798
|
+
["apiFormat", "api_format"],
|
|
799
|
+
["timeoutSeconds", "timeout_seconds"],
|
|
800
|
+
["logDir", "log_dir"],
|
|
801
|
+
["logDirectory", "log_directory"],
|
|
802
|
+
["logFormat", "log_format"],
|
|
803
|
+
["logOutputFormat", "log_output_format"],
|
|
804
|
+
["systemPrompt", "system_prompt"],
|
|
805
|
+
["maxTurns", "max_turns"],
|
|
806
|
+
["maxBudgetUsd", "max_budget_usd"],
|
|
807
|
+
["dryRun", "dry_run"],
|
|
808
|
+
["subagentRoot", "subagent_root"],
|
|
809
|
+
["filesFormat", "files_format"],
|
|
810
|
+
["attachmentsFormat", "attachments_format"],
|
|
811
|
+
["cliUrl", "cli_url"],
|
|
812
|
+
["cliPath", "cli_path"],
|
|
813
|
+
["githubToken", "github_token"],
|
|
814
|
+
["sessionDir", "session_dir"],
|
|
815
|
+
["sessionId", "session_id"],
|
|
816
|
+
["sessionStateDir", "session_state_dir"],
|
|
817
|
+
["maxRetries", "max_retries"],
|
|
818
|
+
["retryInitialDelayMs", "retry_initial_delay_ms"],
|
|
819
|
+
["retryMaxDelayMs", "retry_max_delay_ms"],
|
|
820
|
+
["retryBackoffFactor", "retry_backoff_factor"],
|
|
821
|
+
["retryStatusCodes", "retry_status_codes"]
|
|
822
|
+
]);
|
|
823
|
+
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
824
|
+
["timeoutSeconds", "timeout_seconds"]
|
|
825
|
+
]);
|
|
826
|
+
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
|
|
827
|
+
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
828
|
+
return [];
|
|
829
|
+
}
|
|
830
|
+
const warnings = [];
|
|
831
|
+
for (const [camelCaseField, snakeCaseField] of aliases) {
|
|
832
|
+
if (Object.prototype.hasOwnProperty.call(value, camelCaseField)) {
|
|
833
|
+
warnings.push({
|
|
834
|
+
location: `${location}.${camelCaseField}`,
|
|
835
|
+
message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
|
|
836
|
+
});
|
|
837
|
+
}
|
|
838
|
+
}
|
|
839
|
+
return warnings;
|
|
840
|
+
}
|
|
841
|
+
function assertNoDeprecatedCamelCaseTargetFields(definition) {
|
|
842
|
+
if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
|
|
843
|
+
throw new Error(
|
|
844
|
+
`${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
845
|
+
);
|
|
846
|
+
}
|
|
847
|
+
const warning = findDeprecatedCamelCaseTargetWarnings(
|
|
848
|
+
definition,
|
|
849
|
+
`target "${definition.name}"`
|
|
850
|
+
)[0];
|
|
851
|
+
if (!warning) {
|
|
852
|
+
return;
|
|
853
|
+
}
|
|
854
|
+
const fieldMatch = warning.message.match(/field '([^']+)'/);
|
|
855
|
+
const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
|
|
856
|
+
const field = fieldMatch?.[1] ?? "unknown";
|
|
857
|
+
const replacement = replacementMatch?.[1] ?? "snake_case";
|
|
858
|
+
throw new Error(
|
|
859
|
+
`${warning.location}: camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
|
|
860
|
+
);
|
|
861
|
+
}
|
|
862
|
+
function findDeprecatedCamelCaseTargetWarnings(target, location) {
|
|
863
|
+
const warnings = collectDeprecatedCamelCaseWarnings(
|
|
864
|
+
target,
|
|
865
|
+
location,
|
|
866
|
+
DEPRECATED_TARGET_CAMEL_CASE_FIELDS
|
|
867
|
+
);
|
|
868
|
+
if (typeof target !== "object" || target === null || Array.isArray(target)) {
|
|
869
|
+
return warnings;
|
|
870
|
+
}
|
|
871
|
+
const healthcheck = target.healthcheck;
|
|
872
|
+
warnings.push(
|
|
873
|
+
...collectDeprecatedCamelCaseWarnings(
|
|
874
|
+
healthcheck,
|
|
875
|
+
`${location}.healthcheck`,
|
|
876
|
+
DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
|
|
877
|
+
)
|
|
878
|
+
);
|
|
879
|
+
return warnings;
|
|
880
|
+
}
|
|
800
881
|
var COMMON_TARGET_SETTINGS = [
|
|
801
882
|
"use_target",
|
|
802
883
|
"provider_batching",
|
|
803
|
-
"providerBatching",
|
|
804
884
|
"subagent_mode_allowed",
|
|
805
|
-
"
|
|
806
|
-
"fallback_targets",
|
|
807
|
-
"fallbackTargets"
|
|
885
|
+
"fallback_targets"
|
|
808
886
|
];
|
|
809
887
|
var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
|
|
810
888
|
var BASE_TARGET_SCHEMA = external_exports2.object({
|
|
@@ -816,43 +894,40 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
|
|
|
816
894
|
// backward compat
|
|
817
895
|
workers: external_exports2.number().int().min(1).optional(),
|
|
818
896
|
workspace_template: external_exports2.string().optional(),
|
|
819
|
-
workspaceTemplate: external_exports2.string().optional(),
|
|
820
897
|
subagent_mode_allowed: external_exports2.boolean().optional(),
|
|
821
|
-
fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional()
|
|
822
|
-
fallbackTargets: external_exports2.array(external_exports2.string().min(1)).optional()
|
|
898
|
+
fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional()
|
|
823
899
|
}).passthrough();
|
|
824
900
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
901
|
+
var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
|
|
825
902
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
826
|
-
function normalizeAzureApiVersion(value) {
|
|
903
|
+
function normalizeAzureApiVersion(value, apiFormat) {
|
|
904
|
+
const defaultVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
|
|
827
905
|
if (!value) {
|
|
828
|
-
return
|
|
906
|
+
return defaultVersion;
|
|
829
907
|
}
|
|
830
908
|
const trimmed = value.trim();
|
|
831
909
|
if (trimmed.length === 0) {
|
|
832
|
-
return
|
|
910
|
+
return defaultVersion;
|
|
833
911
|
}
|
|
834
912
|
const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
|
|
835
|
-
return withoutPrefix.length > 0 ? withoutPrefix :
|
|
913
|
+
return withoutPrefix.length > 0 ? withoutPrefix : defaultVersion;
|
|
836
914
|
}
|
|
837
915
|
function resolveRetryConfig(target) {
|
|
838
|
-
const maxRetries = resolveOptionalNumber(
|
|
839
|
-
target.max_retries ?? target.maxRetries,
|
|
840
|
-
`${target.name} max retries`
|
|
841
|
-
);
|
|
916
|
+
const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
|
|
842
917
|
const initialDelayMs = resolveOptionalNumber(
|
|
843
|
-
target.retry_initial_delay_ms
|
|
918
|
+
target.retry_initial_delay_ms,
|
|
844
919
|
`${target.name} retry initial delay`
|
|
845
920
|
);
|
|
846
921
|
const maxDelayMs = resolveOptionalNumber(
|
|
847
|
-
target.retry_max_delay_ms
|
|
922
|
+
target.retry_max_delay_ms,
|
|
848
923
|
`${target.name} retry max delay`
|
|
849
924
|
);
|
|
850
925
|
const backoffFactor = resolveOptionalNumber(
|
|
851
|
-
target.retry_backoff_factor
|
|
926
|
+
target.retry_backoff_factor,
|
|
852
927
|
`${target.name} retry backoff factor`
|
|
853
928
|
);
|
|
854
929
|
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
855
|
-
target.retry_status_codes
|
|
930
|
+
target.retry_status_codes,
|
|
856
931
|
`${target.name} retry status codes`
|
|
857
932
|
);
|
|
858
933
|
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
@@ -912,9 +987,10 @@ function resolveDelegatedTargetDefinition(name21, definitions, env = process.env
|
|
|
912
987
|
`Target "${name21}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
|
|
913
988
|
);
|
|
914
989
|
}
|
|
915
|
-
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
990
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
|
|
991
|
+
assertNoDeprecatedCamelCaseTargetFields(definition);
|
|
916
992
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
917
|
-
if (parsed.workspace_template !== void 0
|
|
993
|
+
if (parsed.workspace_template !== void 0) {
|
|
918
994
|
throw new Error(
|
|
919
995
|
`${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
920
996
|
);
|
|
@@ -930,13 +1006,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
930
1006
|
`${parsed.name} provider`,
|
|
931
1007
|
true
|
|
932
1008
|
).toLowerCase();
|
|
933
|
-
const providerBatching = resolveOptionalBoolean(
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
const subagentModeAllowed = resolveOptionalBoolean(
|
|
937
|
-
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
938
|
-
);
|
|
939
|
-
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
|
|
1009
|
+
const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
|
|
1010
|
+
const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
|
|
1011
|
+
const fallbackTargets = parsed.fallback_targets;
|
|
940
1012
|
const base = {
|
|
941
1013
|
name: parsed.name,
|
|
942
1014
|
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
@@ -1086,20 +1158,22 @@ function normalizeOpenAIBaseUrl(value) {
|
|
|
1086
1158
|
return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
|
|
1087
1159
|
}
|
|
1088
1160
|
function resolveAzureConfig(target, env) {
|
|
1089
|
-
const endpointSource = target.endpoint ?? target.resource
|
|
1090
|
-
const apiKeySource = target.api_key
|
|
1091
|
-
const deploymentSource = target.deployment ?? target.
|
|
1161
|
+
const endpointSource = target.endpoint ?? target.resource;
|
|
1162
|
+
const apiKeySource = target.api_key;
|
|
1163
|
+
const deploymentSource = target.deployment ?? target.model;
|
|
1092
1164
|
const versionSource = target.version ?? target.api_version;
|
|
1093
1165
|
const temperatureSource = target.temperature;
|
|
1094
|
-
const maxTokensSource = target.max_output_tokens
|
|
1166
|
+
const maxTokensSource = target.max_output_tokens;
|
|
1095
1167
|
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
1096
1168
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
1097
1169
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
1170
|
+
const apiFormat = resolveApiFormat(target, env, target.name);
|
|
1098
1171
|
const version = normalizeAzureApiVersion(
|
|
1099
1172
|
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
1100
1173
|
allowLiteral: true,
|
|
1101
1174
|
optionalEnv: true
|
|
1102
|
-
})
|
|
1175
|
+
}),
|
|
1176
|
+
apiFormat
|
|
1103
1177
|
);
|
|
1104
1178
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
1105
1179
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -1112,13 +1186,17 @@ function resolveAzureConfig(target, env) {
|
|
|
1112
1186
|
deploymentName,
|
|
1113
1187
|
apiKey,
|
|
1114
1188
|
version,
|
|
1189
|
+
apiFormat,
|
|
1115
1190
|
temperature,
|
|
1116
1191
|
maxOutputTokens,
|
|
1117
1192
|
retry
|
|
1118
1193
|
};
|
|
1119
1194
|
}
|
|
1120
|
-
function resolveApiFormat(target, targetName) {
|
|
1121
|
-
const raw = target.api_format
|
|
1195
|
+
function resolveApiFormat(target, env, targetName) {
|
|
1196
|
+
const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
|
|
1197
|
+
allowLiteral: true,
|
|
1198
|
+
optionalEnv: true
|
|
1199
|
+
});
|
|
1122
1200
|
if (raw === void 0) return void 0;
|
|
1123
1201
|
if (raw === "chat" || raw === "responses") return raw;
|
|
1124
1202
|
throw new Error(
|
|
@@ -1126,11 +1204,11 @@ function resolveApiFormat(target, targetName) {
|
|
|
1126
1204
|
);
|
|
1127
1205
|
}
|
|
1128
1206
|
function resolveOpenAIConfig(target, env) {
|
|
1129
|
-
const endpointSource = target.endpoint ?? target.base_url
|
|
1130
|
-
const apiKeySource = target.api_key
|
|
1207
|
+
const endpointSource = target.endpoint ?? target.base_url;
|
|
1208
|
+
const apiKeySource = target.api_key;
|
|
1131
1209
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
1132
1210
|
const temperatureSource = target.temperature;
|
|
1133
|
-
const maxTokensSource = target.max_output_tokens
|
|
1211
|
+
const maxTokensSource = target.max_output_tokens;
|
|
1134
1212
|
const baseURL = normalizeOpenAIBaseUrl(
|
|
1135
1213
|
resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
|
|
1136
1214
|
allowLiteral: true,
|
|
@@ -1144,17 +1222,17 @@ function resolveOpenAIConfig(target, env) {
|
|
|
1144
1222
|
baseURL,
|
|
1145
1223
|
apiKey,
|
|
1146
1224
|
model,
|
|
1147
|
-
apiFormat: resolveApiFormat(target, target.name),
|
|
1225
|
+
apiFormat: resolveApiFormat(target, env, target.name),
|
|
1148
1226
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
1149
1227
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
1150
1228
|
retry
|
|
1151
1229
|
};
|
|
1152
1230
|
}
|
|
1153
1231
|
function resolveOpenRouterConfig(target, env) {
|
|
1154
|
-
const apiKeySource = target.api_key
|
|
1232
|
+
const apiKeySource = target.api_key;
|
|
1155
1233
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
1156
1234
|
const temperatureSource = target.temperature;
|
|
1157
|
-
const maxTokensSource = target.max_output_tokens
|
|
1235
|
+
const maxTokensSource = target.max_output_tokens;
|
|
1158
1236
|
const retry = resolveRetryConfig(target);
|
|
1159
1237
|
return {
|
|
1160
1238
|
apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
|
|
@@ -1165,11 +1243,11 @@ function resolveOpenRouterConfig(target, env) {
|
|
|
1165
1243
|
};
|
|
1166
1244
|
}
|
|
1167
1245
|
function resolveAnthropicConfig(target, env) {
|
|
1168
|
-
const apiKeySource = target.api_key
|
|
1246
|
+
const apiKeySource = target.api_key;
|
|
1169
1247
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
1170
1248
|
const temperatureSource = target.temperature;
|
|
1171
|
-
const maxTokensSource = target.max_output_tokens
|
|
1172
|
-
const thinkingBudgetSource = target.thinking_budget
|
|
1249
|
+
const maxTokensSource = target.max_output_tokens;
|
|
1250
|
+
const thinkingBudgetSource = target.thinking_budget;
|
|
1173
1251
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
1174
1252
|
const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
|
|
1175
1253
|
const retry = resolveRetryConfig(target);
|
|
@@ -1183,10 +1261,10 @@ function resolveAnthropicConfig(target, env) {
|
|
|
1183
1261
|
};
|
|
1184
1262
|
}
|
|
1185
1263
|
function resolveGeminiConfig(target, env) {
|
|
1186
|
-
const apiKeySource = target.api_key
|
|
1264
|
+
const apiKeySource = target.api_key;
|
|
1187
1265
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
1188
1266
|
const temperatureSource = target.temperature;
|
|
1189
|
-
const maxTokensSource = target.max_output_tokens
|
|
1267
|
+
const maxTokensSource = target.max_output_tokens;
|
|
1190
1268
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
|
|
1191
1269
|
const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
|
|
1192
1270
|
allowLiteral: true,
|
|
@@ -1206,11 +1284,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
1206
1284
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
1207
1285
|
const argsSource = target.args ?? target.arguments;
|
|
1208
1286
|
const cwdSource = target.cwd;
|
|
1209
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1210
|
-
const timeoutSource = target.timeout_seconds
|
|
1211
|
-
const logDirSource = target.log_dir ?? target.
|
|
1212
|
-
const logFormatSource = target.log_format ?? target.
|
|
1213
|
-
const systemPromptSource = target.system_prompt
|
|
1287
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1288
|
+
const timeoutSource = target.timeout_seconds;
|
|
1289
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1290
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
1291
|
+
const systemPromptSource = target.system_prompt;
|
|
1214
1292
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
1215
1293
|
allowLiteral: true,
|
|
1216
1294
|
optionalEnv: true
|
|
@@ -1274,16 +1352,16 @@ function normalizeCodexLogFormat(value) {
|
|
|
1274
1352
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
1275
1353
|
}
|
|
1276
1354
|
function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
1277
|
-
const cliUrlSource = target.cli_url
|
|
1278
|
-
const cliPathSource = target.cli_path
|
|
1279
|
-
const githubTokenSource = target.github_token
|
|
1355
|
+
const cliUrlSource = target.cli_url;
|
|
1356
|
+
const cliPathSource = target.cli_path;
|
|
1357
|
+
const githubTokenSource = target.github_token;
|
|
1280
1358
|
const modelSource = target.model;
|
|
1281
1359
|
const cwdSource = target.cwd;
|
|
1282
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1283
|
-
const timeoutSource = target.timeout_seconds
|
|
1284
|
-
const logDirSource = target.log_dir ?? target.
|
|
1285
|
-
const logFormatSource = target.log_format
|
|
1286
|
-
const systemPromptSource = target.system_prompt
|
|
1360
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1361
|
+
const timeoutSource = target.timeout_seconds;
|
|
1362
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1363
|
+
const logFormatSource = target.log_format;
|
|
1364
|
+
const systemPromptSource = target.system_prompt;
|
|
1287
1365
|
const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
|
|
1288
1366
|
allowLiteral: true,
|
|
1289
1367
|
optionalEnv: true
|
|
@@ -1356,11 +1434,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
1356
1434
|
const modelSource = target.model;
|
|
1357
1435
|
const argsSource = target.args ?? target.arguments;
|
|
1358
1436
|
const cwdSource = target.cwd;
|
|
1359
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1360
|
-
const timeoutSource = target.timeout_seconds
|
|
1361
|
-
const logDirSource = target.log_dir ?? target.
|
|
1362
|
-
const logFormatSource = target.log_format
|
|
1363
|
-
const systemPromptSource = target.system_prompt
|
|
1437
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1438
|
+
const timeoutSource = target.timeout_seconds;
|
|
1439
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1440
|
+
const logFormatSource = target.log_format;
|
|
1441
|
+
const systemPromptSource = target.system_prompt;
|
|
1364
1442
|
const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
|
|
1365
1443
|
allowLiteral: true,
|
|
1366
1444
|
optionalEnv: true
|
|
@@ -1424,16 +1502,16 @@ function normalizeCopilotLogFormat(value) {
|
|
|
1424
1502
|
}
|
|
1425
1503
|
function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
1426
1504
|
const subproviderSource = target.subprovider;
|
|
1427
|
-
const modelSource = target.model ?? target.pi_model
|
|
1428
|
-
const apiKeySource = target.api_key
|
|
1429
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
1430
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
1505
|
+
const modelSource = target.model ?? target.pi_model;
|
|
1506
|
+
const apiKeySource = target.api_key;
|
|
1507
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
1508
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
1431
1509
|
const cwdSource = target.cwd;
|
|
1432
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1433
|
-
const timeoutSource = target.timeout_seconds
|
|
1434
|
-
const logDirSource = target.log_dir ?? target.
|
|
1435
|
-
const logFormatSource = target.log_format
|
|
1436
|
-
const systemPromptSource = target.system_prompt
|
|
1510
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1511
|
+
const timeoutSource = target.timeout_seconds;
|
|
1512
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1513
|
+
const logFormatSource = target.log_format;
|
|
1514
|
+
const systemPromptSource = target.system_prompt;
|
|
1437
1515
|
const subprovider = resolveOptionalString(
|
|
1438
1516
|
subproviderSource,
|
|
1439
1517
|
env,
|
|
@@ -1451,7 +1529,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1451
1529
|
allowLiteral: false,
|
|
1452
1530
|
optionalEnv: true
|
|
1453
1531
|
});
|
|
1454
|
-
const baseUrlSource = target.base_url ?? target.
|
|
1532
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
1455
1533
|
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
|
|
1456
1534
|
allowLiteral: true,
|
|
1457
1535
|
optionalEnv: true
|
|
@@ -1510,16 +1588,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1510
1588
|
function resolvePiCliConfig(target, env, evalFilePath) {
|
|
1511
1589
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
1512
1590
|
const subproviderSource = target.subprovider;
|
|
1513
|
-
const modelSource = target.model ?? target.pi_model
|
|
1514
|
-
const apiKeySource = target.api_key
|
|
1515
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
1516
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
1591
|
+
const modelSource = target.model ?? target.pi_model;
|
|
1592
|
+
const apiKeySource = target.api_key;
|
|
1593
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
1594
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
1517
1595
|
const cwdSource = target.cwd;
|
|
1518
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1519
|
-
const timeoutSource = target.timeout_seconds
|
|
1520
|
-
const logDirSource = target.log_dir ?? target.
|
|
1521
|
-
const logFormatSource = target.log_format
|
|
1522
|
-
const systemPromptSource = target.system_prompt
|
|
1596
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1597
|
+
const timeoutSource = target.timeout_seconds;
|
|
1598
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1599
|
+
const logFormatSource = target.log_format;
|
|
1600
|
+
const systemPromptSource = target.system_prompt;
|
|
1523
1601
|
const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
|
|
1524
1602
|
allowLiteral: true,
|
|
1525
1603
|
optionalEnv: true
|
|
@@ -1538,7 +1616,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
1538
1616
|
allowLiteral: false,
|
|
1539
1617
|
optionalEnv: true
|
|
1540
1618
|
});
|
|
1541
|
-
const baseUrlSource = target.base_url ?? target.
|
|
1619
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
1542
1620
|
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
|
|
1543
1621
|
allowLiteral: true,
|
|
1544
1622
|
optionalEnv: true
|
|
@@ -1596,11 +1674,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
1596
1674
|
function resolveClaudeConfig(target, env, evalFilePath) {
|
|
1597
1675
|
const modelSource = target.model;
|
|
1598
1676
|
const cwdSource = target.cwd;
|
|
1599
|
-
const workspaceTemplateSource = target.workspace_template
|
|
1600
|
-
const timeoutSource = target.timeout_seconds
|
|
1601
|
-
const logDirSource = target.log_dir ?? target.
|
|
1602
|
-
const logFormatSource = target.log_format ?? target.
|
|
1603
|
-
const systemPromptSource = target.system_prompt
|
|
1677
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
1678
|
+
const timeoutSource = target.timeout_seconds;
|
|
1679
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1680
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
|
|
1681
|
+
const systemPromptSource = target.system_prompt;
|
|
1604
1682
|
const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
|
|
1605
1683
|
allowLiteral: true,
|
|
1606
1684
|
optionalEnv: true
|
|
@@ -1633,8 +1711,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
1633
1711
|
});
|
|
1634
1712
|
const logFormat = normalizeClaudeLogFormat(logFormatSource);
|
|
1635
1713
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
1636
|
-
const maxTurns = typeof target.max_turns === "number" ? target.max_turns :
|
|
1637
|
-
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd :
|
|
1714
|
+
const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
|
|
1715
|
+
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
|
|
1638
1716
|
return {
|
|
1639
1717
|
model,
|
|
1640
1718
|
systemPrompt,
|
|
@@ -1665,9 +1743,7 @@ function resolveMockConfig(target) {
|
|
|
1665
1743
|
return { response };
|
|
1666
1744
|
}
|
|
1667
1745
|
function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
1668
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
1669
|
-
target.workspace_template ?? target.workspaceTemplate
|
|
1670
|
-
);
|
|
1746
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
|
|
1671
1747
|
let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
|
|
1672
1748
|
workspaceTemplateEnvVar,
|
|
1673
1749
|
env,
|
|
@@ -1682,9 +1758,9 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
|
1682
1758
|
}
|
|
1683
1759
|
const executableSource = target.executable;
|
|
1684
1760
|
const waitSource = target.wait;
|
|
1685
|
-
const dryRunSource = target.dry_run
|
|
1686
|
-
const subagentRootSource = target.subagent_root
|
|
1687
|
-
const timeoutSource = target.timeout_seconds
|
|
1761
|
+
const dryRunSource = target.dry_run;
|
|
1762
|
+
const subagentRootSource = target.subagent_root;
|
|
1763
|
+
const timeoutSource = target.timeout_seconds;
|
|
1688
1764
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
1689
1765
|
const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
|
|
1690
1766
|
allowLiteral: true,
|
|
@@ -1719,8 +1795,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
1719
1795
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
1720
1796
|
if (!parseResult.success) {
|
|
1721
1797
|
const firstError = parseResult.error.errors[0];
|
|
1722
|
-
const
|
|
1723
|
-
const prefix =
|
|
1798
|
+
const path410 = firstError?.path.join(".") || "";
|
|
1799
|
+
const prefix = path410 ? `${target.name} ${path410}: ` : `${target.name}: `;
|
|
1724
1800
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
1725
1801
|
}
|
|
1726
1802
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -1735,7 +1811,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
1735
1811
|
}
|
|
1736
1812
|
function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
|
|
1737
1813
|
const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
|
|
1738
|
-
const timeoutSeconds = target.timeout_seconds
|
|
1814
|
+
const timeoutSeconds = target.timeout_seconds;
|
|
1739
1815
|
const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
|
|
1740
1816
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
1741
1817
|
allowLiteral: true,
|
|
@@ -1799,10 +1875,10 @@ function resolveDiscover(value, targetName) {
|
|
|
1799
1875
|
throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
|
|
1800
1876
|
}
|
|
1801
1877
|
function resolveCopilotLogConfig(target, env) {
|
|
1802
|
-
const sessionDirSource = target.session_dir
|
|
1803
|
-
const sessionIdSource = target.session_id
|
|
1878
|
+
const sessionDirSource = target.session_dir;
|
|
1879
|
+
const sessionIdSource = target.session_id;
|
|
1804
1880
|
const discoverSource = target.discover;
|
|
1805
|
-
const sessionStateDirSource = target.session_state_dir
|
|
1881
|
+
const sessionStateDirSource = target.session_state_dir;
|
|
1806
1882
|
const cwdSource = target.cwd;
|
|
1807
1883
|
return {
|
|
1808
1884
|
sessionDir: resolveOptionalString(
|
|
@@ -1975,6 +2051,15 @@ var AGENT_PROVIDER_KINDS = [
|
|
|
1975
2051
|
"vscode",
|
|
1976
2052
|
"vscode-insiders"
|
|
1977
2053
|
];
|
|
2054
|
+
var LLM_GRADER_CAPABLE_KINDS = [
|
|
2055
|
+
"openai",
|
|
2056
|
+
"openrouter",
|
|
2057
|
+
"azure",
|
|
2058
|
+
"anthropic",
|
|
2059
|
+
"gemini",
|
|
2060
|
+
"agentv",
|
|
2061
|
+
"mock"
|
|
2062
|
+
];
|
|
1978
2063
|
var KNOWN_PROVIDERS = [
|
|
1979
2064
|
"openai",
|
|
1980
2065
|
"openrouter",
|
|
@@ -1994,7 +2079,8 @@ var KNOWN_PROVIDERS = [
|
|
|
1994
2079
|
"mock",
|
|
1995
2080
|
"vscode",
|
|
1996
2081
|
"vscode-insiders",
|
|
1997
|
-
"agentv"
|
|
2082
|
+
"agentv",
|
|
2083
|
+
"transcript"
|
|
1998
2084
|
];
|
|
1999
2085
|
var PROVIDER_ALIASES = [
|
|
2000
2086
|
"azure-openai",
|
|
@@ -6803,7 +6889,7 @@ function createOpenRouter(options = {}) {
|
|
|
6803
6889
|
);
|
|
6804
6890
|
const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
|
|
6805
6891
|
provider: "openrouter.chat",
|
|
6806
|
-
url: ({ path:
|
|
6892
|
+
url: ({ path: path50 }) => `${baseURL}${path50}`,
|
|
6807
6893
|
headers: getHeaders,
|
|
6808
6894
|
compatibility,
|
|
6809
6895
|
fetch: options.fetch,
|
|
@@ -6811,7 +6897,7 @@ function createOpenRouter(options = {}) {
|
|
|
6811
6897
|
});
|
|
6812
6898
|
const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
|
|
6813
6899
|
provider: "openrouter.completion",
|
|
6814
|
-
url: ({ path:
|
|
6900
|
+
url: ({ path: path50 }) => `${baseURL}${path50}`,
|
|
6815
6901
|
headers: getHeaders,
|
|
6816
6902
|
compatibility,
|
|
6817
6903
|
fetch: options.fetch,
|
|
@@ -6819,14 +6905,14 @@ function createOpenRouter(options = {}) {
|
|
|
6819
6905
|
});
|
|
6820
6906
|
const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
|
|
6821
6907
|
provider: "openrouter.embedding",
|
|
6822
|
-
url: ({ path:
|
|
6908
|
+
url: ({ path: path50 }) => `${baseURL}${path50}`,
|
|
6823
6909
|
headers: getHeaders,
|
|
6824
6910
|
fetch: options.fetch,
|
|
6825
6911
|
extraBody: options.extraBody
|
|
6826
6912
|
});
|
|
6827
6913
|
const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
|
|
6828
6914
|
provider: "openrouter.image",
|
|
6829
|
-
url: ({ path:
|
|
6915
|
+
url: ({ path: path50 }) => `${baseURL}${path50}`,
|
|
6830
6916
|
headers: getHeaders,
|
|
6831
6917
|
fetch: options.fetch,
|
|
6832
6918
|
extraBody: options.extraBody
|
|
@@ -14345,11 +14431,13 @@ import { tmpdir } from "node:os";
|
|
|
14345
14431
|
import path19 from "node:path";
|
|
14346
14432
|
import { execSync as execSync2 } from "node:child_process";
|
|
14347
14433
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
14348
|
-
import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
|
|
14434
|
+
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
14349
14435
|
import { mkdir as mkdir7 } from "node:fs/promises";
|
|
14350
|
-
import
|
|
14436
|
+
import path21 from "node:path";
|
|
14351
14437
|
import { createInterface } from "node:readline";
|
|
14352
|
-
import { fileURLToPath as fileURLToPath3 } from "node:url";
|
|
14438
|
+
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
14439
|
+
import os2 from "node:os";
|
|
14440
|
+
import path20 from "node:path";
|
|
14353
14441
|
import { exec as exec2 } from "node:child_process";
|
|
14354
14442
|
import { constants as constants3, access as access3, stat as stat5 } from "node:fs/promises";
|
|
14355
14443
|
import path322 from "node:path";
|
|
@@ -14358,18 +14446,16 @@ import { stat as stat4, writeFile as writeFile4 } from "node:fs/promises";
|
|
|
14358
14446
|
import path30 from "node:path";
|
|
14359
14447
|
import { constants as constants22 } from "node:fs";
|
|
14360
14448
|
import { access as access22, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
|
|
14361
|
-
import path21 from "node:path";
|
|
14362
14449
|
import path222 from "node:path";
|
|
14363
14450
|
import path23 from "node:path";
|
|
14364
|
-
import { readFile as readFile9 } from "node:fs/promises";
|
|
14365
14451
|
import path24 from "node:path";
|
|
14452
|
+
import { readFile as readFile9 } from "node:fs/promises";
|
|
14453
|
+
import path25 from "node:path";
|
|
14366
14454
|
import { exec, spawn as spawn4 } from "node:child_process";
|
|
14367
14455
|
import { mkdir as mkdir9, writeFile as writeFile2 } from "node:fs/promises";
|
|
14368
14456
|
import path27 from "node:path";
|
|
14369
14457
|
import { promisify as promisify2 } from "node:util";
|
|
14370
14458
|
import path26 from "node:path";
|
|
14371
|
-
import os2 from "node:os";
|
|
14372
|
-
import path25 from "node:path";
|
|
14373
14459
|
import { copyFile, mkdir as mkdir10, readFile as readFile10, readdir as readdir3, stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
|
|
14374
14460
|
import path29 from "node:path";
|
|
14375
14461
|
import path28 from "node:path";
|
|
@@ -14420,12 +14506,15 @@ import { existsSync as existsSync5 } from "node:fs";
|
|
|
14420
14506
|
import path45 from "node:path";
|
|
14421
14507
|
import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
|
|
14422
14508
|
import path46 from "node:path";
|
|
14423
|
-
import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
14509
|
+
import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
14424
14510
|
import path47 from "node:path";
|
|
14425
14511
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
14426
14512
|
import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
14427
14513
|
import { homedir as homedir3 } from "node:os";
|
|
14428
14514
|
import path48 from "node:path";
|
|
14515
|
+
import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
|
|
14516
|
+
import { homedir as homedir4 } from "node:os";
|
|
14517
|
+
import path49 from "node:path";
|
|
14429
14518
|
import { readFile as readFile14 } from "node:fs/promises";
|
|
14430
14519
|
function computeTraceSummary(messages) {
|
|
14431
14520
|
const toolCallCounts = {};
|
|
@@ -15213,8 +15302,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15213
15302
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
15214
15303
|
if (isCustomType) {
|
|
15215
15304
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15216
|
-
const required2 =
|
|
15217
|
-
|
|
15305
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15306
|
+
rawEvaluator.required,
|
|
15307
|
+
rawEvaluator.min_score,
|
|
15308
|
+
name21,
|
|
15309
|
+
evalId
|
|
15310
|
+
);
|
|
15311
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
15218
15312
|
const config2 = {};
|
|
15219
15313
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
15220
15314
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -15226,6 +15320,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15226
15320
|
type: customTypeName,
|
|
15227
15321
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15228
15322
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15323
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15229
15324
|
...negate !== void 0 ? { negate } : {},
|
|
15230
15325
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
15231
15326
|
});
|
|
@@ -15295,7 +15390,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15295
15390
|
);
|
|
15296
15391
|
}
|
|
15297
15392
|
}
|
|
15298
|
-
const required2 =
|
|
15393
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15394
|
+
rawEvaluator.required,
|
|
15395
|
+
rawEvaluator.min_score,
|
|
15396
|
+
name21,
|
|
15397
|
+
evalId
|
|
15398
|
+
);
|
|
15299
15399
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
15300
15400
|
"name",
|
|
15301
15401
|
"type",
|
|
@@ -15321,6 +15421,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15321
15421
|
resolvedCwd,
|
|
15322
15422
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15323
15423
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15424
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15324
15425
|
...negate !== void 0 ? { negate } : {},
|
|
15325
15426
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
15326
15427
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -15449,7 +15550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15449
15550
|
};
|
|
15450
15551
|
}
|
|
15451
15552
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15452
|
-
const required2 =
|
|
15553
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15554
|
+
rawEvaluator.required,
|
|
15555
|
+
rawEvaluator.min_score,
|
|
15556
|
+
name21,
|
|
15557
|
+
evalId
|
|
15558
|
+
);
|
|
15453
15559
|
evaluators.push({
|
|
15454
15560
|
name: name21,
|
|
15455
15561
|
type: "composite",
|
|
@@ -15457,6 +15563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15457
15563
|
aggregator,
|
|
15458
15564
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15459
15565
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15566
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15460
15567
|
...negate !== void 0 ? { negate } : {}
|
|
15461
15568
|
});
|
|
15462
15569
|
continue;
|
|
@@ -15567,7 +15674,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15567
15674
|
continue;
|
|
15568
15675
|
}
|
|
15569
15676
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15570
|
-
const required2 =
|
|
15677
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15678
|
+
rawEvaluator.required,
|
|
15679
|
+
rawEvaluator.min_score,
|
|
15680
|
+
name21,
|
|
15681
|
+
evalId
|
|
15682
|
+
);
|
|
15571
15683
|
const config2 = {
|
|
15572
15684
|
name: name21,
|
|
15573
15685
|
type: "tool-trajectory",
|
|
@@ -15576,6 +15688,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15576
15688
|
...expected ? { expected } : {},
|
|
15577
15689
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15578
15690
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15691
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15579
15692
|
...negate !== void 0 ? { negate } : {},
|
|
15580
15693
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
15581
15694
|
};
|
|
@@ -15638,7 +15751,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15638
15751
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
15639
15752
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
15640
15753
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15641
|
-
const required2 =
|
|
15754
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15755
|
+
rawEvaluator.required,
|
|
15756
|
+
rawEvaluator.min_score,
|
|
15757
|
+
name21,
|
|
15758
|
+
evalId
|
|
15759
|
+
);
|
|
15642
15760
|
evaluators.push({
|
|
15643
15761
|
name: name21,
|
|
15644
15762
|
type: "field-accuracy",
|
|
@@ -15646,6 +15764,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15646
15764
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
15647
15765
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15648
15766
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15767
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15649
15768
|
...negate !== void 0 ? { negate } : {}
|
|
15650
15769
|
});
|
|
15651
15770
|
continue;
|
|
@@ -15659,13 +15778,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15659
15778
|
continue;
|
|
15660
15779
|
}
|
|
15661
15780
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15662
|
-
const required2 =
|
|
15781
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15782
|
+
rawEvaluator.required,
|
|
15783
|
+
rawEvaluator.min_score,
|
|
15784
|
+
name21,
|
|
15785
|
+
evalId
|
|
15786
|
+
);
|
|
15663
15787
|
evaluators.push({
|
|
15664
15788
|
name: name21,
|
|
15665
15789
|
type: "latency",
|
|
15666
15790
|
threshold,
|
|
15667
15791
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15668
15792
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15793
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15669
15794
|
...negate !== void 0 ? { negate } : {}
|
|
15670
15795
|
});
|
|
15671
15796
|
continue;
|
|
@@ -15679,13 +15804,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15679
15804
|
continue;
|
|
15680
15805
|
}
|
|
15681
15806
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15682
|
-
const required2 =
|
|
15807
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15808
|
+
rawEvaluator.required,
|
|
15809
|
+
rawEvaluator.min_score,
|
|
15810
|
+
name21,
|
|
15811
|
+
evalId
|
|
15812
|
+
);
|
|
15683
15813
|
evaluators.push({
|
|
15684
15814
|
name: name21,
|
|
15685
15815
|
type: "cost",
|
|
15686
15816
|
budget,
|
|
15687
15817
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15688
15818
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15819
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15689
15820
|
...negate !== void 0 ? { negate } : {}
|
|
15690
15821
|
});
|
|
15691
15822
|
continue;
|
|
@@ -15717,13 +15848,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15717
15848
|
continue;
|
|
15718
15849
|
}
|
|
15719
15850
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15720
|
-
const required2 =
|
|
15851
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15852
|
+
rawEvaluator.required,
|
|
15853
|
+
rawEvaluator.min_score,
|
|
15854
|
+
name21,
|
|
15855
|
+
evalId
|
|
15856
|
+
);
|
|
15721
15857
|
evaluators.push({
|
|
15722
15858
|
name: name21,
|
|
15723
15859
|
type: "token-usage",
|
|
15724
15860
|
...validLimits,
|
|
15725
15861
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15726
15862
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15863
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15727
15864
|
...negate !== void 0 ? { negate } : {}
|
|
15728
15865
|
});
|
|
15729
15866
|
continue;
|
|
@@ -15769,13 +15906,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15769
15906
|
continue;
|
|
15770
15907
|
}
|
|
15771
15908
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15772
|
-
const required2 =
|
|
15909
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15910
|
+
rawEvaluator.required,
|
|
15911
|
+
rawEvaluator.min_score,
|
|
15912
|
+
name21,
|
|
15913
|
+
evalId
|
|
15914
|
+
);
|
|
15773
15915
|
evaluators.push({
|
|
15774
15916
|
name: name21,
|
|
15775
15917
|
type: "execution-metrics",
|
|
15776
15918
|
...validThresholds,
|
|
15777
15919
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15778
15920
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15921
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15779
15922
|
...negate !== void 0 ? { negate } : {}
|
|
15780
15923
|
});
|
|
15781
15924
|
continue;
|
|
@@ -15789,7 +15932,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15789
15932
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
15790
15933
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
15791
15934
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15792
|
-
const required2 =
|
|
15935
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15936
|
+
rawEvaluator.required,
|
|
15937
|
+
rawEvaluator.min_score,
|
|
15938
|
+
name21,
|
|
15939
|
+
evalId
|
|
15940
|
+
);
|
|
15793
15941
|
evaluators.push({
|
|
15794
15942
|
name: name21,
|
|
15795
15943
|
type: "skill-trigger",
|
|
@@ -15797,6 +15945,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15797
15945
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
15798
15946
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15799
15947
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15948
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15800
15949
|
...negate !== void 0 ? { negate } : {}
|
|
15801
15950
|
});
|
|
15802
15951
|
continue;
|
|
@@ -15808,13 +15957,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15808
15957
|
continue;
|
|
15809
15958
|
}
|
|
15810
15959
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15811
|
-
const required2 =
|
|
15960
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15961
|
+
rawEvaluator.required,
|
|
15962
|
+
rawEvaluator.min_score,
|
|
15963
|
+
name21,
|
|
15964
|
+
evalId
|
|
15965
|
+
);
|
|
15812
15966
|
evaluators.push({
|
|
15813
15967
|
name: name21,
|
|
15814
15968
|
type: "contains",
|
|
15815
15969
|
value,
|
|
15816
15970
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15817
15971
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15972
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15818
15973
|
...negate !== void 0 ? { negate } : {}
|
|
15819
15974
|
});
|
|
15820
15975
|
continue;
|
|
@@ -15828,13 +15983,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15828
15983
|
continue;
|
|
15829
15984
|
}
|
|
15830
15985
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15831
|
-
const required2 =
|
|
15986
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
15987
|
+
rawEvaluator.required,
|
|
15988
|
+
rawEvaluator.min_score,
|
|
15989
|
+
name21,
|
|
15990
|
+
evalId
|
|
15991
|
+
);
|
|
15832
15992
|
evaluators.push({
|
|
15833
15993
|
name: name21,
|
|
15834
15994
|
type: typeValue,
|
|
15835
15995
|
value,
|
|
15836
15996
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15837
15997
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
15998
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15838
15999
|
...negate !== void 0 ? { negate } : {}
|
|
15839
16000
|
});
|
|
15840
16001
|
continue;
|
|
@@ -15846,13 +16007,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15846
16007
|
continue;
|
|
15847
16008
|
}
|
|
15848
16009
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15849
|
-
const required2 =
|
|
16010
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16011
|
+
rawEvaluator.required,
|
|
16012
|
+
rawEvaluator.min_score,
|
|
16013
|
+
name21,
|
|
16014
|
+
evalId
|
|
16015
|
+
);
|
|
15850
16016
|
evaluators.push({
|
|
15851
16017
|
name: name21,
|
|
15852
16018
|
type: "icontains",
|
|
15853
16019
|
value,
|
|
15854
16020
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15855
16021
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16022
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15856
16023
|
...negate !== void 0 ? { negate } : {}
|
|
15857
16024
|
});
|
|
15858
16025
|
continue;
|
|
@@ -15866,13 +16033,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15866
16033
|
continue;
|
|
15867
16034
|
}
|
|
15868
16035
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15869
|
-
const required2 =
|
|
16036
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16037
|
+
rawEvaluator.required,
|
|
16038
|
+
rawEvaluator.min_score,
|
|
16039
|
+
name21,
|
|
16040
|
+
evalId
|
|
16041
|
+
);
|
|
15870
16042
|
evaluators.push({
|
|
15871
16043
|
name: name21,
|
|
15872
16044
|
type: typeValue,
|
|
15873
16045
|
value,
|
|
15874
16046
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15875
16047
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16048
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15876
16049
|
...negate !== void 0 ? { negate } : {}
|
|
15877
16050
|
});
|
|
15878
16051
|
continue;
|
|
@@ -15884,13 +16057,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15884
16057
|
continue;
|
|
15885
16058
|
}
|
|
15886
16059
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15887
|
-
const required2 =
|
|
16060
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16061
|
+
rawEvaluator.required,
|
|
16062
|
+
rawEvaluator.min_score,
|
|
16063
|
+
name21,
|
|
16064
|
+
evalId
|
|
16065
|
+
);
|
|
15888
16066
|
evaluators.push({
|
|
15889
16067
|
name: name21,
|
|
15890
16068
|
type: typeValue,
|
|
15891
16069
|
value,
|
|
15892
16070
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15893
16071
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16072
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15894
16073
|
...negate !== void 0 ? { negate } : {}
|
|
15895
16074
|
});
|
|
15896
16075
|
continue;
|
|
@@ -15903,7 +16082,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15903
16082
|
}
|
|
15904
16083
|
const flags = asString(rawEvaluator.flags);
|
|
15905
16084
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15906
|
-
const required2 =
|
|
16085
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16086
|
+
rawEvaluator.required,
|
|
16087
|
+
rawEvaluator.min_score,
|
|
16088
|
+
name21,
|
|
16089
|
+
evalId
|
|
16090
|
+
);
|
|
15907
16091
|
evaluators.push({
|
|
15908
16092
|
name: name21,
|
|
15909
16093
|
type: "regex",
|
|
@@ -15911,18 +16095,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15911
16095
|
...flags !== void 0 ? { flags } : {},
|
|
15912
16096
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15913
16097
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16098
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15914
16099
|
...negate !== void 0 ? { negate } : {}
|
|
15915
16100
|
});
|
|
15916
16101
|
continue;
|
|
15917
16102
|
}
|
|
15918
16103
|
if (typeValue === "is-json") {
|
|
15919
16104
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15920
|
-
const required2 =
|
|
16105
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16106
|
+
rawEvaluator.required,
|
|
16107
|
+
rawEvaluator.min_score,
|
|
16108
|
+
name21,
|
|
16109
|
+
evalId
|
|
16110
|
+
);
|
|
15921
16111
|
evaluators.push({
|
|
15922
16112
|
name: name21,
|
|
15923
16113
|
type: "is-json",
|
|
15924
16114
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15925
16115
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16116
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15926
16117
|
...negate !== void 0 ? { negate } : {}
|
|
15927
16118
|
});
|
|
15928
16119
|
continue;
|
|
@@ -15934,13 +16125,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15934
16125
|
continue;
|
|
15935
16126
|
}
|
|
15936
16127
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15937
|
-
const required2 =
|
|
16128
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16129
|
+
rawEvaluator.required,
|
|
16130
|
+
rawEvaluator.min_score,
|
|
16131
|
+
name21,
|
|
16132
|
+
evalId
|
|
16133
|
+
);
|
|
15938
16134
|
evaluators.push({
|
|
15939
16135
|
name: name21,
|
|
15940
16136
|
type: "equals",
|
|
15941
16137
|
value,
|
|
15942
16138
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15943
16139
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16140
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15944
16141
|
...negate !== void 0 ? { negate } : {}
|
|
15945
16142
|
});
|
|
15946
16143
|
continue;
|
|
@@ -15976,7 +16173,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15976
16173
|
continue;
|
|
15977
16174
|
}
|
|
15978
16175
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
15979
|
-
const required2 =
|
|
16176
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16177
|
+
rawEvaluator.required,
|
|
16178
|
+
rawEvaluator.min_score,
|
|
16179
|
+
name21,
|
|
16180
|
+
evalId
|
|
16181
|
+
);
|
|
15980
16182
|
evaluators.push({
|
|
15981
16183
|
name: name21,
|
|
15982
16184
|
type: "llm-grader",
|
|
@@ -15984,6 +16186,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15984
16186
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
15985
16187
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
15986
16188
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16189
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
15987
16190
|
...negate !== void 0 ? { negate } : {}
|
|
15988
16191
|
});
|
|
15989
16192
|
continue;
|
|
@@ -16053,7 +16256,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
16053
16256
|
continue;
|
|
16054
16257
|
}
|
|
16055
16258
|
const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
16056
|
-
const required2 =
|
|
16259
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
16260
|
+
rawEvaluator.required,
|
|
16261
|
+
rawEvaluator.min_score,
|
|
16262
|
+
name21,
|
|
16263
|
+
evalId
|
|
16264
|
+
);
|
|
16057
16265
|
evaluators.push({
|
|
16058
16266
|
name: name21,
|
|
16059
16267
|
type: "llm-grader",
|
|
@@ -16061,12 +16269,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
16061
16269
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
16062
16270
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
16063
16271
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
16272
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
16064
16273
|
...negate !== void 0 ? { negate } : {}
|
|
16065
16274
|
});
|
|
16066
16275
|
continue;
|
|
16067
16276
|
}
|
|
16068
16277
|
const weight = validateWeight(rawEvaluator.weight, name21, evalId);
|
|
16069
|
-
const required =
|
|
16278
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
16279
|
+
rawEvaluator.required,
|
|
16280
|
+
rawEvaluator.min_score,
|
|
16281
|
+
name21,
|
|
16282
|
+
evalId
|
|
16283
|
+
);
|
|
16070
16284
|
const knownProps = /* @__PURE__ */ new Set([
|
|
16071
16285
|
"name",
|
|
16072
16286
|
"type",
|
|
@@ -16077,6 +16291,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
16077
16291
|
"weight",
|
|
16078
16292
|
"config",
|
|
16079
16293
|
"required",
|
|
16294
|
+
"min_score",
|
|
16080
16295
|
"negate",
|
|
16081
16296
|
"max_steps",
|
|
16082
16297
|
"maxSteps",
|
|
@@ -16106,6 +16321,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
16106
16321
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
16107
16322
|
...weight !== void 0 ? { weight } : {},
|
|
16108
16323
|
...required !== void 0 ? { required } : {},
|
|
16324
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
16109
16325
|
...negate !== void 0 ? { negate } : {},
|
|
16110
16326
|
...finalConfig ? { config: finalConfig } : {},
|
|
16111
16327
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -16237,10 +16453,23 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
16237
16453
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
|
|
16238
16454
|
}
|
|
16239
16455
|
}
|
|
16240
|
-
function
|
|
16241
|
-
|
|
16242
|
-
if (typeof
|
|
16243
|
-
|
|
16456
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
16457
|
+
const result = {};
|
|
16458
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
16459
|
+
result.min_score = rawMinScore;
|
|
16460
|
+
}
|
|
16461
|
+
if (rawRequired === true) {
|
|
16462
|
+
result.required = true;
|
|
16463
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
16464
|
+
if (result.min_score === void 0) {
|
|
16465
|
+
result.min_score = rawRequired;
|
|
16466
|
+
}
|
|
16467
|
+
result.required = rawRequired;
|
|
16468
|
+
logWarning2(
|
|
16469
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
16470
|
+
);
|
|
16471
|
+
}
|
|
16472
|
+
return result;
|
|
16244
16473
|
}
|
|
16245
16474
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
16246
16475
|
if (rawWeight === void 0) {
|
|
@@ -16283,16 +16512,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
16283
16512
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
16284
16513
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
16285
16514
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
16515
|
+
let minScore;
|
|
16286
16516
|
let requiredMinScore;
|
|
16287
16517
|
let required;
|
|
16288
|
-
if (typeof rawRubric.
|
|
16289
|
-
const
|
|
16290
|
-
if (
|
|
16518
|
+
if (typeof rawRubric.min_score === "number") {
|
|
16519
|
+
const ms = rawRubric.min_score;
|
|
16520
|
+
if (ms <= 0 || ms > 1) {
|
|
16291
16521
|
throw new Error(
|
|
16292
|
-
`Invalid
|
|
16522
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
16293
16523
|
);
|
|
16294
16524
|
}
|
|
16295
|
-
|
|
16525
|
+
minScore = ms;
|
|
16526
|
+
requiredMinScore = Math.round(ms * 10);
|
|
16527
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
16528
|
+
const rms = rawRubric.required_min_score;
|
|
16529
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
16530
|
+
throw new Error(
|
|
16531
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
16532
|
+
);
|
|
16533
|
+
}
|
|
16534
|
+
requiredMinScore = rms;
|
|
16535
|
+
minScore = rms / 10;
|
|
16536
|
+
logWarning2(
|
|
16537
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
16538
|
+
);
|
|
16296
16539
|
}
|
|
16297
16540
|
if (typeof rawRubric.required === "boolean") {
|
|
16298
16541
|
required = rawRubric.required;
|
|
@@ -16312,6 +16555,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
16312
16555
|
weight,
|
|
16313
16556
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
16314
16557
|
...required !== void 0 ? { required } : {},
|
|
16558
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
16315
16559
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
16316
16560
|
score_ranges: scoreRanges
|
|
16317
16561
|
});
|
|
@@ -16328,6 +16572,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
16328
16572
|
weight,
|
|
16329
16573
|
// Default to required: true if not specified (backward compatibility)
|
|
16330
16574
|
required: required ?? true,
|
|
16575
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
16331
16576
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
16332
16577
|
});
|
|
16333
16578
|
}
|
|
@@ -16456,12 +16701,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
16456
16701
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
16457
16702
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
16458
16703
|
};
|
|
16704
|
+
let inlineMinScore;
|
|
16705
|
+
let inlineRequiredMinScore;
|
|
16706
|
+
if (typeof rubric.min_score === "number") {
|
|
16707
|
+
inlineMinScore = rubric.min_score;
|
|
16708
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
16709
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
16710
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
16711
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
16712
|
+
}
|
|
16459
16713
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
16460
16714
|
return {
|
|
16461
16715
|
...baseRubric,
|
|
16462
16716
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
16463
16717
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
16464
|
-
...
|
|
16718
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
16719
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
16465
16720
|
score_ranges: scoreRanges
|
|
16466
16721
|
};
|
|
16467
16722
|
}
|
|
@@ -16469,7 +16724,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
16469
16724
|
...baseRubric,
|
|
16470
16725
|
outcome: expectedOutcome,
|
|
16471
16726
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
16472
|
-
...
|
|
16727
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
16728
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
16473
16729
|
};
|
|
16474
16730
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
16475
16731
|
if (rubricItems.length === 0) {
|
|
@@ -16851,6 +17107,9 @@ function resolveExpectedMessages(raw) {
|
|
|
16851
17107
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
16852
17108
|
var ANSI_RED2 = "\x1B[31m";
|
|
16853
17109
|
var ANSI_RESET6 = "\x1B[0m";
|
|
17110
|
+
function matchesFilter(id, filter2) {
|
|
17111
|
+
return typeof filter2 === "string" ? micromatch.isMatch(id, filter2) : filter2.some((pattern) => micromatch.isMatch(id, pattern));
|
|
17112
|
+
}
|
|
16854
17113
|
function detectFormat(filePath) {
|
|
16855
17114
|
const ext = path6.extname(filePath).toLowerCase();
|
|
16856
17115
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -16918,40 +17177,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
16918
17177
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
16919
17178
|
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
16920
17179
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
16921
|
-
const
|
|
16922
|
-
const
|
|
17180
|
+
const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
17181
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
16923
17182
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
16924
17183
|
const globalExecution = sidecar.execution;
|
|
16925
17184
|
if (verbose) {
|
|
16926
17185
|
console.log(`
|
|
16927
|
-
[JSONL
|
|
17186
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
16928
17187
|
console.log(` Cases: ${rawCases.length}`);
|
|
16929
|
-
console.log(`
|
|
17188
|
+
console.log(` Suite: ${suiteName}`);
|
|
16930
17189
|
if (sidecar.description) {
|
|
16931
17190
|
console.log(` Description: ${sidecar.description}`);
|
|
16932
17191
|
}
|
|
16933
17192
|
}
|
|
16934
17193
|
const results = [];
|
|
16935
17194
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
16936
|
-
const
|
|
17195
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
16937
17196
|
const lineNumber = lineIndex + 1;
|
|
16938
|
-
const id = asString4(
|
|
16939
|
-
if (filterPattern && (!id || !
|
|
17197
|
+
const id = asString4(testCaseConfig.id);
|
|
17198
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
16940
17199
|
continue;
|
|
16941
17200
|
}
|
|
16942
|
-
const conversationId = asString4(
|
|
16943
|
-
let outcome = asString4(
|
|
16944
|
-
if (!outcome &&
|
|
16945
|
-
outcome = asString4(
|
|
17201
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
17202
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
17203
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
17204
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
16946
17205
|
if (outcome) {
|
|
16947
17206
|
logWarning4(
|
|
16948
|
-
`Test '${asString4(
|
|
17207
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
16949
17208
|
);
|
|
16950
17209
|
}
|
|
16951
17210
|
}
|
|
16952
|
-
const rawInputMessages = resolveInputMessages(
|
|
16953
|
-
const expectedMessages = resolveExpectedMessages(
|
|
16954
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
17211
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
17212
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
17213
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
16955
17214
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
16956
17215
|
logError2(
|
|
16957
17216
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -16988,18 +17247,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
16988
17247
|
}
|
|
16989
17248
|
}
|
|
16990
17249
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
16991
|
-
const caseExecution = isJsonObject(
|
|
17250
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
16992
17251
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
16993
|
-
const
|
|
17252
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
16994
17253
|
let evaluators;
|
|
16995
17254
|
try {
|
|
16996
|
-
evaluators = await parseEvaluators(
|
|
17255
|
+
evaluators = await parseEvaluators(
|
|
17256
|
+
testCaseConfig,
|
|
17257
|
+
mergedExecution,
|
|
17258
|
+
searchRoots,
|
|
17259
|
+
id ?? "unknown"
|
|
17260
|
+
);
|
|
16997
17261
|
} catch (error) {
|
|
16998
17262
|
const message = error instanceof Error ? error.message : String(error);
|
|
16999
17263
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
17000
17264
|
continue;
|
|
17001
17265
|
}
|
|
17002
|
-
const inlineRubrics =
|
|
17266
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
17003
17267
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
17004
17268
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
17005
17269
|
if (rubricEvaluator) {
|
|
@@ -17010,7 +17274,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
17010
17274
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
17011
17275
|
const testCase = {
|
|
17012
17276
|
id,
|
|
17013
|
-
|
|
17277
|
+
suite: suiteName,
|
|
17014
17278
|
conversation_id: conversationId,
|
|
17015
17279
|
question,
|
|
17016
17280
|
input: inputMessages,
|
|
@@ -17018,7 +17282,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
17018
17282
|
reference_answer: referenceAnswer,
|
|
17019
17283
|
file_paths: userFilePaths,
|
|
17020
17284
|
criteria: outcome ?? "",
|
|
17021
|
-
evaluator:
|
|
17285
|
+
evaluator: testCaseEvaluatorKind,
|
|
17022
17286
|
assertions: evaluators
|
|
17023
17287
|
};
|
|
17024
17288
|
results.push(testCase);
|
|
@@ -17194,6 +17458,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
17194
17458
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
17195
17459
|
var ANSI_RED3 = "\x1B[31m";
|
|
17196
17460
|
var ANSI_RESET7 = "\x1B[0m";
|
|
17461
|
+
function matchesFilter2(id, filter2) {
|
|
17462
|
+
return typeof filter2 === "string" ? micromatch2.isMatch(id, filter2) : filter2.some((pattern) => micromatch2.isMatch(id, pattern));
|
|
17463
|
+
}
|
|
17197
17464
|
function resolveTests(suite) {
|
|
17198
17465
|
if (suite.tests !== void 0) return suite.tests;
|
|
17199
17466
|
if (suite.eval_cases !== void 0) {
|
|
@@ -17273,18 +17540,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
17273
17540
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
17274
17541
|
}
|
|
17275
17542
|
const suite = interpolated;
|
|
17276
|
-
const
|
|
17277
|
-
const
|
|
17278
|
-
const
|
|
17279
|
-
const
|
|
17543
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
17544
|
+
const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
17545
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
17546
|
+
const rawTestCases = resolveTests(suite);
|
|
17280
17547
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
17281
17548
|
const evalFileDir = path7.dirname(absoluteTestPath);
|
|
17282
|
-
let
|
|
17283
|
-
if (typeof
|
|
17284
|
-
const externalPath = path7.resolve(evalFileDir,
|
|
17285
|
-
|
|
17286
|
-
} else if (Array.isArray(
|
|
17287
|
-
|
|
17549
|
+
let expandedTestCases;
|
|
17550
|
+
if (typeof rawTestCases === "string") {
|
|
17551
|
+
const externalPath = path7.resolve(evalFileDir, rawTestCases);
|
|
17552
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
17553
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
17554
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
17288
17555
|
} else {
|
|
17289
17556
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
17290
17557
|
}
|
|
@@ -17299,32 +17566,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
17299
17566
|
}
|
|
17300
17567
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
17301
17568
|
const results = [];
|
|
17302
|
-
for (const
|
|
17303
|
-
if (!isJsonObject(
|
|
17569
|
+
for (const rawTestCase of expandedTestCases) {
|
|
17570
|
+
if (!isJsonObject(rawTestCase)) {
|
|
17304
17571
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
17305
17572
|
continue;
|
|
17306
17573
|
}
|
|
17307
|
-
const
|
|
17308
|
-
const id = asString5(
|
|
17309
|
-
if (filterPattern && (!id || !
|
|
17574
|
+
const testCaseConfig = rawTestCase;
|
|
17575
|
+
const id = asString5(testCaseConfig.id);
|
|
17576
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
17310
17577
|
continue;
|
|
17311
17578
|
}
|
|
17312
|
-
const conversationId = asString5(
|
|
17313
|
-
let outcome = asString5(
|
|
17314
|
-
if (!outcome &&
|
|
17315
|
-
outcome = asString5(
|
|
17579
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
17580
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
17581
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
17582
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
17316
17583
|
if (outcome) {
|
|
17317
17584
|
logWarning5(
|
|
17318
|
-
`Test '${asString5(
|
|
17585
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
17319
17586
|
);
|
|
17320
17587
|
}
|
|
17321
17588
|
}
|
|
17322
|
-
const caseExecution = isJsonObject(
|
|
17589
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
17323
17590
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
17591
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
17324
17592
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
17325
|
-
const testInputMessages = resolveInputMessages(
|
|
17326
|
-
const expectedMessages = resolveExpectedMessages(
|
|
17327
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
17593
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
17594
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
17595
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
17328
17596
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
17329
17597
|
logError3(
|
|
17330
17598
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -17371,16 +17639,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
17371
17639
|
}
|
|
17372
17640
|
}
|
|
17373
17641
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
17374
|
-
const
|
|
17642
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
17375
17643
|
let evaluators;
|
|
17376
17644
|
try {
|
|
17377
|
-
evaluators = await parseEvaluators(
|
|
17645
|
+
evaluators = await parseEvaluators(
|
|
17646
|
+
testCaseConfig,
|
|
17647
|
+
globalExecution,
|
|
17648
|
+
searchRoots,
|
|
17649
|
+
id ?? "unknown"
|
|
17650
|
+
);
|
|
17378
17651
|
} catch (error) {
|
|
17379
17652
|
const message = error instanceof Error ? error.message : String(error);
|
|
17380
17653
|
logError3(`Skipping test '${id}': ${message}`);
|
|
17381
17654
|
continue;
|
|
17382
17655
|
}
|
|
17383
|
-
const inlineRubrics =
|
|
17656
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
17384
17657
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
17385
17658
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
17386
17659
|
if (rubricEvaluator) {
|
|
@@ -17389,13 +17662,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
17389
17662
|
}
|
|
17390
17663
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
17391
17664
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
17392
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
17665
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
17393
17666
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
17394
|
-
const metadata = isJsonObject(
|
|
17395
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
17667
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
17668
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
17396
17669
|
const testCase = {
|
|
17397
17670
|
id,
|
|
17398
|
-
|
|
17671
|
+
suite: suiteName,
|
|
17399
17672
|
category: options?.category,
|
|
17400
17673
|
conversation_id: conversationId,
|
|
17401
17674
|
question,
|
|
@@ -17404,11 +17677,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
17404
17677
|
reference_answer: referenceAnswer,
|
|
17405
17678
|
file_paths: userFilePaths,
|
|
17406
17679
|
criteria: outcome ?? "",
|
|
17407
|
-
evaluator:
|
|
17680
|
+
evaluator: testCaseEvaluatorKind,
|
|
17408
17681
|
assertions: evaluators,
|
|
17409
17682
|
workspace: mergedWorkspace,
|
|
17410
17683
|
metadata,
|
|
17411
|
-
targets: caseTargets
|
|
17684
|
+
targets: caseTargets,
|
|
17685
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
17412
17686
|
};
|
|
17413
17687
|
results.push(testCase);
|
|
17414
17688
|
}
|
|
@@ -17939,7 +18213,7 @@ var AzureProvider = class {
|
|
|
17939
18213
|
};
|
|
17940
18214
|
this.retryConfig = config.retry;
|
|
17941
18215
|
const azure = createAzure(buildAzureOptions(config));
|
|
17942
|
-
this.model = azure.chat(config.deploymentName);
|
|
18216
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
17943
18217
|
}
|
|
17944
18218
|
id;
|
|
17945
18219
|
kind = "azure";
|
|
@@ -18065,7 +18339,9 @@ function buildAzureOptions(config) {
|
|
|
18065
18339
|
const options = {
|
|
18066
18340
|
apiKey: config.apiKey,
|
|
18067
18341
|
apiVersion: config.version,
|
|
18068
|
-
|
|
18342
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
18343
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
18344
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
18069
18345
|
};
|
|
18070
18346
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
18071
18347
|
if (baseURL) {
|
|
@@ -21575,6 +21851,22 @@ function extractAzureResourceName(baseUrl) {
|
|
|
21575
21851
|
if (urlMatch) return urlMatch[1];
|
|
21576
21852
|
return baseUrl;
|
|
21577
21853
|
}
|
|
21854
|
+
function normalizeAzureSdkBaseUrl(baseUrl) {
|
|
21855
|
+
const trimmed = baseUrl.trim().replace(/\/+$/, "");
|
|
21856
|
+
if (!trimmed) {
|
|
21857
|
+
return trimmed;
|
|
21858
|
+
}
|
|
21859
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
21860
|
+
return `https://${trimmed}.openai.azure.com/openai/v1`;
|
|
21861
|
+
}
|
|
21862
|
+
if (/\/openai\/v1$/i.test(trimmed)) {
|
|
21863
|
+
return trimmed;
|
|
21864
|
+
}
|
|
21865
|
+
if (/\/openai$/i.test(trimmed)) {
|
|
21866
|
+
return `${trimmed}/v1`;
|
|
21867
|
+
}
|
|
21868
|
+
return `${trimmed}/openai/v1`;
|
|
21869
|
+
}
|
|
21578
21870
|
function extractPiTextContent(content) {
|
|
21579
21871
|
if (typeof content === "string") {
|
|
21580
21872
|
return content;
|
|
@@ -22397,6 +22689,30 @@ async function defaultPiRunner(options) {
|
|
|
22397
22689
|
});
|
|
22398
22690
|
});
|
|
22399
22691
|
}
|
|
22692
|
+
var logged = false;
|
|
22693
|
+
function getAgentvHome() {
|
|
22694
|
+
const envHome = process.env.AGENTV_HOME;
|
|
22695
|
+
if (envHome && envHome !== "undefined") {
|
|
22696
|
+
if (!logged) {
|
|
22697
|
+
logged = true;
|
|
22698
|
+
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
22699
|
+
}
|
|
22700
|
+
return envHome;
|
|
22701
|
+
}
|
|
22702
|
+
return path20.join(os2.homedir(), ".agentv");
|
|
22703
|
+
}
|
|
22704
|
+
function getWorkspacesRoot() {
|
|
22705
|
+
return path20.join(getAgentvHome(), "workspaces");
|
|
22706
|
+
}
|
|
22707
|
+
function getSubagentsRoot() {
|
|
22708
|
+
return path20.join(getAgentvHome(), "subagents");
|
|
22709
|
+
}
|
|
22710
|
+
function getTraceStateRoot() {
|
|
22711
|
+
return path20.join(getAgentvHome(), "trace-state");
|
|
22712
|
+
}
|
|
22713
|
+
function getWorkspacePoolRoot() {
|
|
22714
|
+
return path20.join(getAgentvHome(), "workspace-pool");
|
|
22715
|
+
}
|
|
22400
22716
|
var piCodingAgentModule = null;
|
|
22401
22717
|
var piAiModule = null;
|
|
22402
22718
|
var loadingPromise = null;
|
|
@@ -22414,46 +22730,126 @@ async function promptInstall() {
|
|
|
22414
22730
|
rl.close();
|
|
22415
22731
|
}
|
|
22416
22732
|
}
|
|
22417
|
-
function
|
|
22418
|
-
|
|
22419
|
-
|
|
22420
|
-
|
|
22733
|
+
function findManagedSdkInstallRoot() {
|
|
22734
|
+
return path21.join(getAgentvHome(), "deps", "pi-sdk");
|
|
22735
|
+
}
|
|
22736
|
+
function resolveGlobalNpmRoot() {
|
|
22737
|
+
try {
|
|
22738
|
+
const root = execSync2("npm root -g", {
|
|
22739
|
+
encoding: "utf-8",
|
|
22740
|
+
stdio: ["ignore", "pipe", "ignore"]
|
|
22741
|
+
}).trim();
|
|
22742
|
+
return root.length > 0 ? root : void 0;
|
|
22743
|
+
} catch {
|
|
22744
|
+
return void 0;
|
|
22745
|
+
}
|
|
22746
|
+
}
|
|
22747
|
+
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
|
|
22748
|
+
return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
|
|
22749
|
+
}
|
|
22750
|
+
function findAccessiblePath(paths) {
|
|
22751
|
+
for (const candidate of paths) {
|
|
22421
22752
|
try {
|
|
22422
|
-
|
|
22423
|
-
|
|
22424
|
-
return dir;
|
|
22753
|
+
accessSync2(candidate);
|
|
22754
|
+
return candidate;
|
|
22425
22755
|
} catch {
|
|
22426
|
-
const parent = path20.dirname(dir);
|
|
22427
|
-
if (parent === dir) break;
|
|
22428
|
-
dir = parent;
|
|
22429
22756
|
}
|
|
22430
22757
|
}
|
|
22431
|
-
return
|
|
22758
|
+
return void 0;
|
|
22432
22759
|
}
|
|
22433
|
-
async function
|
|
22760
|
+
async function tryImportLocalSdkModules() {
|
|
22434
22761
|
try {
|
|
22435
22762
|
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
22436
22763
|
import("@mariozechner/pi-coding-agent"),
|
|
22437
22764
|
import("@mariozechner/pi-ai")
|
|
22438
22765
|
]);
|
|
22766
|
+
return true;
|
|
22439
22767
|
} catch {
|
|
22440
|
-
|
|
22441
|
-
|
|
22442
|
-
|
|
22443
|
-
|
|
22444
|
-
|
|
22445
|
-
|
|
22446
|
-
|
|
22447
|
-
|
|
22448
|
-
|
|
22449
|
-
|
|
22450
|
-
|
|
22451
|
-
|
|
22452
|
-
|
|
22453
|
-
|
|
22454
|
-
|
|
22768
|
+
return false;
|
|
22769
|
+
}
|
|
22770
|
+
}
|
|
22771
|
+
async function tryImportManagedSdkModules() {
|
|
22772
|
+
const managedRoot = findManagedSdkInstallRoot();
|
|
22773
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
22774
|
+
path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
|
|
22775
|
+
]);
|
|
22776
|
+
const piAiEntry = findAccessiblePath([
|
|
22777
|
+
path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
|
|
22778
|
+
path21.join(
|
|
22779
|
+
managedRoot,
|
|
22780
|
+
"node_modules",
|
|
22781
|
+
"@mariozechner",
|
|
22782
|
+
"pi-coding-agent",
|
|
22783
|
+
"node_modules",
|
|
22784
|
+
"@mariozechner",
|
|
22785
|
+
"pi-ai",
|
|
22786
|
+
"dist",
|
|
22787
|
+
"index.js"
|
|
22788
|
+
)
|
|
22789
|
+
]);
|
|
22790
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
22791
|
+
try {
|
|
22792
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
22793
|
+
import(pathToFileURL(piCodingAgentEntry).href),
|
|
22794
|
+
import(pathToFileURL(piAiEntry).href)
|
|
22795
|
+
]);
|
|
22796
|
+
return true;
|
|
22797
|
+
} catch {
|
|
22798
|
+
return false;
|
|
22799
|
+
}
|
|
22800
|
+
}
|
|
22801
|
+
async function tryImportGlobalSdkModules() {
|
|
22802
|
+
const globalNpmRoot = resolveGlobalNpmRoot();
|
|
22803
|
+
if (!globalNpmRoot) return false;
|
|
22804
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
22805
|
+
buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
|
|
22806
|
+
]);
|
|
22807
|
+
const piAiEntry = findAccessiblePath([
|
|
22808
|
+
buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
|
|
22809
|
+
path21.join(
|
|
22810
|
+
globalNpmRoot,
|
|
22811
|
+
"@mariozechner",
|
|
22812
|
+
"pi-coding-agent",
|
|
22813
|
+
"node_modules",
|
|
22814
|
+
"@mariozechner",
|
|
22815
|
+
"pi-ai",
|
|
22816
|
+
"dist",
|
|
22817
|
+
"index.js"
|
|
22818
|
+
)
|
|
22819
|
+
]);
|
|
22820
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
22821
|
+
try {
|
|
22822
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
22823
|
+
import(pathToFileURL(piCodingAgentEntry).href),
|
|
22824
|
+
import(pathToFileURL(piAiEntry).href)
|
|
22825
|
+
]);
|
|
22826
|
+
return true;
|
|
22827
|
+
} catch {
|
|
22828
|
+
return false;
|
|
22829
|
+
}
|
|
22830
|
+
}
|
|
22831
|
+
function installSdkModules(installDir) {
|
|
22832
|
+
console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
|
|
22833
|
+
mkdirSync(installDir, { recursive: true });
|
|
22834
|
+
execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
|
|
22835
|
+
cwd: installDir,
|
|
22836
|
+
stdio: "inherit"
|
|
22837
|
+
});
|
|
22838
|
+
}
|
|
22839
|
+
async function doLoadSdkModules() {
|
|
22840
|
+
if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
|
|
22841
|
+
return;
|
|
22842
|
+
}
|
|
22843
|
+
if (await promptInstall()) {
|
|
22844
|
+
const installDir = findManagedSdkInstallRoot();
|
|
22845
|
+
installSdkModules(installDir);
|
|
22846
|
+
if (await tryImportManagedSdkModules()) {
|
|
22847
|
+
return;
|
|
22455
22848
|
}
|
|
22456
22849
|
}
|
|
22850
|
+
throw new Error(
|
|
22851
|
+
"pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
|
|
22852
|
+
);
|
|
22457
22853
|
}
|
|
22458
22854
|
async function loadSdkModules() {
|
|
22459
22855
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -22510,12 +22906,16 @@ var PiCodingAgentProvider = class {
|
|
|
22510
22906
|
try {
|
|
22511
22907
|
const cwd = this.resolveCwd(request.cwd);
|
|
22512
22908
|
const rawProvider = this.config.subprovider ?? "google";
|
|
22513
|
-
const
|
|
22909
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
22910
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
22514
22911
|
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
22515
22912
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
22516
22913
|
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
22517
|
-
this.setBaseUrlEnv(rawProvider, hasBaseUrl);
|
|
22914
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
22518
22915
|
let model = sdk.getModel(providerName, modelId);
|
|
22916
|
+
if (model && normalizedBaseUrl) {
|
|
22917
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
22918
|
+
}
|
|
22519
22919
|
if (!model) {
|
|
22520
22920
|
const envProvider = providerName.replace(/-responses$/, "");
|
|
22521
22921
|
model = {
|
|
@@ -22523,7 +22923,7 @@ var PiCodingAgentProvider = class {
|
|
|
22523
22923
|
name: modelId,
|
|
22524
22924
|
api: providerName,
|
|
22525
22925
|
provider: envProvider,
|
|
22526
|
-
baseUrl:
|
|
22926
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
22527
22927
|
reasoning: false,
|
|
22528
22928
|
input: ["text"],
|
|
22529
22929
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
@@ -22690,19 +23090,27 @@ ${fileList}`;
|
|
|
22690
23090
|
}
|
|
22691
23091
|
}
|
|
22692
23092
|
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
22693
|
-
setBaseUrlEnv(providerName, hasBaseUrl = false) {
|
|
22694
|
-
|
|
23093
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
23094
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
23095
|
+
if (!normalizedBaseUrl) return;
|
|
22695
23096
|
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
22696
23097
|
if (envKey) {
|
|
22697
|
-
process.env[envKey] =
|
|
23098
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
22698
23099
|
}
|
|
22699
23100
|
}
|
|
23101
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
23102
|
+
if (!baseUrl) return void 0;
|
|
23103
|
+
if (providerName.toLowerCase() === "azure") {
|
|
23104
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
23105
|
+
}
|
|
23106
|
+
return baseUrl;
|
|
23107
|
+
}
|
|
22700
23108
|
resolveCwd(cwdOverride) {
|
|
22701
23109
|
if (cwdOverride) {
|
|
22702
|
-
return
|
|
23110
|
+
return path21.resolve(cwdOverride);
|
|
22703
23111
|
}
|
|
22704
23112
|
if (this.config.cwd) {
|
|
22705
|
-
return
|
|
23113
|
+
return path21.resolve(this.config.cwd);
|
|
22706
23114
|
}
|
|
22707
23115
|
return process.cwd();
|
|
22708
23116
|
}
|
|
@@ -22721,9 +23129,9 @@ ${fileList}`;
|
|
|
22721
23129
|
}
|
|
22722
23130
|
resolveLogDirectory() {
|
|
22723
23131
|
if (this.config.logDir) {
|
|
22724
|
-
return
|
|
23132
|
+
return path21.resolve(this.config.logDir);
|
|
22725
23133
|
}
|
|
22726
|
-
return
|
|
23134
|
+
return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
22727
23135
|
}
|
|
22728
23136
|
async createStreamLogger(request) {
|
|
22729
23137
|
const logDir = this.resolveLogDirectory();
|
|
@@ -22737,7 +23145,7 @@ ${fileList}`;
|
|
|
22737
23145
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
22738
23146
|
return void 0;
|
|
22739
23147
|
}
|
|
22740
|
-
const filePath =
|
|
23148
|
+
const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
|
|
22741
23149
|
try {
|
|
22742
23150
|
const logger = await PiStreamLogger2.create({
|
|
22743
23151
|
filePath,
|
|
@@ -22961,7 +23369,7 @@ async function readDirEntries(target) {
|
|
|
22961
23369
|
const entries = await readdir2(target, { withFileTypes: true });
|
|
22962
23370
|
return entries.map((entry) => ({
|
|
22963
23371
|
name: entry.name,
|
|
22964
|
-
absolutePath:
|
|
23372
|
+
absolutePath: path222.join(target, entry.name),
|
|
22965
23373
|
isDirectory: entry.isDirectory()
|
|
22966
23374
|
}));
|
|
22967
23375
|
}
|
|
@@ -22975,7 +23383,7 @@ async function removeIfExists(target) {
|
|
|
22975
23383
|
}
|
|
22976
23384
|
}
|
|
22977
23385
|
function pathToFileUri2(filePath) {
|
|
22978
|
-
const absolutePath =
|
|
23386
|
+
const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
|
|
22979
23387
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
22980
23388
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
22981
23389
|
return `file:///${normalizedPath}`;
|
|
@@ -23067,8 +23475,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
23067
23475
|
});
|
|
23068
23476
|
}
|
|
23069
23477
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
23070
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
23071
|
-
const responseList = responseFiles.map((file) => `"${
|
|
23478
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
|
|
23479
|
+
const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
|
|
23072
23480
|
return renderTemplate2(templateContent, {
|
|
23073
23481
|
requestFiles: requestLines,
|
|
23074
23482
|
responseList
|
|
@@ -23128,7 +23536,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
23128
23536
|
}
|
|
23129
23537
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
23130
23538
|
if (!silent) {
|
|
23131
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
23539
|
+
const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
|
|
23132
23540
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
23133
23541
|
}
|
|
23134
23542
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -23137,7 +23545,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
23137
23545
|
while (pending.size > 0) {
|
|
23138
23546
|
if (Date.now() >= deadline) {
|
|
23139
23547
|
if (!silent) {
|
|
23140
|
-
const remaining = [...pending].map((f) =>
|
|
23548
|
+
const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
|
|
23141
23549
|
console.error(
|
|
23142
23550
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
23143
23551
|
);
|
|
@@ -23184,30 +23592,6 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
23184
23592
|
}
|
|
23185
23593
|
return true;
|
|
23186
23594
|
}
|
|
23187
|
-
var logged = false;
|
|
23188
|
-
function getAgentvHome() {
|
|
23189
|
-
const envHome = process.env.AGENTV_HOME;
|
|
23190
|
-
if (envHome && envHome !== "undefined") {
|
|
23191
|
-
if (!logged) {
|
|
23192
|
-
logged = true;
|
|
23193
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
23194
|
-
}
|
|
23195
|
-
return envHome;
|
|
23196
|
-
}
|
|
23197
|
-
return path25.join(os2.homedir(), ".agentv");
|
|
23198
|
-
}
|
|
23199
|
-
function getWorkspacesRoot() {
|
|
23200
|
-
return path25.join(getAgentvHome(), "workspaces");
|
|
23201
|
-
}
|
|
23202
|
-
function getSubagentsRoot() {
|
|
23203
|
-
return path25.join(getAgentvHome(), "subagents");
|
|
23204
|
-
}
|
|
23205
|
-
function getTraceStateRoot() {
|
|
23206
|
-
return path25.join(getAgentvHome(), "trace-state");
|
|
23207
|
-
}
|
|
23208
|
-
function getWorkspacePoolRoot() {
|
|
23209
|
-
return path25.join(getAgentvHome(), "workspace-pool");
|
|
23210
|
-
}
|
|
23211
23595
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
23212
23596
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
23213
23597
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -24428,9 +24812,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
24428
24812
|
const resolved = resolveTargetDefinition(definition, env);
|
|
24429
24813
|
return createProvider(resolved);
|
|
24430
24814
|
}
|
|
24431
|
-
var
|
|
24432
|
-
|
|
24433
|
-
|
|
24815
|
+
var DEFAULT_THRESHOLD = 0.8;
|
|
24816
|
+
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
24817
|
+
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
24818
|
+
return score >= threshold ? "pass" : "fail";
|
|
24434
24819
|
}
|
|
24435
24820
|
function clampScore(value) {
|
|
24436
24821
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -24612,13 +24997,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
24612
24997
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
24613
24998
|
const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
24614
24999
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
24615
|
-
const
|
|
25000
|
+
const path50 = await import("node:path");
|
|
24616
25001
|
const { randomUUID: randomUUID10 } = await import("node:crypto");
|
|
24617
|
-
const dir =
|
|
25002
|
+
const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
24618
25003
|
await mkdir16(dir, { recursive: true });
|
|
24619
|
-
const stdinPath =
|
|
24620
|
-
const stdoutPath =
|
|
24621
|
-
const stderrPath =
|
|
25004
|
+
const stdinPath = path50.join(dir, "stdin.txt");
|
|
25005
|
+
const stdoutPath = path50.join(dir, "stdout.txt");
|
|
25006
|
+
const stderrPath = path50.join(dir, "stderr.txt");
|
|
24622
25007
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
24623
25008
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
24624
25009
|
const { spawn: spawn5 } = await import("node:child_process");
|
|
@@ -25799,7 +26184,7 @@ ${outputSchema2}`;
|
|
|
25799
26184
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
25800
26185
|
for (const rubric of rubrics) {
|
|
25801
26186
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
25802
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
26187
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
25803
26188
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
25804
26189
|
if (rubric.outcome) {
|
|
25805
26190
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -25853,54 +26238,106 @@ ${outputSchema2}`;
|
|
|
25853
26238
|
async runWithRetry(options) {
|
|
25854
26239
|
const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
25855
26240
|
let lastError;
|
|
26241
|
+
let lastInvalidResponse;
|
|
26242
|
+
let shouldAttemptStructureFix = false;
|
|
25856
26243
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
25857
26244
|
try {
|
|
25858
|
-
const
|
|
25859
|
-
|
|
25860
|
-
|
|
25861
|
-
|
|
25862
|
-
|
|
25863
|
-
|
|
25864
|
-
|
|
25865
|
-
|
|
25866
|
-
|
|
25867
|
-
|
|
25868
|
-
|
|
25869
|
-
|
|
25870
|
-
|
|
25871
|
-
|
|
25872
|
-
|
|
25873
|
-
|
|
25874
|
-
]
|
|
25875
|
-
}
|
|
25876
|
-
],
|
|
25877
|
-
...modelOptions
|
|
25878
|
-
}) : await generateText({
|
|
25879
|
-
model,
|
|
25880
|
-
system: systemPrompt,
|
|
25881
|
-
prompt: userPrompt,
|
|
25882
|
-
...modelOptions
|
|
25883
|
-
});
|
|
25884
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
25885
|
-
const rawUsage = result.usage;
|
|
25886
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
25887
|
-
return { data: data2, tokenUsage };
|
|
26245
|
+
const result = await this.generateStructuredResponse({
|
|
26246
|
+
context: context2,
|
|
26247
|
+
graderProvider,
|
|
26248
|
+
systemPrompt,
|
|
26249
|
+
userPrompt,
|
|
26250
|
+
images
|
|
26251
|
+
});
|
|
26252
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
26253
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
26254
|
+
let data;
|
|
26255
|
+
try {
|
|
26256
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
26257
|
+
} catch (e) {
|
|
26258
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
26259
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
26260
|
+
continue;
|
|
25888
26261
|
}
|
|
25889
|
-
|
|
25890
|
-
|
|
26262
|
+
return {
|
|
26263
|
+
data,
|
|
26264
|
+
providerResponse: result.providerResponse,
|
|
26265
|
+
tokenUsage: result.tokenUsage
|
|
26266
|
+
};
|
|
26267
|
+
} catch (e) {
|
|
26268
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
26269
|
+
}
|
|
26270
|
+
}
|
|
26271
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
26272
|
+
try {
|
|
26273
|
+
const repaired = await this.generateStructuredResponse({
|
|
26274
|
+
context: context2,
|
|
26275
|
+
graderProvider,
|
|
25891
26276
|
systemPrompt,
|
|
25892
|
-
|
|
25893
|
-
|
|
25894
|
-
|
|
25895
|
-
|
|
26277
|
+
userPrompt: buildStructureRepairPrompt({
|
|
26278
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
26279
|
+
invalidResponse: lastInvalidResponse.text
|
|
26280
|
+
})
|
|
25896
26281
|
});
|
|
25897
|
-
const data = schema.parse(parseJsonFromText(
|
|
25898
|
-
return {
|
|
26282
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
26283
|
+
return {
|
|
26284
|
+
data,
|
|
26285
|
+
providerResponse: repaired.providerResponse,
|
|
26286
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
26287
|
+
};
|
|
25899
26288
|
} catch (e) {
|
|
25900
26289
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
25901
26290
|
}
|
|
25902
26291
|
}
|
|
25903
|
-
throw new Error(
|
|
26292
|
+
throw new Error(
|
|
26293
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
26294
|
+
);
|
|
26295
|
+
}
|
|
26296
|
+
async generateStructuredResponse(options) {
|
|
26297
|
+
const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
26298
|
+
const model = graderProvider.asLanguageModel?.();
|
|
26299
|
+
if (model) {
|
|
26300
|
+
const modelOptions = {
|
|
26301
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
26302
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
26303
|
+
};
|
|
26304
|
+
const hasImages = images && images.length > 0;
|
|
26305
|
+
const result = hasImages ? await generateText({
|
|
26306
|
+
model,
|
|
26307
|
+
system: systemPrompt,
|
|
26308
|
+
messages: [
|
|
26309
|
+
{
|
|
26310
|
+
role: "user",
|
|
26311
|
+
content: [
|
|
26312
|
+
{ type: "text", text: userPrompt },
|
|
26313
|
+
...toAiSdkImageParts(images)
|
|
26314
|
+
]
|
|
26315
|
+
}
|
|
26316
|
+
],
|
|
26317
|
+
...modelOptions
|
|
26318
|
+
}) : await generateText({
|
|
26319
|
+
model,
|
|
26320
|
+
system: systemPrompt,
|
|
26321
|
+
prompt: userPrompt,
|
|
26322
|
+
...modelOptions
|
|
26323
|
+
});
|
|
26324
|
+
const rawUsage = result.usage;
|
|
26325
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
26326
|
+
return { text: result.text, tokenUsage };
|
|
26327
|
+
}
|
|
26328
|
+
const response = await graderProvider.invoke({
|
|
26329
|
+
question: userPrompt,
|
|
26330
|
+
systemPrompt,
|
|
26331
|
+
evalCaseId: context2.evalCase.id,
|
|
26332
|
+
attempt: context2.attempt,
|
|
26333
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
26334
|
+
temperature: this.temperature
|
|
26335
|
+
});
|
|
26336
|
+
return {
|
|
26337
|
+
text: extractLastAssistantContent(response.output),
|
|
26338
|
+
providerResponse: response,
|
|
26339
|
+
tokenUsage: response.tokenUsage
|
|
26340
|
+
};
|
|
25904
26341
|
}
|
|
25905
26342
|
};
|
|
25906
26343
|
function buildOutputSchema() {
|
|
@@ -25920,6 +26357,29 @@ function buildOutputSchema() {
|
|
|
25920
26357
|
"}"
|
|
25921
26358
|
].join("\n");
|
|
25922
26359
|
}
|
|
26360
|
+
function buildStructureRepairPrompt(options) {
|
|
26361
|
+
const { validationError, invalidResponse } = options;
|
|
26362
|
+
return [
|
|
26363
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
26364
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
26365
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
26366
|
+
"",
|
|
26367
|
+
"Validation error:",
|
|
26368
|
+
validationError,
|
|
26369
|
+
"",
|
|
26370
|
+
"Invalid response:",
|
|
26371
|
+
invalidResponse
|
|
26372
|
+
].join("\n");
|
|
26373
|
+
}
|
|
26374
|
+
function sumTokenUsage(first, second) {
|
|
26375
|
+
if (!first && !second) {
|
|
26376
|
+
return void 0;
|
|
26377
|
+
}
|
|
26378
|
+
return {
|
|
26379
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
26380
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
26381
|
+
};
|
|
26382
|
+
}
|
|
25923
26383
|
function buildRubricOutputSchema() {
|
|
25924
26384
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
25925
26385
|
You must return a valid JSON object matching this schema:
|
|
@@ -26019,19 +26479,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
26019
26479
|
rawScores[rubric.id] = rawScore;
|
|
26020
26480
|
totalWeight += rubric.weight;
|
|
26021
26481
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
26022
|
-
let
|
|
26023
|
-
if (rubric.
|
|
26024
|
-
|
|
26482
|
+
let minScoreThreshold;
|
|
26483
|
+
if (rubric.min_score !== void 0) {
|
|
26484
|
+
minScoreThreshold = rubric.min_score;
|
|
26485
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
26486
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
26025
26487
|
} else if (rubric.required === true) {
|
|
26026
|
-
|
|
26488
|
+
minScoreThreshold = 1;
|
|
26027
26489
|
}
|
|
26028
26490
|
const matchingRange = rubric.score_ranges?.find(
|
|
26029
26491
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
26030
26492
|
);
|
|
26031
26493
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
26032
26494
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
26033
|
-
const passed = !(
|
|
26034
|
-
if (
|
|
26495
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
26496
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
26035
26497
|
failedRequired = true;
|
|
26036
26498
|
}
|
|
26037
26499
|
assertions.push({
|
|
@@ -26108,11 +26570,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
26108
26570
|
execute: async (input) => {
|
|
26109
26571
|
try {
|
|
26110
26572
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
26111
|
-
const
|
|
26112
|
-
if (
|
|
26573
|
+
const stat11 = await fs2.stat(resolved);
|
|
26574
|
+
if (stat11.isDirectory()) {
|
|
26113
26575
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
26114
26576
|
}
|
|
26115
|
-
const buffer = Buffer.alloc(Math.min(
|
|
26577
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
26116
26578
|
const fd = await fs2.open(resolved, "r");
|
|
26117
26579
|
try {
|
|
26118
26580
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -26120,8 +26582,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
26120
26582
|
await fd.close();
|
|
26121
26583
|
}
|
|
26122
26584
|
const content = buffer.toString("utf-8");
|
|
26123
|
-
const truncated =
|
|
26124
|
-
return { content, truncated, size:
|
|
26585
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
26586
|
+
return { content, truncated, size: stat11.size };
|
|
26125
26587
|
} catch (error) {
|
|
26126
26588
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
26127
26589
|
}
|
|
@@ -26172,8 +26634,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
26172
26634
|
const ext = path35.extname(entry.name).toLowerCase();
|
|
26173
26635
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
26174
26636
|
try {
|
|
26175
|
-
const
|
|
26176
|
-
if (
|
|
26637
|
+
const stat11 = await fs2.stat(fullPath);
|
|
26638
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
26177
26639
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
26178
26640
|
const lines = content.split("\n");
|
|
26179
26641
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -26806,115 +27268,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
26806
27268
|
* Evaluate a single field against the expected value.
|
|
26807
27269
|
*/
|
|
26808
27270
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
26809
|
-
const { path:
|
|
26810
|
-
const candidateValue = resolvePath(candidateData,
|
|
26811
|
-
const expectedValue = resolvePath(expectedData,
|
|
27271
|
+
const { path: path50, match, required = true, weight = 1 } = fieldConfig;
|
|
27272
|
+
const candidateValue = resolvePath(candidateData, path50);
|
|
27273
|
+
const expectedValue = resolvePath(expectedData, path50);
|
|
26812
27274
|
if (expectedValue === void 0) {
|
|
26813
27275
|
return {
|
|
26814
|
-
path:
|
|
27276
|
+
path: path50,
|
|
26815
27277
|
score: 1,
|
|
26816
27278
|
// No expected value means no comparison needed
|
|
26817
27279
|
weight,
|
|
26818
27280
|
hit: true,
|
|
26819
|
-
message: `${
|
|
27281
|
+
message: `${path50}: no expected value`
|
|
26820
27282
|
};
|
|
26821
27283
|
}
|
|
26822
27284
|
if (candidateValue === void 0) {
|
|
26823
27285
|
if (required) {
|
|
26824
27286
|
return {
|
|
26825
|
-
path:
|
|
27287
|
+
path: path50,
|
|
26826
27288
|
score: 0,
|
|
26827
27289
|
weight,
|
|
26828
27290
|
hit: false,
|
|
26829
|
-
message: `${
|
|
27291
|
+
message: `${path50} (required, missing)`
|
|
26830
27292
|
};
|
|
26831
27293
|
}
|
|
26832
27294
|
return {
|
|
26833
|
-
path:
|
|
27295
|
+
path: path50,
|
|
26834
27296
|
score: 1,
|
|
26835
27297
|
// Don't penalize missing optional fields
|
|
26836
27298
|
weight: 0,
|
|
26837
27299
|
// Zero weight means it won't affect the score
|
|
26838
27300
|
hit: true,
|
|
26839
|
-
message: `${
|
|
27301
|
+
message: `${path50}: optional field missing`
|
|
26840
27302
|
};
|
|
26841
27303
|
}
|
|
26842
27304
|
switch (match) {
|
|
26843
27305
|
case "exact":
|
|
26844
|
-
return this.compareExact(
|
|
27306
|
+
return this.compareExact(path50, candidateValue, expectedValue, weight);
|
|
26845
27307
|
case "numeric_tolerance":
|
|
26846
27308
|
return this.compareNumericTolerance(
|
|
26847
|
-
|
|
27309
|
+
path50,
|
|
26848
27310
|
candidateValue,
|
|
26849
27311
|
expectedValue,
|
|
26850
27312
|
fieldConfig,
|
|
26851
27313
|
weight
|
|
26852
27314
|
);
|
|
26853
27315
|
case "date":
|
|
26854
|
-
return this.compareDate(
|
|
27316
|
+
return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
|
|
26855
27317
|
default:
|
|
26856
27318
|
return {
|
|
26857
|
-
path:
|
|
27319
|
+
path: path50,
|
|
26858
27320
|
score: 0,
|
|
26859
27321
|
weight,
|
|
26860
27322
|
hit: false,
|
|
26861
|
-
message: `${
|
|
27323
|
+
message: `${path50}: unknown match type "${match}"`
|
|
26862
27324
|
};
|
|
26863
27325
|
}
|
|
26864
27326
|
}
|
|
26865
27327
|
/**
|
|
26866
27328
|
* Exact equality comparison.
|
|
26867
27329
|
*/
|
|
26868
|
-
compareExact(
|
|
27330
|
+
compareExact(path50, candidateValue, expectedValue, weight) {
|
|
26869
27331
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
26870
27332
|
return {
|
|
26871
|
-
path:
|
|
27333
|
+
path: path50,
|
|
26872
27334
|
score: 1,
|
|
26873
27335
|
weight,
|
|
26874
27336
|
hit: true,
|
|
26875
|
-
message:
|
|
27337
|
+
message: path50
|
|
26876
27338
|
};
|
|
26877
27339
|
}
|
|
26878
27340
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
26879
27341
|
return {
|
|
26880
|
-
path:
|
|
27342
|
+
path: path50,
|
|
26881
27343
|
score: 0,
|
|
26882
27344
|
weight,
|
|
26883
27345
|
hit: false,
|
|
26884
|
-
message: `${
|
|
27346
|
+
message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
26885
27347
|
};
|
|
26886
27348
|
}
|
|
26887
27349
|
return {
|
|
26888
|
-
path:
|
|
27350
|
+
path: path50,
|
|
26889
27351
|
score: 0,
|
|
26890
27352
|
weight,
|
|
26891
27353
|
hit: false,
|
|
26892
|
-
message: `${
|
|
27354
|
+
message: `${path50} (value mismatch)`
|
|
26893
27355
|
};
|
|
26894
27356
|
}
|
|
26895
27357
|
/**
|
|
26896
27358
|
* Numeric comparison with absolute or relative tolerance.
|
|
26897
27359
|
*/
|
|
26898
|
-
compareNumericTolerance(
|
|
27360
|
+
compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
26899
27361
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
26900
27362
|
const candidateNum = toNumber(candidateValue);
|
|
26901
27363
|
const expectedNum = toNumber(expectedValue);
|
|
26902
27364
|
if (candidateNum === null || expectedNum === null) {
|
|
26903
27365
|
return {
|
|
26904
|
-
path:
|
|
27366
|
+
path: path50,
|
|
26905
27367
|
score: 0,
|
|
26906
27368
|
weight,
|
|
26907
27369
|
hit: false,
|
|
26908
|
-
message: `${
|
|
27370
|
+
message: `${path50} (non-numeric value)`
|
|
26909
27371
|
};
|
|
26910
27372
|
}
|
|
26911
27373
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
26912
27374
|
return {
|
|
26913
|
-
path:
|
|
27375
|
+
path: path50,
|
|
26914
27376
|
score: 0,
|
|
26915
27377
|
weight,
|
|
26916
27378
|
hit: false,
|
|
26917
|
-
message: `${
|
|
27379
|
+
message: `${path50} (invalid numeric value)`
|
|
26918
27380
|
};
|
|
26919
27381
|
}
|
|
26920
27382
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -26927,61 +27389,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
26927
27389
|
}
|
|
26928
27390
|
if (withinTolerance) {
|
|
26929
27391
|
return {
|
|
26930
|
-
path:
|
|
27392
|
+
path: path50,
|
|
26931
27393
|
score: 1,
|
|
26932
27394
|
weight,
|
|
26933
27395
|
hit: true,
|
|
26934
|
-
message: `${
|
|
27396
|
+
message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
|
|
26935
27397
|
};
|
|
26936
27398
|
}
|
|
26937
27399
|
return {
|
|
26938
|
-
path:
|
|
27400
|
+
path: path50,
|
|
26939
27401
|
score: 0,
|
|
26940
27402
|
weight,
|
|
26941
27403
|
hit: false,
|
|
26942
|
-
message: `${
|
|
27404
|
+
message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
26943
27405
|
};
|
|
26944
27406
|
}
|
|
26945
27407
|
/**
|
|
26946
27408
|
* Date comparison with format normalization.
|
|
26947
27409
|
*/
|
|
26948
|
-
compareDate(
|
|
27410
|
+
compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
26949
27411
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
26950
27412
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
26951
27413
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
26952
27414
|
if (candidateDate === null) {
|
|
26953
27415
|
return {
|
|
26954
|
-
path:
|
|
27416
|
+
path: path50,
|
|
26955
27417
|
score: 0,
|
|
26956
27418
|
weight,
|
|
26957
27419
|
hit: false,
|
|
26958
|
-
message: `${
|
|
27420
|
+
message: `${path50} (unparseable candidate date)`
|
|
26959
27421
|
};
|
|
26960
27422
|
}
|
|
26961
27423
|
if (expectedDate === null) {
|
|
26962
27424
|
return {
|
|
26963
|
-
path:
|
|
27425
|
+
path: path50,
|
|
26964
27426
|
score: 0,
|
|
26965
27427
|
weight,
|
|
26966
27428
|
hit: false,
|
|
26967
|
-
message: `${
|
|
27429
|
+
message: `${path50} (unparseable expected date)`
|
|
26968
27430
|
};
|
|
26969
27431
|
}
|
|
26970
27432
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
26971
27433
|
return {
|
|
26972
|
-
path:
|
|
27434
|
+
path: path50,
|
|
26973
27435
|
score: 1,
|
|
26974
27436
|
weight,
|
|
26975
27437
|
hit: true,
|
|
26976
|
-
message:
|
|
27438
|
+
message: path50
|
|
26977
27439
|
};
|
|
26978
27440
|
}
|
|
26979
27441
|
return {
|
|
26980
|
-
path:
|
|
27442
|
+
path: path50,
|
|
26981
27443
|
score: 0,
|
|
26982
27444
|
weight,
|
|
26983
27445
|
hit: false,
|
|
26984
|
-
message: `${
|
|
27446
|
+
message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
26985
27447
|
};
|
|
26986
27448
|
}
|
|
26987
27449
|
/**
|
|
@@ -27014,11 +27476,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
27014
27476
|
};
|
|
27015
27477
|
}
|
|
27016
27478
|
};
|
|
27017
|
-
function resolvePath(obj,
|
|
27018
|
-
if (!
|
|
27479
|
+
function resolvePath(obj, path50) {
|
|
27480
|
+
if (!path50 || !obj) {
|
|
27019
27481
|
return void 0;
|
|
27020
27482
|
}
|
|
27021
|
-
const parts =
|
|
27483
|
+
const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
27022
27484
|
let current = obj;
|
|
27023
27485
|
for (const part of parts) {
|
|
27024
27486
|
if (current === null || current === void 0) {
|
|
@@ -27500,8 +27962,8 @@ var TokenUsageEvaluator = class {
|
|
|
27500
27962
|
};
|
|
27501
27963
|
}
|
|
27502
27964
|
};
|
|
27503
|
-
function getNestedValue(obj,
|
|
27504
|
-
const parts =
|
|
27965
|
+
function getNestedValue(obj, path50) {
|
|
27966
|
+
const parts = path50.split(".");
|
|
27505
27967
|
let current = obj;
|
|
27506
27968
|
for (const part of parts) {
|
|
27507
27969
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -29224,7 +29686,7 @@ var WorkspacePoolManager = class {
|
|
|
29224
29686
|
}
|
|
29225
29687
|
/**
|
|
29226
29688
|
* Reset an existing slot for reuse:
|
|
29227
|
-
* 1. Reset repos (git reset --hard
|
|
29689
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
29228
29690
|
* 2. Re-copy template files (skip repo directories)
|
|
29229
29691
|
*/
|
|
29230
29692
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -29237,7 +29699,17 @@ var WorkspacePoolManager = class {
|
|
|
29237
29699
|
continue;
|
|
29238
29700
|
}
|
|
29239
29701
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
29240
|
-
|
|
29702
|
+
const resolve2 = repo.checkout?.resolve ?? "remote";
|
|
29703
|
+
if (resolve2 === "remote") {
|
|
29704
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
29705
|
+
if (repo.clone?.depth) {
|
|
29706
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
29707
|
+
}
|
|
29708
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
29709
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
29710
|
+
} else {
|
|
29711
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
29712
|
+
}
|
|
29241
29713
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
29242
29714
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
29243
29715
|
}
|
|
@@ -29520,7 +29992,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
29520
29992
|
}
|
|
29521
29993
|
return result.stdout;
|
|
29522
29994
|
}
|
|
29523
|
-
function classifyQualityStatus(score, threshold =
|
|
29995
|
+
/**
 * Classify a score as passing or failing against a quality threshold.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=DEFAULT_THRESHOLD] - Inclusive minimum score that counts as passing.
 * @returns {"ok" | "quality_failure"} `"ok"` when `score >= threshold`, otherwise `"quality_failure"`.
 */
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
  if (score >= threshold) {
    return "ok";
  }
  return "quality_failure";
}
|
|
29526
29998
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -29612,7 +30084,7 @@ async function runEvaluation(options) {
|
|
|
29612
30084
|
const filteredEvalCases = filterEvalCases(evalCases, filter2);
|
|
29613
30085
|
if (filteredEvalCases.length === 0) {
|
|
29614
30086
|
if (filter2) {
|
|
29615
|
-
throw new Error(`No tests matched filter '${filter2}' in ${evalFilePath}`);
|
|
30087
|
+
throw new Error(`No tests matched filter '${formatFilter(filter2)}' in ${evalFilePath}`);
|
|
29616
30088
|
}
|
|
29617
30089
|
return [];
|
|
29618
30090
|
}
|
|
@@ -29664,6 +30136,9 @@ async function runEvaluation(options) {
|
|
|
29664
30136
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
29665
30137
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
29666
30138
|
if (!resolvedGrader) {
|
|
30139
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
30140
|
+
return void 0;
|
|
30141
|
+
}
|
|
29667
30142
|
return getOrCreateProvider(targetContext);
|
|
29668
30143
|
}
|
|
29669
30144
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -29994,7 +30469,7 @@ async function runEvaluation(options) {
|
|
|
29994
30469
|
const budgetResult = {
|
|
29995
30470
|
timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
29996
30471
|
testId: evalCase.id,
|
|
29997
|
-
|
|
30472
|
+
suite: evalCase.suite,
|
|
29998
30473
|
category: evalCase.category,
|
|
29999
30474
|
score: 0,
|
|
30000
30475
|
assertions: [],
|
|
@@ -30031,7 +30506,7 @@ async function runEvaluation(options) {
|
|
|
30031
30506
|
const haltResult = {
|
|
30032
30507
|
timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
30033
30508
|
testId: evalCase.id,
|
|
30034
|
-
|
|
30509
|
+
suite: evalCase.suite,
|
|
30035
30510
|
category: evalCase.category,
|
|
30036
30511
|
score: 0,
|
|
30037
30512
|
assertions: [],
|
|
@@ -30343,7 +30818,7 @@ async function runBatchEvaluation(options) {
|
|
|
30343
30818
|
targetResolver,
|
|
30344
30819
|
availableTargets,
|
|
30345
30820
|
verbose,
|
|
30346
|
-
threshold: batchThreshold
|
|
30821
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
30347
30822
|
});
|
|
30348
30823
|
if (providerError) {
|
|
30349
30824
|
result = {
|
|
@@ -30805,8 +31280,9 @@ async function runEvalCase(options) {
|
|
|
30805
31280
|
fileChanges,
|
|
30806
31281
|
workspacePath,
|
|
30807
31282
|
verbose,
|
|
30808
|
-
threshold: caseThreshold
|
|
31283
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
30809
31284
|
});
|
|
31285
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
30810
31286
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
30811
31287
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
30812
31288
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -30820,7 +31296,7 @@ async function runEvalCase(options) {
|
|
|
30820
31296
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
30821
31297
|
};
|
|
30822
31298
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
30823
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
31299
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
30824
31300
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
30825
31301
|
const finalResult = providerError ? {
|
|
30826
31302
|
...result,
|
|
@@ -31021,7 +31497,8 @@ async function evaluateCandidate(options) {
|
|
|
31021
31497
|
targetResolver,
|
|
31022
31498
|
availableTargets,
|
|
31023
31499
|
fileChanges,
|
|
31024
|
-
workspacePath
|
|
31500
|
+
workspacePath,
|
|
31501
|
+
threshold: evalThreshold
|
|
31025
31502
|
});
|
|
31026
31503
|
const completedAt = nowFn();
|
|
31027
31504
|
let agentRequest;
|
|
@@ -31052,7 +31529,7 @@ async function evaluateCandidate(options) {
|
|
|
31052
31529
|
return {
|
|
31053
31530
|
timestamp: completedAt.toISOString(),
|
|
31054
31531
|
testId: evalCase.id,
|
|
31055
|
-
|
|
31532
|
+
suite: evalCase.suite,
|
|
31056
31533
|
category: evalCase.category,
|
|
31057
31534
|
conversationId: evalCase.conversation_id,
|
|
31058
31535
|
score: score.score,
|
|
@@ -31095,7 +31572,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
31095
31572
|
targetResolver,
|
|
31096
31573
|
availableTargets,
|
|
31097
31574
|
fileChanges,
|
|
31098
|
-
workspacePath
|
|
31575
|
+
workspacePath,
|
|
31576
|
+
threshold
|
|
31099
31577
|
} = options;
|
|
31100
31578
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
31101
31579
|
return runEvaluatorList({
|
|
@@ -31121,7 +31599,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
31121
31599
|
targetResolver,
|
|
31122
31600
|
availableTargets,
|
|
31123
31601
|
fileChanges,
|
|
31124
|
-
workspacePath
|
|
31602
|
+
workspacePath,
|
|
31603
|
+
threshold
|
|
31125
31604
|
});
|
|
31126
31605
|
}
|
|
31127
31606
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -31223,7 +31702,8 @@ async function runEvaluatorList(options) {
|
|
|
31223
31702
|
name: evaluatorConfig.name,
|
|
31224
31703
|
type: evaluatorConfig.type,
|
|
31225
31704
|
weight,
|
|
31226
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
31705
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
31706
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
31227
31707
|
});
|
|
31228
31708
|
scores.push({
|
|
31229
31709
|
name: evaluatorConfig.name,
|
|
@@ -31258,7 +31738,8 @@ async function runEvaluatorList(options) {
|
|
|
31258
31738
|
name: evaluatorConfig.name ?? "unknown",
|
|
31259
31739
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
31260
31740
|
weight,
|
|
31261
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
31741
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
31742
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
31262
31743
|
});
|
|
31263
31744
|
scores.push({
|
|
31264
31745
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -31292,9 +31773,10 @@ async function runEvaluatorList(options) {
|
|
|
31292
31773
|
}
|
|
31293
31774
|
}
|
|
31294
31775
|
}
|
|
31776
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
31295
31777
|
const hasRequiredFailure = scored.some((entry) => {
|
|
31296
31778
|
if (!entry.required) return false;
|
|
31297
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
31779
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
31298
31780
|
return entry.score.score < minScore;
|
|
31299
31781
|
});
|
|
31300
31782
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -31305,17 +31787,23 @@ async function runEvaluatorList(options) {
|
|
|
31305
31787
|
const expectedAspectCount = assertions.length || 1;
|
|
31306
31788
|
const score = {
|
|
31307
31789
|
score: aggregateScore,
|
|
31308
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
31790
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
31309
31791
|
assertions,
|
|
31310
31792
|
expectedAspectCount
|
|
31311
31793
|
};
|
|
31312
31794
|
return { score, scores };
|
|
31313
31795
|
}
|
|
31796
|
+
/**
 * Render a test filter for display in messages.
 *
 * @param {string | string[]} filter2 - A single pattern or a list of patterns.
 * @returns {string} The pattern itself, or the patterns joined with ", ".
 */
function formatFilter(filter2) {
  if (typeof filter2 === "string") {
    return filter2;
  }
  return filter2.join(", ");
}
|
|
31799
|
+
/**
 * Check whether an id matches a filter.
 *
 * @param {string} id - Identifier to test.
 * @param {string | string[]} filter2 - One pattern, or several patterns of which any may match.
 * @returns {boolean} True when `micromatch3.isMatch` accepts the id for at least one pattern.
 */
function matchesFilter3(id, filter2) {
  // Treat a single pattern as a one-element list so both shapes share one code path.
  const patterns = typeof filter2 === "string" ? [filter2] : filter2;
  return patterns.some((pattern) => micromatch3.isMatch(id, pattern));
}
|
|
31314
31802
|
/**
 * Select the eval cases whose id matches the filter.
 *
 * @param {Array<{ id: string }>} evalCases - Cases to filter.
 * @param {string | string[] | undefined} filter2 - Filter pattern(s); a falsy value disables filtering.
 * @returns {Array<{ id: string }>} The input array itself when no filter is given, otherwise a new
 *   array holding only the cases accepted by `matchesFilter3`.
 */
function filterEvalCases(evalCases, filter2) {
  return filter2 ? evalCases.filter(({ id }) => matchesFilter3(id, filter2)) : evalCases;
}
|
|
31320
31808
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
31321
31809
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -31402,7 +31890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
31402
31890
|
return {
|
|
31403
31891
|
timestamp: timestamp.toISOString(),
|
|
31404
31892
|
testId: evalCase.id,
|
|
31405
|
-
|
|
31893
|
+
suite: evalCase.suite,
|
|
31406
31894
|
category: evalCase.category,
|
|
31407
31895
|
conversationId: evalCase.conversation_id,
|
|
31408
31896
|
score: 0,
|
|
@@ -31666,6 +32154,7 @@ async function evaluate(config) {
|
|
|
31666
32154
|
verbose: config.verbose,
|
|
31667
32155
|
maxConcurrency: config.workers ?? 3,
|
|
31668
32156
|
filter: config.filter,
|
|
32157
|
+
threshold: config.threshold,
|
|
31669
32158
|
evalCases,
|
|
31670
32159
|
onResult: async (result) => {
|
|
31671
32160
|
collectedResults.push(result);
|
|
@@ -31676,19 +32165,19 @@ async function evaluate(config) {
|
|
|
31676
32165
|
const durationMs = Date.now() - startTime;
|
|
31677
32166
|
return {
|
|
31678
32167
|
results: allResults,
|
|
31679
|
-
summary: computeSummary(allResults, durationMs)
|
|
32168
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
31680
32169
|
};
|
|
31681
32170
|
}
|
|
31682
32171
|
/**
 * Convert an assertion type name from snake_case to kebab-case.
 *
 * @param {string} type - Assertion type name.
 * @returns {string} The name with every underscore replaced by a hyphen.
 */
function mapAssertionType(type) {
  return type.split("_").join("-");
}
|
|
31685
|
-
function computeSummary(results, durationMs) {
|
|
32174
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
31686
32175
|
const total = results.length;
|
|
31687
32176
|
let passed = 0;
|
|
31688
32177
|
let scoreSum = 0;
|
|
31689
32178
|
for (const r of results) {
|
|
31690
32179
|
scoreSum += r.score;
|
|
31691
|
-
if (r.score >=
|
|
32180
|
+
if (r.score >= threshold) {
|
|
31692
32181
|
passed++;
|
|
31693
32182
|
}
|
|
31694
32183
|
}
|
|
@@ -31798,7 +32287,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
31798
32287
|
];
|
|
31799
32288
|
async function loadTsConfig(projectRoot) {
|
|
31800
32289
|
const { existsSync: existsSync7 } = await import("node:fs");
|
|
31801
|
-
const { pathToFileURL } = await import("node:url");
|
|
32290
|
+
const { pathToFileURL: pathToFileURL2 } = await import("node:url");
|
|
31802
32291
|
const { join: join2 } = await import("node:path");
|
|
31803
32292
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
31804
32293
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -31806,7 +32295,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
31806
32295
|
continue;
|
|
31807
32296
|
}
|
|
31808
32297
|
try {
|
|
31809
|
-
const fileUrl =
|
|
32298
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
31810
32299
|
const mod = await import(fileUrl);
|
|
31811
32300
|
const config = mod.default ?? mod;
|
|
31812
32301
|
return AgentVConfigSchema.parse(config);
|
|
@@ -31953,7 +32442,7 @@ function saveProjectRegistry(registry) {
|
|
|
31953
32442
|
const registryPath = getProjectsRegistryPath();
|
|
31954
32443
|
const dir = path47.dirname(registryPath);
|
|
31955
32444
|
if (!existsSync6(dir)) {
|
|
31956
|
-
|
|
32445
|
+
mkdirSync2(dir, { recursive: true });
|
|
31957
32446
|
}
|
|
31958
32447
|
writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
|
|
31959
32448
|
}
|
|
@@ -32213,7 +32702,7 @@ var OtelTraceExporter = class {
|
|
|
32213
32702
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
32214
32703
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
32215
32704
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
32216
|
-
if (result.
|
|
32705
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
32217
32706
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
32218
32707
|
if (captureContent && result.output.length > 0) {
|
|
32219
32708
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -32422,7 +32911,7 @@ var OtelStreamingObserver = class {
|
|
|
32422
32911
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
32423
32912
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
32424
32913
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
32425
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
32914
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
32426
32915
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
32427
32916
|
}
|
|
32428
32917
|
/** Create and immediately export a tool span */
|
|
@@ -32768,7 +33257,230 @@ function extractToolResultContent(content) {
|
|
|
32768
33257
|
}
|
|
32769
33258
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
32770
33259
|
}
|
|
32771
|
-
|
|
33260
|
+
/**
 * Parse the JSONL text of a Codex session into a provider-neutral transcript.
 *
 * Each non-blank line is parsed as JSON; lines that are not valid JSON, are not
 * objects, or have no `type` are skipped. Recognised entry types:
 * - `session_meta`: session id, cwd, CLI version and (first seen) model.
 * - `turn_context`: fills in model / cwd if not already known.
 * - `response_item`: user/assistant messages, tool calls and tool outputs.
 *   `developer` messages and `reasoning` items are ignored.
 *
 * Tool outputs are attached to the tool call with the same `call_id`.
 *
 * @param {string} jsonl - Newline-delimited JSON text.
 * @returns {{
 *   messages: Array<object>,
 *   source: { provider: "codex", sessionId: string, cwd?: string, startedAt?: string, model?: string, version?: string },
 *   tokenUsage: undefined,
 *   durationMs: number | undefined,
 *   costUsd: null
 * }} Parsed transcript. `durationMs` is the span between the first and last entry
 *   timestamps, or `undefined` when it cannot be computed.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  // call_id -> location of the tool call still waiting for its output.
  const pendingCalls = new Map();
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;

  // Tool arguments usually arrive as a JSON string; keep the raw value when it is not JSON.
  const decodeArguments = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };

  // Append an assistant message holding one tool call and remember it for output pairing.
  const recordToolCall = (payload, rawArguments) => {
    const callId = String(payload.call_id ?? "");
    const msgIdx = messages.length;
    messages.push({
      role: "assistant",
      toolCalls: [{ tool: String(payload.name ?? ""), input: decodeArguments(rawArguments), id: callId }]
    });
    if (callId) {
      pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
    }
  };

  // Copy-on-write: attach an output to the matching pending tool call, if any.
  const attachToolOutput = (payload) => {
    const callId = String(payload.call_id ?? "");
    const pending = pendingCalls.get(callId);
    if (!pending) return;
    const existingMsg = messages[pending.msgIdx];
    const updatedCalls = [...existingMsg.toolCalls ?? []];
    updatedCalls[pending.toolIdx] = { ...updatedCalls[pending.toolIdx], output: payload.output };
    messages[pending.msgIdx] = { ...existingMsg, toolCalls: updatedCalls };
    pendingCalls.delete(callId);
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // A line such as `null` or `42` is valid JSON but not an entry; skip it instead of throwing.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            // Only user and assistant messages are kept (developer messages are dropped).
            if (role !== "user" && role !== "assistant") break;
            const content = extractResponseItemContent(payload.content);
            if (content) {
              messages.push({ role, content });
            }
            break;
          }
          case "function_call": {
            recordToolCall(payload, payload.arguments);
            break;
          }
          case "custom_tool_call": {
            // NOTE(review): Codex custom tool calls appear to carry their payload under
            // `input` rather than `arguments` — fall back to it so the input is not lost.
            recordToolCall(payload, payload.arguments ?? payload.input);
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            attachToolOutput(payload);
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps yield NaN; report "unknown" rather than NaN.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
|
|
33409
|
+
/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content - Either a string, or an array of blocks that may carry a string `text`.
 * @returns {string | undefined} The string itself; or the concatenation of every block's string
 *   `text`; or `undefined` when `content` is neither a string nor an array, or no block has text.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  return texts.length === 0 ? void 0 : texts.join("");
}
|
|
33423
|
+
// Default sessions directory: `.codex/sessions` under the current user's home directory.
var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");

/**
 * Find Codex session files under a `<year>/<month>/<day>/rollout-*.jsonl` directory layout.
 *
 * Directories that cannot be read are skipped. Results are ordered by file modification
 * time, newest first.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root directory to scan; defaults to `DEFAULT_SESSIONS_DIR()`.
 * @param {boolean} [opts.latest] - When truthy, return only the newest session.
 * @param {number} [opts.limit] - Maximum number of sessions to return (default 10) when `latest` is not set.
 * @param {string} [opts.date] - Only include day directories whose `<year>-<month>-<day>` equals this value.
 * @returns {Promise<Array<{ sessionId: string, filePath: string, filename: string, updatedAt: Date }>>}
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;

  // List a directory, treating any failure as "no entries".
  const listDir = async (dir) => {
    try {
      return await readdir8(dir);
    } catch {
      return [];
    }
  };

  // Modification time of a file, or the epoch when it cannot be stat'ed.
  const modifiedAt = async (target) => {
    try {
      const fileStat = await stat9(target);
      return fileStat.mtime;
    } catch {
      return /* @__PURE__ */ new Date(0);
    }
  };

  const found = [];
  for (const year of await listDir(sessionsDir)) {
    const yearPath = path48.join(sessionsDir, year);
    for (const month of await listDir(yearPath)) {
      const monthPath = path48.join(yearPath, month);
      for (const day of await listDir(monthPath)) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;
        const dayPath = path48.join(monthPath, day);
        for (const file of await listDir(dayPath)) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;
          const filePath = path48.join(dayPath, file);
          const stem = file.replace(/\.jsonl$/, "");
          const segments = stem.split("-");
          // With six or more dash-separated segments, the last five form the session id;
          // otherwise the whole stem is used.
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;
          const updatedAt = await modifiedAt(filePath);
          found.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }
  found.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return found.slice(0, limit);
}
|
|
33483
|
+
// Default projects directory: `.claude/projects` under the current user's home directory.
// Wrapped in a function so the home directory is looked up on each call.
var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
|
|
32772
33484
|
/**
 * Encode a project path as a single directory name.
 *
 * @param {string} projectPath - Path using forward slashes.
 * @returns {string} The path with every "/" replaced by "-".
 */
function encodeProjectPath(projectPath) {
  return projectPath.split("/").join("-");
}
|
|
@@ -32777,7 +33489,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
32777
33489
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
32778
33490
|
let projectDirs;
|
|
32779
33491
|
try {
|
|
32780
|
-
projectDirs = await
|
|
33492
|
+
projectDirs = await readdir9(projectsDir);
|
|
32781
33493
|
} catch {
|
|
32782
33494
|
return [];
|
|
32783
33495
|
}
|
|
@@ -32787,10 +33499,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
32787
33499
|
}
|
|
32788
33500
|
const sessions = [];
|
|
32789
33501
|
for (const projectDir of projectDirs) {
|
|
32790
|
-
const dirPath =
|
|
33502
|
+
const dirPath = path49.join(projectsDir, projectDir);
|
|
32791
33503
|
let entries;
|
|
32792
33504
|
try {
|
|
32793
|
-
entries = await
|
|
33505
|
+
entries = await readdir9(dirPath);
|
|
32794
33506
|
} catch {
|
|
32795
33507
|
continue;
|
|
32796
33508
|
}
|
|
@@ -32798,10 +33510,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
32798
33510
|
if (!entry.endsWith(".jsonl")) continue;
|
|
32799
33511
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
32800
33512
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
32801
|
-
const filePath =
|
|
33513
|
+
const filePath = path49.join(dirPath, entry);
|
|
32802
33514
|
let updatedAt;
|
|
32803
33515
|
try {
|
|
32804
|
-
const fileStat = await
|
|
33516
|
+
const fileStat = await stat10(filePath);
|
|
32805
33517
|
updatedAt = fileStat.mtime;
|
|
32806
33518
|
} catch {
|
|
32807
33519
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -32817,9 +33529,82 @@ async function discoverClaudeSessions(opts) {
|
|
|
32817
33529
|
sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
|
|
32818
33530
|
return sessions.slice(0, limit);
|
|
32819
33531
|
}
|
|
33532
|
+
/**
 * Convert a parsed transcript entry into the snake_case record written as one JSONL line.
 *
 * @param {object} entry - Parsed transcript with `messages`, `source`, and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Record with:
 *   - `input`: content of the first user message when it is a string, otherwise "".
 *   - `output`: the full message list.
 *   - `token_usage`: `{ input, output, cached }`, or `undefined` when the entry has no token usage.
 *   - `duration_ms`, `cost_usd`: copied from the entry.
 *   - `source`: provider metadata; `cwd` falls back to `source.projectPath`.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, durationMs, costUsd, source } = entry;
  const firstUserContent = messages.find((message) => message.role === "user")?.content;

  let tokenUsageRecord;
  if (tokenUsage) {
    tokenUsageRecord = {
      input: tokenUsage.input,
      output: tokenUsage.output,
      cached: tokenUsage.cached
    };
  }

  return {
    input: typeof firstUserContent === "string" ? firstUserContent : "",
    output: messages,
    token_usage: tokenUsageRecord,
    duration_ms: durationMs,
    cost_usd: costUsd,
    source: {
      provider: source.provider,
      session_id: source.sessionId,
      model: source.model,
      timestamp: source.startedAt,
      git_branch: source.gitBranch,
      cwd: source.cwd ?? source.projectPath,
      version: source.version
    }
  };
}
|
|
33556
|
+
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath - Path of the JSONL file, read as UTF-8.
 * @returns {Promise<Array<unknown>>} One parsed value per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the file and the
 *   1-based line number; the original parse error is available as `cause`.
 */
async function readTranscriptJsonl(filePath) {
  const text2 = await readFile14(filePath, "utf8");
  const records = [];
  const rawLines = text2.split("\n");
  for (let index = 0; index < rawLines.length; index++) {
    const rawLine = rawLines[index];
    if (rawLine.trim().length === 0) continue;
    try {
      records.push(JSON.parse(rawLine));
    } catch (error) {
      // Keep the SyntaxError type callers would have seen, but say where the bad line is.
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`Invalid JSON on line ${index + 1} of ${filePath}: ${reason}`, { cause: error });
    }
  }
  return records;
}
|
|
32820
33560
|
/**
 * Read a transcript file as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents.
 */
async function readTranscriptFile(filePath) {
  const contents = await readFile14(filePath, "utf8");
  return contents;
}
|
|
33563
|
+
/**
 * Provider that replays pre-recorded transcript lines instead of calling a live target.
 *
 * Each `invoke()` call returns the next line in order, regardless of the request
 * contents, so the n-th invocation receives the n-th transcript line.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next transcript line to hand out.
  cursor = 0;
  /**
   * @param {string} targetName - Name reported for this provider; also used in `id`.
   * @param {Array<object>} lines - Parsed transcript records, one per expected invocation.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is the first line's `source.provider`, or "transcript" when absent.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>}
   * @throws {Error} When the file contains no transcript lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // Lines without a `source` object fall back to the generic name instead of throwing.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines available to replay. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response.
   *
   * @param {unknown} _request - Ignored; lines are consumed strictly in order.
   * @returns {Promise<{ output: unknown, tokenUsage: object | undefined, durationMs: unknown, costUsd: unknown, startTime: unknown }>}
   * @throws {Error} When every transcript line has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      costUsd: line.cost_usd ?? void 0,
      // `source` is optional metadata; a line without it simply has no start time.
      startTime: line.source?.timestamp
    };
  }
};
|
|
32823
33608
|
/**
 * Create the agent kernel placeholder.
 *
 * @returns {{ status: "stub" }} A fresh object on every call.
 */
function createAgentKernel() {
  const kernel = { status: "stub" };
  return kernel;
}
|
|
@@ -32843,6 +33628,7 @@ export {
|
|
|
32843
33628
|
buildSearchRoots,
|
|
32844
33629
|
resolveFileReference,
|
|
32845
33630
|
CLI_PLACEHOLDERS,
|
|
33631
|
+
findDeprecatedCamelCaseTargetWarnings,
|
|
32846
33632
|
COMMON_TARGET_SETTINGS,
|
|
32847
33633
|
resolveDelegatedTargetDefinition,
|
|
32848
33634
|
resolveTargetDefinition,
|
|
@@ -32887,17 +33673,18 @@ export {
|
|
|
32887
33673
|
subscribeToCodexLogEntries,
|
|
32888
33674
|
consumeCopilotCliLogEntries,
|
|
32889
33675
|
subscribeToCopilotCliLogEntries,
|
|
33676
|
+
parseCopilotEvents,
|
|
32890
33677
|
discoverCopilotSessions,
|
|
32891
33678
|
consumeCopilotSdkLogEntries,
|
|
32892
33679
|
subscribeToCopilotSdkLogEntries,
|
|
32893
33680
|
consumePiLogEntries,
|
|
32894
33681
|
subscribeToPiLogEntries,
|
|
32895
|
-
ProviderRegistry,
|
|
32896
33682
|
getAgentvHome,
|
|
32897
33683
|
getWorkspacesRoot,
|
|
32898
33684
|
getSubagentsRoot,
|
|
32899
33685
|
getTraceStateRoot,
|
|
32900
33686
|
getWorkspacePoolRoot,
|
|
33687
|
+
ProviderRegistry,
|
|
32901
33688
|
ensureVSCodeSubagents,
|
|
32902
33689
|
readTargetDefinitions,
|
|
32903
33690
|
listTargetNames,
|
|
@@ -32905,6 +33692,7 @@ export {
|
|
|
32905
33692
|
createBuiltinProviderRegistry,
|
|
32906
33693
|
createProvider,
|
|
32907
33694
|
resolveAndCreateProvider,
|
|
33695
|
+
DEFAULT_THRESHOLD,
|
|
32908
33696
|
PASS_THRESHOLD,
|
|
32909
33697
|
scoreToVerdict,
|
|
32910
33698
|
clampScore,
|
|
@@ -32992,8 +33780,13 @@ export {
|
|
|
32992
33780
|
OtelTraceExporter,
|
|
32993
33781
|
OtelStreamingObserver,
|
|
32994
33782
|
parseClaudeSession,
|
|
33783
|
+
parseCodexSession,
|
|
33784
|
+
discoverCodexSessions,
|
|
32995
33785
|
discoverClaudeSessions,
|
|
33786
|
+
toTranscriptJsonLine,
|
|
33787
|
+
readTranscriptJsonl,
|
|
32996
33788
|
readTranscriptFile,
|
|
33789
|
+
TranscriptProvider,
|
|
32997
33790
|
createAgentKernel
|
|
32998
33791
|
};
|
|
32999
|
-
//# sourceMappingURL=chunk-
|
|
33792
|
+
//# sourceMappingURL=chunk-I6UE4LHZ.js.map
|