@wix/evalforge-types 0.45.0 → 0.47.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +152 -87
- package/build/index.js.map +4 -4
- package/build/index.mjs +143 -86
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +3 -0
- package/build/types/common/models.d.ts +1 -1
- package/build/types/evaluation/conversation.d.ts +108 -0
- package/build/types/evaluation/eval-result.d.ts +25 -0
- package/build/types/evaluation/eval-run.d.ts +25 -0
- package/build/types/evaluation/index.d.ts +1 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -929,17 +929,17 @@ var WebhookIdentityType;
|
|
|
929
929
|
})(WebhookIdentityType || (WebhookIdentityType = {}));
|
|
930
930
|
|
|
931
931
|
// src/common/models.ts
|
|
932
|
-
var
|
|
932
|
+
var AVAILABLE_CLAUDE_MODEL_IDS = Object.values(
|
|
933
933
|
ClaudeModel
|
|
934
934
|
).filter(
|
|
935
935
|
(v) => typeof v === "string" && v !== ClaudeModel.UNKNOWN_CLAUDE_MODEL
|
|
936
936
|
);
|
|
937
937
|
var PREFERRED_JUDGE_MODEL = "CLAUDE_4_5_HAIKU_1_0";
|
|
938
|
-
var DEFAULT_JUDGE_MODEL =
|
|
938
|
+
var DEFAULT_JUDGE_MODEL = AVAILABLE_CLAUDE_MODEL_IDS.includes(
|
|
939
939
|
PREFERRED_JUDGE_MODEL
|
|
940
|
-
) ? PREFERRED_JUDGE_MODEL :
|
|
940
|
+
) ? PREFERRED_JUDGE_MODEL : AVAILABLE_CLAUDE_MODEL_IDS[0];
|
|
941
941
|
var ClaudeModelSchema = z4.enum(
|
|
942
|
-
|
|
942
|
+
AVAILABLE_CLAUDE_MODEL_IDS
|
|
943
943
|
);
|
|
944
944
|
var AVAILABLE_OPENAI_MODEL_IDS = Object.values(
|
|
945
945
|
Model
|
|
@@ -950,7 +950,7 @@ var OpenAIModelSchema = z4.enum(
|
|
|
950
950
|
AVAILABLE_OPENAI_MODEL_IDS
|
|
951
951
|
);
|
|
952
952
|
var ALL_AVAILABLE_MODEL_IDS = [
|
|
953
|
-
...
|
|
953
|
+
...AVAILABLE_CLAUDE_MODEL_IDS,
|
|
954
954
|
...AVAILABLE_OPENAI_MODEL_IDS
|
|
955
955
|
];
|
|
956
956
|
var AnyModelSchema = z4.enum(
|
|
@@ -1697,7 +1697,7 @@ var LLMTraceSchema = z26.object({
|
|
|
1697
1697
|
});
|
|
1698
1698
|
|
|
1699
1699
|
// src/evaluation/eval-result.ts
|
|
1700
|
-
import { z as
|
|
1700
|
+
import { z as z30 } from "zod";
|
|
1701
1701
|
|
|
1702
1702
|
// src/evaluation/eval-run.ts
|
|
1703
1703
|
import { z as z28 } from "zod";
|
|
@@ -1944,6 +1944,53 @@ var EvaluationLogSchema = z28.object({
|
|
|
1944
1944
|
});
|
|
1945
1945
|
var LLM_TIMEOUT = 12e4;
|
|
1946
1946
|
|
|
1947
|
+
// src/evaluation/conversation.ts
|
|
1948
|
+
import { z as z29 } from "zod";
|
|
1949
|
+
var TextBlockSchema = z29.object({
|
|
1950
|
+
type: z29.literal("text"),
|
|
1951
|
+
text: z29.string()
|
|
1952
|
+
});
|
|
1953
|
+
var ThinkingBlockSchema = z29.object({
|
|
1954
|
+
type: z29.literal("thinking"),
|
|
1955
|
+
thinking: z29.string()
|
|
1956
|
+
});
|
|
1957
|
+
var ToolUseBlockSchema = z29.object({
|
|
1958
|
+
type: z29.literal("tool_use"),
|
|
1959
|
+
toolName: z29.string(),
|
|
1960
|
+
toolId: z29.string(),
|
|
1961
|
+
input: z29.unknown()
|
|
1962
|
+
});
|
|
1963
|
+
var ToolResultBlockSchema = z29.object({
|
|
1964
|
+
type: z29.literal("tool_result"),
|
|
1965
|
+
toolUseId: z29.string(),
|
|
1966
|
+
content: z29.string(),
|
|
1967
|
+
isError: z29.boolean().optional()
|
|
1968
|
+
});
|
|
1969
|
+
var ConversationBlockSchema = z29.discriminatedUnion("type", [
|
|
1970
|
+
TextBlockSchema,
|
|
1971
|
+
ThinkingBlockSchema,
|
|
1972
|
+
ToolUseBlockSchema,
|
|
1973
|
+
ToolResultBlockSchema
|
|
1974
|
+
]);
|
|
1975
|
+
var ConversationMessageRoles = [
|
|
1976
|
+
"assistant",
|
|
1977
|
+
"user",
|
|
1978
|
+
"system"
|
|
1979
|
+
];
|
|
1980
|
+
var ConversationMessageSchema = z29.object({
|
|
1981
|
+
role: z29.enum(ConversationMessageRoles),
|
|
1982
|
+
content: z29.array(ConversationBlockSchema),
|
|
1983
|
+
timestamp: z29.string()
|
|
1984
|
+
});
|
|
1985
|
+
var ScenarioConversationSchema = z29.object({
|
|
1986
|
+
id: z29.string(),
|
|
1987
|
+
projectId: z29.string(),
|
|
1988
|
+
evalRunId: z29.string(),
|
|
1989
|
+
resultId: z29.string(),
|
|
1990
|
+
messages: z29.array(ConversationMessageSchema),
|
|
1991
|
+
createdAt: z29.string()
|
|
1992
|
+
});
|
|
1993
|
+
|
|
1947
1994
|
// src/evaluation/eval-result.ts
|
|
1948
1995
|
var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
1949
1996
|
AssertionResultStatus2["PASSED"] = "passed";
|
|
@@ -1952,97 +1999,99 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1952
1999
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1953
2000
|
return AssertionResultStatus2;
|
|
1954
2001
|
})(AssertionResultStatus || {});
|
|
1955
|
-
var AssertionResultSchema =
|
|
1956
|
-
id:
|
|
1957
|
-
assertionId:
|
|
1958
|
-
assertionType:
|
|
1959
|
-
assertionName:
|
|
1960
|
-
status:
|
|
1961
|
-
message:
|
|
1962
|
-
expected:
|
|
1963
|
-
actual:
|
|
1964
|
-
duration:
|
|
1965
|
-
details:
|
|
1966
|
-
llmTraceSteps:
|
|
1967
|
-
});
|
|
1968
|
-
var EvalRunResultSchema =
|
|
1969
|
-
id:
|
|
1970
|
-
targetId:
|
|
1971
|
-
targetName:
|
|
2002
|
+
var AssertionResultSchema = z30.object({
|
|
2003
|
+
id: z30.string(),
|
|
2004
|
+
assertionId: z30.string(),
|
|
2005
|
+
assertionType: z30.string(),
|
|
2006
|
+
assertionName: z30.string(),
|
|
2007
|
+
status: z30.enum(AssertionResultStatus),
|
|
2008
|
+
message: z30.string().optional(),
|
|
2009
|
+
expected: z30.string().optional(),
|
|
2010
|
+
actual: z30.string().optional(),
|
|
2011
|
+
duration: z30.number().optional(),
|
|
2012
|
+
details: z30.record(z30.string(), z30.unknown()).optional(),
|
|
2013
|
+
llmTraceSteps: z30.array(LLMTraceStepSchema).optional()
|
|
2014
|
+
});
|
|
2015
|
+
var EvalRunResultSchema = z30.object({
|
|
2016
|
+
id: z30.string(),
|
|
2017
|
+
targetId: z30.string(),
|
|
2018
|
+
targetName: z30.string().optional(),
|
|
1972
2019
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1973
|
-
skillVersionId:
|
|
2020
|
+
skillVersionId: z30.string().optional(),
|
|
1974
2021
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1975
|
-
skillVersion:
|
|
1976
|
-
scenarioId:
|
|
1977
|
-
scenarioName:
|
|
2022
|
+
skillVersion: z30.string().optional(),
|
|
2023
|
+
scenarioId: z30.string(),
|
|
2024
|
+
scenarioName: z30.string(),
|
|
1978
2025
|
modelConfig: ModelConfigSchema.optional(),
|
|
1979
|
-
assertionResults:
|
|
2026
|
+
assertionResults: z30.array(AssertionResultSchema),
|
|
1980
2027
|
metrics: EvalMetricsSchema.optional(),
|
|
1981
|
-
passed:
|
|
1982
|
-
failed:
|
|
1983
|
-
passRate:
|
|
1984
|
-
duration:
|
|
1985
|
-
outputText:
|
|
1986
|
-
files:
|
|
1987
|
-
fileDiffs:
|
|
2028
|
+
passed: z30.number(),
|
|
2029
|
+
failed: z30.number(),
|
|
2030
|
+
passRate: z30.number(),
|
|
2031
|
+
duration: z30.number(),
|
|
2032
|
+
outputText: z30.string().optional(),
|
|
2033
|
+
files: z30.array(ExpectedFileSchema).optional(),
|
|
2034
|
+
fileDiffs: z30.array(DiffContentSchema).optional(),
|
|
1988
2035
|
/** Full template files after execution with status indicators */
|
|
1989
|
-
templateFiles:
|
|
1990
|
-
startedAt:
|
|
1991
|
-
completedAt:
|
|
1992
|
-
llmTrace: LLMTraceSchema.optional()
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2036
|
+
templateFiles: z30.array(TemplateFileSchema).optional(),
|
|
2037
|
+
startedAt: z30.string().optional(),
|
|
2038
|
+
completedAt: z30.string().optional(),
|
|
2039
|
+
llmTrace: LLMTraceSchema.optional(),
|
|
2040
|
+
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
2041
|
+
conversation: z30.array(ConversationMessageSchema).optional()
|
|
2042
|
+
});
|
|
2043
|
+
var PromptResultSchema = z30.object({
|
|
2044
|
+
text: z30.string(),
|
|
2045
|
+
files: z30.array(z30.unknown()).optional(),
|
|
2046
|
+
finishReason: z30.string().optional(),
|
|
2047
|
+
reasoning: z30.string().optional(),
|
|
2048
|
+
reasoningDetails: z30.unknown().optional(),
|
|
2049
|
+
toolCalls: z30.array(z30.unknown()).optional(),
|
|
2050
|
+
toolResults: z30.array(z30.unknown()).optional(),
|
|
2051
|
+
warnings: z30.array(z30.unknown()).optional(),
|
|
2052
|
+
sources: z30.array(z30.unknown()).optional(),
|
|
2053
|
+
steps: z30.array(z30.unknown()),
|
|
2054
|
+
generationTimeMs: z30.number(),
|
|
2055
|
+
prompt: z30.string(),
|
|
2056
|
+
systemPrompt: z30.string(),
|
|
2057
|
+
usage: z30.object({
|
|
2058
|
+
totalTokens: z30.number().optional(),
|
|
2059
|
+
totalMicrocentsSpent: z30.number().optional()
|
|
2011
2060
|
})
|
|
2012
2061
|
});
|
|
2013
|
-
var EvaluationResultSchema =
|
|
2014
|
-
id:
|
|
2015
|
-
runId:
|
|
2016
|
-
timestamp:
|
|
2062
|
+
var EvaluationResultSchema = z30.object({
|
|
2063
|
+
id: z30.string(),
|
|
2064
|
+
runId: z30.string(),
|
|
2065
|
+
timestamp: z30.number(),
|
|
2017
2066
|
promptResult: PromptResultSchema,
|
|
2018
|
-
testResults:
|
|
2019
|
-
tags:
|
|
2020
|
-
feedback:
|
|
2021
|
-
score:
|
|
2022
|
-
suiteId:
|
|
2023
|
-
});
|
|
2024
|
-
var LeanEvaluationResultSchema =
|
|
2025
|
-
id:
|
|
2026
|
-
runId:
|
|
2027
|
-
timestamp:
|
|
2028
|
-
tags:
|
|
2029
|
-
scenarioId:
|
|
2030
|
-
scenarioVersion:
|
|
2031
|
-
targetId:
|
|
2032
|
-
targetVersion:
|
|
2033
|
-
suiteId:
|
|
2034
|
-
score:
|
|
2035
|
-
time:
|
|
2036
|
-
microcentsSpent:
|
|
2067
|
+
testResults: z30.array(z30.unknown()),
|
|
2068
|
+
tags: z30.array(z30.string()).optional(),
|
|
2069
|
+
feedback: z30.string().optional(),
|
|
2070
|
+
score: z30.number(),
|
|
2071
|
+
suiteId: z30.string().optional()
|
|
2072
|
+
});
|
|
2073
|
+
var LeanEvaluationResultSchema = z30.object({
|
|
2074
|
+
id: z30.string(),
|
|
2075
|
+
runId: z30.string(),
|
|
2076
|
+
timestamp: z30.number(),
|
|
2077
|
+
tags: z30.array(z30.string()).optional(),
|
|
2078
|
+
scenarioId: z30.string(),
|
|
2079
|
+
scenarioVersion: z30.number().optional(),
|
|
2080
|
+
targetId: z30.string(),
|
|
2081
|
+
targetVersion: z30.number().optional(),
|
|
2082
|
+
suiteId: z30.string().optional(),
|
|
2083
|
+
score: z30.number(),
|
|
2084
|
+
time: z30.number().optional(),
|
|
2085
|
+
microcentsSpent: z30.number().optional()
|
|
2037
2086
|
});
|
|
2038
2087
|
|
|
2039
2088
|
// src/project/project.ts
|
|
2040
|
-
import { z as
|
|
2089
|
+
import { z as z31 } from "zod";
|
|
2041
2090
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
2042
|
-
appId:
|
|
2043
|
-
appSecret:
|
|
2044
|
-
useWixAuth:
|
|
2045
|
-
useBase44Auth:
|
|
2091
|
+
appId: z31.string().optional().describe("The ID of the app in Dev Center"),
|
|
2092
|
+
appSecret: z31.string().optional().describe("The secret of the app in Dev Center"),
|
|
2093
|
+
useWixAuth: z31.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
|
|
2094
|
+
useBase44Auth: z31.boolean().optional().describe("Enable Base44 auth for evaluations")
|
|
2046
2095
|
});
|
|
2047
2096
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
2048
2097
|
id: true,
|
|
@@ -2216,7 +2265,7 @@ function getSystemAssertion(id) {
|
|
|
2216
2265
|
export {
|
|
2217
2266
|
AGENT_TYPE_LABELS,
|
|
2218
2267
|
ALL_AVAILABLE_MODEL_IDS,
|
|
2219
|
-
|
|
2268
|
+
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
2220
2269
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
2221
2270
|
AVAILABLE_RUN_COMMANDS,
|
|
2222
2271
|
AVAILABLE_TOOL_NAMES,
|
|
@@ -2244,6 +2293,9 @@ export {
|
|
|
2244
2293
|
ClaudeModelSchema,
|
|
2245
2294
|
CommandExecutionSchema,
|
|
2246
2295
|
CommandExecutionTestSchema,
|
|
2296
|
+
ConversationBlockSchema,
|
|
2297
|
+
ConversationMessageRoles,
|
|
2298
|
+
ConversationMessageSchema,
|
|
2247
2299
|
CostAssertionSchema,
|
|
2248
2300
|
CostConfigSchema,
|
|
2249
2301
|
CreateAgentInputSchema,
|
|
@@ -2316,6 +2368,7 @@ export {
|
|
|
2316
2368
|
SYSTEM_ASSERTIONS,
|
|
2317
2369
|
SYSTEM_ASSERTION_IDS,
|
|
2318
2370
|
ScenarioAssertionLinkSchema,
|
|
2371
|
+
ScenarioConversationSchema,
|
|
2319
2372
|
SiteConfigTestSchema,
|
|
2320
2373
|
SkillFileSchema,
|
|
2321
2374
|
SkillMetadataSchema,
|
|
@@ -2340,12 +2393,16 @@ export {
|
|
|
2340
2393
|
TestSuiteSchema,
|
|
2341
2394
|
TestType,
|
|
2342
2395
|
TestTypeSchema,
|
|
2396
|
+
TextBlockSchema,
|
|
2397
|
+
ThinkingBlockSchema,
|
|
2343
2398
|
TimeAssertionSchema,
|
|
2344
2399
|
TimeConfigSchema,
|
|
2345
2400
|
TokenUsageSchema,
|
|
2346
2401
|
ToolCalledWithParamAssertionSchema,
|
|
2347
2402
|
ToolCalledWithParamConfigSchema,
|
|
2403
|
+
ToolResultBlockSchema,
|
|
2348
2404
|
ToolTestSchema,
|
|
2405
|
+
ToolUseBlockSchema,
|
|
2349
2406
|
TriggerMetadataSchema,
|
|
2350
2407
|
TriggerSchema,
|
|
2351
2408
|
TriggerType,
|