veryfront 0.1.534 → 0.1.536
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/cli/templates/manifest.d.ts +405 -405
- package/esm/cli/templates/manifest.js +451 -451
- package/esm/deno.d.ts +1 -0
- package/esm/deno.js +5 -4
- package/esm/extensions/ext-llm-anthropic/src/index.d.ts.map +1 -1
- package/esm/extensions/ext-llm-anthropic/src/index.js +1 -0
- package/esm/extensions/ext-llm-google/src/index.d.ts.map +1 -1
- package/esm/extensions/ext-llm-google/src/index.js +1 -0
- package/esm/extensions/ext-llm-openai/src/index.d.ts.map +1 -1
- package/esm/extensions/ext-llm-openai/src/index.js +1 -0
- package/esm/src/agent/testing/durable-run-canaries/cli-runner.d.ts +18 -0
- package/esm/src/agent/testing/durable-run-canaries/cli-runner.d.ts.map +1 -0
- package/esm/src/agent/testing/durable-run-canaries/cli-runner.js +65 -0
- package/esm/src/agent/testing/durable-run-canaries/index.d.ts +1 -0
- package/esm/src/agent/testing/durable-run-canaries/index.d.ts.map +1 -1
- package/esm/src/agent/testing/durable-run-canaries/index.js +1 -0
- package/esm/src/agent/testing/index.d.ts +2 -2
- package/esm/src/agent/testing/index.d.ts.map +1 -1
- package/esm/src/agent/testing/index.js +2 -2
- package/esm/src/agent/testing/live-evals/cli-runner.d.ts +36 -0
- package/esm/src/agent/testing/live-evals/cli-runner.d.ts.map +1 -0
- package/esm/src/agent/testing/live-evals/cli-runner.js +143 -0
- package/esm/src/agent/testing/live-evals/index.d.ts +1 -0
- package/esm/src/agent/testing/live-evals/index.d.ts.map +1 -1
- package/esm/src/agent/testing/live-evals/index.js +1 -0
- package/esm/src/server/dev-ui/manifest.d.ts +17 -17
- package/esm/src/server/dev-ui/manifest.js +17 -17
- package/esm/src/utils/version-constant.d.ts +1 -1
- package/esm/src/utils/version-constant.js +1 -1
- package/package.json +1 -1
- package/src/cli/templates/manifest.js +451 -451
- package/src/deno.js +5 -4
- package/src/extensions/ext-llm-anthropic/src/index.ts +1 -0
- package/src/extensions/ext-llm-google/src/index.ts +1 -0
- package/src/extensions/ext-llm-openai/src/index.ts +1 -0
- package/src/src/agent/testing/durable-run-canaries/cli-runner.ts +117 -0
- package/src/src/agent/testing/durable-run-canaries/index.ts +5 -0
- package/src/src/agent/testing/index.ts +7 -0
- package/src/src/agent/testing/live-evals/cli-runner.ts +234 -0
- package/src/src/agent/testing/live-evals/index.ts +6 -0
- package/src/src/server/dev-ui/manifest.js +17 -17
- package/src/src/utils/version-constant.ts +1 -1
package/src/deno.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export default {
|
|
2
2
|
"name": "veryfront",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.536",
|
|
4
4
|
"license": "Apache-2.0",
|
|
5
5
|
"nodeModulesDir": "auto",
|
|
6
6
|
"workspace": [
|
|
@@ -331,8 +331,8 @@ export default {
|
|
|
331
331
|
"fmt": "deno fmt src/ cli/ react/",
|
|
332
332
|
"fmt:check": "deno fmt --check src/ cli/ react/",
|
|
333
333
|
"typecheck": "deno task generate && deno check src/index.ts cli/main.ts src/server/index.ts src/routing/api/index.ts src/rendering/index.ts src/platform/index.ts src/platform/adapters/index.ts src/build/index.ts src/build/production-build/index.ts src/transforms/index.ts src/config/index.ts src/utils/index.ts src/data/index.ts src/security/index.ts src/middleware/index.ts src/server/handlers/dev/index.ts src/server/handlers/request/api/index.ts src/rendering/cache/index.ts src/rendering/cache/stores/index.ts src/rendering/rsc/actions/index.ts src/html/index.ts src/modules/index.ts src/proxy/main.ts src/chat/index.ts src/markdown/index.ts src/mdx/index.ts src/fs/index.ts src/oauth/index.ts src/agent/index.ts src/agent/service/route-export.check.ts src/tool/index.ts src/workflow/index.ts src/prompt/index.ts src/resource/index.ts src/jobs/index.ts src/mcp/index.ts src/provider/index.ts",
|
|
334
|
-
"verify": "deno task generate && deno fmt --check src/ cli/ react/ && DENO_NO_PACKAGE_JSON=1 deno lint src/ cli/ react/ && deno task lint:style && deno task lint:cli-boundary && deno task lint:wildcard-exports && deno task lint:barrel-jsdoc && deno task lint:ban-zod && deno task lint:core-deps && deno task lint:dependency-boundaries && deno task docs:validate && deno task typecheck && deno task test && deno task test:e2e:binary",
|
|
335
|
-
"verify:quick": "deno task generate && deno fmt --check src/ cli/ react/ && DENO_NO_PACKAGE_JSON=1 deno lint src/ cli/ react/ && deno task lint:style && deno task lint:cli-boundary && deno task lint:wildcard-exports && deno task lint:barrel-jsdoc && deno task lint:ban-zod && deno task lint:core-deps && deno task lint:dependency-boundaries && deno task docs:validate && deno task typecheck",
|
|
334
|
+
"verify": "deno task generate && deno fmt --check src/ cli/ react/ && DENO_NO_PACKAGE_JSON=1 deno lint src/ cli/ react/ && deno task lint:style && deno task lint:cli-boundary && deno task lint:wildcard-exports && deno task lint:barrel-jsdoc && deno task lint:ban-zod && deno task lint:core-deps && deno task lint:dependency-boundaries && deno task lint:extension-contracts && deno task docs:validate && deno task typecheck && deno task test && deno task test:e2e:binary",
|
|
335
|
+
"verify:quick": "deno task generate && deno fmt --check src/ cli/ react/ && DENO_NO_PACKAGE_JSON=1 deno lint src/ cli/ react/ && deno task lint:style && deno task lint:cli-boundary && deno task lint:wildcard-exports && deno task lint:barrel-jsdoc && deno task lint:ban-zod && deno task lint:core-deps && deno task lint:dependency-boundaries && deno task lint:extension-contracts && deno task docs:validate && deno task typecheck",
|
|
336
336
|
"docs": "rm -rf docs/reference && deno run --allow-read --allow-write --allow-run scripts/docs/generate-api-reference.ts",
|
|
337
337
|
"docs:copy": "rm -rf ../../docs/docs/code/reference && cp -r docs/reference/ ../../docs/docs/code/reference/",
|
|
338
338
|
"docs:validate": "deno run --allow-read scripts/docs/validate-api-reference.ts",
|
|
@@ -341,6 +341,7 @@ export default {
|
|
|
341
341
|
"lint:ban-zod": "deno run --allow-read scripts/lint/ban-zod-imports.ts",
|
|
342
342
|
"lint:core-deps": "deno run --allow-read scripts/lint/audit-core-deps.ts",
|
|
343
343
|
"lint:dependency-boundaries": "deno run --allow-read scripts/lint/audit-dependency-boundaries.ts",
|
|
344
|
+
"lint:extension-contracts": "deno run --allow-read --allow-env --allow-sys scripts/lint/audit-extension-contracts.ts",
|
|
344
345
|
"lint:ban-console": "deno run --allow-read scripts/lint/ban-console.ts",
|
|
345
346
|
"lint:ban-deep-imports": "deno run --allow-read scripts/lint/ban-deep-imports.ts",
|
|
346
347
|
"lint:imports": "deno run --allow-read scripts/lint/no-cross-boundary-relative-imports.ts",
|
|
@@ -354,7 +355,7 @@ export default {
|
|
|
354
355
|
"lint:wildcard-exports": "deno run --allow-read scripts/lint/ban-wildcard-exports.ts",
|
|
355
356
|
"lint:deps": "deno run --allow-read scripts/lint/audit-deps.ts",
|
|
356
357
|
"lint:barrel-jsdoc": "deno run --allow-read scripts/lint/check-barrel-jsdoc.ts",
|
|
357
|
-
"test:scripts": "deno test --config=scripts/test.deno.json --no-check --allow-read --allow-write scripts/build/generate-sbom.test.ts scripts/build/npm-react-shims.test.ts scripts/lint/audit-core-deps.test.ts scripts/lint/audit-dependency-boundaries.test.ts scripts/lint/audit-deps.test.ts scripts/security/audit-npm.test.ts scripts/security/submit-dependency-snapshot.test.ts",
|
|
358
|
+
"test:scripts": "deno test --config=scripts/test.deno.json --no-check --allow-read --allow-write scripts/build/generate-sbom.test.ts scripts/build/npm-react-shims.test.ts scripts/lint/audit-core-deps.test.ts scripts/lint/audit-dependency-boundaries.test.ts scripts/lint/audit-extension-contracts.test.ts scripts/lint/audit-deps.test.ts scripts/security/audit-npm.test.ts scripts/security/submit-dependency-snapshot.test.ts",
|
|
358
359
|
"test:cross-runtime": "deno run --allow-all src/platform/compat/cross-runtime.test.ts",
|
|
359
360
|
"test:node": "node ./tests/node/run-tests.mjs 'src/**/*.test.ts'",
|
|
360
361
|
"test:bun": "node ./tests/bun/run-tests.mjs src/",
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { dirname, resolve } from "node:path";
|
|
3
|
+
import { cwd as getProcessCwd } from "node:process";
|
|
4
|
+
import { type LiveEvalApiContext } from "../live-evals/api-client.js";
|
|
5
|
+
import { resolveDurableRunCanaryEnvironment } from "./environment.js";
|
|
6
|
+
import {
|
|
7
|
+
createDurableRunCanaryRunner,
|
|
8
|
+
type DurableRunCanaryCase,
|
|
9
|
+
type DurableRunCanaryResult,
|
|
10
|
+
type DurableRunCanaryRunnerConfig,
|
|
11
|
+
} from "./runner.js";
|
|
12
|
+
|
|
13
|
+
type EnvRecord = Record<string, string | undefined>;
|
|
14
|
+
|
|
15
|
+
/**
 * Argument object handed to the caller-supplied `createCases` factory of
 * `runDurableRunCanaryCli`.
 */
export interface DurableRunCanaryCliCaseFactoryInput {
  /** API context (API URL, auth token, project id) assembled by the CLI runner. */
  context: LiveEvalApiContext;
  /** Request timeout as resolved from the environment (milliseconds, per the name). */
  requestTimeoutMs: number;
}
|
|
19
|
+
|
|
20
|
+
/** Options accepted by `runDurableRunCanaryCli`. */
export interface RunDurableRunCanaryCliInput {
  /** Environment variables the runner reads its configuration from. */
  env: EnvRecord;
  /** Agent id forwarded to the canary runner configuration. */
  agentId: string;
  /** Factory producing the canary cases to execute, in execution order. */
  createCases: (input: DurableRunCanaryCliCaseFactoryInput) => DurableRunCanaryCase[];
  /** Base directory for the default report path; the process cwd is used when omitted. */
  cwd?: string;
  /** Line-oriented logger; `console.log` is used when omitted. */
  log?: (message: string) => void;
  /** Runner factory override; `createDurableRunCanaryRunner` is used when omitted. */
  createRunner?: (
    config: DurableRunCanaryRunnerConfig,
  ) => ReturnType<typeof createDurableRunCanaryRunner>;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Builds an absolute report path of the form
 * `<cwd>/.omx/logs/<directory>/<timestamp>.json`.
 *
 * The timestamp is the current ISO-8601 instant with every ":" and "." replaced
 * by "-".
 */
function createTimestampedReportPath(input: {
  cwd: string;
  directory: string;
}): string {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const fileName = `${stamp}.json`;
  return resolve(input.cwd, ".omx/logs", input.directory, fileName);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Runs the durable-run canary cases one after another, writes a JSON report and
 * returns a process exit code.
 *
 * Configuration comes from `resolveDurableRunCanaryEnvironment(input.env)`. The
 * report is written to `DURABLE_CANARY_REPORT_PATH` when that variable holds a
 * non-blank value, otherwise to a timestamped file under
 * `<cwd>/.omx/logs/durable-run-staging-canaries/`.
 *
 * @param input - Environment, agent id, case factory and optional overrides.
 * @returns `1` when at least one case finished with status "fail", else `0`.
 * @throws Error when the resolved auth token or project id is missing.
 */
export async function runDurableRunCanaryCli(
  input: RunDurableRunCanaryCliInput,
): Promise<number> {
  const log = input.log ?? console.log;
  const cwd = input.cwd ?? getProcessCwd();
  const { apiUrl, authToken, projectId, requestTimeoutMs, keepSuccessfulEvidence } =
    resolveDurableRunCanaryEnvironment(input.env);

  if (!authToken) {
    throw new Error("Missing VERYFRONT_TOKEN");
  }
  if (!projectId) {
    throw new Error("Missing AG_UI_EVAL_PROJECT_ID");
  }

  // A blank override (e.g. `DURABLE_CANARY_REPORT_PATH=` in a CI config) must be
  // treated as unset: `??` would keep the empty string and the final writeFile("")
  // would fail only after every canary has already been executed.
  const reportPathOverride = input.env.DURABLE_CANARY_REPORT_PATH;
  const reportPath = reportPathOverride !== undefined && reportPathOverride.trim().length > 0
    ? reportPathOverride
    : createTimestampedReportPath({ cwd, directory: "durable-run-staging-canaries" });

  // projectId is known to be non-empty past the guard above, so it is passed as-is.
  const context: LiveEvalApiContext = {
    apiUrl,
    authToken,
    projectId,
  };
  const createRunner = input.createRunner ?? createDurableRunCanaryRunner;
  const { runCase } = createRunner({
    apiUrl,
    authToken,
    agentId: input.agentId,
    projectId,
    requestTimeoutMs,
    keepSuccessfulEvidence,
  });
  const testCases = input.createCases({
    context,
    requestTimeoutMs,
  });

  log(`Durable run canaries -> ${apiUrl}`);
  log(`Project scope -> ${projectId}`);

  // Cases run sequentially; each result is logged as soon as it is available.
  const results: DurableRunCanaryResult[] = [];
  for (const testCase of testCases) {
    log(`\n[run] ${testCase.label}`);
    const result = await runCase(testCase);
    results.push(result);
    log(`[${result.status}] ${result.id}: ${result.details}`);
  }

  const summary = {
    passed: results.filter((result) => result.status === "pass").length,
    failed: results.filter((result) => result.status === "fail").length,
  };

  await mkdir(dirname(reportPath), { recursive: true });
  await writeFile(
    reportPath,
    JSON.stringify(
      {
        generatedAt: new Date().toISOString(),
        apiUrl,
        projectId,
        results,
        summary,
      },
      null,
      2,
    ),
  );

  log("\nSummary");
  log(`passed: ${summary.passed}`);
  log(`failed: ${summary.failed}`);
  log(`report: ${reportPath}`);

  return summary.failed > 0 ? 1 : 0;
}
|
|
@@ -27,6 +27,7 @@ export {
|
|
|
27
27
|
type DurableRunCanaryApiClient,
|
|
28
28
|
type DurableRunCanaryApiConfig,
|
|
29
29
|
type DurableRunCanaryCase,
|
|
30
|
+
type DurableRunCanaryCliCaseFactoryInput,
|
|
30
31
|
type DurableRunCanaryCreateRootRunInput,
|
|
31
32
|
type DurableRunCanaryEnvironment,
|
|
32
33
|
type DurableRunCanaryMessage,
|
|
@@ -41,6 +42,8 @@ export {
|
|
|
41
42
|
getDurableRunCanaryMessageSchema,
|
|
42
43
|
parseDurableRunCanaryRunSummary,
|
|
43
44
|
resolveDurableRunCanaryEnvironment,
|
|
45
|
+
runDurableRunCanaryCli,
|
|
46
|
+
type RunDurableRunCanaryCliInput,
|
|
44
47
|
stringifyUnknown,
|
|
45
48
|
} from "./durable-run-canaries/index.js";
|
|
46
49
|
|
|
@@ -86,6 +89,8 @@ export {
|
|
|
86
89
|
type LiveEvalCaseSelectionInput,
|
|
87
90
|
type LiveEvalCaseSurface,
|
|
88
91
|
type LiveEvalCaseTagRule,
|
|
92
|
+
type LiveEvalCliCaseFactoryInput,
|
|
93
|
+
type LiveEvalCliCaseGroups,
|
|
89
94
|
type LiveEvalContext,
|
|
90
95
|
type LiveEvalConversationInput,
|
|
91
96
|
type LiveEvalCreateConversationInput,
|
|
@@ -112,6 +117,8 @@ export {
|
|
|
112
117
|
printRuntimeConfidencePreflight,
|
|
113
118
|
resolveLiveEvalEnvironment,
|
|
114
119
|
resolveLiveEvalRequestedCaseIds,
|
|
120
|
+
runLiveEvalCli,
|
|
121
|
+
type RunLiveEvalCliInput,
|
|
115
122
|
type RuntimeConfidencePreflightResult,
|
|
116
123
|
type RuntimePerformanceSummary,
|
|
117
124
|
selectLiveEvalCases,
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { dirname, resolve } from "node:path";
|
|
3
|
+
import { cwd as getProcessCwd } from "node:process";
|
|
4
|
+
import { buildRuntimePerformanceSummary, type LiveEvalRuntime } from "./performance.js";
|
|
5
|
+
import {
|
|
6
|
+
buildLiveEvalCaseTagSummary,
|
|
7
|
+
buildLiveEvalRuntimeSummary,
|
|
8
|
+
buildLiveEvalStatusSummary,
|
|
9
|
+
resolveLiveEvalRequestedCaseIds,
|
|
10
|
+
selectLiveEvalCases,
|
|
11
|
+
} from "./report.js";
|
|
12
|
+
import {
|
|
13
|
+
containsSkillLoad,
|
|
14
|
+
countStepStartedEvents,
|
|
15
|
+
createLiveEvalCaseSupport,
|
|
16
|
+
hasFinished,
|
|
17
|
+
type LiveEvalCase,
|
|
18
|
+
type LiveEvalRunnerConfig,
|
|
19
|
+
} from "./runner.js";
|
|
20
|
+
import { getLiveEvalProjectFile, type LiveEvalApiContext } from "./api-client.js";
|
|
21
|
+
import { resolveLiveEvalEnvironment } from "./environment.js";
|
|
22
|
+
import type { LiveEvalResultRecord } from "./result.js";
|
|
23
|
+
|
|
24
|
+
type EnvRecord = Record<string, string | undefined>;
|
|
25
|
+
|
|
26
|
+
/**
 * Case lists returned by the `createCases` factory of `runLiveEvalCli`.
 *
 * All three lists are forwarded to `selectLiveEvalCases` together with the
 * write / experimental flags; the actual selection rules live there.
 */
export interface LiveEvalCliCaseGroups {
  /** Cases in the read-only group. */
  readOnlyCases: LiveEvalCase[];
  /** Cases in the write group. */
  writeCases: LiveEvalCase[];
  /** Cases in the experimental write group. */
  experimentalWriteCases: LiveEvalCase[];
}
|
|
31
|
+
|
|
32
|
+
/**
 * Argument object handed to the `createCases` factory of `runLiveEvalCli`:
 * the resolved connection settings plus the helper functions a case may use.
 */
export interface LiveEvalCliCaseFactoryInput {
  /** Auth token resolved from the environment. */
  authToken: string;
  /** Endpoint resolved from the environment. */
  endpoint: string;
  /** Project id, or `null` when none was resolved. */
  projectId: string | null;
  /** Branch id, or `null` when none was resolved. */
  branchId: string | null;
  /** Model identifier, or `null` when none was resolved. */
  model: string | null;
  /** Request timeout read from `AG_UI_EVAL_TIMEOUT_MS`. */
  requestTimeoutMs: number;
  /** True when `AG_UI_EVAL_LLM_JUDGE` is set to "1". */
  enableLlmJudge: boolean;
  /** Re-exported runner helper `hasFinished`. */
  hasFinished: typeof hasFinished;
  /** Re-exported runner helper `containsSkillLoad`. */
  containsSkillLoad: typeof containsSkillLoad;
  /** Re-exported runner helper `countStepStartedEvents`. */
  countStepStartedEvents: typeof countStepStartedEvents;
  /** `verifyFileExists` from the case-support object created for this run. */
  verifyFileExists: ReturnType<typeof createLiveEvalCaseSupport>["verifyFileExists"];
  /** `withJudge` from the case-support object created for this run. */
  withJudge: ReturnType<typeof createLiveEvalCaseSupport>["withJudge"];
  /** `judgeLlm` from the case-support object created for this run. */
  judgeLlm: ReturnType<typeof createLiveEvalCaseSupport>["judgeLlm"];
}
|
|
47
|
+
|
|
48
|
+
/** Options accepted by `runLiveEvalCli`. */
export interface RunLiveEvalCliInput {
  /** Environment variables the runner reads its configuration from. */
  env: EnvRecord;
  /** Named case sets (set id -> case ids), consulted for `AG_UI_EVAL_CASE_SET`. */
  caseSets: Record<string, readonly string[]>;
  /** Factory producing the grouped eval cases for this run. */
  createCases: (input: LiveEvalCliCaseFactoryInput) => LiveEvalCliCaseGroups;
  /** Runtimes to evaluate; `["framework"]` is used when omitted. */
  runtimes?: readonly LiveEvalRuntime[];
  /** Base directory for the default report path; the process cwd is used when omitted. */
  cwd?: string;
  /** Logger for regular output; `console.log` is used when omitted. */
  log?: (message: string) => void;
  /** Logger for error output; `console.error` is used when omitted. */
  error?: (message: string) => void;
  /** Case-support factory override; `createLiveEvalCaseSupport` is used when omitted. */
  createCaseSupport?: (
    config: LiveEvalRunnerConfig,
  ) => ReturnType<typeof createLiveEvalCaseSupport>;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Parses a comma-separated environment value into a set of entries.
 *
 * Each entry is trimmed; empty entries are dropped. An undefined value yields
 * an empty set. Insertion order follows the order of first occurrence.
 */
function splitCsvEnv(value: string | undefined): Set<string> {
  const entries = new Set<string>();
  for (const piece of (value ?? "").split(",")) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.add(trimmed);
    }
  }
  return entries;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Builds an absolute report path of the form
 * `<cwd>/.omx/logs/<directory>/<timestamp>.json`.
 *
 * The timestamp is the current ISO-8601 instant with every ":" and "." replaced
 * by "-".
 */
function createTimestampedReportPath(input: {
  cwd: string;
  directory: string;
}): string {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const fileName = `${stamp}.json`;
  return resolve(input.cwd, ".omx/logs", input.directory, fileName);
}
|
|
81
|
+
|
|
82
|
+
/**
 * Reads a non-negative numeric setting from an environment value.
 *
 * Unset or blank values yield `fallback`. Values that do not parse to a finite,
 * non-negative number also yield `fallback`, after reporting them through `warn`.
 */
function parseNonNegativeNumberEnv(
  name: string,
  value: string | undefined,
  fallback: number,
  warn: (message: string) => void,
): number {
  const trimmed = value?.trim() ?? "";
  if (trimmed.length === 0) {
    return fallback;
  }
  const parsed = Number(trimmed);
  if (Number.isFinite(parsed) && parsed >= 0) {
    return parsed;
  }
  warn(`Ignoring invalid ${name}="${value}"; using ${fallback}`);
  return fallback;
}

/**
 * Runs the selected live eval cases against every requested runtime, prints a
 * summary, writes a JSON report and returns a process exit code.
 *
 * Environment variables read directly here: `AG_UI_EVAL_WRITE`,
 * `AG_UI_EVAL_EXPERIMENTAL`, `AG_UI_EVAL_TIMEOUT_MS` (default 240000),
 * `AG_UI_EVAL_PROGRESS_MS` (default 15000), `AG_UI_EVAL_REPORT_PATH`,
 * `AG_UI_EVAL_CASES`, `AG_UI_EVAL_TAGS`, `AG_UI_EVAL_CASE_SET` and
 * `AG_UI_EVAL_LLM_JUDGE`. Connection settings come from
 * `resolveLiveEvalEnvironment(input.env)`.
 *
 * @param input - Environment, case sets, case factory and optional overrides.
 * @returns `1` when the auth token is missing, no case is selected, or at least
 *   one result failed; otherwise `0`.
 */
export async function runLiveEvalCli(input: RunLiveEvalCliInput): Promise<number> {
  const log = input.log ?? console.log;
  const error = input.error ?? console.error;
  const cwd = input.cwd ?? getProcessCwd();
  const { endpoint, authToken, apiUrl, projectId, branchId, model } = resolveLiveEvalEnvironment(
    input.env,
  );

  // Fail fast: without a token there is no point in building the case support
  // or invoking the caller's case factory with an empty credential.
  if (authToken.length === 0) {
    error("Missing VERYFRONT_TOKEN");
    return 1;
  }

  const requestedRuntimeSelection = input.runtimes ?? ["framework"];
  const runWriteEvals = input.env.AG_UI_EVAL_WRITE === "1";
  const runExperimentalWriteEvals = input.env.AG_UI_EVAL_EXPERIMENTAL === "1";
  // Plain Number() would turn a blank value into 0 and a typo into NaN, and
  // either would then be handed to the runner as a timeout / interval.
  const requestTimeoutMs = parseNonNegativeNumberEnv(
    "AG_UI_EVAL_TIMEOUT_MS",
    input.env.AG_UI_EVAL_TIMEOUT_MS,
    240000,
    error,
  );
  const progressLogIntervalMs = parseNonNegativeNumberEnv(
    "AG_UI_EVAL_PROGRESS_MS",
    input.env.AG_UI_EVAL_PROGRESS_MS,
    15000,
    error,
  );
  // A blank override is treated as unset so the report is not "written" to "".
  const reportPathOverride = input.env.AG_UI_EVAL_REPORT_PATH;
  const reportPath = reportPathOverride !== undefined && reportPathOverride.trim().length > 0
    ? reportPathOverride
    : createTimestampedReportPath({ cwd, directory: "ag-ui-live-evals" });
  const requestedCaseIds = splitCsvEnv(input.env.AG_UI_EVAL_CASES);
  const requestedCaseTags = splitCsvEnv(input.env.AG_UI_EVAL_TAGS);
  const requestedCaseSetId = input.env.AG_UI_EVAL_CASE_SET?.trim() || null;
  const enableLlmJudge = input.env.AG_UI_EVAL_LLM_JUDGE === "1";

  const apiContext: LiveEvalApiContext = {
    apiUrl,
    authToken,
    projectId: projectId ?? null,
  };
  const createCaseSupport = input.createCaseSupport ?? createLiveEvalCaseSupport;
  const { judgeLlm, runEval, verifyFileExists, withJudge } = createCaseSupport({
    endpoint,
    authToken,
    apiUrl,
    projectId: projectId ?? null,
    branchId: branchId ?? null,
    model: model ?? null,
    requestTimeoutMs,
    progressLogIntervalMs,
    enableLlmJudge,
    readProjectFile: (readerInput) => getLiveEvalProjectFile(apiContext, readerInput),
  });

  const { readOnlyCases, writeCases, experimentalWriteCases } = input.createCases({
    authToken,
    endpoint,
    projectId: projectId ?? null,
    branchId: branchId ?? null,
    model: model ?? null,
    requestTimeoutMs,
    enableLlmJudge,
    hasFinished,
    containsSkillLoad,
    countStepStartedEvents,
    verifyFileExists,
    withJudge,
    judgeLlm,
  });

  log(`AG-UI live evals -> ${endpoint}`);
  log(`Veryfront API -> ${apiUrl}`);
  log(`Project scope -> ${projectId ?? "none"}`);
  log(`Runtime -> ${requestedRuntimeSelection.join(", ")}`);
  log(`Write evals -> ${runWriteEvals ? "enabled" : "disabled"}`);
  log(`Experimental evals -> ${runExperimentalWriteEvals ? "enabled" : "disabled"}`);
  log(`Case set -> ${requestedCaseSetId ?? "none"}`);
  log(`Case tags -> ${requestedCaseTags.size > 0 ? [...requestedCaseTags].join(", ") : "none"}`);

  const allCases = [...readOnlyCases, ...writeCases, ...experimentalWriteCases];
  const resolvedRequestedCaseIds = resolveLiveEvalRequestedCaseIds({
    caseSets: input.caseSets,
    requestedCaseIds,
    requestedCaseSetId,
  });
  const cases = selectLiveEvalCases({
    allCases,
    readOnlyCases,
    writeCases,
    experimentalWriteCases,
    requestedCaseIds: resolvedRequestedCaseIds,
    requestedCaseTags,
    runWriteEvals,
    runExperimentalWriteEvals,
  });
  const selectedCaseTagSummary = buildLiveEvalCaseTagSummary(cases);

  if (cases.length === 0) {
    error("No eval cases selected.");
    return 1;
  }

  const results: LiveEvalResultRecord[] = [];

  // Runtimes and cases are executed strictly sequentially; each result is
  // logged as soon as it is available.
  for (const runtime of requestedRuntimeSelection) {
    log(`\n[runtime] ${runtime}`);
    for (const testCase of cases) {
      log(`\n[run] ${runtime} :: ${testCase.label}`);
      const result = await runEval(testCase, runtime);
      results.push(result);
      log(`[${runtime}] [${result.status}] ${result.details}`);
    }
  }

  const summary = buildLiveEvalStatusSummary(results);
  const runtimeSummary = buildLiveEvalRuntimeSummary(requestedRuntimeSelection, results);
  const runtimePerformanceSummary = buildRuntimePerformanceSummary(results);

  log("\nSummary");
  log(`passed: ${summary.passed}`);
  log(`failed: ${summary.failed}`);
  log(`skipped: ${summary.skipped}`);
  for (const runtime of requestedRuntimeSelection) {
    const currentRuntimeSummary = runtimeSummary[runtime];
    log(
      `${runtime}: passed=${currentRuntimeSummary.passed} failed=${currentRuntimeSummary.failed} skipped=${currentRuntimeSummary.skipped}`,
    );
    const performance = runtimePerformanceSummary[runtime];
    log(
      `${runtime}: avg=${performance.avgDurationMs}ms p50=${performance.p50DurationMs}ms p95=${performance.p95DurationMs}ms min=${performance.minDurationMs}ms max=${performance.maxDurationMs}ms`,
    );
  }

  await mkdir(dirname(reportPath), { recursive: true });
  await writeFile(
    reportPath,
    JSON.stringify(
      {
        generatedAt: new Date().toISOString(),
        endpoint,
        apiUrl,
        projectId: projectId ?? null,
        runtimes: requestedRuntimeSelection,
        writeEvals: runWriteEvals,
        requestedCaseIds: [...resolvedRequestedCaseIds],
        requestedCaseTags: [...requestedCaseTags],
        requestedCaseSetId,
        caseMetadata: Object.fromEntries(
          cases.map((testCase) => [testCase.id, testCase.metadata ?? { tags: [] }]),
        ),
        selectedCaseTagSummary,
        results,
        summary,
        runtimeSummary,
        runtimePerformanceSummary,
      },
      null,
      2,
    ),
  );
  log(`report: ${reportPath}`);

  return summary.failed > 0 ? 1 : 0;
}
|