@wix/evalforge-evaluator 0.178.0 → 0.180.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/build/index.js +6918 -239
- package/build/index.js.map +4 -4
- package/build/index.mjs +6947 -246
- package/build/index.mjs.map +4 -4
- package/build/types/ambassador-converters.d.ts +68 -0
- package/build/types/api-client.d.ts +46 -3
- package/build/types/config.d.ts +12 -1
- package/build/types/run-scenario/agents/shared/trace-emit.d.ts +5 -2
- package/build/types/run-scenario/index.d.ts +4 -2
- package/build/types/run-scenario/run-agent-with-context.d.ts +4 -2
- package/package.json +13 -5
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain ↔ ambassador wire-type converters. This module owns the proto↔domain
|
|
3
|
+
* boundary for the evaluator: every assumption about how ambassador payloads
|
|
4
|
+
* map onto `@wix/evalforge-types` lives here, in both directions.
|
|
5
|
+
*
|
|
6
|
+
* Write side (domain → wire, for `AddEvalRunResult` / `PushTraceEvent`)
|
|
7
|
+
* mirrors, in reverse, the backend's proto → domain converters in
|
|
8
|
+
* `packages/eval-backend/src/grpc/converter.ts`:
|
|
9
|
+
* - enum values gain their proto prefix and are uppercased
|
|
10
|
+
* (`'tool_use'` → `'LLM_STEP_TYPE_TOOL_USE'`),
|
|
11
|
+
* - ISO timestamp strings become `Date` objects (the ambassador serializer
|
|
12
|
+
* handles `google.protobuf.Timestamp` fields as `Date`),
|
|
13
|
+
* - domain field names map onto their proto counterparts
|
|
14
|
+
* (`isError` → `errored`, `isInfrastructure` → `infrastructure`,
|
|
15
|
+
* `isComplete` → `complete`, flat conversation blocks → proto oneof).
|
|
16
|
+
*
|
|
17
|
+
* Read side (wire → domain, for the GET calls) normalizes what the backend's
|
|
18
|
+
* `*ToProto` response converters produced:
|
|
19
|
+
* - enum values lose their proto prefix (when present) and are lowercased
|
|
20
|
+
* back to the domain values (`'SDK'` → `'sdk'`, `'CLAUDE'` → `'claude'`),
|
|
21
|
+
* with `*_UNSPECIFIED` sentinels mapped to `undefined`,
|
|
22
|
+
* - `Date` objects become the ISO strings domain timestamps are typed as,
|
|
23
|
+
* - wire-renamed structures fold back onto their domain fields
|
|
24
|
+
* (`Template.uploadedSource.files` → `sourceFiles`, the
|
|
25
|
+
* `CapabilityVersion` content oneof → flat `content`).
|
|
26
|
+
*/
|
|
27
|
+
import type { EvalRunResult as WireEvalRunResult, LiveTraceEvent as WireLiveTraceEvent, EvalRun as WireEvalRun } from '@wix/ambassador-evalforge-v1-eval-run/types';
|
|
28
|
+
import type { Agent as WireAgent } from '@wix/ambassador-evalforge-v1-agent/types';
|
|
29
|
+
import type { TestScenario as WireTestScenario } from '@wix/ambassador-evalforge-v1-test-scenario/types';
|
|
30
|
+
import type { Template as WireTemplate } from '@wix/ambassador-evalforge-v1-project/types';
|
|
31
|
+
import type { Preset as WirePreset } from '@wix/ambassador-evalforge-v1-preset/types';
|
|
32
|
+
import type { Capability as WireCapability, CapabilityVersion as WireCapabilityVersion } from '@wix/ambassador-evalforge-v1-capability/types';
|
|
33
|
+
import type { Agent, Capability, CapabilityVersion, EvalRun, EvalRunResult, LiveTraceEvent, Preset, Template, TestScenario } from '@wix/evalforge-types';
|
|
34
|
+
export declare function evalRunResultToProto(result: EvalRunResult): WireEvalRunResult;
|
|
35
|
+
export declare function liveTraceEventToProto(event: LiveTraceEvent): WireLiveTraceEvent;
|
|
36
|
+
export declare function agentFromProto(wire: WireAgent): Agent;
|
|
37
|
+
/**
|
|
38
|
+
* The wire `EvalRun` is a lean projection: it does not carry `results`,
|
|
39
|
+
* `aggregateMetrics`, job fields, or trace data. The evaluator never consumes
|
|
40
|
+
* those (it only reads run configuration — scenario/capability/agent IDs,
|
|
41
|
+
* variables, runsPerScenario), so the required domain fields get empty
|
|
42
|
+
* defaults.
|
|
43
|
+
*/
|
|
44
|
+
export declare function evalRunFromProto(wire: WireEvalRun): EvalRun;
|
|
45
|
+
/**
|
|
46
|
+
* Note: the proto `TestScenario` does not carry the legacy inline
|
|
47
|
+
* `assertions` / `assertionIds` fields — only `assertionLinks`. Scenarios
|
|
48
|
+
* still relying on inline assertions lose them over the gRPC read path.
|
|
49
|
+
*/
|
|
50
|
+
export declare function testScenarioFromProto(wire: WireTestScenario): TestScenario;
|
|
51
|
+
/** The wire nests uploaded files under a `uploadedSource` oneof branch. */
|
|
52
|
+
export declare function templateFromProto(wire: WireTemplate): Template;
|
|
53
|
+
export declare function presetFromProto(wire: WirePreset): Preset;
|
|
54
|
+
/**
|
|
55
|
+
* Wire `CapabilityType` values are already the bare domain values
|
|
56
|
+
* (`SKILL` / `SUB_AGENT` / `RULE` / `MCP`); only the `UNSPECIFIED`
|
|
57
|
+
* sentinel carries the proto prefix. Guard it to `undefined` for parity with
|
|
58
|
+
* the `fromProtoEnum` callers, so a malformed/future `UNSPECIFIED` can't pass
|
|
59
|
+
* through as a "valid" capabilityType.
|
|
60
|
+
*/
|
|
61
|
+
export declare function capabilityFromProto(wire: WireCapability): Capability;
|
|
62
|
+
/**
|
|
63
|
+
* The wire splits `content` into a per-type oneof
|
|
64
|
+
* (`skillContent` / `subAgentContent` / `ruleContent` / `mcpContent` /
|
|
65
|
+
* `pluginContent`); the domain keeps a single `content` field discriminated
|
|
66
|
+
* by the parent capability's `capabilityType`.
|
|
67
|
+
*/
|
|
68
|
+
export declare function capabilityVersionFromProto(wire: WireCapabilityVersion, projectId: string): CapabilityVersion;
|
|
@@ -1,7 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* API Client for fetching data from the eval server.
|
|
3
|
+
*
|
|
4
|
+
* Migration in progress: backend calls are moving from raw `fetch` against
|
|
5
|
+
* the legacy REST surface to the gRPC ambassador surface via `@wix/http-client`.
|
|
6
|
+
* Migrated: `getEvalRun`, `getScenario`, `getAgent`, `getTemplate`, `getPreset`,
|
|
7
|
+
* `getCapability`, `getCapabilityVersion`, `addResult`, `clearResults`,
|
|
8
|
+
* `pushTraceEvent`. Still on legacy REST: `updateEvalRun` — the gRPC
|
|
9
|
+
* `UpdateEvalRun` handler only forwards user-editable fields
|
|
10
|
+
* (name/description/comparison*), not the system state transitions
|
|
11
|
+
* (status/completedAt/jobError/jobStatus) the evaluator writes.
|
|
3
12
|
*/
|
|
4
|
-
import
|
|
13
|
+
import { type IHttpClient } from '@wix/http-client';
|
|
14
|
+
import type { EvalRun, EvalRunResult, LiveTraceEvent, TestScenario, Template, Agent, Preset, CapabilityWithLatestVersion, CapabilityVersion } from '@wix/evalforge-types';
|
|
5
15
|
export interface ApiClient {
|
|
6
16
|
getEvalRun(projectId: string, id: string): Promise<EvalRun>;
|
|
7
17
|
getScenario(projectId: string, id: string): Promise<TestScenario>;
|
|
@@ -12,6 +22,7 @@ export interface ApiClient {
|
|
|
12
22
|
getCapabilityVersion(projectId: string, capabilityId: string, versionId: string): Promise<CapabilityVersion>;
|
|
13
23
|
addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
|
|
14
24
|
clearResults(projectId: string, evalRunId: string): Promise<void>;
|
|
25
|
+
pushTraceEvent(projectId: string, evalRunId: string, event: LiveTraceEvent): Promise<void>;
|
|
15
26
|
updateEvalRun(projectId: string, evalRunId: string, update: Partial<EvalRun>): Promise<void>;
|
|
16
27
|
}
|
|
17
28
|
/**
|
|
@@ -26,11 +37,43 @@ export interface ApiClientOptions {
|
|
|
26
37
|
*/
|
|
27
38
|
routeHeader?: string;
|
|
28
39
|
/**
|
|
29
|
-
* Optional Bearer token for authenticating with public
|
|
30
|
-
*
|
|
40
|
+
* Optional Bearer token for authenticating with the legacy REST public
|
|
41
|
+
* endpoints (`/public/...`). Used by the not-yet-migrated `fetch` paths.
|
|
31
42
|
*/
|
|
32
43
|
authToken?: string;
|
|
44
|
+
/**
|
|
45
|
+
* Optional S2S-signed token for authenticating ambassador/gRPC calls.
|
|
46
|
+
* Wired into the underlying `HttpClient` via `getAppToken`. When absent
|
|
47
|
+
* (local dev), `getAppToken` is omitted and requests go out unauthenticated.
|
|
48
|
+
*/
|
|
49
|
+
grpcAuthToken?: string;
|
|
50
|
+
/**
|
|
51
|
+
* Override the underlying http client. Tests inject a mock built with
|
|
52
|
+
* `@wix/http-client-testkit` here; production code never sets this and a
|
|
53
|
+
* real `HttpClient` is constructed from `serverUrl` + auth/route options.
|
|
54
|
+
*/
|
|
55
|
+
httpClient?: IHttpClient;
|
|
33
56
|
}
|
|
57
|
+
/**
|
|
58
|
+
* Resolve the `baseURL` for the ambassador `HttpClient` from the configured
|
|
59
|
+
* `serverUrl`.
|
|
60
|
+
*
|
|
61
|
+
* The ambassador request factories emit fully-qualified gateway paths that
|
|
62
|
+
* already include the `/_api/evalforge-backend` prefix (e.g.
|
|
63
|
+
* `/_api/evalforge-backend/v1/projects/{id}/eval-runs/{id}`). `serverUrl` also
|
|
64
|
+
* carries that prefix — it is the REST base used by `putJson`/trace-push — so
|
|
65
|
+
* using it directly as the HttpClient `baseURL` duplicates the prefix
|
|
66
|
+
* (`.../_api/evalforge-backend/_api/evalforge-backend/v1/...`) and every
|
|
67
|
+
* ambassador call 404s: the run starts but the evaluator can't fetch its data,
|
|
68
|
+
* so the eval fails with no results.
|
|
69
|
+
*
|
|
70
|
+
* Returning the origin only keeps the ambassador-emitted path intact while the
|
|
71
|
+
* REST writes keep using the full `serverUrl`. Origin-only inputs (the local
|
|
72
|
+
* in-cluster `http://host:port`) are returned unchanged.
|
|
73
|
+
*
|
|
74
|
+
* @param serverUrl - Base URL of the server (e.g., "https://bo.wix.com/_api/evalforge-backend")
|
|
75
|
+
*/
|
|
76
|
+
export declare function resolveAmbassadorBaseUrl(serverUrl: string): string;
|
|
34
77
|
/**
|
|
35
78
|
* Create an API client for the eval server.
|
|
36
79
|
*
|
package/build/types/config.d.ts
CHANGED
|
@@ -15,7 +15,12 @@ export interface EvaluatorConfig {
|
|
|
15
15
|
aiGatewayHeaders: Record<string, string>;
|
|
16
16
|
/** Directory for storing evaluation working directories */
|
|
17
17
|
evaluationsDir?: string;
|
|
18
|
-
/**
|
|
18
|
+
/**
|
|
19
|
+
* Legacy REST URL for pushing trace events (set by the backend job runner
|
|
20
|
+
* for remote jobs). Its presence enables remote trace push; the push itself
|
|
21
|
+
* now goes through the gRPC `PushTraceEvent` RPC, so the URL value is only
|
|
22
|
+
* used by the (disabled) diagnostics module.
|
|
23
|
+
*/
|
|
19
24
|
tracePushUrl?: string;
|
|
20
25
|
/**
|
|
21
26
|
* Optional x-wix-route header value for deploy preview routing.
|
|
@@ -28,6 +33,12 @@ export interface EvaluatorConfig {
|
|
|
28
33
|
* When set, this token is included as Authorization header in API requests.
|
|
29
34
|
*/
|
|
30
35
|
authToken?: string;
|
|
36
|
+
/**
|
|
37
|
+
* Optional S2S-signed token for authenticating gRPC/ambassador calls.
|
|
38
|
+
* When set, it is forwarded via the Wix HttpClient `getAppToken` hook. When
|
|
39
|
+
* absent (e.g., local development), gRPC calls go out unauthenticated.
|
|
40
|
+
*/
|
|
41
|
+
grpcAuthToken?: string;
|
|
31
42
|
}
|
|
32
43
|
/**
|
|
33
44
|
* Load evaluator configuration from environment variables.
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import type { LiveTraceEvent } from '@wix/evalforge-types';
|
|
2
2
|
/**
|
|
3
3
|
* Emit a live trace event to stdout for the backend to capture.
|
|
4
|
-
*
|
|
4
|
+
*
|
|
5
|
+
* When a `pushEvent` callback is provided (remote job execution, where stdout
|
|
6
|
+
* is not accessible), the event is also pushed to the backend through it.
|
|
7
|
+
* The callback is fire-and-forget; it must not throw.
|
|
5
8
|
*/
|
|
6
|
-
export declare function emitTraceEvent(event: LiveTraceEvent,
|
|
9
|
+
export declare function emitTraceEvent(event: LiveTraceEvent, pushEvent?: (event: LiveTraceEvent) => void): void;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { EvalRunResult, TestScenario, Template } from '@wix/evalforge-types';
|
|
1
|
+
import type { EvalRunResult, LiveTraceEvent, TestScenario, Template } from '@wix/evalforge-types';
|
|
2
2
|
import type { EvaluatorConfig } from '../config.js';
|
|
3
3
|
import type { Assertion } from '@wix/evalforge-types';
|
|
4
4
|
import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
@@ -13,6 +13,8 @@ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
13
13
|
* @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
|
|
14
14
|
* @param template - Optional pre-fetched template entity
|
|
15
15
|
* @param resolvedAssertions - Optional assertions resolved from assertionIds
|
|
16
|
+
* @param pushEvent - Optional callback pushing trace events to the backend
|
|
17
|
+
* (remote job execution, where stdout is not accessible)
|
|
16
18
|
* @returns Complete evaluation result
|
|
17
19
|
*/
|
|
18
|
-
export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
|
|
20
|
+
export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[], pushEvent?: (event: LiveTraceEvent) => void): Promise<EvalRunResult>;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { TestScenario } from '@wix/evalforge-types';
|
|
1
|
+
import type { TestScenario, LiveTraceEvent } from '@wix/evalforge-types';
|
|
2
2
|
import type { EvaluatorConfig } from '../config.js';
|
|
3
3
|
import type { PartialEvalRunResult } from './types.js';
|
|
4
4
|
import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
@@ -14,6 +14,8 @@ import type { EvaluationData } from '../fetch-evaluation-data.js';
|
|
|
14
14
|
* @param scenario - The test scenario to run
|
|
15
15
|
* @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
|
|
16
16
|
* @param workDir - Optional working directory for the scenario
|
|
17
|
+
* @param pushEvent - Optional callback pushing trace events to the backend
|
|
18
|
+
* (remote job execution, where stdout is not accessible)
|
|
17
19
|
* @returns Partial result without assertion fields
|
|
18
20
|
*/
|
|
19
|
-
export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string): Promise<PartialEvalRunResult>;
|
|
21
|
+
export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string, pushEvent?: (event: LiveTraceEvent) => void): Promise<PartialEvalRunResult>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/evalforge-evaluator",
|
|
3
|
-
"version": "0.178.0",
|
|
3
|
+
"version": "0.180.0",
|
|
4
4
|
"description": "EvalForge Evaluator",
|
|
5
5
|
"bin": "./build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -22,9 +22,9 @@
|
|
|
22
22
|
"@ai-sdk/openai": "^3.0.39",
|
|
23
23
|
"@anthropic-ai/claude-agent-sdk": "^0.2.63",
|
|
24
24
|
"@anthropic-ai/claude-code": "^2.1.63",
|
|
25
|
-
"@wix/eval-assertions": "0.
|
|
26
|
-
"@wix/evalforge-github-client": "0.
|
|
27
|
-
"@wix/evalforge-types": "0.
|
|
25
|
+
"@wix/eval-assertions": "0.70.0",
|
|
26
|
+
"@wix/evalforge-github-client": "0.70.0",
|
|
27
|
+
"@wix/evalforge-types": "0.95.0",
|
|
28
28
|
"ai": "^6.0.107",
|
|
29
29
|
"diff": "^7.0.0",
|
|
30
30
|
"tar": "^7.5.3",
|
|
@@ -35,6 +35,14 @@
|
|
|
35
35
|
"@types/diff": "^7.0.2",
|
|
36
36
|
"@types/node": "^22.19.3",
|
|
37
37
|
"@types/tar": "^6.1.13",
|
|
38
|
+
"@wix/ambassador-evalforge-v1-agent": "^1.0.5",
|
|
39
|
+
"@wix/ambassador-evalforge-v1-capability": "^1.0.8",
|
|
40
|
+
"@wix/ambassador-evalforge-v1-eval-run": "^1.0.8",
|
|
41
|
+
"@wix/ambassador-evalforge-v1-preset": "^1.0.5",
|
|
42
|
+
"@wix/ambassador-evalforge-v1-project": "^1.0.6",
|
|
43
|
+
"@wix/ambassador-evalforge-v1-test-scenario": "^1.0.5",
|
|
44
|
+
"@wix/http-client": "^2.85.0",
|
|
45
|
+
"@wix/http-client-testkit": "^1.764.0",
|
|
38
46
|
"dotenv": "^17.2.3",
|
|
39
47
|
"esbuild": "^0.27.2",
|
|
40
48
|
"eslint": "^9.39.2",
|
|
@@ -63,5 +71,5 @@
|
|
|
63
71
|
"artifactId": "evalforge-evaluator"
|
|
64
72
|
}
|
|
65
73
|
},
|
|
66
|
-
"falconPackageHash": "
|
|
74
|
+
"falconPackageHash": "5ce316680bd58179748e28f277b656cd718bb6b1e7272af06059a26d"
|
|
67
75
|
}
|