@wix/evalforge-evaluator 0.178.0 → 0.179.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Domain ↔ ambassador wire-type converters. This module owns the proto↔domain
3
+ * boundary for the evaluator: every assumption about how ambassador payloads
4
+ * map onto `@wix/evalforge-types` lives here, in both directions.
5
+ *
6
+ * Write side (domain → wire, for `AddEvalRunResult` / `PushTraceEvent`)
7
+ * mirrors, in reverse, the backend's proto → domain converters in
8
+ * `packages/eval-backend/src/grpc/converter.ts`:
9
+ * - enum values gain their proto prefix and are uppercased
10
+ * (`'tool_use'` → `'LLM_STEP_TYPE_TOOL_USE'`),
11
+ * - ISO timestamp strings become `Date` objects (the ambassador serializer
12
+ * handles `google.protobuf.Timestamp` fields as `Date`),
13
+ * - domain field names map onto their proto counterparts
14
+ * (`isError` → `errored`, `isInfrastructure` → `infrastructure`,
15
+ * `isComplete` → `complete`, flat conversation blocks → proto oneof).
16
+ *
17
+ * Read side (wire → domain, for the GET calls) normalizes what the backend's
18
+ * `*ToProto` response converters produced:
19
+ * - enum values lose their proto prefix (when present) and are lowercased
20
+ * back to the domain values (`'SDK'` → `'sdk'`, `'CLAUDE'` → `'claude'`),
21
+ * with `*_UNSPECIFIED` sentinels mapped to `undefined`,
22
+ * - `Date` objects become the ISO strings domain timestamps are typed as,
23
+ * - wire-renamed structures fold back onto their domain fields
24
+ * (`Template.uploadedSource.files` → `sourceFiles`, the
25
+ * `CapabilityVersion` content oneof → flat `content`).
26
+ */
27
+ import type { EvalRunResult as WireEvalRunResult, LiveTraceEvent as WireLiveTraceEvent, EvalRun as WireEvalRun } from '@wix/ambassador-evalforge-v1-eval-run/types';
28
+ import type { Agent as WireAgent } from '@wix/ambassador-evalforge-v1-agent/types';
29
+ import type { TestScenario as WireTestScenario } from '@wix/ambassador-evalforge-v1-test-scenario/types';
30
+ import type { Template as WireTemplate } from '@wix/ambassador-evalforge-v1-project/types';
31
+ import type { Preset as WirePreset } from '@wix/ambassador-evalforge-v1-preset/types';
32
+ import type { Capability as WireCapability, CapabilityVersion as WireCapabilityVersion } from '@wix/ambassador-evalforge-v1-capability/types';
33
+ import type { Agent, Capability, CapabilityVersion, EvalRun, EvalRunResult, LiveTraceEvent, Preset, Template, TestScenario } from '@wix/evalforge-types';
34
+ export declare function evalRunResultToProto(result: EvalRunResult): WireEvalRunResult;
35
+ export declare function liveTraceEventToProto(event: LiveTraceEvent): WireLiveTraceEvent;
36
+ export declare function agentFromProto(wire: WireAgent): Agent;
37
+ /**
38
+ * The wire `EvalRun` is a lean projection: it does not carry `results`,
39
+ * `aggregateMetrics`, job fields, or trace data. The evaluator never consumes
40
+ * those (it only reads run configuration — scenario/capability/agent IDs,
41
+ * variables, runsPerScenario), so the required domain fields get empty
42
+ * defaults.
43
+ */
44
+ export declare function evalRunFromProto(wire: WireEvalRun): EvalRun;
45
+ /**
46
+ * Note: the proto `TestScenario` does not carry the legacy inline
47
+ * `assertions` / `assertionIds` fields — only `assertionLinks`. Scenarios
48
+ * still relying on inline assertions lose them over the gRPC read path.
49
+ */
50
+ export declare function testScenarioFromProto(wire: WireTestScenario): TestScenario;
51
+ /** The wire nests uploaded files under an `uploadedSource` oneof branch. */
52
+ export declare function templateFromProto(wire: WireTemplate): Template;
53
+ export declare function presetFromProto(wire: WirePreset): Preset;
54
+ /**
55
+ * Wire `CapabilityType` values are already the bare domain values
56
+ * (`SKILL` / `SUB_AGENT` / `RULE` / `MCP`); only the `UNSPECIFIED`
57
+ * sentinel carries the proto prefix. Guard it to `undefined` for parity with
58
+ * the `fromProtoEnum` callers, so a malformed/future `UNSPECIFIED` can't pass
59
+ * through as a "valid" capabilityType.
60
+ */
61
+ export declare function capabilityFromProto(wire: WireCapability): Capability;
62
+ /**
63
+ * The wire splits `content` into a per-type oneof
64
+ * (`skillContent` / `subAgentContent` / `ruleContent` / `mcpContent` /
65
+ * `pluginContent`); the domain keeps a single `content` field discriminated
66
+ * by the parent capability's `capabilityType`.
67
+ */
68
+ export declare function capabilityVersionFromProto(wire: WireCapabilityVersion, projectId: string): CapabilityVersion;
@@ -1,7 +1,17 @@
1
1
  /**
2
2
  * API Client for fetching data from the eval server.
3
+ *
4
+ * Migration in progress: backend calls are moving from raw `fetch` against
5
+ * the legacy REST surface to the gRPC ambassador surface via `@wix/http-client`.
6
+ * Migrated: `getEvalRun`, `getScenario`, `getAgent`, `getTemplate`, `getPreset`,
7
+ * `getCapability`, `getCapabilityVersion`, `addResult`, `clearResults`,
8
+ * `pushTraceEvent`. Still on legacy REST: `updateEvalRun` — the gRPC
9
+ * `UpdateEvalRun` handler only forwards user-editable fields
10
+ * (name/description/comparison*), not the system state transitions
11
+ * (status/completedAt/jobError/jobStatus) the evaluator writes.
3
12
  */
4
- import type { EvalRun, EvalRunResult, TestScenario, Template, Agent, Preset, CapabilityWithLatestVersion, CapabilityVersion } from '@wix/evalforge-types';
13
+ import { type IHttpClient } from '@wix/http-client';
14
+ import type { EvalRun, EvalRunResult, LiveTraceEvent, TestScenario, Template, Agent, Preset, CapabilityWithLatestVersion, CapabilityVersion } from '@wix/evalforge-types';
5
15
  export interface ApiClient {
6
16
  getEvalRun(projectId: string, id: string): Promise<EvalRun>;
7
17
  getScenario(projectId: string, id: string): Promise<TestScenario>;
@@ -12,6 +22,7 @@ export interface ApiClient {
12
22
  getCapabilityVersion(projectId: string, capabilityId: string, versionId: string): Promise<CapabilityVersion>;
13
23
  addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
14
24
  clearResults(projectId: string, evalRunId: string): Promise<void>;
25
+ pushTraceEvent(projectId: string, evalRunId: string, event: LiveTraceEvent): Promise<void>;
15
26
  updateEvalRun(projectId: string, evalRunId: string, update: Partial<EvalRun>): Promise<void>;
16
27
  }
17
28
  /**
@@ -26,10 +37,22 @@ export interface ApiClientOptions {
26
37
  */
27
38
  routeHeader?: string;
28
39
  /**
29
- * Optional Bearer token for authenticating with public endpoints.
30
- * When set, this token is included as Authorization header in all requests.
40
+ * Optional Bearer token for authenticating with the legacy REST public
41
+ * endpoints (`/public/...`). Used by the not-yet-migrated `fetch` paths.
31
42
  */
32
43
  authToken?: string;
44
+ /**
45
+ * Optional S2S-signed token for authenticating ambassador/gRPC calls.
46
+ * Wired into the underlying `HttpClient` via `getAppToken`. When absent
47
+ * (local dev), `getAppToken` is omitted and requests go out unauthenticated.
48
+ */
49
+ grpcAuthToken?: string;
50
+ /**
51
+ * Override the underlying http client. Tests inject a mock built with
52
+ * `@wix/http-client-testkit` here; production code never sets this and a
53
+ * real `HttpClient` is constructed from `serverUrl` + auth/route options.
54
+ */
55
+ httpClient?: IHttpClient;
33
56
  }
34
57
  /**
35
58
  * Create an API client for the eval server.
@@ -15,7 +15,12 @@ export interface EvaluatorConfig {
15
15
  aiGatewayHeaders: Record<string, string>;
16
16
  /** Directory for storing evaluation working directories */
17
17
  evaluationsDir?: string;
18
- /** URL to push trace events to (for remote job execution) */
18
+ /**
19
+ * Legacy REST URL for pushing trace events (set by the backend job runner
20
+ * for remote jobs). Its presence enables remote trace push; the push itself
21
+ * now goes through the gRPC `PushTraceEvent` RPC, so the URL value is only
22
+ * used by the (disabled) diagnostics module.
23
+ */
19
24
  tracePushUrl?: string;
20
25
  /**
21
26
  * Optional x-wix-route header value for deploy preview routing.
@@ -28,6 +33,12 @@ export interface EvaluatorConfig {
28
33
  * When set, this token is included as the Authorization header in API requests.
29
34
  */
30
35
  authToken?: string;
36
+ /**
37
+ * Optional S2S-signed token for authenticating gRPC/ambassador calls.
38
+ * When set, it is forwarded via the Wix HttpClient `getAppToken` hook. When
39
+ * absent (e.g., local development), gRPC calls go out unauthenticated.
40
+ */
41
+ grpcAuthToken?: string;
31
42
  }
32
43
  /**
33
44
  * Load evaluator configuration from environment variables.
@@ -1,6 +1,9 @@
1
1
  import type { LiveTraceEvent } from '@wix/evalforge-types';
2
2
  /**
3
3
  * Emit a live trace event to stdout for the backend to capture.
4
- * Also pushes to HTTP endpoint if tracePushUrl is provided (for remote job execution).
4
+ *
5
+ * When a `pushEvent` callback is provided (remote job execution, where stdout
6
+ * is not accessible), the event is also pushed to the backend through it.
7
+ * The callback is fire-and-forget; it must not throw.
5
8
  */
6
- export declare function emitTraceEvent(event: LiveTraceEvent, tracePushUrl?: string, routeHeader?: string, authToken?: string): void;
9
+ export declare function emitTraceEvent(event: LiveTraceEvent, pushEvent?: (event: LiveTraceEvent) => void): void;
@@ -1,4 +1,4 @@
1
- import type { EvalRunResult, TestScenario, Template } from '@wix/evalforge-types';
1
+ import type { EvalRunResult, LiveTraceEvent, TestScenario, Template } from '@wix/evalforge-types';
2
2
  import type { EvaluatorConfig } from '../config.js';
3
3
  import type { Assertion } from '@wix/evalforge-types';
4
4
  import type { EvaluationData } from '../fetch-evaluation-data.js';
@@ -13,6 +13,8 @@ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
13
13
  * @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
14
14
  * @param template - Optional pre-fetched template entity
15
15
  * @param resolvedAssertions - Optional assertions resolved from assertionIds
16
+ * @param pushEvent - Optional callback pushing trace events to the backend
17
+ * (remote job execution, where stdout is not accessible)
16
18
  * @returns Complete evaluation result
17
19
  */
18
- export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
20
+ export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[], pushEvent?: (event: LiveTraceEvent) => void): Promise<EvalRunResult>;
@@ -1,4 +1,4 @@
1
- import type { TestScenario } from '@wix/evalforge-types';
1
+ import type { TestScenario, LiveTraceEvent } from '@wix/evalforge-types';
2
2
  import type { EvaluatorConfig } from '../config.js';
3
3
  import type { PartialEvalRunResult } from './types.js';
4
4
  import type { EvaluationData } from '../fetch-evaluation-data.js';
@@ -14,6 +14,8 @@ import type { EvaluationData } from '../fetch-evaluation-data.js';
14
14
  * @param scenario - The test scenario to run
15
15
  * @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
16
16
  * @param workDir - Optional working directory for the scenario
17
+ * @param pushEvent - Optional callback pushing trace events to the backend
18
+ * (remote job execution, where stdout is not accessible)
17
19
  * @returns Partial result without assertion fields
18
20
  */
19
- export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string): Promise<PartialEvalRunResult>;
21
+ export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string, pushEvent?: (event: LiveTraceEvent) => void): Promise<PartialEvalRunResult>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@wix/evalforge-evaluator",
3
- "version": "0.178.0",
3
+ "version": "0.179.0",
4
4
  "description": "EvalForge Evaluator",
5
5
  "bin": "./build/index.js",
6
6
  "files": [
@@ -22,9 +22,9 @@
22
22
  "@ai-sdk/openai": "^3.0.39",
23
23
  "@anthropic-ai/claude-agent-sdk": "^0.2.63",
24
24
  "@anthropic-ai/claude-code": "^2.1.63",
25
- "@wix/eval-assertions": "0.69.0",
26
- "@wix/evalforge-github-client": "0.69.0",
27
- "@wix/evalforge-types": "0.94.0",
25
+ "@wix/eval-assertions": "0.70.0",
26
+ "@wix/evalforge-github-client": "0.70.0",
27
+ "@wix/evalforge-types": "0.95.0",
28
28
  "ai": "^6.0.107",
29
29
  "diff": "^7.0.0",
30
30
  "tar": "^7.5.3",
@@ -35,6 +35,14 @@
35
35
  "@types/diff": "^7.0.2",
36
36
  "@types/node": "^22.19.3",
37
37
  "@types/tar": "^6.1.13",
38
+ "@wix/ambassador-evalforge-v1-agent": "^1.0.5",
39
+ "@wix/ambassador-evalforge-v1-capability": "^1.0.8",
40
+ "@wix/ambassador-evalforge-v1-eval-run": "^1.0.8",
41
+ "@wix/ambassador-evalforge-v1-preset": "^1.0.5",
42
+ "@wix/ambassador-evalforge-v1-project": "^1.0.6",
43
+ "@wix/ambassador-evalforge-v1-test-scenario": "^1.0.5",
44
+ "@wix/http-client": "^2.85.0",
45
+ "@wix/http-client-testkit": "^1.764.0",
38
46
  "dotenv": "^17.2.3",
39
47
  "esbuild": "^0.27.2",
40
48
  "eslint": "^9.39.2",
@@ -63,5 +71,5 @@
63
71
  "artifactId": "evalforge-evaluator"
64
72
  }
65
73
  },
66
- "falconPackageHash": "c769147b98c8dd7089e778d69f01fd2f28992417075cc9caedacf864"
74
+ "falconPackageHash": "ec6cd7242df122dacec1fb89f46734131adb2f8a98afb93362b7bef7"
67
75
  }