npm - @wix/evalforge-evaluator - Versions diffs - 0.178.0 → 0.179.0 - Mend

@wix/evalforge-evaluator 0.178.0 → 0.179.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/README.md +13 -2
package/build/index.js +6911 -239
package/build/index.js.map +4 -4
package/build/index.mjs +6940 -246
package/build/index.mjs.map +4 -4
package/build/types/ambassador-converters.d.ts +68 -0
package/build/types/api-client.d.ts +26 -3
package/build/types/config.d.ts +12 -1
package/build/types/run-scenario/agents/shared/trace-emit.d.ts +5 -2
package/build/types/run-scenario/index.d.ts +4 -2
package/build/types/run-scenario/run-agent-with-context.d.ts +4 -2
package/package.json +13 -5

package/build/types/ambassador-converters.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Domain ↔ ambassador wire-type converters. This module owns the proto↔domain
+ * boundary for the evaluator: every assumption about how ambassador payloads
+ * map onto `@wix/evalforge-types` lives here, in both directions.
+ *
+ * Write side (domain → wire, for `AddEvalRunResult` / `PushTraceEvent`)
+ * mirrors, in reverse, the backend's proto → domain converters in
+ * `packages/eval-backend/src/grpc/converter.ts`:
+ * - enum values gain their proto prefix and are uppercased
+ *   (`'tool_use'` → `'LLM_STEP_TYPE_TOOL_USE'`),
+ * - ISO timestamp strings become `Date` objects (the ambassador serializer
+ *   handles `google.protobuf.Timestamp` fields as `Date`),
+ * - domain field names map onto their proto counterparts
+ *   (`isError` → `errored`, `isInfrastructure` → `infrastructure`,
+ *   `isComplete` → `complete`, flat conversation blocks → proto oneof).
+ *
+ * Read side (wire → domain, for the GET calls) normalizes what the backend's
+ * `*ToProto` response converters produced:
+ * - enum values lose their proto prefix (when present) and are lowercased
+ *   back to the domain values (`'SDK'` → `'sdk'`, `'CLAUDE'` → `'claude'`),
+ *   with `*_UNSPECIFIED` sentinels mapped to `undefined`,
+ * - `Date` objects become the ISO strings domain timestamps are typed as,
+ * - wire-renamed structures fold back onto their domain fields
+ *   (`Template.uploadedSource.files` → `sourceFiles`, the
+ *   `CapabilityVersion` content oneof → flat `content`).
+ */
+import type { EvalRunResult as WireEvalRunResult, LiveTraceEvent as WireLiveTraceEvent, EvalRun as WireEvalRun } from '@wix/ambassador-evalforge-v1-eval-run/types';
+import type { Agent as WireAgent } from '@wix/ambassador-evalforge-v1-agent/types';
+import type { TestScenario as WireTestScenario } from '@wix/ambassador-evalforge-v1-test-scenario/types';
+import type { Template as WireTemplate } from '@wix/ambassador-evalforge-v1-project/types';
+import type { Preset as WirePreset } from '@wix/ambassador-evalforge-v1-preset/types';
+import type { Capability as WireCapability, CapabilityVersion as WireCapabilityVersion } from '@wix/ambassador-evalforge-v1-capability/types';
+import type { Agent, Capability, CapabilityVersion, EvalRun, EvalRunResult, LiveTraceEvent, Preset, Template, TestScenario } from '@wix/evalforge-types';
+export declare function evalRunResultToProto(result: EvalRunResult): WireEvalRunResult;
+export declare function liveTraceEventToProto(event: LiveTraceEvent): WireLiveTraceEvent;
+export declare function agentFromProto(wire: WireAgent): Agent;
+/**
+ * The wire `EvalRun` is a lean projection: it does not carry `results`,
+ * `aggregateMetrics`, job fields, or trace data. The evaluator never consumes
+ * those (it only reads run configuration — scenario/capability/agent IDs,
+ * variables, runsPerScenario), so the required domain fields get empty
+ * defaults.
+ */
+export declare function evalRunFromProto(wire: WireEvalRun): EvalRun;
+/**
+ * Note: the proto `TestScenario` does not carry the legacy inline
+ * `assertions` / `assertionIds` fields — only `assertionLinks`. Scenarios
+ * still relying on inline assertions lose them over the gRPC read path.
+ */
+export declare function testScenarioFromProto(wire: WireTestScenario): TestScenario;
+/** The wire nests uploaded files under an `uploadedSource` oneof branch. */
+export declare function templateFromProto(wire: WireTemplate): Template;
+export declare function presetFromProto(wire: WirePreset): Preset;
+/**
+ * Wire `CapabilityType` values are already the bare domain values
+ * (`SKILL` / `SUB_AGENT` / `RULE` / `MCP`); only the `UNSPECIFIED`
+ * sentinel carries the proto prefix. Guard it to `undefined` for parity with
+ * the `fromProtoEnum` callers, so a malformed/future `UNSPECIFIED` can't pass
+ * through as a "valid" capabilityType.
+ */
+export declare function capabilityFromProto(wire: WireCapability): Capability;
+/**
+ * The wire splits `content` into a per-type oneof
+ * (`skillContent` / `subAgentContent` / `ruleContent` / `mcpContent` /
+ * `pluginContent`); the domain keeps a single `content` field discriminated
+ * by the parent capability's `capabilityType`.
+ */
+export declare function capabilityVersionFromProto(wire: WireCapabilityVersion, projectId: string): CapabilityVersion;

package/build/types/api-client.d.ts CHANGED Viewed

@@ -1,7 +1,17 @@
 /**
  * API Client for fetching data from the eval server.
+ *
+ * Migration in progress: backend calls are moving from raw `fetch` against
+ * the legacy REST surface to the gRPC ambassador surface via `@wix/http-client`.
+ * Migrated: `getEvalRun`, `getScenario`, `getAgent`, `getTemplate`, `getPreset`,
+ * `getCapability`, `getCapabilityVersion`, `addResult`, `clearResults`,
+ * `pushTraceEvent`. Still on legacy REST: `updateEvalRun` — the gRPC
+ * `UpdateEvalRun` handler only forwards user-editable fields
+ * (name/description/comparison*), not the system state transitions
+ * (status/completedAt/jobError/jobStatus) the evaluator writes.
  */
-import type { EvalRun, EvalRunResult, TestScenario, Template, Agent, Preset, CapabilityWithLatestVersion, CapabilityVersion } from '@wix/evalforge-types';
+import { type IHttpClient } from '@wix/http-client';
+import type { EvalRun, EvalRunResult, LiveTraceEvent, TestScenario, Template, Agent, Preset, CapabilityWithLatestVersion, CapabilityVersion } from '@wix/evalforge-types';
 export interface ApiClient {
     getEvalRun(projectId: string, id: string): Promise<EvalRun>;
     getScenario(projectId: string, id: string): Promise<TestScenario>;
@@ -12,6 +22,7 @@ export interface ApiClient {
     getCapabilityVersion(projectId: string, capabilityId: string, versionId: string): Promise<CapabilityVersion>;
     addResult(projectId: string, evalRunId: string, result: EvalRunResult): Promise<void>;
     clearResults(projectId: string, evalRunId: string): Promise<void>;
+    pushTraceEvent(projectId: string, evalRunId: string, event: LiveTraceEvent): Promise<void>;
     updateEvalRun(projectId: string, evalRunId: string, update: Partial<EvalRun>): Promise<void>;
 }
 /**
@@ -26,10 +37,22 @@ export interface ApiClientOptions {
      */
     routeHeader?: string;
     /**
-     * Optional Bearer token for authenticating with public endpoints.
-     * When set, this token is included as Authorization header in all requests.
+     * Optional Bearer token for authenticating with the legacy REST public
+     * endpoints (`/public/...`). Used by the not-yet-migrated `fetch` paths.
      */
     authToken?: string;
+    /**
+     * Optional S2S-signed token for authenticating ambassador/gRPC calls.
+     * Wired into the underlying `HttpClient` via `getAppToken`. When absent
+     * (local dev), `getAppToken` is omitted and requests go out unauthenticated.
+     */
+    grpcAuthToken?: string;
+    /**
+     * Override the underlying http client. Tests inject a mock built with
+     * `@wix/http-client-testkit` here; production code never sets this, so a
+     * real `HttpClient` is constructed from `serverUrl` + auth/route options.
+     */
+    httpClient?: IHttpClient;
 }
 /**
  * Create an API client for the eval server.

package/build/types/config.d.ts CHANGED Viewed

@@ -15,7 +15,12 @@ export interface EvaluatorConfig {
     aiGatewayHeaders: Record<string, string>;
     /** Directory for storing evaluation working directories */
     evaluationsDir?: string;
-    /** URL to push trace events to (for remote job execution) */
+    /**
+     * Legacy REST URL for pushing trace events (set by the backend job runner
+     * for remote jobs). Its presence enables remote trace push; the push itself
+     * now goes through the gRPC `PushTraceEvent` RPC, so the URL value is only
+     * used by the (disabled) diagnostics module.
+     */
     tracePushUrl?: string;
     /**
      * Optional x-wix-route header value for deploy preview routing.
@@ -28,6 +33,12 @@ export interface EvaluatorConfig {
      * When set, this token is included as Authorization header in API requests.
      */
     authToken?: string;
+    /**
+     * Optional S2S-signed token for authenticating gRPC/ambassador calls.
+     * When set, it is forwarded via the Wix HttpClient `getAppToken` hook. When
+     * absent (e.g., local development), gRPC calls go out unauthenticated.
+     */
+    grpcAuthToken?: string;
 }
 /**
  * Load evaluator configuration from environment variables.

package/build/types/run-scenario/agents/shared/trace-emit.d.ts CHANGED Viewed

@@ -1,6 +1,9 @@
 import type { LiveTraceEvent } from '@wix/evalforge-types';
 /**
  * Emit a live trace event to stdout for the backend to capture.
- * Also pushes to HTTP endpoint if tracePushUrl is provided (for remote job execution).
+ *
+ * When a `pushEvent` callback is provided (remote job execution, where stdout
+ * is not accessible), the event is also pushed to the backend through it.
+ * The callback is fire-and-forget; it must not throw.
  */
-export declare function emitTraceEvent(event: LiveTraceEvent, tracePushUrl?: string, routeHeader?: string, authToken?: string): void;
+export declare function emitTraceEvent(event: LiveTraceEvent, pushEvent?: (event: LiveTraceEvent) => void): void;

package/build/types/run-scenario/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { EvalRunResult, TestScenario, Template } from '@wix/evalforge-types';
+import type { EvalRunResult, LiveTraceEvent, TestScenario, Template } from '@wix/evalforge-types';
 import type { EvaluatorConfig } from '../config.js';
 import type { Assertion } from '@wix/evalforge-types';
 import type { EvaluationData } from '../fetch-evaluation-data.js';
@@ -13,6 +13,8 @@ export type { ScenarioItem, EvaluationData } from '../fetch-evaluation-data.js';
  * @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
  * @param template - Optional pre-fetched template entity
  * @param resolvedAssertions - Optional assertions resolved from assertionIds
+ * @param pushEvent - Optional callback pushing trace events to the backend
+ *   (remote job execution, where stdout is not accessible)
  * @returns Complete evaluation result
  */
-export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[]): Promise<EvalRunResult>;
+export declare function runScenario(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, template?: Template, resolvedAssertions?: Assertion[], pushEvent?: (event: LiveTraceEvent) => void): Promise<EvalRunResult>;

package/build/types/run-scenario/run-agent-with-context.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { TestScenario } from '@wix/evalforge-types';
+import type { TestScenario, LiveTraceEvent } from '@wix/evalforge-types';
 import type { EvaluatorConfig } from '../config.js';
 import type { PartialEvalRunResult } from './types.js';
 import type { EvaluationData } from '../fetch-evaluation-data.js';
@@ -14,6 +14,8 @@ import type { EvaluationData } from '../fetch-evaluation-data.js';
  * @param scenario - The test scenario to run
  * @param evalData - Fetched evaluation data (skills, agent, mcps, subAgents)
  * @param workDir - Optional working directory for the scenario
+ * @param pushEvent - Optional callback pushing trace events to the backend
+ *   (remote job execution, where stdout is not accessible)
  * @returns Partial result without assertion fields
  */
-export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string): Promise<PartialEvalRunResult>;
+export declare function runAgentWithContext(config: EvaluatorConfig, evalRunId: string, scenario: TestScenario, evalData: EvaluationData, workDir?: string, pushEvent?: (event: LiveTraceEvent) => void): Promise<PartialEvalRunResult>;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@wix/evalforge-evaluator",
-  "version": "0.178.0",
+  "version": "0.179.0",
   "description": "EvalForge Evaluator",
   "bin": "./build/index.js",
   "files": [
@@ -22,9 +22,9 @@
     "@ai-sdk/openai": "^3.0.39",
     "@anthropic-ai/claude-agent-sdk": "^0.2.63",
     "@anthropic-ai/claude-code": "^2.1.63",
-    "@wix/eval-assertions": "0.69.0",
-    "@wix/evalforge-github-client": "0.69.0",
-    "@wix/evalforge-types": "0.94.0",
+    "@wix/eval-assertions": "0.70.0",
+    "@wix/evalforge-github-client": "0.70.0",
+    "@wix/evalforge-types": "0.95.0",
     "ai": "^6.0.107",
     "diff": "^7.0.0",
     "tar": "^7.5.3",
@@ -35,6 +35,14 @@
     "@types/diff": "^7.0.2",
     "@types/node": "^22.19.3",
     "@types/tar": "^6.1.13",
+    "@wix/ambassador-evalforge-v1-agent": "^1.0.5",
+    "@wix/ambassador-evalforge-v1-capability": "^1.0.8",
+    "@wix/ambassador-evalforge-v1-eval-run": "^1.0.8",
+    "@wix/ambassador-evalforge-v1-preset": "^1.0.5",
+    "@wix/ambassador-evalforge-v1-project": "^1.0.6",
+    "@wix/ambassador-evalforge-v1-test-scenario": "^1.0.5",
+    "@wix/http-client": "^2.85.0",
+    "@wix/http-client-testkit": "^1.764.0",
     "dotenv": "^17.2.3",
     "esbuild": "^0.27.2",
     "eslint": "^9.39.2",
@@ -63,5 +71,5 @@
       "artifactId": "evalforge-evaluator"
     }
   },
-  "falconPackageHash": "c769147b98c8dd7089e778d69f01fd2f28992417075cc9caedacf864"
+  "falconPackageHash": "ec6cd7242df122dacec1fb89f46734131adb2f8a98afb93362b7bef7"
 }