npm - @wix/evalforge-evaluator - Versions diffs - 0.182.0 → 0.184.0 - Mend

@wix/evalforge-evaluator 0.182.0 → 0.184.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (14) hide show

package/build/types/run-scenario/agents/claude-code/execute.d.ts CHANGED Viewed

@@ -1,11 +1,12 @@
 import type { SkillWithLatestVersion, TestScenario, LLMTrace, ConversationMessage, TriggerPromptImage } from '@wix/evalforge-types';
+import type { CapturedStep } from '../../types.js';
 import type { ClaudeCodeExecutionOptions, ClaudeCodeExecutionResult } from './types.js';
 /**
  * Import SDK types directly from Claude Agent SDK.
  * Type-only imports are erased at compile time - zero runtime overhead.
  * The SDK is still dynamically imported at runtime in executeWithClaudeCode().
  */
-import type { SDKUserMessage, SDKMessage } from '@anthropic-ai/claude-agent-sdk' with { 'resolution-mode': 'import' };
+import type { SDKResultMessage, SDKUserMessage, SDKMessage } from '@anthropic-ai/claude-agent-sdk' with { 'resolution-mode': 'import' };
 /**
  * Message with timestamp — tracks when each message was received.
  */
@@ -46,3 +47,24 @@ export declare function executeWithClaudeCode(skills: SkillWithLatestVersion[],
     llmTrace: LLMTrace;
     conversation: ConversationMessage[];
 }>;
+/**
+ * Process SDK messages into CapturedSteps for LLM trace building.
+ * Uses actual timestamps from when messages were received to calculate durations.
+ */
+export declare function processMessages(timestampedMessages: TimestampedMessage[], startTime: Date, endTime: Date): {
+    steps: CapturedStep[];
+    result?: SDKResultMessage;
+};
+/**
+ * Build LLM trace from captured steps.
+ * Calculates per-step costs using model pricing and sums tokens from steps as fallback.
+ */
+export declare function buildLLMTraceFromSteps(steps: CapturedStep[], totalDurationMs: number, usage: {
+    inputTokens: number;
+    outputTokens: number;
+    totalTokens: number;
+    costUsd?: number;
+    cacheReadTokens?: number;
+    cacheWriteTokens?: number;
+    durationApiMs?: number;
+}, model: string): LLMTrace;

package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts CHANGED Viewed

@@ -1,4 +1,6 @@
 import type { MCPEntity } from '@wix/evalforge-types';
+import { type FetchGitHubFileFn } from '../shared/resolve-capability-content.js';
+export type { FetchGitHubFileFn };
 /**
  * Write .mcp.json at the project root (cwd) for Claude Code to discover MCPs.
  *
@@ -9,8 +11,12 @@ import type { MCPEntity } from '@wix/evalforge-types';
  * Any `{{placeholder}}` values in the config are resolved before writing
  * (e.g. `{{wix-auth-token}}` → token from ~/.wix/auth/api-key.json).
  *
+ * For MCPs with a GitHub source, the latest config is live-fetched.
+ * For inline MCPs, the stored config is used directly.
+ *
  * @see https://code.claude.com/docs/en/mcp#mcp-installation-scopes
  * @param cwd - Working directory (project root for Claude Code)
  * @param mcps - MCP entities whose config is merged into mcpServers
+ * @param fetchFn - Optional GitHub fetcher (injectable for tests)
  */
-export declare function writeMcpToFilesystem(cwd: string, mcps: MCPEntity[]): Promise<void>;
+export declare function writeMcpToFilesystem(cwd: string, mcps: MCPEntity[], fetchFn?: FetchGitHubFileFn): Promise<void>;

package/build/types/run-scenario/agents/claude-code/write-rules.d.ts CHANGED Viewed

@@ -1,4 +1,6 @@
 import type { Rule } from '@wix/evalforge-types';
+import { type FetchGitHubFileFn } from '../shared/resolve-capability-content.js';
+export type { FetchGitHubFileFn };
 /**
  * Write rule content to the filesystem based on each rule's `ruleType`.
  *
@@ -8,7 +10,11 @@ import type { Rule } from '@wix/evalforge-types';
  * - `cursor-rule` -> `{cwd}/.cursor/rules/{name}.md` (one file per rule)
  * - `generic`     -> `{cwd}/{rule.directory}` (defaults to `.opencode/rules`)
  *
+ * For rules with a GitHub source, the latest content is live-fetched.
+ * For inline rules, the stored content is used directly.
+ *
  * @param cwd - Working directory (project root for Claude Code)
  * @param rules - Rule entities to write
+ * @param fetchFn - Optional GitHub fetcher (injectable for tests)
  */
-export declare function writeRulesToFilesystem(cwd: string, rules: Rule[]): Promise<void>;
+export declare function writeRulesToFilesystem(cwd: string, rules: Rule[], fetchFn?: FetchGitHubFileFn): Promise<void>;

package/build/types/run-scenario/agents/claude-code/write-skills.d.ts CHANGED Viewed

@@ -1,18 +1,15 @@
-import type { GitHubSource, SkillFile, SkillWithLatestVersion } from '@wix/evalforge-types';
+import type { SkillWithLatestVersion } from '@wix/evalforge-types';
+import { type FetchGitHubFolderFn } from '../shared/resolve-capability-content.js';
 import { writeFilesToDirectory } from '../../utils/write-files.js';
-export type FetchGitHubFolderFn = (source: GitHubSource, options?: {
-    userAgent?: string;
-}) => Promise<SkillFile[]>;
+export type { FetchGitHubFolderFn };
 /**
  * Write all skills to the filesystem so Claude Agent SDK can discover them.
- *
- * Content resolution:
- * 1. Pinned (version has `files`): writes all files from the stored snapshot
- * 2. Live (skill has `source`, no pinned files): fetches from GitHub at runtime
+ * Content is resolved by the shared capability fetcher (stored snapshot or
+ * live GitHub fetch).
  *
  * @param cwd - Working directory where .claude/skills/ will be created
  * @param skills - All skills to write
- * @param fetchFn - Function to fetch files from GitHub (defaults to fetchGitHubFolder)
+ * @param fetchFn - Optional GitHub folder fetcher (injectable for tests)
  */
 export declare function writeSkillsToFilesystem(cwd: string, skills: SkillWithLatestVersion[], fetchFn?: FetchGitHubFolderFn): Promise<void>;
 export declare function writeSkillToFilesystem(cwd: string, skill: SkillWithLatestVersion, fetchFn?: FetchGitHubFolderFn): Promise<void>;

package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts CHANGED Viewed

@@ -1,7 +1,6 @@
-import type { SubAgent, GitHubSource } from '@wix/evalforge-types';
-import { type FetchGitHubFolderOptions } from '@wix/evalforge-github-client';
-/** Signature for the single-file GitHub fetch function (injectable for tests). */
-export type FetchGitHubFileFn = (source: GitHubSource, options?: FetchGitHubFolderOptions) => Promise<string>;
+import type { SubAgent } from '@wix/evalforge-types';
+import { type FetchGitHubFileFn } from '../shared/resolve-capability-content.js';
+export type { FetchGitHubFileFn };
 /**
  * Write sub-agent markdown files to .claude/agents/ for Claude Code to discover.
  *
@@ -11,6 +10,6 @@ export type FetchGitHubFileFn = (source: GitHubSource, options?: FetchGitHubFold
  * @see https://code.claude.com/docs/en/sub-agents#write-subagent-files
  * @param cwd - Working directory (project root for Claude Code)
  * @param subAgents - Sub-agent entities to write
- * @param fetchFn - Optional fetch function for testing (defaults to fetchGitHubFile)
+ * @param fetchFn - Optional GitHub fetcher (injectable for tests)
  */
 export declare function writeSubAgentsToFilesystem(cwd: string, subAgents: SubAgent[], fetchFn?: FetchGitHubFileFn): Promise<void>;

package/build/types/run-scenario/agents/opencode/write-skills.d.ts CHANGED Viewed

@@ -1,12 +1,9 @@
-import type { GitHubSource, SkillFile, SkillWithLatestVersion } from '@wix/evalforge-types';
-export type FetchGitHubFolderFn = (source: GitHubSource, options?: {
-    userAgent?: string;
-}) => Promise<SkillFile[]>;
+import type { SkillWithLatestVersion } from '@wix/evalforge-types';
+import { type FetchGitHubFolderFn } from '../shared/resolve-capability-content.js';
+export type { FetchGitHubFolderFn };
 /**
  * Write all skills to .opencode/skills/ so OpenCode can discover them.
- *
- * Content resolution:
- * 1. Pinned (version has `files`): writes all files from the stored snapshot
- * 2. Live (skill has `source`, no pinned files): fetches from GitHub at runtime
+ * Content is resolved by the shared capability fetcher (stored snapshot or
+ * live GitHub fetch).
  */
 export declare function writeSkillsToFilesystem(cwd: string, skills: SkillWithLatestVersion[], fetchFn?: FetchGitHubFolderFn): Promise<void>;

package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import type { SubAgent, GitHubSource } from '@wix/evalforge-types';
-import { type FetchGitHubFolderOptions } from '@wix/evalforge-github-client';
-export type FetchGitHubFileFn = (source: GitHubSource, options?: FetchGitHubFolderOptions) => Promise<string>;
+import type { SubAgent } from '@wix/evalforge-types';
+import { type FetchGitHubFileFn } from '../shared/resolve-capability-content.js';
+export type { FetchGitHubFileFn };
 /**
  * Write sub-agent markdown files to .opencode/agents/ for OpenCode to discover.
  */

package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts ADDED Viewed

@@ -0,0 +1,42 @@
+/**
+ * Capability content fetcher.
+ *
+ * Single place that resolves the runtime content of a capability for an eval
+ * run, choosing between two sources:
+ *   1. DB snapshot — the stored version content already loaded onto the entity
+ *      (skill version files, sub-agent markdown, rule text, MCP config).
+ *   2. Live GitHub — when the entity has a `source` and no usable snapshot,
+ *      the latest content is fetched from GitHub at run time.
+ *
+ * Shared by every agent runner (claude-code, opencode) and every capability
+ * type so the GitHub-vs-DB decision lives in exactly one place.
+ */
+import { type FetchGitHubFolderOptions } from '@wix/evalforge-github-client';
+import type { GitHubSource, SkillFile, SkillWithLatestVersion, SubAgent, Rule, MCPEntity } from '@wix/evalforge-types';
+/** Fetches a single file from GitHub and returns its UTF-8 content. */
+export type FetchGitHubFileFn = (source: GitHubSource, options?: FetchGitHubFolderOptions) => Promise<string>;
+/** Fetches a directory snapshot from GitHub and returns its files. */
+export type FetchGitHubFolderFn = (source: GitHubSource, options?: {
+    userAgent?: string;
+}) => Promise<SkillFile[]>;
+/**
+ * Resolve the files for a skill: the stored version snapshot when present,
+ * otherwise a live folder fetch from GitHub. Throws when neither is available.
+ */
+export declare function resolveSkillFiles(skill: SkillWithLatestVersion, fetchFn?: FetchGitHubFolderFn): Promise<SkillFile[]>;
+/**
+ * Resolve a sub-agent's markdown: live-fetch when a source is set, else the
+ * inline `subAgentMd`.
+ */
+export declare function resolveSubAgentMd(agent: SubAgent, fetchFn?: FetchGitHubFileFn): Promise<string>;
+/**
+ * Resolve a rule's text content: live-fetch when a source is set, else the
+ * inline `content`.
+ */
+export declare function resolveRuleText(rule: Rule, fetchFn?: FetchGitHubFileFn): Promise<string>;
+/**
+ * Resolve an MCP's keyed server config: live-fetch and parse the JSON config
+ * file when a source is set (unwrapping a top-level `mcpServers` key when
+ * present), else the inline `config`.
+ */
+export declare function resolveMcpConfig(mcp: MCPEntity, fetchFn?: FetchGitHubFileFn): Promise<Record<string, unknown>>;

package/build/types/run-scenario/types.d.ts CHANGED Viewed

@@ -21,6 +21,8 @@ export interface CapturedStep {
         toolName: string;
         toolUseId?: string;
         args: unknown;
+        isError?: boolean;
+        errorContent?: string;
     }>;
     toolResults?: unknown[];
     /** True if any tool result for this step's tool calls was an error */

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@wix/evalforge-evaluator",
-  "version": "0.182.0",
+  "version": "0.184.0",
   "description": "EvalForge Evaluator",
   "bin": "./build/index.js",
   "files": [
@@ -71,5 +71,5 @@
       "artifactId": "evalforge-evaluator"
     }
   },
-  "falconPackageHash": "a60b5c0fdd6d077a38372bc12537d26fcf190979f7c1ace877bfebe9"
+  "falconPackageHash": "e51797f0d074a5b087399ed74f317c6a01d157d49801b14659740125"
 }