@aliou/pi-evals 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -88,7 +88,9 @@ import * as os from "os";
88
88
  import * as path from "path";
89
89
  import { getModel } from "@mariozechner/pi-ai";
90
90
  import {
91
- createAgentSession
91
+ createAgentSession,
92
+ DefaultResourceLoader,
93
+ SessionManager
92
94
  } from "@mariozechner/pi-coding-agent";
93
95
  async function runPiTask(input, config, setup, timeout) {
94
96
  const cwd = await createWorkspace(setup);
@@ -96,10 +98,20 @@ async function runPiTask(input, config, setup, timeout) {
96
98
  config.provider,
97
99
  config.model
98
100
  );
101
+ let resourceLoader;
102
+ if (config.extensions && config.extensions.length > 0) {
103
+ const resolvedPaths = config.extensions.map((ext) => path.resolve(ext));
104
+ resourceLoader = new DefaultResourceLoader({
105
+ cwd,
106
+ additionalExtensionPaths: resolvedPaths
107
+ });
108
+ await resourceLoader.reload();
109
+ }
99
110
  const { session } = await createAgentSession({
100
111
  cwd,
101
- model
102
- // Use default coding tools (read, bash, edit, write)
112
+ model,
113
+ sessionManager: SessionManager.inMemory(cwd),
114
+ ...resourceLoader ? { resourceLoader } : {}
103
115
  });
104
116
  try {
105
117
  const timeoutMs = timeout ?? 6e4;
@@ -109,10 +121,10 @@ async function runPiTask(input, config, setup, timeout) {
109
121
  timeoutMs
110
122
  );
111
123
  });
112
- const completionPromise = new Promise((resolve) => {
124
+ const completionPromise = new Promise((resolve2) => {
113
125
  session.subscribe((event) => {
114
126
  if (event.type === "agent_end") {
115
- resolve();
127
+ resolve2();
116
128
  }
117
129
  });
118
130
  });
@@ -166,7 +178,12 @@ function extractToolCalls(messages) {
166
178
  if (!Array.isArray(content)) continue;
167
179
  for (const block of content) {
168
180
  const b = block;
169
- if (b.type === "tool_use" && b.name) {
181
+ if (b.type === "toolCall" && b.name) {
182
+ toolCalls.push({
183
+ name: b.name,
184
+ args: b.arguments ?? {}
185
+ });
186
+ } else if (b.type === "tool_use" && b.name) {
170
187
  toolCalls.push({
171
188
  name: b.name,
172
189
  args: b.input ?? {}
@@ -339,7 +356,7 @@ async function runSingleEval(evalDef, config, options) {
339
356
  return results;
340
357
  }
341
358
  function sleep(ms) {
342
- return new Promise((resolve) => setTimeout(resolve, ms));
359
+ return new Promise((resolve2) => setTimeout(resolve2, ms));
343
360
  }
344
361
  function truncate2(str, maxLen) {
345
362
  const oneLine = str.replace(/\n/g, " ").trim();
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/reporter.ts","../src/task.ts","../src/runner.ts","../src/cli.ts"],"sourcesContent":["/**\n * Reporter - console and JSON output\n */\nimport type { EvalRunSummary } from \"./types\";\n\n/**\n * Print results to console in a human-readable format\n */\nexport function printResults(summary: EvalRunSummary): void {\n const { results, total, passed, duration, totalTokens, totalCost } = summary;\n\n if (total === 0) {\n console.log(\"No eval results.\");\n return;\n }\n\n console.log();\n\n // Group results by eval name\n const byEval = groupBy(results, (r) => r.evalName);\n\n for (const [evalName, evalResults] of Object.entries(byEval)) {\n const evalPassed = evalResults.filter((r) => r.passed).length;\n const evalTotal = evalResults.length;\n const _evalStatus = evalPassed === evalTotal ? \"PASS\" : \"FAIL\";\n const statusIcon = evalPassed === evalTotal ? \"+\" : \"-\";\n\n console.log(`${statusIcon} ${evalName} (${evalPassed}/${evalTotal})`);\n\n for (const result of evalResults) {\n const icon = result.passed ? \"+\" : \"-\";\n const time = formatDuration(result.duration);\n const cost = formatCost(result.cost);\n const tokens = result.tokens.total;\n\n console.log(\n ` ${icon} ${truncate(result.input, 50)} (${time}, ${cost}, ${tokens} tok)`,\n );\n\n // Show score details\n for (const score of result.scores) {\n const scoreIcon = score.score >= 0.5 ? \"+\" : \"-\";\n const scoreValue = (score.score * 100).toFixed(0);\n console.log(` ${scoreIcon} ${score.name}: ${scoreValue}%`);\n if (score.reason && score.score < 1) {\n // Show reason for partial/failed scores\n const reasonLines = score.reason.split(\"\\n\").slice(0, 3);\n for (const line of reasonLines) {\n console.log(` ${line}`);\n }\n }\n }\n\n // Show error if present\n if (result.error) {\n console.log(` ! Error: ${result.error}`);\n }\n }\n\n console.log();\n }\n\n // Summary line\n console.log(\"─\".repeat(50));\n const passRate = total > 0 ? ((passed / total) * 100).toFixed(0) : 0;\n console.log(`Results: ${passed}/${total} passed (${passRate}%)`);\n console.log(\n `Total: ${formatCost(totalCost)}, ${totalTokens} tokens, ${formatDuration(duration)}`,\n );\n}\n\n/**\n * Print results as JSON\n */\nexport function printJson(summary: EvalRunSummary): void {\n console.log(JSON.stringify(summary, null, 2));\n}\n\n/**\n * Format duration in human-readable form\n */\nfunction formatDuration(ms: number): string {\n if (ms < 1000) return `${ms}ms`;\n const seconds = ms / 1000;\n if (seconds < 60) return `${seconds.toFixed(1)}s`;\n const minutes = Math.floor(seconds / 60);\n const remainingSeconds = (seconds % 60).toFixed(0);\n return `${minutes}m${remainingSeconds}s`;\n}\n\n/**\n * Format cost in USD\n */\nfunction formatCost(cost: number): string {\n if (cost < 0.01) return `$${cost.toFixed(4)}`;\n if (cost < 1) return `$${cost.toFixed(3)}`;\n return `$${cost.toFixed(2)}`;\n}\n\n/**\n * Truncate string to max length\n */\nfunction truncate(str: string, maxLen: number): string {\n const oneLine = str.replace(/\\n/g, \" \").trim();\n if (oneLine.length <= maxLen) return oneLine;\n return `${oneLine.slice(0, maxLen - 3)}...`;\n}\n\n/**\n * Group array items by key\n */\nfunction groupBy<T>(\n items: T[],\n keyFn: (item: T) => string,\n): Record<string, T[]> {\n const result: Record<string, T[]> = {};\n for (const item of items) {\n const key = keyFn(item);\n if (!result[key]) result[key] = [];\n result[key].push(item);\n }\n return result;\n}\n","/**\n * Pi task execution via createAgentSession SDK\n */\n\nimport * as fs from \"node:fs/promises\";\nimport * as os from \"node:os\";\nimport * as path from \"node:path\";\nimport { getModel, type KnownProvider } from \"@mariozechner/pi-ai\";\nimport {\n createAgentSession,\n type SessionStats as PiSessionStats,\n} from \"@mariozechner/pi-coding-agent\";\nimport type { PiConfig, SessionStats, TestSetup, ToolCall } from \"./types\";\n\n/**\n * Result of running a pi task\n */\nexport interface TaskResult {\n /** Agent's final response text */\n output: string;\n /** Full conversation messages */\n messages: unknown[];\n /** Tool calls made during the session */\n toolCalls: ToolCall[];\n /** Session statistics */\n stats: SessionStats;\n /** Workspace directory (for scorers) */\n cwd: string;\n}\n\n/**\n * Run a pi task in a temporary workspace\n */\nexport async function runPiTask(\n input: string,\n config: PiConfig,\n setup?: TestSetup,\n timeout?: number,\n): Promise<TaskResult> {\n // Create isolated workspace\n const cwd = await createWorkspace(setup);\n\n // Get the model (cast provider to KnownProvider - will throw if invalid)\n const model = getModel(\n config.provider as KnownProvider,\n config.model as never,\n );\n\n // Create session\n const { session } = await createAgentSession({\n cwd,\n model,\n // Use default coding tools (read, bash, edit, write)\n });\n\n try {\n // Set up timeout\n const timeoutMs = timeout ?? 60_000;\n const timeoutPromise = new Promise<never>((_, reject) => {\n setTimeout(\n () => reject(new Error(`Task timed out after ${timeoutMs}ms`)),\n timeoutMs,\n );\n });\n\n // Subscribe to events to know when done\n const completionPromise = new Promise<void>((resolve) => {\n session.subscribe((event) => {\n if (event.type === \"agent_end\") {\n resolve();\n }\n });\n });\n\n // Send the prompt\n await session.prompt(input);\n\n // Wait for completion or timeout\n await Promise.race([completionPromise, timeoutPromise]);\n\n // Get results\n const messages = session.messages;\n const piStats = session.getSessionStats();\n\n // Extract the last assistant message text\n const output = extractLastAssistantText(messages);\n\n // Extract tool calls from messages\n const toolCalls = extractToolCalls(messages);\n\n // Convert stats\n const stats = convertStats(piStats);\n\n return {\n output,\n messages,\n toolCalls,\n stats,\n cwd,\n };\n } finally {\n // Dispose session\n session.dispose();\n }\n}\n\n/**\n * Extract the text from the last assistant message\n */\nfunction extractLastAssistantText(messages: unknown[]): string {\n // Find the last assistant message\n for (let i = messages.length - 1; i >= 0; i--) {\n const msg = messages[i] as { role?: string; content?: unknown };\n if (msg.role === \"assistant\") {\n // Extract text from content\n const content = msg.content;\n if (typeof content === \"string\") {\n return content;\n }\n if (Array.isArray(content)) {\n const texts: string[] = [];\n for (const block of content) {\n if (typeof block === \"string\") {\n texts.push(block);\n } else if (block && typeof block === \"object\" && \"text\" in block) {\n texts.push(String((block as { text: unknown }).text));\n }\n }\n return texts.join(\"\\n\");\n }\n }\n }\n return \"\";\n}\n\n/**\n * Extract tool calls from assistant messages.\n * Tool calls appear as content blocks with type \"tool_use\" in assistant messages.\n */\nfunction extractToolCalls(messages: unknown[]): ToolCall[] {\n const toolCalls: ToolCall[] = [];\n\n for (const msg of messages) {\n const message = msg as { role?: string; content?: unknown };\n if (message.role !== \"assistant\") continue;\n\n const content = message.content;\n if (!Array.isArray(content)) continue;\n\n for (const block of content) {\n const b = block as { type?: string; name?: string; input?: unknown };\n if (b.type === \"tool_use\" && b.name) {\n toolCalls.push({\n name: b.name,\n args: (b.input as Record<string, unknown>) ?? {},\n });\n }\n }\n }\n\n return toolCalls;\n}\n\n/**\n * Convert Pi session stats to our format\n */\nfunction convertStats(piStats: PiSessionStats): SessionStats {\n return {\n tokens: {\n input: piStats.tokens.input,\n output: piStats.tokens.output,\n total: piStats.tokens.total,\n },\n cost: piStats.cost,\n };\n}\n\n/**\n * Create a temporary workspace with optional setup\n */\nexport async function createWorkspace(setup?: TestSetup): Promise<string> {\n const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), \"pi-eval-\"));\n\n if (setup?.files) {\n for (const [filePath, content] of Object.entries(setup.files)) {\n const fullPath = path.join(tmpDir, filePath);\n await fs.mkdir(path.dirname(fullPath), { recursive: true });\n await fs.writeFile(fullPath, content, \"utf-8\");\n }\n }\n\n if (setup?.commands) {\n const { exec } = await import(\"node:child_process\");\n const { promisify } = await import(\"node:util\");\n const execAsync = promisify(exec);\n\n for (const cmd of setup.commands) {\n await execAsync(cmd, { cwd: tmpDir });\n }\n }\n\n return tmpDir;\n}\n\n/**\n * Clean up a workspace directory\n */\nexport async function cleanupWorkspace(cwd: string): Promise<void> {\n try {\n await fs.rm(cwd, { recursive: true, force: true });\n } catch {\n // Ignore cleanup errors\n }\n}\n","/**\n * Eval runner - orchestrates sequential execution\n */\n\nimport { discoverEvals } from \"./discovery\";\nimport { cleanupWorkspace, runPiTask } from \"./task\";\nimport type {\n CliOptions,\n EvalDefinition,\n EvalRunSummary,\n GlobalConfig,\n ScoreContext,\n TestResult,\n} from \"./types\";\n\n/**\n * Run all discovered evals\n */\nexport async function runEvals(\n config: Required<GlobalConfig>,\n options: CliOptions,\n): Promise<EvalRunSummary> {\n const startTime = Date.now();\n\n // Discover eval files\n const evals = await discoverEvals(config.evalsDir);\n\n if (evals.length === 0) {\n return {\n results: [],\n total: 0,\n passed: 0,\n failed: 0,\n duration: Date.now() - startTime,\n totalTokens: 0,\n totalCost: 0,\n };\n }\n\n // Filter evals if requested\n const filterPattern = options.filter;\n const filteredEvals = filterPattern\n ? evals.filter((e) => e.name.includes(filterPattern))\n : evals;\n\n // Count total test cases\n const totalCases = filteredEvals.reduce(\n (sum, e) => sum + e.options.data.length,\n 0,\n );\n\n // Warn if too many test cases\n if (totalCases > config.warnTestCount) {\n console.warn(\n `Warning: ${totalCases} test cases. This may take a while and hit rate limits.`,\n );\n }\n\n const results: TestResult[] = [];\n let totalTokens = 0;\n let totalCost = 0;\n\n // Run evals sequentially\n for (const evalDef of filteredEvals) {\n const evalResults = await runSingleEval(evalDef, config, options);\n results.push(...evalResults);\n\n for (const result of evalResults) {\n totalTokens += result.tokens.total;\n totalCost += result.cost;\n }\n }\n\n const passed = results.filter((r) => r.passed).length;\n\n return {\n results,\n total: results.length,\n passed,\n failed: results.length - passed,\n duration: Date.now() - startTime,\n totalTokens,\n totalCost,\n };\n}\n\n/**\n * Run a single eval (all its test cases)\n */\nasync function runSingleEval(\n evalDef: EvalDefinition,\n config: Required<GlobalConfig>,\n options: CliOptions,\n): Promise<TestResult[]> {\n const results: TestResult[] = [];\n const { name, options: evalOptions } = evalDef;\n\n // Check for .only test cases\n const onlyCases = evalOptions.data.filter((tc) => tc.only);\n const testCases = onlyCases.length > 0 ? onlyCases : evalOptions.data;\n\n // Filter out skipped cases\n const runnableCases = testCases.filter((tc) => !tc.skip);\n\n for (let i = 0; i < runnableCases.length; i++) {\n const testCase = runnableCases[i];\n const startTime = Date.now();\n let cwd = \"\";\n\n try {\n // Merge config with defaults, then apply CLI/env overrides\n const piConfig = {\n ...config.defaults,\n ...evalOptions.config,\n // CLI/env overrides take precedence\n ...(options.model && { model: options.model }),\n ...(options.provider && { provider: options.provider }),\n };\n\n // Run the pi task\n const timeout = testCase.timeout ?? evalOptions.timeout ?? config.timeout;\n const taskResult = await runPiTask(\n testCase.input,\n piConfig,\n testCase.setup,\n timeout,\n );\n\n cwd = taskResult.cwd; // Save for cleanup and scorers\n\n // Build scorer context\n const ctx: ScoreContext = {\n input: testCase.input,\n output: taskResult.output,\n expected: testCase.expected,\n cwd,\n messages: taskResult.messages as never[],\n toolCalls: taskResult.toolCalls,\n stats: taskResult.stats,\n };\n\n // Run all scorers\n const scores = await Promise.all(\n evalOptions.scorers.map((scorer) => scorer.score(ctx)),\n );\n\n // Test passes if all scores >= 0.5\n const passed = scores.every((s) => s.score >= 0.5);\n\n results.push({\n evalName: name,\n input: testCase.input,\n scores,\n passed,\n duration: Date.now() - startTime,\n tokens: taskResult.stats.tokens,\n cost: taskResult.stats.cost,\n });\n\n if (options.verbose) {\n const status = passed ? \"PASS\" : \"FAIL\";\n console.log(` [${status}] ${truncate(testCase.input, 50)}`);\n }\n } catch (err) {\n results.push({\n evalName: name,\n input: testCase.input,\n scores: [],\n passed: false,\n duration: Date.now() - startTime,\n tokens: { input: 0, output: 0, total: 0 },\n cost: 0,\n error: (err as Error).message,\n });\n\n if (options.verbose) {\n console.log(` [ERROR] ${truncate(testCase.input, 50)}`);\n console.log(` ${(err as Error).message}`);\n }\n } finally {\n // Clean up workspace if we have one\n if (cwd) {\n await cleanupWorkspace(cwd);\n }\n }\n\n // Delay between tests (rate limiting)\n if (i < runnableCases.length - 1) {\n await sleep(config.delayBetweenTests);\n }\n }\n\n return results;\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => setTimeout(resolve, ms));\n}\n\nfunction truncate(str: string, maxLen: number): string {\n const oneLine = str.replace(/\\n/g, \" \").trim();\n if (oneLine.length <= maxLen) return oneLine;\n return `${oneLine.slice(0, maxLen - 3)}...`;\n}\n","#!/usr/bin/env node\n/**\n * CLI entry point for pi-evals\n */\nimport { loadConfig } from \"./config\";\nimport { printJson, printResults } from \"./reporter\";\nimport { runEvals } from \"./runner\";\nimport type { CliOptions } from \"./types\";\n\nasync function main(): Promise<void> {\n const options = parseArgs(process.argv.slice(2));\n\n if (options.help) {\n printHelp();\n process.exit(0);\n }\n\n // Load config\n const config = await loadConfig(options.config);\n\n console.log(`Running evals from ${config.evalsDir}...`);\n\n // Run evals\n const summary = await runEvals(config, options);\n\n // Output results\n if (options.json) {\n printJson(summary);\n } else {\n printResults(summary);\n }\n\n // Check threshold\n if (options.threshold !== undefined) {\n const passRate =\n summary.total > 0 ? (summary.passed / summary.total) * 100 : 0;\n if (passRate < options.threshold) {\n console.log(\n `\\nFailed: pass rate ${passRate.toFixed(0)}% < threshold ${options.threshold}%`,\n );\n process.exit(1);\n }\n }\n\n // Exit with error if any tests failed\n if (summary.failed > 0) {\n process.exit(1);\n }\n}\n\ninterface ParsedOptions extends CliOptions {\n help?: boolean;\n model?: string;\n provider?: string;\n}\n\nfunction parseArgs(args: string[]): ParsedOptions {\n const options: ParsedOptions = {};\n\n for (let i = 0; i < args.length; i++) {\n const arg = args[i];\n\n if (arg === \"--help\" || arg === \"-h\") {\n options.help = true;\n } else if (arg === \"--json\") {\n options.json = true;\n } else if (arg === \"--verbose\" || arg === \"-v\") {\n options.verbose = true;\n } else if (arg === \"--filter\" || arg === \"-f\") {\n options.filter = args[++i];\n } else if (arg.startsWith(\"--filter=\")) {\n options.filter = arg.split(\"=\")[1];\n } else if (arg === \"--threshold\" || arg === \"-t\") {\n options.threshold = parseInt(args[++i], 10);\n } else if (arg.startsWith(\"--threshold=\")) {\n options.threshold = parseInt(arg.split(\"=\")[1], 10);\n } else if (arg === \"--config\" || arg === \"-c\") {\n options.config = args[++i];\n } else if (arg.startsWith(\"--config=\")) {\n options.config = arg.split(\"=\")[1];\n } else if (arg === \"--model\" || arg === \"-m\") {\n options.model = args[++i];\n } else if (arg.startsWith(\"--model=\")) {\n options.model = arg.split(\"=\")[1];\n } else if (arg === \"--provider\" || arg === \"-p\") {\n options.provider = args[++i];\n } else if (arg.startsWith(\"--provider=\")) {\n options.provider = arg.split(\"=\")[1];\n }\n }\n\n // Environment variable overrides (lower priority than CLI args)\n options.model = options.model ?? process.env.PI_EVAL_MODEL;\n options.provider = options.provider ?? process.env.PI_EVAL_PROVIDER;\n\n return options;\n}\n\nfunction printHelp(): void {\n console.log(`\npi-evals - Eval framework for pi coding agent\n\nUsage:\n pi-evals [options]\n\nOptions:\n -h, --help Show this help message\n -f, --filter <pattern> Filter evals by name substring\n -t, --threshold <pct> Minimum pass percentage to exit 0\n -c, --config <path> Config file path (default: pi-evals.config.ts)\n -m, --model <model> Override model (also: PI_EVAL_MODEL env var)\n -p, --provider <name> Override provider (also: PI_EVAL_PROVIDER env var)\n -v, --verbose Show detailed output during run\n --json Output results as JSON\n\nExamples:\n pi-evals # Run all evals\n pi-evals --filter \"file-creation\" # Run matching evals\n pi-evals --threshold 80 # Fail if < 80% pass\n pi-evals --json > results.json # JSON output for CI\n pi-evals -p github -m gpt-4o # Use GitHub Models\n PI_EVAL_PROVIDER=github PI_EVAL_MODEL=gpt-4o pi-evals # Via env vars\n`);\n}\n\nmain().catch((err) => {\n console.error(\"Error:\", err.message);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAQO,SAAS,aAAa,SAA+B;AAC1D,QAAM,EAAE,SAAS,OAAO,QAAQ,UAAU,aAAa,UAAU,IAAI;AAErE,MAAI,UAAU,GAAG;AACf,YAAQ,IAAI,kBAAkB;AAC9B;AAAA,EACF;AAEA,UAAQ,IAAI;AAGZ,QAAM,SAAS,QAAQ,SAAS,CAAC,MAAM,EAAE,QAAQ;AAEjD,aAAW,CAAC,UAAU,WAAW,KAAK,OAAO,QAAQ,MAAM,GAAG;AAC5D,UAAM,aAAa,YAAY,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AACvD,UAAM,YAAY,YAAY;AAC9B,UAAM,cAAc,eAAe,YAAY,SAAS;AACxD,UAAM,aAAa,eAAe,YAAY,MAAM;AAEpD,YAAQ,IAAI,GAAG,UAAU,IAAI,QAAQ,KAAK,UAAU,IAAI,SAAS,GAAG;AAEpE,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,OAAO,SAAS,MAAM;AACnC,YAAM,OAAO,eAAe,OAAO,QAAQ;AAC3C,YAAM,OAAO,WAAW,OAAO,IAAI;AACnC,YAAM,SAAS,OAAO,OAAO;AAE7B,cAAQ;AAAA,QACN,KAAK,IAAI,IAAI,SAAS,OAAO,OAAO,EAAE,CAAC,KAAK,IAAI,KAAK,IAAI,KAAK,MAAM;AAAA,MACtE;AAGA,iBAAW,SAAS,OAAO,QAAQ;AACjC,cAAM,YAAY,MAAM,SAAS,MAAM,MAAM;AAC7C,cAAM,cAAc,MAAM,QAAQ,KAAK,QAAQ,CAAC;AAChD,gBAAQ,IAAI,OAAO,SAAS,IAAI,MAAM,IAAI,KAAK,UAAU,GAAG;AAC5D,YAAI,MAAM,UAAU,MAAM,QAAQ,GAAG;AAEnC,gBAAM,cAAc,MAAM,OAAO,MAAM,IAAI,EAAE,MAAM,GAAG,CAAC;AACvD,qBAAW,QAAQ,aAAa;AAC9B,oBAAQ,IAAI,SAAS,IAAI,EAAE;AAAA,UAC7B;AAAA,QACF;AAAA,MACF;AAGA,UAAI,OAAO,OAAO;AAChB,gBAAQ,IAAI,gBAAgB,OAAO,KAAK,EAAE;AAAA,MAC5C;AAAA,IACF;AAEA,YAAQ,IAAI;AAAA,EACd;AAGA,UAAQ,IAAI,SAAI,OAAO,EAAE,CAAC;AAC1B,QAAM,WAAW,QAAQ,KAAM,SAAS,QAAS,KAAK,QAAQ,CAAC,IAAI;AACnE,UAAQ,IAAI,YAAY,MAAM,IAAI,KAAK,YAAY,QAAQ,IAAI;AAC/D,UAAQ;AAAA,IACN,UAAU,WAAW,SAAS,CAAC,KAAK,WAAW,YAAY,eAAe,QAAQ,CAAC;AAAA,EACrF;AACF;AAKO,SAAS,UAAU,SAA+B;AACvD,UAAQ,IAAI,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAC9C;AAKA,SAAS,eAAe,IAAoB;AAC1C,MAAI,KAAK,IAAM,QAAO,GAAG,EAAE;AAC3B,QAAM,UAAU,KAAK;AACrB,MAAI,UAAU,GAAI,QAAO,GAAG,QAAQ,QAAQ,CAAC,CAAC;AAC9C,QAAM,UAAU,KAAK,MAAM,UAAU,EAAE;AACvC,QAAM,oBAAoB,UAAU,IAAI,QAAQ,CAAC;AACjD,SAAO,GAAG,OAAO,IAAI,gBAAgB;AACvC;AAKA,SAAS,WAAW,MAAsB;AACxC,MAAI,OAAO,KAAM,QAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AAC3C,MAAI,OAAO,EAAG,QAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AACxC,SAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AAC5B;AAKA,SAAS,SAAS,KAAa,QAAwB;AACrD,QAAM,UAAU,IAAI,QAAQ,OAAO,GAAG,EAAE,KAAK;AAC7C,MAAI,QAAQ,UAAU,OAAQ,QAAO;AACrC,SAAO,GAAG,QAAQ,MAAM,GAAG,SAAS,CAAC,CAAC;AACxC;AAKA,SAAS,QACP,OACA,OACqB;AACrB,QAAM,SAA8B,CAAC;AACrC,aAAW,QAAQ,OAAO;AACxB,UAAM,MAAM,MAAM,IAAI;AACtB,QAAI,CAAC,OAAO,GAAG,EAAG,QAAO,GAAG,IAAI,CAAC;AACjC,WAAO,GAAG,EAAE,KAAK,IAAI;AAAA,EACvB;AACA,SAAO;AACT;;;ACtHA,YAAY,QAAQ;AACpB,YAAY,QAAQ;AACpB,YAAY,UAAU;AACtB,SAAS,gBAAoC;AAC7C;AAAA,EACE;AAAA,OAEK;AAsBP,eAAsB,UACpB,OACA,QACA,OACA,SACqB;AAErB,QAAM,MAAM,MAAM,gBAAgB,KAAK;AAGvC,QAAM,QAAQ;AAAA,IACZ,OAAO;AAAA,IACP,OAAO;AAAA,EACT;AAGA,QAAM,EAAE,QAAQ,IAAI,MAAM,mBAAmB;AAAA,IAC3C;AAAA,IACA;AAAA;AAAA,EAEF,CAAC;AAED,MAAI;AAEF,UAAM,YAAY,WAAW;AAC7B,UAAM,iBAAiB,IAAI,QAAe,CAAC,GAAG,WAAW;AACvD;AAAA,QACE,MAAM,OAAO,IAAI,MAAM,wBAAwB,SAAS,IAAI,CAAC;AAAA,QAC7D;AAAA,MACF;AAAA,IACF,CAAC;AAGD,UAAM,oBAAoB,IAAI,QAAc,CAAC,YAAY;AACvD,cAAQ,UAAU,CAAC,UAAU;AAC3B,YAAI,MAAM,SAAS,aAAa;AAC9B,kBAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAGD,UAAM,QAAQ,OAAO,KAAK;AAG1B,UAAM,QAAQ,KAAK,CAAC,mBAAmB,cAAc,CAAC;AAGtD,UAAM,WAAW,QAAQ;AACzB,UAAM,UAAU,QAAQ,gBAAgB;AAGxC,UAAM,SAAS,yBAAyB,QAAQ;AAGhD,UAAM,YAAY,iBAAiB,QAAQ;AAG3C,UAAM,QAAQ,aAAa,OAAO;AAElC,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF,UAAE;AAEA,YAAQ,QAAQ;AAAA,EAClB;AACF;AAKA,SAAS,yBAAyB,UAA6B;AAE7D,WAAS,IAAI,SAAS,SAAS,GAAG,KAAK,GAAG,KAAK;AAC7C,UAAM,MAAM,SAAS,CAAC;AACtB,QAAI,IAAI,SAAS,aAAa;AAE5B,YAAM,UAAU,IAAI;AACpB,UAAI,OAAO,YAAY,UAAU;AAC/B,eAAO;AAAA,MACT;AACA,UAAI,MAAM,QAAQ,OAAO,GAAG;AAC1B,cAAM,QAAkB,CAAC;AACzB,mBAAW,SAAS,SAAS;AAC3B,cAAI,OAAO,UAAU,UAAU;AAC7B,kBAAM,KAAK,KAAK;AAAA,UAClB,WAAW,SAAS,OAAO,UAAU,YAAY,UAAU,OAAO;AAChE,kBAAM,KAAK,OAAQ,MAA4B,IAAI,CAAC;AAAA,UACtD;AAAA,QACF;AACA,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAMA,SAAS,iBAAiB,UAAiC;AACzD,QAAM,YAAwB,CAAC;AAE/B,aAAW,OAAO,UAAU;AAC1B,UAAM,UAAU;AAChB,QAAI,QAAQ,SAAS,YAAa;AAElC,UAAM,UAAU,QAAQ;AACxB,QAAI,CAAC,MAAM,QAAQ,OAAO,EAAG;AAE7B,eAAW,SAAS,SAAS;AAC3B,YAAM,IAAI;AACV,UAAI,EAAE,SAAS,cAAc,EAAE,MAAM;AACnC,kBAAU,KAAK;AAAA,UACb,MAAM,EAAE;AAAA,UACR,MAAO,EAAE,SAAqC,CAAC;AAAA,QACjD,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,aAAa,SAAuC;AAC3D,SAAO;AAAA,IACL,QAAQ;AAAA,MACN,OAAO,QAAQ,OAAO;AAAA,MACtB,QAAQ,QAAQ,OAAO;AAAA,MACvB,OAAO,QAAQ,OAAO;AAAA,IACxB;AAAA,IACA,MAAM,QAAQ;AAAA,EAChB;AACF;AAKA,eAAsB,gBAAgB,OAAoC;AACxE,QAAM,SAAS,MAAS,WAAa,UAAQ,UAAO,GAAG,UAAU,CAAC;AAElE,MAAI,OAAO,OAAO;AAChB,eAAW,CAAC,UAAU,OAAO,KAAK,OAAO,QAAQ,MAAM,KAAK,GAAG;AAC7D,YAAM,WAAgB,UAAK,QAAQ,QAAQ;AAC3C,YAAS,SAAW,aAAQ,QAAQ,GAAG,EAAE,WAAW,KAAK,CAAC;AAC1D,YAAS,aAAU,UAAU,SAAS,OAAO;AAAA,IAC/C;AAAA,EACF;AAEA,MAAI,OAAO,UAAU;AACnB,UAAM,EAAE,KAAK,IAAI,MAAM,OAAO,eAAoB;AAClD,UAAM,EAAE,UAAU,IAAI,MAAM,OAAO,MAAW;AAC9C,UAAM,YAAY,UAAU,IAAI;AAEhC,eAAW,OAAO,MAAM,UAAU;AAChC,YAAM,UAAU,KAAK,EAAE,KAAK,OAAO,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO;AACT;AAKA,eAAsB,iBAAiB,KAA4B;AACjE,MAAI;AACF,UAAS,MAAG,KAAK,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;AAAA,EACnD,QAAQ;AAAA,EAER;AACF;;;ACnMA,eAAsB,SACpB,QACA,SACyB;AACzB,QAAM,YAAY,KAAK,IAAI;AAG3B,QAAM,QAAQ,MAAM,cAAc,OAAO,QAAQ;AAEjD,MAAI,MAAM,WAAW,GAAG;AACtB,WAAO;AAAA,MACL,SAAS,CAAC;AAAA,MACV,OAAO;AAAA,MACP,QAAQ;AAAA,MACR,QAAQ;AAAA,MACR,UAAU,KAAK,IAAI,IAAI;AAAA,MACvB,aAAa;AAAA,MACb,WAAW;AAAA,IACb;AAAA,EACF;AAGA,QAAM,gBAAgB,QAAQ;AAC9B,QAAM,gBAAgB,gBAClB,MAAM,OAAO,CAAC,MAAM,EAAE,KAAK,SAAS,aAAa,CAAC,IAClD;AAGJ,QAAM,aAAa,cAAc;AAAA,IAC/B,CAAC,KAAK,MAAM,MAAM,EAAE,QAAQ,KAAK;AAAA,IACjC;AAAA,EACF;AAGA,MAAI,aAAa,OAAO,eAAe;AACrC,YAAQ;AAAA,MACN,YAAY,UAAU;AAAA,IACxB;AAAA,EACF;AAEA,QAAM,UAAwB,CAAC;AAC/B,MAAI,cAAc;AAClB,MAAI,YAAY;AAGhB,aAAW,WAAW,eAAe;AACnC,UAAM,cAAc,MAAM,cAAc,SAAS,QAAQ,OAAO;AAChE,YAAQ,KAAK,GAAG,WAAW;AAE3B,eAAW,UAAU,aAAa;AAChC,qBAAe,OAAO,OAAO;AAC7B,mBAAa,OAAO;AAAA,IACtB;AAAA,EACF;AAEA,QAAM,SAAS,QAAQ,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAE/C,SAAO;AAAA,IACL;AAAA,IACA,OAAO,QAAQ;AAAA,IACf;AAAA,IACA,QAAQ,QAAQ,SAAS;AAAA,IACzB,UAAU,KAAK,IAAI,IAAI;AAAA,IACvB;AAAA,IACA;AAAA,EACF;AACF;AAKA,eAAe,cACb,SACA,QACA,SACuB;AACvB,QAAM,UAAwB,CAAC;AAC/B,QAAM,EAAE,MAAM,SAAS,YAAY,IAAI;AAGvC,QAAM,YAAY,YAAY,KAAK,OAAO,CAAC,OAAO,GAAG,IAAI;AACzD,QAAM,YAAY,UAAU,SAAS,IAAI,YAAY,YAAY;AAGjE,QAAM,gBAAgB,UAAU,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI;AAEvD,WAAS,IAAI,GAAG,IAAI,cAAc,QAAQ,KAAK;AAC7C,UAAM,WAAW,cAAc,CAAC;AAChC,UAAM,YAAY,KAAK,IAAI;AAC3B,QAAI,MAAM;AAEV,QAAI;AAEF,YAAM,WAAW;AAAA,QACf,GAAG,OAAO;AAAA,QACV,GAAG,YAAY;AAAA;AAAA,QAEf,GAAI,QAAQ,SAAS,EAAE,OAAO,QAAQ,MAAM;AAAA,QAC5C,GAAI,QAAQ,YAAY,EAAE,UAAU,QAAQ,SAAS;AAAA,MACvD;AAGA,YAAM,UAAU,SAAS,WAAW,YAAY,WAAW,OAAO;AAClE,YAAM,aAAa,MAAM;AAAA,QACvB,SAAS;AAAA,QACT;AAAA,QACA,SAAS;AAAA,QACT;AAAA,MACF;AAEA,YAAM,WAAW;AAGjB,YAAM,MAAoB;AAAA,QACxB,OAAO,SAAS;AAAA,QAChB,QAAQ,WAAW;AAAA,QACnB,UAAU,SAAS;AAAA,QACnB;AAAA,QACA,UAAU,WAAW;AAAA,QACrB,WAAW,WAAW;AAAA,QACtB,OAAO,WAAW;AAAA,MACpB;AAGA,YAAM,SAAS,MAAM,QAAQ;AAAA,QAC3B,YAAY,QAAQ,IAAI,CAAC,WAAW,OAAO,MAAM,GAAG,CAAC;AAAA,MACvD;AAGA,YAAM,SAAS,OAAO,MAAM,CAAC,MAAM,EAAE,SAAS,GAAG;AAEjD,cAAQ,KAAK;AAAA,QACX,UAAU;AAAA,QACV,OAAO,SAAS;AAAA,QAChB;AAAA,QACA;AAAA,QACA,UAAU,KAAK,IAAI,IAAI;AAAA,QACvB,QAAQ,WAAW,MAAM;AAAA,QACzB,MAAM,WAAW,MAAM;AAAA,MACzB,CAAC;AAED,UAAI,QAAQ,SAAS;AACnB,cAAM,SAAS,SAAS,SAAS;AACjC,gBAAQ,IAAI,MAAM,MAAM,KAAKA,UAAS,SAAS,OAAO,EAAE,CAAC,EAAE;AAAA,MAC7D;AAAA,IACF,SAAS,KAAK;AACZ,cAAQ,KAAK;AAAA,QACX,UAAU;AAAA,QACV,OAAO,SAAS;AAAA,QAChB,QAAQ,CAAC;AAAA,QACT,QAAQ;AAAA,QACR,UAAU,KAAK,IAAI,IAAI;AAAA,QACvB,QAAQ,EAAE,OAAO,GAAG,QAAQ,GAAG,OAAO,EAAE;AAAA,QACxC,MAAM;AAAA,QACN,OAAQ,IAAc;AAAA,MACxB,CAAC;AAED,UAAI,QAAQ,SAAS;AACnB,gBAAQ,IAAI,aAAaA,UAAS,SAAS,OAAO,EAAE,CAAC,EAAE;AACvD,gBAAQ,IAAI,OAAQ,IAAc,OAAO,EAAE;AAAA,MAC7C;AAAA,IACF,UAAE;AAEA,UAAI,KAAK;AACP,cAAM,iBAAiB,GAAG;AAAA,MAC5B;AAAA,IACF;AAGA,QAAI,IAAI,cAAc,SAAS,GAAG;AAChC,YAAM,MAAM,OAAO,iBAAiB;AAAA,IACtC;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,EAAE,CAAC;AACzD;AAEA,SAASA,UAAS,KAAa,QAAwB;AACrD,QAAM,UAAU,IAAI,QAAQ,OAAO,GAAG,EAAE,KAAK;AAC7C,MAAI,QAAQ,UAAU,OAAQ,QAAO;AACrC,SAAO,GAAG,QAAQ,MAAM,GAAG,SAAS,CAAC,CAAC;AACxC;;;AClMA,eAAe,OAAsB;AACnC,QAAM,UAAU,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAE/C,MAAI,QAAQ,MAAM;AAChB,cAAU;AACV,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,SAAS,MAAM,WAAW,QAAQ,MAAM;AAE9C,UAAQ,IAAI,sBAAsB,OAAO,QAAQ,KAAK;AAGtD,QAAM,UAAU,MAAM,SAAS,QAAQ,OAAO;AAG9C,MAAI,QAAQ,MAAM;AAChB,cAAU,OAAO;AAAA,EACnB,OAAO;AACL,iBAAa,OAAO;AAAA,EACtB;AAGA,MAAI,QAAQ,cAAc,QAAW;AACnC,UAAM,WACJ,QAAQ,QAAQ,IAAK,QAAQ,SAAS,QAAQ,QAAS,MAAM;AAC/D,QAAI,WAAW,QAAQ,WAAW;AAChC,cAAQ;AAAA,QACN;AAAA,oBAAuB,SAAS,QAAQ,CAAC,CAAC,iBAAiB,QAAQ,SAAS;AAAA,MAC9E;AACA,cAAQ,KAAK,CAAC;AAAA,IAChB;AAAA,EACF;AAGA,MAAI,QAAQ,SAAS,GAAG;AACtB,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAQA,SAAS,UAAU,MAA+B;AAChD,QAAM,UAAyB,CAAC;AAEhC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAElB,QAAI,QAAQ,YAAY,QAAQ,MAAM;AACpC,cAAQ,OAAO;AAAA,IACjB,WAAW,QAAQ,UAAU;AAC3B,cAAQ,OAAO;AAAA,IACjB,WAAW,QAAQ,eAAe,QAAQ,MAAM;AAC9C,cAAQ,UAAU;AAAA,IACpB,WAAW,QAAQ,cAAc,QAAQ,MAAM;AAC7C,cAAQ,SAAS,KAAK,EAAE,CAAC;AAAA,IAC3B,WAAW,IAAI,WAAW,WAAW,GAAG;AACtC,cAAQ,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACnC,WAAW,QAAQ,iBAAiB,QAAQ,MAAM;AAChD,cAAQ,YAAY,SAAS,KAAK,EAAE,CAAC,GAAG,EAAE;AAAA,IAC5C,WAAW,IAAI,WAAW,cAAc,GAAG;AACzC,cAAQ,YAAY,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC,GAAG,EAAE;AAAA,IACpD,WAAW,QAAQ,cAAc,QAAQ,MAAM;AAC7C,cAAQ,SAAS,KAAK,EAAE,CAAC;AAAA,IAC3B,WAAW,IAAI,WAAW,WAAW,GAAG;AACtC,cAAQ,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACnC,WAAW,QAAQ,aAAa,QAAQ,MAAM;AAC5C,cAAQ,QAAQ,KAAK,EAAE,CAAC;AAAA,IAC1B,WAAW,IAAI,WAAW,UAAU,GAAG;AACrC,cAAQ,QAAQ,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IAClC,WAAW,QAAQ,gBAAgB,QAAQ,MAAM;AAC/C,cAAQ,WAAW,KAAK,EAAE,CAAC;AAAA,IAC7B,WAAW,IAAI,WAAW,aAAa,GAAG;AACxC,cAAQ,WAAW,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACrC;AAAA,EACF;AAGA,UAAQ,QAAQ,QAAQ,SAAS,QAAQ,IAAI;AAC7C,UAAQ,WAAW,QAAQ,YAAY,QAAQ,IAAI;AAEnD,SAAO;AACT;AAEA,SAAS,YAAkB;AACzB,UAAQ,IAAI;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,CAuBb;AACD;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,MAAM,UAAU,IAAI,OAAO;AACnC,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":["truncate"]}
1
+ {"version":3,"sources":["../src/reporter.ts","../src/task.ts","../src/runner.ts","../src/cli.ts"],"sourcesContent":["/**\n * Reporter - console and JSON output\n */\nimport type { EvalRunSummary } from \"./types\";\n\n/**\n * Print results to console in a human-readable format\n */\nexport function printResults(summary: EvalRunSummary): void {\n const { results, total, passed, duration, totalTokens, totalCost } = summary;\n\n if (total === 0) {\n console.log(\"No eval results.\");\n return;\n }\n\n console.log();\n\n // Group results by eval name\n const byEval = groupBy(results, (r) => r.evalName);\n\n for (const [evalName, evalResults] of Object.entries(byEval)) {\n const evalPassed = evalResults.filter((r) => r.passed).length;\n const evalTotal = evalResults.length;\n const _evalStatus = evalPassed === evalTotal ? \"PASS\" : \"FAIL\";\n const statusIcon = evalPassed === evalTotal ? \"+\" : \"-\";\n\n console.log(`${statusIcon} ${evalName} (${evalPassed}/${evalTotal})`);\n\n for (const result of evalResults) {\n const icon = result.passed ? \"+\" : \"-\";\n const time = formatDuration(result.duration);\n const cost = formatCost(result.cost);\n const tokens = result.tokens.total;\n\n console.log(\n ` ${icon} ${truncate(result.input, 50)} (${time}, ${cost}, ${tokens} tok)`,\n );\n\n // Show score details\n for (const score of result.scores) {\n const scoreIcon = score.score >= 0.5 ? \"+\" : \"-\";\n const scoreValue = (score.score * 100).toFixed(0);\n console.log(` ${scoreIcon} ${score.name}: ${scoreValue}%`);\n if (score.reason && score.score < 1) {\n // Show reason for partial/failed scores\n const reasonLines = score.reason.split(\"\\n\").slice(0, 3);\n for (const line of reasonLines) {\n console.log(` ${line}`);\n }\n }\n }\n\n // Show error if present\n if (result.error) {\n console.log(` ! Error: ${result.error}`);\n }\n }\n\n console.log();\n }\n\n // Summary line\n console.log(\"─\".repeat(50));\n const passRate = total > 0 ? ((passed / total) * 100).toFixed(0) : 0;\n console.log(`Results: ${passed}/${total} passed (${passRate}%)`);\n console.log(\n `Total: ${formatCost(totalCost)}, ${totalTokens} tokens, ${formatDuration(duration)}`,\n );\n}\n\n/**\n * Print results as JSON\n */\nexport function printJson(summary: EvalRunSummary): void {\n console.log(JSON.stringify(summary, null, 2));\n}\n\n/**\n * Format duration in human-readable form\n */\nfunction formatDuration(ms: number): string {\n if (ms < 1000) return `${ms}ms`;\n const seconds = ms / 1000;\n if (seconds < 60) return `${seconds.toFixed(1)}s`;\n const minutes = Math.floor(seconds / 60);\n const remainingSeconds = (seconds % 60).toFixed(0);\n return `${minutes}m${remainingSeconds}s`;\n}\n\n/**\n * Format cost in USD\n */\nfunction formatCost(cost: number): string {\n if (cost < 0.01) return `$${cost.toFixed(4)}`;\n if (cost < 1) return `$${cost.toFixed(3)}`;\n return `$${cost.toFixed(2)}`;\n}\n\n/**\n * Truncate string to max length\n */\nfunction truncate(str: string, maxLen: number): string {\n const oneLine = str.replace(/\\n/g, \" \").trim();\n if (oneLine.length <= maxLen) return oneLine;\n return `${oneLine.slice(0, maxLen - 3)}...`;\n}\n\n/**\n * Group array items by key\n */\nfunction groupBy<T>(\n items: T[],\n keyFn: (item: T) => string,\n): Record<string, T[]> {\n const result: Record<string, T[]> = {};\n for (const item of items) {\n const key = keyFn(item);\n if (!result[key]) result[key] = [];\n result[key].push(item);\n }\n return result;\n}\n","/**\n * Pi task execution via createAgentSession SDK\n */\n\nimport * as fs from \"node:fs/promises\";\nimport * as os from \"node:os\";\nimport * as path from \"node:path\";\nimport { getModel, type KnownProvider } from \"@mariozechner/pi-ai\";\nimport {\n createAgentSession,\n DefaultResourceLoader,\n type SessionStats as PiSessionStats,\n SessionManager,\n} from \"@mariozechner/pi-coding-agent\";\nimport type { PiConfig, SessionStats, TestSetup, ToolCall } from \"./types\";\n\n/**\n * Result of running a pi task\n */\nexport interface TaskResult {\n /** Agent's final response text */\n output: string;\n /** Full conversation messages */\n messages: unknown[];\n /** Tool calls made during the session */\n toolCalls: ToolCall[];\n /** Session statistics */\n stats: SessionStats;\n /** Workspace directory (for scorers) */\n cwd: string;\n}\n\n/**\n * Run a pi task in a temporary workspace\n */\nexport async function runPiTask(\n input: string,\n config: PiConfig,\n setup?: TestSetup,\n timeout?: number,\n): Promise<TaskResult> {\n // Create isolated workspace\n const cwd = await createWorkspace(setup);\n\n // Get the model (cast provider to KnownProvider - will throw if invalid)\n const model = getModel(\n config.provider as KnownProvider,\n config.model as never,\n );\n\n // Create resource loader with extensions if configured\n let resourceLoader: DefaultResourceLoader | undefined;\n if (config.extensions && config.extensions.length > 0) {\n const resolvedPaths = config.extensions.map((ext) => path.resolve(ext));\n resourceLoader = new DefaultResourceLoader({\n cwd,\n additionalExtensionPaths: resolvedPaths,\n });\n await resourceLoader.reload();\n }\n\n // Create session with in-memory session manager to avoid polluting user sessions\n const { session } = await createAgentSession({\n cwd,\n model,\n sessionManager: SessionManager.inMemory(cwd),\n ...(resourceLoader ? { resourceLoader } : {}),\n });\n\n try {\n // Set up timeout\n const timeoutMs = timeout ?? 60_000;\n const timeoutPromise = new Promise<never>((_, reject) => {\n setTimeout(\n () => reject(new Error(`Task timed out after ${timeoutMs}ms`)),\n timeoutMs,\n );\n });\n\n // Subscribe to events to know when done\n const completionPromise = new Promise<void>((resolve) => {\n session.subscribe((event) => {\n if (event.type === \"agent_end\") {\n resolve();\n }\n });\n });\n\n // Send the prompt\n await session.prompt(input);\n\n // Wait for completion or timeout\n await Promise.race([completionPromise, timeoutPromise]);\n\n // Get results\n const messages = session.messages;\n const piStats = session.getSessionStats();\n\n // Extract the last assistant message text\n const output = extractLastAssistantText(messages);\n\n // Extract tool calls from messages\n const toolCalls = extractToolCalls(messages);\n\n // Convert stats\n const stats = convertStats(piStats);\n\n return {\n output,\n messages,\n toolCalls,\n stats,\n cwd,\n };\n } finally {\n // Dispose session\n session.dispose();\n }\n}\n\n/**\n * Extract the text from the last assistant message\n */\nfunction extractLastAssistantText(messages: unknown[]): string {\n // Find the last assistant message\n for (let i = messages.length - 1; i >= 0; i--) {\n const msg = messages[i] as { role?: string; content?: unknown };\n if (msg.role === \"assistant\") {\n // Extract text from content\n const content = msg.content;\n if (typeof content === \"string\") {\n return content;\n }\n if (Array.isArray(content)) {\n const texts: string[] = [];\n for (const block of content) {\n if (typeof block === \"string\") {\n texts.push(block);\n } else if (block && typeof block === \"object\" && \"text\" in block) {\n texts.push(String((block as { text: unknown }).text));\n }\n }\n return texts.join(\"\\n\");\n }\n }\n }\n return \"\";\n}\n\n/**\n * Extract tool calls from assistant messages.\n * The pi SDK uses content blocks with type \"toolCall\" (name + arguments),\n * while the raw Anthropic format uses \"tool_use\" (name + input).\n * We support both for robustness.\n */\nfunction extractToolCalls(messages: unknown[]): ToolCall[] {\n const toolCalls: ToolCall[] = [];\n\n for (const msg of messages) {\n const message = msg as { role?: string; content?: unknown };\n if (message.role !== \"assistant\") continue;\n\n const content = message.content;\n if (!Array.isArray(content)) continue;\n\n for (const block of content) {\n const b = block as {\n type?: string;\n name?: string;\n input?: unknown;\n arguments?: unknown;\n };\n if (b.type === \"toolCall\" && b.name) {\n toolCalls.push({\n name: b.name,\n args: (b.arguments as Record<string, unknown>) ?? {},\n });\n } else if (b.type === \"tool_use\" && b.name) {\n toolCalls.push({\n name: b.name,\n args: (b.input as Record<string, unknown>) ?? {},\n });\n }\n }\n }\n\n return toolCalls;\n}\n\n/**\n * Convert Pi session stats to our format\n */\nfunction convertStats(piStats: PiSessionStats): SessionStats {\n return {\n tokens: {\n input: piStats.tokens.input,\n output: piStats.tokens.output,\n total: piStats.tokens.total,\n },\n cost: piStats.cost,\n };\n}\n\n/**\n * Create a temporary workspace with optional setup\n */\nexport async function createWorkspace(setup?: TestSetup): Promise<string> {\n const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), \"pi-eval-\"));\n\n if (setup?.files) {\n for (const [filePath, content] of Object.entries(setup.files)) {\n const fullPath = path.join(tmpDir, filePath);\n await fs.mkdir(path.dirname(fullPath), { recursive: true });\n await fs.writeFile(fullPath, content, \"utf-8\");\n }\n }\n\n if (setup?.commands) {\n const { exec } = await import(\"node:child_process\");\n const { promisify } = await import(\"node:util\");\n const execAsync = promisify(exec);\n\n for (const cmd of setup.commands) {\n await execAsync(cmd, { cwd: tmpDir });\n }\n }\n\n return tmpDir;\n}\n\n/**\n * Clean up a workspace directory\n */\nexport async function cleanupWorkspace(cwd: string): Promise<void> {\n try {\n await fs.rm(cwd, { recursive: true, force: true });\n } catch {\n // Ignore cleanup errors\n }\n}\n","/**\n * Eval runner - orchestrates sequential execution\n */\n\nimport { discoverEvals } from \"./discovery\";\nimport { cleanupWorkspace, runPiTask } from \"./task\";\nimport type {\n CliOptions,\n EvalDefinition,\n EvalRunSummary,\n GlobalConfig,\n ScoreContext,\n TestResult,\n} from \"./types\";\n\n/**\n * Run all discovered evals\n */\nexport async function runEvals(\n config: Required<GlobalConfig>,\n options: CliOptions,\n): Promise<EvalRunSummary> {\n const startTime = Date.now();\n\n // Discover eval files\n const evals = await discoverEvals(config.evalsDir);\n\n if (evals.length === 0) {\n return {\n results: [],\n total: 0,\n passed: 0,\n failed: 0,\n duration: Date.now() - startTime,\n totalTokens: 0,\n totalCost: 0,\n };\n }\n\n // Filter evals if requested\n const filterPattern = options.filter;\n const filteredEvals = filterPattern\n ? evals.filter((e) => e.name.includes(filterPattern))\n : evals;\n\n // Count total test cases\n const totalCases = filteredEvals.reduce(\n (sum, e) => sum + e.options.data.length,\n 0,\n );\n\n // Warn if too many test cases\n if (totalCases > config.warnTestCount) {\n console.warn(\n `Warning: ${totalCases} test cases. This may take a while and hit rate limits.`,\n );\n }\n\n const results: TestResult[] = [];\n let totalTokens = 0;\n let totalCost = 0;\n\n // Run evals sequentially\n for (const evalDef of filteredEvals) {\n const evalResults = await runSingleEval(evalDef, config, options);\n results.push(...evalResults);\n\n for (const result of evalResults) {\n totalTokens += result.tokens.total;\n totalCost += result.cost;\n }\n }\n\n const passed = results.filter((r) => r.passed).length;\n\n return {\n results,\n total: results.length,\n passed,\n failed: results.length - passed,\n duration: Date.now() - startTime,\n totalTokens,\n totalCost,\n };\n}\n\n/**\n * Run a single eval (all its test cases)\n */\nasync function runSingleEval(\n evalDef: EvalDefinition,\n config: Required<GlobalConfig>,\n options: CliOptions,\n): Promise<TestResult[]> {\n const results: TestResult[] = [];\n const { name, options: evalOptions } = evalDef;\n\n // Check for .only test cases\n const onlyCases = evalOptions.data.filter((tc) => tc.only);\n const testCases = onlyCases.length > 0 ? onlyCases : evalOptions.data;\n\n // Filter out skipped cases\n const runnableCases = testCases.filter((tc) => !tc.skip);\n\n for (let i = 0; i < runnableCases.length; i++) {\n const testCase = runnableCases[i];\n const startTime = Date.now();\n let cwd = \"\";\n\n try {\n // Merge config with defaults, then apply CLI/env overrides\n const piConfig = {\n ...config.defaults,\n ...evalOptions.config,\n // CLI/env overrides take precedence\n ...(options.model && { model: options.model }),\n ...(options.provider && { provider: options.provider }),\n };\n\n // Run the pi task\n const timeout = testCase.timeout ?? evalOptions.timeout ?? config.timeout;\n const taskResult = await runPiTask(\n testCase.input,\n piConfig,\n testCase.setup,\n timeout,\n );\n\n cwd = taskResult.cwd; // Save for cleanup and scorers\n\n // Build scorer context\n const ctx: ScoreContext = {\n input: testCase.input,\n output: taskResult.output,\n expected: testCase.expected,\n cwd,\n messages: taskResult.messages as never[],\n toolCalls: taskResult.toolCalls,\n stats: taskResult.stats,\n };\n\n // Run all scorers\n const scores = await Promise.all(\n evalOptions.scorers.map((scorer) => scorer.score(ctx)),\n );\n\n // Test passes if all scores >= 0.5\n const passed = scores.every((s) => s.score >= 0.5);\n\n results.push({\n evalName: name,\n input: testCase.input,\n scores,\n passed,\n duration: Date.now() - startTime,\n tokens: taskResult.stats.tokens,\n cost: taskResult.stats.cost,\n });\n\n if (options.verbose) {\n const status = passed ? \"PASS\" : \"FAIL\";\n console.log(` [${status}] ${truncate(testCase.input, 50)}`);\n }\n } catch (err) {\n results.push({\n evalName: name,\n input: testCase.input,\n scores: [],\n passed: false,\n duration: Date.now() - startTime,\n tokens: { input: 0, output: 0, total: 0 },\n cost: 0,\n error: (err as Error).message,\n });\n\n if (options.verbose) {\n console.log(` [ERROR] ${truncate(testCase.input, 50)}`);\n console.log(` ${(err as Error).message}`);\n }\n } finally {\n // Clean up workspace if we have one\n if (cwd) {\n await cleanupWorkspace(cwd);\n }\n }\n\n // Delay between tests (rate limiting)\n if (i < runnableCases.length - 1) {\n await sleep(config.delayBetweenTests);\n }\n }\n\n return results;\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => setTimeout(resolve, ms));\n}\n\nfunction truncate(str: string, maxLen: number): string {\n const oneLine = str.replace(/\\n/g, \" \").trim();\n if (oneLine.length <= maxLen) return oneLine;\n return `${oneLine.slice(0, maxLen - 3)}...`;\n}\n","#!/usr/bin/env node\n/**\n * CLI entry point for pi-evals\n */\nimport { loadConfig } from \"./config\";\nimport { printJson, printResults } from \"./reporter\";\nimport { runEvals } from \"./runner\";\nimport type { CliOptions } from \"./types\";\n\nasync function main(): Promise<void> {\n const options = parseArgs(process.argv.slice(2));\n\n if (options.help) {\n printHelp();\n process.exit(0);\n }\n\n // Load config\n const config = await loadConfig(options.config);\n\n console.log(`Running evals from ${config.evalsDir}...`);\n\n // Run evals\n const summary = await runEvals(config, options);\n\n // Output results\n if (options.json) {\n printJson(summary);\n } else {\n printResults(summary);\n }\n\n // Check threshold\n if (options.threshold !== undefined) {\n const passRate =\n summary.total > 0 ? (summary.passed / summary.total) * 100 : 0;\n if (passRate < options.threshold) {\n console.log(\n `\\nFailed: pass rate ${passRate.toFixed(0)}% < threshold ${options.threshold}%`,\n );\n process.exit(1);\n }\n }\n\n // Exit with error if any tests failed\n if (summary.failed > 0) {\n process.exit(1);\n }\n}\n\ninterface ParsedOptions extends CliOptions {\n help?: boolean;\n model?: string;\n provider?: string;\n}\n\nfunction parseArgs(args: string[]): ParsedOptions {\n const options: ParsedOptions = {};\n\n for (let i = 0; i < args.length; i++) {\n const arg = args[i];\n\n if (arg === \"--help\" || arg === \"-h\") {\n options.help = true;\n } else if (arg === \"--json\") {\n options.json = true;\n } else if (arg === \"--verbose\" || arg === \"-v\") {\n options.verbose = true;\n } else if (arg === \"--filter\" || arg === \"-f\") {\n options.filter = args[++i];\n } else if (arg.startsWith(\"--filter=\")) {\n options.filter = arg.split(\"=\")[1];\n } else if (arg === \"--threshold\" || arg === \"-t\") {\n options.threshold = parseInt(args[++i], 10);\n } else if (arg.startsWith(\"--threshold=\")) {\n options.threshold = parseInt(arg.split(\"=\")[1], 10);\n } else if (arg === \"--config\" || arg === \"-c\") {\n options.config = args[++i];\n } else if (arg.startsWith(\"--config=\")) {\n options.config = arg.split(\"=\")[1];\n } else if (arg === \"--model\" || arg === \"-m\") {\n options.model = args[++i];\n } else if (arg.startsWith(\"--model=\")) {\n options.model = arg.split(\"=\")[1];\n } else if (arg === \"--provider\" || arg === \"-p\") {\n options.provider = args[++i];\n } else if (arg.startsWith(\"--provider=\")) {\n options.provider = arg.split(\"=\")[1];\n }\n }\n\n // Environment variable overrides (lower priority than CLI args)\n options.model = options.model ?? process.env.PI_EVAL_MODEL;\n options.provider = options.provider ?? process.env.PI_EVAL_PROVIDER;\n\n return options;\n}\n\nfunction printHelp(): void {\n console.log(`\npi-evals - Eval framework for pi coding agent\n\nUsage:\n pi-evals [options]\n\nOptions:\n -h, --help Show this help message\n -f, --filter <pattern> Filter evals by name substring\n -t, --threshold <pct> Minimum pass percentage to exit 0\n -c, --config <path> Config file path (default: pi-evals.config.ts)\n -m, --model <model> Override model (also: PI_EVAL_MODEL env var)\n -p, --provider <name> Override provider (also: PI_EVAL_PROVIDER env var)\n -v, --verbose Show detailed output during run\n --json Output results as JSON\n\nExamples:\n pi-evals # Run all evals\n pi-evals --filter \"file-creation\" # Run matching evals\n pi-evals --threshold 80 # Fail if < 80% pass\n pi-evals --json > results.json # JSON output for CI\n pi-evals -p github -m gpt-4o # Use GitHub Models\n PI_EVAL_PROVIDER=github PI_EVAL_MODEL=gpt-4o pi-evals # Via env vars\n`);\n}\n\nmain().catch((err) => {\n console.error(\"Error:\", err.message);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAQO,SAAS,aAAa,SAA+B;AAC1D,QAAM,EAAE,SAAS,OAAO,QAAQ,UAAU,aAAa,UAAU,IAAI;AAErE,MAAI,UAAU,GAAG;AACf,YAAQ,IAAI,kBAAkB;AAC9B;AAAA,EACF;AAEA,UAAQ,IAAI;AAGZ,QAAM,SAAS,QAAQ,SAAS,CAAC,MAAM,EAAE,QAAQ;AAEjD,aAAW,CAAC,UAAU,WAAW,KAAK,OAAO,QAAQ,MAAM,GAAG;AAC5D,UAAM,aAAa,YAAY,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AACvD,UAAM,YAAY,YAAY;AAC9B,UAAM,cAAc,eAAe,YAAY,SAAS;AACxD,UAAM,aAAa,eAAe,YAAY,MAAM;AAEpD,YAAQ,IAAI,GAAG,UAAU,IAAI,QAAQ,KAAK,UAAU,IAAI,SAAS,GAAG;AAEpE,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,OAAO,SAAS,MAAM;AACnC,YAAM,OAAO,eAAe,OAAO,QAAQ;AAC3C,YAAM,OAAO,WAAW,OAAO,IAAI;AACnC,YAAM,SAAS,OAAO,OAAO;AAE7B,cAAQ;AAAA,QACN,KAAK,IAAI,IAAI,SAAS,OAAO,OAAO,EAAE,CAAC,KAAK,IAAI,KAAK,IAAI,KAAK,MAAM;AAAA,MACtE;AAGA,iBAAW,SAAS,OAAO,QAAQ;AACjC,cAAM,YAAY,MAAM,SAAS,MAAM,MAAM;AAC7C,cAAM,cAAc,MAAM,QAAQ,KAAK,QAAQ,CAAC;AAChD,gBAAQ,IAAI,OAAO,SAAS,IAAI,MAAM,IAAI,KAAK,UAAU,GAAG;AAC5D,YAAI,MAAM,UAAU,MAAM,QAAQ,GAAG;AAEnC,gBAAM,cAAc,MAAM,OAAO,MAAM,IAAI,EAAE,MAAM,GAAG,CAAC;AACvD,qBAAW,QAAQ,aAAa;AAC9B,oBAAQ,IAAI,SAAS,IAAI,EAAE;AAAA,UAC7B;AAAA,QACF;AAAA,MACF;AAGA,UAAI,OAAO,OAAO;AAChB,gBAAQ,IAAI,gBAAgB,OAAO,KAAK,EAAE;AAAA,MAC5C;AAAA,IACF;AAEA,YAAQ,IAAI;AAAA,EACd;AAGA,UAAQ,IAAI,SAAI,OAAO,EAAE,CAAC;AAC1B,QAAM,WAAW,QAAQ,KAAM,SAAS,QAAS,KAAK,QAAQ,CAAC,IAAI;AACnE,UAAQ,IAAI,YAAY,MAAM,IAAI,KAAK,YAAY,QAAQ,IAAI;AAC/D,UAAQ;AAAA,IACN,UAAU,WAAW,SAAS,CAAC,KAAK,WAAW,YAAY,eAAe,QAAQ,CAAC;AAAA,EACrF;AACF;AAKO,SAAS,UAAU,SAA+B;AACvD,UAAQ,IAAI,KAAK,UAAU,SAAS,MAAM,CAAC,CAAC;AAC9C;AAKA,SAAS,eAAe,IAAoB;AAC1C,MAAI,KAAK,IAAM,QAAO,GAAG,EAAE;AAC3B,QAAM,UAAU,KAAK;AACrB,MAAI,UAAU,GAAI,QAAO,GAAG,QAAQ,QAAQ,CAAC,CAAC;AAC9C,QAAM,UAAU,KAAK,MAAM,UAAU,EAAE;AACvC,QAAM,oBAAoB,UAAU,IAAI,QAAQ,CAAC;AACjD,SAAO,GAAG,OAAO,IAAI,gBAAgB;AACvC;AAKA,SAAS,WAAW,MAAsB;AACxC,MAAI,OAAO,KAAM,QAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AAC3C,MAAI,OAAO,EAAG,QAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AACxC,SAAO,IAAI,KAAK,QAAQ,CAAC,CAAC;AAC5B;AAKA,SAAS,SAAS,KAAa,QAAwB;AACrD,QAAM,UAAU,IAAI,QAAQ,OAAO,GAAG,EAAE,KAAK;AAC7C,MAAI,QAAQ,UAAU,OAAQ,QAAO;AACrC,SAAO,GAAG,QAAQ,MAAM,GAAG,SAAS,CAAC,CAAC;AACxC;AAKA,SAAS,QACP,OACA,OACqB;AACrB,QAAM,SAA8B,CAAC;AACrC,aAAW,QAAQ,OAAO;AACxB,UAAM,MAAM,MAAM,IAAI;AACtB,QAAI,CAAC,OAAO,GAAG,EAAG,QAAO,GAAG,IAAI,CAAC;AACjC,WAAO,GAAG,EAAE,KAAK,IAAI;AAAA,EACvB;AACA,SAAO;AACT;;;ACtHA,YAAY,QAAQ;AACpB,YAAY,QAAQ;AACpB,YAAY,UAAU;AACtB,SAAS,gBAAoC;AAC7C;AAAA,EACE;AAAA,EACA;AAAA,EAEA;AAAA,OACK;AAsBP,eAAsB,UACpB,OACA,QACA,OACA,SACqB;AAErB,QAAM,MAAM,MAAM,gBAAgB,KAAK;AAGvC,QAAM,QAAQ;AAAA,IACZ,OAAO;AAAA,IACP,OAAO;AAAA,EACT;AAGA,MAAI;AACJ,MAAI,OAAO,cAAc,OAAO,WAAW,SAAS,GAAG;AACrD,UAAM,gBAAgB,OAAO,WAAW,IAAI,CAAC,QAAa,aAAQ,GAAG,CAAC;AACtE,qBAAiB,IAAI,sBAAsB;AAAA,MACzC;AAAA,MACA,0BAA0B;AAAA,IAC5B,CAAC;AACD,UAAM,eAAe,OAAO;AAAA,EAC9B;AAGA,QAAM,EAAE,QAAQ,IAAI,MAAM,mBAAmB;AAAA,IAC3C;AAAA,IACA;AAAA,IACA,gBAAgB,eAAe,SAAS,GAAG;AAAA,IAC3C,GAAI,iBAAiB,EAAE,eAAe,IAAI,CAAC;AAAA,EAC7C,CAAC;AAED,MAAI;AAEF,UAAM,YAAY,WAAW;AAC7B,UAAM,iBAAiB,IAAI,QAAe,CAAC,GAAG,WAAW;AACvD;AAAA,QACE,MAAM,OAAO,IAAI,MAAM,wBAAwB,SAAS,IAAI,CAAC;AAAA,QAC7D;AAAA,MACF;AAAA,IACF,CAAC;AAGD,UAAM,oBAAoB,IAAI,QAAc,CAACA,aAAY;AACvD,cAAQ,UAAU,CAAC,UAAU;AAC3B,YAAI,MAAM,SAAS,aAAa;AAC9B,UAAAA,SAAQ;AAAA,QACV;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAGD,UAAM,QAAQ,OAAO,KAAK;AAG1B,UAAM,QAAQ,KAAK,CAAC,mBAAmB,cAAc,CAAC;AAGtD,UAAM,WAAW,QAAQ;AACzB,UAAM,UAAU,QAAQ,gBAAgB;AAGxC,UAAM,SAAS,yBAAyB,QAAQ;AAGhD,UAAM,YAAY,iBAAiB,QAAQ;AAG3C,UAAM,QAAQ,aAAa,OAAO;AAElC,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF,UAAE;AAEA,YAAQ,QAAQ;AAAA,EAClB;AACF;AAKA,SAAS,yBAAyB,UAA6B;AAE7D,WAAS,IAAI,SAAS,SAAS,GAAG,KAAK,GAAG,KAAK;AAC7C,UAAM,MAAM,SAAS,CAAC;AACtB,QAAI,IAAI,SAAS,aAAa;AAE5B,YAAM,UAAU,IAAI;AACpB,UAAI,OAAO,YAAY,UAAU;AAC/B,eAAO;AAAA,MACT;AACA,UAAI,MAAM,QAAQ,OAAO,GAAG;AAC1B,cAAM,QAAkB,CAAC;AACzB,mBAAW,SAAS,SAAS;AAC3B,cAAI,OAAO,UAAU,UAAU;AAC7B,kBAAM,KAAK,KAAK;AAAA,UAClB,WAAW,SAAS,OAAO,UAAU,YAAY,UAAU,OAAO;AAChE,kBAAM,KAAK,OAAQ,MAA4B,IAAI,CAAC;AAAA,UACtD;AAAA,QACF;AACA,eAAO,MAAM,KAAK,IAAI;AAAA,MACxB;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAQA,SAAS,iBAAiB,UAAiC;AACzD,QAAM,YAAwB,CAAC;AAE/B,aAAW,OAAO,UAAU;AAC1B,UAAM,UAAU;AAChB,QAAI,QAAQ,SAAS,YAAa;AAElC,UAAM,UAAU,QAAQ;AACxB,QAAI,CAAC,MAAM,QAAQ,OAAO,EAAG;AAE7B,eAAW,SAAS,SAAS;AAC3B,YAAM,IAAI;AAMV,UAAI,EAAE,SAAS,cAAc,EAAE,MAAM;AACnC,kBAAU,KAAK;AAAA,UACb,MAAM,EAAE;AAAA,UACR,MAAO,EAAE,aAAyC,CAAC;AAAA,QACrD,CAAC;AAAA,MACH,WAAW,EAAE,SAAS,cAAc,EAAE,MAAM;AAC1C,kBAAU,KAAK;AAAA,UACb,MAAM,EAAE;AAAA,UACR,MAAO,EAAE,SAAqC,CAAC;AAAA,QACjD,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,aAAa,SAAuC;AAC3D,SAAO;AAAA,IACL,QAAQ;AAAA,MACN,OAAO,QAAQ,OAAO;AAAA,MACtB,QAAQ,QAAQ,OAAO;AAAA,MACvB,OAAO,QAAQ,OAAO;AAAA,IACxB;AAAA,IACA,MAAM,QAAQ;AAAA,EAChB;AACF;AAKA,eAAsB,gBAAgB,OAAoC;AACxE,QAAM,SAAS,MAAS,WAAa,UAAQ,UAAO,GAAG,UAAU,CAAC;AAElE,MAAI,OAAO,OAAO;AAChB,eAAW,CAAC,UAAU,OAAO,KAAK,OAAO,QAAQ,MAAM,KAAK,GAAG;AAC7D,YAAM,WAAgB,UAAK,QAAQ,QAAQ;AAC3C,YAAS,SAAW,aAAQ,QAAQ,GAAG,EAAE,WAAW,KAAK,CAAC;AAC1D,YAAS,aAAU,UAAU,SAAS,OAAO;AAAA,IAC/C;AAAA,EACF;AAEA,MAAI,OAAO,UAAU;AACnB,UAAM,EAAE,KAAK,IAAI,MAAM,OAAO,eAAoB;AAClD,UAAM,EAAE,UAAU,IAAI,MAAM,OAAO,MAAW;AAC9C,UAAM,YAAY,UAAU,IAAI;AAEhC,eAAW,OAAO,MAAM,UAAU;AAChC,YAAM,UAAU,KAAK,EAAE,KAAK,OAAO,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO;AACT;AAKA,eAAsB,iBAAiB,KAA4B;AACjE,MAAI;AACF,UAAS,MAAG,KAAK,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;AAAA,EACnD,QAAQ;AAAA,EAER;AACF;;;AC7NA,eAAsB,SACpB,QACA,SACyB;AACzB,QAAM,YAAY,KAAK,IAAI;AAG3B,QAAM,QAAQ,MAAM,cAAc,OAAO,QAAQ;AAEjD,MAAI,MAAM,WAAW,GAAG;AACtB,WAAO;AAAA,MACL,SAAS,CAAC;AAAA,MACV,OAAO;AAAA,MACP,QAAQ;AAAA,MACR,QAAQ;AAAA,MACR,UAAU,KAAK,IAAI,IAAI;AAAA,MACvB,aAAa;AAAA,MACb,WAAW;AAAA,IACb;AAAA,EACF;AAGA,QAAM,gBAAgB,QAAQ;AAC9B,QAAM,gBAAgB,gBAClB,MAAM,OAAO,CAAC,MAAM,EAAE,KAAK,SAAS,aAAa,CAAC,IAClD;AAGJ,QAAM,aAAa,cAAc;AAAA,IAC/B,CAAC,KAAK,MAAM,MAAM,EAAE,QAAQ,KAAK;AAAA,IACjC;AAAA,EACF;AAGA,MAAI,aAAa,OAAO,eAAe;AACrC,YAAQ;AAAA,MACN,YAAY,UAAU;AAAA,IACxB;AAAA,EACF;AAEA,QAAM,UAAwB,CAAC;AAC/B,MAAI,cAAc;AAClB,MAAI,YAAY;AAGhB,aAAW,WAAW,eAAe;AACnC,UAAM,cAAc,MAAM,cAAc,SAAS,QAAQ,OAAO;AAChE,YAAQ,KAAK,GAAG,WAAW;AAE3B,eAAW,UAAU,aAAa;AAChC,qBAAe,OAAO,OAAO;AAC7B,mBAAa,OAAO;AAAA,IACtB;AAAA,EACF;AAEA,QAAM,SAAS,QAAQ,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAE/C,SAAO;AAAA,IACL;AAAA,IACA,OAAO,QAAQ;AAAA,IACf;AAAA,IACA,QAAQ,QAAQ,SAAS;AAAA,IACzB,UAAU,KAAK,IAAI,IAAI;AAAA,IACvB;AAAA,IACA;AAAA,EACF;AACF;AAKA,eAAe,cACb,SACA,QACA,SACuB;AACvB,QAAM,UAAwB,CAAC;AAC/B,QAAM,EAAE,MAAM,SAAS,YAAY,IAAI;AAGvC,QAAM,YAAY,YAAY,KAAK,OAAO,CAAC,OAAO,GAAG,IAAI;AACzD,QAAM,YAAY,UAAU,SAAS,IAAI,YAAY,YAAY;AAGjE,QAAM,gBAAgB,UAAU,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI;AAEvD,WAAS,IAAI,GAAG,IAAI,cAAc,QAAQ,KAAK;AAC7C,UAAM,WAAW,cAAc,CAAC;AAChC,UAAM,YAAY,KAAK,IAAI;AAC3B,QAAI,MAAM;AAEV,QAAI;AAEF,YAAM,WAAW;AAAA,QACf,GAAG,OAAO;AAAA,QACV,GAAG,YAAY;AAAA;AAAA,QAEf,GAAI,QAAQ,SAAS,EAAE,OAAO,QAAQ,MAAM;AAAA,QAC5C,GAAI,QAAQ,YAAY,EAAE,UAAU,QAAQ,SAAS;AAAA,MACvD;AAGA,YAAM,UAAU,SAAS,WAAW,YAAY,WAAW,OAAO;AAClE,YAAM,aAAa,MAAM;AAAA,QACvB,SAAS;AAAA,QACT;AAAA,QACA,SAAS;AAAA,QACT;AAAA,MACF;AAEA,YAAM,WAAW;AAGjB,YAAM,MAAoB;AAAA,QACxB,OAAO,SAAS;AAAA,QAChB,QAAQ,WAAW;AAAA,QACnB,UAAU,SAAS;AAAA,QACnB;AAAA,QACA,UAAU,WAAW;AAAA,QACrB,WAAW,WAAW;AAAA,QACtB,OAAO,WAAW;AAAA,MACpB;AAGA,YAAM,SAAS,MAAM,QAAQ;AAAA,QAC3B,YAAY,QAAQ,IAAI,CAAC,WAAW,OAAO,MAAM,GAAG,CAAC;AAAA,MACvD;AAGA,YAAM,SAAS,OAAO,MAAM,CAAC,MAAM,EAAE,SAAS,GAAG;AAEjD,cAAQ,KAAK;AAAA,QACX,UAAU;AAAA,QACV,OAAO,SAAS;AAAA,QAChB;AAAA,QACA;AAAA,QACA,UAAU,KAAK,IAAI,IAAI;AAAA,QACvB,QAAQ,WAAW,MAAM;AAAA,QACzB,MAAM,WAAW,MAAM;AAAA,MACzB,CAAC;AAED,UAAI,QAAQ,SAAS;AACnB,cAAM,SAAS,SAAS,SAAS;AACjC,gBAAQ,IAAI,MAAM,MAAM,KAAKC,UAAS,SAAS,OAAO,EAAE,CAAC,EAAE;AAAA,MAC7D;AAAA,IACF,SAAS,KAAK;AACZ,cAAQ,KAAK;AAAA,QACX,UAAU;AAAA,QACV,OAAO,SAAS;AAAA,QAChB,QAAQ,CAAC;AAAA,QACT,QAAQ;AAAA,QACR,UAAU,KAAK,IAAI,IAAI;AAAA,QACvB,QAAQ,EAAE,OAAO,GAAG,QAAQ,GAAG,OAAO,EAAE;AAAA,QACxC,MAAM;AAAA,QACN,OAAQ,IAAc;AAAA,MACxB,CAAC;AAED,UAAI,QAAQ,SAAS;AACnB,gBAAQ,IAAI,aAAaA,UAAS,SAAS,OAAO,EAAE,CAAC,EAAE;AACvD,gBAAQ,IAAI,OAAQ,IAAc,OAAO,EAAE;AAAA,MAC7C;AAAA,IACF,UAAE;AAEA,UAAI,KAAK;AACP,cAAM,iBAAiB,GAAG;AAAA,MAC5B;AAAA,IACF;AAGA,QAAI,IAAI,cAAc,SAAS,GAAG;AAChC,YAAM,MAAM,OAAO,iBAAiB;AAAA,IACtC;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAACC,aAAY,WAAWA,UAAS,EAAE,CAAC;AACzD;AAEA,SAASD,UAAS,KAAa,QAAwB;AACrD,QAAM,UAAU,IAAI,QAAQ,OAAO,GAAG,EAAE,KAAK;AAC7C,MAAI,QAAQ,UAAU,OAAQ,QAAO;AACrC,SAAO,GAAG,QAAQ,MAAM,GAAG,SAAS,CAAC,CAAC;AACxC;;;AClMA,eAAe,OAAsB;AACnC,QAAM,UAAU,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAE/C,MAAI,QAAQ,MAAM;AAChB,cAAU;AACV,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,SAAS,MAAM,WAAW,QAAQ,MAAM;AAE9C,UAAQ,IAAI,sBAAsB,OAAO,QAAQ,KAAK;AAGtD,QAAM,UAAU,MAAM,SAAS,QAAQ,OAAO;AAG9C,MAAI,QAAQ,MAAM;AAChB,cAAU,OAAO;AAAA,EACnB,OAAO;AACL,iBAAa,OAAO;AAAA,EACtB;AAGA,MAAI,QAAQ,cAAc,QAAW;AACnC,UAAM,WACJ,QAAQ,QAAQ,IAAK,QAAQ,SAAS,QAAQ,QAAS,MAAM;AAC/D,QAAI,WAAW,QAAQ,WAAW;AAChC,cAAQ;AAAA,QACN;AAAA,oBAAuB,SAAS,QAAQ,CAAC,CAAC,iBAAiB,QAAQ,SAAS;AAAA,MAC9E;AACA,cAAQ,KAAK,CAAC;AAAA,IAChB;AAAA,EACF;AAGA,MAAI,QAAQ,SAAS,GAAG;AACtB,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAQA,SAAS,UAAU,MAA+B;AAChD,QAAM,UAAyB,CAAC;AAEhC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAElB,QAAI,QAAQ,YAAY,QAAQ,MAAM;AACpC,cAAQ,OAAO;AAAA,IACjB,WAAW,QAAQ,UAAU;AAC3B,cAAQ,OAAO;AAAA,IACjB,WAAW,QAAQ,eAAe,QAAQ,MAAM;AAC9C,cAAQ,UAAU;AAAA,IACpB,WAAW,QAAQ,cAAc,QAAQ,MAAM;AAC7C,cAAQ,SAAS,KAAK,EAAE,CAAC;AAAA,IAC3B,WAAW,IAAI,WAAW,WAAW,GAAG;AACtC,cAAQ,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACnC,WAAW,QAAQ,iBAAiB,QAAQ,MAAM;AAChD,cAAQ,YAAY,SAAS,KAAK,EAAE,CAAC,GAAG,EAAE;AAAA,IAC5C,WAAW,IAAI,WAAW,cAAc,GAAG;AACzC,cAAQ,YAAY,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC,GAAG,EAAE;AAAA,IACpD,WAAW,QAAQ,cAAc,QAAQ,MAAM;AAC7C,cAAQ,SAAS,KAAK,EAAE,CAAC;AAAA,IAC3B,WAAW,IAAI,WAAW,WAAW,GAAG;AACtC,cAAQ,SAAS,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACnC,WAAW,QAAQ,aAAa,QAAQ,MAAM;AAC5C,cAAQ,QAAQ,KAAK,EAAE,CAAC;AAAA,IAC1B,WAAW,IAAI,WAAW,UAAU,GAAG;AACrC,cAAQ,QAAQ,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IAClC,WAAW,QAAQ,gBAAgB,QAAQ,MAAM;AAC/C,cAAQ,WAAW,KAAK,EAAE,CAAC;AAAA,IAC7B,WAAW,IAAI,WAAW,aAAa,GAAG;AACxC,cAAQ,WAAW,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,IACrC;AAAA,EACF;AAGA,UAAQ,QAAQ,QAAQ,SAAS,QAAQ,IAAI;AAC7C,UAAQ,WAAW,QAAQ,YAAY,QAAQ,IAAI;AAEnD,SAAO;AACT;AAEA,SAAS,YAAkB;AACzB,UAAQ,IAAI;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,CAuBb;AACD;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,MAAM,UAAU,IAAI,OAAO;AACnC,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":["resolve","truncate","resolve"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aliou/pi-evals",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "Eval framework for pi coding agent",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",