npm - agent-duelist - Versions diffs - 0.1.2 → 0.2.1 - Mend

agent-duelist 0.1.2 → 0.2.1

This diff shows the differences between the contents of two publicly available package versions, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -24,6 +24,7 @@ interface TaskInput {
     prompt: string;
     schema?: ZodSchema;
     tools?: ToolDefinition[];
+    signal?: AbortSignal;
 }
 interface TaskResult {
     output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
     };
 }
-declare function consoleReporter(results: BenchmarkResult[]): void;
+interface ConsoleReporterOptions {
+    sparklines?: boolean;
+}
+declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
 declare function jsonReporter(results: BenchmarkResult[]): string;
@@ -80,8 +84,12 @@ interface ArenaConfig {
     tasks: ArenaTask[];
     scorers?: BuiltInScorerName[];
     runs?: number;
-    /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
+    /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
     judgeModel?: string;
+    /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
+    sparklines?: boolean;
+    /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
+    timeout?: number;
 }
 interface RunOptions {
     /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
 interface OpenAIProviderOptions {
     apiKey?: string;
     baseURL?: string;
+    timeoutMs?: number;
 }
 interface AzureOpenAIProviderOptions {
     apiKey?: string;
     endpoint?: string;
     apiVersion?: string;
     deployment?: string;
+    timeoutMs?: number;
 }
 declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
 interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
     stripThinking?: boolean;
     /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
     free?: boolean;
+    timeoutMs?: number;
 }
 declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
 /**
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
  *   Used as the deployment name unless `options.deployment` overrides it.
  */
 declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
+interface GeminiProviderOptions {
+    apiKey?: string;
+    timeoutMs?: number;
+}
+declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
 interface AnthropicProviderOptions {
     apiKey?: string;
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
 }
 declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
-interface GeminiProviderOptions {
-    apiKey?: string;
-}
-declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
 interface ModelPricing {
     inputPerToken: number;
     outputPerToken: number;
 }
 declare function registerPricing(providerId: string, pricing: ModelPricing): void;
-export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
+interface ScorerStats {
+    mean: number;
+    stddev: number;
+    cv: number;
+    n: number;
+    ci95Lower: number;
+    ci95Upper: number;
+}
+interface ScorerComparison {
+    providerId: string;
+    taskName: string;
+    scorerName: string;
+    baseline: ScorerStats | null;
+    current: ScorerStats;
+    delta: number | null;
+    regressed: boolean;
+    improved: boolean;
+    flaky: boolean;
+}
+interface CostSummary {
+    totalUsd: number;
+    perProvider: Map<string, number>;
+    budget: number | undefined;
+    overBudget: boolean;
+}
+interface CiReport {
+    comparisons: ScorerComparison[];
+    cost: CostSummary;
+    failed: boolean;
+    flakyResults: ScorerComparison[];
+    failureReasons: string[];
+}
+declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
+declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
+interface BaselineData {
+    timestamp: string;
+    results: BenchmarkResult[];
+}
+declare function loadBaseline(path: string): BaselineData | null;
+declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
+declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
+declare function htmlReporter(results: BenchmarkResult[]): string;
+interface GitHubContext {
+    token: string;
+    owner: string;
+    repo: string;
+    prNumber: number;
+}
+declare function detectGitHubContext(): GitHubContext | null;
+declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
+export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };

package/dist/index.d.ts CHANGED Viewed

@@ -24,6 +24,7 @@ interface TaskInput {
     prompt: string;
     schema?: ZodSchema;
     tools?: ToolDefinition[];
+    signal?: AbortSignal;
 }
 interface TaskResult {
     output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
     };
 }
-declare function consoleReporter(results: BenchmarkResult[]): void;
+interface ConsoleReporterOptions {
+    sparklines?: boolean;
+}
+declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
 declare function jsonReporter(results: BenchmarkResult[]): string;
@@ -80,8 +84,12 @@ interface ArenaConfig {
     tasks: ArenaTask[];
     scorers?: BuiltInScorerName[];
     runs?: number;
-    /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
+    /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
     judgeModel?: string;
+    /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
+    sparklines?: boolean;
+    /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
+    timeout?: number;
 }
 interface RunOptions {
     /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
 interface OpenAIProviderOptions {
     apiKey?: string;
     baseURL?: string;
+    timeoutMs?: number;
 }
 interface AzureOpenAIProviderOptions {
     apiKey?: string;
     endpoint?: string;
     apiVersion?: string;
     deployment?: string;
+    timeoutMs?: number;
 }
 declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
 interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
     stripThinking?: boolean;
     /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
     free?: boolean;
+    timeoutMs?: number;
 }
 declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
 /**
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
  *   Used as the deployment name unless `options.deployment` overrides it.
  */
 declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
+interface GeminiProviderOptions {
+    apiKey?: string;
+    timeoutMs?: number;
+}
+declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
 interface AnthropicProviderOptions {
     apiKey?: string;
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
 }
 declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
-interface GeminiProviderOptions {
-    apiKey?: string;
-}
-declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
 interface ModelPricing {
     inputPerToken: number;
     outputPerToken: number;
 }
 declare function registerPricing(providerId: string, pricing: ModelPricing): void;
-export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
+interface ScorerStats {
+    mean: number;
+    stddev: number;
+    cv: number;
+    n: number;
+    ci95Lower: number;
+    ci95Upper: number;
+}
+interface ScorerComparison {
+    providerId: string;
+    taskName: string;
+    scorerName: string;
+    baseline: ScorerStats | null;
+    current: ScorerStats;
+    delta: number | null;
+    regressed: boolean;
+    improved: boolean;
+    flaky: boolean;
+}
+interface CostSummary {
+    totalUsd: number;
+    perProvider: Map<string, number>;
+    budget: number | undefined;
+    overBudget: boolean;
+}
+interface CiReport {
+    comparisons: ScorerComparison[];
+    cost: CostSummary;
+    failed: boolean;
+    flakyResults: ScorerComparison[];
+    failureReasons: string[];
+}
+declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
+declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
+interface BaselineData {
+    timestamp: string;
+    results: BenchmarkResult[];
+}
+declare function loadBaseline(path: string): BaselineData | null;
+declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
+declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
+declare function htmlReporter(results: BenchmarkResult[]): string;
+interface GitHubContext {
+    token: string;
+    owner: string;
+    repo: string;
+    prNumber: number;
+}
+declare function detectGitHubContext(): GitHubContext | null;
+declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
+export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };