agent-duelist 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -24,6 +24,7 @@ interface TaskInput {
24
24
  prompt: string;
25
25
  schema?: ZodSchema;
26
26
  tools?: ToolDefinition[];
27
+ signal?: AbortSignal;
27
28
  }
28
29
  interface TaskResult {
29
30
  output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
71
72
  };
72
73
  }
73
74
 
74
- declare function consoleReporter(results: BenchmarkResult[]): void;
75
+ interface ConsoleReporterOptions {
76
+ sparklines?: boolean;
77
+ }
78
+ declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
75
79
 
76
80
  declare function jsonReporter(results: BenchmarkResult[]): string;
77
81
 
@@ -80,8 +84,12 @@ interface ArenaConfig {
80
84
  tasks: ArenaTask[];
81
85
  scorers?: BuiltInScorerName[];
82
86
  runs?: number;
83
- /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
87
+ /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
84
88
  judgeModel?: string;
89
+ /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
90
+ sparklines?: boolean;
91
+ /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
92
+ timeout?: number;
85
93
  }
86
94
  interface RunOptions {
87
95
  /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
96
104
  interface OpenAIProviderOptions {
97
105
  apiKey?: string;
98
106
  baseURL?: string;
107
+ timeoutMs?: number;
99
108
  }
100
109
  interface AzureOpenAIProviderOptions {
101
110
  apiKey?: string;
102
111
  endpoint?: string;
103
112
  apiVersion?: string;
104
113
  deployment?: string;
114
+ timeoutMs?: number;
105
115
  }
106
116
  declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
107
117
  interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
115
125
  stripThinking?: boolean;
116
126
  /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
117
127
  free?: boolean;
128
+ timeoutMs?: number;
118
129
  }
119
130
  declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
120
131
  /**
@@ -133,6 +144,7 @@ declare function anthropic(model: string, options?: AnthropicProviderOptions): A
133
144
 
134
145
  interface GeminiProviderOptions {
135
146
  apiKey?: string;
147
+ timeoutMs?: number;
136
148
  }
137
149
  declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
138
150
 
@@ -142,4 +154,56 @@ interface ModelPricing {
142
154
  }
143
155
  declare function registerPricing(providerId: string, pricing: ModelPricing): void;
144
156
 
145
- export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
157
+ interface ScorerStats {
158
+ mean: number;
159
+ stddev: number;
160
+ cv: number;
161
+ n: number;
162
+ ci95Lower: number;
163
+ ci95Upper: number;
164
+ }
165
+ interface ScorerComparison {
166
+ providerId: string;
167
+ taskName: string;
168
+ scorerName: string;
169
+ baseline: ScorerStats | null;
170
+ current: ScorerStats;
171
+ delta: number | null;
172
+ regressed: boolean;
173
+ improved: boolean;
174
+ flaky: boolean;
175
+ }
176
+ interface CostSummary {
177
+ totalUsd: number;
178
+ perProvider: Map<string, number>;
179
+ budget: number | undefined;
180
+ overBudget: boolean;
181
+ }
182
+ interface CiReport {
183
+ comparisons: ScorerComparison[];
184
+ cost: CostSummary;
185
+ failed: boolean;
186
+ flakyResults: ScorerComparison[];
187
+ failureReasons: string[];
188
+ }
189
+ declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
190
+ declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
191
+ interface BaselineData {
192
+ timestamp: string;
193
+ results: BenchmarkResult[];
194
+ }
195
+ declare function loadBaseline(path: string): BaselineData | null;
196
+ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
197
+
198
+ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
199
+
200
+ interface GitHubContext {
201
+ token: string;
202
+ owner: string;
203
+ repo: string;
204
+ prNumber: number;
205
+ }
206
+ declare function detectGitHubContext(): GitHubContext | null;
207
+ declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
208
+
209
+ export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
package/dist/index.d.ts CHANGED
@@ -24,6 +24,7 @@ interface TaskInput {
24
24
  prompt: string;
25
25
  schema?: ZodSchema;
26
26
  tools?: ToolDefinition[];
27
+ signal?: AbortSignal;
27
28
  }
28
29
  interface TaskResult {
29
30
  output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
71
72
  };
72
73
  }
73
74
 
74
- declare function consoleReporter(results: BenchmarkResult[]): void;
75
+ interface ConsoleReporterOptions {
76
+ sparklines?: boolean;
77
+ }
78
+ declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
75
79
 
76
80
  declare function jsonReporter(results: BenchmarkResult[]): string;
77
81
 
@@ -80,8 +84,12 @@ interface ArenaConfig {
80
84
  tasks: ArenaTask[];
81
85
  scorers?: BuiltInScorerName[];
82
86
  runs?: number;
83
- /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
87
+ /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
84
88
  judgeModel?: string;
89
+ /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
90
+ sparklines?: boolean;
91
+ /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
92
+ timeout?: number;
85
93
  }
86
94
  interface RunOptions {
87
95
  /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
96
104
  interface OpenAIProviderOptions {
97
105
  apiKey?: string;
98
106
  baseURL?: string;
107
+ timeoutMs?: number;
99
108
  }
100
109
  interface AzureOpenAIProviderOptions {
101
110
  apiKey?: string;
102
111
  endpoint?: string;
103
112
  apiVersion?: string;
104
113
  deployment?: string;
114
+ timeoutMs?: number;
105
115
  }
106
116
  declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
107
117
  interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
115
125
  stripThinking?: boolean;
116
126
  /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
117
127
  free?: boolean;
128
+ timeoutMs?: number;
118
129
  }
119
130
  declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
120
131
  /**
@@ -133,6 +144,7 @@ declare function anthropic(model: string, options?: AnthropicProviderOptions): A
133
144
 
134
145
  interface GeminiProviderOptions {
135
146
  apiKey?: string;
147
+ timeoutMs?: number;
136
148
  }
137
149
  declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
138
150
 
@@ -142,4 +154,56 @@ interface ModelPricing {
142
154
  }
143
155
  declare function registerPricing(providerId: string, pricing: ModelPricing): void;
144
156
 
145
- export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
157
+ interface ScorerStats {
158
+ mean: number;
159
+ stddev: number;
160
+ cv: number;
161
+ n: number;
162
+ ci95Lower: number;
163
+ ci95Upper: number;
164
+ }
165
+ interface ScorerComparison {
166
+ providerId: string;
167
+ taskName: string;
168
+ scorerName: string;
169
+ baseline: ScorerStats | null;
170
+ current: ScorerStats;
171
+ delta: number | null;
172
+ regressed: boolean;
173
+ improved: boolean;
174
+ flaky: boolean;
175
+ }
176
+ interface CostSummary {
177
+ totalUsd: number;
178
+ perProvider: Map<string, number>;
179
+ budget: number | undefined;
180
+ overBudget: boolean;
181
+ }
182
+ interface CiReport {
183
+ comparisons: ScorerComparison[];
184
+ cost: CostSummary;
185
+ failed: boolean;
186
+ flakyResults: ScorerComparison[];
187
+ failureReasons: string[];
188
+ }
189
+ declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
190
+ declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
191
+ interface BaselineData {
192
+ timestamp: string;
193
+ results: BenchmarkResult[];
194
+ }
195
+ declare function loadBaseline(path: string): BaselineData | null;
196
+ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
197
+
198
+ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
199
+
200
+ interface GitHubContext {
201
+ token: string;
202
+ owner: string;
203
+ repo: string;
204
+ prNumber: number;
205
+ }
206
+ declare function detectGitHubContext(): GitHubContext | null;
207
+ declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
208
+
209
+ export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };