agent-duelist 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -24,6 +24,7 @@ interface TaskInput {
24
24
  prompt: string;
25
25
  schema?: ZodSchema;
26
26
  tools?: ToolDefinition[];
27
+ signal?: AbortSignal;
27
28
  }
28
29
  interface TaskResult {
29
30
  output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
71
72
  };
72
73
  }
73
74
 
74
- declare function consoleReporter(results: BenchmarkResult[]): void;
75
+ interface ConsoleReporterOptions {
76
+ sparklines?: boolean;
77
+ }
78
+ declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
75
79
 
76
80
  declare function jsonReporter(results: BenchmarkResult[]): string;
77
81
 
@@ -80,8 +84,12 @@ interface ArenaConfig {
80
84
  tasks: ArenaTask[];
81
85
  scorers?: BuiltInScorerName[];
82
86
  runs?: number;
83
- /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
87
+ /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
84
88
  judgeModel?: string;
89
+ /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
90
+ sparklines?: boolean;
91
+ /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
92
+ timeout?: number;
85
93
  }
86
94
  interface RunOptions {
87
95
  /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
96
104
  interface OpenAIProviderOptions {
97
105
  apiKey?: string;
98
106
  baseURL?: string;
107
+ timeoutMs?: number;
99
108
  }
100
109
  interface AzureOpenAIProviderOptions {
101
110
  apiKey?: string;
102
111
  endpoint?: string;
103
112
  apiVersion?: string;
104
113
  deployment?: string;
114
+ timeoutMs?: number;
105
115
  }
106
116
  declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
107
117
  interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
115
125
  stripThinking?: boolean;
116
126
  /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
117
127
  free?: boolean;
128
+ timeoutMs?: number;
118
129
  }
119
130
  declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
120
131
  /**
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
124
135
  * Used as the deployment name unless `options.deployment` overrides it.
125
136
  */
126
137
  declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
138
+ interface GeminiProviderOptions {
139
+ apiKey?: string;
140
+ timeoutMs?: number;
141
+ }
142
+ declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
127
143
 
128
144
  interface AnthropicProviderOptions {
129
145
  apiKey?: string;
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
131
147
  }
132
148
  declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
133
149
 
134
- interface GeminiProviderOptions {
135
- apiKey?: string;
136
- }
137
- declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
138
-
139
150
  interface ModelPricing {
140
151
  inputPerToken: number;
141
152
  outputPerToken: number;
142
153
  }
143
154
  declare function registerPricing(providerId: string, pricing: ModelPricing): void;
144
155
 
145
- export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
156
+ interface ScorerStats {
157
+ mean: number;
158
+ stddev: number;
159
+ cv: number;
160
+ n: number;
161
+ ci95Lower: number;
162
+ ci95Upper: number;
163
+ }
164
+ interface ScorerComparison {
165
+ providerId: string;
166
+ taskName: string;
167
+ scorerName: string;
168
+ baseline: ScorerStats | null;
169
+ current: ScorerStats;
170
+ delta: number | null;
171
+ regressed: boolean;
172
+ improved: boolean;
173
+ flaky: boolean;
174
+ }
175
+ interface CostSummary {
176
+ totalUsd: number;
177
+ perProvider: Map<string, number>;
178
+ budget: number | undefined;
179
+ overBudget: boolean;
180
+ }
181
+ interface CiReport {
182
+ comparisons: ScorerComparison[];
183
+ cost: CostSummary;
184
+ failed: boolean;
185
+ flakyResults: ScorerComparison[];
186
+ failureReasons: string[];
187
+ }
188
+ declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
189
+ declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
190
+ interface BaselineData {
191
+ timestamp: string;
192
+ results: BenchmarkResult[];
193
+ }
194
+ declare function loadBaseline(path: string): BaselineData | null;
195
+ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
196
+
197
+ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
198
+
199
+ declare function htmlReporter(results: BenchmarkResult[]): string;
200
+
201
+ interface GitHubContext {
202
+ token: string;
203
+ owner: string;
204
+ repo: string;
205
+ prNumber: number;
206
+ }
207
+ declare function detectGitHubContext(): GitHubContext | null;
208
+ declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
209
+
210
+ export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
package/dist/index.d.ts CHANGED
@@ -24,6 +24,7 @@ interface TaskInput {
24
24
  prompt: string;
25
25
  schema?: ZodSchema;
26
26
  tools?: ToolDefinition[];
27
+ signal?: AbortSignal;
27
28
  }
28
29
  interface TaskResult {
29
30
  output: string | Record<string, unknown>;
@@ -71,7 +72,10 @@ interface BenchmarkResult {
71
72
  };
72
73
  }
73
74
 
74
- declare function consoleReporter(results: BenchmarkResult[]): void;
75
+ interface ConsoleReporterOptions {
76
+ sparklines?: boolean;
77
+ }
78
+ declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
75
79
 
76
80
  declare function jsonReporter(results: BenchmarkResult[]): string;
77
81
 
@@ -80,8 +84,12 @@ interface ArenaConfig {
80
84
  tasks: ArenaTask[];
81
85
  scorers?: BuiltInScorerName[];
82
86
  runs?: number;
83
- /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-4o-mini. */
87
+ /** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
84
88
  judgeModel?: string;
89
+ /** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
90
+ sparklines?: boolean;
91
+ /** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
92
+ timeout?: number;
85
93
  }
86
94
  interface RunOptions {
87
95
  /** Called after each individual benchmark completes */
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
96
104
  interface OpenAIProviderOptions {
97
105
  apiKey?: string;
98
106
  baseURL?: string;
107
+ timeoutMs?: number;
99
108
  }
100
109
  interface AzureOpenAIProviderOptions {
101
110
  apiKey?: string;
102
111
  endpoint?: string;
103
112
  apiVersion?: string;
104
113
  deployment?: string;
114
+ timeoutMs?: number;
105
115
  }
106
116
  declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
107
117
  interface OpenAICompatibleOptions {
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
115
125
  stripThinking?: boolean;
116
126
  /** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
117
127
  free?: boolean;
128
+ timeoutMs?: number;
118
129
  }
119
130
  declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
120
131
  /**
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
124
135
  * Used as the deployment name unless `options.deployment` overrides it.
125
136
  */
126
137
  declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
138
+ interface GeminiProviderOptions {
139
+ apiKey?: string;
140
+ timeoutMs?: number;
141
+ }
142
+ declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
127
143
 
128
144
  interface AnthropicProviderOptions {
129
145
  apiKey?: string;
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
131
147
  }
132
148
  declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
133
149
 
134
- interface GeminiProviderOptions {
135
- apiKey?: string;
136
- }
137
- declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
138
-
139
150
  interface ModelPricing {
140
151
  inputPerToken: number;
141
152
  outputPerToken: number;
142
153
  }
143
154
  declare function registerPricing(providerId: string, pricing: ModelPricing): void;
144
155
 
145
- export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type ScoreResult, type ScorerFn, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, consoleReporter, defineArena, gemini, jsonReporter, openai, openaiCompatible, registerPricing };
156
+ interface ScorerStats {
157
+ mean: number;
158
+ stddev: number;
159
+ cv: number;
160
+ n: number;
161
+ ci95Lower: number;
162
+ ci95Upper: number;
163
+ }
164
+ interface ScorerComparison {
165
+ providerId: string;
166
+ taskName: string;
167
+ scorerName: string;
168
+ baseline: ScorerStats | null;
169
+ current: ScorerStats;
170
+ delta: number | null;
171
+ regressed: boolean;
172
+ improved: boolean;
173
+ flaky: boolean;
174
+ }
175
+ interface CostSummary {
176
+ totalUsd: number;
177
+ perProvider: Map<string, number>;
178
+ budget: number | undefined;
179
+ overBudget: boolean;
180
+ }
181
+ interface CiReport {
182
+ comparisons: ScorerComparison[];
183
+ cost: CostSummary;
184
+ failed: boolean;
185
+ flakyResults: ScorerComparison[];
186
+ failureReasons: string[];
187
+ }
188
+ declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
189
+ declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
190
+ interface BaselineData {
191
+ timestamp: string;
192
+ results: BenchmarkResult[];
193
+ }
194
+ declare function loadBaseline(path: string): BaselineData | null;
195
+ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
196
+
197
+ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
198
+
199
+ declare function htmlReporter(results: BenchmarkResult[]): string;
200
+
201
+ interface GitHubContext {
202
+ token: string;
203
+ owner: string;
204
+ repo: string;
205
+ prNumber: number;
206
+ }
207
+ declare function detectGitHubContext(): GitHubContext | null;
208
+ declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
209
+
210
+ export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };