agent-duelist 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -59
- package/dist/cli.js +1793 -394
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1774 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -8
- package/dist/index.d.ts +73 -8
- package/dist/index.js +1765 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.d.cts
CHANGED
|
@@ -24,6 +24,7 @@ interface TaskInput {
|
|
|
24
24
|
prompt: string;
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
|
+
signal?: AbortSignal;
|
|
27
28
|
}
|
|
28
29
|
interface TaskResult {
|
|
29
30
|
output: string | Record<string, unknown>;
|
|
@@ -71,7 +72,10 @@ interface BenchmarkResult {
|
|
|
71
72
|
};
|
|
72
73
|
}
|
|
73
74
|
|
|
74
|
-
|
|
75
|
+
interface ConsoleReporterOptions {
|
|
76
|
+
sparklines?: boolean;
|
|
77
|
+
}
|
|
78
|
+
declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
|
|
75
79
|
|
|
76
80
|
declare function jsonReporter(results: BenchmarkResult[]): string;
|
|
77
81
|
|
|
@@ -80,8 +84,12 @@ interface ArenaConfig {
|
|
|
80
84
|
tasks: ArenaTask[];
|
|
81
85
|
scorers?: BuiltInScorerName[];
|
|
82
86
|
runs?: number;
|
|
83
|
-
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-
|
|
87
|
+
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
|
|
84
88
|
judgeModel?: string;
|
|
89
|
+
/** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
|
|
90
|
+
sparklines?: boolean;
|
|
91
|
+
/** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
|
|
92
|
+
timeout?: number;
|
|
85
93
|
}
|
|
86
94
|
interface RunOptions {
|
|
87
95
|
/** Called after each individual benchmark completes */
|
|
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
|
|
|
96
104
|
interface OpenAIProviderOptions {
|
|
97
105
|
apiKey?: string;
|
|
98
106
|
baseURL?: string;
|
|
107
|
+
timeoutMs?: number;
|
|
99
108
|
}
|
|
100
109
|
interface AzureOpenAIProviderOptions {
|
|
101
110
|
apiKey?: string;
|
|
102
111
|
endpoint?: string;
|
|
103
112
|
apiVersion?: string;
|
|
104
113
|
deployment?: string;
|
|
114
|
+
timeoutMs?: number;
|
|
105
115
|
}
|
|
106
116
|
declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
|
|
107
117
|
interface OpenAICompatibleOptions {
|
|
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
|
|
|
115
125
|
stripThinking?: boolean;
|
|
116
126
|
/** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
|
|
117
127
|
free?: boolean;
|
|
128
|
+
timeoutMs?: number;
|
|
118
129
|
}
|
|
119
130
|
declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
|
|
120
131
|
/**
|
|
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
|
|
|
124
135
|
* Used as the deployment name unless `options.deployment` overrides it.
|
|
125
136
|
*/
|
|
126
137
|
declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
|
|
138
|
+
interface GeminiProviderOptions {
|
|
139
|
+
apiKey?: string;
|
|
140
|
+
timeoutMs?: number;
|
|
141
|
+
}
|
|
142
|
+
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
127
143
|
|
|
128
144
|
interface AnthropicProviderOptions {
|
|
129
145
|
apiKey?: string;
|
|
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
|
|
|
131
147
|
}
|
|
132
148
|
declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
|
|
133
149
|
|
|
134
|
-
interface GeminiProviderOptions {
|
|
135
|
-
apiKey?: string;
|
|
136
|
-
}
|
|
137
|
-
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
|
-
|
|
139
150
|
interface ModelPricing {
|
|
140
151
|
inputPerToken: number;
|
|
141
152
|
outputPerToken: number;
|
|
142
153
|
}
|
|
143
154
|
declare function registerPricing(providerId: string, pricing: ModelPricing): void;
|
|
144
155
|
|
|
145
|
-
|
|
156
|
+
interface ScorerStats {
|
|
157
|
+
mean: number;
|
|
158
|
+
stddev: number;
|
|
159
|
+
cv: number;
|
|
160
|
+
n: number;
|
|
161
|
+
ci95Lower: number;
|
|
162
|
+
ci95Upper: number;
|
|
163
|
+
}
|
|
164
|
+
interface ScorerComparison {
|
|
165
|
+
providerId: string;
|
|
166
|
+
taskName: string;
|
|
167
|
+
scorerName: string;
|
|
168
|
+
baseline: ScorerStats | null;
|
|
169
|
+
current: ScorerStats;
|
|
170
|
+
delta: number | null;
|
|
171
|
+
regressed: boolean;
|
|
172
|
+
improved: boolean;
|
|
173
|
+
flaky: boolean;
|
|
174
|
+
}
|
|
175
|
+
interface CostSummary {
|
|
176
|
+
totalUsd: number;
|
|
177
|
+
perProvider: Map<string, number>;
|
|
178
|
+
budget: number | undefined;
|
|
179
|
+
overBudget: boolean;
|
|
180
|
+
}
|
|
181
|
+
interface CiReport {
|
|
182
|
+
comparisons: ScorerComparison[];
|
|
183
|
+
cost: CostSummary;
|
|
184
|
+
failed: boolean;
|
|
185
|
+
flakyResults: ScorerComparison[];
|
|
186
|
+
failureReasons: string[];
|
|
187
|
+
}
|
|
188
|
+
declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
|
|
189
|
+
declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
|
|
190
|
+
interface BaselineData {
|
|
191
|
+
timestamp: string;
|
|
192
|
+
results: BenchmarkResult[];
|
|
193
|
+
}
|
|
194
|
+
declare function loadBaseline(path: string): BaselineData | null;
|
|
195
|
+
declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
196
|
+
|
|
197
|
+
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
198
|
+
|
|
199
|
+
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
200
|
+
|
|
201
|
+
interface GitHubContext {
|
|
202
|
+
token: string;
|
|
203
|
+
owner: string;
|
|
204
|
+
repo: string;
|
|
205
|
+
prNumber: number;
|
|
206
|
+
}
|
|
207
|
+
declare function detectGitHubContext(): GitHubContext | null;
|
|
208
|
+
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
209
|
+
|
|
210
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
package/dist/index.d.ts
CHANGED
|
@@ -24,6 +24,7 @@ interface TaskInput {
|
|
|
24
24
|
prompt: string;
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
|
+
signal?: AbortSignal;
|
|
27
28
|
}
|
|
28
29
|
interface TaskResult {
|
|
29
30
|
output: string | Record<string, unknown>;
|
|
@@ -71,7 +72,10 @@ interface BenchmarkResult {
|
|
|
71
72
|
};
|
|
72
73
|
}
|
|
73
74
|
|
|
74
|
-
|
|
75
|
+
interface ConsoleReporterOptions {
|
|
76
|
+
sparklines?: boolean;
|
|
77
|
+
}
|
|
78
|
+
declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
|
|
75
79
|
|
|
76
80
|
declare function jsonReporter(results: BenchmarkResult[]): string;
|
|
77
81
|
|
|
@@ -80,8 +84,12 @@ interface ArenaConfig {
|
|
|
80
84
|
tasks: ArenaTask[];
|
|
81
85
|
scorers?: BuiltInScorerName[];
|
|
82
86
|
runs?: number;
|
|
83
|
-
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-
|
|
87
|
+
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
|
|
84
88
|
judgeModel?: string;
|
|
89
|
+
/** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
|
|
90
|
+
sparklines?: boolean;
|
|
91
|
+
/** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
|
|
92
|
+
timeout?: number;
|
|
85
93
|
}
|
|
86
94
|
interface RunOptions {
|
|
87
95
|
/** Called after each individual benchmark completes */
|
|
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
|
|
|
96
104
|
interface OpenAIProviderOptions {
|
|
97
105
|
apiKey?: string;
|
|
98
106
|
baseURL?: string;
|
|
107
|
+
timeoutMs?: number;
|
|
99
108
|
}
|
|
100
109
|
interface AzureOpenAIProviderOptions {
|
|
101
110
|
apiKey?: string;
|
|
102
111
|
endpoint?: string;
|
|
103
112
|
apiVersion?: string;
|
|
104
113
|
deployment?: string;
|
|
114
|
+
timeoutMs?: number;
|
|
105
115
|
}
|
|
106
116
|
declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
|
|
107
117
|
interface OpenAICompatibleOptions {
|
|
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
|
|
|
115
125
|
stripThinking?: boolean;
|
|
116
126
|
/** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
|
|
117
127
|
free?: boolean;
|
|
128
|
+
timeoutMs?: number;
|
|
118
129
|
}
|
|
119
130
|
declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
|
|
120
131
|
/**
|
|
@@ -124,6 +135,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
|
|
|
124
135
|
* Used as the deployment name unless `options.deployment` overrides it.
|
|
125
136
|
*/
|
|
126
137
|
declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
|
|
138
|
+
interface GeminiProviderOptions {
|
|
139
|
+
apiKey?: string;
|
|
140
|
+
timeoutMs?: number;
|
|
141
|
+
}
|
|
142
|
+
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
127
143
|
|
|
128
144
|
interface AnthropicProviderOptions {
|
|
129
145
|
apiKey?: string;
|
|
@@ -131,15 +147,64 @@ interface AnthropicProviderOptions {
|
|
|
131
147
|
}
|
|
132
148
|
declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
|
|
133
149
|
|
|
134
|
-
interface GeminiProviderOptions {
|
|
135
|
-
apiKey?: string;
|
|
136
|
-
}
|
|
137
|
-
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
|
-
|
|
139
150
|
interface ModelPricing {
|
|
140
151
|
inputPerToken: number;
|
|
141
152
|
outputPerToken: number;
|
|
142
153
|
}
|
|
143
154
|
declare function registerPricing(providerId: string, pricing: ModelPricing): void;
|
|
144
155
|
|
|
145
|
-
|
|
156
|
+
interface ScorerStats {
|
|
157
|
+
mean: number;
|
|
158
|
+
stddev: number;
|
|
159
|
+
cv: number;
|
|
160
|
+
n: number;
|
|
161
|
+
ci95Lower: number;
|
|
162
|
+
ci95Upper: number;
|
|
163
|
+
}
|
|
164
|
+
interface ScorerComparison {
|
|
165
|
+
providerId: string;
|
|
166
|
+
taskName: string;
|
|
167
|
+
scorerName: string;
|
|
168
|
+
baseline: ScorerStats | null;
|
|
169
|
+
current: ScorerStats;
|
|
170
|
+
delta: number | null;
|
|
171
|
+
regressed: boolean;
|
|
172
|
+
improved: boolean;
|
|
173
|
+
flaky: boolean;
|
|
174
|
+
}
|
|
175
|
+
interface CostSummary {
|
|
176
|
+
totalUsd: number;
|
|
177
|
+
perProvider: Map<string, number>;
|
|
178
|
+
budget: number | undefined;
|
|
179
|
+
overBudget: boolean;
|
|
180
|
+
}
|
|
181
|
+
interface CiReport {
|
|
182
|
+
comparisons: ScorerComparison[];
|
|
183
|
+
cost: CostSummary;
|
|
184
|
+
failed: boolean;
|
|
185
|
+
flakyResults: ScorerComparison[];
|
|
186
|
+
failureReasons: string[];
|
|
187
|
+
}
|
|
188
|
+
declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
|
|
189
|
+
declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
|
|
190
|
+
interface BaselineData {
|
|
191
|
+
timestamp: string;
|
|
192
|
+
results: BenchmarkResult[];
|
|
193
|
+
}
|
|
194
|
+
declare function loadBaseline(path: string): BaselineData | null;
|
|
195
|
+
declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
196
|
+
|
|
197
|
+
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
198
|
+
|
|
199
|
+
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
200
|
+
|
|
201
|
+
interface GitHubContext {
|
|
202
|
+
token: string;
|
|
203
|
+
owner: string;
|
|
204
|
+
repo: string;
|
|
205
|
+
prNumber: number;
|
|
206
|
+
}
|
|
207
|
+
declare function detectGitHubContext(): GitHubContext | null;
|
|
208
|
+
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
209
|
+
|
|
210
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|