agent-duelist 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -58
- package/dist/cli.js +870 -123
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +897 -227
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -3
- package/dist/index.d.ts +67 -3
- package/dist/index.js +887 -224
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.d.cts
CHANGED
|
@@ -24,6 +24,7 @@ interface TaskInput {
|
|
|
24
24
|
prompt: string;
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
|
+
signal?: AbortSignal;
|
|
27
28
|
}
|
|
28
29
|
interface TaskResult {
|
|
29
30
|
output: string | Record<string, unknown>;
|
|
@@ -71,7 +72,10 @@ interface BenchmarkResult {
|
|
|
71
72
|
};
|
|
72
73
|
}
|
|
73
74
|
|
|
74
|
-
|
|
75
|
+
interface ConsoleReporterOptions {
|
|
76
|
+
sparklines?: boolean;
|
|
77
|
+
}
|
|
78
|
+
declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
|
|
75
79
|
|
|
76
80
|
declare function jsonReporter(results: BenchmarkResult[]): string;
|
|
77
81
|
|
|
@@ -80,8 +84,12 @@ interface ArenaConfig {
|
|
|
80
84
|
tasks: ArenaTask[];
|
|
81
85
|
scorers?: BuiltInScorerName[];
|
|
82
86
|
runs?: number;
|
|
83
|
-
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-
|
|
87
|
+
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
|
|
84
88
|
judgeModel?: string;
|
|
89
|
+
/** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
|
|
90
|
+
sparklines?: boolean;
|
|
91
|
+
/** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
|
|
92
|
+
timeout?: number;
|
|
85
93
|
}
|
|
86
94
|
interface RunOptions {
|
|
87
95
|
/** Called after each individual benchmark completes */
|
|
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
|
|
|
96
104
|
interface OpenAIProviderOptions {
|
|
97
105
|
apiKey?: string;
|
|
98
106
|
baseURL?: string;
|
|
107
|
+
timeoutMs?: number;
|
|
99
108
|
}
|
|
100
109
|
interface AzureOpenAIProviderOptions {
|
|
101
110
|
apiKey?: string;
|
|
102
111
|
endpoint?: string;
|
|
103
112
|
apiVersion?: string;
|
|
104
113
|
deployment?: string;
|
|
114
|
+
timeoutMs?: number;
|
|
105
115
|
}
|
|
106
116
|
declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
|
|
107
117
|
interface OpenAICompatibleOptions {
|
|
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
|
|
|
115
125
|
stripThinking?: boolean;
|
|
116
126
|
/** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
|
|
117
127
|
free?: boolean;
|
|
128
|
+
timeoutMs?: number;
|
|
118
129
|
}
|
|
119
130
|
declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
|
|
120
131
|
/**
|
|
@@ -133,6 +144,7 @@ declare function anthropic(model: string, options?: AnthropicProviderOptions): A
|
|
|
133
144
|
|
|
134
145
|
interface GeminiProviderOptions {
|
|
135
146
|
apiKey?: string;
|
|
147
|
+
timeoutMs?: number;
|
|
136
148
|
}
|
|
137
149
|
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
150
|
|
|
@@ -142,4 +154,56 @@ interface ModelPricing {
|
|
|
142
154
|
}
|
|
143
155
|
declare function registerPricing(providerId: string, pricing: ModelPricing): void;
|
|
144
156
|
|
|
145
|
-
|
|
157
|
+
interface ScorerStats {
|
|
158
|
+
mean: number;
|
|
159
|
+
stddev: number;
|
|
160
|
+
cv: number;
|
|
161
|
+
n: number;
|
|
162
|
+
ci95Lower: number;
|
|
163
|
+
ci95Upper: number;
|
|
164
|
+
}
|
|
165
|
+
interface ScorerComparison {
|
|
166
|
+
providerId: string;
|
|
167
|
+
taskName: string;
|
|
168
|
+
scorerName: string;
|
|
169
|
+
baseline: ScorerStats | null;
|
|
170
|
+
current: ScorerStats;
|
|
171
|
+
delta: number | null;
|
|
172
|
+
regressed: boolean;
|
|
173
|
+
improved: boolean;
|
|
174
|
+
flaky: boolean;
|
|
175
|
+
}
|
|
176
|
+
interface CostSummary {
|
|
177
|
+
totalUsd: number;
|
|
178
|
+
perProvider: Map<string, number>;
|
|
179
|
+
budget: number | undefined;
|
|
180
|
+
overBudget: boolean;
|
|
181
|
+
}
|
|
182
|
+
interface CiReport {
|
|
183
|
+
comparisons: ScorerComparison[];
|
|
184
|
+
cost: CostSummary;
|
|
185
|
+
failed: boolean;
|
|
186
|
+
flakyResults: ScorerComparison[];
|
|
187
|
+
failureReasons: string[];
|
|
188
|
+
}
|
|
189
|
+
declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
|
|
190
|
+
declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
|
|
191
|
+
interface BaselineData {
|
|
192
|
+
timestamp: string;
|
|
193
|
+
results: BenchmarkResult[];
|
|
194
|
+
}
|
|
195
|
+
declare function loadBaseline(path: string): BaselineData | null;
|
|
196
|
+
declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
197
|
+
|
|
198
|
+
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
199
|
+
|
|
200
|
+
interface GitHubContext {
|
|
201
|
+
token: string;
|
|
202
|
+
owner: string;
|
|
203
|
+
repo: string;
|
|
204
|
+
prNumber: number;
|
|
205
|
+
}
|
|
206
|
+
declare function detectGitHubContext(): GitHubContext | null;
|
|
207
|
+
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
208
|
+
|
|
209
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
package/dist/index.d.ts
CHANGED
|
@@ -24,6 +24,7 @@ interface TaskInput {
|
|
|
24
24
|
prompt: string;
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
|
+
signal?: AbortSignal;
|
|
27
28
|
}
|
|
28
29
|
interface TaskResult {
|
|
29
30
|
output: string | Record<string, unknown>;
|
|
@@ -71,7 +72,10 @@ interface BenchmarkResult {
|
|
|
71
72
|
};
|
|
72
73
|
}
|
|
73
74
|
|
|
74
|
-
|
|
75
|
+
interface ConsoleReporterOptions {
|
|
76
|
+
sparklines?: boolean;
|
|
77
|
+
}
|
|
78
|
+
declare function consoleReporter(results: BenchmarkResult[], options?: ConsoleReporterOptions): void;
|
|
75
79
|
|
|
76
80
|
declare function jsonReporter(results: BenchmarkResult[]): string;
|
|
77
81
|
|
|
@@ -80,8 +84,12 @@ interface ArenaConfig {
|
|
|
80
84
|
tasks: ArenaTask[];
|
|
81
85
|
scorers?: BuiltInScorerName[];
|
|
82
86
|
runs?: number;
|
|
83
|
-
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-
|
|
87
|
+
/** Model to use for llm-judge-correctness (e.g. 'gemini-3.1-pro-preview'). Falls back to DUELIST_JUDGE_MODEL env var, then gpt-5-mini. */
|
|
84
88
|
judgeModel?: string;
|
|
89
|
+
/** Show sparkline bars next to percentage scores. Disable if your terminal doesn't render Unicode blocks. Default: true */
|
|
90
|
+
sparklines?: boolean;
|
|
91
|
+
/** Per-request timeout in milliseconds. Requests exceeding this are marked as failures. Default: 60000 (60s) */
|
|
92
|
+
timeout?: number;
|
|
85
93
|
}
|
|
86
94
|
interface RunOptions {
|
|
87
95
|
/** Called after each individual benchmark completes */
|
|
@@ -96,12 +104,14 @@ declare function defineArena(config: ArenaConfig): Arena;
|
|
|
96
104
|
interface OpenAIProviderOptions {
|
|
97
105
|
apiKey?: string;
|
|
98
106
|
baseURL?: string;
|
|
107
|
+
timeoutMs?: number;
|
|
99
108
|
}
|
|
100
109
|
interface AzureOpenAIProviderOptions {
|
|
101
110
|
apiKey?: string;
|
|
102
111
|
endpoint?: string;
|
|
103
112
|
apiVersion?: string;
|
|
104
113
|
deployment?: string;
|
|
114
|
+
timeoutMs?: number;
|
|
105
115
|
}
|
|
106
116
|
declare function openai(model: string, options?: OpenAIProviderOptions): ArenaProvider;
|
|
107
117
|
interface OpenAICompatibleOptions {
|
|
@@ -115,6 +125,7 @@ interface OpenAICompatibleOptions {
|
|
|
115
125
|
stripThinking?: boolean;
|
|
116
126
|
/** Mark this provider as free (e.g. local Ollama models) so it registers zero-cost pricing */
|
|
117
127
|
free?: boolean;
|
|
128
|
+
timeoutMs?: number;
|
|
118
129
|
}
|
|
119
130
|
declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvider;
|
|
120
131
|
/**
|
|
@@ -133,6 +144,7 @@ declare function anthropic(model: string, options?: AnthropicProviderOptions): A
|
|
|
133
144
|
|
|
134
145
|
interface GeminiProviderOptions {
|
|
135
146
|
apiKey?: string;
|
|
147
|
+
timeoutMs?: number;
|
|
136
148
|
}
|
|
137
149
|
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
150
|
|
|
@@ -142,4 +154,56 @@ interface ModelPricing {
|
|
|
142
154
|
}
|
|
143
155
|
declare function registerPricing(providerId: string, pricing: ModelPricing): void;
|
|
144
156
|
|
|
145
|
-
|
|
157
|
+
interface ScorerStats {
|
|
158
|
+
mean: number;
|
|
159
|
+
stddev: number;
|
|
160
|
+
cv: number;
|
|
161
|
+
n: number;
|
|
162
|
+
ci95Lower: number;
|
|
163
|
+
ci95Upper: number;
|
|
164
|
+
}
|
|
165
|
+
interface ScorerComparison {
|
|
166
|
+
providerId: string;
|
|
167
|
+
taskName: string;
|
|
168
|
+
scorerName: string;
|
|
169
|
+
baseline: ScorerStats | null;
|
|
170
|
+
current: ScorerStats;
|
|
171
|
+
delta: number | null;
|
|
172
|
+
regressed: boolean;
|
|
173
|
+
improved: boolean;
|
|
174
|
+
flaky: boolean;
|
|
175
|
+
}
|
|
176
|
+
interface CostSummary {
|
|
177
|
+
totalUsd: number;
|
|
178
|
+
perProvider: Map<string, number>;
|
|
179
|
+
budget: number | undefined;
|
|
180
|
+
overBudget: boolean;
|
|
181
|
+
}
|
|
182
|
+
interface CiReport {
|
|
183
|
+
comparisons: ScorerComparison[];
|
|
184
|
+
cost: CostSummary;
|
|
185
|
+
failed: boolean;
|
|
186
|
+
flakyResults: ScorerComparison[];
|
|
187
|
+
failureReasons: string[];
|
|
188
|
+
}
|
|
189
|
+
declare function computeStats(results: BenchmarkResult[]): Map<string, ScorerStats>;
|
|
190
|
+
declare function compareResults(baselineStats: Map<string, ScorerStats> | null, currentStats: Map<string, ScorerStats>, thresholds: Map<string, number>, budget?: number, currentResults?: BenchmarkResult[]): CiReport;
|
|
191
|
+
interface BaselineData {
|
|
192
|
+
timestamp: string;
|
|
193
|
+
results: BenchmarkResult[];
|
|
194
|
+
}
|
|
195
|
+
declare function loadBaseline(path: string): BaselineData | null;
|
|
196
|
+
declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
197
|
+
|
|
198
|
+
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
199
|
+
|
|
200
|
+
interface GitHubContext {
|
|
201
|
+
token: string;
|
|
202
|
+
owner: string;
|
|
203
|
+
repo: string;
|
|
204
|
+
prNumber: number;
|
|
205
|
+
}
|
|
206
|
+
declare function detectGitHubContext(): GitHubContext | null;
|
|
207
|
+
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
208
|
+
|
|
209
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|