agent-duelist 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +251 -133
- package/dist/cli.js +4945 -2351
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1405 -468
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +35 -9
- package/dist/index.d.ts +35 -9
- package/dist/index.js +1402 -468
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -25,9 +25,11 @@ interface TaskInput {
|
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
27
|
signal?: AbortSignal;
|
|
28
|
+
/** Per-request timeout in ms, forwarded to SDK HTTP clients. */
|
|
29
|
+
timeout?: number;
|
|
28
30
|
}
|
|
29
31
|
interface TaskResult {
|
|
30
|
-
output: string | Record<string, unknown>;
|
|
32
|
+
output: string | Record<string, unknown> | unknown[];
|
|
31
33
|
usage?: {
|
|
32
34
|
promptTokens?: number;
|
|
33
35
|
completionTokens?: number;
|
|
@@ -62,7 +64,7 @@ interface BenchmarkResult {
|
|
|
62
64
|
scores: ScoreResult[];
|
|
63
65
|
error?: string;
|
|
64
66
|
raw: {
|
|
65
|
-
output: string | Record<string, unknown>;
|
|
67
|
+
output: string | Record<string, unknown> | unknown[];
|
|
66
68
|
latencyMs: number;
|
|
67
69
|
usage?: {
|
|
68
70
|
promptTokens?: number;
|
|
@@ -135,6 +137,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
|
|
|
135
137
|
* Used as the deployment name unless `options.deployment` overrides it.
|
|
136
138
|
*/
|
|
137
139
|
declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
|
|
140
|
+
interface GeminiProviderOptions {
|
|
141
|
+
apiKey?: string;
|
|
142
|
+
timeoutMs?: number;
|
|
143
|
+
}
|
|
144
|
+
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
145
|
|
|
139
146
|
interface AnthropicProviderOptions {
|
|
140
147
|
apiKey?: string;
|
|
@@ -142,12 +149,6 @@ interface AnthropicProviderOptions {
|
|
|
142
149
|
}
|
|
143
150
|
declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
|
|
144
151
|
|
|
145
|
-
interface GeminiProviderOptions {
|
|
146
|
-
apiKey?: string;
|
|
147
|
-
timeoutMs?: number;
|
|
148
|
-
}
|
|
149
|
-
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
150
|
-
|
|
151
152
|
interface ModelPricing {
|
|
152
153
|
inputPerToken: number;
|
|
153
154
|
outputPerToken: number;
|
|
@@ -197,6 +198,31 @@ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
|
197
198
|
|
|
198
199
|
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
199
200
|
|
|
201
|
+
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
202
|
+
|
|
203
|
+
interface TaskPack {
|
|
204
|
+
/** Short identifier, e.g. 'structured-output' */
|
|
205
|
+
name: string;
|
|
206
|
+
/** Human-readable label for console output */
|
|
207
|
+
label: string;
|
|
208
|
+
/** One-sentence description shown in --pack list */
|
|
209
|
+
description: string;
|
|
210
|
+
/** The tasks in this pack */
|
|
211
|
+
tasks: ArenaTask[];
|
|
212
|
+
/** Recommended scorers for this pack */
|
|
213
|
+
scorers: BuiltInScorerName[];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/** Get a pack by name. Throws if not found. */
|
|
217
|
+
declare function loadPack(name: string): TaskPack;
|
|
218
|
+
/** Get all available pack names */
|
|
219
|
+
declare function listPacks(): Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
label: string;
|
|
222
|
+
description: string;
|
|
223
|
+
taskCount: number;
|
|
224
|
+
}>;
|
|
225
|
+
|
|
200
226
|
interface GitHubContext {
|
|
201
227
|
token: string;
|
|
202
228
|
owner: string;
|
|
@@ -206,4 +232,4 @@ interface GitHubContext {
|
|
|
206
232
|
declare function detectGitHubContext(): GitHubContext | null;
|
|
207
233
|
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
208
234
|
|
|
209
|
-
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
|
235
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskPack, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, listPacks, loadBaseline, loadPack, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
package/dist/index.d.ts
CHANGED
|
@@ -25,9 +25,11 @@ interface TaskInput {
|
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
27
|
signal?: AbortSignal;
|
|
28
|
+
/** Per-request timeout in ms, forwarded to SDK HTTP clients. */
|
|
29
|
+
timeout?: number;
|
|
28
30
|
}
|
|
29
31
|
interface TaskResult {
|
|
30
|
-
output: string | Record<string, unknown>;
|
|
32
|
+
output: string | Record<string, unknown> | unknown[];
|
|
31
33
|
usage?: {
|
|
32
34
|
promptTokens?: number;
|
|
33
35
|
completionTokens?: number;
|
|
@@ -62,7 +64,7 @@ interface BenchmarkResult {
|
|
|
62
64
|
scores: ScoreResult[];
|
|
63
65
|
error?: string;
|
|
64
66
|
raw: {
|
|
65
|
-
output: string | Record<string, unknown>;
|
|
67
|
+
output: string | Record<string, unknown> | unknown[];
|
|
66
68
|
latencyMs: number;
|
|
67
69
|
usage?: {
|
|
68
70
|
promptTokens?: number;
|
|
@@ -135,6 +137,11 @@ declare function openaiCompatible(options: OpenAICompatibleOptions): ArenaProvid
|
|
|
135
137
|
* Used as the deployment name unless `options.deployment` overrides it.
|
|
136
138
|
*/
|
|
137
139
|
declare function azureOpenai(model: string, options?: AzureOpenAIProviderOptions): ArenaProvider;
|
|
140
|
+
interface GeminiProviderOptions {
|
|
141
|
+
apiKey?: string;
|
|
142
|
+
timeoutMs?: number;
|
|
143
|
+
}
|
|
144
|
+
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
138
145
|
|
|
139
146
|
interface AnthropicProviderOptions {
|
|
140
147
|
apiKey?: string;
|
|
@@ -142,12 +149,6 @@ interface AnthropicProviderOptions {
|
|
|
142
149
|
}
|
|
143
150
|
declare function anthropic(model: string, options?: AnthropicProviderOptions): ArenaProvider;
|
|
144
151
|
|
|
145
|
-
interface GeminiProviderOptions {
|
|
146
|
-
apiKey?: string;
|
|
147
|
-
timeoutMs?: number;
|
|
148
|
-
}
|
|
149
|
-
declare function gemini(model: string, options?: GeminiProviderOptions): ArenaProvider;
|
|
150
|
-
|
|
151
152
|
interface ModelPricing {
|
|
152
153
|
inputPerToken: number;
|
|
153
154
|
outputPerToken: number;
|
|
@@ -197,6 +198,31 @@ declare function saveBaseline(path: string, results: BenchmarkResult[]): void;
|
|
|
197
198
|
|
|
198
199
|
declare function markdownReporter(report: CiReport, _current: BenchmarkResult[]): string;
|
|
199
200
|
|
|
201
|
+
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
202
|
+
|
|
203
|
+
interface TaskPack {
|
|
204
|
+
/** Short identifier, e.g. 'structured-output' */
|
|
205
|
+
name: string;
|
|
206
|
+
/** Human-readable label for console output */
|
|
207
|
+
label: string;
|
|
208
|
+
/** One-sentence description shown in --pack list */
|
|
209
|
+
description: string;
|
|
210
|
+
/** The tasks in this pack */
|
|
211
|
+
tasks: ArenaTask[];
|
|
212
|
+
/** Recommended scorers for this pack */
|
|
213
|
+
scorers: BuiltInScorerName[];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/** Get a pack by name. Throws if not found. */
|
|
217
|
+
declare function loadPack(name: string): TaskPack;
|
|
218
|
+
/** Get all available pack names */
|
|
219
|
+
declare function listPacks(): Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
label: string;
|
|
222
|
+
description: string;
|
|
223
|
+
taskCount: number;
|
|
224
|
+
}>;
|
|
225
|
+
|
|
200
226
|
interface GitHubContext {
|
|
201
227
|
token: string;
|
|
202
228
|
owner: string;
|
|
@@ -206,4 +232,4 @@ interface GitHubContext {
|
|
|
206
232
|
declare function detectGitHubContext(): GitHubContext | null;
|
|
207
233
|
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
208
234
|
|
|
209
|
-
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
|
235
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskPack, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, listPacks, loadBaseline, loadPack, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|