agent-duelist 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +246 -142
- package/dist/cli.js +2004 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +334 -105
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +332 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -25,9 +25,11 @@ interface TaskInput {
|
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
27
|
signal?: AbortSignal;
|
|
28
|
+
/** Per-request timeout in ms, forwarded to SDK HTTP clients. */
|
|
29
|
+
timeout?: number;
|
|
28
30
|
}
|
|
29
31
|
interface TaskResult {
|
|
30
|
-
output: string | Record<string, unknown>;
|
|
32
|
+
output: string | Record<string, unknown> | unknown[];
|
|
31
33
|
usage?: {
|
|
32
34
|
promptTokens?: number;
|
|
33
35
|
completionTokens?: number;
|
|
@@ -62,7 +64,7 @@ interface BenchmarkResult {
|
|
|
62
64
|
scores: ScoreResult[];
|
|
63
65
|
error?: string;
|
|
64
66
|
raw: {
|
|
65
|
-
output: string | Record<string, unknown>;
|
|
67
|
+
output: string | Record<string, unknown> | unknown[];
|
|
66
68
|
latencyMs: number;
|
|
67
69
|
usage?: {
|
|
68
70
|
promptTokens?: number;
|
|
@@ -198,6 +200,29 @@ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[])
|
|
|
198
200
|
|
|
199
201
|
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
200
202
|
|
|
203
|
+
interface TaskPack {
|
|
204
|
+
/** Short identifier, e.g. 'structured-output' */
|
|
205
|
+
name: string;
|
|
206
|
+
/** Human-readable label for console output */
|
|
207
|
+
label: string;
|
|
208
|
+
/** One-sentence description shown in --pack list */
|
|
209
|
+
description: string;
|
|
210
|
+
/** The tasks in this pack */
|
|
211
|
+
tasks: ArenaTask[];
|
|
212
|
+
/** Recommended scorers for this pack */
|
|
213
|
+
scorers: BuiltInScorerName[];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/** Get a pack by name. Throws if not found. */
|
|
217
|
+
declare function loadPack(name: string): TaskPack;
|
|
218
|
+
/** Get all available pack names */
|
|
219
|
+
declare function listPacks(): Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
label: string;
|
|
222
|
+
description: string;
|
|
223
|
+
taskCount: number;
|
|
224
|
+
}>;
|
|
225
|
+
|
|
201
226
|
interface GitHubContext {
|
|
202
227
|
token: string;
|
|
203
228
|
owner: string;
|
|
@@ -207,4 +232,4 @@ interface GitHubContext {
|
|
|
207
232
|
declare function detectGitHubContext(): GitHubContext | null;
|
|
208
233
|
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
209
234
|
|
|
210
|
-
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
|
235
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskPack, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, listPacks, loadBaseline, loadPack, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
package/dist/index.d.ts
CHANGED
|
@@ -25,9 +25,11 @@ interface TaskInput {
|
|
|
25
25
|
schema?: ZodSchema;
|
|
26
26
|
tools?: ToolDefinition[];
|
|
27
27
|
signal?: AbortSignal;
|
|
28
|
+
/** Per-request timeout in ms, forwarded to SDK HTTP clients. */
|
|
29
|
+
timeout?: number;
|
|
28
30
|
}
|
|
29
31
|
interface TaskResult {
|
|
30
|
-
output: string | Record<string, unknown>;
|
|
32
|
+
output: string | Record<string, unknown> | unknown[];
|
|
31
33
|
usage?: {
|
|
32
34
|
promptTokens?: number;
|
|
33
35
|
completionTokens?: number;
|
|
@@ -62,7 +64,7 @@ interface BenchmarkResult {
|
|
|
62
64
|
scores: ScoreResult[];
|
|
63
65
|
error?: string;
|
|
64
66
|
raw: {
|
|
65
|
-
output: string | Record<string, unknown>;
|
|
67
|
+
output: string | Record<string, unknown> | unknown[];
|
|
66
68
|
latencyMs: number;
|
|
67
69
|
usage?: {
|
|
68
70
|
promptTokens?: number;
|
|
@@ -198,6 +200,29 @@ declare function markdownReporter(report: CiReport, _current: BenchmarkResult[])
|
|
|
198
200
|
|
|
199
201
|
declare function htmlReporter(results: BenchmarkResult[]): string;
|
|
200
202
|
|
|
203
|
+
interface TaskPack {
|
|
204
|
+
/** Short identifier, e.g. 'structured-output' */
|
|
205
|
+
name: string;
|
|
206
|
+
/** Human-readable label for console output */
|
|
207
|
+
label: string;
|
|
208
|
+
/** One-sentence description shown in --pack list */
|
|
209
|
+
description: string;
|
|
210
|
+
/** The tasks in this pack */
|
|
211
|
+
tasks: ArenaTask[];
|
|
212
|
+
/** Recommended scorers for this pack */
|
|
213
|
+
scorers: BuiltInScorerName[];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/** Get a pack by name. Throws if not found. */
|
|
217
|
+
declare function loadPack(name: string): TaskPack;
|
|
218
|
+
/** Get all available pack names */
|
|
219
|
+
declare function listPacks(): Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
label: string;
|
|
222
|
+
description: string;
|
|
223
|
+
taskCount: number;
|
|
224
|
+
}>;
|
|
225
|
+
|
|
201
226
|
interface GitHubContext {
|
|
202
227
|
token: string;
|
|
203
228
|
owner: string;
|
|
@@ -207,4 +232,4 @@ interface GitHubContext {
|
|
|
207
232
|
declare function detectGitHubContext(): GitHubContext | null;
|
|
208
233
|
declare function upsertPrComment(ctx: GitHubContext, body: string, marker: string): Promise<void>;
|
|
209
234
|
|
|
210
|
-
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, loadBaseline, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|
|
235
|
+
export { type Arena, type ArenaConfig, type ArenaProvider, type ArenaTask, type BenchmarkResult, type BuiltInScorerName, type CiReport, type CostSummary, type ScoreResult, type ScorerComparison, type ScorerFn, type ScorerStats, type TaskInput, type TaskPack, type TaskResult, type ToolCall, type ToolDefinition, anthropic, azureOpenai, compareResults, computeStats, consoleReporter, defineArena, detectGitHubContext, gemini, htmlReporter, jsonReporter, listPacks, loadBaseline, loadPack, markdownReporter, openai, openaiCompatible, registerPricing, saveBaseline, upsertPrComment };
|