veto-sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +209 -0
- package/dist/benchmark/cli.d.ts +22 -0
- package/dist/benchmark/cli.d.ts.map +1 -0
- package/dist/benchmark/cli.js +238 -0
- package/dist/benchmark/cli.js.map +1 -0
- package/dist/benchmark/index.d.ts +10 -0
- package/dist/benchmark/index.d.ts.map +1 -0
- package/dist/benchmark/index.js +10 -0
- package/dist/benchmark/index.js.map +1 -0
- package/dist/benchmark/loader.d.ts +19 -0
- package/dist/benchmark/loader.d.ts.map +1 -0
- package/dist/benchmark/loader.js +321 -0
- package/dist/benchmark/loader.js.map +1 -0
- package/dist/benchmark/metrics.d.ts +35 -0
- package/dist/benchmark/metrics.d.ts.map +1 -0
- package/dist/benchmark/metrics.js +195 -0
- package/dist/benchmark/metrics.js.map +1 -0
- package/dist/benchmark/runner.d.ts +39 -0
- package/dist/benchmark/runner.d.ts.map +1 -0
- package/dist/benchmark/runner.js +279 -0
- package/dist/benchmark/runner.js.map +1 -0
- package/dist/benchmark/types.d.ts +188 -0
- package/dist/benchmark/types.d.ts.map +1 -0
- package/dist/benchmark/types.js +24 -0
- package/dist/benchmark/types.js.map +1 -0
- package/dist/cli/bin.d.ts +8 -0
- package/dist/cli/bin.d.ts.map +1 -0
- package/dist/cli/bin.js +120 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/cli/config.d.ts +126 -0
- package/dist/cli/config.d.ts.map +1 -0
- package/dist/cli/config.js +137 -0
- package/dist/cli/config.js.map +1 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/init.d.ts +64 -0
- package/dist/cli/init.d.ts.map +1 -0
- package/dist/cli/init.js +160 -0
- package/dist/cli/init.js.map +1 -0
- package/dist/cli/templates.d.ts +22 -0
- package/dist/cli/templates.d.ts.map +1 -0
- package/dist/cli/templates.js +132 -0
- package/dist/cli/templates.js.map +1 -0
- package/dist/core/history.d.ts +104 -0
- package/dist/core/history.d.ts.map +1 -0
- package/dist/core/history.js +148 -0
- package/dist/core/history.js.map +1 -0
- package/dist/core/index.d.ts +10 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +10 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/interceptor.d.ts +96 -0
- package/dist/core/interceptor.d.ts.map +1 -0
- package/dist/core/interceptor.js +227 -0
- package/dist/core/interceptor.js.map +1 -0
- package/dist/core/validator.d.ts +107 -0
- package/dist/core/validator.d.ts.map +1 -0
- package/dist/core/validator.js +263 -0
- package/dist/core/validator.js.map +1 -0
- package/dist/core/veto.d.ts +265 -0
- package/dist/core/veto.d.ts.map +1 -0
- package/dist/core/veto.js +681 -0
- package/dist/core/veto.js.map +1 -0
- package/dist/index.d.ts +43 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/kernel/client.d.ts +82 -0
- package/dist/kernel/client.d.ts.map +1 -0
- package/dist/kernel/client.js +162 -0
- package/dist/kernel/client.js.map +1 -0
- package/dist/kernel/index.d.ts +9 -0
- package/dist/kernel/index.d.ts.map +1 -0
- package/dist/kernel/index.js +9 -0
- package/dist/kernel/index.js.map +1 -0
- package/dist/kernel/prompt.d.ts +27 -0
- package/dist/kernel/prompt.d.ts.map +1 -0
- package/dist/kernel/prompt.js +127 -0
- package/dist/kernel/prompt.js.map +1 -0
- package/dist/kernel/types.d.ts +85 -0
- package/dist/kernel/types.d.ts.map +1 -0
- package/dist/kernel/types.js +52 -0
- package/dist/kernel/types.js.map +1 -0
- package/dist/providers/adapters.d.ts +167 -0
- package/dist/providers/adapters.d.ts.map +1 -0
- package/dist/providers/adapters.js +244 -0
- package/dist/providers/adapters.js.map +1 -0
- package/dist/providers/index.d.ts +11 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +11 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/types.d.ts +92 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +10 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/rules/api-client.d.ts +103 -0
- package/dist/rules/api-client.d.ts.map +1 -0
- package/dist/rules/api-client.js +241 -0
- package/dist/rules/api-client.js.map +1 -0
- package/dist/rules/index.d.ts +10 -0
- package/dist/rules/index.d.ts.map +1 -0
- package/dist/rules/index.js +10 -0
- package/dist/rules/index.js.map +1 -0
- package/dist/rules/loader.d.ts +116 -0
- package/dist/rules/loader.d.ts.map +1 -0
- package/dist/rules/loader.js +300 -0
- package/dist/rules/loader.js.map +1 -0
- package/dist/rules/rule-validator.d.ts +135 -0
- package/dist/rules/rule-validator.d.ts.map +1 -0
- package/dist/rules/rule-validator.js +239 -0
- package/dist/rules/rule-validator.js.map +1 -0
- package/dist/rules/types.d.ts +162 -0
- package/dist/rules/types.d.ts.map +1 -0
- package/dist/rules/types.js +16 -0
- package/dist/rules/types.js.map +1 -0
- package/dist/types/config.d.ts +171 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +31 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/index.d.ts +8 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +8 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/tool.d.ts +156 -0
- package/dist/types/tool.d.ts.map +1 -0
- package/dist/types/tool.js +27 -0
- package/dist/types/tool.js.map +1 -0
- package/dist/utils/glob.d.ts +21 -0
- package/dist/utils/glob.d.ts.map +1 -0
- package/dist/utils/glob.js +147 -0
- package/dist/utils/glob.js.map +1 -0
- package/dist/utils/id.d.ts +28 -0
- package/dist/utils/id.d.ts.map +1 -0
- package/dist/utils/id.js +43 -0
- package/dist/utils/id.js.map +1 -0
- package/dist/utils/index.d.ts +9 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +9 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/logger.d.ts +97 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +153 -0
- package/dist/utils/logger.js.map +1 -0
- package/package.json +90 -0
package/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# veto
|
|
2
|
+
|
|
3
|
+
**Guardrails for AI agent tool calls.**
|
|
4
|
+
|
|
5
|
+
Intercept, validate, and control tool calls before execution. Works with OpenAI, Anthropic, Google, and any provider.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install veto-sdk
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx veto init # Creates veto/ directory with config and rules
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
import { Veto, toOpenAITools, ToolCallDeniedError } from 'veto-sdk';
|
|
21
|
+
|
|
22
|
+
// Define tools with handlers
|
|
23
|
+
const tools = [
|
|
24
|
+
{
|
|
25
|
+
name: 'read_file',
|
|
26
|
+
description: 'Read a file',
|
|
27
|
+
inputSchema: {
|
|
28
|
+
type: 'object',
|
|
29
|
+
properties: { path: { type: 'string' } },
|
|
30
|
+
required: ['path']
|
|
31
|
+
},
|
|
32
|
+
handler: async ({ path }) => fs.readFileSync(path, 'utf-8')
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
name: 'write_file',
|
|
36
|
+
description: 'Write a file',
|
|
37
|
+
inputSchema: {
|
|
38
|
+
type: 'object',
|
|
39
|
+
properties: { path: { type: 'string' }, content: { type: 'string' } },
|
|
40
|
+
required: ['path', 'content']
|
|
41
|
+
},
|
|
42
|
+
handler: async ({ path, content }) => {
|
|
43
|
+
fs.writeFileSync(path, content);
|
|
44
|
+
return 'OK';
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
];
|
|
48
|
+
|
|
49
|
+
// Initialize and wrap
|
|
50
|
+
const veto = await Veto.init();
|
|
51
|
+
const { definitions, implementations } = veto.wrapTools(tools);
|
|
52
|
+
|
|
53
|
+
// Use with OpenAI
|
|
54
|
+
const response = await openai.chat.completions.create({
|
|
55
|
+
model: 'gpt-4',
|
|
56
|
+
tools: toOpenAITools(definitions),
|
|
57
|
+
messages: [{ role: 'user', content: 'Read /etc/passwd' }]
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
// Execute - validation happens automatically
|
|
61
|
+
for (const call of response.choices[0].message.tool_calls ?? []) {
|
|
62
|
+
try {
|
|
63
|
+
const result = await implementations[call.function.name](
|
|
64
|
+
JSON.parse(call.function.arguments)
|
|
65
|
+
);
|
|
66
|
+
console.log('Result:', result);
|
|
67
|
+
} catch (error) {
|
|
68
|
+
if (error instanceof ToolCallDeniedError) {
|
|
69
|
+
console.log('Blocked:', error.reason);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Configuration
|
|
76
|
+
|
|
77
|
+
### veto/config.yaml
|
|
78
|
+
|
|
79
|
+
```yaml
|
|
80
|
+
version: "1.0"
|
|
81
|
+
mode: "strict" # strict = block, log = allow but log
|
|
82
|
+
|
|
83
|
+
api:
|
|
84
|
+
baseUrl: "http://localhost:8080"
|
|
85
|
+
endpoint: "/tool/call/check"
|
|
86
|
+
timeout: 10000
|
|
87
|
+
|
|
88
|
+
rules:
|
|
89
|
+
directory: "./rules"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### veto/rules/defaults.yaml
|
|
93
|
+
|
|
94
|
+
```yaml
|
|
95
|
+
rules:
|
|
96
|
+
- id: block-system-paths
|
|
97
|
+
name: Block system path access
|
|
98
|
+
severity: critical
|
|
99
|
+
action: block
|
|
100
|
+
tools: [read_file, write_file]
|
|
101
|
+
conditions:
|
|
102
|
+
- field: arguments.path
|
|
103
|
+
operator: starts_with
|
|
104
|
+
value: /etc
|
|
105
|
+
|
|
106
|
+
- id: block-home-deletion
|
|
107
|
+
name: Prevent home directory deletion
|
|
108
|
+
severity: critical
|
|
109
|
+
action: block
|
|
110
|
+
tools: [delete_file, run_command]
|
|
111
|
+
conditions:
|
|
112
|
+
- field: arguments.path
|
|
113
|
+
operator: matches
|
|
114
|
+
value: "^/home/.*"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Provider Adapters
|
|
118
|
+
|
|
119
|
+
### OpenAI
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
import { toOpenAITools, fromOpenAIToolCall } from 'veto-sdk';
|
|
123
|
+
|
|
124
|
+
const tools = toOpenAITools(definitions);
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Anthropic
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import { toAnthropicTools, fromAnthropicToolUse } from 'veto-sdk';
|
|
131
|
+
|
|
132
|
+
const tools = toAnthropicTools(definitions);
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Google
|
|
136
|
+
|
|
137
|
+
```typescript
|
|
138
|
+
import { toGoogleTool, fromGoogleFunctionCall } from 'veto-sdk';
|
|
139
|
+
|
|
140
|
+
const tools = toGoogleTool(definitions);
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Rule Conditions
|
|
144
|
+
|
|
145
|
+
| Operator | Description |
|
|
146
|
+
|----------|-------------|
|
|
147
|
+
| `equals` | Exact match |
|
|
148
|
+
| `not_equals` | Not equal |
|
|
149
|
+
| `contains` | String contains |
|
|
150
|
+
| `not_contains` | String does not contain |
|
|
151
|
+
| `starts_with` | String starts with |
|
|
152
|
+
| `ends_with` | String ends with |
|
|
153
|
+
| `matches` | Regex pattern |
|
|
154
|
+
| `greater_than` | Numeric > |
|
|
155
|
+
| `less_than` | Numeric < |
|
|
156
|
+
| `in` | Value in list |
|
|
157
|
+
| `not_in` | Value not in list |
|
|
158
|
+
|
|
159
|
+
## API
|
|
160
|
+
|
|
161
|
+
### `Veto.init(options?)`
|
|
162
|
+
|
|
163
|
+
Initialize Veto with configuration.
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
const veto = await Veto.init({
|
|
167
|
+
configDir: './my-veto',
|
|
168
|
+
mode: 'log',
|
|
169
|
+
logLevel: 'debug'
|
|
170
|
+
});
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### `veto.wrapTools(tools)`
|
|
174
|
+
|
|
175
|
+
Wrap tools to add validation.
|
|
176
|
+
|
|
177
|
+
```typescript
|
|
178
|
+
const { definitions, implementations } = veto.wrapTools(tools);
|
|
179
|
+
// definitions: schemas for AI provider (no handlers)
|
|
180
|
+
// implementations: callable functions with validation
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### `veto.validateToolCall(call)`
|
|
184
|
+
|
|
185
|
+
Manual validation for custom flows.
|
|
186
|
+
|
|
187
|
+
```typescript
|
|
188
|
+
const result = await veto.validateToolCall({
|
|
189
|
+
id: 'call_123',
|
|
190
|
+
name: 'read_file',
|
|
191
|
+
arguments: { path: '/etc/passwd' }
|
|
192
|
+
});
|
|
193
|
+
|
|
194
|
+
if (result.allowed) {
|
|
195
|
+
// proceed
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### `veto.getMode()`
|
|
200
|
+
|
|
201
|
+
Get current mode (`'strict'` or `'log'`).
|
|
202
|
+
|
|
203
|
+
### `veto.getLoadedRules()`
|
|
204
|
+
|
|
205
|
+
Get all loaded rules.
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
Apache-2.0
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI for running Veto benchmarks.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx tsx src/benchmark/cli.ts [options]
|
|
7
|
+
*
|
|
8
|
+
* Options:
|
|
9
|
+
* --dataset <path> Path to dataset (glob pattern)
|
|
10
|
+
* --model <name> Model to benchmark
|
|
11
|
+
* --max-samples <n> Maximum samples to evaluate
|
|
12
|
+
* --output <path> Output JSON file path
|
|
13
|
+
* --no-shuffle Don't shuffle samples
|
|
14
|
+
* --seed <n> Random seed for shuffling
|
|
15
|
+
* --concurrency <n> Parallel requests (default: 1; not yet implemented)
|
|
16
|
+
* --base-url <url> Ollama base URL
|
|
17
|
+
* --help Show this help
|
|
18
|
+
*
|
|
19
|
+
* @module benchmark/cli
|
|
20
|
+
*/
|
|
21
|
+
export {};
|
|
22
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/benchmark/cli.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;;;;;;GAkBG"}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI for running Veto benchmarks.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx tsx src/benchmark/cli.ts [options]
|
|
7
|
+
*
|
|
8
|
+
* Options:
|
|
9
|
+
* --dataset <path> Path to dataset (glob pattern)
|
|
10
|
+
* --model <name> Model to benchmark
|
|
11
|
+
* --max-samples <n> Maximum samples to evaluate
|
|
12
|
+
* --output <path> Output JSON file path
|
|
13
|
+
* --no-shuffle Don't shuffle samples
|
|
14
|
+
* --seed <n> Random seed for shuffling
|
|
15
|
+
* --concurrency <n> Parallel requests (default: 1; not yet implemented)
|
|
16
|
+
* --base-url <url> Ollama base URL
|
|
17
|
+
* --help Show this help
|
|
18
|
+
*
|
|
19
|
+
* @module benchmark/cli
|
|
20
|
+
*/
|
|
21
|
+
import { resolve } from 'node:path';
|
|
22
|
+
import { runBenchmark, formatReportConsole, saveReportJson, createProgressLogger, } from './runner.js';
|
|
23
|
+
import { DEFAULT_BENCHMARK_CONFIG } from './types.js';
|
|
24
|
+
import { createLogger } from '../utils/logger.js';
|
|
25
|
+
import { KernelClient } from '../kernel/client.js';
|
|
26
|
+
/**
 * Parse benchmark CLI arguments into a partial configuration object.
 *
 * Only options that were actually supplied appear on the result, so the caller
 * can spread it over its defaults. Unrecognised arguments are ignored.
 *
 * @param {string[]} args - Command-line arguments (without the node/script prefix).
 * @returns {object} Parsed options: help, datasetPath, kernel, maxSamples,
 *   outputPath, outputFormat, shuffle, seed, concurrency, includeResults,
 *   verbose and testConnection, each present only when its flag was given.
 * @throws {Error} When a flag that needs a value is the last argument, or when
 *   a numeric flag receives a value that does not parse as an integer.
 */
function parseArgs(args) {
    const config = {};
    // Returns the value following a flag, failing loudly instead of storing
    // `undefined` (which would later clobber the default during the merge).
    const requireValue = (flag, value) => {
        if (value === undefined) {
            throw new Error(`Missing value for ${flag}`);
        }
        return value;
    };
    // Same as requireValue, but also rejects values parseInt cannot read, so
    // NaN never reaches the benchmark configuration.
    const requireInt = (flag, value) => {
        const parsed = parseInt(requireValue(flag, value), 10);
        if (Number.isNaN(parsed)) {
            throw new Error(`Invalid integer value for ${flag}: ${value}`);
        }
        return parsed;
    };
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        const next = args[i + 1];
        switch (arg) {
            case '--help':
            case '-h':
                config.help = true;
                break;
            case '--dataset':
            case '-d':
                config.datasetPath = requireValue(arg, next);
                i++;
                break;
            case '--model':
            case '-m':
                // Keep kernel overrides parsed earlier (e.g. --base-url) instead
                // of resetting them to the defaults.
                config.kernel = {
                    ...DEFAULT_BENCHMARK_CONFIG.kernel,
                    ...config.kernel,
                    model: requireValue(arg, next),
                };
                i++;
                break;
            case '--max-samples':
            case '-n':
                config.maxSamples = requireInt(arg, next);
                i++;
                break;
            case '--output':
            case '-o':
                config.outputPath = requireValue(arg, next);
                config.outputFormat = 'both';
                i++;
                break;
            case '--no-shuffle':
                config.shuffle = false;
                break;
            case '--seed':
                config.seed = requireInt(arg, next);
                i++;
                break;
            case '--concurrency':
            case '-c':
                config.concurrency = requireInt(arg, next);
                i++;
                break;
            case '--base-url':
                config.kernel = {
                    ...DEFAULT_BENCHMARK_CONFIG.kernel,
                    ...config.kernel,
                    baseUrl: requireValue(arg, next),
                };
                i++;
                break;
            case '--include-results':
                config.includeResults = true;
                break;
            case '--verbose':
            case '-v':
                config.verbose = true;
                break;
            case '--test-connection':
                config.testConnection = true;
                break;
        }
    }
    return config;
}
|
|
91
|
+
/**
 * Write the benchmark CLI usage text to stdout.
 *
 * The default model and base URL shown in the text are read from
 * DEFAULT_BENCHMARK_CONFIG.kernel at call time.
 */
function printHelp() {
    const { model, baseUrl } = DEFAULT_BENCHMARK_CONFIG.kernel;
    const helpText = `
╔═══════════════════════════════════════════════════════════════╗
║ VETO BENCHMARK CLI ║
╚═══════════════════════════════════════════════════════════════╝

Usage:
npx tsx src/benchmark/cli.ts [options]

Options:
--dataset, -d <path> Path to dataset (glob pattern)
Default: data/batches/**/*.jsonl

--model, -m <name> Model to benchmark
Default: ${model}

--max-samples, -n <n> Maximum samples to evaluate (0 = all)
Default: 0 (all samples)

--output, -o <path> Output JSON file path
Default: none (console only)

--no-shuffle Don't shuffle samples before evaluation

--seed <n> Random seed for shuffling
Default: random

--concurrency, -c <n> Parallel requests (not yet implemented)
Default: 1

--base-url <url> Ollama API base URL
Default: ${baseUrl}

--include-results Include all individual results in JSON output

--verbose, -v Enable debug logging

--test-connection Test kernel connection before running

--help, -h Show this help message

Examples:
# Run full benchmark
npx tsx src/benchmark/cli.ts

# Run with 100 samples
npx tsx src/benchmark/cli.ts -n 100

# Save results to JSON
npx tsx src/benchmark/cli.ts -o results.json

# Use specific model
npx tsx src/benchmark/cli.ts -m veto-warden:latest

# Benchmark specific category
npx tsx src/benchmark/cli.ts -d "data/batches/finance/*.jsonl"
`;
    console.log(helpText);
}
|
|
152
|
+
/**
 * Main CLI entry point.
 *
 * Parses the process arguments, merges them over DEFAULT_BENCHMARK_CONFIG,
 * optionally checks the kernel connection, runs the benchmark, prints the
 * console report and, when an output path was given, saves the JSON report.
 *
 * Exits with code 0 after printing help. Exits with code 1 when no usable
 * dataset path is available, the connection test fails, the benchmark throws,
 * or the reported accuracy is below 0.9.
 */
async function main() {
    const args = process.argv.slice(2);
    const userConfig = parseArgs(args);
    if (userConfig.help) {
        printHelp();
        process.exit(0);
    }
    // Merge with defaults. The kernel section is merged on its own so that a
    // partial kernel override keeps the remaining default kernel settings.
    const config = {
        ...DEFAULT_BENCHMARK_CONFIG,
        ...userConfig,
        kernel: {
            ...DEFAULT_BENCHMARK_CONFIG.kernel,
            ...userConfig.kernel,
        },
    };
    // A dataset flag given without a value leaves datasetPath undefined, and
    // resolve() would then throw an opaque TypeError outside any try/catch.
    // Report the problem clearly and fail with a non-zero exit code instead.
    if (typeof config.datasetPath !== 'string') {
        console.error('\n❌ Benchmark failed:', 'Missing dataset path (use --dataset <path>)');
        process.exit(1);
    }
    // Resolve dataset path relative to cwd
    config.datasetPath = resolve(process.cwd(), config.datasetPath);
    console.log('');
    console.log('╔═══════════════════════════════════════════════════════════════╗');
    console.log('║ VETO BENCHMARK ║');
    console.log('╚═══════════════════════════════════════════════════════════════╝');
    console.log('');
    console.log(`Dataset: ${config.datasetPath}`);
    console.log(`Model: ${config.kernel.model}`);
    console.log(`Max Samples: ${config.maxSamples || 'all'}`);
    console.log(`Shuffle: ${config.shuffle}`);
    console.log('');
    // Create logger with appropriate level
    const logger = createLogger(userConfig.verbose ? 'debug' : 'info');
    // Test connection if requested
    if (userConfig.testConnection) {
        console.log('Testing kernel connection...');
        const kernelConfig = config.kernel;
        const kernelClient = new KernelClient({ config: kernelConfig, logger });
        try {
            const healthy = await kernelClient.healthCheck();
            if (healthy) {
                console.log('✅ Kernel connection successful');
            }
            else {
                console.log('❌ Kernel health check failed');
                process.exit(1);
            }
        }
        catch (error) {
            console.error('❌ Kernel connection error:', error instanceof Error ? error.message : error);
            if (error instanceof Error && error.cause) {
                // `cause` may be any value, not only an Error; print it as-is
                // when it has no message rather than logging `undefined`.
                const cause = error.cause;
                console.error(' Cause:', cause instanceof Error ? cause.message : cause);
            }
            process.exit(1);
        }
        // NOTE(review): datasetPath was passed through resolve() above, which
        // always yields a non-empty absolute path, so this branch looks
        // unreachable. Confirm whether "test connection only" was intended.
        if (!config.datasetPath) {
            process.exit(0);
        }
    }
    try {
        const report = await runBenchmark({
            config,
            onProgress: createProgressLogger(),
            logger,
        });
        // Print console report
        console.log(formatReportConsole(report));
        // Save JSON if requested
        if (config.outputPath) {
            const outputPath = resolve(process.cwd(), config.outputPath);
            saveReportJson(report, outputPath);
            console.log(`\nReport saved to: ${outputPath}`);
        }
        // Exit with error code if accuracy is below threshold
        if (report.metrics.accuracy < 0.9) {
            console.log('\n⚠️ Warning: Accuracy below 90% threshold');
            process.exit(1);
        }
    }
    catch (error) {
        console.error('\n❌ Benchmark failed:', error instanceof Error ? error.message : error);
        process.exit(1);
    }
}
|
|
236
|
+
// Run CLI. Any rejection that escapes main() is logged and also marks the
// process as failed, so callers and CI do not see exit code 0 after a crash.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
|
|
238
|
+
//# sourceMappingURL=cli.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/benchmark/cli.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EACL,YAAY,EACZ,mBAAmB,EACnB,cAAc,EACd,oBAAoB,GACrB,MAAM,aAAa,CAAC;AAErB,OAAO,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AACtD,OAAO,EAAE,YAAY,EAAe,MAAM,oBAAoB,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AAYnD,SAAS,SAAS,CAAC,IAAc;IAC/B,MAAM,MAAM,GAAe,EAAE,CAAC;IAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAEzB,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI;gBACP,MAAM,CAAC,IAAI,GAAG,IAAI,CAAC;gBACnB,MAAM;YACR,KAAK,WAAW,CAAC;YACjB,KAAK,IAAI;gBACP,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC;gBAC1B,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,SAAS,CAAC;YACf,KAAK,IAAI;gBACP,MAAM,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,CAAC,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;gBACpE,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,eAAe,CAAC;YACrB,KAAK,IAAI;gBACP,MAAM,CAAC,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACvC,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,UAAU,CAAC;YAChB,KAAK,IAAI;gBACP,MAAM,CAAC,UAAU,GAAG,IAAI,CAAC;gBACzB,MAAM,CAAC,YAAY,GAAG,MAAM,CAAC;gBAC7B,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,cAAc;gBACjB,MAAM,CAAC,OAAO,GAAG,KAAK,CAAC;gBACvB,MAAM;YACR,KAAK,QAAQ;gBACX,MAAM,CAAC,IAAI,GAAG,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACjC,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,eAAe,CAAC;YACrB,KAAK,IAAI;gBACP,MAAM,CAAC,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACxC,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,YAAY;gBACf,MAAM,CAAC,MAAM,GAAG;oBACd,GAAG,wBAAwB,CAAC,MAAM;oBAClC,GAAG,MAAM,CAAC,MAAM;oBAChB,OAAO,EAAE,IAAI;iBACd,CAAC;gBACF,CAAC,EAAE,CAAC;gBACJ,MAAM;YACR,KAAK,mBAAmB;gBACtB,MAAM,CAAC,cAAc,GAAG,IAAI,CAAC;gBAC7B,MAAM;YACR,KAAK,WAAW,CAAC;YACjB,KAAK,IAAI;gBACP,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;gBACtB,MAAM;YACR,KAAK,mBAAmB;gBACtB,MAAM,CAAC,cAAc,GAAG,IAAI,CAAC;gBAC7B,MAAM;QACV,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAE
D;;GAEG;AACH,SAAS,SAAS;IAChB,OAAO,CAAC,GAAG,CAAC;;;;;;;;;;;;;sCAawB,wBAAwB,CAAC,MAAM,CAAC,KAAK;;;;;;;;;;;;;;;;;sCAiBrC,wBAAwB,CAAC,MAAM,CAAC,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;CAyB5E,CAAC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,IAAI;IACjB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACnC,MAAM,UAAU,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAEnC,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC;QACpB,SAAS,EAAE,CAAC;QACZ,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,sBAAsB;IACtB,MAAM,MAAM,GAAoB;QAC9B,GAAG,wBAAwB;QAC3B,GAAG,UAAU;QACb,MAAM,EAAE;YACN,GAAG,wBAAwB,CAAC,MAAM;YAClC,GAAG,UAAU,CAAC,MAAM;SACrB;KACF,CAAC;IAEF,uCAAuC;IACvC,MAAM,CAAC,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,CAAC,WAAW,CAAC,CAAC;IAEhE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAChB,OAAO,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC;IACjF,OAAO,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC;IACjF,OAAO,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC;IACjF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAChB,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC;IACjD,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,gBAAgB,MAAM,CAAC,UAAU,IAAI,KAAK,EAAE,CAAC,CAAC;IAC1D,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,OAAO,EAAE,CAAC,CAAC;IAC7C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,uCAAuC;IACvC,MAAM,MAAM,GAAW,YAAY,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAE3E,+BAA+B;IAC/B,IAAI,UAAU,CAAC,cAAc,EAAE,CAAC;QAC9B,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;QAC5C,MAAM,YAAY,GAAiB,MAAM,CAAC,MAAM,CAAC;QACjD,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,EAAE,CAAC,CAAC;QAExE,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,YAAY,CAAC,WAAW,EAAE,CAAC;YACjD,IAAI,OAAO,EAAE,CAAC;gBACZ,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;YAChD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;gBAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,4BAA4B,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YAC5F,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC
1C,OAAO,CAAC,KAAK,CAAC,WAAW,EAAG,KAAK,CAAC,KAAe,CAAC,OAAO,CAAC,CAAC;YAC7D,CAAC;YACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;YAChC,MAAM;YACN,UAAU,EAAE,oBAAoB,EAAE;YAClC,MAAM;SACP,CAAC,CAAC;QAEH,uBAAuB;QACvB,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,CAAC;QAEzC,yBAAyB;QACzB,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YACtB,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;YAC7D,cAAc,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,sBAAsB,UAAU,EAAE,CAAC,CAAC;QAClD,CAAC;QAED,sDAAsD;QACtD,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,GAAG,GAAG,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,6CAA6C,CAAC,CAAC;YAC3D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IAEH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,uBAAuB,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QACvF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,UAAU;AACV,IAAI,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/benchmark/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,YAAY,CAAC;AAC3B,cAAc,aAAa,CAAC;AAC5B,cAAc,cAAc,CAAC;AAC7B,cAAc,aAAa,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/benchmark/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,YAAY,CAAC;AAC3B,cAAc,aAAa,CAAC;AAC5B,cAAc,cAAc,CAAC;AAC7B,cAAc,aAAa,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dataset loader for benchmark samples.
|
|
3
|
+
*
|
|
4
|
+
* Parses JSONL training data files and extracts benchmark samples
|
|
5
|
+
* with tool calls, rules, and expected decisions.
|
|
6
|
+
*
|
|
7
|
+
* @module benchmark/loader
|
|
8
|
+
*/
|
|
9
|
+
import type { BenchmarkSample } from './types.js';
|
|
10
|
+
/**
|
|
11
|
+
* Load benchmark samples from a glob pattern.
|
|
12
|
+
*
|
|
13
|
+
* @param pattern - Glob pattern for JSONL files
|
|
14
|
+
* @param maxSamples - Maximum samples to load (0 = all)
|
|
15
|
+
* @param shuffle - Whether to shuffle samples
|
|
16
|
+
* @param seed - Random seed for shuffling
|
|
17
|
+
*/
|
|
18
|
+
export declare function loadBenchmarkSamples(pattern: string, maxSamples?: number, shuffle?: boolean, seed?: number): Promise<BenchmarkSample[]>;
|
|
19
|
+
//# sourceMappingURL=loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/benchmark/loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AA4BlD;;;;;;;GAOG;AACH,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,MAAM,EACf,UAAU,GAAE,MAAU,EACtB,OAAO,GAAE,OAAe,EACxB,IAAI,CAAC,EAAE,MAAM,GACZ,OAAO,CAAC,eAAe,EAAE,CAAC,CA6C5B"}
|