@pauly4010/evalai-sdk 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +80 -4
- package/dist/__tests__/assertions.test.d.ts +1 -0
- package/dist/__tests__/assertions.test.js +288 -0
- package/dist/__tests__/client.test.d.ts +1 -0
- package/dist/__tests__/client.test.js +185 -0
- package/dist/__tests__/testing.test.d.ts +1 -0
- package/dist/__tests__/testing.test.js +230 -0
- package/dist/__tests__/workflows.test.d.ts +1 -0
- package/dist/__tests__/workflows.test.js +222 -0
- package/dist/cli/check.d.ts +58 -0
- package/dist/cli/check.js +215 -0
- package/dist/cli/index.d.ts +4 -2
- package/dist/cli/index.js +38 -175
- package/dist/client.d.ts +14 -1
- package/dist/client.js +56 -6
- package/dist/index.d.ts +1 -0
- package/dist/index.js +6 -1
- package/dist/types.d.ts +8 -0
- package/dist/workflows.js +2 -7
- package/package.json +17 -29
- package/LICENSE +0 -21
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* evalai check — CI/CD evaluation gate
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* evalai check --minScore 92 --evaluationId 42
|
|
8
|
+
* evalai check --minScore 90 --maxDrop 5 --evaluationId 42
|
|
9
|
+
* evalai check --policy HIPAA --evaluationId 42
|
|
10
|
+
* evalai check --baseline published --evaluationId 42
|
|
11
|
+
*
|
|
12
|
+
* Flags:
|
|
13
|
+
* --minScore <n> Fail if quality score < n (0-100)
|
|
14
|
+
* --maxDrop <n> Fail if score dropped > n points from baseline
|
|
15
|
+
* --minN <n> Fail if total test cases < n (low sample size)
|
|
16
|
+
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
17
|
+
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
18
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
19
|
+
* --evaluationId <id> Required. The evaluation to gate on.
|
|
20
|
+
* --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
|
|
21
|
+
* --apiKey <key> API key (default: EVALAI_API_KEY env var)
|
|
22
|
+
*
|
|
23
|
+
* Exit codes:
|
|
24
|
+
* 0 — Gate passed
|
|
25
|
+
* 1 — Gate failed: score below threshold
|
|
26
|
+
* 2 — Gate failed: regression exceeded maxDrop
|
|
27
|
+
* 3 — Gate failed: policy violation
|
|
28
|
+
* 4 — API error / network failure
|
|
29
|
+
* 5 — Invalid arguments
|
|
30
|
+
* 6 — Gate failed: total test cases < minN
|
|
31
|
+
* 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
|
|
32
|
+
*
|
|
33
|
+
* Environment:
|
|
34
|
+
* EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
|
|
35
|
+
* EVALAI_API_KEY — API key for authentication
|
|
36
|
+
*/
|
|
37
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
38
|
+
exports.EXIT = void 0;
|
|
39
|
+
exports.parseArgs = parseArgs;
|
|
40
|
+
exports.runCheck = runCheck;
|
|
41
|
+
// Standardized exit codes
|
|
42
|
+
exports.EXIT = {
|
|
43
|
+
PASS: 0,
|
|
44
|
+
SCORE_BELOW: 1,
|
|
45
|
+
REGRESSION: 2,
|
|
46
|
+
POLICY_VIOLATION: 3,
|
|
47
|
+
API_ERROR: 4,
|
|
48
|
+
BAD_ARGS: 5,
|
|
49
|
+
LOW_N: 6,
|
|
50
|
+
WEAK_EVIDENCE: 7,
|
|
51
|
+
};
|
|
52
|
+
/**
 * Parse the flags of `evalai check` into a validated options object.
 *
 * Flags are given as `--name value`. A flag that is followed by another
 * `--flag`, or by nothing, is treated as a bare flag and recorded as the
 * string 'true'. Tokens that do not start with `--` and are not consumed as a
 * value are ignored.
 *
 * On invalid input this prints a message to stderr and calls
 * `process.exit(EXIT.BAD_ARGS)`.
 *
 * @param {string[]} argv Arguments after the subcommand (e.g. process.argv.slice(2)).
 * @returns {{baseUrl: string, apiKey: string, minScore: number,
 *   maxDrop: (number|undefined), minN: (number|undefined),
 *   allowWeakEvidence: boolean, evaluationId: string,
 *   policy: (string|undefined), baseline: ('published'|'previous'|'production')}}
 */
function parseArgs(argv) {
    const args = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith('--')) {
            continue;
        }
        const key = arg.slice(2);
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith('--')) {
            args[key] = next;
            i++; // the value token has been consumed
        }
        else {
            args[key] = 'true'; // bare flag
        }
    }
    // Always parse as base 10 so inputs such as "0x1F" are not read as hex.
    const toInt = (text) => Number.parseInt(text, 10);
    const baseUrl = args.baseUrl || process.env.EVALAI_BASE_URL || 'http://localhost:3000';
    const apiKey = args.apiKey || process.env.EVALAI_API_KEY || '';
    const minScore = toInt(args.minScore || '0');
    const maxDrop = args.maxDrop ? toInt(args.maxDrop) : undefined;
    const minN = args.minN ? toInt(args.minN) : undefined;
    const allowWeakEvidence = args.allowWeakEvidence === 'true' || args.allowWeakEvidence === '1';
    const evaluationId = args.evaluationId || '';
    const policy = args.policy || undefined;
    // Anything other than the two explicit alternatives falls back to 'published'.
    const baseline = (args.baseline === 'previous'
        ? 'previous'
        : args.baseline === 'production'
            ? 'production'
            : 'published');
    if (!apiKey) {
        console.error('Error: --apiKey or EVALAI_API_KEY is required');
        process.exit(exports.EXIT.BAD_ARGS);
    }
    if (!evaluationId) {
        console.error('Error: --evaluationId is required');
        process.exit(exports.EXIT.BAD_ARGS);
    }
    if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
        console.error('Error: --minScore must be 0-100');
        process.exit(exports.EXIT.BAD_ARGS);
    }
    if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
        console.error('Error: --minN must be a positive number');
        process.exit(exports.EXIT.BAD_ARGS);
    }
    // A NaN maxDrop would make the regression comparison in runCheck always
    // false, silently disabling the gate, so reject it up front.
    if (maxDrop !== undefined && Number.isNaN(maxDrop)) {
        console.error('Error: --maxDrop must be a number');
        process.exit(exports.EXIT.BAD_ARGS);
    }
    return { baseUrl, apiKey, minScore, maxDrop, minN, allowWeakEvidence, evaluationId, policy, baseline };
}
|
|
99
|
+
/**
 * Run the CI/CD evaluation gate and resolve to one of the EXIT codes.
 *
 * Fetches `GET {baseUrl}/api/quality` for the evaluation, prints a summary
 * box, then applies the gates in this order: missing baseline, minN, weak
 * evidence, minScore, maxDrop, policy. The first failing gate decides the
 * returned code. Network failures, non-2xx responses and unparseable response
 * bodies all resolve to EXIT.API_ERROR rather than rejecting.
 *
 * Response fields read: score, total, evidenceLevel, baselineScore,
 * regressionDelta, baselineMissing, breakdown ({passRate, safety, judge}), flags.
 *
 * @param {object} args Options as produced by parseArgs.
 * @returns {Promise<number>} An exit code from EXIT.
 */
async function runCheck(args) {
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    const headers = { Authorization: `Bearer ${args.apiKey}` };
    // ── 1. Fetch latest quality score ──
    // URLSearchParams percent-encodes the values, so an evaluationId or
    // baseline containing '&', '#', spaces, etc. cannot corrupt the query.
    const query = new URLSearchParams({
        evaluationId: String(args.evaluationId),
        action: 'latest',
        baseline: String(args.baseline),
    });
    const scoreUrl = `${args.baseUrl}/api/quality?${query}`;
    let scoreRes;
    try {
        scoreRes = await fetch(scoreUrl, { headers });
    }
    catch (err) {
        console.error(`EvalAI gate ERROR: Network failure — ${describeError(err)}`);
        return exports.EXIT.API_ERROR;
    }
    if (!scoreRes.ok) {
        const body = await scoreRes.text();
        console.error(`EvalAI gate ERROR: API returned ${scoreRes.status} — ${body}`);
        return exports.EXIT.API_ERROR;
    }
    let data;
    try {
        data = await scoreRes.json();
    }
    catch (err) {
        // A 2xx response with a non-JSON body is still an API failure.
        console.error(`EvalAI gate ERROR: API returned invalid JSON — ${describeError(err)}`);
        return exports.EXIT.API_ERROR;
    }
    const score = data?.score ?? 0;
    const total = data?.total ?? null;
    const evidenceLevel = data?.evidenceLevel ?? null;
    const baselineScore = data?.baselineScore ?? null;
    const regressionDelta = data?.regressionDelta ?? null;
    const baselineMissing = data?.baselineMissing === true;
    const breakdown = data?.breakdown ?? {};
    // ── Gate: baseline missing (when baseline comparison requested) ──
    if (baselineMissing && (args.baseline !== 'published' || args.maxDrop !== undefined)) {
        console.error(`\n✗ FAILED: baseline (${args.baseline}) not found. ` +
            `Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`);
        return exports.EXIT.API_ERROR;
    }
    // ── Gate: minN (low sample size) ──
    // NOTE(review): when the response has no `total`, this gate is skipped
    // rather than failed — confirm that is the intended behavior.
    if (args.minN !== undefined && total !== null && total < args.minN) {
        console.error(`\n✗ FAILED: total test cases (${total}) < minN (${args.minN})`);
        return exports.EXIT.LOW_N;
    }
    // ── Gate: allowWeakEvidence ──
    if (!args.allowWeakEvidence && evidenceLevel === 'weak') {
        console.error(`\n✗ FAILED: evidence level is 'weak' (use --allowWeakEvidence to permit)`);
        return exports.EXIT.WEAK_EVIDENCE;
    }
    // ── Print summary ──
    console.log('┌─────────────────────────────────────────┐');
    console.log(`│ EvalAI Quality Score: ${String(score).padStart(3)}/100 │`);
    console.log('├─────────────────────────────────────────┤');
    if (baselineScore !== null) {
        const delta = regressionDelta ?? 0;
        const arrow = delta >= 0 ? '▲' : '▼';
        console.log(`│ Baseline: ${baselineScore} ${arrow} ${Math.abs(delta)} pts │`);
    }
    if (breakdown) {
        // Breakdown values are fractions (0-1); missing ones print as 0%.
        const pct = (v) => `${Math.round((v ?? 0) * 100)}%`;
        console.log(`│ Pass: ${pct(breakdown.passRate)} Safety: ${pct(breakdown.safety)} Judge: ${pct(breakdown.judge)} │`);
    }
    if (data?.flags && data.flags.length > 0) {
        console.log(`│ Flags: ${data.flags.join(', ').padEnd(30)} │`);
    }
    console.log('└─────────────────────────────────────────┘');
    // ── 2. Gate: minimum score ──
    if (args.minScore > 0 && score < args.minScore) {
        console.error(`\n✗ FAILED: score=${score} < minScore=${args.minScore}`);
        return exports.EXIT.SCORE_BELOW;
    }
    // ── 3. Gate: maximum drop from baseline ──
    // regressionDelta is negative when the score fell below the baseline.
    if (args.maxDrop !== undefined && regressionDelta !== null && regressionDelta < -(args.maxDrop)) {
        console.error(`\n✗ FAILED: score dropped ${Math.abs(regressionDelta)} pts from baseline ` +
            `(max allowed: ${args.maxDrop})`);
        return exports.EXIT.REGRESSION;
    }
    // ── 4. Gate: policy compliance ──
    if (args.policy) {
        // Policy checks reuse the response fetched above; no second request is made.
        const policyFlags = (data?.flags ?? []);
        // Policy mapping: minimum safety rate plus the flags that disqualify a run.
        const policyChecks = {
            HIPAA: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK'] },
            SOC2: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            GDPR: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
            PCI_DSS: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            FINRA_4511: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
        };
        const policyName = args.policy.toUpperCase();
        const check = policyChecks[policyName];
        if (!check) {
            console.error(`\n✗ Unknown policy: ${args.policy}. Available: ${Object.keys(policyChecks).join(', ')}`);
            return exports.EXIT.BAD_ARGS;
        }
        // Check safety rate
        const safetyRate = breakdown?.safety ?? 0;
        if (safetyRate < check.requiredSafetyRate) {
            console.error(`\n✗ POLICY VIOLATION (${policyName}): safety rate ${Math.round(safetyRate * 100)}% < ` +
                `required ${Math.round(check.requiredSafetyRate * 100)}%`);
            return exports.EXIT.POLICY_VIOLATION;
        }
        // Check for disqualifying flags
        const violations = policyFlags.filter((f) => check.maxFlags.includes(f));
        if (violations.length > 0) {
            console.error(`\n✗ POLICY VIOLATION (${policyName}): ${violations.join(', ')}`);
            return exports.EXIT.POLICY_VIOLATION;
        }
        console.log(`\n✓ Policy ${policyName}: COMPLIANT`);
    }
    console.log('\n✓ EvalAI gate PASSED');
    return exports.EXIT.PASS;
}
|
|
205
|
+
// Main entry point
|
|
206
|
+
// Only run the gate when this file is executed directly (node check.js ...),
// not when it is required as a module for programmatic use.
const isDirectRun = typeof require !== 'undefined' && require.main === module;
if (isDirectRun) {
    const args = parseArgs(process.argv.slice(2));
    runCheck(args)
        .then((code) => {
            process.exit(code);
        })
        .catch((err) => {
            // A rejection value is not guaranteed to be an Error instance.
            console.error(`EvalAI gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
            process.exit(exports.EXIT.API_ERROR);
        });
}
|
package/dist/cli/index.d.ts
CHANGED
package/dist/cli/index.js
CHANGED
|
@@ -1,181 +1,44 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
"use strict";
|
|
3
3
|
/**
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* evalai — EvalAI CLI
|
|
5
|
+
*
|
|
6
|
+
* Commands:
|
|
7
|
+
* evalai check — CI/CD evaluation gate (run evalai --help for the option list)
|
|
6
8
|
*/
|
|
7
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
8
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
9
|
-
};
|
|
10
9
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
|
|
15
|
-
const
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
// Initialize project
|
|
22
|
-
program
|
|
23
|
-
.command('init')
|
|
24
|
-
.description('Initialize a new evaluation project')
|
|
25
|
-
.option('-d, --dir <directory>', 'Project directory', '.')
|
|
26
|
-
.action(async (options) => {
|
|
27
|
-
const dir = path_1.default.resolve(options.dir);
|
|
28
|
-
console.log('🚀 Initializing EvalAI project...');
|
|
29
|
-
// Create directory structure
|
|
30
|
-
await promises_1.default.mkdir(path_1.default.join(dir, '.evalai'), { recursive: true });
|
|
31
|
-
await promises_1.default.mkdir(path_1.default.join(dir, '.evalai', 'snapshots'), { recursive: true });
|
|
32
|
-
await promises_1.default.mkdir(path_1.default.join(dir, 'evaluations'), { recursive: true });
|
|
33
|
-
// Create config file
|
|
34
|
-
const config = {
|
|
35
|
-
apiKey: process.env.EVALAI_API_KEY || '',
|
|
36
|
-
projectId: process.env.EVALAI_PROJECT_ID || '',
|
|
37
|
-
baseUrl: 'http://localhost:3000/api',
|
|
38
|
-
debug: false,
|
|
39
|
-
retry: {
|
|
40
|
-
maxAttempts: 3,
|
|
41
|
-
backoff: 'exponential'
|
|
42
|
-
}
|
|
43
|
-
};
|
|
44
|
-
await promises_1.default.writeFile(path_1.default.join(dir, 'evalai.config.json'), JSON.stringify(config, null, 2));
|
|
45
|
-
// Create example evaluation file
|
|
46
|
-
const exampleEval = `import { AIEvalClient, createTestSuite, expect } from '@pauly4010/evalai-sdk'
|
|
47
|
-
|
|
48
|
-
const client = AIEvalClient.init()
|
|
49
|
-
|
|
50
|
-
const suite = createTestSuite('example-evaluation', {
|
|
51
|
-
cases: [
|
|
52
|
-
{
|
|
53
|
-
input: 'What is 2+2?',
|
|
54
|
-
expected: '4',
|
|
55
|
-
name: 'simple-math'
|
|
56
|
-
},
|
|
57
|
-
{
|
|
58
|
-
input: 'Explain AI in simple terms',
|
|
59
|
-
expected: (output) => {
|
|
60
|
-
expect(output).toContainKeywords(['artificial', 'intelligence'])
|
|
61
|
-
expect(output).toHaveLength({ min: 50, max: 500 })
|
|
62
|
-
return true
|
|
63
|
-
},
|
|
64
|
-
name: 'ai-explanation'
|
|
65
|
-
}
|
|
66
|
-
]
|
|
67
|
-
})
|
|
68
|
-
|
|
69
|
-
// Run the test suite
|
|
70
|
-
suite.run().then(results => {
|
|
71
|
-
console.log('Test Results:', results)
|
|
72
|
-
console.log(\`Passed: \${results.passed}/\${results.total}\`)
|
|
73
|
-
})
|
|
74
|
-
`;
|
|
75
|
-
await promises_1.default.writeFile(path_1.default.join(dir, 'evaluations', 'example.ts'), exampleEval);
|
|
76
|
-
console.log('✅ Project initialized successfully!');
|
|
77
|
-
console.log('\nNext steps:');
|
|
78
|
-
console.log('1. Set your API key: export EVALAI_API_KEY=your-key');
|
|
79
|
-
console.log('2. Set your project ID: export EVALAI_PROJECT_ID=your-project');
|
|
80
|
-
console.log('3. Run evaluations: npx evalai eval:run');
|
|
81
|
-
});
|
|
82
|
-
// Run evaluations
|
|
83
|
-
program
|
|
84
|
-
.command('eval:run')
|
|
85
|
-
.description('Run evaluation tests')
|
|
86
|
-
.option('-c, --config <path>', 'Config file path', './evalai.config.json')
|
|
87
|
-
.option('-f, --file <path>', 'Evaluation file to run')
|
|
88
|
-
.action(async (options) => {
|
|
89
|
-
console.log('🧪 Running evaluations...');
|
|
90
|
-
// Load config
|
|
91
|
-
const configPath = path_1.default.resolve(options.config);
|
|
92
|
-
let config;
|
|
93
|
-
try {
|
|
94
|
-
const configContent = await promises_1.default.readFile(configPath, 'utf-8');
|
|
95
|
-
config = JSON.parse(configContent);
|
|
96
|
-
}
|
|
97
|
-
catch (error) {
|
|
98
|
-
console.error('❌ Config file not found. Run "evalai init" first.');
|
|
99
|
-
process.exit(1);
|
|
100
|
-
}
|
|
101
|
-
const client = client_1.AIEvalClient.init(config);
|
|
102
|
-
// If file specified, run that file
|
|
103
|
-
if (options.file) {
|
|
104
|
-
console.log(`Running ${options.file}...`);
|
|
105
|
-
// Dynamic import of evaluation file would go here
|
|
106
|
-
// This requires compilation step for TS files
|
|
107
|
-
}
|
|
108
|
-
else {
|
|
109
|
-
// Run all evaluations in the evaluations directory
|
|
110
|
-
console.log('Running all evaluations...');
|
|
111
|
-
}
|
|
112
|
-
console.log('✅ Evaluations completed!');
|
|
113
|
-
});
|
|
114
|
-
// List traces
|
|
115
|
-
program
|
|
116
|
-
.command('traces')
|
|
117
|
-
.description('List and filter traces')
|
|
118
|
-
.option('-l, --limit <number>', 'Number of traces to show', '10')
|
|
119
|
-
.option('--failed', 'Show only failed traces')
|
|
120
|
-
.option('--slow', 'Show slow traces (>5s)')
|
|
121
|
-
.action(async (options) => {
|
|
122
|
-
const configPath = path_1.default.resolve('./evalai.config.json');
|
|
123
|
-
let config;
|
|
124
|
-
try {
|
|
125
|
-
const configContent = await promises_1.default.readFile(configPath, 'utf-8');
|
|
126
|
-
config = JSON.parse(configContent);
|
|
127
|
-
}
|
|
128
|
-
catch (error) {
|
|
129
|
-
console.error('❌ Config file not found. Run "evalai init" first.');
|
|
130
|
-
process.exit(1);
|
|
131
|
-
}
|
|
132
|
-
const client = client_1.AIEvalClient.init(config);
|
|
133
|
-
console.log('📊 Fetching traces...');
|
|
134
|
-
// API call to get traces would go here
|
|
135
|
-
console.log(`Showing ${options.limit} traces`);
|
|
136
|
-
});
|
|
137
|
-
// Export data
|
|
138
|
-
program
|
|
139
|
-
.command('export')
|
|
140
|
-
.description('Export data from EvalAI')
|
|
141
|
-
.option('-f, --format <format>', 'Export format (json, csv, xlsx)', 'json')
|
|
142
|
-
.option('-o, --output <path>', 'Output file path', './export')
|
|
143
|
-
.option('-t, --type <type>', 'Data type (traces, evaluations, all)', 'all')
|
|
144
|
-
.action(async (options) => {
|
|
145
|
-
const configPath = path_1.default.resolve('./evalai.config.json');
|
|
146
|
-
let config;
|
|
147
|
-
try {
|
|
148
|
-
const configContent = await promises_1.default.readFile(configPath, 'utf-8');
|
|
149
|
-
config = JSON.parse(configContent);
|
|
150
|
-
}
|
|
151
|
-
catch (error) {
|
|
152
|
-
console.error('❌ Config file not found. Run "evalai init" first.');
|
|
153
|
-
process.exit(1);
|
|
154
|
-
}
|
|
155
|
-
const client = client_1.AIEvalClient.init(config);
|
|
156
|
-
console.log(`📥 Exporting data as ${options.format}...`);
|
|
157
|
-
const data = await (0, export_1.exportData)(client, {
|
|
158
|
-
format: options.format,
|
|
159
|
-
includeTraces: true,
|
|
160
|
-
includeEvaluations: true
|
|
10
|
+
const check_1 = require("./check");
|
|
11
|
+
const argv = process.argv.slice(2);
|
|
12
|
+
const subcommand = argv[0];
|
|
13
|
+
if (subcommand === 'check') {
|
|
14
|
+
const args = (0, check_1.parseArgs)(argv.slice(1));
|
|
15
|
+
(0, check_1.runCheck)(args)
|
|
16
|
+
.then((code) => process.exit(code))
|
|
17
|
+
.catch((err) => {
|
|
18
|
+
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
19
|
+
process.exit(4);
|
|
161
20
|
});
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
21
|
+
}
|
|
22
|
+
else {
|
|
23
|
+
console.log(`EvalAI CLI
|
|
24
|
+
|
|
25
|
+
Usage:
|
|
26
|
+
evalai check [options] CI/CD evaluation gate
|
|
27
|
+
|
|
28
|
+
Options for check:
|
|
29
|
+
--evaluationId <id> Required. Evaluation to gate on.
|
|
30
|
+
--apiKey <key> API key (or EVALAI_API_KEY env)
|
|
31
|
+
--minScore <n> Fail if score < n (0-100)
|
|
32
|
+
--maxDrop <n> Fail if score dropped > n from baseline
|
|
33
|
+
--minN <n> Fail if total test cases < n
|
|
34
|
+
--allowWeakEvidence Allow weak evidence level
|
|
35
|
+
--policy <name> Enforce policy (HIPAA, SOC2, GDPR, etc.)
|
|
36
|
+
--baseline <mode> "published" or "previous"
|
|
37
|
+
--baseUrl <url> API base URL
|
|
38
|
+
|
|
39
|
+
Examples:
|
|
40
|
+
evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
41
|
+
evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
42
|
+
`);
|
|
43
|
+
process.exit(subcommand === '--help' || subcommand === '-h' ? 0 : 1);
|
|
44
|
+
}
|
package/dist/client.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { ClientConfig, Trace, CreateTraceParams, ListTracesParams, Evaluation, CreateEvaluationParams, UpdateEvaluationParams, ListEvaluationsParams, LLMJudgeResult, RunLLMJudgeParams, TestCase, CreateTestCaseParams, EvaluationRun, CreateRunParams, Span, CreateSpanParams, OrganizationLimits, Annotation, CreateAnnotationParams, ListAnnotationsParams, AnnotationTask, CreateAnnotationTaskParams, ListAnnotationTasksParams, AnnotationItem, CreateAnnotationItemParams, ListAnnotationItemsParams, APIKey, APIKeyWithSecret, CreateAPIKeyParams, UpdateAPIKeyParams, ListAPIKeysParams, APIKeyUsage, Webhook, CreateWebhookParams, UpdateWebhookParams, ListWebhooksParams, WebhookDelivery, ListWebhookDeliveriesParams, UsageStats, GetUsageParams, UsageSummary, LLMJudgeConfig, CreateLLMJudgeConfigParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, LLMJudgeAlignment, GetLLMJudgeAlignmentParams, Organization } from './types';
|
|
1
|
+
import { ClientConfig, Trace, CreateTraceParams, ListTracesParams, Evaluation, CreateEvaluationParams, UpdateEvaluationParams, ListEvaluationsParams, LLMJudgeResult, RunLLMJudgeParams, TestCase, CreateTestCaseParams, EvaluationRun, CreateRunParams, Span, CreateSpanParams, UpdateTraceParams, OrganizationLimits, Annotation, CreateAnnotationParams, ListAnnotationsParams, AnnotationTask, CreateAnnotationTaskParams, ListAnnotationTasksParams, AnnotationItem, CreateAnnotationItemParams, ListAnnotationItemsParams, APIKey, APIKeyWithSecret, CreateAPIKeyParams, UpdateAPIKeyParams, ListAPIKeysParams, APIKeyUsage, Webhook, CreateWebhookParams, UpdateWebhookParams, ListWebhooksParams, WebhookDelivery, ListWebhookDeliveriesParams, UsageStats, GetUsageParams, UsageSummary, LLMJudgeConfig, CreateLLMJudgeConfigParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, LLMJudgeAlignment, GetLLMJudgeAlignmentParams, Organization } from './types';
|
|
2
2
|
import { Logger } from './logger';
|
|
3
3
|
/**
|
|
4
4
|
* AI Evaluation Platform SDK Client
|
|
@@ -126,6 +126,19 @@ declare class TraceAPI {
|
|
|
126
126
|
* Get a single trace by ID
|
|
127
127
|
*/
|
|
128
128
|
get(id: number): Promise<Trace>;
|
|
129
|
+
/**
|
|
130
|
+
* Update an existing trace (e.g. set status, duration, metadata on completion)
|
|
131
|
+
*
|
|
132
|
+
* @example
|
|
133
|
+
* ```typescript
|
|
134
|
+
* await client.traces.update(42, {
|
|
135
|
+
* status: 'success',
|
|
136
|
+
* durationMs: 1234,
|
|
137
|
+
* metadata: { output: 'done' }
|
|
138
|
+
* });
|
|
139
|
+
* ```
|
|
140
|
+
*/
|
|
141
|
+
update<TMetadata = Record<string, any>>(id: number, params: UpdateTraceParams<TMetadata>): Promise<Trace<TMetadata>>;
|
|
129
142
|
/**
|
|
130
143
|
* Create a span for a trace
|
|
131
144
|
*/
|
package/dist/client.js
CHANGED
|
@@ -76,13 +76,45 @@ class AIEvalClient {
|
|
|
76
76
|
this.cache = new cache_1.RequestCache(config.cacheSize || 1000);
|
|
77
77
|
// Initialize request batcher if enabled (default: enabled)
|
|
78
78
|
if (config.enableBatching !== false) {
|
|
79
|
+
const MAX_CONCURRENCY = 5;
|
|
79
80
|
this.batcher = new batch_1.RequestBatcher(async (requests) => {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
const results = [];
|
|
82
|
+
const executing = [];
|
|
83
|
+
for (const req of requests) {
|
|
84
|
+
const task = (async () => {
|
|
85
|
+
try {
|
|
86
|
+
const data = await this.request(req.endpoint, {
|
|
87
|
+
method: req.method,
|
|
88
|
+
body: req.body ? JSON.stringify(req.body) : undefined,
|
|
89
|
+
headers: req.headers,
|
|
90
|
+
});
|
|
91
|
+
results.push({ id: req.id, status: 200, data });
|
|
92
|
+
}
|
|
93
|
+
catch (err) {
|
|
94
|
+
results.push({
|
|
95
|
+
id: req.id,
|
|
96
|
+
status: err?.statusCode || 500,
|
|
97
|
+
data: null,
|
|
98
|
+
error: err?.message || 'Unknown error',
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
})();
|
|
102
|
+
executing.push(task);
|
|
103
|
+
if (executing.length >= MAX_CONCURRENCY) {
|
|
104
|
+
await Promise.race(executing);
|
|
105
|
+
// Remove settled promises
|
|
106
|
+
for (let i = executing.length - 1; i >= 0; i--) {
|
|
107
|
+
const settled = await Promise.race([
|
|
108
|
+
executing[i].then(() => true),
|
|
109
|
+
Promise.resolve(false),
|
|
110
|
+
]);
|
|
111
|
+
if (settled)
|
|
112
|
+
executing.splice(i, 1);
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
await Promise.allSettled(executing);
|
|
117
|
+
return results;
|
|
86
118
|
}, {
|
|
87
119
|
maxBatchSize: config.batchSize || 10,
|
|
88
120
|
batchDelay: config.batchDelay || 50,
|
|
@@ -338,6 +370,24 @@ class TraceAPI {
|
|
|
338
370
|
async get(id) {
|
|
339
371
|
return this.client.request(`/api/traces/${id}`);
|
|
340
372
|
}
|
|
373
|
+
/**
|
|
374
|
+
* Update an existing trace (e.g. set status, duration, metadata on completion)
|
|
375
|
+
*
|
|
376
|
+
* @example
|
|
377
|
+
* ```typescript
|
|
378
|
+
* await client.traces.update(42, {
|
|
379
|
+
* status: 'success',
|
|
380
|
+
* durationMs: 1234,
|
|
381
|
+
* metadata: { output: 'done' }
|
|
382
|
+
* });
|
|
383
|
+
* ```
|
|
384
|
+
*/
|
|
385
|
+
async update(id, params) {
|
|
386
|
+
return this.client.request(`/api/traces/${id}`, {
|
|
387
|
+
method: 'PATCH',
|
|
388
|
+
body: JSON.stringify(params),
|
|
389
|
+
});
|
|
390
|
+
}
|
|
341
391
|
/**
|
|
342
392
|
* Create a span for a trace
|
|
343
393
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -31,5 +31,6 @@ export { WorkflowTracer, createWorkflowTracer, traceWorkflowStep, traceLangChain
|
|
|
31
31
|
export type { ClientConfig as AIEvalConfig, Trace as TraceData, Span as SpanData, Evaluation as EvaluationData, LLMJudgeResult as LLMJudgeData, RetryConfig, GenericMetadata as AnnotationData, TracedResponse, TestCase, TestResult, SnapshotData, ExportOptions, ImportOptions, StreamOptions, BatchOptions } from './types';
|
|
32
32
|
export { EvaluationTemplates, type EvaluationTemplateType, type FeatureUsage, type OrganizationLimits } from './types';
|
|
33
33
|
export type { Annotation, CreateAnnotationParams, ListAnnotationsParams, AnnotationTask, CreateAnnotationTaskParams, ListAnnotationTasksParams, AnnotationItem, CreateAnnotationItemParams, ListAnnotationItemsParams, APIKey, APIKeyWithSecret, CreateAPIKeyParams, UpdateAPIKeyParams, ListAPIKeysParams, APIKeyUsage, Webhook, CreateWebhookParams, UpdateWebhookParams, ListWebhooksParams, WebhookDelivery, ListWebhookDeliveriesParams, UsageStats, GetUsageParams, UsageSummary, LLMJudgeConfig, CreateLLMJudgeConfigParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, LLMJudgeAlignment, GetLLMJudgeAlignmentParams, Organization, } from './types';
|
|
34
|
+
export { parseArgs, runCheck, EXIT, type CheckArgs } from './cli/check';
|
|
34
35
|
import { AIEvalClient } from './client';
|
|
35
36
|
export default AIEvalClient;
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
11
|
exports.decodeCursor = exports.encodeCursor = exports.autoPaginate = exports.createPaginatedIterator = exports.PaginatedIterator = exports.CacheTTL = exports.RequestCache = exports.RateLimiter = exports.batchRead = exports.streamEvaluation = exports.batchProcess = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.hasValidCodeSyntax = exports.containsAllRequiredFields = exports.followsInstructions = exports.hasNoToxicity = exports.respondedWithinTime = exports.hasFactualAccuracy = exports.containsLanguage = exports.hasReadabilityScore = exports.matchesSchema = exports.hasNoHallucinations = exports.isValidURL = exports.isValidEmail = exports.withinRange = exports.similarTo = exports.hasSentiment = exports.notContainsPII = exports.containsJSON = exports.hasLength = exports.matchesPattern = exports.containsKeywords = exports.expect = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalAIError = exports.AIEvalClient = void 0;
|
|
12
|
-
exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
|
|
12
|
+
exports.EXIT = exports.runCheck = exports.parseArgs = exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
|
|
13
13
|
// Main SDK exports
|
|
14
14
|
var client_1 = require("./client");
|
|
15
15
|
Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
|
|
@@ -106,6 +106,11 @@ Object.defineProperty(exports, "traceAutoGen", { enumerable: true, get: function
|
|
|
106
106
|
// New exports for v1.1.0
|
|
107
107
|
var types_1 = require("./types");
|
|
108
108
|
Object.defineProperty(exports, "EvaluationTemplates", { enumerable: true, get: function () { return types_1.EvaluationTemplates; } });
|
|
109
|
+
// CLI (programmatic use)
|
|
110
|
+
var check_1 = require("./cli/check");
|
|
111
|
+
Object.defineProperty(exports, "parseArgs", { enumerable: true, get: function () { return check_1.parseArgs; } });
|
|
112
|
+
Object.defineProperty(exports, "runCheck", { enumerable: true, get: function () { return check_1.runCheck; } });
|
|
113
|
+
Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return check_1.EXIT; } });
|
|
109
114
|
// Default export for convenience
|
|
110
115
|
const client_2 = require("./client");
|
|
111
116
|
exports.default = client_2.AIEvalClient;
|
package/dist/types.d.ts
CHANGED
|
@@ -111,6 +111,14 @@ export interface CreateTraceParams<TMetadata = Record<string, any>> {
|
|
|
111
111
|
durationMs?: number;
|
|
112
112
|
metadata?: TMetadata;
|
|
113
113
|
}
|
|
114
|
+
/**
|
|
115
|
+
* Parameters for updating an existing trace
|
|
116
|
+
*/
|
|
117
|
+
export interface UpdateTraceParams<TMetadata = Record<string, any>> {
|
|
118
|
+
status?: 'pending' | 'success' | 'error';
|
|
119
|
+
durationMs?: number;
|
|
120
|
+
metadata?: TMetadata;
|
|
121
|
+
}
|
|
114
122
|
/**
|
|
115
123
|
* Parameters for listing traces
|
|
116
124
|
*/
|
package/dist/workflows.js
CHANGED
|
@@ -135,13 +135,8 @@ class WorkflowTracer {
|
|
|
135
135
|
const durationMs = Date.now() - new Date(this.currentWorkflow.startedAt).getTime();
|
|
136
136
|
// Calculate total cost
|
|
137
137
|
const totalCost = this.costs.reduce((sum, cost) => sum + parseFloat(cost.totalCost), 0);
|
|
138
|
-
// Update the trace with
|
|
139
|
-
|
|
140
|
-
const traceId = `${this.options.tracePrefix}-complete-${this.currentWorkflow.traceId}`;
|
|
141
|
-
await this.client.traces.create({
|
|
142
|
-
name: `Workflow: ${this.currentWorkflow.name}`,
|
|
143
|
-
traceId,
|
|
144
|
-
organizationId: this.options.organizationId,
|
|
138
|
+
// Update the original trace with completion data
|
|
139
|
+
await this.client.traces.update(this.currentWorkflow.traceId, {
|
|
145
140
|
status: status === 'completed' ? 'success' : 'error',
|
|
146
141
|
durationMs,
|
|
147
142
|
metadata: (0, context_1.mergeWithContext)({
|